Skip to content
This repository was archived by the owner on Dec 15, 2025. It is now read-only.

Commit 9e778a3

Browse files
committed
Add NWeight workload
1 parent f2542d8 commit 9e778a3

13 files changed

Lines changed: 387 additions & 2 deletions

File tree

bin/functions/hibench_prop_env_mapping.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,12 @@
115115
# For NWeight
116116
MODEL_INPUT="hibench.nweight.model_path",
117117
EDGES="hibench.workload.edges",
118+
DEGREE="hibench.nweight.degree",
119+
MAX_OUT_EDGES="hibench.nweight.max_out_edges",
120+
NUM_PARTITION="hibench.nweight.partitions",
121+
STORAGE_LEVEL="hibench.nweight.storage_level",
122+
DISABLE_KRYO="hibench.nweight.disable_kryo",
123+
MODEL="hibench.nweight.model",
118124

119125
# For streaming bench
120126
# zkHelper

conf/10-data-scale-profile.conf

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -216,8 +216,20 @@ hibench.dfsioe.bigdata.write.file_size 1000
216216

217217
#NWeight
218218
hibench.nweight.tiny.edges 100000
219+
hibench.nweight.tiny.degree 3
220+
hibench.nweight.tiny.max_out_edges 30
219221
hibench.nweight.small.edges 1000000
222+
hibench.nweight.small.degree 3
223+
hibench.nweight.small.max_out_edges 30
220224
hibench.nweight.large.edges 10000000
225+
hibench.nweight.large.degree 3
226+
hibench.nweight.large.max_out_edges 30
221227
hibench.nweight.huge.edges 100000000
228+
hibench.nweight.huge.degree 3
229+
hibench.nweight.huge.max_out_edges 30
222230
hibench.nweight.gigantic.edges 425000000
231+
hibench.nweight.gigantic.degree 3
232+
hibench.nweight.gigantic.max_out_edges 30
223233
hibench.nweight.bigdata.edges 4250000000
234+
hibench.nweight.bigdata.degree 3
235+
hibench.nweight.bigdata.max_out_edges 30

src/pom.xml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
<jetty.version>8.1.14.v20131031</jetty.version>
3333
<scalatest.version>2.2.1</scalatest.version>
3434
<scalacheck.version>1.11.3</scalacheck.version>
35+
<fastutil.version>6.5.15</fastutil.version>
3536
</properties>
3637

3738
<repositories>

src/sparkbench/pom.xml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,11 @@
9292
<artifactId>mahout-math</artifactId>
9393
<version>${mahout.version}</version>
9494
</dependency>
95+
<dependency>
96+
<groupId>it.unimi.dsi</groupId>
97+
<artifactId>fastutil</artifactId>
98+
<version>${fastutil.version}</version>
99+
</dependency>
95100
</dependencies>
96101
<build>
97102
<plugins>
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
package com.intel.sparkbench.nweight

import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.SparkContext._
import org.apache.spark.rdd.RDD
import org.apache.spark.storage.StorageLevel
import org.apache.spark.scheduler.{JobLogger, StatsReportListener}

import com.esotericsoftware.kryo.{Kryo, Serializer => KSerializer}
import com.esotericsoftware.kryo.io.{Input => KryoInput, Output => KryoOutput}
import com.esotericsoftware.kryo.serializers.{JavaSerializer => KryoJavaSerializer}

/**
 * Compute NWeight for Graph G(V, E) as defined below:
 *   Weight(1)(u, v) = edge(u, v)
 *   Weight(n)(u, v) = Sum (over {x | there are edges (u, x) and (x, v)}) Weight(n-1)(u, x) * Weight(1)(x, v)
 *
 * Input is given in text file format. Each line represents a node and all
 * out-edges of that node (edge weight specified):
 *   <vertex> <vertex1>:<weight1>, <vertex2>:<weight2> ...
 */
object NWeight extends Serializable {

  /**
   * Parse and validate the eight positional command-line arguments.
   *
   * Expected: <input> <output> <step> <maxOutEdges> <numPartitions>
   *           <storageLevel> <disableKryo> <model>
   * Exits with status 1 (after printing usage) when fewer than eight
   * arguments are supplied.
   *
   * @return (input, output, step, maxDegree, numPartitions, storageLevel,
   *         disableKryo, model)
   */
  def parseArgs(args: Array[String]) = {
    // BUG FIX: the original checked `args.length < 7` but reads args(7)
    // below, so an exactly-7-argument invocation crashed with
    // ArrayIndexOutOfBoundsException instead of printing usage. The usage
    // string was also missing the <disableKryo> argument.
    if (args.length < 8) {
      System.err.println("Usage: <input> <output> <step> <max Out edges> " +
        "<no. of result partitions> <storageLevel> <disableKryo> <model>")
      System.exit(1)
    }
    val input = args(0)
    val output = args(1)
    val step = args(2).toInt
    val maxDegree = args(3).toInt
    val numPartitions = args(4).toInt
    // Numeric code -> Spark StorageLevel; any unknown code falls back to
    // MEMORY_AND_DISK rather than failing.
    val storageLevel = args(5).toInt match {
      case 0 => StorageLevel.OFF_HEAP
      case 1 => StorageLevel.DISK_ONLY
      case 2 => StorageLevel.DISK_ONLY_2
      case 3 => StorageLevel.MEMORY_ONLY
      case 4 => StorageLevel.MEMORY_ONLY_2
      case 5 => StorageLevel.MEMORY_ONLY_SER
      case 6 => StorageLevel.MEMORY_ONLY_SER_2
      case 7 => StorageLevel.MEMORY_AND_DISK
      case 8 => StorageLevel.MEMORY_AND_DISK_2
      case 9 => StorageLevel.MEMORY_AND_DISK_SER
      case 10 => StorageLevel.MEMORY_AND_DISK_SER_2
      case _ => StorageLevel.MEMORY_AND_DISK
    }
    val disableKryo = args(6).toBoolean
    val model = args(7)

    (input, output, step, maxDegree, numPartitions, storageLevel, disableKryo, model)
  }

  /**
   * Entry point: configures the serializer and app name, creates the
   * SparkContext, and dispatches to the GraphX or Pregel implementation.
   */
  def main(args: Array[String]) {
    val (input, output, step, maxDegree, numPartitions, storageLevel, disableKryo, model) = parseArgs(args)

    // Kryo is enabled by default; the property must be set before the
    // SparkContext is constructed to take effect.
    if (!disableKryo) {
      System.setProperty("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    }
    val sparkConf = new SparkConf()
    if (model.toLowerCase == "graphx")
      sparkConf.setAppName("NWeightGraphX")
    else
      sparkConf.setAppName("NWeightPregel")
    val sc = new SparkContext(sparkConf)

    sc.addSparkListener(new JobLogger)
    sc.addSparkListener(new StatsReportListener)

    // Any model string other than "graphx" selects the Pregel implementation.
    if (model.toLowerCase == "graphx") {
      GraphxNWeight.nweight(sc, input, output, step, maxDegree, numPartitions, storageLevel)
    } else {
      PregelNWeight.nweight(sc, input, output, step, maxDegree, numPartitions, storageLevel)
    }
  }
}
Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
package com.intel.sparkbench.nweight

import scala.collection.JavaConversions._
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.rdd.RDD
import org.apache.spark.HashPartitioner
import org.apache.spark.storage.StorageLevel
import org.apache.spark.graphx._
import org.apache.spark.graphx.impl.GraphImpl
import it.unimi.dsi.fastutil.longs.Long2DoubleOpenHashMap

/**
 * Compute NWeight for Graph G(V, E) as defined below:
 *   Weight(1)(u, v) = edge(u, v)
 *   Weight(n)(u, v) = Sum (over {x | there are edges (u, x) and (x, v)}) Weight(n-1)(u, x) * Weight(1)(x, v)
 *
 * Input is given in text file format. Each line represents a node and all
 * out-edges of that node (edge weight specified):
 *   <vertex> <vertex1>:<weight1>, <vertex2>:<weight2> ...
 */
object GraphxNWeight extends Serializable {

  /**
   * Message function for mapReduceTriplets: for each triplet, send to the
   * source vertex a map of {target -> dstWeight * edgeWeight}, skipping
   * self-references.
   */
  def mapF(edge: EdgeTriplet[SizedPriorityQueue, Double]) = {
    val contrib = new Long2DoubleOpenHashMap()
    val edgeWeight = edge.attr
    val source = edge.srcId
    edge.dstAttr.foreach { case (target, wn) =>
      if (target != source)
        contrib.put(target, wn * edgeWeight)
    }
    Iterator((source, contrib))
  }

  /** Combine two partial weight maps by summing values key-wise into c1. */
  def reduceF(c1: Long2DoubleOpenHashMap, c2: Long2DoubleOpenHashMap) = {
    val it = c2.long2DoubleEntrySet().fastIterator()
    while (it.hasNext) {
      val entry = it.next()
      c1.put(entry.getLongKey(), c1.get(entry.getLongKey()) + entry.getDoubleValue())
    }
    c1
  }

  /**
   * Vertex-update function: replace the vertex's queue contents with the
   * top entries of the aggregated message (capacity-bounded by the queue).
   */
  def updateF(id: VertexId, vdata: SizedPriorityQueue, msg: Option[Long2DoubleOpenHashMap]) = {
    vdata.clear()
    msg match {
      case Some(weightMap) =>
        val it = weightMap.long2DoubleEntrySet().fastIterator()
        while (it.hasNext) {
          val entry = it.next()
          vdata.enqueue((entry.getLongKey(), entry.getDoubleValue()))
        }
      case None => // vertex received no message this round; leave it empty
    }
    vdata
  }

  /**
   * Run `step - 1` rounds of weight propagation over the input graph and
   * write one "vid dst1:w1 dst2:w2 ..." line per vertex to `output`.
   */
  def nweight(sc: SparkContext, input: String, output: String, step: Int,
      maxDegree: Int, numPartitions: Int, storageLevel: StorageLevel) {

    val partitioner = new HashPartitioner(numPartitions)

    // Parse "<src> <dst1>:<w1>, <dst2>:<w2> ..." lines into edges, keyed by
    // source id so edges are co-partitioned with their source vertices.
    val edges = sc.textFile(input, numPartitions).flatMap { line =>
      val parts = line.split("\\s+", 2)
      val srcId = parts(0).trim.toLong

      parts(1).split("[,\\s]+").filterNot(_.isEmpty).map { token =>
        val kv = token.split(":")
        (srcId, Edge(srcId, kv(0).trim.toLong, kv(1).toDouble))
      }
    }.partitionBy(partitioner).map(_._2)

    // Initial vertex attribute: the (capacity-bounded) out-edge list of
    // each vertex, i.e. Weight(1).
    val vertices = edges.map { e =>
      (e.srcId, (e.dstId, e.attr))
    }.groupByKey(partitioner).map { case (id, outEdges) =>
      val queue = new SizedPriorityQueue(maxDegree)
      outEdges.foreach(queue.enqueue)
      (id, queue)
    }

    var g = GraphImpl(vertices, edges, new SizedPriorityQueue(maxDegree), storageLevel, storageLevel).cache()

    var msg: RDD[(VertexId, Long2DoubleOpenHashMap)] = null
    for (i <- 2 to step) {
      msg = g.mapReduceTriplets(mapF _, reduceF _, Some(g.vertices, EdgeDirection.In))
      g = g.outerJoinVertices(msg)(updateF _).persist(storageLevel)
    }

    // Serialize each vertex as "vid dst1:w1 dst2:w2 ..." and save as text.
    g.vertices.map { case (vid, vdata) =>
      val sb = new StringBuilder
      sb.append(vid)

      vdata.foreach { case (dst, w) =>
        sb.append(' ')
        sb.append(dst)
        sb.append(':')
        sb.append(w)
      }
      sb.toString
    }.saveAsTextFile(output)
  }
}
99+
Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
package com.intel.sparkbench.nweight

import scala.collection.JavaConversions._
import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.rdd.RDD
import org.apache.spark.HashPartitioner
import org.apache.spark.storage.StorageLevel
import org.apache.spark.graphx._
import org.apache.spark.graphx.impl.GraphImpl
import it.unimi.dsi.fastutil.longs.Long2DoubleOpenHashMap

/**
 * Compute NWeight for Graph G(V, E) as defined below:
 *   Weight(1)(u, v) = edge(u, v)
 *   Weight(n)(u, v) = Sum (over {x | there are edges (u, x) and (x, v)}) Weight(n-1)(u, x) * Weight(1)(x, v)
 *
 * Input is given in text file format. Each line represents a node and all
 * out-edges of that node (edge weight specified):
 *   <vertex> <vertex1>:<weight1>, <vertex2>:<weight2> ...
 */
object PregelNWeight extends Serializable {

  /**
   * Pregel send function: propagate to the source vertex a map of
   * {target -> dstWeight * edgeWeight}, skipping self-references.
   */
  def sendMsg(edge: EdgeTriplet[SizedPriorityQueue, Double]) = {
    val contrib = new Long2DoubleOpenHashMap()
    val edgeWeight = edge.attr
    val source = edge.srcId
    edge.dstAttr.foreach { case (target, wn) =>
      if (target != source)
        contrib.put(target, wn * edgeWeight)
    }
    Iterator((source, contrib))
  }

  /** Combine two partial weight maps by summing values key-wise into c1. */
  def mergMsg(c1: Long2DoubleOpenHashMap, c2: Long2DoubleOpenHashMap) = {
    val it = c2.long2DoubleEntrySet().fastIterator()
    while (it.hasNext) {
      val entry = it.next()
      c1.put(entry.getLongKey(), c1.get(entry.getLongKey()) + entry.getDoubleValue())
    }
    c1
  }

  /**
   * Pregel vertex program: replace the vertex's queue with the incoming
   * aggregated weights. An empty message (the Pregel initial message on the
   * first superstep) seeds the vertex with itself at weight 1.
   */
  def vProg(id: VertexId, vdata: SizedPriorityQueue, msg: Long2DoubleOpenHashMap) = {
    vdata.clear()
    if (msg.size > 0) {
      val it = msg.long2DoubleEntrySet().fastIterator()
      while (it.hasNext) {
        val entry = it.next()
        vdata.enqueue((entry.getLongKey(), entry.getDoubleValue()))
      }
    } else {
      vdata.enqueue((id, 1))
    }
    vdata
  }

  /**
   * Run `step` Pregel supersteps of weight propagation over the input graph
   * and write one "vid dst1:w1 dst2:w2 ..." line per vertex to `output`.
   */
  def nweight(sc: SparkContext, input: String, output: String, step: Int,
      maxDegree: Int, numPartitions: Int, storageLevel: StorageLevel) {

    val partitioner = new HashPartitioner(numPartitions)

    // Parse "<src> <dst1>:<w1>, <dst2>:<w2> ..." lines into edges, keyed by
    // source id so edges are co-partitioned with their source vertices.
    val edges = sc.textFile(input, numPartitions).flatMap { line =>
      val parts = line.split("\\s+", 2)
      val srcId = parts(0).trim.toLong

      parts(1).split("[,\\s]+").filterNot(_.isEmpty).map { token =>
        val kv = token.split(":")
        (srcId, Edge(srcId, kv(0).trim.toLong, kv(1).toDouble))
      }
    }.partitionBy(partitioner).map(_._2)

    var g = GraphImpl(edges, new SizedPriorityQueue(maxDegree), storageLevel, storageLevel).cache()

    g = Pregel(g, new Long2DoubleOpenHashMap, step, EdgeDirection.In)(vProg _, sendMsg _, mergMsg _)

    // Serialize each vertex as "vid dst1:w1 dst2:w2 ..." and save as text.
    g.vertices.map { case (vid, vdata) =>
      val sb = new StringBuilder
      sb.append(vid)

      vdata.foreach { case (dst, w) =>
        sb.append(' ')
        sb.append(dst)
        sb.append(':')
        sb.append(w)
      }
      sb.toString
    }.saveAsTextFile(output)
  }
}
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
package com.intel.sparkbench.nweight

import it.unimi.dsi.fastutil.objects.ObjectHeaps

/**
 * A fixed-capacity min-heap of (vertexId, weight) pairs that retains the
 * `capacity` largest elements seen so far, ordered by weight first and
 * vertex id second. Used as the per-vertex attribute in the NWeight
 * workload to keep only the top-weighted neighbors.
 *
 * @param capacity maximum number of elements retained
 */
class SizedPriorityQueue(
    val capacity: Int) extends Traversable[(Long, Double)] with Serializable {
  // Backing array used as a binary min-heap; only the first `size_` slots
  // hold valid elements.
  protected val buf = new Array[(Long, Double)](capacity)
  // Orders by weight, then by vertex id, so the heap root is the smallest
  // retained element (the next eviction candidate).
  protected val comparator = new java.util.Comparator[(Long, Double)] with Serializable {
    override def compare(m1: (Long, Double), m2: (Long, Double)) =
      if (m1._2 < m2._2) -1
      else if (m1._2 > m2._2) 1
      else if (m1._1 < m2._1) -1
      else if (m1._1 > m2._1) 1
      else 0
  }

  // Number of valid elements currently in `buf`.
  protected var size_ = 0

  override def size() = size_

  /** Logically empty the queue; stale buffer contents are simply ignored. */
  def clear() {
    size_ = 0
  }

  /** Return a copy of all retained elements sorted in ascending order. */
  def fullySorted(): Array[(Long, Double)] = {
    // BUG FIX: the original used `buf.slice(0, size_ - 1)`, whose exclusive
    // upper bound silently dropped the last retained element.
    val slicedBuf = buf.slice(0, size_)
    java.util.Arrays.sort(slicedBuf, comparator)
    slicedBuf
  }

  /** Visit the retained elements in internal heap (unsorted) order. */
  def foreach[U](f: ((Long, Double)) => U): Unit = {
    for (i <- 0 until size_) f(buf(i))
  }

  /**
   * Insert a value. While under capacity, the value is simply added; once
   * full, it replaces the current minimum only if it compares greater, so
   * the queue always holds the top-`capacity` elements seen.
   */
  def enqueue(value: (Long, Double)) {
    if (size_ < capacity) {
      buf(size_) = value
      size_ = size_ + 1
      ObjectHeaps.upHeap(buf, size_, size_ - 1, comparator)
    } else if (comparator.compare(value, buf(0)) > 0) {
      buf(0) = value
      ObjectHeaps.downHeap(buf, size_, 0, comparator)
    }
  }

}

object SizedPriorityQueue {
  /** Build a queue of the given capacity pre-loaded with `elems`. */
  def apply(capacity: Int)(elems: (Long, Double)*) = {
    val q = new SizedPriorityQueue(capacity)
    // Pass an explicit tuple; the original `q.enqueue(i, v)` relied on
    // deprecated automatic argument tupling.
    for ((i, v) <- elems)
      q.enqueue((i, v))
    q
  }
}

src/sparkbench/src/main/scala/com/intel/sparkbench/nweight/datagen/NWeightDataGenerator.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ object NWeightDataGenerator {
3838
println(s"Total Records: $totalNumRecords")
3939
} else {
4040
System.err.println(
41-
s"Usage: $NWeightDataGenerator <MODEL_PATH> <OUTPUT_PATH> <NUM_RECORDS> <NUM_PARTITIONS>"
41+
s"Usage: $NWeightDataGenerator <MODEL_PATH> <OUTPUT_PATH> <NUM_RECORDS>"
4242
)
4343
System.exit(1)
4444
}

workloads/nweight/conf/00-nweight-default.conf

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,12 @@
22
# override configurations here
33
hibench.nweight.edges ${hibench.nweight.${hibench.scale.profile}.edges}
44
hibench.nweight.base.hdfs ${hibench.hdfs.data.dir}/NWeight
5+
hibench.nweight.degree ${hibench.nweight.${hibench.scale.profile}.degree}
6+
hibench.nweight.max_out_edges ${hibench.nweight.${hibench.scale.profile}.max_out_edges}
7+
hibench.nweight.partitions ${hibench.default.map.parallelism}
8+
hibench.nweight.storage_level 7
9+
hibench.nweight.disable_kryo false
10+
hibench.nweight.model graphx
511

612
# export for shell script
713
hibench.workload.input ${hibench.nweight.base.hdfs}/${hibench.workload.dir.name.input}

0 commit comments

Comments
 (0)