
Commit b613949

Merge pull request #209 from lingzhouHZ/nweight

Add new workload NWeight

2 parents: aaf90ff + 9e778a3

16 files changed: 564 additions, 1 deletion


bin/functions/hibench_prop_env_mapping.py

Lines changed: 9 additions & 0 deletions
@@ -112,6 +112,15 @@
     WT_FILE_SIZE="hibench.dfsioe.write.file_size",
     MAP_JAVA_OPTS="hibench.dfsioe.map.java_opts",
     RED_JAVA_OPTS="hibench.dfsioe.red.java_opts",
+    # For NWeight
+    MODEL_INPUT="hibench.nweight.model_path",
+    EDGES="hibench.workload.edges",
+    DEGREE="hibench.nweight.degree",
+    MAX_OUT_EDGES="hibench.nweight.max_out_edges",
+    NUM_PARTITION="hibench.nweight.partitions",
+    STORAGE_LEVEL="hibench.nweight.storage_level",
+    DISABLE_KRYO="hibench.nweight.disable_kryo",
+    MODEL="hibench.nweight.model",

     # For streaming bench
     # zkHelper

conf/00-default-properties.conf

Lines changed: 2 additions & 0 deletions
@@ -150,6 +150,8 @@ hibench.randomtextwriter.bytestotal.hadoop1.name test.randomtextwrite.total_by
 hibench.randomtextwriter.bytestotal.hadoop2.name mapreduce.randomtextwriter.totalbytes
 hibench.randomtextwriter.bytestotal.name ${hibench.randomtextwriter.bytestotal.${hibench.hadoop.version}.name}

+hibench.nweight.model_path ${hibench.dependency.dir}/sparkbench/src/main/scala/com/intel/sparkbench/nweight/model/user-features
+
 # Workload Input/Output name setting for compress/uncompress mode
 hibench.workload.dir.name.compress_disable.input Input
 hibench.workload.dir.name.compress_disable.output Output

conf/10-data-scale-profile.conf

Lines changed: 21 additions & 1 deletion
@@ -212,4 +212,24 @@ hibench.dfsioe.gigantic.write.file_size 400
 hibench.dfsioe.bigdata.read.number_of_files 2048
 hibench.dfsioe.bigdata.read.file_size 1000
 hibench.dfsioe.bigdata.write.number_of_files 2048
-hibench.dfsioe.bigdata.write.file_size 1000
+hibench.dfsioe.bigdata.write.file_size 1000
+
+#NWeight
+hibench.nweight.tiny.edges 100000
+hibench.nweight.tiny.degree 3
+hibench.nweight.tiny.max_out_edges 30
+hibench.nweight.small.edges 1000000
+hibench.nweight.small.degree 3
+hibench.nweight.small.max_out_edges 30
+hibench.nweight.large.edges 10000000
+hibench.nweight.large.degree 3
+hibench.nweight.large.max_out_edges 30
+hibench.nweight.huge.edges 100000000
+hibench.nweight.huge.degree 3
+hibench.nweight.huge.max_out_edges 30
+hibench.nweight.gigantic.edges 425000000
+hibench.nweight.gigantic.degree 3
+hibench.nweight.gigantic.max_out_edges 30
+hibench.nweight.bigdata.edges 4250000000
+hibench.nweight.bigdata.degree 3
+hibench.nweight.bigdata.max_out_edges 30

src/pom.xml

Lines changed: 1 addition & 0 deletions
@@ -32,6 +32,7 @@
     <jetty.version>8.1.14.v20131031</jetty.version>
     <scalatest.version>2.2.1</scalatest.version>
     <scalacheck.version>1.11.3</scalacheck.version>
+    <fastutil.version>6.5.15</fastutil.version>
   </properties>

   <repositories>

src/sparkbench/pom.xml

Lines changed: 5 additions & 0 deletions
@@ -92,6 +92,11 @@
       <artifactId>mahout-math</artifactId>
       <version>${mahout.version}</version>
     </dependency>
+    <dependency>
+      <groupId>it.unimi.dsi</groupId>
+      <artifactId>fastutil</artifactId>
+      <version>${fastutil.version}</version>
+    </dependency>
   </dependencies>
   <build>
     <plugins>
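The fastutil dependency added above supplies primitive-specialized collections; the NWeight sources below rely on Long2DoubleOpenHashMap to aggregate edge weights without boxing. A minimal standalone sketch (values invented here, not part of the commit) of the behavior the code depends on:

import it.unimi.dsi.fastutil.longs.Long2DoubleOpenHashMap

object FastutilSketch {
  def main(args: Array[String]): Unit = {
    val m = new Long2DoubleOpenHashMap()   // primitive long -> double map
    m.put(42L, 0.5)
    // get() returns 0.0 for missing keys, which is exactly what the
    // reduce step in the code below relies on when accumulating weights.
    m.put(42L, m.get(42L) + 1.0)
    println(m.get(42L))                    // 1.5
  }
}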
src/sparkbench/src/main/scala/com/intel/sparkbench/nweight/NWeight.scala

Lines changed: 76 additions & 0 deletions

@@ -0,0 +1,76 @@
+package com.intel.sparkbench.nweight
+
+import org.apache.spark.{SparkContext, SparkConf}
+import org.apache.spark.SparkContext._
+import org.apache.spark.rdd.RDD
+import org.apache.spark.storage.StorageLevel
+import org.apache.spark.scheduler.{JobLogger, StatsReportListener}
+
+import com.esotericsoftware.kryo.{Kryo, Serializer => KSerializer}
+import com.esotericsoftware.kryo.io.{Input => KryoInput, Output => KryoOutput}
+import com.esotericsoftware.kryo.serializers.{JavaSerializer => KryoJavaSerializer}
+
+/**
+ * Compute NWeight for graph G(V, E), defined as:
+ *   Weight(1)(u, v) = edge(u, v)
+ *   Weight(n)(u, v) = Sum over {x | edges (u, x) and (x, v) exist} of Weight(n-1)(u, x) * Weight(1)(x, v)
+ *
+ * Input is a text file; each line holds a node and all out-edges of that node, with weights:
+ *   <vertex> <vertex1>:<weight1>,<vertex2>:<weight2> ...
+ */
+object NWeight extends Serializable {
+
+  def parseArgs(args: Array[String]) = {
+    if (args.length < 8) {
+      System.err.println("Usage: <input> <output> <step> <max out edges> " +
+        "<no. of result partitions> <storage level> <disable kryo> <model>")
+      System.exit(1)
+    }
+    val input = args(0)
+    val output = args(1)
+    val step = args(2).toInt
+    val maxDegree = args(3).toInt
+    val numPartitions = args(4).toInt
+    val storageLevel = args(5).toInt match {
+      case 0 => StorageLevel.OFF_HEAP
+      case 1 => StorageLevel.DISK_ONLY
+      case 2 => StorageLevel.DISK_ONLY_2
+      case 3 => StorageLevel.MEMORY_ONLY
+      case 4 => StorageLevel.MEMORY_ONLY_2
+      case 5 => StorageLevel.MEMORY_ONLY_SER
+      case 6 => StorageLevel.MEMORY_ONLY_SER_2
+      case 7 => StorageLevel.MEMORY_AND_DISK
+      case 8 => StorageLevel.MEMORY_AND_DISK_2
+      case 9 => StorageLevel.MEMORY_AND_DISK_SER
+      case 10 => StorageLevel.MEMORY_AND_DISK_SER_2
+      case _ => StorageLevel.MEMORY_AND_DISK
+    }
+    val disableKryo = args(6).toBoolean
+    val model = args(7)
+
+    (input, output, step, maxDegree, numPartitions, storageLevel, disableKryo, model)
+  }
+
+  def main(args: Array[String]) {
+    val (input, output, step, maxDegree, numPartitions, storageLevel, disableKryo, model) = parseArgs(args)
+
+    if (!disableKryo) {
+      System.setProperty("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
+    }
+    val sparkConf = new SparkConf()
+    if (model.toLowerCase == "graphx")
+      sparkConf.setAppName("NWeightGraphX")
+    else
+      sparkConf.setAppName("NWeightPregel")
+    val sc = new SparkContext(sparkConf)
+
+    sc.addSparkListener(new JobLogger)
+    sc.addSparkListener(new StatsReportListener)
+
+    if (model.toLowerCase == "graphx") {
+      GraphxNWeight.nweight(sc, input, output, step, maxDegree, numPartitions, storageLevel)
+    } else {
+      PregelNWeight.nweight(sc, input, output, step, maxDegree, numPartitions, storageLevel)
+    }
+  }
+}
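The recurrence in the header comment can be checked by hand. Below is a minimal, Spark-free sketch (the toy graph and object name are hypothetical, not part of the commit) that applies the definition directly for n = 2:

object NWeightByHand {
  // Toy graph as adjacency maps: vertex -> (neighbor -> Weight(1)).
  val w1: Map[Long, Map[Long, Double]] = Map(
    1L -> Map(2L -> 0.5, 3L -> 2.0),
    2L -> Map(3L -> 4.0),
    3L -> Map.empty
  )

  // Weight(2)(u, v) = sum over x of Weight(1)(u, x) * Weight(1)(x, v)
  def weight2(u: Long, v: Long): Double =
    w1.getOrElse(u, Map.empty).map { case (x, wux) =>
      wux * w1.getOrElse(x, Map.empty).getOrElse(v, 0.0)
    }.sum

  def main(args: Array[String]): Unit = {
    // The only 2-step path from 1 to 3 is 1 -> 2 -> 3, so 0.5 * 4.0 = 2.0.
    println(weight2(1L, 3L))
  }
}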
src/sparkbench/src/main/scala/com/intel/sparkbench/nweight/GraphxNWeight.scala

Lines changed: 99 additions & 0 deletions

@@ -0,0 +1,99 @@
+package com.intel.sparkbench.nweight
+
+import scala.collection.JavaConversions._
+import org.apache.spark.SparkContext
+import org.apache.spark.SparkContext._
+import org.apache.spark.rdd.RDD
+import org.apache.spark.HashPartitioner
+import org.apache.spark.storage.StorageLevel
+import org.apache.spark.graphx._
+import org.apache.spark.graphx.impl.GraphImpl
+import it.unimi.dsi.fastutil.longs.Long2DoubleOpenHashMap
+
+/**
+ * Compute NWeight for graph G(V, E), defined as:
+ *   Weight(1)(u, v) = edge(u, v)
+ *   Weight(n)(u, v) = Sum over {x | edges (u, x) and (x, v) exist} of Weight(n-1)(u, x) * Weight(1)(x, v)
+ *
+ * Input is a text file; each line holds a node and all out-edges of that node, with weights:
+ *   <vertex> <vertex1>:<weight1>,<vertex2>:<weight2> ...
+ */
+object GraphxNWeight extends Serializable {
+
+  // Send the destination's accumulated weights back to the edge's source,
+  // scaled by this edge's Weight(1).
+  def mapF(edge: EdgeTriplet[SizedPriorityQueue, Double]) = {
+    val m = new Long2DoubleOpenHashMap()
+    val w1 = edge.attr
+    val id = edge.srcId
+    edge.dstAttr.foreach { case (target, wn) =>
+      if (target != id)
+        m.put(target, wn * w1)
+    }
+    Iterator((id, m))
+  }
+
+  // Merge two partial weight maps by summing per-target weights.
+  def reduceF(c1: Long2DoubleOpenHashMap, c2: Long2DoubleOpenHashMap) = {
+    c2.long2DoubleEntrySet()
+      .fastIterator()
+      .foreach(pair => c1.put(pair.getLongKey(), c1.get(pair.getLongKey()) + pair.getDoubleValue()))
+    c1
+  }
+
+  // Replace a vertex's top-k queue with the weights received this round.
+  def updateF(id: VertexId, vdata: SizedPriorityQueue, msg: Option[Long2DoubleOpenHashMap]) = {
+    vdata.clear()
+    val weightMap = msg.orNull
+    if (weightMap != null) {
+      weightMap.long2DoubleEntrySet().fastIterator().foreach { pair =>
+        vdata.enqueue((pair.getLongKey(), pair.getDoubleValue()))
+      }
+    }
+    vdata
+  }
+
+  def nweight(sc: SparkContext, input: String, output: String, step: Int,
+      maxDegree: Int, numPartitions: Int, storageLevel: StorageLevel) {
+
+    val part = new HashPartitioner(numPartitions)
+    val edges = sc.textFile(input, numPartitions).flatMap { line =>
+      val fields = line.split("\\s+", 2)
+      val src = fields(0).trim.toLong
+
+      fields(1).split("[,\\s]+").filter(_.nonEmpty).map { pairStr =>
+        val pair = pairStr.split(":")
+        val (dest, weight) = (pair(0).trim.toLong, pair(1).toDouble)
+        (src, Edge(src, dest, weight))
+      }
+    }.partitionBy(part).map(_._2)
+
+    val vertices = edges.map { e =>
+      (e.srcId, (e.dstId, e.attr))
+    }.groupByKey(part).map { case (id, seq) =>
+      val vdata = new SizedPriorityQueue(maxDegree)
+      seq.foreach(vdata.enqueue)
+      (id, vdata)
+    }
+
+    var g = GraphImpl(vertices, edges, new SizedPriorityQueue(maxDegree), storageLevel, storageLevel).cache()
+
+    var msg: RDD[(VertexId, Long2DoubleOpenHashMap)] = null
+    for (i <- 2 to step) {
+      msg = g.mapReduceTriplets(mapF _, reduceF _, Some((g.vertices, EdgeDirection.In)))
+      g = g.outerJoinVertices(msg)(updateF _).persist(storageLevel)
+    }
+
+    g.vertices.map { case (vid, vdata) =>
+      val s = new StringBuilder
+      s.append(vid)
+
+      vdata.foreach { r =>
+        s.append(' ')
+        s.append(r._1)
+        s.append(':')
+        s.append(r._2)
+      }
+      s.toString
+    }.saveAsTextFile(output)
+  }
+}
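For reference, the edge-parsing logic in nweight() can be exercised on a single input line; a standalone sketch (data invented):

object ParseLineSketch {
  def main(args: Array[String]): Unit = {
    val line = "1 2:0.5,3:2.0"             // <vertex> <dst>:<weight>,...
    val fields = line.split("\\s+", 2)     // Array("1", "2:0.5,3:2.0")
    val src = fields(0).trim.toLong
    fields(1).split("[,\\s]+").filter(_.nonEmpty).foreach { pairStr =>
      val pair = pairStr.split(":")
      println((src, pair(0).trim.toLong, pair(1).toDouble))  // (1,2,0.5) then (1,3,2.0)
    }
  }
}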
src/sparkbench/src/main/scala/com/intel/sparkbench/nweight/PregelNWeight.scala

Lines changed: 88 additions & 0 deletions

@@ -0,0 +1,88 @@
+package com.intel.sparkbench.nweight
+
+import scala.collection.JavaConversions._
+import org.apache.spark.SparkContext
+import org.apache.spark.SparkContext._
+import org.apache.spark.rdd.RDD
+import org.apache.spark.HashPartitioner
+import org.apache.spark.storage.StorageLevel
+import org.apache.spark.graphx._
+import org.apache.spark.graphx.impl.GraphImpl
+import it.unimi.dsi.fastutil.longs.Long2DoubleOpenHashMap
+
+/**
+ * Compute NWeight for graph G(V, E), defined as:
+ *   Weight(1)(u, v) = edge(u, v)
+ *   Weight(n)(u, v) = Sum over {x | edges (u, x) and (x, v) exist} of Weight(n-1)(u, x) * Weight(1)(x, v)
+ *
+ * Input is a text file; each line holds a node and all out-edges of that node, with weights:
+ *   <vertex> <vertex1>:<weight1>,<vertex2>:<weight2> ...
+ */
+object PregelNWeight extends Serializable {
+
+  def sendMsg(edge: EdgeTriplet[SizedPriorityQueue, Double]) = {
+    val m = new Long2DoubleOpenHashMap()
+    val w1 = edge.attr
+    val id = edge.srcId
+    edge.dstAttr.foreach { case (target, wn) =>
+      if (target != id)
+        m.put(target, wn * w1)
+    }
+    Iterator((id, m))
+  }
+
+  def mergMsg(c1: Long2DoubleOpenHashMap, c2: Long2DoubleOpenHashMap) = {
+    c2.long2DoubleEntrySet()
+      .fastIterator()
+      .foreach(pair => c1.put(pair.getLongKey(), c1.get(pair.getLongKey()) + pair.getDoubleValue()))
+    c1
+  }
+
+  def vProg(id: VertexId, vdata: SizedPriorityQueue, msg: Long2DoubleOpenHashMap) = {
+    vdata.clear()
+    if (msg.size > 0) {
+      msg.long2DoubleEntrySet().fastIterator().foreach { pair =>
+        vdata.enqueue((pair.getLongKey(), pair.getDoubleValue()))
+      }
+    } else {
+      // Empty initial message (first superstep): seed the vertex with
+      // weight 1.0 for itself, so the first round reproduces Weight(1).
+      vdata.enqueue((id, 1.0))
+    }
+    vdata
+  }
+
+  def nweight(sc: SparkContext, input: String, output: String, step: Int,
+      maxDegree: Int, numPartitions: Int, storageLevel: StorageLevel) {
+
+    val part = new HashPartitioner(numPartitions)
+    val edges = sc.textFile(input, numPartitions).flatMap { line =>
+      val fields = line.split("\\s+", 2)
+      val src = fields(0).trim.toLong
+
+      fields(1).split("[,\\s]+").filter(_.nonEmpty).map { pairStr =>
+        val pair = pairStr.split(":")
+        val (dest, weight) = (pair(0).trim.toLong, pair(1).toDouble)
+        (src, Edge(src, dest, weight))
+      }
+    }.partitionBy(part).map(_._2)
+
+    var g = GraphImpl(edges, new SizedPriorityQueue(maxDegree), storageLevel, storageLevel).cache()
+
+    g = Pregel(g, new Long2DoubleOpenHashMap, step, EdgeDirection.In)(vProg _, sendMsg _, mergMsg _)
+
+    g.vertices.map { case (vid, vdata) =>
+      val s = new StringBuilder
+      s.append(vid)
+
+      vdata.foreach { r =>
+        s.append(' ')
+        s.append(r._1)
+        s.append(':')
+        s.append(r._2)
+      }
+      s.toString
+    }.saveAsTextFile(output)
+  }
+}
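For orientation, a hypothetical invocation showing how the eight arguments parsed by NWeight.parseArgs line up (paths and values are invented, and a real run would go through spark-submit rather than calling main() directly); the last argument selects GraphxNWeight for "graphx" and PregelNWeight otherwise:

NWeight.main(Array(
  "hdfs:///nweight/input",    // <input>
  "hdfs:///nweight/output",   // <output>
  "3",                        // <step>: compute up to Weight(3)
  "30",                       // <max out edges> retained per vertex
  "32",                       // <no. of result partitions>
  "7",                        // <storage level>: 7 maps to MEMORY_AND_DISK
  "false",                    // <disable kryo>: keep Kryo serialization on
  "graphx"                    // <model>
))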
src/sparkbench/src/main/scala/com/intel/sparkbench/nweight/SizedPriorityQueue.scala

Lines changed: 51 additions & 0 deletions

@@ -0,0 +1,51 @@
+package com.intel.sparkbench.nweight
+
+import it.unimi.dsi.fastutil.objects.ObjectHeaps
+
+// A fixed-capacity min-heap over (vertexId, weight) pairs that keeps the
+// `capacity` entries with the largest weights (ties broken by vertex id).
+class SizedPriorityQueue(val capacity: Int)
+    extends Traversable[(Long, Double)] with Serializable {
+
+  protected val buf = new Array[(Long, Double)](capacity)
+  protected val comparator = new java.util.Comparator[(Long, Double)] with Serializable {
+    override def compare(m1: (Long, Double), m2: (Long, Double)) =
+      if (m1._2 < m2._2) -1
+      else if (m1._2 > m2._2) 1
+      else if (m1._1 < m2._1) -1
+      else if (m1._1 > m2._1) 1
+      else 0
+  }
+
+  protected var size_ = 0
+
+  override def size = size_
+
+  def clear() {
+    size_ = 0
+  }
+
+  def fullySorted(): Array[(Long, Double)] = {
+    // Copy the live prefix (slice's end index is exclusive) and sort ascending.
+    val slicedBuf = buf.slice(0, size_)
+    java.util.Arrays.sort(slicedBuf, comparator)
+    slicedBuf
+  }
+
+  def foreach[U](f: ((Long, Double)) => U): Unit = {
+    for (i <- 0 until size_) f(buf(i))
+  }
+
+  def enqueue(value: (Long, Double)) {
+    if (size_ < capacity) {
+      buf(size_) = value
+      size_ = size_ + 1
+      ObjectHeaps.upHeap(buf, size_, size_ - 1, comparator)
+    } else if (comparator.compare(value, buf(0)) > 0) {
+      // Full: replace the current minimum and restore the heap property.
+      buf(0) = value
+      ObjectHeaps.downHeap(buf, size_, 0, comparator)
+    }
+  }
+}
+
+object SizedPriorityQueue {
+  def apply(capacity: Int)(elems: (Long, Double)*) = {
+    val q = new SizedPriorityQueue(capacity)
+    for (elem <- elems)
+      q.enqueue(elem)
+    q
+  }
+}
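A short usage sketch of the queue's bounded top-k behavior (values invented): once capacity is reached, enqueue only ever replaces the current minimum, so the queue always retains the `capacity` heaviest entries.

val q = new SizedPriorityQueue(2)
q.enqueue((1L, 0.5))
q.enqueue((2L, 0.9))
q.enqueue((3L, 0.1))             // ignored: lighter than the current minimum (1L, 0.5)
q.enqueue((4L, 2.0))             // evicts (1L, 0.5), the current minimum
println(q.fullySorted().toList)  // List((2,0.9), (4,2.0))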
