This repository was archived by the owner on Dec 15, 2025. It is now read-only.

Commit fc3aed3

Merge remote-tracking branch 'upstream/master'
update to master
2 parents 50b9750 + 85c651d commit fc3aed3

13 files changed

Lines changed: 174 additions & 87 deletions


README.md

Lines changed: 31 additions & 9 deletions
@@ -50,23 +50,45 @@ There are totally 19 workloads in HiBench. The workloads are divided into 6 cate

 **Machine Learning:**

-1. Bayesian Classification (bayes)
+1. Bayesian Classification (Bayes)

-  This workload benchmarks NaiveBayesian Classification implemented in Spark-MLLib/Mahout examples.
+  This workload benchmarks Naive Bayesian Classification implemented in Spark-MLLib. The workload uses automatically generated documents whose words follow a Zipfian distribution. The dictionary used for text generation is the default Linux file /usr/share/dict/linux.words.

-  Large-scale machine learning is another important use of MapReduce. This workload tests the Naive Bayesian (a popular classification algorithm for knowledge discovery and data mining) trainer in Mahout 0.7, which is an open source (Apache project) machine learning library. The workload uses the automatically generated documents whose words follow the zipfian distribution. The dict used for text generation is also from the default linux file /usr/share/dict/linux.words.
+2. K-means clustering (Kmeans)

-  2. K-means clustering (kmeans)
+  This workload tests the K-means clustering (a well-known clustering algorithm for knowledge discovery and data mining) implemented in Spark-MLlib. The input data set is generated by GenKMeansDataset based on a uniform distribution and a Gaussian distribution.

-  This workload tests the K-means (a well-known clustering algorithm for knowledge discovery and data mining) clustering in Mahout 0.7/Spark-MLlib. The input data set is generated by GenKMeansDataset based on Uniform Distribution and Guassian Distribution.
+3. Logistic Regression (LR)

-  3. Logistic Regression (lr)
+  This workload benchmarks Logistic Regression (LR) implemented in Spark-MLLib with the LBFGS optimizer. The input data set is generated by LogisticRegressionDataGenerator based on a random balanced decision tree. It contains three different kinds of data types: categorical data, continuous data, and binary data.

-  This workload benchmarks Logistic Regression implemented in Spark-MLLib examples. Logistic Regreesion is realized with LBFGS. The input data set is generated by LabeledPointDataGenerator based on random balance decision tree. It contains three different kinds of data types, including categorical data, continuous data, and binary data.
+4. Alternating Least Squares (ALS)

-  4. Alternating Least Squares (als)
+  This workload benchmarks Alternating Least Squares (ALS) implemented in Spark-MLLib. The input data set is generated by RatingDataGenerator for a product recommendation system.

-  This workload benchmarks Alternating Least Squares implememnted in Spark-MLLib examples. The input data set is generated by RatingDataGenerator for a product recommendation system.
+5. Gradient Boosting Tree (GBT)
+
+  This workload benchmarks Gradient Boosting Tree (GBT) implemented in Spark-MLLib. The input data set is generated by GradientBoostingTreeDataGenerator.
+
+6. Linear Regression (LiR)
+
+  This workload benchmarks Linear Regression (LiR) implemented in Spark-MLLib with the SGD optimizer. The input data set is generated by LinearRegressionDataGenerator.
+
+7. Latent Dirichlet Allocation (LDA)
+
+  This workload benchmarks Latent Dirichlet Allocation (LDA) implemented in Spark-MLLib. The input data set is generated by LDADataGenerator.
+
+8. Principal Components Analysis (PCA)
+
+  This workload benchmarks Principal Components Analysis (PCA) implemented in Spark-MLLib. The input data set is generated by PCADataGenerator.
+
+9. Random Forest (RF)
+
+  This workload benchmarks Random Forest (RF) implemented in Spark-MLLib. The input data set is generated by RandomForestDataGenerator.
+
+10. Support Vector Machine (SVM)
+
+  This workload benchmarks Support Vector Machine (SVM) implemented in Spark-MLLib. The input data set is generated by SVMDataGenerator.

 **SQL:**
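For a concrete sense of what these Spark-MLLib workloads exercise, here is a minimal, hedged sketch of a K-means run against MLlib's RDD API. The input path, k, and iteration count are illustrative placeholders only; they are not GenKMeansDataset output or the benchmark's actual driver code, which takes its values from the workload's conf file.

```scala
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors

object KMeansSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("KMeans sketch"))

    // Hypothetical input: one whitespace-separated feature vector per line.
    val points = sc.textFile("hdfs:///tmp/kmeans-input")
      .map(line => Vectors.dense(line.trim.split("\\s+").map(_.toDouble)))
      .cache()

    // Illustrative hyperparameters; HiBench reads its values from the workload conf instead.
    val model = KMeans.train(points, k = 10, maxIterations = 5)
    println(s"Within-set sum of squared errors: ${model.computeCost(points)}")

    sc.stop()
  }
}
```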

bin/functions/hibench_prop_env_mapping.py

Lines changed: 6 additions & 1 deletion
@@ -119,7 +119,12 @@
 # For Random Forest
 NUM_EXAMPLES_RF="hibench.rf.examples",
 NUM_FEATURES_RF="hibench.rf.features",
-NUMTREES="hibench.rf.numTrees",
+NUM_TREES_RF="hibench.rf.numTrees",
+NUM_CLASSES_RF="hibench.rf.numClasses",
+FEATURE_SUBSET_STRATEGY_RF="hibench.rf.featureSubsetStrategy",
+IMPURITY_RF="hibench.rf.impurity",
+MAX_DEPTH_RF="hibench.rf.maxDepth",
+MAX_BINS_RF="hibench.rf.maxBins",
 # For SVD
 NUM_EXAMPLES_SVD="hibench.svd.examples",
 NUM_FEATURES_SVD="hibench.svd.features",

bin/workloads/ml/lda/spark/run.sh

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@ rmr_hdfs $OUTPUT_HDFS || true

 SIZE=`dir_size $INPUT_HDFS`
 START_TIME=`timestamp`
-run_spark_job com.intel.hibench.sparkbench.ml.LDAExample $INPUT_HDFS $OUTPUT_HDFS $NUM_TOPICS_LDA $NUM_ITERATIONS_LDA $OPTIMIZER_LDA $MAXRESULTSIZE_LDA
+run_spark_job com.intel.hibench.sparkbench.ml.LDAExample --numTopics $NUM_TOPICS_LDA --maxIterations $NUM_ITERATIONS_LDA --optimizer $OPTIMIZER_LDA --maxResultSize $MAXRESULTSIZE_LDA $INPUT_HDFS $OUTPUT_HDFS
 END_TIME=`timestamp`

 gen_report ${START_TIME} ${END_TIME} ${SIZE}

bin/workloads/ml/rf/spark/run.sh

Lines changed: 7 additions & 1 deletion
@@ -26,7 +26,13 @@ rmr_hdfs $OUTPUT_HDFS || true

 SIZE=`dir_size $INPUT_HDFS`
 START_TIME=`timestamp`
-run_spark_job com.intel.hibench.sparkbench.ml.RandomForestClassification ${INPUT_HDFS} ${NUMTREES}
+OPTION="--numTrees $NUM_TREES_RF \
+        --numClasses $NUM_CLASSES_RF \
+        --featureSubsetStrategy $FEATURE_SUBSET_STRATEGY_RF \
+        --impurity $IMPURITY_RF \
+        --maxDepth $MAX_DEPTH_RF \
+        --maxBins $MAX_BINS_RF"
+run_spark_job com.intel.hibench.sparkbench.ml.RandomForestClassification $OPTION $INPUT_HDFS
 END_TIME=`timestamp`

 gen_report ${START_TIME} ${END_TIME} ${SIZE}
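The six new flags line up with the parameters of Spark MLlib's RandomForest.trainClassifier, and the default values added to conf/workloads/ml/rf.conf later in this commit map onto a call like the following sketch. The input path and the object-file loading here are illustrative assumptions, not the actual RandomForestClassification sources.

```scala
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.RandomForest
import org.apache.spark.rdd.RDD

object RandomForestSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("RandomForest sketch"))

    // Hypothetical input: LabeledPoint records saved as an HDFS object file.
    val data: RDD[LabeledPoint] = sc.objectFile("hdfs:///tmp/rf-input")

    // Values mirror the new rf.conf defaults: 100 trees, 2 classes, auto/gini, depth 4, 32 bins.
    val model = RandomForest.trainClassifier(
      data,
      numClasses = 2,
      categoricalFeaturesInfo = Map[Int, Int](),
      numTrees = 100,
      featureSubsetStrategy = "auto",
      impurity = "gini",
      maxDepth = 4,
      maxBins = 32)

    println(s"Trained an ensemble of ${model.numTrees} trees")
    sc.stop()
  }
}
```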

conf/hibench.conf

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 # Data scale profile. Available value is tiny, small, large, huge, gigantic and bigdata.
 # The definition of these profiles can be found in the workload's conf file i.e. conf/workloads/micro/wordcount.conf
-hibench.scale.profile tiny
+hibench.scale.profile tiny
 # Mapper number in hadoop, partition number in Spark
 hibench.default.map.parallelism 8


conf/workloads/ml/rf.conf

Lines changed: 17 additions & 12 deletions
@@ -1,21 +1,26 @@
-hibench.rf.tiny.examples 10
-hibench.rf.tiny.features 100
-hibench.rf.small.examples 100
-hibench.rf.small.features 500
-hibench.rf.large.examples 1000
-hibench.rf.large.features 1000
-hibench.rf.huge.examples 10000
-hibench.rf.huge.features 200000
-hibench.rf.gigantic.examples 10000
-hibench.rf.gigantic.features 300000
-hibench.rf.bigdata.examples 20000
-hibench.rf.bigdata.features 220000
+hibench.rf.tiny.examples 10
+hibench.rf.tiny.features 100
+hibench.rf.small.examples 100
+hibench.rf.small.features 500
+hibench.rf.large.examples 1000
+hibench.rf.large.features 1000
+hibench.rf.huge.examples 10000
+hibench.rf.huge.features 200000
+hibench.rf.gigantic.examples 10000
+hibench.rf.gigantic.features 300000
+hibench.rf.bigdata.examples 20000
+hibench.rf.bigdata.features 220000


 hibench.rf.examples ${hibench.rf.${hibench.scale.profile}.examples}
 hibench.rf.features ${hibench.rf.${hibench.scale.profile}.features}
 hibench.rf.partitions ${hibench.default.map.parallelism}
 hibench.rf.numTrees 100
+hibench.rf.numClasses 2
+hibench.rf.featureSubsetStrategy auto
+hibench.rf.impurity gini
+hibench.rf.maxDepth 4
+hibench.rf.maxBins 32

 hibench.workload.input ${hibench.hdfs.data.dir}/RF/Input
 hibench.workload.output ${hibench.hdfs.data.dir}/RF/Output

sparkbench/ml/src/main/scala/com/intel/sparkbench/ml/ALSExample.scala

Lines changed: 3 additions & 2 deletions
@@ -126,10 +126,11 @@ object ALSExample {

     println(s"Test RMSE = $rmse.")

-    // Recommend products for all users
+    // Recommend products for all users, enable the following code to test recommendForAll
+    /*
     val userRecommend = model.recommendProductsForUsers(numRecommends)
     userRecommend.count()
-
+    */
     sc.stop()
   }

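Since recommendProductsForUsers is now commented out (it materializes recommendations for every user and can dominate the run), a lighter smoke test is to request recommendations for a single user. This is a hedged sketch against MLlib's MatrixFactorizationModel API, not code from HiBench; the user id and count are illustrative.

```scala
import org.apache.spark.mllib.recommendation.{MatrixFactorizationModel, Rating}

// Recommend a few products for one (illustrative) user instead of materializing
// recommendations for the whole user base.
def sampleRecommendations(model: MatrixFactorizationModel): Array[Rating] =
  model.recommendProducts(user = 1, num = 10)
```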

sparkbench/ml/src/main/scala/com/intel/sparkbench/ml/GradientBoostingTree.scala

Lines changed: 1 addition & 2 deletions
@@ -53,7 +53,7 @@ object GradientBoostingTree {
     // Train a GradientBoostedTrees model.
     // The defaultParams for Classification use LogLoss by default.
     val boostingStrategy = BoostingStrategy.defaultParams("Classification")
-    boostingStrategy.numIterations = numIterations // Note: Use more iterations in practice.
+    boostingStrategy.numIterations = numIterations
     boostingStrategy.treeStrategy.numClasses = numClasses
     boostingStrategy.treeStrategy.maxDepth = maxDepth
     // Empty categoricalFeaturesInfo indicates all features are continuous.
@@ -68,7 +68,6 @@ object GradientBoostingTree {
     }
     val testErr = labelAndPreds.filter(r => r._1 != r._2).count.toDouble / testData.count()
     println("Test Error = " + testErr)
-    println("Learned classification GBT model:\n" + model.toDebugString)

     sc.stop()
   }

sparkbench/ml/src/main/scala/com/intel/sparkbench/ml/GradientBoostingTreeDataGenerator.scala

Lines changed: 6 additions & 6 deletions
@@ -29,13 +29,13 @@ import org.apache.spark.rdd.RDD

 /**
  * :: DeveloperApi ::
- * Generate test data for LogisticRegression. This class chooses positive labels
+ * Generate test data for Gradient Boosting Tree. This class chooses positive labels
  * with probability `probOne` and scales features for positive examples by `eps`.
  */
 object GradientBoostingTreeDataGenerator {

   /**
-   * Generate an RDD containing test data for LogisticRegression.
+   * Generate an RDD containing test data for Gradient Boosting Tree.
    *
    * @param sc SparkContext to use for creating the RDD.
    * @param nexamples Number of examples that will be contained in the RDD.
@@ -44,7 +44,7 @@ object GradientBoostingTreeDataGenerator {
    * @param nparts Number of partitions of the generated RDD. Default value is 2.
    * @param probOne Probability that a label is 1 (and not 0). Default value is 0.5.
    */
-  def generateLogisticRDD(
+  def generateGBTRDD(
     sc: SparkContext,
     nexamples: Int,
     nfeatures: Int,
@@ -73,7 +73,7 @@ object GradientBoostingTreeDataGenerator {
     val parallel = sc.getConf.getInt("spark.default.parallelism", sc.defaultParallelism)
     val numPartitions = IOCommon.getProperty("hibench.default.shuffle.parallelism")
       .getOrElse((parallel / 2).toString).toInt
-    val eps = 3
+    val eps = 0.3

     if (args.length == 3) {
       outputPath = args(0)
@@ -84,12 +84,12 @@ object GradientBoostingTreeDataGenerator {
       println(s"Num of Features: $numFeatures")
     } else {
       System.err.println(
-        s"Usage: $LogisticRegressionDataGenerator <OUTPUT_PATH> <NUM_EXAMPLES> <NUM_FEATURES>"
+        s"Usage: $GradientBoostingTreeDataGenerator <OUTPUT_PATH> <NUM_EXAMPLES> <NUM_FEATURES>"
       )
       System.exit(1)
     }

-    val data = generateLogisticRDD(sc, numExamples, numFeatures, eps, numPartitions)
+    val data = generateGBTRDD(sc, numExamples, numFeatures, eps, numPartitions)

     data.saveAsObjectFile(outputPath)

sparkbench/ml/src/main/scala/com/intel/sparkbench/ml/LDAExample.scala

Lines changed: 46 additions & 27 deletions
@@ -22,44 +22,63 @@ import org.apache.spark.{SparkConf, SparkContext}
 import org.apache.spark.mllib.clustering.{LDA, DistributedLDAModel, LocalLDAModel}
 import org.apache.spark.mllib.linalg.{Vector, Vectors}
 import org.apache.spark.rdd.RDD
-
+import scopt.OptionParser
 object LDAExample {
-
+  case class Params(
+    inputPath: String = null,
+    outputPath: String = null,
+    numTopics: Int = 10,
+    maxIterations: Int = 10,
+    optimizer: String = "online",
+    maxResultSize: String = "1g")
+
   def main(args: Array[String]): Unit = {
-    var inputPath = ""
-    var outputPath = ""
-    var numTopics: Int = 10
-    var maxIterations: Int = 10
-    var optimizer = "online"
-    var maxResultSize = "1g"
+    val defaultParams = Params()
+
+    val parser = new OptionParser[Params]("LDA") {
+      head("LDA: an example app for LDA.")
+      opt[String]("optimizer")
+        .text(s"optimizer, default: ${defaultParams.optimizer}")
+        .action((x, c) => c.copy(optimizer = x))
+      opt[String]("maxResultSize")
+        .text(s"max resultSize, default: ${defaultParams.maxResultSize}")
+        .action((x, c) => c.copy(maxResultSize = x))
+      opt[Int]("numTopics")
+        .text(s"number of Topics, default: ${defaultParams.numTopics}")
+        .action((x, c) => c.copy(numTopics = x))
+      opt[Int]("maxIterations")
+        .text(s"number of max iterations, default: ${defaultParams.maxIterations}")
+        .action((x, c) => c.copy(maxIterations = x))
+      arg[String]("<inputPath>")
+        .required()
+        .text("Input path")
+        .action((x, c) => c.copy(inputPath = x))
+      arg[String]("<outputPath>")
+        .required()
+        .text("Output path")
+        .action((x, c) => c.copy(outputPath = x))

-    if (args.length == 6) {
-      inputPath = args(0)
-      outputPath = args(1)
-      numTopics = args(2).toInt
-      maxIterations = args(3).toInt
-      optimizer = args(4)
-      maxResultSize = args(5)
-    } else {
-      System.err.println(
-        s"Usage: $LDAExample <INPUT_PATH> <OUTPUT_PATH> <NUM_TOPICS> <MAX_RESULT_SIZE>"
-      )
-      System.exit(1)
     }
-
+    parser.parse(args, defaultParams) match {
+      case Some(params) => run(params)
+      case _ => sys.exit(1)
+    }
+  }
+
+  def run(params: Params): Unit = {
     val conf = new SparkConf()
-      .setAppName("LDA Example")
-      .set("spark.driver.maxResultSize",maxResultSize)
+      .setAppName(s"LDA Example with $params")
+      .set("spark.driver.maxResultSize", params.maxResultSize)
     val sc = new SparkContext(conf)

-    val corpus: RDD[(Long, Vector)] = sc.objectFile(inputPath)
+    val corpus: RDD[(Long, Vector)] = sc.objectFile(params.inputPath)

     // Cluster the documents into numTopics topics using LDA
-    val ldaModel = new LDA().setK(numTopics).setMaxIterations(maxIterations).setOptimizer(optimizer).run(corpus)
+    val ldaModel = new LDA().setK(params.numTopics).setMaxIterations(params.maxIterations).setOptimizer(params.optimizer).run(corpus)

     // Save and load model.
-    ldaModel.save(sc, outputPath)
-    val savedModel = LocalLDAModel.load(sc, outputPath)
+    ldaModel.save(sc, params.outputPath)
+    val savedModel = LocalLDAModel.load(sc, params.outputPath)

     sc.stop()
   }
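For reference, the scopt pattern adopted above folds named flags and positional arguments into an immutable config object, which is what lets run.sh pass --numTopics, --maxIterations, --optimizer, and --maxResultSize in any order before the two paths. A minimal standalone sketch of that pattern, independent of HiBench and assuming scopt 3.x on the classpath, with all names here being illustrative:

```scala
import scopt.OptionParser

// Illustrative config; not HiBench's Params class.
case class DemoConfig(input: String = "", topics: Int = 10)

object ScoptDemo {
  def main(args: Array[String]): Unit = {
    val parser = new OptionParser[DemoConfig]("demo") {
      opt[Int]("numTopics")
        .text("number of topics")
        .action((x, c) => c.copy(topics = x))
      arg[String]("<input>")
        .required()
        .action((x, c) => c.copy(input = x))
    }
    // e.g. args = Array("--numTopics", "20", "hdfs:///tmp/lda-input")
    parser.parse(args, DemoConfig()) match {
      case Some(cfg) => println(s"input=${cfg.input}, topics=${cfg.topics}")
      case None => sys.exit(1)
    }
  }
}
```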
