Merge pull request #518 from jtengyp/linear

Meng, Peng · web-flow · commit feeca9af42b8 · 2017-10-31T21:00:52.000-05:00
Parse Linear Regression arguments with scopt OptionParser (adds --numIterations and --stepSize options)
diff --git a/bin/functions/hibench_prop_env_mapping.py b/bin/functions/hibench_prop_env_mapping.py
@@ -106,12 +106,10 @@
     NUM_ITERATIONS_ALS="hibench.als.num_iterations",
     LAMBDA_ALS="hibench.als.Lambda",
     KYRO_ALS="hibench.als.kyro",
-
     # For PCA
     NUM_EXAMPLES_PCA="hibench.pca.examples",
     NUM_FEATURES_PCA="hibench.pca.features",
     MAX_RESULT_SIZE_PCA ="hibench.pca.maxresultsize",
-
     # For Gradient Boosting Tree
     NUM_EXAMPLES_GBT="hibench.gbt.examples",
     NUM_FEATURES_GBT="hibench.gbt.features",
@@ -136,6 +134,8 @@
     # For Linear Regression
     NUM_EXAMPLES_LINEAR="hibench.linear.examples",
     NUM_FEATURES_LINEAR="hibench.linear.features",
+    NUM_ITERATIONS_LINEAR="hibench.linear.numIterations",
+    STEPSIZE_LINEAR="hibench.linear.stepSize",
     # For LDA
     NUM_DOCUMENTS_LDA="hibench.lda.num_of_documents",
     NUM_VOCABULARY_LDA="hibench.lda.num_of_vocabulary",
diff --git a/bin/workloads/ml/linear/spark/run.sh b/bin/workloads/ml/linear/spark/run.sh
@@ -26,7 +26,7 @@ rmr_hdfs $OUTPUT_HDFS || true
 
 SIZE=`dir_size $INPUT_HDFS`
 START_TIME=`timestamp`
-run_spark_job com.intel.hibench.sparkbench.ml.LinearRegression ${INPUT_HDFS}
+run_spark_job com.intel.hibench.sparkbench.ml.LinearRegression --numIterations $NUM_ITERATIONS_LINEAR --stepSize $STEPSIZE_LINEAR $INPUT_HDFS
 END_TIME=`timestamp`
 
 gen_report ${START_TIME} ${END_TIME} ${SIZE}
diff --git a/conf/workloads/ml/linear.conf b/conf/workloads/ml/linear.conf
@@ -1,19 +1,22 @@
-hibench.linear.tiny.examples                        50000
-hibench.linear.tiny.features                        10000
-hibench.linear.small.examples                       100000
-hibench.linear.small.features                       20000
-hibench.linear.large.examples                       200000
-hibench.linear.large.features                       30000
-hibench.linear.huge.examples                        300000
-hibench.linear.huge.features                        50000
-hibench.linear.gigantic.examples                    500000
-hibench.linear.gigantic.features                    80000
-hibench.linear.bigdata.examples                     1000000
-hibench.linear.bigdata.features                     100000
+hibench.linear.tiny.examples                50000
+hibench.linear.tiny.features                10000
+hibench.linear.small.examples               100000
+hibench.linear.small.features               20000
+hibench.linear.large.examples               200000
+hibench.linear.large.features               30000
+hibench.linear.huge.examples                300000
+hibench.linear.huge.features                50000
+hibench.linear.gigantic.examples            500000
+hibench.linear.gigantic.features            80000
+hibench.linear.bigdata.examples             1000000
+hibench.linear.bigdata.features             100000
 
 hibench.linear.examples                     ${hibench.linear.${hibench.scale.profile}.examples}
 hibench.linear.features                     ${hibench.linear.${hibench.scale.profile}.features}
 hibench.linear.partitions                   ${hibench.default.map.parallelism}
 
+hibench.linear.numIterations                100
+hibench.linear.stepSize                     0.00001
+
 hibench.workload.input                      ${hibench.hdfs.data.dir}/Linear/Input
 hibench.workload.output                     ${hibench.hdfs.data.dir}/Linear/Output
diff --git a/sparkbench/ml/src/main/scala/com/intel/sparkbench/ml/LinearRegression.scala b/sparkbench/ml/src/main/scala/com/intel/sparkbench/ml/LinearRegression.scala
@@ -24,24 +24,50 @@ import org.apache.spark.mllib.regression.LinearRegressionModel
 import org.apache.spark.mllib.regression.LinearRegressionWithSGD
 import org.apache.spark.rdd.RDD
 
+import scopt.OptionParser
+
 object LinearRegression {
 
+  case class Params(
+      dataPath: String = null,
+      numIterations: Int = 100,
+      stepSize: Double = 0.00000001
+  )
+
   def main(args: Array[String]): Unit = {
-    var inputPath = ""
+    val defaultParams = Params()
 
-    if (args.length == 1) {
-      inputPath = args(0)
+    val parser = new OptionParser[Params]("Linear"){
+      head("Linear Regression: an example of linear regression with SGD optimizer")
+      opt[Int]("numIterations")
+        .text(s"numIterations, default: ${defaultParams.numIterations}")
+        .action((x,c) => c.copy(numIterations = x))
+      opt[Double]("stepSize")
+        .text(s"stepSize, default: ${defaultParams.stepSize}")
+        .action((x,c) => c.copy(stepSize = x))
+      arg[String]("<dataPath>")
+        .required()
+        .text("Input path for data")
+        .action((x,c) => c.copy(dataPath = x))
     }
-
-    val conf = new SparkConf().setAppName("LinearRegressionWithSGD")
+    parser.parse(args, defaultParams) match {
+      case Some(params) => run(params)
+      case _ => sys.exit(1)
+    }
+  }
+   
+  def run(params: Params): Unit = {
+    val conf = new SparkConf().setAppName(s"LinearRegressionWithSGD with $params")
     val sc = new SparkContext(conf)
+    
+    val dataPath = params.dataPath
+    val numIterations = params.numIterations
+    val stepSize = params.stepSize
 
-    // Load training data in LIBSVM format.
-    val data: RDD[LabeledPoint] = sc.objectFile(inputPath)
+    // Load training data in LabeledPoint format.
+    val data: RDD[LabeledPoint] = sc.objectFile(dataPath)
     
     // Building the model
-    val numIterations = 100
-    val stepSize = 0.00000001
     val model = LinearRegressionWithSGD.train(data, numIterations, stepSize)
 
     // Evaluate model on training examples and compute training error