Skip to content
This repository was archived by the owner on Dec 15, 2025. It is now read-only.

Commit 25b298c

Browse files
committed
Change the SVD example's command-line arguments to use scopt's OptionParser
1 parent af6bd6c commit 25b298c

4 files changed

Lines changed: 76 additions & 33 deletions

File tree

bin/functions/hibench_prop_env_mapping.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,8 @@
132132
# For SVD
133133
NUM_EXAMPLES_SVD="hibench.svd.examples",
134134
NUM_FEATURES_SVD="hibench.svd.features",
135+
NUM_SINGULAR_VALUES_SVD="hibench.svd.singularvalues",
136+
COMPUTEU_SVD="hibench.svd.computeU",
135137
MAXRESULTSIZE_SVD="hibench.svd.maxresultsize",
136138
# For Linear Regression
137139
NUM_EXAMPLES_LINEAR="hibench.linear.examples",

bin/workloads/ml/svd/spark/run.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ rmr_hdfs $OUTPUT_HDFS || true
2626

2727
SIZE=`dir_size $INPUT_HDFS`
2828
START_TIME=`timestamp`
29-
run_spark_job com.intel.hibench.sparkbench.ml.SVDExample $INPUT_HDFS $NUM_FEATURES_SVD $MAXRESULTSIZE_SVD
29+
run_spark_job com.intel.hibench.sparkbench.ml.SVDExample --numFeatures $NUM_FEATURES_SVD --numSingularValues $NUM_SINGULAR_VALUES_SVD --computeU $COMPUTEU_SVD --maxResultSize $MAXRESULTSIZE_SVD $INPUT_HDFS
3030
END_TIME=`timestamp`
3131

3232
gen_report ${START_TIME} ${END_TIME} ${SIZE}

conf/workloads/ml/svd.conf

Lines changed: 27 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,36 @@
1-
hibench.svd.tiny.examples 100
2-
hibench.svd.tiny.features 1000
3-
hibench.svd.tiny.maxresultsize "1g"
4-
hibench.svd.small.examples 1000
5-
hibench.svd.small.features 2000
6-
hibench.svd.small.maxresultsize "1g"
7-
hibench.svd.large.examples 2000
8-
hibench.svd.large.features 4000
9-
hibench.svd.large.maxresultsize "1g"
10-
hibench.svd.huge.examples 5000
11-
hibench.svd.huge.features 5000
12-
hibench.svd.huge.maxresultsize "4g"
13-
hibench.svd.gigantic.examples 6000
14-
hibench.svd.gigantic.features 6000
15-
hibench.svd.gigantic.maxresultsize "4g"
16-
hibench.svd.bigdata.examples 7000
17-
hibench.svd.bigdata.features 7000
18-
hibench.svd.bigdata.maxresultsize "4g"
1+
hibench.svd.tiny.examples 100
2+
hibench.svd.tiny.features 1000
3+
hibench.svd.tiny.singularvalues 800
4+
hibench.svd.tiny.maxresultsize "1g"
5+
hibench.svd.small.examples 1000
6+
hibench.svd.small.features 2000
7+
hibench.svd.small.singularvalues 1500
8+
hibench.svd.small.maxresultsize "1g"
9+
hibench.svd.large.examples 2000
10+
hibench.svd.large.features 4000
11+
hibench.svd.large.singularvalues 3000
12+
hibench.svd.large.maxresultsize "1g"
13+
hibench.svd.huge.examples 5000
14+
hibench.svd.huge.features 5000
15+
hibench.svd.huge.singularvalues 4000
16+
hibench.svd.huge.maxresultsize "4g"
17+
hibench.svd.gigantic.examples 6000
18+
hibench.svd.gigantic.features 6000
19+
hibench.svd.gigantic.singularvalues 5000
20+
hibench.svd.gigantic.maxresultsize "4g"
21+
hibench.svd.bigdata.examples 7000
22+
hibench.svd.bigdata.features 7000
23+
hibench.svd.bigdata.singularvalues 6000
24+
hibench.svd.bigdata.maxresultsize "4g"
1925

2026

2127
hibench.svd.examples ${hibench.svd.${hibench.scale.profile}.examples}
2228
hibench.svd.features ${hibench.svd.${hibench.scale.profile}.features}
29+
hibench.svd.singularvalues ${hibench.svd.${hibench.scale.profile}.singularvalues}
2330
hibench.svd.maxresultsize ${hibench.svd.${hibench.scale.profile}.maxresultsize}
2431
hibench.svd.partitions ${hibench.default.map.parallelism}
2532

33+
hibench.svd.computeU true
34+
2635
hibench.workload.input ${hibench.hdfs.data.dir}/SVD/Input
2736
hibench.workload.output ${hibench.hdfs.data.dir}/SVD/Output

sparkbench/ml/src/main/scala/com/intel/sparkbench/ml/SVDExample.scala

Lines changed: 46 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -22,32 +22,64 @@ import org.apache.spark.SparkContext
2222
import org.apache.spark.mllib.linalg.Matrix
2323
import org.apache.spark.mllib.linalg.SingularValueDecomposition
2424
import org.apache.spark.mllib.linalg.Vector
25-
import org.apache.spark.mllib.linalg.Vectors
2625
import org.apache.spark.mllib.linalg.distributed.RowMatrix
2726
import org.apache.spark.rdd.RDD
2827

28+
import scopt.OptionParser
29+
2930
object SVDExample {
3031

32+
case class Params(
33+
numFeatures: Int = 0,
34+
numSingularValues: Int = 0,
35+
computeU: Boolean = true,
36+
maxResultSize: String = "1g",
37+
dataPath: String = null
38+
)
39+
3140
def main(args: Array[String]): Unit = {
32-
var inputPath = ""
33-
var numFeatures = 0
34-
var maxResultSize = "1g"
35-
36-
if (args.length == 3) {
37-
inputPath = args(0)
38-
numFeatures = args(1).toInt
39-
maxResultSize = args(2)
41+
val defaultParams = Params()
42+
val parser = new OptionParser[Params]("SVD") {
43+
head("SVD: an example of SVD for matrix decomposition.")
44+
opt[Int]("numFeatures")
45+
.text(s"numFeatures, default: ${defaultParams.numFeatures}")
46+
.action((x,c) => c.copy(numFeatures = x))
47+
opt[Int]("numSingularValues")
48+
.text(s"numSingularValues, default: ${defaultParams.numSingularValues}")
49+
.action((x,c) => c.copy(numSingularValues = x))
50+
opt[Boolean]("computeU")
51+
.text(s"computeU, default: ${defaultParams.computeU}")
52+
.action((x,c) => c.copy(computeU = x))
53+
opt[String]("maxResultSize")
54+
.text(s"maxResultSize, default: ${defaultParams.maxResultSize}")
55+
.action((x,c) => c.copy(maxResultSize = x))
56+
arg[String]("<dataPath>")
57+
.required()
58+
.text("data path of SVD")
59+
.action((x,c) => c.copy(dataPath = x))
4060
}
61+
parser.parse(args, defaultParams) match {
62+
case Some(params) => run(params)
63+
case _ => sys.exit(1)
64+
}
65+
}
66+
67+
def run(params: Params): Unit = {
4168

4269
val conf = new SparkConf()
43-
.setAppName("SVDExample")
44-
.set("spark.driver.maxResultSize",maxResultSize)
70+
.setAppName(s"SVD with $params")
71+
.set("spark.driver.maxResultSize", params.maxResultSize)
4572
val sc = new SparkContext(conf)
4673

47-
val dataRDD: RDD[Vector] = sc.objectFile(inputPath)
48-
val mat: RowMatrix = new RowMatrix(dataRDD)
74+
val dataPath = params.dataPath
75+
val numFeatures = params.numFeatures
76+
val numSingularValues = params.numSingularValues
77+
val computeU = params.computeU
78+
79+
val data: RDD[Vector] = sc.objectFile(dataPath)
80+
val mat: RowMatrix = new RowMatrix(data)
4981

50-
val svd: SingularValueDecomposition[RowMatrix, Matrix] = mat.computeSVD(numFeatures-1, computeU = true)
82+
val svd: SingularValueDecomposition[RowMatrix, Matrix] = mat.computeSVD(numSingularValues, computeU)
5183
val U: RowMatrix = svd.U // The U factor is a RowMatrix.
5284
val s: Vector = svd.s // The singular values are stored in a local dense vector.
5385
val V: Matrix = svd.V // The V factor is a local dense matrix.

0 commit comments

Comments
 (0)