Change SVM workload argument parsing to use scopt OptionParser

jtengyp · jtengyp · commit d8ff0cd60fc5 · 2017-11-01T11:10:37.000+08:00
diff --git a/bin/functions/hibench_prop_env_mapping.py b/bin/functions/hibench_prop_env_mapping.py
@@ -96,6 +96,9 @@
     # For SVM
     NUM_EXAMPLES_SVM="hibench.svm.examples",
-    NUM_FEATURES_SVM="hibench.svm.examples",
+    NUM_FEATURES_SVM="hibench.svm.features",
+    NUM_ITERATIONS_SVM="hibench.svm.numIterations",
+    STEPSIZE_SVM="hibench.svm.stepSize",
+    REGPARAM_SVM="hibench.svm.regParam",
     # For ALS
     NUM_USERS_ALS="hibench.als.users",
     NUM_PRODUCTS_ALS="hibench.als.products",
diff --git a/bin/workloads/ml/svm/spark/run.sh b/bin/workloads/ml/svm/spark/run.sh
@@ -26,7 +26,7 @@ rmr_hdfs $OUTPUT_HDFS || true
 
 SIZE=`dir_size $INPUT_HDFS`
 START_TIME=`timestamp`
-run_spark_job com.intel.hibench.sparkbench.ml.SVMWithSGDExample ${INPUT_HDFS}
+run_spark_job com.intel.hibench.sparkbench.ml.SVMWithSGDExample --numIterations ${NUM_ITERATIONS_SVM} --stepSize ${STEPSIZE_SVM} --regParam ${REGPARAM_SVM} ${INPUT_HDFS}
 END_TIME=`timestamp`
 
 gen_report ${START_TIME} ${END_TIME} ${SIZE}
diff --git a/conf/workloads/ml/svm.conf b/conf/workloads/ml/svm.conf
@@ -1,20 +1,24 @@
-hibench.svm.tiny.examples                        1000
-hibench.svm.tiny.features                        1000
-hibench.svm.small.examples                       10000
-hibench.svm.small.features                       10000
-hibench.svm.large.examples                       50000
-hibench.svm.large.features                       100000
-hibench.svm.huge.examples                        120000
-hibench.svm.huge.features                        300000
-hibench.svm.gigantic.examples                    140000
-hibench.svm.gigantic.features                    300000
-hibench.svm.bigdata.examples                     150000
-hibench.svm.bigdata.features                     300000
+hibench.svm.tiny.examples                1000
+hibench.svm.tiny.features                1000
+hibench.svm.small.examples               10000
+hibench.svm.small.features               10000
+hibench.svm.large.examples               50000
+hibench.svm.large.features               100000
+hibench.svm.huge.examples                120000
+hibench.svm.huge.features                300000
+hibench.svm.gigantic.examples            140000
+hibench.svm.gigantic.features            300000
+hibench.svm.bigdata.examples             150000
+hibench.svm.bigdata.features             300000
 
 
 hibench.svm.examples                     ${hibench.svm.${hibench.scale.profile}.examples}
 hibench.svm.features                     ${hibench.svm.${hibench.scale.profile}.features}
 hibench.svm.partitions                   ${hibench.default.map.parallelism}
 
-hibench.workload.input                  ${hibench.hdfs.data.dir}/SVM/Input
-hibench.workload.output                 ${hibench.hdfs.data.dir}/SVM/Output
+hibench.svm.numIterations                100
+hibench.svm.stepSize                     1.0
+hibench.svm.regParam                     0.01
+
+hibench.workload.input                   ${hibench.hdfs.data.dir}/SVM/Input
+hibench.workload.output                  ${hibench.hdfs.data.dir}/SVM/Output
diff --git a/sparkbench/ml/src/main/scala/com/intel/sparkbench/ml/SVMWithSGD.scala b/sparkbench/ml/src/main/scala/com/intel/sparkbench/ml/SVMWithSGD.scala
@@ -0,0 +1,128 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.intel.hibench.sparkbench.ml
+
+import org.apache.spark.{SparkConf, SparkContext}
+import org.apache.spark.mllib.classification.{SVMModel, SVMWithSGD}
+import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
+import org.apache.spark.rdd.RDD
+import org.apache.spark.mllib.regression.LabeledPoint
+
+import scopt.OptionParser
+
+/**
+ * Spark MLlib SVM benchmark driver: trains a linear SVM with SGD on a stored
+ * `RDD[LabeledPoint]` and prints the area under the ROC curve of a held-out split.
+ *
+ * NOTE(review): this object mirrors `SVMWithSGDExample` in the same package, and
+ * `run.sh` launches `SVMWithSGDExample` -- confirm this copy is meant to be added.
+ */
+object SVMWithSGD {
+
+  /**
+   * Command-line parameters of the job.
+   *
+   * @param numIterations iteration count handed to the MLlib trainer
+   * @param stepSize      step size handed to the MLlib trainer
+   * @param regParam      regularization parameter handed to the MLlib trainer
+   * @param dataPath      input location read with `SparkContext.objectFile`; stays `null`
+   *                      until the required `<dataPath>` argument has been parsed
+   */
+  case class Params(
+    numIterations: Int = 100,
+    stepSize: Double = 1.0,
+    regParam: Double = 0.01,
+    dataPath: String = null
+  )
+
+  /**
+   * Parses `args` with scopt and starts the job; exits with status 1 when the
+   * arguments cannot be parsed.
+   */
+  def main(args: Array[String]): Unit = {
+    val defaultParams = Params()
+
+    val parser = new OptionParser[Params]("SVM") {
+      head("SVM: an example of SVM for classification.")
+      opt[Int]("numIterations")
+        .text(s"numIterations, default: ${defaultParams.numIterations}")
+        .action((x, c) => c.copy(numIterations = x))
+      opt[Double]("stepSize")
+        .text(s"stepSize, default: ${defaultParams.stepSize}")
+        .action((x, c) => c.copy(stepSize = x))
+      opt[Double]("regParam")
+        .text(s"regParam, default: ${defaultParams.regParam}")
+        .action((x, c) => c.copy(regParam = x))
+      arg[String]("<dataPath>")
+        .required()
+        .text("data path of SVM")
+        .action((x, c) => c.copy(dataPath = x))
+    }
+    parser.parse(args, defaultParams) match {
+      case Some(params) => run(params)
+      case _ => sys.exit(1)
+    }
+  }
+
+  /**
+   * Trains on 60% of the data, scores the remaining 40% with the raw model output
+   * and prints the area under the ROC curve.
+   *
+   * @param params parsed job parameters
+   */
+  def run(params: Params): Unit = {
+
+    val conf = new SparkConf().setAppName(s"SVM with $params")
+    val sc = new SparkContext(conf)
+
+    val dataPath = params.dataPath
+    val numIterations = params.numIterations
+    val stepSize = params.stepSize
+    val regParam = params.regParam
+
+    val data: RDD[LabeledPoint] = sc.objectFile(dataPath)
+
+    // Split data into training (60%) and test (40%).
+    val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L)
+    val training = splits(0).cache()
+    val test = splits(1)
+
+    // Run training algorithm to build the model. The trainer is referenced by its
+    // fully-qualified name because the simple name `SVMWithSGD` also denotes this
+    // enclosing object, which has no `train` method.
+    val model = org.apache.spark.mllib.classification.SVMWithSGD.train(
+      training, numIterations, stepSize, regParam)
+
+    // Clear the default threshold.
+    model.clearThreshold()
+
+    // Compute raw scores on the test set.
+    val scoreAndLabels = test.map { point =>
+      val score = model.predict(point.features)
+      (score, point.label)
+    }
+
+    // Get evaluation metrics.
+    val metrics = new BinaryClassificationMetrics(scoreAndLabels)
+    val auROC = metrics.areaUnderROC()
+
+    println("Area under ROC = " + auROC)
+
+    sc.stop()
+  }
+}
diff --git a/sparkbench/ml/src/main/scala/com/intel/sparkbench/ml/SVMWithSGDExample.scala b/sparkbench/ml/src/main/scala/com/intel/sparkbench/ml/SVMWithSGDExample.scala
@@ -15,40 +15,89 @@
  * limitations under the License.
  */
 
-// scalastyle:off println
 package com.intel.hibench.sparkbench.ml
 
 import org.apache.spark.{SparkConf, SparkContext}
-// $example on$
 import org.apache.spark.mllib.classification.{SVMModel, SVMWithSGD}
 import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
-import org.apache.spark.mllib.util.MLUtils
 import org.apache.spark.rdd.RDD
 import org.apache.spark.mllib.regression.LabeledPoint
-// $example off$
+
+import scopt.OptionParser
 
+/**
+ * Spark MLlib SVM benchmark driver: trains a linear SVM with SGD on a stored
+ * `RDD[LabeledPoint]` and prints the area under the ROC curve.
+ */
 object SVMWithSGDExample {
 
+  /**
+   * Command-line parameters of the job.
+   *
+   * @param numIterations iteration count handed to the MLlib trainer
+   * @param stepSize      step size handed to the MLlib trainer
+   * @param regParam      regularization parameter handed to the MLlib trainer
+   * @param dataPath      input location read with `SparkContext.objectFile`; stays `null`
+   *                      until the required `<dataPath>` argument has been parsed
+   */
+  case class Params(
+    numIterations: Int = 100,
+    stepSize: Double = 1.0,
+    regParam: Double = 0.01,
+    dataPath: String = null
+  )
+
+  /**
+   * Parses `args` with scopt and starts the job; exits with status 1 when the
+   * arguments cannot be parsed.
+   */
   def main(args: Array[String]): Unit = {
-    var inputPath = ""
-    if (args.length == 1) {
-       inputPath = args(0)
-     }
+    val defaultParams = Params()
+
+    val parser = new OptionParser[Params]("SVM") {
+      head("SVM: an example of SVM for classification.")
+      opt[Int]("numIterations")
+        .text(s"numIterations, default: ${defaultParams.numIterations}")
+        .action((value, current) => current.copy(numIterations = value))
+      opt[Double]("stepSize")
+        .text(s"stepSize, default: ${defaultParams.stepSize}")
+        .action((value, current) => current.copy(stepSize = value))
+      opt[Double]("regParam")
+        .text(s"regParam, default: ${defaultParams.regParam}")
+        .action((value, current) => current.copy(regParam = value))
+      arg[String]("<dataPath>")
+        .required()
+        .text("data path of SVM")
+        .action((value, current) => current.copy(dataPath = value))
+    }
+    parser.parse(args, defaultParams) match {
+      case Some(parsed) => run(parsed)
+      case None => sys.exit(1)
+    }
+  }
 
-    val conf = new SparkConf().setAppName("SVMWithSGDExample")
+  /**
+   * Loads the labeled points from `params.dataPath`, trains an SVM on a 60% split
+   * and prints the area under the ROC curve.
+   *
+   * @param params parsed job parameters
+   */
+  def run(params: Params): Unit = {
+
+    val conf = new SparkConf().setAppName(s"SVM with $params")
     val sc = new SparkContext(conf)
 
-    // $example on$
-    val data: RDD[LabeledPoint] = sc.objectFile(inputPath)
+    val Params(numIterations, stepSize, regParam, dataPath) = params
+
+    val data: RDD[LabeledPoint] = sc.objectFile(dataPath)
 
     // Split data into training (60%) and test (40%).
     val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L)
     val training = splits(0).cache()
     val test = splits(1)
 
     // Run training algorithm to build the model
-    val numIterations = 100
-    val model = SVMWithSGD.train(training, numIterations)
+    val model = SVMWithSGD.train(training, numIterations, stepSize, regParam)
 
     // Clear the default threshold.
     model.clearThreshold()
@@ -65,8 +114,6 @@ object SVMWithSGDExample {
 
     println("Area under ROC = " + auROC)
 
-    // Save and load model
     sc.stop()
   }
 }
-// scalastyle:on println