Merge pull request #505 from jtengyp/lda

Meng, Peng · web-flow · commit a715d886bbb0 · 2017-10-24T00:30:17.000-05:00
Add optimizer and iteration parameters for LDA
diff --git a/bin/functions/hibench_prop_env_mapping.py b/bin/functions/hibench_prop_env_mapping.py
@@ -133,6 +133,8 @@
     NUM_TOPICS_LDA="hibench.lda.num_of_topics",
     DOC_LEN_MIN_LDA="hibench.lda.doc_len_min",
     DOC_LEN_MAX_LDA="hibench.lda.doc_len_max",
+    NUM_ITERATIONS_LDA="hibench.lda.num_iterations",
+    OPTIMIZER_LDA="hibench.lda.optimizer",
     MAXRESULTSIZE_LDA="hibench.lda.maxresultsize",
     # For Pagerank
     PAGERANK_BASE_HDFS="hibench.pagerank.base.hdfs",
diff --git a/bin/workloads/ml/lda/spark/run.sh b/bin/workloads/ml/lda/spark/run.sh
@@ -26,7 +26,7 @@ rmr_hdfs $OUTPUT_HDFS || true
 
 SIZE=`dir_size $INPUT_HDFS`
 START_TIME=`timestamp`
-run_spark_job com.intel.hibench.sparkbench.ml.LDAExample $INPUT_HDFS $OUTPUT_HDFS $NUM_TOPICS_LDA $MAXRESULTSIZE_LDA
+run_spark_job com.intel.hibench.sparkbench.ml.LDAExample $INPUT_HDFS $OUTPUT_HDFS $NUM_TOPICS_LDA $NUM_ITERATIONS_LDA $OPTIMIZER_LDA $MAXRESULTSIZE_LDA
 END_TIME=`timestamp`
 
 gen_report ${START_TIME} ${END_TIME} ${SIZE}
diff --git a/conf/workloads/ml/lda.conf b/conf/workloads/ml/lda.conf
@@ -44,5 +44,8 @@ hibench.lda.doc_len_max                           ${hibench.lda.${hibench.scale.
 hibench.lda.maxresultsize                         ${hibench.lda.${hibench.scale.profile}.maxresultsize}
 hibench.lda.partitions                            ${hibench.default.map.parallelism}
 
+hibench.lda.optimizer                             "online"
+hibench.lda.num_iterations                        10
+
 hibench.workload.input                            ${hibench.hdfs.data.dir}/LDA/Input
 hibench.workload.output                           ${hibench.hdfs.data.dir}/LDA/Output
diff --git a/sparkbench/ml/src/main/scala/com/intel/sparkbench/ml/LDAExample.scala b/sparkbench/ml/src/main/scala/com/intel/sparkbench/ml/LDAExample.scala
@@ -29,13 +29,17 @@ object LDAExample {
     var inputPath = ""
     var outputPath = ""
     var numTopics: Int = 10
+    var maxIterations: Int = 10
+    var optimizer = "online"
     var maxResultSize = "1g"
 
-    if (args.length == 4) {
+    if (args.length == 6) {
       inputPath = args(0)
       outputPath = args(1)
       numTopics = args(2).toInt
-      maxResultSize = args(3)
+      maxIterations = args(3).toInt
+      optimizer = args(4)
+      maxResultSize = args(5)
     } else {
        System.err.println(
          s"Usage: $LDAExample <INPUT_PATH> <OUTPUT_PATH> <NUM_TOPICS> <MAX_RESULT_SIZE>"
@@ -51,7 +55,7 @@ object LDAExample {
     val corpus: RDD[(Long, Vector)] = sc.objectFile(inputPath)
     
     // Cluster the documents into numTopics topics using LDA
-    val ldaModel = new LDA().setK(numTopics).setOptimizer("online").run(corpus)
+    val ldaModel = new LDA().setK(numTopics).setMaxIterations(maxIterations).setOptimizer(optimizer).run(corpus)
 
     // Save and load model.
     ldaModel.save(sc, outputPath)