Skip to content
This repository was archived by the owner on Dec 15, 2025. It is now read-only.

Commit a715d88

Browse files
author
Meng, Peng
authored
Merge pull request #505 from jtengyp/lda
Add optimizer and iteration parameters for LDA
2 parents a1dca4b + 5636f7b commit a715d88

4 files changed

Lines changed: 13 additions & 4 deletions

File tree

bin/functions/hibench_prop_env_mapping.py

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -133,6 +133,8 @@
133133
NUM_TOPICS_LDA="hibench.lda.num_of_topics",
134134
DOC_LEN_MIN_LDA="hibench.lda.doc_len_min",
135135
DOC_LEN_MAX_LDA="hibench.lda.doc_len_max",
136+
NUM_ITERATIONS_LDA="hibench.lda.num_iterations",
137+
OPTIMIZER_LDA="hibench.lda.optimizer",
136138
MAXRESULTSIZE_LDA="hibench.lda.maxresultsize",
137139
# For Pagerank
138140
PAGERANK_BASE_HDFS="hibench.pagerank.base.hdfs",

bin/workloads/ml/lda/spark/run.sh

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -26,7 +26,7 @@ rmr_hdfs $OUTPUT_HDFS || true
2626

2727
SIZE=`dir_size $INPUT_HDFS`
2828
START_TIME=`timestamp`
29-
run_spark_job com.intel.hibench.sparkbench.ml.LDAExample $INPUT_HDFS $OUTPUT_HDFS $NUM_TOPICS_LDA $MAXRESULTSIZE_LDA
29+
run_spark_job com.intel.hibench.sparkbench.ml.LDAExample $INPUT_HDFS $OUTPUT_HDFS $NUM_TOPICS_LDA $NUM_ITERATIONS_LDA $OPTIMIZER_LDA $MAXRESULTSIZE_LDA
3030
END_TIME=`timestamp`
3131

3232
gen_report ${START_TIME} ${END_TIME} ${SIZE}

conf/workloads/ml/lda.conf

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -44,5 +44,8 @@ hibench.lda.doc_len_max ${hibench.lda.${hibench.scale.
4444
hibench.lda.maxresultsize ${hibench.lda.${hibench.scale.profile}.maxresultsize}
4545
hibench.lda.partitions ${hibench.default.map.parallelism}
4646

47+
hibench.lda.optimizer "online"
48+
hibench.lda.num_iterations 10
49+
4750
hibench.workload.input ${hibench.hdfs.data.dir}/LDA/Input
4851
hibench.workload.output ${hibench.hdfs.data.dir}/LDA/Output

sparkbench/ml/src/main/scala/com/intel/sparkbench/ml/LDAExample.scala

Lines changed: 7 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -29,13 +29,17 @@ object LDAExample {
2929
var inputPath = ""
3030
var outputPath = ""
3131
var numTopics: Int = 10
32+
var maxIterations: Int = 10
33+
var optimizer = "online"
3234
var maxResultSize = "1g"
3335

34-
if (args.length == 4) {
36+
if (args.length == 6) {
3537
inputPath = args(0)
3638
outputPath = args(1)
3739
numTopics = args(2).toInt
38-
maxResultSize = args(3)
40+
maxIterations = args(3).toInt
41+
optimizer = args(4)
42+
maxResultSize = args(5)
3943
} else {
4044
System.err.println(
4145
s"Usage: $LDAExample <INPUT_PATH> <OUTPUT_PATH> <NUM_TOPICS> <MAX_RESULT_SIZE>"
@@ -51,7 +55,7 @@ object LDAExample {
5155
val corpus: RDD[(Long, Vector)] = sc.objectFile(inputPath)
5256

5357
// Cluster the documents into numTopics topics using LDA
54-
val ldaModel = new LDA().setK(numTopics).setOptimizer("online").run(corpus)
58+
val ldaModel = new LDA().setK(numTopics).setMaxIterations(maxIterations).setOptimizer(optimizer).run(corpus)
5559

5660
// Save and load model.
5761
ldaModel.save(sc, outputPath)

0 commit comments

Comments
 (0)