add optimizer and iteration parameters for LDA
jtengyp committed Oct 24, 2017
1 parent b661a1c commit 5636f7b
Showing 4 changed files with 13 additions and 4 deletions.
2 changes: 2 additions & 0 deletions bin/functions/hibench_prop_env_mapping.py
@@ -132,6 +132,8 @@
     NUM_TOPICS_LDA="hibench.lda.num_of_topics",
     DOC_LEN_MIN_LDA="hibench.lda.doc_len_min",
     DOC_LEN_MAX_LDA="hibench.lda.doc_len_max",
+    NUM_ITERATIONS_LDA="hibench.lda.num_iterations",
+    OPTIMIZER_LDA="hibench.lda.optimizer",
     MAXRESULTSIZE_LDA="hibench.lda.maxresultsize",
     # For Pagerank
     PAGERANK_BASE_HDFS="hibench.pagerank.base.hdfs",
2 changes: 1 addition & 1 deletion bin/workloads/ml/lda/spark/run.sh
@@ -26,7 +26,7 @@ rmr_hdfs $OUTPUT_HDFS || true
 
 SIZE=`dir_size $INPUT_HDFS`
 START_TIME=`timestamp`
-run_spark_job com.intel.hibench.sparkbench.ml.LDAExample $INPUT_HDFS $OUTPUT_HDFS $NUM_TOPICS_LDA $MAXRESULTSIZE_LDA
+run_spark_job com.intel.hibench.sparkbench.ml.LDAExample $INPUT_HDFS $OUTPUT_HDFS $NUM_TOPICS_LDA $NUM_ITERATIONS_LDA $OPTIMIZER_LDA $MAXRESULTSIZE_LDA
 END_TIME=`timestamp`
 
 gen_report ${START_TIME} ${END_TIME} ${SIZE}
3 changes: 3 additions & 0 deletions conf/workloads/ml/lda.conf
@@ -44,5 +44,8 @@ hibench.lda.doc_len_max ${hibench.lda.${hibench.scale.
 hibench.lda.maxresultsize ${hibench.lda.${hibench.scale.profile}.maxresultsize}
 hibench.lda.partitions ${hibench.default.map.parallelism}
 
+hibench.lda.optimizer "online"
+hibench.lda.num_iterations 10
+
 hibench.workload.input ${hibench.hdfs.data.dir}/LDA/Input
 hibench.workload.output ${hibench.hdfs.data.dir}/LDA/Output
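
A note on the new properties: hibench.lda.optimizer is passed through run.sh into LDAExample and handed to MLlib's LDA.setOptimizer, which selects the inference algorithm by name. A minimal sketch of the two names the RDD-based MLlib API accepts (the k and iteration values here are arbitrary, not HiBench defaults):

    import org.apache.spark.mllib.clustering.LDA

    val lda = new LDA().setK(20).setMaxIterations(10)
    lda.setOptimizer("online") // online variational Bayes (OnlineLDAOptimizer); run() yields a LocalLDAModel
    lda.setOptimizer("em")     // expectation-maximization (EMLDAOptimizer); run() yields a DistributedLDAModel

hibench.lda.num_iterations caps the optimizer's iteration count via LDA.setMaxIterations, so larger values trade longer runs for better-converged topics.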
10 changes: 7 additions & 3 deletions sparkbench/ml/src/main/scala/com/intel/hibench/sparkbench/ml/LDAExample.scala
@@ -29,13 +29,17 @@ object LDAExample {
     var inputPath = ""
     var outputPath = ""
     var numTopics: Int = 10
+    var maxIterations: Int = 10
+    var optimizer = "online"
     var maxResultSize = "1g"
 
-    if (args.length == 4) {
+    if (args.length == 6) {
       inputPath = args(0)
       outputPath = args(1)
       numTopics = args(2).toInt
-      maxResultSize = args(3)
+      maxIterations = args(3).toInt
+      optimizer = args(4)
+      maxResultSize = args(5)
     } else {
       System.err.println(
         s"Usage: $LDAExample <INPUT_PATH> <OUTPUT_PATH> <NUM_TOPICS> <MAX_RESULT_SIZE>"
@@ -51,7 +55,7 @@ object LDAExample {
     val corpus: RDD[(Long, Vector)] = sc.objectFile(inputPath)
 
     // Cluster the documents into numTopics topics using LDA
-    val ldaModel = new LDA().setK(numTopics).setOptimizer("online").run(corpus)
+    val ldaModel = new LDA().setK(numTopics).setMaxIterations(maxIterations).setOptimizer(optimizer).run(corpus)
 
     // Save and load model.
     ldaModel.save(sc, outputPath)
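
For context, a minimal, self-contained sketch of the MLlib pipeline the patched LDAExample drives, with the three knobs the workload exposes marked against their HiBench property names. The SparkContext setup and the toy corpus below are illustrative only; the real workload reads its generated corpus back with sc.objectFile, as the diff above shows. The maxResultSize argument is not part of the LDA API itself (it presumably tunes the driver's Spark configuration), which is why it does not appear here.

    import org.apache.spark.{SparkConf, SparkContext}
    import org.apache.spark.mllib.clustering.LDA
    import org.apache.spark.mllib.linalg.{Vector, Vectors}
    import org.apache.spark.rdd.RDD

    object LDASketch {
      def main(args: Array[String]): Unit = {
        val sc = new SparkContext(new SparkConf().setAppName("LDASketch").setMaster("local[*]"))

        // Toy bag-of-words corpus: (document id, term-count vector over a 5-term vocabulary).
        val corpus: RDD[(Long, Vector)] = sc.parallelize(Seq(
          (0L, Vectors.dense(1.0, 2.0, 0.0, 0.0, 1.0)),
          (1L, Vectors.dense(0.0, 0.0, 3.0, 1.0, 0.0)),
          (2L, Vectors.dense(2.0, 1.0, 0.0, 1.0, 0.0))
        ))

        val ldaModel = new LDA()
          .setK(2)                // hibench.lda.num_of_topics
          .setMaxIterations(10)   // hibench.lda.num_iterations (added by this commit)
          .setOptimizer("online") // hibench.lda.optimizer (added by this commit)
          .run(corpus)

        // Print the top-weighted terms of each inferred topic.
        ldaModel.describeTopics(maxTermsPerTopic = 3).zipWithIndex.foreach {
          case ((terms, weights), topic) =>
            println(s"topic $topic: " + terms.zip(weights).mkString(", "))
        }

        sc.stop()
      }
    }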
