add optimizer and iteration parameters for LDA
jtengyp committed Oct 24, 2017
1 parent b661a1c commit 5636f7b
Showing 4 changed files with 13 additions and 4 deletions.
2 changes: 2 additions & 0 deletions bin/functions/hibench_prop_env_mapping.py
@@ -132,6 +132,8 @@
     NUM_TOPICS_LDA="hibench.lda.num_of_topics",
     DOC_LEN_MIN_LDA="hibench.lda.doc_len_min",
     DOC_LEN_MAX_LDA="hibench.lda.doc_len_max",
+    NUM_ITERATIONS_LDA="hibench.lda.num_iterations",
+    OPTIMIZER_LDA="hibench.lda.optimizer",
     MAXRESULTSIZE_LDA="hibench.lda.maxresultsize",
     # For Pagerank
     PAGERANK_BASE_HDFS="hibench.pagerank.base.hdfs",
2 changes: 1 addition & 1 deletion bin/workloads/ml/lda/spark/run.sh
@@ -26,7 +26,7 @@ rmr_hdfs $OUTPUT_HDFS || true
 
 SIZE=`dir_size $INPUT_HDFS`
 START_TIME=`timestamp`
-run_spark_job com.intel.hibench.sparkbench.ml.LDAExample $INPUT_HDFS $OUTPUT_HDFS $NUM_TOPICS_LDA $MAXRESULTSIZE_LDA
+run_spark_job com.intel.hibench.sparkbench.ml.LDAExample $INPUT_HDFS $OUTPUT_HDFS $NUM_TOPICS_LDA $NUM_ITERATIONS_LDA $OPTIMIZER_LDA $MAXRESULTSIZE_LDA
 END_TIME=`timestamp`
 
 gen_report ${START_TIME} ${END_TIME} ${SIZE}
3 changes: 3 additions & 0 deletions conf/workloads/ml/lda.conf
@@ -44,5 +44,8 @@ hibench.lda.doc_len_max ${hibench.lda.${hibench.scale.
 hibench.lda.maxresultsize ${hibench.lda.${hibench.scale.profile}.maxresultsize}
 hibench.lda.partitions ${hibench.default.map.parallelism}
 
+hibench.lda.optimizer "online"
+hibench.lda.num_iterations 10
+
 hibench.workload.input ${hibench.hdfs.data.dir}/LDA/Input
 hibench.workload.output ${hibench.hdfs.data.dir}/LDA/Output
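
A note on the new properties: hibench.lda.optimizer is passed through run.sh into LDAExample and handed to MLlib's LDA.setOptimizer, which selects the inference algorithm by name. A minimal sketch of the two names the RDD-based MLlib API accepts (the k and iteration values here are arbitrary, not HiBench defaults):

    import org.apache.spark.mllib.clustering.LDA

    val lda = new LDA().setK(20).setMaxIterations(10)
    lda.setOptimizer("online") // online variational Bayes (OnlineLDAOptimizer); run() yields a LocalLDAModel
    lda.setOptimizer("em")     // expectation-maximization (EMLDAOptimizer); run() yields a DistributedLDAModel

hibench.lda.num_iterations caps the optimizer's iteration count via LDA.setMaxIterations, so larger values trade longer runs for better-converged topics.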
10 changes: 7 additions & 3 deletions sparkbench/ml/src/main/scala/com/intel/hibench/sparkbench/ml/LDAExample.scala
@@ -29,13 +29,17 @@ object LDAExample {
     var inputPath = ""
     var outputPath = ""
     var numTopics: Int = 10
+    var maxIterations: Int = 10
+    var optimizer = "online"
     var maxResultSize = "1g"
 
-    if (args.length == 4) {
+    if (args.length == 6) {
       inputPath = args(0)
       outputPath = args(1)
       numTopics = args(2).toInt
-      maxResultSize = args(3)
+      maxIterations = args(3).toInt
+      optimizer = args(4)
+      maxResultSize = args(5)
     } else {
       System.err.println(
         s"Usage: $LDAExample <INPUT_PATH> <OUTPUT_PATH> <NUM_TOPICS> <MAX_RESULT_SIZE>"
@@ -51,7 +55,7 @@ object LDAExample {
     val corpus: RDD[(Long, Vector)] = sc.objectFile(inputPath)
 
     // Cluster the documents into numTopics topics using LDA
-    val ldaModel = new LDA().setK(numTopics).setOptimizer("online").run(corpus)
+    val ldaModel = new LDA().setK(numTopics).setMaxIterations(maxIterations).setOptimizer(optimizer).run(corpus)
 
     // Save and load model.
     ldaModel.save(sc, outputPath)
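
For context, a minimal, self-contained sketch of the MLlib pipeline the patched LDAExample drives, with the three knobs the workload exposes marked against their HiBench property names. The SparkContext setup and the toy corpus below are illustrative only; the real workload reads its generated corpus back with sc.objectFile, as the diff above shows. The maxResultSize argument is not part of the LDA API itself (it presumably tunes the driver's Spark configuration), which is why it does not appear here.

    import org.apache.spark.{SparkConf, SparkContext}
    import org.apache.spark.mllib.clustering.LDA
    import org.apache.spark.mllib.linalg.{Vector, Vectors}
    import org.apache.spark.rdd.RDD

    object LDASketch {
      def main(args: Array[String]): Unit = {
        val sc = new SparkContext(new SparkConf().setAppName("LDASketch").setMaster("local[*]"))

        // Toy bag-of-words corpus: (document id, term-count vector over a 5-term vocabulary).
        val corpus: RDD[(Long, Vector)] = sc.parallelize(Seq(
          (0L, Vectors.dense(1.0, 2.0, 0.0, 0.0, 1.0)),
          (1L, Vectors.dense(0.0, 0.0, 3.0, 1.0, 0.0)),
          (2L, Vectors.dense(2.0, 1.0, 0.0, 1.0, 0.0))
        ))

        val ldaModel = new LDA()
          .setK(2)                // hibench.lda.num_of_topics
          .setMaxIterations(10)   // hibench.lda.num_iterations (added by this commit)
          .setOptimizer("online") // hibench.lda.optimizer (added by this commit)
          .run(corpus)

        // Print the top-weighted terms of each inferred topic.
        ldaModel.describeTopics(maxTermsPerTopic = 3).zipWithIndex.foreach {
          case ((terms, weights), topic) =>
            println(s"topic $topic: " + terms.zip(weights).mkString(", "))
        }

        sc.stop()
      }
    }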
