Merge branch 'master' into als

maniaabdi · Oct 19, 2017 · 38cd672 · 38cd672
2 parents 9127833 + f2c2a9b
commit 38cd672
Show file tree

Hide file tree

Showing 29 changed files with 1,147 additions and 6 deletions.
diff --git a/bin/functions/hibench_prop_env_mapping.py b/bin/functions/hibench_prop_env_mapping.py
@@ -93,6 +93,9 @@
  # For Logistic Regression
  NUM_EXAMPLES_LR="hibench.lr.examples",
  NUM_FEATURES_LR="hibench.lr.features",
+ # For SVM
+ NUM_EXAMPLES_SVM="hibench.svm.examples",
+ NUM_FEATURES_SVM="hibench.svm.features",
  # For ALS
  NUM_USERS_ALS="hibench.als.users",
  NUM_PRODUCTS_ALS="hibench.als.products",
@@ -103,6 +106,12 @@
  NUM_ITERATIONS_ALS="hibench.als.num_iterations",
  LAMBDA_ALS="hibench.als.Lambda",
  KYRO_ALS="hibench.als.kyro",
+
+ # For PCA
+ NUM_EXAMPLES_PCA="hibench.pca.examples",
+ NUM_FEATURES_PCA="hibench.pca.features",
+ MAX_RESULT_SIZE_PCA="hibench.pca.maxresultsize",
+
  # For Gradient Boosting Tree
  NUM_EXAMPLES_GBT="hibench.gbt.examples",
  NUM_FEATURES_GBT="hibench.gbt.features",
@@ -115,6 +124,16 @@
  NUM_EXAMPLES_SVD="hibench.svd.examples",
  NUM_FEATURES_SVD="hibench.svd.features",
  MAXRESULTSIZE_SVD="hibench.svd.maxresultsize",
+ # For Linear Regression
+ NUM_EXAMPLES_LINEAR="hibench.linear.examples",
+ NUM_FEATURES_LINEAR="hibench.linear.features",
+ # For LDA
+ NUM_DOCUMENTS_LDA="hibench.lda.num_of_documents",
+ NUM_VOCABULARY_LDA="hibench.lda.num_of_vocabulary",
+ NUM_TOPICS_LDA="hibench.lda.num_of_topics",
+ DOC_LEN_MIN_LDA="hibench.lda.doc_len_min",
+ DOC_LEN_MAX_LDA="hibench.lda.doc_len_max",
+ MAXRESULTSIZE_LDA="hibench.lda.maxresultsize",
  # For Pagerank
  PAGERANK_BASE_HDFS="hibench.pagerank.base.hdfs",
  PAGERANK_INPUT="hibench.pagerank.dir.name.input",

diff --git a/bin/run_all.sh b/bin/run_all.sh
@@ -59,6 +59,12 @@ for benchmark in `cat $root_dir/conf/benchmarks.lst`; do
  if [ $benchmark == "ml/als" ] && [ $framework == "hadoop" ]; then
  continue
  fi
+ if [ $benchmark == "ml/svm" ] && [ $framework == "hadoop" ]; then
+ continue
+ fi
+ if [ $benchmark == "ml/pca" ] && [ $framework == "hadoop" ]; then
+ continue
+ fi
  if [ $benchmark == "ml/gbt" ] && [ $framework == "hadoop" ]; then
  continue
  fi
@@ -67,8 +73,14 @@ for benchmark in `cat $root_dir/conf/benchmarks.lst`; do
  fi 
  if [ $benchmark == "ml/svd" ] && [ $framework == "hadoop" ]; then
  continue
+ fi 
+ if [ $benchmark == "ml/linear" ] && [ $framework == "hadoop" ]; then
+ continue
+ fi
+ if [ $benchmark == "ml/lda" ] && [ $framework == "hadoop" ]; then
+ continue
  fi
- 
+
  echo -e "${UYellow}${BYellow}Run ${Yellow}${UYellow}${benchmark}/${framework}${Color_Off}"
  echo -e "${BCyan}Exec script: ${Cyan}$WORKLOAD/${framework}/run.sh${Color_Off}"
  $WORKLOAD/${framework}/run.sh

diff --git a/bin/workloads/ml/lda/prepare/prepare.sh b/bin/workloads/ml/lda/prepare/prepare.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+current_dir=`dirname "$0"`
+current_dir=`cd "$current_dir"; pwd`
+root_dir=${current_dir}/../../../../../
+workload_config=${root_dir}/conf/workloads/ml/lda.conf
+. "${root_dir}/bin/functions/load_bench_config.sh"
+
+enter_bench LDADataPrepare ${workload_config} ${current_dir}
+show_bannar start
+
+rmr_hdfs $INPUT_HDFS || true
+START_TIME=`timestamp`
+
+run_spark_job com.intel.hibench.sparkbench.ml.LDADataGenerator $INPUT_HDFS $NUM_DOCUMENTS_LDA $NUM_VOCABULARY_LDA $DOC_LEN_MIN_LDA $DOC_LEN_MAX_LDA
+
+END_TIME=`timestamp`
+
+show_bannar finish
+leave_bench
+
diff --git a/bin/workloads/ml/lda/spark/run.sh b/bin/workloads/ml/lda/spark/run.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+current_dir=`dirname "$0"`
+current_dir=`cd "$current_dir"; pwd`
+root_dir=${current_dir}/../../../../../
+workload_config=${root_dir}/conf/workloads/ml/lda.conf
+. "${root_dir}/bin/functions/load_bench_config.sh"
+
+enter_bench LDA ${workload_config} ${current_dir}
+show_bannar start
+
+rmr_hdfs $OUTPUT_HDFS || true
+
+SIZE=`dir_size $INPUT_HDFS`
+START_TIME=`timestamp`
+run_spark_job com.intel.hibench.sparkbench.ml.LDAExample $INPUT_HDFS $OUTPUT_HDFS $NUM_TOPICS_LDA $MAXRESULTSIZE_LDA
+END_TIME=`timestamp`
+
+gen_report ${START_TIME} ${END_TIME} ${SIZE}
+show_bannar finish
+leave_bench
diff --git a/bin/workloads/ml/linear/prepare/prepare.sh b/bin/workloads/ml/linear/prepare/prepare.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+current_dir=`dirname "$0"`
+current_dir=`cd "$current_dir"; pwd`
+root_dir=${current_dir}/../../../../../
+workload_config=${root_dir}/conf/workloads/ml/linear.conf
+. "${root_dir}/bin/functions/load_bench_config.sh"
+
+enter_bench LinearRegressionDataPrepare ${workload_config} ${current_dir}
+show_bannar start
+
+rmr_hdfs $INPUT_HDFS || true
+START_TIME=`timestamp`
+
+run_spark_job com.intel.hibench.sparkbench.ml.LinearRegressionDataGenerator $INPUT_HDFS $NUM_EXAMPLES_LINEAR $NUM_FEATURES_LINEAR 
+
+END_TIME=`timestamp`
+
+show_bannar finish
+leave_bench
+
diff --git a/bin/workloads/ml/linear/spark/run.sh b/bin/workloads/ml/linear/spark/run.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+current_dir=`dirname "$0"`
+current_dir=`cd "$current_dir"; pwd`
+root_dir=${current_dir}/../../../../../
+workload_config=${root_dir}/conf/workloads/ml/linear.conf
+. "${root_dir}/bin/functions/load_bench_config.sh"
+
+enter_bench LinearRegression ${workload_config} ${current_dir}
+show_bannar start
+
+rmr_hdfs $OUTPUT_HDFS || true
+
+SIZE=`dir_size $INPUT_HDFS`
+START_TIME=`timestamp`
+run_spark_job com.intel.hibench.sparkbench.ml.LinearRegression ${INPUT_HDFS}
+END_TIME=`timestamp`
+
+gen_report ${START_TIME} ${END_TIME} ${SIZE}
+show_bannar finish
+leave_bench
diff --git a/bin/workloads/ml/pca/prepare/prepare.sh b/bin/workloads/ml/pca/prepare/prepare.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+current_dir=`dirname "$0"`
+current_dir=`cd "$current_dir"; pwd`
+root_dir=${current_dir}/../../../../../
+workload_config=${root_dir}/conf/workloads/ml/pca.conf
+. "${root_dir}/bin/functions/load_bench_config.sh"
+
+enter_bench PCADataPrepare ${workload_config} ${current_dir}
+show_bannar start
+
+rmr_hdfs $INPUT_HDFS || true
+START_TIME=`timestamp`
+
+run_spark_job com.intel.hibench.sparkbench.ml.PCADataGenerator $INPUT_HDFS $NUM_EXAMPLES_PCA $NUM_FEATURES_PCA
+
+END_TIME=`timestamp`
+
+show_bannar finish
+leave_bench
+
diff --git a/bin/workloads/ml/pca/spark/run.sh b/bin/workloads/ml/pca/spark/run.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+current_dir=`dirname "$0"`
+current_dir=`cd "$current_dir"; pwd`
+root_dir=${current_dir}/../../../../../
+workload_config=${root_dir}/conf/workloads/ml/pca.conf
+. "${root_dir}/bin/functions/load_bench_config.sh"
+
+enter_bench PCA ${workload_config} ${current_dir}
+show_bannar start
+
+rmr_hdfs $OUTPUT_HDFS || true
+
+SIZE=`dir_size $INPUT_HDFS`
+START_TIME=`timestamp`
+run_spark_job com.intel.hibench.sparkbench.ml.PCAExample ${INPUT_HDFS} ${MAX_RESULT_SIZE_PCA}
+END_TIME=`timestamp`
+
+gen_report ${START_TIME} ${END_TIME} ${SIZE}
+show_bannar finish
+leave_bench
diff --git a/bin/workloads/ml/svm/prepare/prepare.sh b/bin/workloads/ml/svm/prepare/prepare.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+current_dir=`dirname "$0"`
+current_dir=`cd "$current_dir"; pwd`
+root_dir=${current_dir}/../../../../../
+workload_config=${root_dir}/conf/workloads/ml/svm.conf
+. "${root_dir}/bin/functions/load_bench_config.sh"
+
+enter_bench SVMDataPrepare ${workload_config} ${current_dir}
+show_bannar start
+
+rmr_hdfs $INPUT_HDFS || true
+START_TIME=`timestamp`
+
+run_spark_job com.intel.hibench.sparkbench.ml.SVMDataGenerator $INPUT_HDFS $NUM_EXAMPLES_SVM $NUM_FEATURES_SVM 
+
+END_TIME=`timestamp`
+
+show_bannar finish
+leave_bench
+
diff --git a/bin/workloads/ml/svm/spark/run.sh b/bin/workloads/ml/svm/spark/run.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+current_dir=`dirname "$0"`
+current_dir=`cd "$current_dir"; pwd`
+root_dir=${current_dir}/../../../../../
+workload_config=${root_dir}/conf/workloads/ml/svm.conf
+. "${root_dir}/bin/functions/load_bench_config.sh"
+
+enter_bench SVM ${workload_config} ${current_dir}
+show_bannar start
+
+rmr_hdfs $OUTPUT_HDFS || true
+
+SIZE=`dir_size $INPUT_HDFS`
+START_TIME=`timestamp`
+run_spark_job com.intel.hibench.sparkbench.ml.SVMWithSGDExample ${INPUT_HDFS}
+END_TIME=`timestamp`
+
+gen_report ${START_TIME} ${END_TIME} ${SIZE}
+show_bannar finish
+leave_bench
diff --git a/conf/benchmarks.lst b/conf/benchmarks.lst
@@ -15,8 +15,12 @@ ml.bayes
 ml.kmeans
 ml.lr
 ml.als
+ml.pca
 ml.gbt
 ml.rf
 ml.svd
+ml.linear
+ml.lda
+ml.svm
 
-graph.nweight
+graph.nweight
diff --git a/conf/hibench.conf b/conf/hibench.conf
@@ -1,12 +1,11 @@
 # Data scale profile. Available values are tiny, small, large, huge, gigantic and bigdata.
 # The definition of these profiles can be found in the workload's conf file i.e. conf/workloads/micro/wordcount.conf
-hibench.scale.profile tiny
-
+hibench.scale.profile tiny 
 # Mapper number in hadoop, partition number in Spark
 hibench.default.map.parallelism 8
 
 # Reducer number in hadoop, shuffle partition number in Spark
-hibench.default.shuffle.parallelism 8
+hibench.default.shuffle.parallelism 8 
 
 
 #======================================================