Merge pull request Intel-bigdata#2 from intel-hadoop/master

Update to the master
maniaabdi · Oct 24, 2017 · 616c6e6 · 616c6e6
2 parents cd17de9 + a1dca4b
commit 616c6e6
Show file tree

Hide file tree

Showing 46 changed files with 1,332 additions and 78 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -5,14 +5,16 @@ jdk:
 before_install:
  - cat /etc/hosts # optionally check the content *before*
  - sudo hostname "$(hostname | cut -c1-63)"
- - sed -e "s/^\\(127\\.0\\.0\\.1.*\\)/\\1 $(hostname | cut -c1-63)/" /etc/hosts | sudo tee /etc/hosts
+ - sed -n '/^127.0.0.1/!p' /etc/hosts | sed -e '1 i\127.0.0.1 localhost' -e "1 i\127.0.0.1 $(hostname | cut -c1-63)" | sudo tee /etc/hosts
  - cat /etc/hosts # optionally check the content *after*
+ - cat /proc/cpuinfo | grep cores | wc -l
+ - free -h
 install:
  - hibench=$(pwd)
  - cd /opt/
  - wget http://d3kbcqa49mib13.cloudfront.net/spark-1.6.0-bin-hadoop2.6.tgz
  - tar -xzf spark-1.6.0-bin-hadoop2.6.tgz
- - wget http://mirror.nexcess.net/apache/hadoop/common/hadoop-2.6.5/hadoop-2.6.5.tar.gz
+ - wget https://archive.apache.org/dist/hadoop/core/hadoop-2.6.5/hadoop-2.6.5.tar.gz
  - tar -xzf hadoop-2.6.5.tar.gz
  - cd ${hibench}
  - cp ./travis/spark-env.sh /opt/spark-1.6.0-bin-hadoop2.6/conf/
@@ -28,6 +30,7 @@ cache:
  directories:
  - $HOME/.m2
 script:
+ - mvn clean package -q -Dmaven.javadoc.skip=true -Dspark=2.2 -Dscala=2.11
  - mvn clean package -q -Dmaven.javadoc.skip=true -Dspark=2.0 -Dscala=2.11
  - mvn clean package -q -Dmaven.javadoc.skip=true -Dspark=1.6 -Dscala=2.10
  - sudo -E ./travis/configssh.sh

diff --git a/README.md b/README.md
@@ -25,7 +25,7 @@ HiBench is a big data benchmark suite that helps evaluate different big data fra
 
 There are totally 19 workloads in HiBench. The workloads are divided into 6 categories which are micro, ml(machine learning), sql, graph, websearch and streaming.
 
- **Micro Bechmarks:**
+ **Micro Benchmarks:**
 
 1. Sort (sort)
 
@@ -113,7 +113,7 @@ There are totally 19 workloads in HiBench. The workloads are divided into 6 cate
 ### Supported Hadoop/Spark/Flink/Storm/Gearpump releases: ###
 
  - Hadoop: Apache Hadoop 2.x, CDH5, HDP
- - Spark: Spark 1.6.x, Spark 2.0.x, Spark 2.1.x
+ - Spark: Spark 1.6.x, Spark 2.0.x, Spark 2.1.x, Spark 2.2.x
  - Flink: 1.0.3
  - Storm: 1.0.1
  - Gearpump: 0.8.1

diff --git a/bin/functions/hibench_prop_env_mapping.py b/bin/functions/hibench_prop_env_mapping.py
@@ -93,15 +93,25 @@
  # For Logistic Regression
  NUM_EXAMPLES_LR="hibench.lr.examples",
  NUM_FEATURES_LR="hibench.lr.features",
+ # For SVM (example count and feature count are separate properties)
+ NUM_EXAMPLES_SVM="hibench.svm.examples",
+ NUM_FEATURES_SVM="hibench.svm.features",
  # For ALS
- NUM_USERS="hibench.als.users",
- NUM_PRODUCTS="hibench.als.products",
- SPARSITY="hibench.als.sparsity",
- IMPLICITPREFS="hibench.als.implicitprefs",
- RANK="hibench.als.rank",
+ NUM_USERS_ALS="hibench.als.users",
+ NUM_PRODUCTS_ALS="hibench.als.products",
+ SPARSITY_ALS="hibench.als.sparsity",
+ IMPLICITPREFS_ALS="hibench.als.implicitprefs",
+ RANK_ALS="hibench.als.rank",
+ NUM_RECOMMENDS_ALS="hibench.als.recommends",
  NUM_ITERATIONS_ALS="hibench.als.num_iterations",
- LAMBDA="hibench.als.Lambda",
- KYRO="hibench.als.kyro",
+ LAMBDA_ALS="hibench.als.Lambda",
+ KYRO_ALS="hibench.als.kyro",
+
+ # For PCA (MAX_RESULT_SIZE_PCA is read by bin/workloads/ml/pca/spark/run.sh)
+ NUM_EXAMPLES_PCA="hibench.pca.examples",
+ NUM_FEATURES_PCA="hibench.pca.features",
+ MAX_RESULT_SIZE_PCA="hibench.pca.maxresultsize",
+
  # For Gradient Boosting Tree
  NUM_EXAMPLES_GBT="hibench.gbt.examples",
  NUM_FEATURES_GBT="hibench.gbt.features",
@@ -114,6 +124,16 @@
  NUM_EXAMPLES_SVD="hibench.svd.examples",
  NUM_FEATURES_SVD="hibench.svd.features",
  MAXRESULTSIZE_SVD="hibench.svd.maxresultsize",
+ # For Linear Regression
+ NUM_EXAMPLES_LINEAR="hibench.linear.examples",
+ NUM_FEATURES_LINEAR="hibench.linear.features",
+ # For LDA
+ NUM_DOCUMENTS_LDA="hibench.lda.num_of_documents",
+ NUM_VOCABULARY_LDA="hibench.lda.num_of_vocabulary",
+ NUM_TOPICS_LDA="hibench.lda.num_of_topics",
+ DOC_LEN_MIN_LDA="hibench.lda.doc_len_min",
+ DOC_LEN_MAX_LDA="hibench.lda.doc_len_max",
+ MAXRESULTSIZE_LDA="hibench.lda.maxresultsize",
  # For Pagerank
  PAGERANK_BASE_HDFS="hibench.pagerank.base.hdfs",
  PAGERANK_INPUT="hibench.pagerank.dir.name.input",

diff --git a/bin/run_all.sh b/bin/run_all.sh
@@ -32,10 +32,11 @@ for benchmark in `cat $root_dir/conf/benchmarks.lst`; do
  echo -e "${BCyan}Exec script: ${Cyan}${WORKLOAD}/prepare/prepare.sh${Color_Off}"
  "${WORKLOAD}/prepare/prepare.sh"
 
- if [ $? -ne 0 ]
+ result=$?
+ if [ $result -ne 0 ]
  then
  echo "ERROR: ${benchmark} prepare failed!"
- continue
+ exit $result
  fi
 
  for framework in `cat $root_dir/conf/frameworks.lst`; do
@@ -58,6 +59,12 @@ for benchmark in `cat $root_dir/conf/benchmarks.lst`; do
  if [ $benchmark == "ml/als" ] && [ $framework == "hadoop" ]; then
  continue
  fi
+ if [ $benchmark == "ml/svm" ] && [ $framework == "hadoop" ]; then
+ continue
+ fi
+ if [ $benchmark == "ml/pca" ] && [ $framework == "hadoop" ]; then
+ continue
+ fi
  if [ $benchmark == "ml/gbt" ] && [ $framework == "hadoop" ]; then
  continue
  fi
@@ -66,8 +73,14 @@ for benchmark in `cat $root_dir/conf/benchmarks.lst`; do
  fi 
  if [ $benchmark == "ml/svd" ] && [ $framework == "hadoop" ]; then
  continue
+ fi 
+ if [ $benchmark == "ml/linear" ] && [ $framework == "hadoop" ]; then
+ continue
+ fi
+ if [ $benchmark == "ml/lda" ] && [ $framework == "hadoop" ]; then
+ continue
  fi
- 
+
  echo -e "${UYellow}${BYellow}Run ${Yellow}${UYellow}${benchmark}/${framework}${Color_Off}"
  echo -e "${BCyan}Exec script: ${Cyan}$WORKLOAD/${framework}/run.sh${Color_Off}"
  $WORKLOAD/${framework}/run.sh

diff --git a/bin/workloads/ml/als/prepare/prepare.sh b/bin/workloads/ml/als/prepare/prepare.sh
@@ -26,7 +26,7 @@ show_bannar start
 rmr_hdfs $INPUT_HDFS || true
 START_TIME=`timestamp`
 
-run_spark_job com.intel.hibench.sparkbench.ml.RatingDataGenerator $INPUT_HDFS $NUM_USERS $NUM_PRODUCTS $IMPLICITPREFS 
+run_spark_job com.intel.hibench.sparkbench.ml.RatingDataGenerator $INPUT_HDFS $NUM_USERS_ALS $NUM_PRODUCTS_ALS $SPARSITY_ALS $IMPLICITPREFS_ALS
 
 END_TIME=`timestamp`
 

diff --git a/bin/workloads/ml/als/spark/run.sh b/bin/workloads/ml/als/spark/run.sh
@@ -28,7 +28,7 @@ rmr_hdfs $OUTPUT_HDFS || true
 SIZE=`dir_size $INPUT_HDFS`
 START_TIME=`timestamp`
 
-run_spark_job com.intel.hibench.sparkbench.ml.ALSExample --numUsers $NUM_USERS --numProducts $NUM_PRODUCTS --sparsity $SPARSITY --rank $RANK --numIterations $NUM_ITERATIONS_ALS --lambda $LAMBDA --kryo $KYRO --implicitPrefs $IMPLICITPREFS $INPUT_HDFS
+run_spark_job com.intel.hibench.sparkbench.ml.ALSExample --numUsers $NUM_USERS_ALS --numProducts $NUM_PRODUCTS_ALS --rank $RANK_ALS --numRecommends $NUM_RECOMMENDS_ALS --numIterations $NUM_ITERATIONS_ALS --lambda $LAMBDA_ALS --kryo $KYRO_ALS --implicitPrefs $IMPLICITPREFS_ALS $INPUT_HDFS
 END_TIME=`timestamp`
 
 gen_report ${START_TIME} ${END_TIME} ${SIZE}

diff --git a/bin/workloads/ml/lda/prepare/prepare.sh b/bin/workloads/ml/lda/prepare/prepare.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+current_dir=$(cd "$(dirname "$0")"; pwd)
+# The HiBench root sits five directory levels above this script.
+root_dir=${current_dir}/../../../../../
+workload_config=${root_dir}/conf/workloads/ml/lda.conf
+. "${root_dir}/bin/functions/load_bench_config.sh"
+
+enter_bench LDADataPrepare ${workload_config} ${current_dir}
+show_bannar start
+
+# Drop stale input (rmr_hdfs is defined elsewhere); "|| true" tolerates a failed removal.
+rmr_hdfs $INPUT_HDFS || true
+START_TIME=$(timestamp)
+
+run_spark_job com.intel.hibench.sparkbench.ml.LDADataGenerator $INPUT_HDFS $NUM_DOCUMENTS_LDA $NUM_VOCABULARY_LDA $DOC_LEN_MIN_LDA $DOC_LEN_MAX_LDA
+
+END_TIME=$(timestamp)
+show_bannar finish
+leave_bench
+
diff --git a/bin/workloads/ml/lda/spark/run.sh b/bin/workloads/ml/lda/spark/run.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+current_dir=$(cd "$(dirname "$0")"; pwd)
+# The HiBench root sits five directory levels above this script.
+root_dir=${current_dir}/../../../../../
+workload_config=${root_dir}/conf/workloads/ml/lda.conf
+. "${root_dir}/bin/functions/load_bench_config.sh"
+
+enter_bench LDA ${workload_config} ${current_dir}
+show_bannar start
+
+# Drop stale output (rmr_hdfs is defined elsewhere); "|| true" tolerates a failed removal.
+rmr_hdfs $OUTPUT_HDFS || true
+
+SIZE=$(dir_size $INPUT_HDFS)
+START_TIME=$(timestamp)
+run_spark_job com.intel.hibench.sparkbench.ml.LDAExample $INPUT_HDFS $OUTPUT_HDFS $NUM_TOPICS_LDA $MAXRESULTSIZE_LDA
+END_TIME=$(timestamp)
+gen_report ${START_TIME} ${END_TIME} ${SIZE}
+show_bannar finish
+leave_bench
diff --git a/bin/workloads/ml/linear/prepare/prepare.sh b/bin/workloads/ml/linear/prepare/prepare.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+current_dir=$(cd "$(dirname "$0")"; pwd)
+# The HiBench root sits five directory levels above this script.
+root_dir=${current_dir}/../../../../../
+workload_config=${root_dir}/conf/workloads/ml/linear.conf
+. "${root_dir}/bin/functions/load_bench_config.sh"
+
+enter_bench LinearRegressionDataPrepare ${workload_config} ${current_dir}
+show_bannar start
+
+# Drop stale input (rmr_hdfs is defined elsewhere); "|| true" tolerates a failed removal.
+rmr_hdfs $INPUT_HDFS || true
+START_TIME=$(timestamp)
+
+run_spark_job com.intel.hibench.sparkbench.ml.LinearRegressionDataGenerator $INPUT_HDFS $NUM_EXAMPLES_LINEAR $NUM_FEATURES_LINEAR
+
+END_TIME=$(timestamp)
+show_bannar finish
+leave_bench
+
diff --git a/bin/workloads/ml/linear/spark/run.sh b/bin/workloads/ml/linear/spark/run.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+current_dir=$(cd "$(dirname "$0")"; pwd)
+# The HiBench root sits five directory levels above this script.
+root_dir=${current_dir}/../../../../../
+workload_config=${root_dir}/conf/workloads/ml/linear.conf
+. "${root_dir}/bin/functions/load_bench_config.sh"
+
+enter_bench LinearRegression ${workload_config} ${current_dir}
+show_bannar start
+
+# Drop stale output (rmr_hdfs is defined elsewhere); "|| true" tolerates a failed removal.
+rmr_hdfs $OUTPUT_HDFS || true
+
+SIZE=$(dir_size $INPUT_HDFS)
+START_TIME=$(timestamp)
+run_spark_job com.intel.hibench.sparkbench.ml.LinearRegression ${INPUT_HDFS}
+END_TIME=$(timestamp)
+gen_report ${START_TIME} ${END_TIME} ${SIZE}
+show_bannar finish
+leave_bench
diff --git a/bin/workloads/ml/pca/prepare/prepare.sh b/bin/workloads/ml/pca/prepare/prepare.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+current_dir=$(cd "$(dirname "$0")"; pwd)
+# The HiBench root sits five directory levels above this script.
+root_dir=${current_dir}/../../../../../
+workload_config=${root_dir}/conf/workloads/ml/pca.conf
+. "${root_dir}/bin/functions/load_bench_config.sh"
+
+enter_bench PCADataPrepare ${workload_config} ${current_dir}
+show_bannar start
+
+# Drop stale input (rmr_hdfs is defined elsewhere); "|| true" tolerates a failed removal.
+rmr_hdfs $INPUT_HDFS || true
+START_TIME=$(timestamp)
+
+run_spark_job com.intel.hibench.sparkbench.ml.PCADataGenerator $INPUT_HDFS $NUM_EXAMPLES_PCA $NUM_FEATURES_PCA
+
+END_TIME=$(timestamp)
+show_bannar finish
+leave_bench
+
diff --git a/bin/workloads/ml/pca/spark/run.sh b/bin/workloads/ml/pca/spark/run.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+current_dir=$(cd "$(dirname "$0")"; pwd)
+# The HiBench root sits five directory levels above this script.
+root_dir=${current_dir}/../../../../../
+workload_config=${root_dir}/conf/workloads/ml/pca.conf
+. "${root_dir}/bin/functions/load_bench_config.sh"
+
+enter_bench PCA ${workload_config} ${current_dir}
+show_bannar start
+
+# Drop stale output (rmr_hdfs is defined elsewhere); "|| true" tolerates a failed removal.
+rmr_hdfs $OUTPUT_HDFS || true
+
+SIZE=$(dir_size $INPUT_HDFS)
+START_TIME=$(timestamp)
+run_spark_job com.intel.hibench.sparkbench.ml.PCAExample ${INPUT_HDFS} ${MAX_RESULT_SIZE_PCA}
+END_TIME=$(timestamp)
+gen_report ${START_TIME} ${END_TIME} ${SIZE}
+show_bannar finish
+leave_bench