Skip to content

Commit

Permalink
Merge pull request Intel-bigdata#2 from intel-hadoop/master
Browse files Browse the repository at this point in the history
Update to the master
  • Loading branch information
jtengyp authored Oct 24, 2017
2 parents cd17de9 + a1dca4b commit 616c6e6
Show file tree
Hide file tree
Showing 46 changed files with 1,332 additions and 78 deletions.
7 changes: 5 additions & 2 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,16 @@ jdk:
before_install:
- cat /etc/hosts # optionally check the content *before*
- sudo hostname "$(hostname | cut -c1-63)"
- sed -e "s/^\\(127\\.0\\.0\\.1.*\\)/\\1 $(hostname | cut -c1-63)/" /etc/hosts | sudo tee /etc/hosts
- sed -n '/^127.0.0.1/!p' /etc/hosts | sed -e '1 i\127.0.0.1 localhost' -e "1 i\127.0.0.1 $(hostname | cut -c1-63)" | sudo tee /etc/hosts
- cat /etc/hosts # optionally check the content *after*
- cat /proc/cpuinfo | grep cores | wc -l
- free -h
install:
- hibench=$(pwd)
- cd /opt/
- wget http://d3kbcqa49mib13.cloudfront.net/spark-1.6.0-bin-hadoop2.6.tgz
- tar -xzf spark-1.6.0-bin-hadoop2.6.tgz
- wget http://mirror.nexcess.net/apache/hadoop/common/hadoop-2.6.5/hadoop-2.6.5.tar.gz
- wget https://archive.apache.org/dist/hadoop/core/hadoop-2.6.5/hadoop-2.6.5.tar.gz
- tar -xzf hadoop-2.6.5.tar.gz
- cd ${hibench}
- cp ./travis/spark-env.sh /opt/spark-1.6.0-bin-hadoop2.6/conf/
Expand All @@ -28,6 +30,7 @@ cache:
directories:
- $HOME/.m2
script:
- mvn clean package -q -Dmaven.javadoc.skip=true -Dspark=2.2 -Dscala=2.11
- mvn clean package -q -Dmaven.javadoc.skip=true -Dspark=2.0 -Dscala=2.11
- mvn clean package -q -Dmaven.javadoc.skip=true -Dspark=1.6 -Dscala=2.10
- sudo -E ./travis/configssh.sh
Expand Down
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ HiBench is a big data benchmark suite that helps evaluate different big data fra

There are 19 workloads in total in HiBench. The workloads are divided into 6 categories: micro, ml (machine learning), sql, graph, websearch and streaming.

**Micro Bechmarks:**
**Micro Benchmarks:**

1. Sort (sort)

Expand Down Expand Up @@ -113,7 +113,7 @@ There are totally 19 workloads in HiBench. The workloads are divided into 6 cate
### Supported Hadoop/Spark/Flink/Storm/Gearpump releases: ###

- Hadoop: Apache Hadoop 2.x, CDH5, HDP
- Spark: Spark 1.6.x, Spark 2.0.x, Spark 2.1.x
- Spark: Spark 1.6.x, Spark 2.0.x, Spark 2.1.x, Spark 2.2.x
- Flink: 1.0.3
- Storm: 1.0.1
- Gearpump: 0.8.1
Expand Down
34 changes: 27 additions & 7 deletions bin/functions/hibench_prop_env_mapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,15 +93,25 @@
# For Logistic Regression
NUM_EXAMPLES_LR="hibench.lr.examples",
NUM_FEATURES_LR="hibench.lr.features",
# For SVM (each environment variable maps to its own hibench.svm.* property;
# the feature count must not reuse the examples key)
NUM_EXAMPLES_SVM="hibench.svm.examples",
NUM_FEATURES_SVM="hibench.svm.features",
# For ALS
NUM_USERS="hibench.als.users",
NUM_PRODUCTS="hibench.als.products",
SPARSITY="hibench.als.sparsity",
IMPLICITPREFS="hibench.als.implicitprefs",
RANK="hibench.als.rank",
NUM_USERS_ALS="hibench.als.users",
NUM_PRODUCTS_ALS="hibench.als.products",
SPARSITY_ALS="hibench.als.sparsity",
IMPLICITPREFS_ALS="hibench.als.implicitprefs",
RANK_ALS="hibench.als.rank",
NUM_RECOMMENDS_ALS="hibench.als.recommends",
NUM_ITERATIONS_ALS="hibench.als.num_iterations",
LAMBDA="hibench.als.Lambda",
KYRO="hibench.als.kyro",
LAMBDA_ALS="hibench.als.Lambda",
KYRO_ALS="hibench.als.kyro",

# For PCA
NUM_EXAMPLES_PCA="hibench.pca.examples",
NUM_FEATURES_PCA="hibench.pca.features",
MAX_RESULT_SIZE_PCA ="hibench.pca.maxresultsize",

# For Gradient Boosting Tree
NUM_EXAMPLES_GBT="hibench.gbt.examples",
NUM_FEATURES_GBT="hibench.gbt.features",
Expand All @@ -114,6 +124,16 @@
NUM_EXAMPLES_SVD="hibench.svd.examples",
NUM_FEATURES_SVD="hibench.svd.features",
MAXRESULTSIZE_SVD="hibench.svd.maxresultsize",
# For Linear Regression
NUM_EXAMPLES_LINEAR="hibench.linear.examples",
NUM_FEATURES_LINEAR="hibench.linear.features",
# For LDA
NUM_DOCUMENTS_LDA="hibench.lda.num_of_documents",
NUM_VOCABULARY_LDA="hibench.lda.num_of_vocabulary",
NUM_TOPICS_LDA="hibench.lda.num_of_topics",
DOC_LEN_MIN_LDA="hibench.lda.doc_len_min",
DOC_LEN_MAX_LDA="hibench.lda.doc_len_max",
MAXRESULTSIZE_LDA="hibench.lda.maxresultsize",
# For Pagerank
PAGERANK_BASE_HDFS="hibench.pagerank.base.hdfs",
PAGERANK_INPUT="hibench.pagerank.dir.name.input",
Expand Down
19 changes: 16 additions & 3 deletions bin/run_all.sh
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,11 @@ for benchmark in `cat $root_dir/conf/benchmarks.lst`; do
echo -e "${BCyan}Exec script: ${Cyan}${WORKLOAD}/prepare/prepare.sh${Color_Off}"
"${WORKLOAD}/prepare/prepare.sh"

if [ $? -ne 0 ]
result=$?
if [ $result -ne 0 ]
then
echo "ERROR: ${benchmark} prepare failed!"
continue
exit $result
fi

for framework in `cat $root_dir/conf/frameworks.lst`; do
Expand All @@ -58,6 +59,12 @@ for benchmark in `cat $root_dir/conf/benchmarks.lst`; do
if [ $benchmark == "ml/als" ] && [ $framework == "hadoop" ]; then
continue
fi
if [ $benchmark == "ml/svm" ] && [ $framework == "hadoop" ]; then
continue
fi
if [ $benchmark == "ml/pca" ] && [ $framework == "hadoop" ]; then
continue
fi
if [ $benchmark == "ml/gbt" ] && [ $framework == "hadoop" ]; then
continue
fi
Expand All @@ -66,8 +73,14 @@ for benchmark in `cat $root_dir/conf/benchmarks.lst`; do
fi
if [ $benchmark == "ml/svd" ] && [ $framework == "hadoop" ]; then
continue
fi
if [ $benchmark == "ml/linear" ] && [ $framework == "hadoop" ]; then
continue
fi
if [ $benchmark == "ml/lda" ] && [ $framework == "hadoop" ]; then
continue
fi

echo -e "${UYellow}${BYellow}Run ${Yellow}${UYellow}${benchmark}/${framework}${Color_Off}"
echo -e "${BCyan}Exec script: ${Cyan}$WORKLOAD/${framework}/run.sh${Color_Off}"
$WORKLOAD/${framework}/run.sh
Expand Down
2 changes: 1 addition & 1 deletion bin/workloads/ml/als/prepare/prepare.sh
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ show_bannar start
rmr_hdfs $INPUT_HDFS || true
START_TIME=`timestamp`

run_spark_job com.intel.hibench.sparkbench.ml.RatingDataGenerator $INPUT_HDFS $NUM_USERS $NUM_PRODUCTS $IMPLICITPREFS
run_spark_job com.intel.hibench.sparkbench.ml.RatingDataGenerator $INPUT_HDFS $NUM_USERS_ALS $NUM_PRODUCTS_ALS $SPARSITY_ALS $IMPLICITPREFS_ALS

END_TIME=`timestamp`

Expand Down
2 changes: 1 addition & 1 deletion bin/workloads/ml/als/spark/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ rmr_hdfs $OUTPUT_HDFS || true
SIZE=`dir_size $INPUT_HDFS`
START_TIME=`timestamp`

run_spark_job com.intel.hibench.sparkbench.ml.ALSExample --numUsers $NUM_USERS --numProducts $NUM_PRODUCTS --sparsity $SPARSITY --rank $RANK --numIterations $NUM_ITERATIONS_ALS --lambda $LAMBDA --kryo $KYRO --implicitPrefs $IMPLICITPREFS $INPUT_HDFS
run_spark_job com.intel.hibench.sparkbench.ml.ALSExample --numUsers $NUM_USERS_ALS --numProducts $NUM_PRODUCTS_ALS --rank $RANK_ALS --numRecommends $NUM_RECOMMENDS_ALS --numIterations $NUM_ITERATIONS_ALS --lambda $LAMBDA_ALS --kryo $KYRO_ALS --implicitPrefs $IMPLICITPREFS_ALS $INPUT_HDFS
END_TIME=`timestamp`

gen_report ${START_TIME} ${END_TIME} ${SIZE}
Expand Down
35 changes: 35 additions & 0 deletions bin/workloads/ml/lda/prepare/prepare.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#!/bin/bash
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Absolute directory containing this script; the repository root is five
# levels above it, and the LDA workload configuration lives under conf/.
current_dir=$(cd "$(dirname "$0")"; pwd)
root_dir="${current_dir}/../../../../../"
workload_config="${root_dir}/conf/workloads/ml/lda.conf"
source "${root_dir}/bin/functions/load_bench_config.sh"

# Announce the data-preparation phase of the LDA workload.
enter_bench LDADataPrepare $workload_config $current_dir
show_bannar start

# Best-effort cleanup of the input location: a failing rmr_hdfs is ignored.
rmr_hdfs ${INPUT_HDFS} || true
START_TIME=$(timestamp)

# Run the LDA data generator with the corpus parameters from the configuration.
run_spark_job com.intel.hibench.sparkbench.ml.LDADataGenerator \
  ${INPUT_HDFS} \
  ${NUM_DOCUMENTS_LDA} \
  ${NUM_VOCABULARY_LDA} \
  ${DOC_LEN_MIN_LDA} \
  ${DOC_LEN_MAX_LDA}

END_TIME=$(timestamp)

show_bannar finish
leave_bench

34 changes: 34 additions & 0 deletions bin/workloads/ml/lda/spark/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#!/bin/bash
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Absolute directory containing this script; the repository root is five
# levels above it, and the LDA workload configuration lives under conf/.
current_dir=$(cd "$(dirname "$0")"; pwd)
root_dir="${current_dir}/../../../../../"
workload_config="${root_dir}/conf/workloads/ml/lda.conf"
source "${root_dir}/bin/functions/load_bench_config.sh"

# Announce the Spark run of the LDA workload.
enter_bench LDA $workload_config $current_dir
show_bannar start

# Best-effort cleanup of the output location: a failing rmr_hdfs is ignored.
rmr_hdfs ${OUTPUT_HDFS} || true

# Capture the input size and the start/end timestamps around the Spark job;
# all three values are handed to gen_report below.
SIZE=$(dir_size ${INPUT_HDFS})
START_TIME=$(timestamp)
run_spark_job com.intel.hibench.sparkbench.ml.LDAExample \
  ${INPUT_HDFS} \
  ${OUTPUT_HDFS} \
  ${NUM_TOPICS_LDA} \
  ${MAXRESULTSIZE_LDA}
END_TIME=$(timestamp)

gen_report $START_TIME $END_TIME $SIZE
show_bannar finish
leave_bench
35 changes: 35 additions & 0 deletions bin/workloads/ml/linear/prepare/prepare.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#!/bin/bash
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Absolute directory containing this script; the repository root is five
# levels above it, and the linear-regression configuration lives under conf/.
current_dir=$(cd "$(dirname "$0")"; pwd)
root_dir="${current_dir}/../../../../../"
workload_config="${root_dir}/conf/workloads/ml/linear.conf"
source "${root_dir}/bin/functions/load_bench_config.sh"

# Announce the data-preparation phase of the linear-regression workload.
enter_bench LinearRegressionDataPrepare $workload_config $current_dir
show_bannar start

# Best-effort cleanup of the input location: a failing rmr_hdfs is ignored.
rmr_hdfs ${INPUT_HDFS} || true
START_TIME=$(timestamp)

# Run the data generator with the example and feature counts from the
# configuration.
run_spark_job com.intel.hibench.sparkbench.ml.LinearRegressionDataGenerator \
  ${INPUT_HDFS} \
  ${NUM_EXAMPLES_LINEAR} \
  ${NUM_FEATURES_LINEAR}

END_TIME=$(timestamp)

show_bannar finish
leave_bench

34 changes: 34 additions & 0 deletions bin/workloads/ml/linear/spark/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#!/bin/bash
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Absolute directory containing this script; the repository root is five
# levels above it, and the linear-regression configuration lives under conf/.
current_dir=$(cd "$(dirname "$0")"; pwd)
root_dir="${current_dir}/../../../../../"
workload_config="${root_dir}/conf/workloads/ml/linear.conf"
source "${root_dir}/bin/functions/load_bench_config.sh"

# Announce the Spark run of the linear-regression workload.
enter_bench LinearRegression $workload_config $current_dir
show_bannar start

# Best-effort cleanup of the output location: a failing rmr_hdfs is ignored.
rmr_hdfs ${OUTPUT_HDFS} || true

# Capture the input size and the start/end timestamps around the Spark job;
# all three values are handed to gen_report below.
SIZE=$(dir_size ${INPUT_HDFS})
START_TIME=$(timestamp)
run_spark_job com.intel.hibench.sparkbench.ml.LinearRegression $INPUT_HDFS
END_TIME=$(timestamp)

gen_report $START_TIME $END_TIME $SIZE
show_bannar finish
leave_bench
35 changes: 35 additions & 0 deletions bin/workloads/ml/pca/prepare/prepare.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#!/bin/bash
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Absolute directory containing this script; the repository root is five
# levels above it, and the PCA workload configuration lives under conf/.
current_dir=$(cd "$(dirname "$0")"; pwd)
root_dir="${current_dir}/../../../../../"
workload_config="${root_dir}/conf/workloads/ml/pca.conf"
source "${root_dir}/bin/functions/load_bench_config.sh"

# Announce the data-preparation phase of the PCA workload.
enter_bench PCADataPrepare $workload_config $current_dir
show_bannar start

# Best-effort cleanup of the input location: a failing rmr_hdfs is ignored.
rmr_hdfs ${INPUT_HDFS} || true
START_TIME=$(timestamp)

# Run the data generator with the example and feature counts from the
# configuration.
run_spark_job com.intel.hibench.sparkbench.ml.PCADataGenerator \
  ${INPUT_HDFS} \
  ${NUM_EXAMPLES_PCA} \
  ${NUM_FEATURES_PCA}

END_TIME=$(timestamp)

show_bannar finish
leave_bench

34 changes: 34 additions & 0 deletions bin/workloads/ml/pca/spark/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#!/bin/bash
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Absolute directory containing this script; the repository root is five
# levels above it, and the PCA workload configuration lives under conf/.
current_dir=$(cd "$(dirname "$0")"; pwd)
root_dir="${current_dir}/../../../../../"
workload_config="${root_dir}/conf/workloads/ml/pca.conf"
source "${root_dir}/bin/functions/load_bench_config.sh"

# Announce the Spark run of the PCA workload.
enter_bench PCA $workload_config $current_dir
show_bannar start

# Best-effort cleanup of the output location: a failing rmr_hdfs is ignored.
rmr_hdfs ${OUTPUT_HDFS} || true

# Capture the input size and the start/end timestamps around the Spark job;
# all three values are handed to gen_report below.
SIZE=$(dir_size ${INPUT_HDFS})
START_TIME=$(timestamp)
run_spark_job com.intel.hibench.sparkbench.ml.PCAExample \
  $INPUT_HDFS \
  $MAX_RESULT_SIZE_PCA
END_TIME=$(timestamp)

gen_report $START_TIME $END_TIME $SIZE
show_bannar finish
leave_bench
Loading

0 comments on commit 616c6e6

Please sign in to comment.