diff --git a/.travis.yml b/.travis.yml
index 874ef90461..40a66d5ee4 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -15,7 +15,7 @@ matrix:
before_install:
- cd prometheus
install:
- - pip install paramiko pyyaml jinja2 python-etcd requests
+ - pip install paramiko pyyaml jinja2 python-etcd requests prometheus_client
script:
- python -m unittest discover test/
- language: python
diff --git a/Jenkinsfile b/Jenkinsfile
index 11bb5742f3..e39387a342 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -23,6 +23,7 @@ echo ${labels[0]} > ${WORKSPACE}/BED.txt
sh '''#!/bin/bash
set -ex
+
echo ${GIT_BRANCH//\\//-}-$(git rev-parse --short HEAD)-${BUILD_ID} > ${WORKSPACE}/IMAGE_TAG.txt
'''
env.IMAGE_TAG = readFile("${WORKSPACE}/IMAGE_TAG.txt").trim()
@@ -161,8 +162,10 @@ sed -i "42s/.*/ zkid: "1"/" /cluster-configuration/cluster-configuration.yaml
# Step 2. Boot up Kubernetes
# install k8s
./paictl.py cluster k8s-bootup -p /cluster-configuration
+
# ! TODO wait for cluster ready
sleep 6s
+
# Step 3. Start all PAI services
# start pai services
./paictl.py service start -p /cluster-configuration
@@ -251,8 +254,10 @@ sed -i "42s/.*/ zkid: "2"/" /cluster-configuration/cluster-configuration.yaml
# Step 2. Boot up Kubernetes
# install k8s
./paictl.py cluster k8s-bootup -p /cluster-configuration
+
# ! TODO wait for cluster ready
sleep 6s
+
# Step 3. Start all PAI services
# start pai services
./paictl.py service start -p /cluster-configuration
@@ -533,9 +538,11 @@ else
fi
# delete service for next install
./paictl.py service start -p /cluster-configuration -n cluster-configuration
+
./paictl.py service delete -p /cluster-configuration << EOF
Y
EOF
+
# clean k8s
./paictl.py cluster k8s-clean -p /cluster-configuration -f << EOF
Y
@@ -572,9 +579,11 @@ else
fi
# delete service for next install
./paictl.py service start -p /cluster-configuration -n cluster-configuration
+
./paictl.py service delete -p /cluster-configuration << EOF
Y
EOF
+
# clean k8s
./paictl.py cluster k8s-clean -p /cluster-configuration -f << EOF
Y
diff --git a/README.md b/README.md
index 5abf9f0a26..450bdffb56 100644
--- a/README.md
+++ b/README.md
@@ -56,12 +56,13 @@ Before start, you need to meet the following requirements:
## How to use
### How to train jobs
- How to write PAI jobs
- - [Learn from Example Jobs](./examples/README.md)
+ - [Quick start: how to write and submit a CIFAR-10 job](./examples/README.md#quickstart)
- [Write a job from scratch in depth](./docs/job_tutorial.md)
+ - [Learn more example jobs](./examples/#offtheshelf)
- How to submit PAI jobs
+ - [Submit a job in Web Portal](./docs/submit_from_webportal.md)
- [Submit a job in Visual Studio](https://github.com/Microsoft/vs-tools-for-ai/blob/master/docs/pai.md)
- [Submit a job in Visual Studio Code](https://github.com/Microsoft/vscode-tools-for-ai/blob/master/docs/quickstart-05-pai.md)
- - [Submit a job in web portal](https://github.com/Microsoft/pai/blob/master/job-tutorial/README.md#job-submission)
- How to request on-demand resource for in place training
- [Launch a jupyter notebook and work in it](./examples/jupyter/README.md)
@@ -71,7 +72,9 @@ Before start, you need to meet the following requirements:
- [Monitoring](./webportal/README.md)
## Resources
-The OpenPAI user [documentations](./docs/documentation.md) provides in-depth instructions for using OpenPAI
+
+- The OpenPAI user [documentation](./docs/documentation.md) provides in-depth instructions for using OpenPAI
+- Visit the [release notes](https://github.com/Microsoft/pai/releases) to read about new features or download the latest release.
## Get Involved
- [StackOverflow:](./docs/stackoverflow.md) If you have questions about OpenPAI, please submit question at Stackoverflow under tag: openpai
diff --git a/cluster-configuration/cluster-configuration.yaml b/cluster-configuration/cluster-configuration.yaml
index e31fe2fa7a..309ebeae4b 100644
--- a/cluster-configuration/cluster-configuration.yaml
+++ b/cluster-configuration/cluster-configuration.yaml
@@ -61,9 +61,7 @@ machine-list:
dashboard: "true"
zkid: "1"
pai-master: "true"
- watchdog: "true"
- alert-manager: "true"
-
+
- hostname: hostname
hostip: IP
@@ -73,8 +71,7 @@ machine-list:
#username: username (Optional)
#password: password (Optional)
k8s-role: master
- node-exporter: "true"
-
+
- hostname: hostname
hostip: IP
@@ -84,7 +81,6 @@ machine-list:
#username: username (Optional)
#password: password (Optional)
k8s-role: master
- node-exporter: "true"
- hostname: hostname
diff --git a/cluster-configuration/kubernetes-configuration.yaml b/cluster-configuration/kubernetes-configuration.yaml
index 6cfda30523..3290981c55 100644
--- a/cluster-configuration/kubernetes-configuration.yaml
+++ b/cluster-configuration/kubernetes-configuration.yaml
@@ -42,6 +42,8 @@ kubernetes:
kube-controller-manager-version: v1.9.9
# http://gcr.io/google_containers/kubernetes-dashboard-amd64
dashboard-version: v1.8.3
+  # The path to store etcd data.
+ etcd-data-path: "/var/etcd"
diff --git a/cluster-configuration/services-configuration.yaml b/cluster-configuration/services-configuration.yaml
index cd0a25ba3d..e2f1c83d6d 100644
--- a/cluster-configuration/services-configuration.yaml
+++ b/cluster-configuration/services-configuration.yaml
@@ -34,7 +34,7 @@ cluster:
# If the docker registry doesn't require authentication, please comment out docker_username and docker_password
#docker-username: your_registry_username
#docker-password: your_registry_password
-
+
docker-tag: latest
# The name of the secret in kubernetes will be created in your cluster
@@ -47,7 +47,6 @@ hadoop:
# More about hadoop-ai please follow the link: https://github.com/Microsoft/pai/tree/master/hadoop-ai.
# Notice: the name should be hadoop-{hadoop-version}.tar.gz
custom-hadoop-binary-path: /pathHadoop/hadoop-2.9.0.tar.gz
- hadoop-version: 2.9.0
# Step 1 of 4 to set up Hadoop queues.
# Define all virtual clusters, equivalent concept of Hadoop queues:
# - Each VC will be assigned with (capacity / total_capacity * 100%) of the resources in the system.
@@ -85,7 +84,7 @@ restserver:
# database admin password
default-pai-admin-password: your_default_pai_admin_password
-
+
webportal:
# port for webportal
server-port: 9286
@@ -101,8 +100,11 @@ prometheus:
prometheus-port: 9091
# port for node exporter
node-exporter-port: 9100
+ # port for yarn exporter
+ yarn_exporter_port: 9459
# How frequently to scrape targets
scrape_interval: 30
+
# if you want to enable alert manager to send alert email, uncomment following lines and fill
# right values.
# alerting:
diff --git a/deployment/k8sPaiLibrary/maintainconf/clean.yaml b/deployment/k8sPaiLibrary/maintainconf/clean.yaml
index 4665e09a6d..c9e0c4e30c 100644
--- a/deployment/k8sPaiLibrary/maintainconf/clean.yaml
+++ b/deployment/k8sPaiLibrary/maintainconf/clean.yaml
@@ -18,8 +18,8 @@
clean:
- file-list:
+ template-list:
- name: kubernetes-cleanup.sh
- src: deployment/k8sPaiLibrary/maintaintool/kubernetes-cleanup.sh
+ src: k8sPaiLibrary/template/kubernetes-cleanup.sh.template
dst: clean
diff --git a/deployment/k8sPaiLibrary/maintainlib/clean.py b/deployment/k8sPaiLibrary/maintainlib/clean.py
index d042df6f61..210c6b19b0 100644
--- a/deployment/k8sPaiLibrary/maintainlib/clean.py
+++ b/deployment/k8sPaiLibrary/maintainlib/clean.py
@@ -76,7 +76,7 @@ def job_executer(self, node_config):
self.logger.error("Failed to uncompress {0}.tar".format(self.jobname))
sys.exit(1)
- commandline = "sudo ./{0}/kubernetes-cleanup.sh".format(self.jobname)
+ commandline = "sudo /bin/bash {0}/kubernetes-cleanup.sh".format(self.jobname)
if self.force_flag:
commandline += " -f"
if common.ssh_shell_with_password_input_paramiko(node_config, commandline) == False:
@@ -126,5 +126,3 @@ def run(self):
self.logger.info("The kubernetes has been destroyed, and metadata has been removed")
-
-
diff --git a/deployment/k8sPaiLibrary/maintainlib/common.py b/deployment/k8sPaiLibrary/maintainlib/common.py
index eea4e676b3..b187825407 100644
--- a/deployment/k8sPaiLibrary/maintainlib/common.py
+++ b/deployment/k8sPaiLibrary/maintainlib/common.py
@@ -56,6 +56,22 @@ def execute_shell(shell_cmd, error_msg):
+def execute_shell_retry(shell_cmd, error_msg, retry_count):
+
+    count = 0
+    while count < retry_count:
+        try:
+            subprocess.check_call( shell_cmd, shell=True )
+            break
+        except subprocess.CalledProcessError:
+            count += 1
+            logger.error(error_msg)
+            logger.info("Command \" %s \" failed, retry attempt %d of %d", shell_cmd, count, retry_count)
+            if count == retry_count:
+                sys.exit(1)
+            time.sleep(5)
+
+
def execute_shell_return(shell_cmd, error_msg):
diff --git a/deployment/k8sPaiLibrary/maintainlib/deploy.py b/deployment/k8sPaiLibrary/maintainlib/deploy.py
index 51021c3b50..3eb2738e76 100644
--- a/deployment/k8sPaiLibrary/maintainlib/deploy.py
+++ b/deployment/k8sPaiLibrary/maintainlib/deploy.py
@@ -118,9 +118,12 @@ def create_kube_proxy(self):
generated_data = common.generate_from_template_dict(template_data, dict_map)
common.write_generated_file(generated_data, "kube-proxy.yaml")
- common.execute_shell(
+
+ retry_count = 5
+ common.execute_shell_retry(
"kubectl apply --overwrite=true -f kube-proxy.yaml",
- "Failed to create kube-proxy"
+ "Failed to create kube-proxy",
+ retry_count
)
os.remove("kube-proxy.yaml")
@@ -141,9 +144,12 @@ def create_k8s_dashboard(self):
generated_data = common.generate_from_template_dict(template_data, dict_map)
common.write_generated_file(generated_data, "dashboard-service.yaml")
- common.execute_shell(
+
+ retry_count = 5
+ common.execute_shell_retry(
"kubectl apply --overwrite=true -f dashboard-service.yaml",
- "Failed to create dashboard-service"
+ "Failed to create dashboard-service",
+ retry_count
)
os.remove("dashboard-service.yaml")
@@ -197,6 +203,12 @@ def run(self):
kubectl_install_instance = kubectl_install.kubectl_install(self.cluster_config)
kubectl_install_instance.run()
+        # check that the api server is up and api resources are registered
+        common.execute_shell_retry("kubectl api-resources", "kubectl command failed!", 5)
+
+        # wait until the daemonset resource is registered before creating kube-proxy
+        common.execute_shell_retry("kubectl api-resources | grep -q daemonsets", "The daemonset resource hasn't been registered yet!", 5)
+
self.create_kube_proxy()
self.create_k8s_dashboard()
diff --git a/deployment/k8sPaiLibrary/maintaintool/kubectl-install.sh b/deployment/k8sPaiLibrary/maintaintool/kubectl-install.sh
index 05a7d59fa3..98c77ec12a 100755
--- a/deployment/k8sPaiLibrary/maintaintool/kubectl-install.sh
+++ b/deployment/k8sPaiLibrary/maintaintool/kubectl-install.sh
@@ -19,12 +19,11 @@
mkdir -p ~/.kube
-[[ -f "/usr/local/bin/kubectl" ]] &&
-{
+if which kubectl > /dev/null; then
echo "kubectl has been installed."
echo "Skip this precess"
exit 0
-}
+fi
set -e
@@ -38,4 +37,4 @@ pathofusllocalbin="/usr/local/bin"
mkdir -p /usr/local/bin
}
-mv ./kubectl /usr/local/bin/kubectl
\ No newline at end of file
+mv ./kubectl /usr/local/bin/kubectl
diff --git a/deployment/k8sPaiLibrary/template/config.template b/deployment/k8sPaiLibrary/template/config.template
index 728e8911b6..25818adcbb 100644
--- a/deployment/k8sPaiLibrary/template/config.template
+++ b/deployment/k8sPaiLibrary/template/config.template
@@ -16,17 +16,21 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
apiVersion: v1
+kind: Config
+preferences: {}
+
clusters:
- cluster:
insecure-skip-tls-verify: true
server: http://{{ clusterconfig['api-servers-ip'] }}:8080
name: kubernetes
+
contexts:
- context:
cluster: kubernetes
user: admin
name: kubernetes
+
current-context: kubernetes
-kind: Config
-preferences: {}
+
users: []
diff --git a/deployment/k8sPaiLibrary/template/etcd.yaml.template b/deployment/k8sPaiLibrary/template/etcd.yaml.template
index 1803399a32..b17fe944c0 100644
--- a/deployment/k8sPaiLibrary/template/etcd.yaml.template
+++ b/deployment/k8sPaiLibrary/template/etcd.yaml.template
@@ -54,5 +54,5 @@ spec:
name: varetcd
volumes:
- hostPath:
- path: /var/etcd/data
+ path: {{ clusterconfig['etcd-data-path'] }}
name: varetcd
\ No newline at end of file
diff --git a/deployment/k8sPaiLibrary/template/kubelet.sh.template b/deployment/k8sPaiLibrary/template/kubelet.sh.template
index b51ba4e706..97967889d6 100644
--- a/deployment/k8sPaiLibrary/template/kubelet.sh.template
+++ b/deployment/k8sPaiLibrary/template/kubelet.sh.template
@@ -26,7 +26,7 @@ docker run \
--volume=/sys:/sys:rw \
--volume=/dev:/dev:rw \
--volume=/var/lib/docker/:/var/lib/docker:rw \
- --volume=/var/lib/kubelet/:/var/lib/kubelet:rw \
+ --volume=/var/lib/kubelet/:/var/lib/kubelet:rw,shared \
--volume=/etc/resolv.conf:/etc/resolv.conf:rw \
--volume=/var/run:/var/run:rw \
--volume=/var/log:/var/log:rw \
@@ -46,6 +46,13 @@ docker run \
--pod-manifest-path=/etc/kubernetes/manifests \
--allow-privileged=true \
--logtostderr=true \
+ {% if 'pai-master' in hostcofig -%}
+ --node-labels pai-master=true \
+ {% endif -%}
+ {% if 'pai-worker' in hostcofig -%}
+ --node-labels pai-worker=true \
+ {% endif -%}
--pod-infra-container-image {{ clusterconfig['dockerregistry'] }}/pause-amd64:3.0 \
--eviction-hard="memory.available<5%,nodefs.available<5%,imagefs.available<5%,nodefs.inodesFree<5%,imagefs.inodesFree<5%" \
+ --image-pull-progress-deadline=10m \
--v=2
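+
+# Sketch for verifying the labels once kubelet has registered the node
+# (assumes a kubectl configured against this cluster):
+#   kubectl get nodes --show-labels | grep -E 'pai-(master|worker)'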
diff --git a/deployment/k8sPaiLibrary/template/kubernetes-cleanup.template b/deployment/k8sPaiLibrary/template/kubernetes-cleanup.template
new file mode 100644
index 0000000000..848d0a582d
--- /dev/null
+++ b/deployment/k8sPaiLibrary/template/kubernetes-cleanup.template
@@ -0,0 +1,63 @@
+#!/bin/bash
+
+# Copyright (c) Microsoft Corporation
+# All rights reserved.
+#
+# MIT License
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
+# documentation files (the "Software"), to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
+# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
+# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+option=$1
+
+apt-get install -y gawk
+
+docker stop kubelet
+docker rm kubelet
+
+for ID in `docker ps -a | awk "/k8s_/ {print \$1}"`; do docker kill $ID; docker rm $ID ; done
+
+if [ -d "/etc/kubernetes" ]; then
+
+ rm -rf /etc/kubernetes
+
+fi
+
+if [ -d "{{ clusterconfig['etcd-data-path'] }}" -a "$option" == "-f" ]; then
+
+ rm -rf {{ clusterconfig['etcd-data-path'] }}
+
+fi
+
+if [ -d "/var/log/pods" ]; then
+
+ rm -rf /var/log/pods
+
+fi
+
+if [ -d "/var/lib/kubelet/pods" ]; then
+
+ rm -rf /var/lib/kubelet/pods
+
+fi
+
+if [ -d "src" ]; then
+
+ rm -rf src
+
+fi
+
+if [ -f "kubernetes.tar" ]; then
+
+ rm -rf kubernetes.tar
+
+fi
\ No newline at end of file
diff --git a/deployment/paiLibrary/clusterObjectModel/paiObjectModel.py b/deployment/paiLibrary/clusterObjectModel/paiObjectModel.py
index 58c87f3aed..d6546d5997 100644
--- a/deployment/paiLibrary/clusterObjectModel/paiObjectModel.py
+++ b/deployment/paiLibrary/clusterObjectModel/paiObjectModel.py
@@ -56,6 +56,8 @@ def k8sParse(self):
k8sDict["clusterinfo"]["kubeschedulerversion"] = k8sDict["clusterinfo"]["kube-scheduler-version"]
k8sDict["clusterinfo"]["kubecontrollermanagerversion"] = k8sDict["clusterinfo"]["kube-controller-manager-version"]
k8sDict["clusterinfo"]["dashboard_version"] = k8sDict["clusterinfo"]["dashboard-version"]
+ if "etcd-data-path" not in k8sDict["clusterinfo"]:
+ k8sDict["clusterinfo"]["etcd-data-path"] = "/var/etcd"
# section : component_list
@@ -142,31 +144,6 @@ def k8sParse(self):
return k8sDict
-
- def labelExpend(self, host):
-
- if "pai-master" in host and host["pai-master"] == "true":
-
- host["hadoop-name-node"] = "true"
- host["hadoop-resource-manager"] = "true"
- host["zookeeper"] = "true"
- host["jobhistory"] = "true"
- host["launcher"] = "true"
- host["restserver"] = "true"
- host["webportal"] = "true"
- host["prometheus"] = "true"
- host["grafana"] = "true"
- host["pylon"] = "true"
- host["node-exporter"] = "true"
-
- if "pai-worker" in host and host["pai-worker"] == "true":
-
- host["hadoop-data-node"] = "true"
- host["hadoop-node-manager"] = "true"
- host["node-exporter"] = "true"
-
-
-
def serviceParse(self):
serviceDict = dict()
@@ -205,11 +182,8 @@ def serviceParse(self):
self.rawData["serviceConfiguration"]["hadoop"]
serviceDict["clusterinfo"]["hadoopinfo"]["custom_hadoop_binary_path"] = \
serviceDict["clusterinfo"]["hadoopinfo"]["custom-hadoop-binary-path"]
- serviceDict["clusterinfo"]["hadoopinfo"]["hadoopversion"] = \
- serviceDict["clusterinfo"]["hadoopinfo"]["hadoop-version"]
serviceDict["clusterinfo"]["hadoopinfo"]["configmapname"] = "hadoop-configuration"
- serviceDict["clusterinfo"]["hadoopinfo"]["hadoop_vip"] = \
- serviceDict["clusterinfo"]["hadoopinfo"]["hadoop-version"] = self.getMasterIP()
+ serviceDict["clusterinfo"]["hadoopinfo"]["hadoop_vip"] = self.getMasterIP()
# section : virtualClusters
@@ -303,6 +277,7 @@ def serviceParse(self):
serviceDict["clusterinfo"]["pyloninfo"]["grafana_uri"] = self.getGrafanaUri()
serviceDict["clusterinfo"]["pyloninfo"]["pai_web_portal_uri"] = self.getPaiWebPortalUri()
+
# section: machineinfo
serviceDict["machineinfo"] = self.rawData["clusterConfiguration"]["machine-sku"]
@@ -314,7 +289,6 @@ def serviceParse(self):
for host in self.rawData["clusterConfiguration"]["machine-list"]:
hostname = host["hostname"]
- self.labelExpend(host)
host["nodename"] = host["hostip"]
host["machinetype"] = host["machine-type"]
host["ip"] = host["hostip"]
diff --git a/deployment/paiLibrary/common/linux_shell.py b/deployment/paiLibrary/common/linux_shell.py
index 7445e32cf2..140ebfa7b5 100644
--- a/deployment/paiLibrary/common/linux_shell.py
+++ b/deployment/paiLibrary/common/linux_shell.py
@@ -25,6 +25,18 @@
+def execute_shell_raise(shell_cmd, error_msg):
+
+    try:
+        subprocess.check_call( shell_cmd, shell=True )
+
+    except subprocess.CalledProcessError:
+        logger.error(error_msg)
+        raise
+
+
+
+
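+# Example (sketch) of how a caller can react to the raised error, e.g. to retry:
+#
+#     try:
+#         execute_shell_raise("/bin/bash start.sh", "start script failed")
+#     except subprocess.CalledProcessError:
+#         pass  # caller decides whether to retry, clean up, or exit
+#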
def execute_shell(shell_cmd, error_msg):
try:
diff --git a/deployment/paiLibrary/paiService/service_management_start.py b/deployment/paiLibrary/paiService/service_management_start.py
index dda63b88c9..ab406c972d 100644
--- a/deployment/paiLibrary/paiService/service_management_start.py
+++ b/deployment/paiLibrary/paiService/service_management_start.py
@@ -15,8 +15,10 @@
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
+import sys
+import subprocess
import logging
+import time
import logging.config
#
from . import service_start
@@ -41,6 +43,10 @@ def __init__(self, cluster_object_model, service_list = None, **kwargs):
else:
self.service_list = service_list
+        self.retry_times = 5
+        if "retry_times" in kwargs:
+            self.retry_times = kwargs["retry_times"]
+
def get_service_list(self):
@@ -78,21 +84,40 @@ def start(self, serv):
continue
self.start(fat_serv)
+        try_counts = 0
+        while True:
+
+            try:
+                self.logger.info("-----------------------------------------------------------")
+                self.logger.info("Begin to generate service {0}'s template file".format(serv))
+                service_template_generater = service_template_generate.service_template_generate(self.cluster_object_model, serv, service_conf)
+                service_template_generater.run()
+
+                self.logger.info("Begin to start service: [ {0} ]".format(serv))
+                service_starter.run()
+
+                self.logger.info("Begin to clean service {0}'s generated template files".format(serv))
+                service_template_cleaner = service_template_clean.service_template_clean(serv, service_conf)
+                service_template_cleaner.run()
+
+                self.logger.info("Successfully start {0}".format(serv))
+                self.logger.info("-----------------------------------------------------------")
+                break
- self.logger.info("-----------------------------------------------------------")
- self.logger.info("Begin to generate service {0}'s template file".format(serv))
- service_template_generater = service_template_generate.service_template_generate(self.cluster_object_model, serv, service_conf)
- service_template_generater.run()
+            except subprocess.CalledProcessError:
+                self.logger.error("Failed to start service {0}".format(serv))
+                self.logger.info("-----------------------------------------------------------")
- self.logger.info("Begin to start service: [ {0} ]".format(serv))
- service_starter.run()
+                try_counts = try_counts + 1
+                if try_counts >= self.retry_times:
+                    self.logger.error("Retried {0} times, but service {1} failed to start. Please check it.".format(self.retry_times, serv))
+                    sys.exit(1)
- self.logger.info("Begin to clean all service's generated template file".format(serv))
- service_template_cleaner = service_template_clean.service_template_clean(serv, service_conf)
- service_template_cleaner.run()
+                time.sleep(10)
- self.logger.info("Successfully start {0}".format(serv))
- self.logger.info("-----------------------------------------------------------")
+            except Exception:
+                self.logger.exception("Unexpected error when starting service {0}".format(serv))
+                sys.exit(1)
self.done_dict[serv] = True
diff --git a/deployment/paiLibrary/paiService/service_start.py b/deployment/paiLibrary/paiService/service_start.py
index 7bda9abc8f..5757c55c38 100644
--- a/deployment/paiLibrary/paiService/service_start.py
+++ b/deployment/paiLibrary/paiService/service_start.py
@@ -42,7 +42,7 @@ def start(self):
cmd = "/bin/bash {0}".format(start_script)
err_msg = "Failed to execute the start script of service {0}".format(self.service_name)
self.logger.info("Begin to execute service {0}'s start script.".format(self.service_name))
- linux_shell.execute_shell(cmd, err_msg)
+ linux_shell.execute_shell_raise(cmd, err_msg)
diff --git a/deployment/paiLibrary/paiService/service_template_generate.py b/deployment/paiLibrary/paiService/service_template_generate.py
index cb992b6f05..3615d973c6 100644
--- a/deployment/paiLibrary/paiService/service_template_generate.py
+++ b/deployment/paiLibrary/paiService/service_template_generate.py
@@ -18,7 +18,7 @@
import logging
import logging.config
-
+import yaml
from ..common import template_handler
from ..common import file_handler
@@ -58,6 +58,43 @@ def template_mapper(self):
+ # Add "NodeAffinity" to service deployment yaml file
+ # according to the "deploy-rules" in service.yaml config file
+ # Currently support "In" and "NotIn" rules or the combination of them.
+ def add_deploy_rule_to_yaml(self, str_src_yaml):
+
+ service_deploy_kind_list = ['DaemonSet', 'Deployment', 'StatefulSets', 'Pod']
+
+ config = yaml.load(str_src_yaml)
+
+ # judge whether it's a service deploy file, eg. exclude configmap
+ if 'kind' in config and config['kind'] in service_deploy_kind_list:
+ match_expressions_arr = []
+
+ deploy_rules = self.service_conf['deploy-rules']
+ for operator, label in deploy_rules.items():
+ match_expression = dict()
+ if operator.lower() == 'in':
+ match_expression['operator'] = 'In'
+ if operator.lower() == 'notin':
+ match_expression['operator'] = 'NotIn'
+
+ match_expression['key'] = label
+ match_expression['values'] = ['true']
+ match_expressions_arr.append(match_expression)
+
+ config['spec']['template']['spec']['affinity'] = {'nodeAffinity': \
+ {'requiredDuringSchedulingIgnoredDuringExecution': {'nodeSelectorTerms': \
+ [{'matchExpressions': match_expressions_arr}]}}}
+
+ else:
+ logging.info("It is not a service deploy file! Only support " + str(service_deploy_kind_list))
+ return str_src_yaml
+
+ return yaml.dump(config, default_flow_style=False)
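+    # Illustration (sketch): given a service.yaml that declares
+    #
+    #     deploy-rules:
+    #       in: pai-master
+    #
+    # the generated deployment yaml gains a section equivalent to:
+    #
+    #     affinity:
+    #       nodeAffinity:
+    #         requiredDuringSchedulingIgnoredDuringExecution:
+    #           nodeSelectorTerms:
+    #           - matchExpressions:
+    #             - key: pai-master
+    #               operator: In
+    #               values: ['true']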
+
+
+
def generate_template(self):
self.logger.info("Begin to generate the template file in service {0}'s configuration.".format(self.service_name))
@@ -84,8 +121,14 @@ def generate_template(self):
self.logger.exception("failed to generate template file from %s with dict %s", template_path, service_conf_dict)
raise e
+            # apply deploy-rules to service deployment yaml files (skip "delete" templates)
+            if "deploy-rules" in self.service_conf and template_file.find("yaml") >= 0 and template_file.find("delete") == -1:
+                generated_template = self.add_deploy_rule_to_yaml(generated_template)
+
file_handler.write_generated_file(target_path, generated_template)
+
+
self.logger.info("The template file of service {0} is generated.".format(self.service_name))
diff --git a/deployment/quick-start/cluster-configuration.yaml.template b/deployment/quick-start/cluster-configuration.yaml.template
index 5a2c111439..4d048e7bda 100644
--- a/deployment/quick-start/cluster-configuration.yaml.template
+++ b/deployment/quick-start/cluster-configuration.yaml.template
@@ -51,5 +51,4 @@ machine-list:
{%- if loop.length == 1 %}
pai-worker: "true"
{%- endif %}
- node-exporter: "true"
{%- endfor %}
diff --git a/deployment/quick-start/kubernetes-configuration.yaml.template b/deployment/quick-start/kubernetes-configuration.yaml.template
index 9aaf623f1e..1a86bd704c 100644
--- a/deployment/quick-start/kubernetes-configuration.yaml.template
+++ b/deployment/quick-start/kubernetes-configuration.yaml.template
@@ -30,15 +30,17 @@ kubernetes:
# The docker registry used in the k8s deployment. If you can access to gcr, we suggest to use gcr.
docker-registry: gcr.io/google_containers
# http://gcr.io/google_containers/hyperkube. Or the tag in your registry.
- hyperkube-version: v1.9.4
+ hyperkube-version: v1.9.9
# http://gcr.io/google_containers/etcd. Or the tag in your registry.
# If you are not familiar with etcd, please don't change it.
etcd-version: 3.2.17
# http://gcr.io/google_containers/kube-apiserver. Or the tag in your registry.
- apiserver-version: v1.9.4
+ apiserver-version: v1.9.9
# http://gcr.io/google_containers/kube-scheduler. Or the tag in your registry.
- kube-scheduler-version: v1.9.4
+ kube-scheduler-version: v1.9.9
# http://gcr.io/google_containers/kube-controller-manager
- kube-controller-manager-version: v1.9.4
+ kube-controller-manager-version: v1.9.9
# http://gcr.io/google_containers/kubernetes-dashboard-amd64
dashboard-version: v1.8.3
+  # The path to store etcd data.
+ etcd-data-path: "/var/etcd"
diff --git a/deployment/quick-start/services-configuration.yaml.template b/deployment/quick-start/services-configuration.yaml.template
index 1a6a578694..35b1b9fffb 100644
--- a/deployment/quick-start/services-configuration.yaml.template
+++ b/deployment/quick-start/services-configuration.yaml.template
@@ -47,7 +47,6 @@ hadoop:
# More about hadoop-ai please follow the link: https://github.com/Microsoft/pai/tree/master/hadoop-ai.
# Notice: the name should be hadoop-{hadoop-version}.tar.gz
custom-hadoop-binary-path: /pathHadoop/hadoop-2.9.0.tar.gz
- hadoop-version: 2.9.0
# Step 1 of 4 to set up Hadoop queues.
# Define all virtual clusters, equivalent concept of Hadoop queues:
# - Each VC will be assigned with (capacity / total_capacity * 100%) of the resources in the system.
@@ -74,6 +73,7 @@ frameworklauncher:
frameworklauncher-port: 9086
+
restserver:
# port for rest api server
server-port: 9186
@@ -104,7 +104,20 @@ prometheus:
# How frequently to scrape targets
scrape_interval: 30
+ # port for yarn exporter
+ yarn_exporter_port: 9459
+
+ # if you want to enable alert manager to send alert email, uncomment following lines and fill
+ # right values.
+ # alerting:
+ # alert_manager_port: 9093
+ # alert_receiver: alert@example.com
+ # smtp_url: smtp.gmail.com:587
+ # smtp_from: foo_bar@gmail.com
+ # smtp_auth_username: user@gmail.com
+ # smtp_auth_password: gmail_password
+
pylon:
# port of pylon
- port: 80
+ port: 80
\ No newline at end of file
diff --git a/docs/documentation.md b/docs/documentation.md
index e69322536e..25a1c78cc9 100644
--- a/docs/documentation.md
+++ b/docs/documentation.md
@@ -1,10 +1,13 @@
## Documentation
+
### Architecture and OpenPAI core
- [System architecture](./system_architecture.md)
- [Job Scheduling: scheduling resources across OpenPAI jobs](../hadoop-ai/README.md)
- [FrameworkLauncher: launching customize Framework by Launcher Service](../frameworklauncher/README.md)
+
### Configuration and API
- [Configuration: customize OpenPAI via its configuration](./pai-management/doc/how-to-write-pai-configuration.md)
- [OpenPAI Programming Guides](../examples/README.md)
- [Restful API Docs](../rest-server/README.md)
+### [FAQs](./faq.md)
\ No newline at end of file
diff --git a/docs/faq.md b/docs/faq.md
new file mode 100644
index 0000000000..5a067d6ab7
--- /dev/null
+++ b/docs/faq.md
@@ -0,0 +1,113 @@
+# OpenPAI FAQs
+
+### Q: Why is it not recommended to deploy the master node on a GPU server and run jobs on it?
+
+A: Running jobs on the master node is not recommended; this avoids overloading the master node and affecting the stability of the cluster.
+
+### Q: When OpenPAI has multiple master nodes, can they be deployed across multiple subnets and still communicate normally?
+
+A: We recommend deploying them on the same subnet. In theory, as long as the network is routable between them, the deployment works. In practice, the cluster has high communication requirements, and the network delay between different subnets is usually high and connectivity often unreliable.
+
+### Q: If a job retries multiple times, how can the user diagnose the cause?
+
+A: Users can find historical job logs through YARN. Please check the answer of [issue-1072](https://github.com/Microsoft/pai/issues/1072) and the introduction in [job_log.md](./job_log.md).
+
+### Q: How to diagnose job problems through logs?
+
+A: Please check the introduction in [job_log.md](./job_log.md).
+
+### Q: To improve cluster utilization, users would like a VC to be able to use up all cluster resources when others don't use them.
+
+A: By default, a VC can use up all cluster resources when others don't use them. OpenPAI uses YARN's [capacity scheduler](https://hadoop.apache.org/docs/r1.2.1/capacity_scheduler.html) for resource allocation. maximum-capacity defines a limit beyond which a queue cannot use the capacity of the cluster, which provides a means to limit how much excess capacity a queue can use. Its default value of -1 implies a queue can use the complete capacity of the cluster. The [OpenPAI capacity scheduler](../pai-management/bootstrap/hadoop-resource-manager/hadoop-resource-manager-configuration/capacity-scheduler.xml.template) does not set this item, so there is no limit.
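+
+For example (illustrative numbers): if VC `a` is assigned capacity 30 out of a total capacity of 100, it is guaranteed 30% of the cluster's resources, and because maximum-capacity is left unset it can temporarily use up to 100% of the cluster while the other VCs are idle.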
+
+### Q: To ensure one user cannot occupy excessive resources, an operator would like to set a quota constraint for individual users.
+
+A: OpenPAI uses YARN's capacity scheduler for resource allocation. Users can configure the items "[yarn.scheduler.capacity.root.{{ queueName }}.user-limit-factor and yarn.scheduler.capacity.root.{{ queueName }}.minimum-user-limit-percent](https://hadoop.apache.org/docs/r1.2.1/capacity_scheduler.html)" to control a user's resource quota. These configuration items are in the file [capacity-scheduler.xml.template](../pai-management/bootstrap/hadoop-resource-manager/hadoop-resource-manager-configuration/capacity-scheduler.xml.template) of OpenPAI.
+
+```xml
+<property>
+  <name>yarn.scheduler.capacity.root.{{ queueName }}.user-limit-factor</name>
+  <value>100</value>
+</property>
+
+<property>
+  <name>yarn.scheduler.capacity.root.{{ queueName }}.minimum-user-limit-percent</name>
+  <value>100</value>
+</property>
+```
+
+For yarn.scheduler.capacity.root.{{ queueName }}.user-limit-factor:
+- OpenPAI's default value of 100 implies no user limits are imposed.
+- Official explanation:
+
+```
+The multiple of the queue capacity which can be configured to allow a single user to acquire more slots. By default this is set to 1 which ensures that a single user can never take more than the queue's configured capacity irrespective of how idle the cluster is.
+```
+
+- Note: this configuration controls a user's resource usage beyond the current VC. VC a can preempt resources occupied by VC b before the job completes.
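+- For example (illustrative numbers): with a queue capacity of 30% and user-limit-factor set to 2, a single user could acquire at most 2 × 30% = 60% of the cluster's resources; the OpenPAI default of 100 effectively removes this per-user cap.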
+
+For yarn.scheduler.capacity.root.{{ queueName }}.minimum-user-limit-percent:
+- OpenPAI's default value of 100 implies no user limits are imposed.
+- Official explanation:
+
+```
+Each queue enforces a limit on the percentage of resources allocated to a user at any given time, if there is competition for them. This user limit can vary between a minimum and maximum value. The former depends on the number of users who have submitted jobs, and the latter is set to this property value. For example, suppose the value of this property is 25. If two users have submitted jobs to a queue, no single user can use more than 50% of the queue resources. If a third user submits a job, no single user can use more than 33% of the queue resources. With 4 or more users, no user can use more than 25% of the queue's resources. A value of 100 implies no user limits are imposed.
+```
+
+- Note: this configuration controls users' resource usage within the current VC. User a cannot preempt resources occupied by user b before the job completes.
+
+### Q: How to configure virtual cluster capacity?
+
+A: Please refer to [configure virtual cluster capacity](../pai-management/doc/how-to-write-pai-configuration.md#configure_vc_capacity)
+
+### Q: How to use a private docker registry job image when submitting an OpenPAI job?
+
+A: Please refer to [job_tutorial.md](./job_tutorial.md) for how to configure the auth file in the job submission json file:
+
+If you're using a private Docker registry which needs authentication for image pull and is different from the registry used during deployment,
+please create an authentication file in the following format, upload it to HDFS and specify the path in the `authFile` parameter in the config file.
+
+- (1) Create an authFile
+
+authfile content:
+
+```
+userprivateimage.azurecr.io
+username
+password
+```
+
+Note: userprivateimage.azurecr.io is the docker_registry_server
+
+- (2) [Upload it to HDFS](../pai-management/doc/hdfs.md#WebHDFS).
+
+Example file path on HDFS: hdfs://master_ip:9000/user/paidemo/authfile
+
+- (3) Specify the path in the `authFile` parameter
+
+OpenPAI job json file example:
+
+```
+{
+ "jobName": "paidemo",
+ "image": "userprivateimage.azurecr.io/demo4pai:test",
+ "dataDir": "hdfs://master_ip:9000/user/paidemo/data",
+ "outputDir": "hdfs://master_ip:9000/user/paidemo/output",
+ "codeDir": "hdfs://master_ip:9000/user/paidemo/code",
+ "authFile":"hdfs://master_ip:9000/user/paidemo/authfile",
+ "taskRoles": [
+ {
+ "name": "demo4pai",
+ "taskNumber": 1,
+ "cpuNumber": 2,
+ "memoryMB": 8192,
+ "gpuNumber": 1,
+ "command": " cd /home/test && bash train.sh"
+ }
+ ]
+}
+```
+
+*NOTE*:
+- If you're using a private registry at Docker Hub, you should use `docker.io` for `docker_registry_server` field in the authentication file.
+- Related issue: [1215](https://github.com/Microsoft/pai/issues/1215)
diff --git a/docs/images/PAI_job_am.png b/docs/images/PAI_job_am.png
new file mode 100644
index 0000000000..ef1d8810fe
Binary files /dev/null and b/docs/images/PAI_job_am.png differ
diff --git a/docs/images/PAI_job_retry.png b/docs/images/PAI_job_retry.png
new file mode 100644
index 0000000000..1fe1fac004
Binary files /dev/null and b/docs/images/PAI_job_retry.png differ
diff --git a/docs/images/PAI_job_task_container.png b/docs/images/PAI_job_task_container.png
new file mode 100644
index 0000000000..971fd57194
Binary files /dev/null and b/docs/images/PAI_job_task_container.png differ
diff --git a/docs/images/PAI_submit_online_1.png b/docs/images/PAI_submit_online_1.png
new file mode 100644
index 0000000000..eb745774ef
Binary files /dev/null and b/docs/images/PAI_submit_online_1.png differ
diff --git a/docs/images/PAI_submit_online_2.png b/docs/images/PAI_submit_online_2.png
new file mode 100644
index 0000000000..ea604b1152
Binary files /dev/null and b/docs/images/PAI_submit_online_2.png differ
diff --git a/docs/images/PAI_submit_online_3.png b/docs/images/PAI_submit_online_3.png
new file mode 100644
index 0000000000..de74ad8316
Binary files /dev/null and b/docs/images/PAI_submit_online_3.png differ
diff --git a/docs/images/PAI_submit_online_4.png b/docs/images/PAI_submit_online_4.png
new file mode 100644
index 0000000000..4205711edf
Binary files /dev/null and b/docs/images/PAI_submit_online_4.png differ
diff --git a/docs/job_docker_env.md b/docs/job_docker_env.md
new file mode 100644
index 0000000000..e6675f80f9
--- /dev/null
+++ b/docs/job_docker_env.md
@@ -0,0 +1,26 @@
+# Use docker to package the job environment dependencies
+
+The system launches a deep learning job in one or more Docker containers. A Docker image is required in advance.
+The system provides a base Docker image with HDFS, CUDA and cuDNN support, based on which users can build their own custom Docker images.
+
+To build a base Docker image, for example [Dockerfile.build.base](../job-tutorial/Dockerfiles/cuda8.0-cudnn6/Dockerfile.build.base), run:
+```sh
+docker build -f Dockerfiles/Dockerfile.build.base -t pai.build.base:hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04 Dockerfiles/
+```
+
+Then a custom docker image can be built based on it by adding `FROM pai.build.base:hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04` in the Dockerfile.
+
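+A minimal custom Dockerfile might look like this (a sketch; the installed package is only an illustration):
+
+```dockerfile
+FROM pai.build.base:hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04
+
+# add whatever your job needs on top of the base image (example only)
+RUN apt-get update && apt-get install -y --no-install-recommends git && rm -rf /var/lib/apt/lists/*
+```
+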
+As an example, we customize a TensorFlow Docker image using [Dockerfile.run.tensorflow](../job-tutorial/Dockerfiles/cuda8.0-cudnn6/Dockerfile.run.tensorflow):
+
+```sh
+docker build -f Dockerfiles/Dockerfile.run.tensorflow -t pai.run.tensorflow Dockerfiles/
+```
+
+Next, push the built image to a docker registry so that every node in the system can access it:
+```sh
+docker tag pai.run.tensorflow your_docker_registry/pai.run.tensorflow
+docker push your_docker_registry/pai.run.tensorflow
+```
+
+And the image is ready to serve. Note that the scripts above assume the docker registry is deployed locally.
+The actual scripts can vary depending on the configuration of the Docker registry.
\ No newline at end of file
diff --git a/docs/job_log.md b/docs/job_log.md
new file mode 100644
index 0000000000..f314f1a246
--- /dev/null
+++ b/docs/job_log.md
@@ -0,0 +1,157 @@
+
+# How to diagnose job problems through logs
+
+## Table of Contents
+- [1 Diagnose the job failure reason](#job)
+  - [1.1 View the job's launcher AM log](#amlog)
+  - [1.2 View each task container's log](#tasklog)
+  - [1.3 Job exitStatus Convention](#exit)
+- [2 Diagnose why a job retried many times](#retry)
+
+## 1 Diagnose the job failure reason
+
+An OpenPAI job is launched by the [framework launcher](../frameworklauncher/doc/USERMANUAL.md), and each task container is managed by the launcher application master (AM).
+
+The launcher AM manages each job's tasks according to its customized feature requirements. You can refer to the [frameworklauncher architecture](../frameworklauncher/doc/USERMANUAL.md#Architecture) document to understand the relationship between them.
+
+When diagnosing job problems through logs, we should pay attention to the job's launcher AM log (to get the main reason) and then zoom in on the job's task container logs.
+
+### 1.1 View the job's launcher AM log
+
+Check the summary, and pay attention to the highlights.
+
+![PAI_job_am](./images/PAI_job_am.png)
+
+Log example:
+
+```
+ Exit Diagnostics:
+ [ExitStatus]: LAUNCHER_EXIT_STATUS_UNDEFINED
+ [ExitCode]: 177
+ [ExitDiagnostics]:
+ ExitStatus undefined in Launcher, maybe UserApplication itself failed.
+ [ExitType]: UNKNOWN
+ ________________________________________________________________________________________________________________________________________________________________________________________________________
+ [ExitCustomizedDiagnostics]:
+ [ExitCode]: 134
+ [ExitDiagnostics]:
+ Exception from container-launch.
+ Container id: container_e9878_1532412068340_0018_01_000002
+ Exit code: 134
+ Exception message: Error: No such object: cntk-test-4621-17223-container_e9878_1532412068340_0018_01_000002
+
+ Stack trace: ExitCodeException exitCode=134: Error: No such object: cntk-test-4621-17223-container_e9878_1532412068340_0018_01_000002
+
+ at org.apache.hadoop.util.Shell.runCommand(Shell.java:545)
+ at org.apache.hadoop.util.Shell.run(Shell.java:456)
+ at org.apache.hadoop.util.Shell$ShellCommandExecutor.execute(Shell.java:722)
+ at org.apache.hadoop.yarn.server.nodemanager.DefaultContainerExecutor.launchContainer(DefaultContainerExecutor.java:212)
+ at org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainerLaunch.call(ContainerLaunch.java:302)
+ at org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainerLaunch.call(ContainerLaunch.java:82)
+ at java.util.concurrent.FutureTask.run(FutureTask.java:266)
+ at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
+ at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
+ at java.lang.Thread.run(Thread.java:748)
+
+ Shell output: [DEBUG] EXIT signal received in yarn container, exiting ...
+ [DEBUG] cntk-test-4621-17223-container_e9878_1532412068340_0018_01_000002 does not exist.
+
+
+ Container exited with a non-zero exit code 134
+
+ ________________________________________________________________________________________________________________________________________________________________________________________________________
+ [ExitCustomizedDiagnostics]:
+
+ [g2p_train]: [LastCompletedTask]: [TaskStatus]:
+ {
+ "taskIndex" : 0,
+ "taskRoleName" : "g2p_train",
+ "taskState" : "TASK_COMPLETED",
+ "taskRetryPolicyState" : {
+ "retriedCount" : 0,
+ "succeededRetriedCount" : 0,
+ "transientNormalRetriedCount" : 0,
+ "transientConflictRetriedCount" : 0,
+ "nonTransientRetriedCount" : 0,
+ "unKnownRetriedCount" : 0
+ },
+ "taskCreatedTimestamp" : 1532419805920,
+ "taskCompletedTimestamp" : 1532494535939,
+ "taskServiceStatus" : {
+ "serviceVersion" : 0
+ },
+ "containerId" : "container_e9878_1532412068340_0018_01_000002",
+ "containerHost" : "10.151.40.165",
+ "containerIp" : "10.151.40.165",
+ "containerPorts" : "web:22787;grpc:22788;http:22789;ssh:22790;",
+ "containerGpus" : 1,
+ "containerLogHttpAddress" : "http://10.151.40.165:8042/node/containerlogs/container_e9878_1532412068340_0018_01_000002/core/",
+ "containerConnectionLostCount" : 0,
+ "containerIsDecommissioning" : null,
+ "containerLaunchedTimestamp" : 1532419818661,
+ "containerCompletedTimestamp" : 1532494535935,
+ "containerExitCode" : 134,
+ "containerExitDiagnostics" : "Exception from container-launch.\nContainer id: container_e9878_1532412068340_0018_01_000002\nExit code: 134\nException message: Error: No such object: cntk-test-4621-17223-container_e9878_1532412068340_0018_01_000002\n\nStack trace: ExitCodeException exitCode=134: Error: No such object: cntk-test-4621-17223-container_e9878_1532412068340_0018_01_000002\n\n\tat org.apache.hadoop.util.Shell.runCommand(Shell.java:545)\n\tat org.apache.hadoop.util.Shell.run(Shell.java:456)\n\tat org.apache.hadoop.util.Shell$ShellCommandExecutor.execute(Shell.java:722)\n\tat org.apache.hadoop.yarn.server.nodemanager.DefaultContainerExecutor.launchContainer(DefaultContainerExecutor.java:212)\n\tat org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainerLaunch.call(ContainerLaunch.java:302)\n\tat org.apache.hadoop.yarn.server.nodemanager.containermanager.launcher.ContainerLaunch.call(ContainerLaunch.java:82)\n\tat java.util.concurrent.FutureTask.run(FutureTask.java:266)\n\tat java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)\n\tat java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)\n\tat java.lang.Thread.run(Thread.java:748)\n\nShell output: [DEBUG] EXIT signal received in yarn container, exiting ...\n[DEBUG] cntk-test-4621-17223-container_e9878_1532412068340_0018_01_000002 does not exist.\n\n\nContainer exited with a non-zero exit code 134\n",
+ "containerExitType" : "UNKNOWN"
+ }
+ [g2p_train]: [LastCompletedTask]: [ContainerLocations]:
+ ContainerLogHttpAddress: http://10.151.40.165:8042/node/containerlogs/container_e9878_1532412068340_0018_01_000002/core/
+ AppCacheNetworkPath: 10.151.40.165:/var/lib/hadoopdata/nm-local-dir/usercache/core/appcache/application_1532412068340_0018
+ ContainerLogNetworkPath: 10.151.40.165:/var/lib/yarn/userlogs/application_1532412068340_0018/container_e9878_1532412068340_0018_01_000002
+ ________________________________________________________________________________________________________________________________________________________________________________________________________
+ [ApplicationCompletionReason]: [g2p_train]: FailedTaskCount 1 has reached MinFailedTaskCount 1.
+
+```
+
+Please pay attention to these lines to diagnose the job failure reason:
+
+| line head | info from the example log above |
+| --- | --- |
+| [ExitDiagnostics] | ExitStatus undefined in Launcher, maybe UserApplication itself failed.|
+| [ExitCode] | 134|
+| Exception message | No such object: cntk-test-4621-17223-container_e9878_1532412068340_0018_01_000002. |
+| Shell output | [DEBUG] EXIT signal received in yarn container, exiting ...[DEBUG] cntk-test-4621-17223-container_e9878_1532412068340_0018_01_000002 does not exist.|
+|ContainerLogHttpAddress| ```http://10.151.40.165:8042/node/containerlogs/container_e9878_1532412068340_0018_01_000002/core/ ```|
+|AppCacheNetworkPath|10.151.40.165:/var/lib/hadoopdata/nm-local-dir/usercache/core/appcache/application_1532412068340_0018|
+|ContainerLogNetworkPath|10.151.40.165:/var/lib/yarn/userlogs/application_1532412068340_0018/container_e9878_1532412068340_0018_01_000002|
+|[ApplicationCompletionReason]| [g2p_train]: FailedTaskCount 1 has reached MinFailedTaskCount 1.|
+
+From the log above we can get the following information:
+1. The UserApplication itself failed.
+2. The direct cause is that cntk-test-4621-17223-container_e9878_1532412068340_0018_01_000002 does not exist.
+3. We can then visit ```http://10.151.40.165:8042/node/containerlogs/container_e9878_1532412068340_0018_01_000002/core/``` as described in section 1.2 to find the task failure reason.
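+
+If you have shell access to a machine with the Hadoop client configured, the same logs can also be fetched with the YARN CLI (a sketch; the application id is derived from the container id prefix):
+
+```sh
+# container_e9878_1532412068340_0018_01_000002 -> application_1532412068340_0018
+yarn logs -applicationId application_1532412068340_0018
+```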
+
+
+### 1.2 View each task container's log
+
+- Check the log of the failed task that triggered the whole attempt failure, i.e.
+
+ContainerLogHttpAddress:
+
+```http://10.151.40.165:8042/node/containerlogs/container_e9878_1532412068340_0018_01_000002/core/```
+
+- Or check other tasks logs:
+
+![PAI_job_retry](./images/PAI_job_retry.png)
+
+### 1.3 Job exitStatus Convention
+
+You can check all the defined ExitStatus values, with their ExitType and ExitDiagnostics, in the framework launcher [USERMANUAL.md](../frameworklauncher/doc/USERMANUAL.md#ExitStatus_Convention)
+
+## 2 Diagnose why a job retried many times
+
+If the framework retried many times, check the other attempts by searching the framework name in the YARN web UI:
+
+- Visit YARN URL: ```http://master_ip/yarn/``` or ```http://master_ip:8088```
+- Search by the job name keywords
+
+![PAI_job_task_container](./images/PAI_job_task_container.png)
+
+- Click the ID or the History keyword to go to the job AM log pages. Then the user can refer to section 1 to diagnose the failure reason of the historical retries.
+
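+If the web UI is not reachable, the application list can also be queried with the YARN CLI (a sketch, assuming a configured Hadoop client; `<keyword>` is a placeholder):
+
+```sh
+yarn application -list -appStates ALL | grep <keyword>
+```
+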
+#### Note:
+- History job number limit: currently, OpenPAI's YARN only stores 1000 jobs' logs, so users may not be able to find some old jobs' logs, for example those of frequently retried jobs.
+- Job container log rotation: to prevent historical logs from growing too large, OpenPAI configures [docker log rotation](https://docs.docker.com/config/containers/logging/json-file/) through the file [docker-daemon.json](../pai-management/k8sPaiLibrary/maintaintool/docker-daemon.json).
+  The default configuration:
+  - "max-size": "100m": the maximum size of the log before it is rolled. A positive integer plus a modifier representing the unit of measure (k, m, or g). Docker's default is -1 (unlimited).
+  - "max-file": "10": the maximum number of log files that can be present. If rolling the logs creates excess files, the oldest file is removed. Only effective when max-size is also set. A positive integer. Docker's default is 1.
diff --git a/docs/job_tutorial.md b/docs/job_tutorial.md
index 8afbaaef48..915246cd3b 100644
--- a/docs/job_tutorial.md
+++ b/docs/job_tutorial.md
@@ -20,47 +20,39 @@
# How to Run a Deep Learning Job
-## Introduction
-
-The system supports major deep learning frameworks, including CNTK and TensorFlow, etc.
-It also supports other type of workload through a customized docker image.
+OpenPAI supports major deep learning frameworks, including CNTK and TensorFlow.
+It also supports other types of jobs through a customized docker image.
Users need to prepare a config file and submit it for a job submission.
This guide introduces the details of job submission.
+## Table of Contents:
-## Prerequisites
-
-This guide assumes the system has already been deployed properly and a docker registry is available to store docker images.
-
+- [Quick start: how to write and submit a CIFAR-10 job](#quickstart)
+- [Write a customized job](#writejob)
+ - [Prerequisites](#prerequisites)
+ - [Use docker to package the job environment dependencies](#docker)
+ - [Write a job json configuration file](#jobjson)
+  - [Job runtime environment variables](#envvar)
+ - [A deep learning job example](#example)
+ - [Job submission steps](#submission)
+- [How to debug a job](#debug)
+- [Learn more job examples](#moreexample)
-## Docker image
+## Quick start: how to write and submit a CIFAR-10 job
-The system launches a deep learning job in one or more Docker containers. A Docker images is required in advance.
-The system provides a base Docker images with HDFS, CUDA and cuDNN support, based on which users can build their own custom Docker images.
+Please refer to this [document](../examples/README.md#quickstart) for how to write and submit a CIFAR-10 job.
-To build a base Docker image, for example [Dockerfile.build.base](../job-tutorial/Dockerfiles/cuda8.0-cudnn6/Dockerfile.build.base), run:
-```sh
-docker build -f Dockerfiles/Dockerfile.build.base -t pai.build.base:hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04 Dockerfiles/
-```
+## Write a customized job
-Then a custom docker image can be built based on it by adding `FROM pai.build.base:hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04` in the Dockerfile.
+### Prerequisites
-As an example, we customize a TensorFlow Docker image using [Dockerfile.run.tensorflow](../job-tutorial/Dockerfiles/cuda8.0-cudnn6/Dockerfile.run.tensorflow):
-```sh
-docker build -f Dockerfiles/Dockerfile.run.tensorflow -t pai.run.tensorflow Dockerfiles/
-```
-
-Next, the built image is pushed to a docker registry for every node in the system to access that image:
-```sh
-docker tag pai.run.tensorflow your_docker_registry/pai.run.tensorflow
-docker push your_docker_registry/pai.run.tensorflow
-```
+This guide assumes the system has already been deployed properly and a docker registry is available to store docker images.
-And the image is ready to serve. Note that above script assume the docker registry is deployed locally.
-Actual script can vary depending on the configuration of Docker registry.
+### Use docker to package the job environment dependencies
+OpenPAI packages the docker environment required by jobs for users to use. Users can refer to [job_docker_env.md](./job_docker_env.md) to customize an example's docker environment. If you have built a customized image and pushed it to Docker Hub, replace our pre-built image in the following example, `"image": "your_docker_registry/pai.run.tensorflow"`, with your own. OpenPAI has many pre-built images for different frameworks; in the [Learn more job examples](#moreexample) section, each example folder contains a pre-built docker environment.
-## Json config file for job submission
+### Write a job json configuration file
A json file describes the detailed configuration required for a job submission. The detailed format is shown below:
@@ -139,7 +131,7 @@ password
*NOTE*: If you're using a private registry at Docker Hub, you should use `docker.io` for `docker_registry_server` field in the authentication file.
-## Runtime environment
+### Job runtime environment variables
Each task in a job runs in one Docker container.
For a multi-task job, one task might communicate with others.
@@ -178,7 +170,7 @@ Below we show a complete list of environment variables accessible in a Docker co
| PAI_TASK_ROLE\_`$name`\_HOST_LIST | Host list for `PAI_TASK_ROLE_NAME == $name`, comma separated `ip:port` string, sorted by current task index in task role. Each task role has a host list environment variable with the corresponding task role name |
-## An example deep learning job
+### A deep learning job example
A distributed TensorFlow job is listed below as an example:
@@ -247,17 +239,12 @@ A distributed TensorFlow job is listed below as an example:
}
```
+### Job submission steps
-## More examples
-
-For more examples, please refer to [job examples directory](../examples).
-
-
-## Job submission
-
-1. Put the code and data on HDFS
+1. Put the code and data on [HDFS](https://github.com/Microsoft/pai/blob/master/pai-management/doc/hdfs.md)
- Use HDFS tools to upload your code and data to HDFS on the system. We upload a [Docker image](https://hub.docker.com/r/paiexample/pai.example.hdfs/) to DockerHub with built-in HDFS support.
+- Option-1: Use [WebHDFS](https://github.com/Microsoft/pai/blob/master/pai-management/doc/hdfs.md#WebHDFS) to upload your code and data to HDFS on the system.
+- Option-2: Use HDFS tools to upload your code and data to HDFS on the system. We upload a [Docker image](https://hub.docker.com/r/paiexample/pai.example.hdfs/) to DockerHub with built-in HDFS support.
Please refer to the [HDFS commands guide](https://hadoop.apache.org/docs/r2.7.2/hadoop-project-dist/hadoop-hdfs/HDFSCommands.html) for details.
2. Prepare a job config file
@@ -268,8 +255,10 @@ For more examples, please refer to [job examples directory](../examples).
Open web portal in a browser, click "Submit Job" and upload your config file.
-## SSH Connection
+## How to debug a job
+
You can ssh connect to a specified container either from outside or inside container.
+
### SSH connect from outside
1. Get job ssh connect info by invoking `/api/v1/jobs/:jobName/ssh` api or clicking the job detail page on webportal.
@@ -295,3 +284,7 @@ You can use `ssh $PAI_CURRENT_TASK_ROLE_NAME-$PAI_CURRENT_TASK_ROLE_CURRENT_TASK
```sh
ssh worker-0
```
+
+## Learn more job examples
+
+For more examples, please refer to [job examples directory](../examples).
diff --git a/docs/pai-management/doc/add-service.md b/docs/pai-management/doc/add-service.md
index 8fbb623138..f443bb97ed 100644
--- a/docs/pai-management/doc/add-service.md
+++ b/docs/pai-management/doc/add-service.md
@@ -123,7 +123,6 @@ prerequisite:
# paictl will generate the template file with the name "filename".template with jinja2.
template-list:
- - node-label.sh
- master-hbase.yaml
# The script about how to starting a service
@@ -177,9 +176,16 @@ This configuration consists of 7 parts.
- With node label and node selector, it is possible to assign a service pod to a specific node. For example, hadoop-name-node should be assigned to the node with the label master. And hadoop-data-node should be assigned to the node with the label worker.
- With node label, we are able to management a service on a specific node, but do not affect the same service on other nodes.
- Example
- Specify which label of roles you want to deploy your service to in [services-configuration.yaml](https://github.com/Microsoft/pai/blob/master/cluster-configuration/services-configuration.yaml)
- Use NodeAffinity to schedule pods onto the right machines, such as:
- [Hadoop name node](../../../src/hadoop-name-node/deploy/hadoop-name-node.yaml.template)
- [Hadoop data node](../../../src/hadoop-data-node/deploy/hadoop-data-node.yaml.template)
[DaemonSet](https://kubernetes.io/docs/concepts/workloads/controllers/daemonset/)
diff --git a/docs/pai-management/doc/cluster-bootup.md b/docs/pai-management/doc/cluster-bootup.md
index bdf1c68a12..5dda81f747 100644
--- a/docs/pai-management/doc/cluster-bootup.md
+++ b/docs/pai-management/doc/cluster-bootup.md
@@ -30,6 +30,7 @@ With the cluster being set up, the steps to bring PAI up on it are as follows:
## Customized deploy
### Steps:
+
- [Step 0. Prepare the dev-box](#c-step-0)
- [Step 1. Prepare the quick-start.yaml file](#c-step-1)
- [Step 2. Generate OpenPAI configuration files](#c-step-2)
@@ -49,6 +50,8 @@ Please refer to this [section](./how-to-setup-dev-box.md) for the customize sett
##### (1) Run your dev-box
+Notice that `dev-box` should run on a machine outside of the PAI cluster; it shouldn't run on any PAI cluster node.
+
```bash
# Pull the dev-box image from Docker Hub
@@ -60,7 +63,6 @@ sudo docker pull docker.io/openpai/dev-box
# By now, you can leave it as it is, we only mount those two directories into docker container for later usage.
sudo docker run -itd \
-e COLUMNS=$COLUMNS -e LINES=$LINES -e TERM=$TERM \
- -v /var/lib/docker:/var/lib/docker \
-v /var/run/docker.sock:/var/run/docker.sock \
-v /pathHadoop:/pathHadoop \
-v /pathConfiguration:/cluster-configuration \
@@ -76,7 +78,28 @@ sudo docker run -itd \
sudo docker exec -it dev-box /bin/bash
```
-##### (3) Go to pai-management working dir
+##### (3) Check out the latest release branch of OpenPAI
+
+```bash
+cd /pai
+
+# fetch tags
+git fetch --tags
+
+# please go to https://github.com/Microsoft/pai/releases to find the latest release.
+# check out a release branch, for example: v0.x.y
+git checkout v0.x.y
+
+# check current branch
+git status
+```
+- Successful result:
+```bash
+HEAD detached at v0.6.1
+nothing to commit, working directory clean
+```
+
+##### (4) Go to pai-management working dir
```bash
cd /pai/pai-management
@@ -99,11 +122,11 @@ sudo docker ps
### Step 1. Prepare the quick-start.yaml file
-Prepare the file under dev-box folder: /pai/pai-management/quick-start
+Prepare the file under the dev-box folder: /pai/pai-management/quick-start
-There is a example file under path: /pai/pai-management/quick-start/quick-start-example.yaml
+There is an example file under the path: /pai/pai-management/quick-start/quick-start-example.yaml
-An example yaml file is shown below. Note that you should change the IP address of the machine and ssh information accordingly.
+An example yaml file is shown below. Note that you should change the IP address of the machine and ssh information accordingly.
```yaml
# quick-start.yaml
@@ -139,12 +162,27 @@ Check all configruation items of the quick-start.yaml are correct.
After the quick-start.yaml is ready, use it to generate four configuration yaml files as follows.
+##### (1) Generate configuration files
+
```bash
cd /pai/pai-management
# cmd should be executed under /pai/pai-management directory in the dev-box.
python paictl.py cluster generate-configuration -i /pai/pai-management/quick-start/quick-start.yaml -o ~/pai-config -f
+
+```
+
+##### (2) Update the docker tag to the release version
+
+```bash
+vi ~/pai-config/services-configuration.yaml
+```
+
+For example, for the v0.x.y branch, change the docker-tag to v0.x.y.
+
+```yaml
+docker-tag: v0.x.y
```
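+
+For context, `docker-tag` sits in the docker registry section of `services-configuration.yaml`; a sketch of the surrounding fields (field names may vary by release, so check your generated file):
+
+```yaml
+cluster:
+  docker-registry-info:
+    docker-registry-domain: docker.io
+    docker-namespace: openpai
+    # set the tag to the release you checked out
+    docker-tag: v0.x.y
+```
+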
[Appendix: Default values in auto-generated configuration files](./how-to-write-pai-configuration.md#appendix)
@@ -163,7 +201,7 @@ Please refer to this [section](./how-to-write-pai-configuration.md) for the deta
### Step 3 (Optional). Customize OpenPAI configuration
-This method is for advanced users.
+This method is for advanced users.
The description of each field in these configuration files can be found in [A Guide For Cluster Configuration](how-to-write-pai-configuration.md).
@@ -178,9 +216,9 @@ If user want to customize configuration, please see the table below
- [configure customize docker repository](./how-to-write-pai-configuration.md#docker_repo)
- [configure OpenPAI admin user account](./how-to-write-pai-configuration.md#configure_user_acc)
- port / data folder etc.
- - [configure service entry](./how-to-write-pai-configuration.md#configure_service_entry)
+ - [configure service entry](./how-to-write-pai-configuration.md#configure_service_entry)
- [configure HDFS data / OpenPAI temp data folder](./how-to-write-pai-configuration.md#data_folder)
- - component version
+ - component version
- [configure K8s component version](./how-to-write-pai-configuration.md#k8s_component)
- [configure docker version](./how-to-write-pai-configuration.md#docker_repo)
- [configure nvidia gpu driver version](./how-to-write-pai-configuration.md#driver_version)
@@ -205,7 +243,7 @@ If user want to customize configuration, please see the table below
- [YARN / HDFS](./how-to-write-pai-service-configuration.md#hadoop)
- [Zookeeper](./how-to-write-pai-service-configuration.md#zookeeper)
- Monitor
- - [Prometheus / Exporter](./how-to-write-pai-service-configuration.md#prometheus)
+ - [Prometheus / Exporter](./how-to-write-pai-service-configuration.md#prometheus)
- [Grafana](./how-to-write-pai-service-configuration.md#grafana)
- [Appendix: Default values in auto-generated configuration files](./how-to-write-pai-configuration.md#appendix)
@@ -276,7 +314,7 @@ http://<master-ip>:9090/#!/pod?namespace=default
Where `<master-ip>` is the same as in the previous [section](#step-2).
-## Singlebox deploy
+## Singlebox deploy
### Steps:
@@ -284,9 +322,9 @@ Where `` is the same as in the previous [section](#step-2).
- Step 1. Prepare the quick-start.yaml file
-Prepare the file under dev-box folder: /pai/pai-management/quick-start
+Prepare the file under the dev-box folder: /pai/pai-management/quick-start
-There is a example file under path: /pai/pai-management/quick-start/quick-start-example.yaml
+There is an example file under the path: /pai/pai-management/quick-start/quick-start-example.yaml
An example yaml file is shown below. Note that you should change the IP address of the machine and ssh information accordingly.
@@ -335,7 +373,7 @@ ssh-password: pai-password
- [3 Getting help](#troubleshooting_3)
### 1 Troubleshooting OpenPAI services
-
+
#### 1.1 Diagnosing the problem
- Monitor
@@ -380,7 +418,7 @@ As OpenPAI services are deployed on kubernetes, please refer [debug kubernetes p
#### 1.2 Fix problem
- Update OpenPAI Configuration
-
+
Check and refine 4 yaml files:
```
@@ -390,7 +428,7 @@ Check and refine 4 yaml files:
- services-configuration.yaml
```
-- Customize config for specific service
+- Customize config for specific service
If you want to customize a single service, you can find the service config files at [pai-management/bootstrap](../../../src) and the image dockerfiles at [pai-management/src](../../../src).
@@ -398,7 +436,7 @@ If user want to customize single service, you could find service config file at
- Customize image dockerfile or code
-User could find service's image dockerfile at [pai-management/src](#pai-management/src) and customize them.
+User could find a service's image dockerfile at [pai-management/src](#pai-management/src) and customize it.
- Rebuild image
@@ -428,7 +466,7 @@ python paictl.py service stop \
[ -n service-name ]
```
-If the -n parameter is specified, only the given service, e.g. rest-server, webportal, watchdog, etc., will be stopped. If not, all PAI services will be stopped.
+If the -n parameter is specified, only the given service, e.g. rest-server, webportal, watchdog, etc., will be stopped. If not, all PAI services will be stopped.
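+
+For example, to stop only the rest-server service (a sketch, assuming your cluster configuration lives at /cluster-configuration):
+
+```bash
+python paictl.py service stop -p /cluster-configuration -n rest-server
+```
+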
2. ```Boot up a single or all OpenPAI services.```
@@ -440,8 +478,13 @@ Please refer [Kubernetes Troubleshoot Clusters](https://kubernetes.io/docs/tasks
### 3 Getting help
- [StackOverflow:](../../../docs/stackoverflow.md) If you have questions about OpenPAI, please submit a question on StackOverflow under the tag: openpai
- [Report an issue:](https://github.com/Microsoft/pai/wiki/Issue-tracking) If you have an issue, bug, or feature request, please submit it on GitHub
## Maintenance
#### [Service Upgrading](./machine-maintenance.md#service-maintain.md)
diff --git a/docs/pai-management/doc/example/add-service/bootstrap/hbase/refresh.sh.template b/docs/pai-management/doc/example/add-service/bootstrap/hbase/refresh.sh.template
index 4e2991b702..8504e271b5 100644
--- a/docs/pai-management/doc/example/add-service/bootstrap/hbase/refresh.sh.template
+++ b/docs/pai-management/doc/example/add-service/bootstrap/hbase/refresh.sh.template
@@ -22,8 +22,6 @@ pushd $(dirname "$0") > /dev/null
echo "refresh hbase-configuration"
kubectl create configmap hbase-configuration --from-file=hbase-configuration/ --dry-run -o yaml | kubectl apply -f -
-echo "relabel the node label"
-sh node-label.sh
{% for host in machinelist %}
diff --git a/docs/pai-management/doc/example/add-service/bootstrap/hbase/service.yaml b/docs/pai-management/doc/example/add-service/bootstrap/hbase/service.yaml
index 153835b307..7acd25ad5c 100644
--- a/docs/pai-management/doc/example/add-service/bootstrap/hbase/service.yaml
+++ b/docs/pai-management/doc/example/add-service/bootstrap/hbase/service.yaml
@@ -24,7 +24,6 @@ prerequisite:
# paictl will generate the template file with the name "filename".template with jinja2.
template-list:
- - node-label.sh
- hbase-master.yaml
- hbase-regionserver.yaml
- delete.yaml
diff --git a/docs/pai-management/doc/example/add-service/bootstrap/hbase/start.sh b/docs/pai-management/doc/example/add-service/bootstrap/hbase/start.sh
index 34c925136e..f64f4c7c56 100644
--- a/docs/pai-management/doc/example/add-service/bootstrap/hbase/start.sh
+++ b/docs/pai-management/doc/example/add-service/bootstrap/hbase/start.sh
@@ -25,10 +25,9 @@
pushd $(dirname "$0") > /dev/null
-# Step1: Lable the node.
-#chmod u+x node-label.sh
-
-/bin/bash node-label.sh
+# Step1: Choose the pai roles where you want to deploy your service, in your service.yaml file, such as:
+# deploy-rules:
+# in: pai-master
# Step2: Create the configmap.
#chmod u+x configmap-create.sh
diff --git a/docs/pai-management/doc/how-to-setup-dev-box.md b/docs/pai-management/doc/how-to-setup-dev-box.md
index e151cd5878..d46df2b490 100644
--- a/docs/pai-management/doc/how-to-setup-dev-box.md
+++ b/docs/pai-management/doc/how-to-setup-dev-box.md
@@ -17,7 +17,6 @@ sudo docker pull docker.io/openpai/dev-box
# By now, you can leave it as it is, we only mount those two directories into docker container for later usage.
sudo docker run -itd \
-e COLUMNS=$COLUMNS -e LINES=$LINES -e TERM=$TERM \
- -v /var/lib/docker:/var/lib/docker \
-v /var/run/docker.sock:/var/run/docker.sock \
-v /pathHadoop:/pathHadoop \
-v /pathConfiguration:/cluster-configuration \
@@ -62,7 +61,6 @@ sudo docker build -t dev-box .
# Run your dev-box
sudo docker run -itd \
-e COLUMNS=$COLUMNS -e LINES=$LINES -e TERM=$TERM \
- -v /var/lib/docker:/var/lib/docker \
-v /var/run/docker.sock:/var/run/docker.sock \
-v /pathHadoop:/pathHadoop \
-v /pathConfiguration:/cluster-configuration \
diff --git a/docs/pai-management/doc/how-to-write-pai-configuration.md b/docs/pai-management/doc/how-to-write-pai-configuration.md
index c1f2df1820..de8e494c7b 100644
--- a/docs/pai-management/doc/how-to-write-pai-configuration.md
+++ b/docs/pai-management/doc/how-to-write-pai-configuration.md
@@ -399,7 +399,6 @@ hadoop:
# custom_hadoop_binary_path specifies the path PAI stores the custom built hadoop-ai
# Notice: the name should be hadoop-{hadoop-version}.tar.gz
custom-hadoop-binary-path: /pathHadoop/hadoop-2.9.0.tar.gz
- hadoop-version: 2.9.0
virtualClusters:
default:
description: default queue for all users.
@@ -417,8 +416,12 @@ hadoop:
| Configuration Property | File | Meaning |
| --- | --- | --- |
| ```custom-hadoop-binary-path```|services-configuration.yaml| Please set a path here for paictl to build [hadoop-ai](../../../src/hadoop-ai).|
-| ```hadoop-version```|services-configuration.yaml| Please set this to ```2.9.0```.|
| ```virtualClusters```|services-configuration.yaml| Hadoop queue setting. Each VC will be assigned with (capacity / total_capacity * 100%) of resources. paictl will create the 'default' VC with 0 capacity, if it is not been specified. paictl will split resources to each VC evenly if the total capacity is 0. The capacity of each VC will be set to 0 if it is a negative number.|
### how to check
diff --git a/docs/pai-management/doc/how-to-write-pai-service-configuration.md b/docs/pai-management/doc/how-to-write-pai-service-configuration.md
index 7c19d08e26..7328264a56 100644
--- a/docs/pai-management/doc/how-to-write-pai-service-configuration.md
+++ b/docs/pai-management/doc/how-to-write-pai-service-configuration.md
@@ -16,7 +16,8 @@ OpenPAI consists of multiple services, user could customize each service.
- [YARN / HDFS](#hadoop)
- [Zookeeper](#zookeeper)
- Monitor
- - [Prometheus / Exporter](#prometheus)
+ - [Prometheus / Exporter](#prometheus)
+ - [Alert Manager](#alertmanager)
- [Grafana](#grafana)
## Configure Kubernetes
@@ -62,11 +63,17 @@ User could customize [Zookeeper](https://zookeeper.apache.org/) startup configur
## Configure Prometheus / Exporter
-User could customize [Prometheus](https://prometheus.io/docs/prometheus/latest/configuration/configuration/) at OpenPAI's [folder / file](../bootstrap/prometheus/prometheus-configmap.yaml.template)
+User could customize [Prometheus](https://prometheus.io/docs/prometheus/latest/configuration/configuration/) at OpenPAI's [folder / file](../bootstrap/prometheus/prometheus-configmap.yaml.template)
User could customize [Prometheus](https://prometheus.io/docs/prometheus/latest/configuration/configuration/) startup configuration at OpenPAI's [folder / file](../bootstrap/prometheus/prometheus-deployment.yaml.template)
-User could customize [Node-exporter](https://github.com/prometheus/node_exporter) startup configuration at OpenPAI's [folder / file](../bootstrap/prometheus/node-exporter-ds.yaml.template)
+User could customize [Node-exporter](https://github.com/prometheus/node_exporter) startup configuration at OpenPAI's [folder / file](../bootstrap/node-exporter/node-exporter.yaml.template)
+
+## Configure Alert Manager
+
+User could customize [Alert Manager](https://prometheus.io/docs/alerting/alertmanager/) at OpenPAI's [folder / file](../bootstrap/alert-manager/alert-configmap.yaml.template). Please refer to [doc](../../prometheus/doc/alert-manager.md#configuration) for more info.
+
+User could customize [Alerting rules](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/) at OpenPAI's [folder / file](../../prometheus/prometheus-alert)
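+
+For orientation, an alerting rule has roughly this shape (an illustrative sketch, not one of OpenPAI's shipped rules; the exact metric names depend on the node-exporter version):
+
+```yaml
+groups:
+- name: node
+  rules:
+  - alert: NodeDiskAlmostFull
+    # fire when less than 10% of the root filesystem has been free for 5 minutes
+    expr: node_filesystem_free{mountpoint="/"} / node_filesystem_size{mountpoint="/"} < 0.1
+    for: 5m
+    annotations:
+      summary: "Disk almost full on {{ $labels.instance }}"
+```
+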
## Configure Grafana
diff --git a/docs/submit_from_webportal.md b/docs/submit_from_webportal.md
new file mode 100644
index 0000000000..e70c1337a7
--- /dev/null
+++ b/docs/submit_from_webportal.md
@@ -0,0 +1,20 @@
+# Submit a job in web portal
+
+1. On the PAI home page, click the top right corner to log in:
+
+![](./images/PAI_submit_online_1.png)
+
+
+2. In the left sidebar, click "Submit Job":
+
+![](./images/PAI_submit_online_2.png)
+
+
+3. You can import your job configuration JSON file, or fill in the job configuration in the webportal's form online. For how to write the configuration, please refer to [write job from scratch](./job_tutorial.md#json-config-file-for-job-submission).
+
+![image.png](./images/PAI_submit_online_3.png)
+
+
+4. Then click the "Submit" button at the bottom. If your job configuration is valid, you will see a success message:
+
+![image.png](./images/PAI_submit_online_4.png)
diff --git a/examples/README.md b/examples/README.md
index eaa32ffadf..7ede41bbe6 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -1,35 +1,120 @@
-# PAI Examples
+# OpenPAI Job Examples
- - [Contributing](#contributing)
- - [List of workload](#list-of-workload)
- - [Off-the-shelf examples](#off-the-shelf-examples)
+## Table of Contents
+- [Quick start: how to write and submit a CIFAR-10 job](#quickstart)
+- [List of off-the-shelf examples](#offtheshelf)
+- [List of customized job template](#customize)
+- [Contributing](#contributing)
+
+## Quick start: how to write and submit a CIFAR-10 job
+#### (1) Prepare a job json file
-## Contributing
+In this section, we will use a CIFAR-10 training job as an example to explain how to write and submit a job in OpenPAI.
-If you want to contribute to run new workload on PAI or add more PAI examples, please open a new pull request.
+[CIFAR-10](http://www.cs.toronto.edu/~kriz/cifar.html) is an established computer-vision dataset used for image classification.
+- A full example of TensorFlow CIFAR-10 image classification training on OpenPAI:
-## List of workload
+```js
+{
+ // Name for the job, need to be unique
+ "jobName": "tensorflow-cifar10",
+ // URL pointing to the Docker image for all tasks in the job
+ "image": "openpai/pai.example.tensorflow",
+ // Data directory existing on HDFS
+ "dataDir": "/tmp/data",
+ // Output directory on HDFS,
+ "outputDir": "/tmp/output",
+ // List of taskRole, one task role at least
+ "taskRoles": [
+ {
+ // Name for the task role
+ "name": "cifar_train",
+ // Number of tasks for the task role, no less than 1
+ "taskNumber": 1,
+ // CPU number for one task in the task role, no less than 1
+ "cpuNumber": 8,
+ // Memory for one task in the task role, no less than 100
+ "memoryMB": 32768,
+ // GPU number for one task in the task role, no less than 0
+ "gpuNumber": 1,
+ // Executable command for tasks in the task role, can not be empty
+ "command": "git clone https://github.com/tensorflow/models && cd models/research/slim && python download_and_convert_data.py --dataset_name=cifar10 --dataset_dir=$PAI_DATA_DIR && python train_image_classifier.py --batch_size=64 --model_name=inception_v3 --dataset_name=cifar10 --dataset_split_name=train --dataset_dir=$PAI_DATA_DIR --train_dir=$PAI_OUTPUT_DIR"
+ }
+ ]
+}
+```
-* [Open MPI](./mpi/README.md)
-* [Model Serving](./serving/README.md)
-* [CNTK](./cntk/README.md)
-* [TensorFlow](./tensorflow/README.md)
-* [PyTorch](./pytorch/README.md)
-* [MXNet](./mxnet/README.md)
-* [Keras](./keras/README.md)
-* [scikit-learn](./scikit-learn/README.md)
-* [Jupyter](./jupyter/README.md)
+- Save the content to a file named cifar10.json
+- [Job configuration items introduction](../docs/job_tutorial.md#json-config-file-for-job-submission)
-## Off-the-shelf examples
+#### (2) Submit job json file from OpenPAI webportal
+
+Users can refer to this tutorial [submit a job in web portal](https://github.com/Microsoft/pai/blob/master/docs/submit_from_webportal.md) for job submission from OpenPAI webportal.
+
+## List of off-the-shelf examples
Examples which can be run by submitting the json directly without any modification.
-* [serving.tensorflow.json](./serving/serving.tensorflow.json): TensorFlow model serving.
* [tensorflow.cifar10.json](./tensorflow/tensorflow.cifar10.json): Single GPU training on CIFAR-10 using TensorFlow.
* [pytorch.mnist.json](./pytorch/pytorch.mnist.json): Single GPU training on MNIST using PyTorch.
* [pytorch.regression.json](./pytorch/pytorch.regression.json): Regression using PyTorch.
* [mxnet.autoencoder.json](./mxnet/mxnet.autoencoder.json): Autoencoder using MXNet.
-* [mxnet.image-classification.json](./mxnet/mxnet.image-classification.json): Image classification on MNIST using MXNet.
+* [mxnet.image-classification.json](./mxnet/mxnet.image-classification.json): Image classification on MNIST using MXNet.
+* [serving.tensorflow.json](./serving/serving.tensorflow.json): TensorFlow model serving.
+
+## List of customized job template
+
+Users can customize and run these job templates on OpenPAI.
+
+* [TensorFlow](./tensorflow):
+
+ 1. [TensorFlow CIFAR-10 image classification](./tensorflow#tensorflow-cifar-10-image-classification)
+ 2. [TensorFlow ImageNet image classification](./tensorflow#tensorflow-imagenet-image-classification)
+ 3. [Distributed TensorFlow CIFAR-10 image classification](./tensorflow#distributed-tensorflow-cifar-10-image-classification )
+ 4. [TensorFlow Tensorboard](./tensorflow#tensorflow-tensorboard)
+
+* [Keras](./keras):
+  1. [MNIST training job with Keras](./keras/README.md)
+* [Jupyter](./jupyter):
+  1. [MNIST on Jupyter Notebook. You can also treat this job as an example of how to use Jupyter on OpenPAI](./jupyter/README.md)
+* [Model Serving](./serving):
+ 1. [MNIST model serving over Tensorflow](./serving/README.md)
+* [Scikit-Learn](./scikit-learn):
+ 1. [Scikit-Learn MNIST digit recognition](./scikit-learn/#scikit-learn-mnist-digit-recognition-example)
+ 2. [Scikit-Learn text-vectorizers](./scikit-learn/#scikit-learn-text-vectorizers-example)
+* [CNTK](./cntk):
+ 1. [CNTK grapheme-to-phoneme](./cntk/README.md)
+* [PyTorch](./pytorch):
+ 1. [PyTorch MNIST digit recognition](./pytorch/#pytorch-mnist-digit-recognition-examples)
+ 2. [PyTorch regression](./pytorch/#pytorch-regression-examples)
+* [MXNet](./mxnet):
+ 1. [MXNet autoencoder](./mxnet#mxnet-autoencoder-examples)
+ 2. [MXNet image classification](./mxnet#mxnet-image-classification-examples)
+* [Open MPI](./mpi):
+ 1. [Open MPI TensorFlow CIFAR-10](./mpi#open-mpi-tensorflow-cifar-10-example)
+ 2. [Open MPI CNTK grapheme-to-phoneme conversion](./mpi#open-mpi-cntk-grapheme-to-phoneme-conversion-example)
+
+## Contributing
+
+If you want to contribute a job example that can be run on PAI, please open a new pull request.
+
+- Prepare a folder under the pai/examples folder, for example pai/examples/caffe2/
+
+- Prepare example files:
+
+  Under the [Caffe2 example](./caffe2) dir, you should prepare the following files for an example contribution PR:
+
+![PAI_caffe2_dir](./images/PAI_caffe2_dir.png)
+
+1. README.md: the example's introduction
+2. Dockerfile: the example's dependencies
+3. PAI job json file: the example's OpenPAI job json template
+4. [Optional] Code file: the example's code file
+
diff --git a/examples/XGBoost/DOCKER.md b/examples/XGBoost/DOCKER.md
new file mode 100644
index 0000000000..295d2c6845
--- /dev/null
+++ b/examples/XGBoost/DOCKER.md
@@ -0,0 +1,56 @@
+# XGBoost on PAI docker env
+
+## Contents
+
+1. [Basic environment](#basic-environment)
+2. [Advanced environment](#advanced-environment)
+
+## Basic environment
+
+First of all, PAI runs all jobs in Docker containers.
+
+[Install Docker-CE](https://docs.docker.com/install/linux/docker-ce/ubuntu/) if you haven't. Register an account at public Docker registry [Docker Hub](https://hub.docker.com/) if you do not have a private Docker registry.
+
+You can also jump to [XGBoost example](#xgboost-example) using [pre-built images](https://hub.docker.com/r/openpai/pai.example.xgboost/) on Docker Hub.
+
+We need to build an XGBoost image with GPU support to run XGBoost workloads on PAI; this can be done in two steps:
+
+1. Build a base Docker image for PAI. We prepared a [base Dockerfile](../../job-tutorial/Dockerfiles/cuda8.0-cudnn6/Dockerfile.build.base) which can be built directly.
+
+ ```bash
+ $ cd ../job-tutorial/Dockerfiles/cuda8.0-cudnn6
+ $ sudo docker build -f Dockerfile.build.base \
+ > -t pai.build.base:hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04 .
+ $ cd -
+ ```
+
+2. Prepare the XGBoost environment in a [Dockerfile](./Dockerfile.example.xgboost) using the base image.
+
+   Write an XGBoost Dockerfile and save it to `Dockerfile.example.xgboost`:
+
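+   A minimal sketch of what it could contain (illustrative; a GPU-enabled XGBoost build may need to be compiled from source rather than installed from the plain pip wheel):
+
+   ```dockerfile
+   FROM pai.build.base:hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04
+
+   # install XGBoost using pip (CPU wheel; build from source for GPU support)
+   RUN pip install xgboost
+   ```
+
+   Build the Docker image from `Dockerfile.example.xgboost`:
+
+   ```bash
+   $ sudo docker build -f Dockerfile.example.xgboost -t pai.example.xgboost .
+   ```
+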
+ Push the Docker image to a Docker registry:
+
+ ```bash
+ $ sudo docker tag pai.example.xgboost USER/pai.example.xgboost
+ $ sudo docker push USER/pai.example.xgboost
+ ```
+   *Note: Replace USER with the Docker Hub username you registered; you will be required to log in before pushing the Docker image.*
+
+
+## Advanced environment
+
+You can skip this section if you do not need to prepare other dependencies.
+
+You can customize the runtime XGBoost environment in [Dockerfile.example.xgboost](./Dockerfile.example.xgboost), for example, adding other dependencies in the Dockerfile:
+
+```dockerfile
+FROM pai.build.base:hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04
+
+# install other packages using apt-get
+RUN apt-get -y update && apt-get -y install PACKAGE
+
+# install other packages using pip
+RUN pip install PACKAGE
+```
+
diff --git a/examples/XGBoost/README.md b/examples/XGBoost/README.md
index 3fbea096c7..e0e66c2038 100644
--- a/examples/XGBoost/README.md
+++ b/examples/XGBoost/README.md
@@ -23,72 +23,16 @@
This guide introduces how to run [XGBoost](https://xgboost.readthedocs.io/en/latest/) workload on PAI.
The following contents show some basic XGBoost examples, other customized XGBoost code can be run similarly.
-
-## Contents
-
-1. [Basic environment](#basic-environment)
-2. [Advanced environment](#advanced-environment)
-3. [XGBoost example](#xgboost-example)
-
-
-## Basic environment
-
-First of all, PAI runs all jobs in Docker container.
-
-[Install Docker-CE](https://docs.docker.com/install/linux/docker-ce/ubuntu/) if you haven't. Register an account at public Docker registry [Docker Hub](https://hub.docker.com/) if you do not have a private Docker registry.
-
-You can also jump to [XGBoost example](#xgboost-example) using [pre-built images](https://hub.docker.com/r/openpai/pai.example.xgboost/) on Docker Hub.
-
-We need to build a XGBoost image with GPU support to run XGBoost workload on PAI, this can be done in two steps:
-
-1. Build a base Docker image for PAI. We prepared a [base Dockerfile](../../job-tutorial/Dockerfiles/cuda8.0-cudnn6/Dockerfile.build.base) which can be built directly.
-
- ```bash
- $ cd ../job-tutorial/Dockerfiles/cuda8.0-cudnn6
- $ sudo docker build -f Dockerfile.build.base \
- > -t pai.build.base:hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04 .
- $ cd -
- ```
-
-2. Prepare XGBoost envoriment in a [Dockerfile](./Dockerfile.example.xgboost) using the base image.
-
- Write a XGBoost Dockerfile and save it to `Dockerfile.example.xgboost`:
-
- Push the Docker image to a Docker registry:
-
- ```bash
- $ sudo docker tag pai.example.xgboost USER/pai.example.xgboost
- $ sudo docker push USER/pai.example.xgboost
- ```
- *Note: Replace USER with the Docker Hub username you registered, you will be required to login before pushing Docker image.*
-
-
-## Advanced environment
-
-You can skip this section if you do not need to prepare other dependencies.
-
-You can customize runtime XGBoost environment in [Dockerfile.example.xgboost](./Dockerfile.example.xgboost), for example, adding other dependeces in Dockerfile:
-
-```dockerfile
-FROM pai.build.base:hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04
-
-# install other packages using apt-get
-RUN apt-get -y update && apt-get -y install PACKAGE
-
-# install other packages using pip
-RUN pip install PACKAGE
-```
-
-
-# XGBoost example
+## XGBoost covertype dataset classification example
To run XGBoost examples in PAI, you need to prepare a job configuration file and submit it through webportal.
-If you have built your image and pushed it to Docker Hub, replace our pre-built image `openpai/pai.example.xgboost` with your own.
+OpenPAI packages the Docker environment required by the job. You can refer to [DOCKER.md](./DOCKER.md) to customize this example's Docker environment. If you have built a customized image and pushed it to Docker Hub, replace our pre-built image `openpai/pai.example.xgboost` with your own.
Here's one configuration file example to train a model on the [forest cover type](https://archive.ics.uci.edu/ml/datasets/covertype) dataset using GPU acceleration:
### [gpu_hist](https://github.com/dmlc/xgboost/blob/master/demo/gpu_acceleration/cover_type.py)
+
```json
{
"jobName": "xgboost_gpu_hist",
@@ -105,4 +49,5 @@ Here's one configuration file example to train a model on the [forest cover type
]
}
```
+
For more details on how to write a job configuration file, please refer to [job tutorial](../../docs/job_tutorial.md#json-config-file-for-job-submission).
diff --git a/examples/caffe/DOCKER.md b/examples/caffe/DOCKER.md
new file mode 100644
index 0000000000..35dc9a7590
--- /dev/null
+++ b/examples/caffe/DOCKER.md
@@ -0,0 +1,54 @@
+# Caffe on PAI docker env
+
+## Table of Contents
+
+1. [Basic environment](#basic-environment)
+2. [Advanced environment](#advanced-environment)
+
+## Basic environment
+
+First of all, PAI runs all jobs in Docker containers.
+
+[Install Docker-CE](https://docs.docker.com/install/linux/docker-ce/ubuntu/) if you haven't. Register an account at public Docker registry [Docker Hub](https://hub.docker.com/) if you do not have a private Docker registry.
+
+You can also jump to [Caffe example](#caffe-example) using [pre-built images](https://hub.docker.com/r/openpai/pai.example.caffe/) on Docker Hub.
+
+We need to build a Caffe image with GPU support to run Caffe workloads on PAI; this can be done in two steps:
+
+1. Build a base Docker image for PAI. We prepared a [base Dockerfile](../../job-tutorial/Dockerfiles/cuda8.0-cudnn6/Dockerfile.build.base) which can be built directly.
+
+ ```bash
+ $ cd ../job-tutorial/Dockerfiles/cuda8.0-cudnn6
+ $ sudo docker build -f Dockerfile.build.base \
+ > -t pai.build.base:hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04 .
+ $ cd -
+ ```
+
+2. Prepare the Caffe environment in a [Dockerfile](./Dockerfile.example.caffe) using the base image.
+
+   Write a Caffe Dockerfile and save it to `Dockerfile.example.caffe`:
+
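+   (The Dockerfile contents live in the linked file.) Build the Docker image from `Dockerfile.example.caffe`; this step is implied by the tag used below, shown here as a sketch:
+
+   ```bash
+   $ sudo docker build -f Dockerfile.example.caffe -t pai.example.caffe .
+   ```
+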
+ Push the Docker image to a Docker registry:
+
+ ```bash
+ $ sudo docker tag pai.example.caffe USER/pai.example.caffe
+ $ sudo docker push USER/pai.example.caffe
+ ```
+   *Note: Replace USER with the Docker Hub username you registered; you will be required to log in before pushing the Docker image.*
+
+
+## Advanced environment
+
+You can skip this section if you do not need to prepare other dependencies.
+
+You can customize the runtime Caffe environment in [Dockerfile.example.caffe](./Dockerfile.example.caffe), for example, adding other dependencies in the Dockerfile:
+
+```dockerfile
+FROM pai.build.base:hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04
+
+# install other packages using apt-get
+RUN apt-get -y update && apt-get -y install git PACKAGE
+
+# install other packages using pip
+RUN pip install PACKAGE
+```
diff --git a/examples/caffe/README.md b/examples/caffe/README.md
index 776ba72b26..7a21355209 100644
--- a/examples/caffe/README.md
+++ b/examples/caffe/README.md
@@ -18,77 +18,21 @@
-->
-# Caffe on PAI
+# Caffe on OpenPAI
-This guide introduces how to run [Caffe](http://caffe.berkeleyvision.org/) workload on PAI.
+This guide introduces how to run a [Caffe](http://caffe.berkeleyvision.org/) job on OpenPAI.
The following contents show some basic Caffe examples, other customized Caffe code can be run similarly.
+# Caffe LeNet MNIST digit recognition example
-## Contents
+To run Caffe examples in OpenPAI, you need to prepare a job configuration file and submit it through webportal.
-1. [Basic environment](#basic-environment)
-2. [Advanced environment](#advanced-environment)
-3. [Caffe example](#caffe-example)
-
-
-## Basic environment
-
-First of all, PAI runs all jobs in Docker container.
-
-[Install Docker-CE](https://docs.docker.com/install/linux/docker-ce/ubuntu/) if you haven't. Register an account at public Docker registry [Docker Hub](https://hub.docker.com/) if you do not have a private Docker registry.
-
-You can also jump to [Caffe example](#caffe-example) using [pre-built images](https://hub.docker.com/r/openpai/pai.example.caffe/) on Docker Hub.
-
-We need to build a Caffe image with GPU support to run Caffe workload on PAI, this can be done in two steps:
-
-1. Build a base Docker image for PAI. We prepared a [base Dockerfile](../../job-tutorial/Dockerfiles/cuda8.0-cudnn6/Dockerfile.build.base) which can be built directly.
-
- ```bash
- $ cd ../job-tutorial/Dockerfiles/cuda8.0-cudnn6
- $ sudo docker build -f Dockerfile.build.base \
- > -t pai.build.base:hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04 .
- $ cd -
- ```
-
-2. Prepare Caffe envoriment in a [Dockerfile](./Dockerfile.example.caffe) using the base image.
-
- Write a Caffe Dockerfile and save it to `Dockerfile.example.caffe`:
-
- Push the Docker image to a Docker registry:
-
- ```bash
- $ sudo docker tag pai.example.caffe USER/pai.example.caffe
- $ sudo docker push USER/pai.example.caffe
- ```
- *Note: Replace USER with the Docker Hub username you registered, you will be required to login before pushing Docker image.*
-
-
-## Advanced environment
-
-You can skip this section if you do not need to prepare other dependencies.
-
-You can customize runtime PyTorch environment in [Dockerfile.example.caffe](./Dockerfile.example.caffe), for example, adding other dependeces in Dockerfile:
-
-```dockerfile
-FROM pai.build.base:hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04
-
-# install other packages using apt-get
-RUN apt-get -y update && apt-get -y install git PACKAGE
-
-# install other packages using pip
-RUN pip install torch torchvision PACKAGE
-```
-
-
-# Caffe example
-
-To run Caffe examples in PAI, you need to prepare a job configuration file and submit it through webportal.
-
-If you have built your image and pushed it to Docker Hub, replace our pre-built image `openpai/pai.example.caffe` with your own.
+OpenPAI packages the Docker environment required by the job. You can refer to [DOCKER.md](./DOCKER.md) to customize this example's Docker environment. If you have built a customized image and pushed it to Docker Hub, replace our pre-built image `openpai/pai.example.caffe` with your own.
Here's one configuration file example:
### [mnist](http://caffe.berkeleyvision.org/gathered/examples/mnist.html)
+
```json
{
"jobName": "caffe-mnist",
@@ -106,3 +50,4 @@ Here's one configuration file example:
}
```
For more details on how to write a job configuration file, please refer to [job tutorial](../../docs/job_tutorial.md#json-config-file-for-job-submission).
+
diff --git a/examples/caffe2/DOCKER.md b/examples/caffe2/DOCKER.md
new file mode 100644
index 0000000000..eb1d2c8894
--- /dev/null
+++ b/examples/caffe2/DOCKER.md
@@ -0,0 +1,60 @@
+# Caffe2 on PAI docker env
+
+## Contents
+
+1. [Basic environment](#basic-environment)
+2. [Advanced environment](#advanced-environment)
+
+## Basic environment
+
+First of all, PAI runs all jobs in Docker containers.
+
+[Install Docker-CE](https://docs.docker.com/install/linux/docker-ce/ubuntu/) if you haven't. Register an account at public Docker registry [Docker Hub](https://hub.docker.com/) if you do not have a private Docker registry.
+
+You can also jump to [Caffe2 example](#caffe2-example) using [pre-built images](https://hub.docker.com/r/openpai/pai.example.caffe2/) on Docker Hub.
+
+We need to build a Caffe2 image with GPU support to run Caffe2 workloads on PAI; this can be done in two steps:
+
+1. Build a base Docker image for PAI. We prepared a [base Dockerfile](../../job-tutorial/Dockerfiles/cuda8.0-cudnn6/Dockerfile.build.base) which can be built directly.
+
+ ```bash
+ $ cd ../job-tutorial/Dockerfiles/cuda8.0-cudnn6
+ $ sudo docker build -f Dockerfile.build.base \
+ > -t pai.build.base:hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04 .
+ $ cd -
+ ```
+
+2. Prepare the Caffe2 environment in a [Dockerfile](./Dockerfile.example.caffe2) using the base image.
+
+ Write a Caffe2 Dockerfile and save it to `Dockerfile.example.caffe2`:
+
+ Build the Docker image from `Dockerfile.example.caffe2`:
+
+ ```bash
+ $ sudo docker build -f Dockerfile.example.caffe2 -t pai.example.caffe2 .
+ ```
+
+ Push the Docker image to a Docker registry:
+
+ ```bash
+ $ sudo docker tag pai.example.caffe2 USER/pai.example.caffe2
+ $ sudo docker push USER/pai.example.caffe2
+ ```
+   *Note: Replace USER with the Docker Hub username you registered; you will be required to log in before pushing the Docker image.*
+
+
+## Advanced environment
+
+You can skip this section if you do not need to prepare other dependencies.
+
+You can customize the runtime Caffe2 environment in [Dockerfile.example.caffe2](./Dockerfile.example.caffe2), for example, adding other dependencies in the Dockerfile:
+
+```dockerfile
+FROM pai.build.base:hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04
+
+# install other packages using apt-get
+RUN apt-get -y update && apt-get -y install PACKAGE
+
+# install other packages using pip
+RUN pip install PACKAGE
+```
\ No newline at end of file
diff --git a/examples/caffe2/README.md b/examples/caffe2/README.md
index 0ea8d00171..38587add32 100644
--- a/examples/caffe2/README.md
+++ b/examples/caffe2/README.md
@@ -18,79 +18,16 @@
-->
-# Apache Caffe2 on PAI
+# Caffe2 on OpenPAI
-This guide introduces how to run [Caffe2](https://caffe2.ai/) workload on PAI.
+This guide introduces how to run a [Caffe2](https://caffe2.ai/) job on OpenPAI.
The following contents show some basic Caffe2 examples, other customized Caffe2 code can be run similarly.
+# Caffe2 ResNet-50 ImageNet example
-## Contents
+To run Caffe2 examples in OpenPAI, you need to prepare a job configuration file and submit it through webportal.
-1. [Basic environment](#basic-environment)
-2. [Advanced environment](#advanced-environment)
-3. [Caffe2 example](#caffe2-example)
-
-
-## Basic environment
-
-First of all, PAI runs all jobs in Docker container.
-
-[Install Docker-CE](https://docs.docker.com/install/linux/docker-ce/ubuntu/) if you haven't. Register an account at public Docker registry [Docker Hub](https://hub.docker.com/) if you do not have a private Docker registry.
-
-You can also jump to [Caffe2 example](#caffe2-example) using [pre-built images](https://hub.docker.com/r/openpai/pai.example.caffe2/) on Docker Hub.
-
-We need to build a Caffe2 image with GPU support to run Caffe2 workload on PAI, this can be done in two steps:
-
-1. Build a base Docker image for PAI. We prepared a [base Dockerfile](../../job-tutorial/Dockerfiles/cuda8.0-cudnn6/Dockerfile.build.base) which can be built directly.
-
- ```bash
- $ cd ../job-tutorial/Dockerfiles/cuda8.0-cudnn6
- $ sudo docker build -f Dockerfile.build.base \
- > -t pai.build.base:hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04 .
- $ cd -
- ```
-
-1. Prepare Caffe2 envoriment in a [Dockerfile](./Dockerfile.example.caffe2) using the base image.
-
- Write a Caffe2 Dockerfile and save it to `Dockerfile.example.caffe2`:
-
- Build the Docker image from `Dockerfile.example.caffe2`:
-
- ```bash
- $ sudo docker build -f Dockerfile.example.caffe2 -t pai.example.caffe2 .
- ```
-
- Push the Docker image to a Docker registry:
-
- ```bash
- $ sudo docker tag pai.example.caffe2 USER/pai.example.caffe2
- $ sudo docker push USER/pai.example.caffe2
- ```
- *Note: Replace USER with the Docker Hub username you registered, you will be required to login before pushing Docker image.*
-
-
-## Advanced environment
-
-You can skip this section if you do not need to prepare other dependencies.
-
-You can customize runtime Caffe2 environment in [Dockerfile.example.caffe2](./Dockerfile.example.caffe2), for example, adding other dependeces in Dockerfile:
-
-```dockerfile
-FROM pai.build.base:hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04
-
-# install other packages using apt-get
-RUN apt-get -y update && apt-get -y install PACKAGE
-
-# install other packages using pip
-RUN pip install PACKAGE
-```
-
-
-# Caffe2 example
-
-To run Caffe2 examples in PAI, you need to prepare a job configuration file and submit it through webportal.
-
-If you have built your image and pushed it to Docker Hub, replace our pre-built image `openpai/pai.example.caffe2` with your own.
+OpenPAI packages the Docker environment required by the job. You can refer to [DOCKER.md](./DOCKER.md) to customize this example's Docker environment. If you have built a customized image and pushed it to Docker Hub, replace our pre-built image `openpai/pai.example.caffe2` with your own.
Here's one configuration file example:
diff --git a/examples/chainer/DOCKER.md b/examples/chainer/DOCKER.md
new file mode 100644
index 0000000000..15e084cf10
--- /dev/null
+++ b/examples/chainer/DOCKER.md
@@ -0,0 +1,55 @@
+# Chainer on PAI docker env
+
+## Contents
+
+1. [Basic environment](#basic-environment)
+
+## Basic environment
+
+First of all, PAI runs all jobs in Docker containers.
+
+[Install Docker-CE](https://docs.docker.com/install/linux/docker-ce/ubuntu/) if you haven't. Register an account at public Docker registry [Docker Hub](https://hub.docker.com/) if you do not have a private Docker registry.
+
+We need to build a Chainer image with GPU support to run Chainer workloads on PAI; this can be done in two steps:
+
+1. Build a base Docker image for PAI. We prepared a [base Dockerfile](../../job-tutorial/Dockerfiles/cuda8.0-cudnn6/Dockerfile.build.base) which can be built directly.
+
+ ```bash
+ $ cd ../job-tutorial/Dockerfiles/cuda8.0-cudnn6
+ $ sudo docker build -f Dockerfile.build.base \
+ > -t pai.build.base:hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04 .
+ $ cd -
+ ```
+
+2. Prepare the Chainer environment in a [Dockerfile](./Dockerfile.example.chainer) using the base image.
+
+ Write a Chainer Dockerfile and save it to `Dockerfile.example.chainer`:
+
+ ```dockerfile
+ FROM pai.build.base:hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04
+
+ #set LC_ALL
+ ENV LC_ALL C
+
+ # install git
+ RUN apt-get -y update && apt-get -y install git
+
+ # install Chainer and cupy using pip
+ RUN pip install chainer && pip install cupy-cuda80
+
+ # clone Chainer official code
+ RUN git clone https://github.com/chainer/chainer.git
+
+ ```
+
+ Build the Docker image from `Dockerfile.example.chainer`:
+
+ ```bash
+ $ sudo docker build -f Dockerfile.example.chainer -t USER/pai.example.chainer .
+ ```
+
+ Push the Docker image to a Docker registry:
+
+ ```bash
+ $ sudo docker push USER/pai.example.chainer
+ ```
+   *Note: Replace USER with the Docker Hub username you registered; you will be required to log in before pushing the Docker image.*
diff --git a/examples/chainer/README.md b/examples/chainer/README.md
index 368dc00796..69fa252a0c 100644
--- a/examples/chainer/README.md
+++ b/examples/chainer/README.md
@@ -18,78 +18,26 @@
-->
-# Chainer on PAI
+# Chainer on OpenPAI
-This guide introduces how to run [Chainer](https://chainer.org/) workload on PAI.
+This guide introduces how to run a [Chainer](https://chainer.org/) job on OpenPAI.
The following contents show a basic Chainer example, other customized Chainer code can be run similarly.
-
## Contents
-1. [Basic environment](#basic-environment)
-2. [Chainer examples](#chainer-example)
-
-
-## Basic environment
-
-First of all, PAI runs all jobs in Docker container.
-
-[Install Docker-CE](https://docs.docker.com/install/linux/docker-ce/ubuntu/) if you haven't. Register an account at public Docker registry [Docker Hub](https://hub.docker.com/) if you do not have a private Docker registry.
-
-We need to build a Chainer image with GPU support to run Chainer workload on PAI, this can be done with two steps:
-
-1. Build a base Docker image for PAI. We prepared a [base Dockerfile](../../job-tutorial/Dockerfiles/cuda8.0-cudnn6/Dockerfile.build.base) which can be built directly.
-
- ```bash
- $ cd ../job-tutorial/Dockerfiles/cuda8.0-cudnn6
- $ sudo docker build -f Dockerfile.build.base \
- > -t pai.build.base:hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04 .
- $ cd -
- ```
-
-2. Prepare Chainer envoriment in a [Dockerfile](./Dockerfile.example.chainer) using the base image.
-
- Write a Chainer Dockerfile and save it to `Dockerfile.example.chainer`:
-
- ```dockerfile
- FROM pai.build.base:hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04
+1. [Chainer CIFAR-100 image classification example](#chainer-cifar-100-image-classification-example)
+2. [Customize Docker env](./DOCKER.md)
- #set LC_ALL
- ENV LC_ALL C
+## Chainer CIFAR-100 image classification example
- # install git
- RUN apt-get -y update && apt-get -y install git
+To run Chainer examples in OpenPAI, you need to prepare a job configuration file and submit it through webportal.
- # install Chainer and cupy using pip
- RUN pip install chainer && pip install cupy-cuda80
-
- # clone Chainer official code
- RUN git clone https://github.com/chainer/chainer.git
-
- ```
-
- Build the Docker image from `Dockerfile.example.chainer`:
-
- ```bash
- $ sudo docker build -f Dockerfile.example.chainer -t USER/pai.example.chainer .
- ```
-
- Push the Docker image to a Docker registry:
-
- ```bash
- $ sudo docker push USER/pai.example.chainer
- ```
- *Note: Replace USER with the Docker Hub username you registered, you will be required to login before pushing Docker image.*
-
-## Chainer examples
-
-To run Chainer examples in PAI, you need to prepare a job configuration file and submit it through webportal.
-
-If you have built your image and pushed it to Docker Hub, replace our pre-built image `openpai/pai.example.chainer` with your own.
+OpenPAI packages the Docker environment required by the job. You can refer to [DOCKER.md](./DOCKER.md) to customize this example's Docker environment. If you have built a customized image and pushed it to Docker Hub, replace our pre-built image `openpai/pai.example.chainer` with your own.
Here're some configuration file examples:
### [cifar](https://github.com/chainer/chainer/tree/master/examples/cifar)
+
```json
"jobName": "chainer-cifar",
"image": "openpai/pai.example.chainer",
diff --git a/examples/cntk/DOCKER.md b/examples/cntk/DOCKER.md
new file mode 100644
index 0000000000..dffb9cc347
--- /dev/null
+++ b/examples/cntk/DOCKER.md
@@ -0,0 +1,120 @@
+
+# CNTK on PAI docker env
+
+## Contents
+
+1. [Basic environment](#basic-environment)
+2. [Advanced environment](#advanced-environment)
+
+## Basic environment
+
+First of all, PAI runs all jobs in Docker containers.
+
+[Install Docker-CE](https://docs.docker.com/install/linux/docker-ce/ubuntu/) if you haven't. Register an account at public Docker registry [Docker Hub](https://hub.docker.com/) if you do not have a private Docker registry.
+
+You can also jump to [CNTK examples](#cntk-examples) using [pre-built images](https://hub.docker.com/r/openpai/pai.example.cntk/) on Docker Hub.
+
+We need to build a CNTK image with GPU support to run CNTK workloads on PAI; this can be done in three steps:
+
+1. Build a base Docker image for PAI. We prepared a [base Dockerfile](../../job-tutorial/Dockerfiles/cuda8.0-cudnn6/Dockerfile.build.base) which can be built directly.
+
+ ```bash
+ $ cd ../job-tutorial/Dockerfiles/cuda8.0-cudnn6
+ $ sudo docker build -f Dockerfile.build.base \
+ > -t pai.build.base:hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04 .
+ $ cd -
+ ```
+
+2. Build an openmpi Docker image. We prepared an [mpi Dockerfile](../../job-tutorial/Dockerfiles/cuda8.0-cudnn6/Dockerfile.build.mpi) which can be built based on the base image.
+
+ ```bash
+ $ cd ../job-tutorial/Dockerfiles/cuda8.0-cudnn6
+ $ sudo docker build -f Dockerfile.build.mpi \
+ > -t pai.build.mpi:openmpi1.10.4-hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04 .
+ $ cd -
+ ```
+
+3. Prepare the CNTK environment in a [Dockerfile](./Dockerfile.example.cntk) using the base image.
+
+ Write a CNTK Dockerfile and save it to `Dockerfile.example.cntk`:
+
+ ```dockerfile
+ FROM pai.build.mpi:openmpi1.10.4-hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04
+
+ ENV CNTK_VERSION=2.0.beta11.0
+
+ RUN apt-get -y update && \
+ apt-get -y install git \
+ fuse \
+ golang \
+ libjasper1 \
+ libjpeg8 \
+ libpng12-0 \
+ libgfortran3 && \
+ apt-get clean && \
+ rm -rf /var/lib/apt/lists/*
+
+ WORKDIR /
+
+ # Install hdfs-mount
+ RUN git clone --recursive https://github.com/Microsoft/hdfs-mount.git && \
+ cd hdfs-mount && \
+ make -j $(nproc) && \
+ make test && \
+ cp hdfs-mount /bin && \
+ cd .. && \
+ rm -rf hdfs-mount
+
+ # Install Anaconda
+ RUN ANACONDA_PREFIX="/root/anaconda3" && \
+ ANACONDA_VERSION="3-4.1.1" && \
+ ANACONDA_SHA256="4f5c95feb0e7efeadd3d348dcef117d7787c799f24b0429e45017008f3534e55" && \
+ wget -q https://repo.continuum.io/archive/Anaconda${ANACONDA_VERSION}-Linux-x86_64.sh && \
+ echo "$ANACONDA_SHA256 Anaconda${ANACONDA_VERSION}-Linux-x86_64.sh" | sha256sum --check --strict - && \
+ chmod a+x Anaconda${ANACONDA_VERSION}-Linux-x86_64.sh && \
+ ./Anaconda${ANACONDA_VERSION}-Linux-x86_64.sh -b -p ${ANACONDA_PREFIX} && \
+ rm -rf Anaconda${ANACONDA_VERSION}-Linux-x86_64.sh && \
+ $ANACONDA_PREFIX/bin/conda clean --all --yes
+
+ ENV PATH=/root/anaconda3/bin:/usr/local/mpi/bin:$PATH \
+ LD_LIBRARY_PATH=/root/anaconda3/lib:/usr/local/mpi/lib:$LD_LIBRARY_PATH
+
+ # Get CNTK Binary Distribution
+ RUN CNTK_VERSION_DASHED=$(echo $CNTK_VERSION | tr . -) && \
+ CNTK_SHA256="2e60909020a0f553431dc7f7818401cc1bb2c99eef307d65bb552c497993593a" && \
+ wget -q https://cntk.ai/BinaryDrop/CNTK-${CNTK_VERSION_DASHED}-Linux-64bit-GPU.tar.gz && \
+ echo "$CNTK_SHA256 CNTK-${CNTK_VERSION_DASHED}-Linux-64bit-GPU.tar.gz" | sha256sum --check --strict - && \
+ tar -xzf CNTK-${CNTK_VERSION_DASHED}-Linux-64bit-GPU.tar.gz && \
+ rm -f CNTK-${CNTK_VERSION_DASHED}-Linux-64bit-GPU.tar.gz && \
+ wget -q https://raw.githubusercontent.com/Microsoft/CNTK-docker/master/ubuntu-14.04/version_2/${CNTK_VERSION}/gpu/runtime/install-cntk-docker.sh \
+ -O /cntk/Scripts/install/linux/install-cntk-docker.sh && \
+ /bin/bash /cntk/Scripts/install/linux/install-cntk-docker.sh && \
+ /root/anaconda3/bin/conda clean --all --yes && \
+ rm -rf /cntk/cntk/python
+
+ ENV PATH=/cntk/cntk/bin:$PATH \
+ LD_LIBRARY_PATH=/cntk/cntk/lib:/cntk/cntk/dependencies/lib:$LD_LIBRARY_PATH
+
+ WORKDIR /root
+ ```
+
+ Build the Docker image from `Dockerfile.example.cntk`:
+
+ ```bash
+ $ sudo docker build -f Dockerfile.example.cntk -t pai.example.cntk .
+ ```
+
+ Push the Docker image to a Docker registry:
+
+ ```bash
+ $ sudo docker tag pai.example.cntk USER/pai.example.cntk
+ $ sudo docker push USER/pai.example.cntk
+ ```
+   *Note: Replace USER with the Docker Hub username you registered; you will be required to log in before pushing the Docker image.*
+
+
+## Advanced environment
+
+You can skip this section if you do not need to prepare other dependencies.
+
+You can customize the runtime CNTK environment in [Dockerfile.example.cntk](./Dockerfile.example.cntk), for example, adding other dependencies in the Dockerfile.
diff --git a/examples/cntk/README.md b/examples/cntk/README.md
index cef9c2854f..fab3772f58 100644
--- a/examples/cntk/README.md
+++ b/examples/cntk/README.md
@@ -18,138 +18,16 @@
-->
-# CNTK on PAI
+# CNTK on OpenPAI
-This guide introduces how to run [CNTK](https://docs.microsoft.com/en-us/cognitive-toolkit/) workload on PAI.
+This guide introduces how to run a [CNTK](https://docs.microsoft.com/en-us/cognitive-toolkit/) job on OpenPAI.
The following contents show some basic CNTK examples, other customized CNTK code can be run similarly.
+# CNTK grapheme-to-phoneme examples
-## Contents
+To run CNTK examples in OpenPAI, you need to prepare a job configuration file and submit it through webportal.
-1. [Basic environment](#basic-environment)
-2. [Advanced environment](#advanced-environment)
-3. [CNTK examples](#cntk-examples)
-
-
-## Basic environment
-
-First of all, PAI runs all jobs in Docker container.
-
-[Install Docker-CE](https://docs.docker.com/install/linux/docker-ce/ubuntu/) if you haven't. Register an account at public Docker registry [Docker Hub](https://hub.docker.com/) if you do not have a private Docker registry.
-
-You can also jump to [CNTK examples](#cntk-examples) using [pre-built images](https://hub.docker.com/r/openpai/pai.example.cntk/) on Docker Hub.
-
-We need to build a CNTK image with GPU support to run CNTK workload on PAI, this can be done in three steps:
-
-1. Build a base Docker image for PAI. We prepared a [base Dockerfile](../../job-tutorial/Dockerfiles/cuda8.0-cudnn6/Dockerfile.build.base) which can be built directly.
-
- ```bash
- $ cd ../job-tutorial/Dockerfiles/cuda8.0-cudnn6
- $ sudo docker build -f Dockerfile.build.base \
- > -t pai.build.base:hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04 .
- $ cd -
- ```
-
-2. Build a openmpi Docker image. We prepared a [mpi Dockerfile](../../job-tutorial/Dockerfiles/cuda8.0-cudnn6/Dockerfile.build.mpi) which can be built based on the base image.
-
- ```bash
- $ cd ../job-tutorial/Dockerfiles/cuda8.0-cudnn6
- $ sudo docker build -f Dockerfile.build.mpi \
- > -t pai.build.mpi:openmpi1.10.4-hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04 .
- $ cd -
- ```
-
-3. Prepare CNTK envoriment in a [Dockerfile](./Dockerfile.example.cntk) using the base image.
-
- Write a CNTK Dockerfile and save it to `Dockerfile.example.cntk`:
-
- ```dockerfile
- FROM pai.build.mpi:openmpi1.10.4-hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04
-
- ENV CNTK_VERSION=2.0.beta11.0
-
- RUN apt-get -y update && \
- apt-get -y install git \
- fuse \
- golang \
- libjasper1 \
- libjpeg8 \
- libpng12-0 \
- libgfortran3 && \
- apt-get clean && \
- rm -rf /var/lib/apt/lists/*
-
- WORKDIR /
-
- # Install hdfs-mount
- RUN git clone --recursive https://github.com/Microsoft/hdfs-mount.git && \
- cd hdfs-mount && \
- make -j $(nproc) && \
- make test && \
- cp hdfs-mount /bin && \
- cd .. && \
- rm -rf hdfs-mount
-
- # Install Anaconda
- RUN ANACONDA_PREFIX="/root/anaconda3" && \
- ANACONDA_VERSION="3-4.1.1" && \
- ANACONDA_SHA256="4f5c95feb0e7efeadd3d348dcef117d7787c799f24b0429e45017008f3534e55" && \
- wget -q https://repo.continuum.io/archive/Anaconda${ANACONDA_VERSION}-Linux-x86_64.sh && \
- echo "$ANACONDA_SHA256 Anaconda${ANACONDA_VERSION}-Linux-x86_64.sh" | sha256sum --check --strict - && \
- chmod a+x Anaconda${ANACONDA_VERSION}-Linux-x86_64.sh && \
- ./Anaconda${ANACONDA_VERSION}-Linux-x86_64.sh -b -p ${ANACONDA_PREFIX} && \
- rm -rf Anaconda${ANACONDA_VERSION}-Linux-x86_64.sh && \
- $ANACONDA_PREFIX/bin/conda clean --all --yes
-
- ENV PATH=/root/anaconda3/bin:/usr/local/mpi/bin:$PATH \
- LD_LIBRARY_PATH=/root/anaconda3/lib:/usr/local/mpi/lib:$LD_LIBRARY_PATH
-
- # Get CNTK Binary Distribution
- RUN CNTK_VERSION_DASHED=$(echo $CNTK_VERSION | tr . -) && \
- CNTK_SHA256="2e60909020a0f553431dc7f7818401cc1bb2c99eef307d65bb552c497993593a" && \
- wget -q https://cntk.ai/BinaryDrop/CNTK-${CNTK_VERSION_DASHED}-Linux-64bit-GPU.tar.gz && \
- echo "$CNTK_SHA256 CNTK-${CNTK_VERSION_DASHED}-Linux-64bit-GPU.tar.gz" | sha256sum --check --strict - && \
- tar -xzf CNTK-${CNTK_VERSION_DASHED}-Linux-64bit-GPU.tar.gz && \
- rm -f CNTK-${CNTK_VERSION_DASHED}-Linux-64bit-GPU.tar.gz && \
- wget -q https://raw.githubusercontent.com/Microsoft/CNTK-docker/master/ubuntu-14.04/version_2/${CNTK_VERSION}/gpu/runtime/install-cntk-docker.sh \
- -O /cntk/Scripts/install/linux/install-cntk-docker.sh && \
- /bin/bash /cntk/Scripts/install/linux/install-cntk-docker.sh && \
- /root/anaconda3/bin/conda clean --all --yes && \
- rm -rf /cntk/cntk/python
-
- ENV PATH=/cntk/cntk/bin:$PATH \
- LD_LIBRARY_PATH=/cntk/cntk/lib:/cntk/cntk/dependencies/lib:$LD_LIBRARY_PATH
-
- WORKDIR /root
- ```
-
- Build the Docker image from `Dockerfile.example.cntk`:
-
- ```bash
- $ sudo docker build -f Dockerfile.example.cntk -t pai.example.cntk .
- ```
-
- Push the Docker image to a Docker registry:
-
- ```bash
- $ sudo docker tag pai.example.cntk USER/pai.example.cntk
- $ sudo docker push USER/pai.example.cntk
- ```
- *Note: Replace USER with the Docker Hub username you registered, you will be required to login before pushing Docker image.*
-
-
-## Advanced environment
-
-You can skip this section if you do not need to prepare other dependencies.
-
-You can customize runtime CNTK environment in [Dockerfile.example.cntk](./Dockerfile.example.cntk), for example, adding other dependeces in Dockerfile.
-
-
-# CNTK examples
-
-To run CNTK examples in PAI, you need to prepare a job configuration file and submit it through webportal.
-
-If you have built your image and pushed it to Docker Hub, replace our pre-built image `openpai/pai.example.cntk` with your own.
+OpenPAI packages the Docker environment required by the job. You can refer to [DOCKER.md](./DOCKER.md) to customize this example's Docker environment. If you have built a customized image and pushed it to Docker Hub, replace our pre-built image `openpai/pai.example.cntk` with your own.
Here're some configuration file examples:
diff --git a/examples/images/PAI_caffe2_dir.png b/examples/images/PAI_caffe2_dir.png
new file mode 100644
index 0000000000..54081966d9
Binary files /dev/null and b/examples/images/PAI_caffe2_dir.png differ
diff --git a/examples/jupyter/DOCKER.md b/examples/jupyter/DOCKER.md
new file mode 100644
index 0000000000..bb7544a925
--- /dev/null
+++ b/examples/jupyter/DOCKER.md
@@ -0,0 +1,45 @@
+# Jupyter on PAI docker env
+
+## Contents
+
+1. [Basic environment](#basic-environment)
+2. [Advanced environment](#advanced-environment)
+
+## Basic environment
+
+First of all, PAI runs all jobs in Docker containers.
+
+[Install Docker-CE](https://docs.docker.com/install/linux/docker-ce/ubuntu/) if you haven't. Register an account at public Docker registry [Docker Hub](https://hub.docker.com/) if you do not have a private Docker registry.
+
+We need a Docker image to run Jupyter on PAI; this can be done with the following instructions:
+
+1. Build a base Docker image to run jobs on PAI. We prepared a [base Dockerfile](../../job-tutorial/Dockerfiles/cuda8.0-cudnn6/Dockerfile.build.base) which can be built directly.
+
+ ```bash
+ $ cd ../job-tutorial/Dockerfiles/cuda8.0-cudnn6
+ $ sudo docker build -f Dockerfile.build.base \
+ > -t pai.example.jupyter .
+ $ cd -
+ ```
+2. Push the Docker image to a Docker registry:
+
+ ```bash
+ $ sudo docker tag pai.example.jupyter USER/pai.example.jupyter
+ $ sudo docker push USER/pai.example.jupyter
+ ```
+   *Note: Replace USER with the Docker Hub username you registered; you will be required to log in before pushing the Docker image.*
+
+## Advanced environment
+
+You can skip this section if you do not need to prepare other dependencies.
+
+You can customize the runtime Jupyter Notebook environment in your own Docker image based on our base image, for example by adding other dependencies in the Dockerfile:
+
+```dockerfile
+FROM openpai/pai.example.jupyter
+
+# install other packages using apt-get
+RUN apt-get -y update && apt-get -y install git PACKAGE
+
+# install other packages using pip
+RUN pip install PACKAGE
+```
\ No newline at end of file
diff --git a/examples/jupyter/README.md b/examples/jupyter/README.md
index d803b724dd..83f5ee573d 100644
--- a/examples/jupyter/README.md
+++ b/examples/jupyter/README.md
@@ -18,65 +18,20 @@
-->
-# Jupyter on PAI
+# Jupyter on OpenPAI
-This guide introduces how to run [Jupyter Notebook](http://jupyter.org/) on PAI.
+This guide introduces how to run [Jupyter Notebook](http://jupyter.org/) on OpenPAI.
The following contents show some basic examples; other customized examples can be run similarly.
-## Contents
-
-1. [Basic environment](#basic-environment)
-2. [Advanced environment](#advanced-environment)
-3. [Jupyter Notebook example](#jupyter-notebook-example)
-
-
-## Basic environment
-
-First of all, PAI runs all jobs in Docker container.
-
-[Install Docker-CE](https://docs.docker.com/install/linux/docker-ce/ubuntu/) if you haven't. Register an account at public Docker registry [Docker Hub](https://hub.docker.com/) if you do not have a private Docker registry.
-
-We need a docker image to run it on PAI, this can be done under following instructions:
-
-1. Build a base Docker image to run jobs on PAI. We prepared a [base Dockerfile](../../job-tutorial/Dockerfiles/cuda8.0-cudnn6/Dockerfile.build.base) which can be built directly.
-
- ```bash
- $ cd ../job-tutorial/Dockerfiles/cuda8.0-cudnn6
- $ sudo docker build -f Dockerfile.build.base \
- > -t pai.example.jupyter .
- $ cd -
- ```
-2. Push the Docker image to a Docker registry:
-
- ```bash
- $ sudo docker tag pai.example.jupyter USER/pai.example.jupyter
- $ sudo docker push USER/pai.example.jupyter
- ```
- *Note: Replace USER with the Docker Hub username you registered, you will be required to login before pushing Docker image.*
-## Advanced environment
-
-You can skip this section if you do not need to prepare other dependencies.
-
-You can customize runtime Jupyter Notebook environment in your own docker image based on our base image, for example, adding other dependeces in Dockerfile:
-
-```dockerfile
-FROM openpai/pai.example.jupyter
-
-# install other packages using apt-get
-RUN apt-get -y update && apt-get -y install git PACKAGE
-
-# install other packages using pip
-RUN pip install PACKAGE
-```
## Jupyter Notebook example
-To run Jupyter Notebook in PAI, you need to prepare a job configuration file and submit it through webportal.
+To run Jupyter Notebook in OpenPAI, you need to prepare a job configuration file and submit it through webportal.
-Here's one configuration file example to use Jupyter Notebook as a tutorial to run a tensorflow mnist example:
+OpenPAI packages the Docker environment required by the job. Refer to [DOCKER.md](./DOCKER.md) to customize this example's Docker environment. If you have built a customized image and pushed it to Docker Hub, replace our pre-built image `openpai/pai.example.tensorflow` with your own and modify the command if needed.
-Please built your image and pushed it to your Docker registry, replace image `openpai/pai.example.tensorflow` with your own and modify the command if needed.
+Here's a configuration file example that uses Jupyter Notebook as a tutorial to run a TensorFlow MNIST example:
### Job configuration file
diff --git a/examples/kafka/DOCKER.md b/examples/kafka/DOCKER.md
new file mode 100644
index 0000000000..194ff1220b
--- /dev/null
+++ b/examples/kafka/DOCKER.md
@@ -0,0 +1,37 @@
+# Build a customized Kafka Docker image
+
+Please follow these steps in order. Do not skip any step unless you are sure it does not apply to you.
+
+### Zookeeper
+If you already have a Kafka cluster, you can skip this step. If you build the image from "Dockerfile.example.kafka", only step 2 below is needed; ignore the others.
+
+1. [Download the ZooKeeper release](https://zookeeper.apache.org/doc/r3.1.2/zookeeperStarted.html) and unpack it.
+
+2. Rename the file "/conf/zoo_sample.cfg" to "/conf/zoo.cfg" and open it.
+
+3. Set "dataDir" and "clientPort", e.g. "dataDir=/var/zookeeper" and "clientPort=XXXX". Remember the port you set; a sketch follows these steps.
+
+4. Start it with `bin/zkServer.sh start`.
+
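+For reference, a minimal sketch of what "conf/zoo.cfg" could contain, written from the shell (the values are examples only; keep whichever port you chose):
+
+```bash
+# write a minimal zoo.cfg (values are examples, not requirements)
+cat > conf/zoo.cfg <<'EOF'
+tickTime=2000
+dataDir=/var/zookeeper
+clientPort=2181
+EOF
+```
+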
+### Kafka
+If you already have a Kafka cluster, or you build the image from "Dockerfile.example.kafka", you can skip this step.
+
+Otherwise, refer to the [official installation guide](https://www.tutorialspoint.com/apache_kafka/apache_kafka_installation_steps.html).
+
+Note:
+
+1. Set the listeners in config/server.properties: "listeners=PLAINTEXT://localhost:XXXX". The port "XXXX" is the Kafka client port to use in your client code. Prefer a dynamic port over a static one to avoid possible conflicts; the way of using dynamic ports is described at the end of this document. A sketch follows these notes.
+
+2. The "zookeeper.connect" port is the "clientPort" you already set in "/conf/zoo.cfg".
+
+3. If you can't start the Kafka server, it might be because you haven't started the ZooKeeper server or haven't set the correct port.
+
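+As a sketch, these settings could be appended from the shell (ports are placeholders; "zookeeper.connect" must match the "clientPort" set in "/conf/zoo.cfg"):
+
+```bash
+# example edits to config/server.properties (ports are placeholders)
+cat >> config/server.properties <<'EOF'
+listeners=PLAINTEXT://localhost:9092
+zookeeper.connect=localhost:2181
+EOF
+```
+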
+### Write your application logic
+1. If you want to use Python to check that Kafka works correctly, see the [kafka-python document](https://kafka-python.readthedocs.io/en/master/). Don't forget to install kafka-python. (If you build the image from "Dockerfile.example.kafka", it is already included.)
+
+2. When you use the Kafka producer, keep in mind that messages might be lost if you haven't set "acks" and haven't called "producer.flush()".
+
+3. When you use the Kafka consumer, note that you might not be able to consume a message unless you set "auto_offset_reset" to "earliest", even if the message is produced after the consumer starts.
+
+4. Write a shell script to start all the servers and programs, as in the sketch below.
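+
+A minimal sketch tying these notes together, assuming ZooKeeper and Kafka are unpacked under /opt and the placeholder ports from above (adjust paths, ports, and the topic name to your environment):
+
+```bash
+#!/bin/bash
+# start zookeeper and kafka (paths are placeholders)
+/opt/zookeeper/bin/zkServer.sh start
+/opt/kafka/bin/kafka-server-start.sh -daemon /opt/kafka/config/server.properties
+
+# minimal kafka-python check: produce with acks and flush, then consume from earliest
+python - <<'EOF'
+from kafka import KafkaProducer, KafkaConsumer
+
+producer = KafkaProducer(bootstrap_servers='localhost:9092', acks='all')
+producer.send('test', b'hello from PAI')
+producer.flush()  # without flush() the message might be lost
+
+consumer = KafkaConsumer('test',
+                         bootstrap_servers='localhost:9092',
+                         auto_offset_reset='earliest',  # also read messages produced earlier
+                         consumer_timeout_ms=10000)
+for message in consumer:
+    print(message.value)
+    break
+EOF
+```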
diff --git a/examples/kafka/README.md b/examples/kafka/README.md
index 0d7e08f3e6..91514f8366 100644
--- a/examples/kafka/README.md
+++ b/examples/kafka/README.md
@@ -1,44 +1,9 @@
## How to run kafka on PAI
-If you just want to make a very simple example to see whether you can use kafka on PAI, you could submit the job with the kafka.json file in this folder. Then, you can see the stdout in the "Go to Tracking Page" page. You could see that the program has already produced and consumed message with kafka.
-
-And we have prepared a docker image with zookeeper, kafka and kafka-python for you, you can use it by building image with the "Dockerfile.example.kafka" file:`sudo docker build -f Dockerfile.example.kafka -t kafka.test .`. And then "run" and "exec" it!
-
-But, if you want to use kafka by yourself, you should follow the next steps.
-
-Please follow the next processes step by step. Do not skip any steps if you don't know whether you can skip them.
-
-### Zookeeper
-If you already have a kafka cluster, you could skip this step. And if you build the image by "Dockerfile.example.kafka", you should do the work of step 2 and ignore the others.
-
-1. [Download zookeeper source file](https://zookeeper.apache.org/doc/r3.1.2/zookeeperStarted.html) and unpack it.
-
-2. Change the name of file "/conf/zoo_sample.cfg" to "/conf/zoo.cfg" and open it.
-
-3. Set the "dataDir" and "clientPort" like "dataDir=/var/zookeeper clientPort=XXXX". Remember the port you set.
-
-4. Start it `bin/zkServer.sh start`.
-### Kafka
-If you already have a kafka cluster or build the image by "Dockerfile.example.kafka", you could skip this step.
-
-Just take reference in the [official document](https://www.tutorialspoint.com/apache_kafka/apache_kafka_installation_steps.html).
-
-Note:
-
-1. You should set the listeners in config/server.properties:"listeners=PLAINTEXT://localhost:XXXX". The port "XXXX" is the kafka client port you should use in your client code. But you'd better use dynamic port rather than static port due to possible conflict. The way of using dynamic port is described at the end of this document.
-
-2. The "zookeeper.connect" port is the port you have already set in "/conf/zoo.cfg".
-
-3. If you can't start the kafka server, it might because you haven't started the zookeeper server or you haven't set the correct port.
-
-### Make your logical code
-1. If you want to use python to check if the kafka can be used correctly, you'd better see the [official document](https://kafka-python.readthedocs.io/en/master/). Don't forget to install kafka-python.(If you build the image by "Dockerfile.example.kafka", ignore it!)
-
-2. When you use the producer of kafka, you should keep in mind that the massage might be lost if you haven't set "acks" and haven't use "producer.flush()".
+If you just want a very simple example to see whether you can use Kafka on PAI, submit the job with the kafka.json file in this folder. Then check the stdout via the "Go to Tracking Page" link; you will see that the program has produced and consumed messages with Kafka.
-3. When you use the consumer of kafka, you should notice that you might be not able to consumer the message if you don't set "auto_offset_reset" to "earliest" even though you produce the message after you starting the consumer.
+OpenPAI packages the Docker environment required by the job. Refer to [DOCKER.md](./DOCKER.md) to customize this example's Docker environment. If you have built a customized image and pushed it to Docker Hub, replace the image built from `Dockerfile.example.kafka` with your own.
-4. Make a shell file to start all the servers and programs.
### Submit Job
If you can use Kafka correctly in your own environment (computer or server), you can:
diff --git a/examples/keras/DOCKER.md b/examples/keras/DOCKER.md
new file mode 100644
index 0000000000..7b9cdf22ca
--- /dev/null
+++ b/examples/keras/DOCKER.md
@@ -0,0 +1,80 @@
+# Keras on PAI docker env
+
+## Contents
+
+1. [Basic environment](#basic-environment)
+2. [Advanced environment](#advanced-environment)
+
+## Basic environment
+
+First of all, PAI runs all jobs in Docker containers.
+
+[Install Docker-CE](https://docs.docker.com/install/linux/docker-ce/ubuntu/) if you haven't. Register an account at the public Docker registry [Docker Hub](https://hub.docker.com/) if you do not have a private Docker registry.
+
+We need to build a Keras image to run Keras workloads on PAI; this can be done in two steps:
+
+1. Build a base Docker image for PAI Keras. We prepared a [base Dockerfile](../../job-tutorial/Dockerfiles/cuda8.0-cudnn6/Dockerfile.build.base) which can be built directly.
+
+ ```bash
+ $ cd ../job-tutorial/Dockerfiles/cuda8.0-cudnn6
+ $ sudo docker build -f Dockerfile.build.base \
+ > -t pai.build.base:hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04 .
+ $ cd -
+ ```
+
+You can also directly use the [cntk](../cntk/Dockerfile.example.cntk) or [tensorflow](../tensorflow/Dockerfile.example.tensorflow) base image as the Keras backend.
+
+2. Prepare the Keras environment in a [Dockerfile](./Dockerfile.example.keras.tensorflow_backend) using the tensorflow base image as the Keras backend.
+
+   Write a Keras Dockerfile and save it to `Dockerfile.example.keras`:
+
+ ```dockerfile
+ # use tensorflow as Keras backend
+ FROM openpai/pai.example.tensorflow
+
+ # install git
+ RUN apt-get -y update && apt-get -y install git
+
+ # install Keras python package using pip
+ RUN pip install keras
+
+ WORKDIR /root
+
+ # clone Keras examples
+ RUN git clone https://github.com/keras-team/keras.git
+
+ # set tensorflow as keras backend
+ ENV KERAS_BACKEND tensorflow
+
+ # set work directory to keras examples
+ WORKDIR /root/keras/examples
+ ```
+
+   Build the Docker image from `Dockerfile.example.keras`:
+
+ ```bash
+ $ sudo docker build -f Dockerfile.example.keras -t pai.example.keras .
+ ```
+
+ Push the Docker image to a Docker registry:
+
+ ```bash
+ $ sudo docker tag pai.example.keras USER/pai.example.keras
+ $ sudo docker push USER/pai.example.keras
+ ```
+   *Note: Replace USER with the Docker Hub username you registered; you will be required to log in before pushing the Docker image.*
+
+
+## Advanced environment
+
+You can skip this section if you do not need to prepare other dependencies.
+
+You can customize the runtime Keras environment in the Dockerfile, for example by adding other dependencies:
+
+```dockerfile
+# install other packages using apt-get
+RUN apt-get -y update && apt-get -y install git PACKAGE
+
+# install other packages using pip
+RUN pip install PACKAGE
+```
\ No newline at end of file
diff --git a/examples/keras/Dockerfile.example.keras.tensorflow_backend b/examples/keras/Dockerfile.example.keras.tensorflow_backend
index 643382baca..79627855c9 100644
--- a/examples/keras/Dockerfile.example.keras.tensorflow_backend
+++ b/examples/keras/Dockerfile.example.keras.tensorflow_backend
@@ -1,4 +1,4 @@
-FROM openpai/pai.example.tensorflow
+FROM openpai/pai.example.tensorflow:v1.10
# install git
RUN apt-get -y update && apt-get -y install git
@@ -13,4 +13,4 @@ RUN git clone https://github.com/keras-team/keras.git
ENV KERAS_BACKEND tensorflow
-WORKDIR /root/keras/examples
\ No newline at end of file
+WORKDIR /root/keras/examples
diff --git a/examples/keras/README.md b/examples/keras/README.md
index d15c1e52f8..9095f21aa1 100644
--- a/examples/keras/README.md
+++ b/examples/keras/README.md
@@ -18,100 +18,16 @@
-->
-# Keras on PAI
+# Keras on OpenPAI
-This guide introduces how to run [Keras](http://keras.io/) workload on PAI.
+This guide introduces how to run a [Keras](http://keras.io/) job on OpenPAI.
The following contents show some basic Keras examples; other customized Keras code can be run similarly.
+## Keras tensorflow backend MNIST digit recognition examples
-## Contents
+To run Keras examples in OpenPAI, you need to prepare a job configuration file and submit it through webportal.
-1. [Basic environment](#basic-environment)
-2. [Advanced environment](#advanced-environment)
-3. [Keras examples](#keras-examples)
-4. [Frequently asked questions](#faq)
-
-
-## Basic environment
-
-First of all, PAI runs all jobs in Docker container.
-
-[Install Docker-CE](https://docs.docker.com/install/linux/docker-ce/ubuntu/) if you haven't. Register an account at public Docker registry [Docker Hub](https://hub.docker.com/) if you do not have a private Docker registry.
-
-We need to build a Keras image to run Keras workload on PAI, this can be done in two steps:
-
-1. Build a base Docker image for PAI Keras. We prepared a [base Dockerfile](../../job-tutorial/Dockerfiles/cuda8.0-cudnn6/Dockerfile.build.base) which can be built directly.
-
- ```bash
- $ cd ../job-tutorial/Dockerfiles/cuda8.0-cudnn6
- $ sudo docker build -f Dockerfile.build.base \
- > -t pai.build.base:hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04 .
- $ cd -
- ```
-
-You can also directly use [cntk](../cntk/Dockerfile.example.cntk)/[tensorflow](../tensorflow/Dockerfile.example.tensorflow) base image as Keras backend.
-
-2. Prepare Keras envoriment in a [Dockerfile](./Dockerfile.example.keras.tensorflow_backend) using tensorflow base image as Keras backend.
-
- Write a Keras Dockerfile and save it to `Dockerfile.example.Keras`:
-
- ```dockerfile
- # use tensorflow as Keras backend
- FROM openpai/pai.example.tensorflow
-
- # install git
- RUN apt-get -y update && apt-get -y install git
-
- # install Keras python package using pip
- RUN pip install keras
-
- WORKDIR /root
-
- # clone Keras examples
- RUN git clone https://github.com/keras-team/keras.git
-
- # set tensorflow as keras backend
- ENV KERAS_BACKEND tensorflow
-
- # set work directory to keras examples
- WORKDIR /root/keras/examples
- ```
-
- Build the Docker image from `Dockerfile.example.Keras`:
-
- ```bash
- $ sudo docker build -f Dockerfile.example.keras -t pai.example.keras .
- ```
-
- Push the Docker image to a Docker registry:
-
- ```bash
- $ sudo docker tag pai.example.keras USER/pai.example.keras
- $ sudo docker push USER/pai.example.keras
- ```
- *Note: Replace USER with the Docker Hub username you registered, you will be required to login before pushing Docker image.*
-
-
-## Advanced environment
-
-You can skip this section if you do not need to prepare other dependencies.
-
-You can customize runtime Keras environment in Dockerfile, for example, adding other dependeces in Dockerfile:
-
-```dockerfile
-# install other packages using apt-get
-RUN apt-get -y update && apt-get -y install git PACKAGE
-
-# install other packages using pip
-RUN pip install PACKAGE
-```
-
-
-# Keras examples
-
-To run Keras examples in PAI, you need to prepare a job configuration file and submit it through webportal.
-
-Please built your image and pushed it to your Docker registry, replace image `openpai/pai.example.keras.[cntk|tensorflow]` with your own.
+OpenPAI packages the Docker environment required by the job. Refer to [DOCKER.md](./DOCKER.md) to customize this example's Docker environment. If you have built a customized image and pushed it to Docker Hub, replace our pre-built image `openpai/pai.example.keras.[cntk|tensorflow]` with your own.
Here're some configuration file examples:
@@ -133,6 +49,9 @@ Here're some configuration file examples:
}
```
+## Keras cntk backend MNIST digit recognition examples
+
### [mnist_cntk_backend](https://github.com/keras-team/keras/blob/master/examples/mnist_cnn.py)
```json
{
@@ -153,9 +72,7 @@ Here're some configuration file examples:
For more details on how to write a job configuration file, please refer to [job tutorial](../../docs/job_tutorial.md#json-config-file-for-job-submission).
-
-## FAQ
-
-### Speed
+### Note
Since PAI runs Keras jobs in Docker, the training speed on PAI should be similar to the speed on the host.
+
diff --git a/examples/keras/keras.tensorflow_backend.mnist.json b/examples/keras/keras.tensorflow_backend.mnist.json
index 5adccc987f..749aa243dd 100644
--- a/examples/keras/keras.tensorflow_backend.mnist.json
+++ b/examples/keras/keras.tensorflow_backend.mnist.json
@@ -1,6 +1,6 @@
{
"jobName": "keras_tensorflow_backend_mnist",
- "image": "openpai/pai.example.keras.tensorflow",
+ "image": "openpai/pai.example.keras.tensorflow:v1.10",
"taskRoles": [
{
"name": "mnist",
@@ -11,4 +11,4 @@
"command": "python mnist_cnn.py"
}
]
-}
\ No newline at end of file
+}
diff --git a/examples/mpi/DOCKER.md b/examples/mpi/DOCKER.md
new file mode 100644
index 0000000000..f36723e09f
--- /dev/null
+++ b/examples/mpi/DOCKER.md
@@ -0,0 +1,46 @@
+# MPI on PAI docker env
+
+## Contents
+
+1. [Basic environment](#basic-environment)
+2. [Advanced environment](#advanced-environment)
+
+## Basic environment
+
+First of all, PAI runs all jobs in Docker containers.
+
+[Install Docker-CE](https://docs.docker.com/install/linux/docker-ce/ubuntu/) if you haven't. Register an account at the public Docker registry [Docker Hub](https://hub.docker.com/) if you do not have a private Docker registry.
+
+We need to build an Open MPI base image with GPU support to run Open MPI workloads on PAI; this can be done in two steps:
+
+1. Build a base Docker image for PAI. We prepared a [base Dockerfile](../../job-tutorial/Dockerfiles/cuda8.0-cudnn6/Dockerfile.build.base) which can be built directly.
+
+ ```bash
+ $ cd ../job-tutorial/Dockerfiles/cuda8.0-cudnn6
+ $ sudo docker build -f Dockerfile.build.base \
+ > -t pai.build.base:hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04 .
+ $ cd -
+ ```
+
+2. Build an Open MPI Docker image for PAI. We prepared an [mpi Dockerfile](../../job-tutorial/Dockerfiles/cuda8.0-cudnn6/Dockerfile.build.mpi) which can be built directly.
+
+ ```bash
+ $ cd ../job-tutorial/Dockerfiles/cuda8.0-cudnn6
+ $ sudo docker build -f Dockerfile.build.mpi \
+ > -t pai.build.mpi:openmpi1.10.4-hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04 .
+ $ cd -
+ ```
+
+
+## Advanced environment
+
+You can build runtime TensorFlow or CNTK Docker images based on the MPI base image;
+for example, we prepared a [TensorFlow mpi Dockerfile](./Dockerfile.example.tensorflow-mpi) and a [CNTK mpi Dockerfile](./Dockerfile.example.cntk-mpi) which can be referred to.
+
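+For example, a sketch of building the TensorFlow MPI image from the prepared Dockerfile (the tag name mirrors the push step below):
+
+```bash
+$ sudo docker build -f Dockerfile.example.tensorflow-mpi -t pai.example.tensorflow-mpi .
+```
+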
+Push the Docker image to a Docker registry; we use the TensorFlow mpi Docker image as an example:
+
+```bash
+$ sudo docker tag pai.example.tensorflow-mpi USER/pai.example.tensorflow-mpi
+$ sudo docker push USER/pai.example.tensorflow-mpi
+```
+*Note: Replace USER with the Docker Hub username you registered; you will be required to log in before pushing the Docker image.*
diff --git a/examples/mpi/README.md b/examples/mpi/README.md
index fac4c7816a..f0cfc6163f 100644
--- a/examples/mpi/README.md
+++ b/examples/mpi/README.md
@@ -18,69 +18,29 @@
-->
-# MPI on PAI
+# MPI on OpenPAI
-This guide introduces how to run [Open MPI](https://www.open-mpi.org/) workload on PAI.
+This guide introduces how to run [Open MPI](https://www.open-mpi.org/) workload on OpenPAI.
The following contents show some basic Open MPI examples; other customized MPI code can be run similarly.
-
## Contents
-1. [Basic environment](#basic-environment)
-2. [Advanced environment](#advanced-environment)
-3. [Open MPI examples](#open-mpi-examples)
-
-
-## Basic environment
-
-First of all, PAI runs all jobs in Docker container.
-
-[Install Docker-CE](https://docs.docker.com/install/linux/docker-ce/ubuntu/) if you haven't. Register an account at public Docker registry [Docker Hub](https://hub.docker.com/) if you do not have a private Docker registry.
-
-We need to build a Open MPI base image with GPU support to run Open MPI workload on PAI, this can be done in two steps:
-
-1. Build a base Docker image for PAI. We prepared a [base Dockerfile](../../job-tutorial/Dockerfiles/cuda8.0-cudnn6/Dockerfile.build.base) which can be built directly.
-
- ```bash
- $ cd ../job-tutorial/Dockerfiles/cuda8.0-cudnn6
- $ sudo docker build -f Dockerfile.build.base \
- > -t pai.build.base:hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04 .
- $ cd -
- ```
-
-2. Build an Open MPI Docker image for PAI. We prepared a [mpi Dockerfile](../../job-tutorial/Dockerfiles/cuda8.0-cudnn6/Dockerfile.build.mpi) which can be built directly.
-
- ```bash
- $ cd ../job-tutorial/Dockerfiles/cuda8.0-cudnn6
- $ sudo docker build -f Dockerfile.build.mpi \
- > -t pai.build.mpi:openmpi1.10.4-hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04 .
- $ cd -
- ```
+1. [Open MPI TensorFlow CIFAR-10 example](#open-mpi-tensorflow-cifar-10-example)
+2. [Open MPI CNTK grapheme-to-phoneme conversion example](#open-mpi-cntk-grapheme-to-phoneme-conversion-example)
-## Advanced environment
+# Open MPI examples
-You can build runtime TensorFlow or CNTK Docker images based on the MPI base image,
-for example, we prepared [TensorFlow mpi Dockerfile](./Dockerfile.example.tensorflow-mpi) and [CNTK mpi Dockerfile](./Dockerfile.example.cntk-mpi) which can be refered to.
+To run Open MPI examples in OpenPAI, you need to prepare a job configuration file and submit it through webportal.
-Push the Docker image to a Docker registry, we use TensorFlow mpi Docker image as an example:
-
-```bash
-$ sudo docker tag pai.example.tensorflow-mpi USER/pai.example.tensorflow-mpi
-$ sudo docker push USER/pai.example.tensorflow-mpi
-```
-*Note: Replace USER with the Docker Hub username you registered, you will be required to login before pushing Docker image.*
-
-
-# Open MPI examples
-
-To run Open MPI examples in PAI, you need to prepare a job configuration file and submit it through webportal.
-
-If you have built your image and pushed it to Docker Hub, replace our pre-built image `openpai/pai.example.tensorflow-mpi` or `openpai/pai.example.cntk-mpi` with your own.
+OpenPAI packages the Docker environment required by the job. Refer to [DOCKER.md](./DOCKER.md) to customize this example's Docker environment. If you have built a customized image and pushed it to Docker Hub, replace our pre-built images `openpai/pai.example.tensorflow-mpi` and `openpai/pai.example.cntk-mpi` with your own.
Here're some configuration file examples:
+## Open MPI TensorFlow CIFAR-10 example
+
### [TensorFlow cifar10 benchmark](https://git.io/vF4wT)
+
```js
{
"jobName": "tensorflow-mpi",
@@ -116,7 +76,10 @@ Here're some configuration file examples:
}
```
+## Open MPI CNTK grapheme-to-phoneme conversion example
+
### [CNTK G2P example](https://github.com/Microsoft/CNTK/tree/master/Examples/SequenceToSequence/CMUDict/BrainScript)
+
```js
{
"jobName": "cntk-mpi",
diff --git a/examples/mxnet/DOCKER.md b/examples/mxnet/DOCKER.md
new file mode 100644
index 0000000000..42b52656ac
--- /dev/null
+++ b/examples/mxnet/DOCKER.md
@@ -0,0 +1,76 @@
+# Apache MXNet on PAI docker env
+
+## Contents
+
+1. [Basic environment](#basic-environment)
+2. [Advanced environment](#advanced-environment)
+
+## Basic environment
+
+First of all, PAI runs all jobs in Docker containers.
+
+[Install Docker-CE](https://docs.docker.com/install/linux/docker-ce/ubuntu/) if you haven't. Register an account at the public Docker registry [Docker Hub](https://hub.docker.com/) if you do not have a private Docker registry.
+
+You can also skip the build and directly use the [pre-built images](https://hub.docker.com/r/openpai/pai.example.mxnet/) on Docker Hub.
+
+We need to build an MXNet image with GPU support to run MXNet workloads on PAI; this can be done in two steps:
+
+1. Build a base Docker image for PAI. We prepared a [base Dockerfile](../../job-tutorial/Dockerfiles/cuda8.0-cudnn6/Dockerfile.build.base) which can be built directly.
+
+ ```bash
+ $ cd ../job-tutorial/Dockerfiles/cuda8.0-cudnn6
+ $ sudo docker build -f Dockerfile.build.base \
+ > -t pai.build.base:hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04 .
+ $ cd -
+ ```
+
+2. Prepare the MXNet environment in a [Dockerfile](./Dockerfile.example.mxnet) using the base image.
+
+   Write an MXNet Dockerfile and save it to `Dockerfile.example.mxnet`:
+
+ ```dockerfile
+ FROM pai.build.base:hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04
+
+ # install git
+ RUN apt-get -y update && apt-get -y install git
+
+   # install MXNet dependencies using pip
+ RUN pip install mxnet-cu80
+
+ # clone MXNet examples
+ RUN git clone https://github.com/apache/incubator-mxnet.git
+ ```
+
+ Build the Docker image from `Dockerfile.example.mxnet`:
+
+ ```bash
+ $ sudo docker build -f Dockerfile.example.mxnet -t pai.example.mxnet .
+ ```
+
+ Push the Docker image to a Docker registry:
+
+ ```bash
+ $ sudo docker tag pai.example.mxnet USER/pai.example.mxnet
+ $ sudo docker push USER/pai.example.mxnet
+ ```
+   *Note: Replace USER with the Docker Hub username you registered; you will be required to log in before pushing the Docker image.*
+
+
+## Advanced environment
+
+You can skip this section if you do not need to prepare other dependencies.
+
+You can customize the runtime MXNet environment in [Dockerfile.example.mxnet](./Dockerfile.example.mxnet), for example by adding other dependencies:
+
+```dockerfile
+FROM pai.build.base:hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04
+
+# install other packages using apt-get
+RUN apt-get -y update && apt-get -y install git PACKAGE
+
+# install other packages using pip
+RUN pip install mxnet-cu80 PACKAGE
+
+# clone MXNet examples
+RUN git clone https://github.com/apache/incubator-mxnet.git
+```
diff --git a/examples/mxnet/README.md b/examples/mxnet/README.md
index 1a406d5463..95704857d8 100644
--- a/examples/mxnet/README.md
+++ b/examples/mxnet/README.md
@@ -23,91 +23,17 @@
This guide introduces how to run an [Apache MXNet](https://mxnet.apache.org/) workload on PAI.
The following contents show some basic MXNet examples; other customized MXNet code can be run similarly.
-
## Contents
-1. [Basic environment](#basic-environment)
-2. [Advanced environment](#advanced-environment)
-3. [MXNet examples](#mxnet-examples)
-4. [Frequently asked questions](#faq)
-
-
-## Basic environment
-
-First of all, PAI runs all jobs in Docker container.
-
-[Install Docker-CE](https://docs.docker.com/install/linux/docker-ce/ubuntu/) if you haven't. Register an account at public Docker registry [Docker Hub](https://hub.docker.com/) if you do not have a private Docker registry.
-
-You can also jump to [MXNet examples](#mxnet-examples) using [pre-built images](https://hub.docker.com/r/openpai/pai.example.mxnet/) on Docker Hub.
-
-We need to build a MXNet image with GPU support to run MXNet workload on PAI, this can be done in two steps:
-
-1. Build a base Docker image for PAI. We prepared a [base Dockerfile](../../job-tutorial/Dockerfiles/cuda8.0-cudnn6/Dockerfile.build.base) which can be built directly.
-
- ```bash
- $ cd ../job-tutorial/Dockerfiles/cuda8.0-cudnn6
- $ sudo docker build -f Dockerfile.build.base \
- > -t pai.build.base:hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04 .
- $ cd -
- ```
-
-2. Prepare MXNet envoriment in a [Dockerfile](./Dockerfile.example.mxnet) using the base image.
-
- Write a MXNet Dockerfile and save it to `Dockerfile.example.mxnet`:
-
- ```dockerfile
- FROM pai.build.base:hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04
-
- # install git
- RUN apt-get -y update && apt-get -y install git
-
- # install MXNet dependeces using pip
- RUN pip install mxnet-cu80
-
- # clone MXNet examples
- RUN git clone https://github.com/apache/incubator-mxnet.git
- ```
-
- Build the Docker image from `Dockerfile.example.mxnet`:
-
- ```bash
- $ sudo docker build -f Dockerfile.example.mxnet -t pai.example.mxnet .
- ```
+1. [MXNet autoencoder examples](#mxnet-autoencoder-examples)
+2. [MXNet image classification examples](#mxnet-image-classification-examples)
- Push the Docker image to a Docker registry:
-
- ```bash
- $ sudo docker tag pai.example.mxnet USER/pai.example.mxnet
- $ sudo docker push USER/pai.example.mxnet
- ```
- *Note: Replace USER with the Docker Hub username you registered, you will be required to login before pushing Docker image.*
-
-
-## Advanced environment
-
-You can skip this section if you do not need to prepare other dependencies.
-
-You can customize runtime MXNet environment in [Dockerfile.example.mxnet](./Dockerfile.example.mxnet), for example, adding other dependeces in Dockerfile:
-
-```dockerfile
-FROM pai.build.base:hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04
-
-# install other packages using apt-get
-RUN apt-get -y update && apt-get -y install git PACKAGE
-
-# install other packages using pip
-RUN pip install mxnet-cu80 PACKAGE
-
-# clone MXNet examples
-RUN git clone https://github.com/apache/incubator-mxnet.git
-```
-
-
-# MXNet examples
+## MXNet autoencoder examples
To run MXNet examples in PAI, you need to prepare a job configuration file and submit it through webportal.
-If you have built your image and pushed it to Docker Hub, replace our pre-built image `openpai/pai.example.mxnet` with your own.
+OpenPAI packages the Docker environment required by the job. Refer to [DOCKER.md](./DOCKER.md) to customize this example's Docker environment. If you have built a customized image and pushed it to Docker Hub, replace our pre-built image `openpai/pai.example.mxnet` with your own.
+
Here're some configuration file examples:
@@ -129,6 +55,8 @@ Here're some configuration file examples:
}
```
+## MXNet image classification examples
+
### [image classification](https://github.com/apache/incubator-mxnet/tree/master/example/image-classification)
```json
{
@@ -149,9 +77,6 @@ Here're some configuration file examples:
For more details on how to write a job configuration file, please refer to [job tutorial](../../docs/job_tutorial.md#json-config-file-for-job-submission).
-
-## FAQ
-
-### Speed
+### Note
Since PAI runs MXNet jobs in Docker, the training speed on PAI should be similar to the speed on the host.
diff --git a/examples/pytorch/DOCKER.md b/examples/pytorch/DOCKER.md
new file mode 100644
index 0000000000..bf1d4bb6c1
--- /dev/null
+++ b/examples/pytorch/DOCKER.md
@@ -0,0 +1,76 @@
+# PyTorch on PAI docker env
+
+## Contents
+
+1. [Basic environment](#basic-environment)
+2. [Advanced environment](#advanced-environment)
+
+## Basic environment
+
+First of all, PAI runs all jobs in Docker containers.
+
+[Install Docker-CE](https://docs.docker.com/install/linux/docker-ce/ubuntu/) if you haven't. Register an account at the public Docker registry [Docker Hub](https://hub.docker.com/) if you do not have a private Docker registry.
+
+You can also skip the build and directly use the [pre-built images](https://hub.docker.com/r/openpai/pai.example.pytorch/) on Docker Hub.
+
+We need to build a PyTorch image with GPU support to run PyTorch workloads on PAI; this can be done in two steps:
+
+1. Build a base Docker image for PAI. We prepared a [base Dockerfile](../../job-tutorial/Dockerfiles/cuda8.0-cudnn6/Dockerfile.build.base) which can be built directly.
+
+ ```bash
+ $ cd ../job-tutorial/Dockerfiles/cuda8.0-cudnn6
+ $ sudo docker build -f Dockerfile.build.base \
+ > -t pai.build.base:hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04 .
+ $ cd -
+ ```
+
+2. Prepare the PyTorch environment in a [Dockerfile](./Dockerfile.example.pytorch) using the base image.
+
+ Write a PyTorch Dockerfile and save it to `Dockerfile.example.pytorch`:
+
+ ```dockerfile
+ FROM pai.build.base:hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04
+
+ # install git
+ RUN apt-get -y update && apt-get -y install git
+
+   # install PyTorch dependencies using pip
+ RUN pip install torch torchvision
+
+ # clone PyTorch examples
+ RUN git clone https://github.com/pytorch/examples.git
+ ```
+
+ Build the Docker image from `Dockerfile.example.pytorch`:
+
+ ```bash
+ $ sudo docker build -f Dockerfile.example.pytorch -t pai.example.pytorch .
+ ```
+
+ Push the Docker image to a Docker registry:
+
+ ```bash
+ $ sudo docker tag pai.example.pytorch USER/pai.example.pytorch
+ $ sudo docker push USER/pai.example.pytorch
+ ```
+   *Note: Replace USER with the Docker Hub username you registered; you will be required to log in before pushing the Docker image.*
+
+
+## Advanced environment
+
+You can skip this section if you do not need to prepare other dependencies.
+
+You can customize the runtime PyTorch environment in [Dockerfile.example.pytorch](./Dockerfile.example.pytorch), for example by adding other dependencies:
+
+```dockerfile
+FROM pai.build.base:hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04
+
+# install other packages using apt-get
+RUN apt-get -y update && apt-get -y install git PACKAGE
+
+# install other packages using pip
+RUN pip install torch torchvision PACKAGE
+
+# clone PyTorch examples
+RUN git clone https://github.com/pytorch/examples.git
+```
diff --git a/examples/pytorch/README.md b/examples/pytorch/README.md
index ad9db48ee0..f245c28108 100644
--- a/examples/pytorch/README.md
+++ b/examples/pytorch/README.md
@@ -18,96 +18,22 @@
-->
-# PyTorch on PAI
+# PyTorch on OpenPAI
-This guide introduces how to run [PyTorch](http://pytorch.org/) workload on PAI.
+This guide introduces how to run a [PyTorch](http://pytorch.org/) job on OpenPAI.
The following contents show some basic PyTorch examples; other customized PyTorch code can be run similarly.
## Contents
-1. [Basic environment](#basic-environment)
-2. [Advanced environment](#advanced-environment)
-3. [PyTorch examples](#pytorch-examples)
-4. [Frequently asked questions](#faq)
+1. [PyTorch MNIST digit recognition](#pytorch-mnist-digit-recognition-examples)
+2. [PyTorch regression examples](#pytorch-regression-examples)
+## PyTorch MNIST digit recognition examples
-## Basic environment
+To run PyTorch examples in OpenPAI, you need to prepare a job configuration file and submit it through webportal.
-First of all, PAI runs all jobs in Docker container.
-
-[Install Docker-CE](https://docs.docker.com/install/linux/docker-ce/ubuntu/) if you haven't. Register an account at public Docker registry [Docker Hub](https://hub.docker.com/) if you do not have a private Docker registry.
-
-You can also jump to [PyTorch examples](#pytorch-examples) using [pre-built images](https://hub.docker.com/r/openpai/pai.example.pytorch/) on Docker Hub.
-
-We need to build a PyTorch image with GPU support to run PyTorch workload on PAI, this can be done in two steps:
-
-1. Build a base Docker image for PAI. We prepared a [base Dockerfile](../../job-tutorial/Dockerfiles/cuda8.0-cudnn6/Dockerfile.build.base) which can be built directly.
-
- ```bash
- $ cd ../job-tutorial/Dockerfiles/cuda8.0-cudnn6
- $ sudo docker build -f Dockerfile.build.base \
- > -t pai.build.base:hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04 .
- $ cd -
- ```
-
-2. Prepare PyTorch envoriment in a [Dockerfile](./Dockerfile.example.pytorch) using the base image.
-
- Write a PyTorch Dockerfile and save it to `Dockerfile.example.pytorch`:
-
- ```dockerfile
- FROM pai.build.base:hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04
-
- # install git
- RUN apt-get -y update && apt-get -y install git
-
- # install PyTorch dependeces using pip
- RUN pip install torch torchvision
-
- # clone PyTorch examples
- RUN git clone https://github.com/pytorch/examples.git
- ```
-
- Build the Docker image from `Dockerfile.example.pytorch`:
-
- ```bash
- $ sudo docker build -f Dockerfile.example.pytorch -t pai.example.pytorch .
- ```
-
- Push the Docker image to a Docker registry:
-
- ```bash
- $ sudo docker tag pai.example.pytorch USER/pai.example.pytorch
- $ sudo docker push USER/pai.example.pytorch
- ```
- *Note: Replace USER with the Docker Hub username you registered, you will be required to login before pushing Docker image.*
-
-
-## Advanced environment
-
-You can skip this section if you do not need to prepare other dependencies.
-
-You can customize runtime PyTorch environment in [Dockerfile.example.pytorch](./Dockerfile.example.pytorch), for example, adding other dependeces in Dockerfile:
-
-```dockerfile
-FROM pai.build.base:hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04
-
-# install other packages using apt-get
-RUN apt-get -y update && apt-get -y install git PACKAGE
-
-# install other packages using pip
-RUN pip install torch torchvision PACKAGE
-
-# clone PyTorch examples
-RUN git clone https://github.com/pytorch/examples.git
-```
-
-
-# PyTorch examples
-
-To run PyTorch examples in PAI, you need to prepare a job configuration file and submit it through webportal.
-
-If you have built your image and pushed it to Docker Hub, replace our pre-built image `openpai/pai.example.pytorch` with your own.
+OpenPAI packages the Docker environment required by the job. Refer to [DOCKER.md](./DOCKER.md) to customize this example's Docker environment. If you have built a customized image and pushed it to Docker Hub, replace our pre-built image `openpai/pai.example.pytorch` with your own.
Here're some configuration file examples:
@@ -129,6 +55,8 @@ Here're some configuration file examples:
}
```
+## PyTorch regression examples
+
### [regression](https://github.com/pytorch/examples/tree/master/regression)
```json
{
@@ -149,9 +77,6 @@ Here're some configuration file examples:
For more details on how to write a job configuration file, please refer to [job tutorial](../../docs/job_tutorial.md#json-config-file-for-job-submission).
-
-## FAQ
-
-### Speed
+### Note
Since PAI runs PyTorch jobs in Docker, the training speed on PAI should be similar to the speed on the host.
diff --git a/examples/scikit-learn/DOCKER.md b/examples/scikit-learn/DOCKER.md
new file mode 100644
index 0000000000..64b0370586
--- /dev/null
+++ b/examples/scikit-learn/DOCKER.md
@@ -0,0 +1,77 @@
+# scikit-learn on PAI docker env
+
+## Contents
+
+1. [Basic environment](#basic-environment)
+2. [Advanced environment](#advanced-environment)
+
+## Basic environment
+
+First of all, PAI runs all jobs in Docker containers.
+
+[Install Docker-CE](https://docs.docker.com/install/linux/docker-ce/ubuntu/) if you haven't. Register an account at the public Docker registry [Docker Hub](https://hub.docker.com/) if you do not have a private Docker registry.
+
+You can also skip the build and directly use the [pre-built images](https://hub.docker.com/r/openpai/pai.example.sklearn/) on Docker Hub.
+
+We need to build a scikit-learn image to run scikit-learn workloads on PAI; this can be done in two steps:
+
+1. Build a base Docker image for PAI. We prepared a [base Dockerfile](../../job-tutorial/Dockerfiles/cuda8.0-cudnn6/Dockerfile.build.base) which can be built directly.
+
+ ```bash
+ $ cd ../job-tutorial/Dockerfiles/cuda8.0-cudnn6
+ $ sudo docker build -f Dockerfile.build.base \
+ > -t pai.build.base:hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04 .
+ $ cd -
+ ```
+
+2. Prepare the scikit-learn environment in a [Dockerfile](./Dockerfile.example.sklearn) using the base image.
+
+ Write a scikit-learn Dockerfile and save it to `Dockerfile.example.sklearn`:
+
+ ```dockerfile
+ FROM pai.build.base:hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04
+
+ # install git
+ RUN apt-get -y update && apt-get -y install git
+
+ # install scikit-learn using pip
+ RUN pip install numpy pandas scipy scikit-learn
+
+ # clone scikit-learn examples
+ RUN git clone https://github.com/scikit-learn/scikit-learn.git
+ ```
+
+ Build the Docker image from `Dockerfile.example.sklearn`:
+
+ ```bash
+ $ sudo docker build -f Dockerfile.example.sklearn -t pai.example.sklearn .
+ ```
+
+ Push the Docker image to a Docker registry:
+
+ ```bash
+ $ sudo docker tag pai.example.sklearn USER/pai.example.sklearn
+ $ sudo docker push USER/pai.example.sklearn
+ ```
+   *Note: Replace USER with the Docker Hub username you registered; you will be required to log in before pushing the Docker image.*
+
+
+## Advanced environment
+
+You can skip this section if you do not need to prepare other dependencies.
+
+You can customize the runtime scikit-learn environment in [Dockerfile.example.sklearn](./Dockerfile.example.sklearn), for example by adding other dependencies:
+
+```dockerfile
+FROM pai.build.base:hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04
+
+# install other packages using apt-get
+RUN apt-get -y update && apt-get -y install git PACKAGE
+
+# install other packages using pip
+RUN pip install numpy pandas scipy scikit-learn PACKAGE
+
+# clone scikit-learn examples
+RUN git clone https://github.com/scikit-learn/scikit-learn.git
+```
+
diff --git a/examples/scikit-learn/README.md b/examples/scikit-learn/README.md
index 6ac27dddb6..77658025de 100644
--- a/examples/scikit-learn/README.md
+++ b/examples/scikit-learn/README.md
@@ -18,96 +18,22 @@
-->
-# scikit-learn on PAI
+# scikit-learn on OpenPAI
-This guide introduces how to run [scikit-learn](http://scikit-learn.org/stable/) workload on PAI.
+This guide introduces how to run a [scikit-learn](http://scikit-learn.org/stable/) job on OpenPAI.
The following contents show some basic scikit-learn examples; other customized scikit-learn code can be run similarly.
## Contents
-1. [Basic environment](#basic-environment)
-2. [Advanced environment](#advanced-environment)
-3. [scikit-learn examples](#scikit-learn-examples)
-4. [Frequently asked questions](#faq)
+1. [scikit-learn MNIST digit recognition example](#scikit-learn-mnist-digit-recognition-example)
+2. [scikit-learn text-vectorizers example](#scikit-learn-text-vectorizers-example)
+## scikit-learn MNIST digit recognition example
-## Basic environment
+To run scikit-learn examples in OpenPAI, you need to prepare a job configuration file and submit it through webportal.
-First of all, PAI runs all jobs in Docker container.
-
-[Install Docker-CE](https://docs.docker.com/install/linux/docker-ce/ubuntu/) if you haven't. Register an account at public Docker registry [Docker Hub](https://hub.docker.com/) if you do not have a private Docker registry.
-
-You can also jump to [scikit-learn examples](#scikit-learn-examples) using [pre-built images](https://hub.docker.com/r/openpai/pai.example.sklearn/) on Docker Hub.
-
-We need to build a scikit-learn image to run scikit-learn workload on PAI, this can be done in two steps:
-
-1. Build a base Docker image for PAI. We prepared a [base Dockerfile](../../job-tutorial/Dockerfiles/cuda8.0-cudnn6/Dockerfile.build.base) which can be built directly.
-
- ```bash
- $ cd ../job-tutorial/Dockerfiles/cuda8.0-cudnn6
- $ sudo docker build -f Dockerfile.build.base \
- > -t pai.build.base:hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04 .
- $ cd -
- ```
-
-2. Prepare scikit-learn envoriment in a [Dockerfile](./Dockerfile.example.sklearn) using the base image.
-
- Write a scikit-learn Dockerfile and save it to `Dockerfile.example.sklearn`:
-
- ```dockerfile
- FROM pai.build.base:hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04
-
- # install git
- RUN apt-get -y update && apt-get -y install git
-
- # install scikit-learn using pip
- RUN pip install numpy pandas scipy scikit-learn
-
- # clone scikit-learn examples
- RUN git clone https://github.com/scikit-learn/scikit-learn.git
- ```
-
- Build the Docker image from `Dockerfile.example.sklearn`:
-
- ```bash
- $ sudo docker build -f Dockerfile.example.sklearn -t pai.example.sklearn .
- ```
-
- Push the Docker image to a Docker registry:
-
- ```bash
- $ sudo docker tag pai.example.sklearn USER/pai.example.sklearn
- $ sudo docker push USER/pai.example.sklearn
- ```
- *Note: Replace USER with the Docker Hub username you registered, you will be required to login before pushing Docker image.*
-
-
-## Advanced environment
-
-You can skip this section if you do not need to prepare other dependencies.
-
-You can customize runtime scikit-learn environment in [Dockerfile.example.sklearn](./Dockerfile.example.sklearn), for example, adding other dependeces in Dockerfile:
-
-```dockerfile
-FROM pai.build.base:hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04
-
-# install other packages using apt-get
-RUN apt-get -y update && apt-get -y install git PACKAGE
-
-# install other packages using pip
-RUN pip install numpy pandas scipy scikit-learn PACKAGE
-
-# clone scikit-learn examples
-RUN git clone https://github.com/scikit-learn/scikit-learn.git
-```
-
-
-# scikit-learn examples
-
-To run scikit-learn examples in PAI, you need to prepare a job configuration file and submit it through webportal.
-
-If you have built your image and pushed it to Docker Hub, replace our pre-built image `openpai/pai.example.sklearn` with your own.
+OpenPAI packages the Docker environment required by the job. Refer to [DOCKER.md](./DOCKER.md) to customize this example's Docker environment. If you have built a customized image and pushed it to Docker Hub, replace our pre-built image `openpai/pai.example.sklearn` with your own.
Here're some configuration file examples:
@@ -129,6 +55,8 @@ Here're some configuration file examples:
}
```
+## scikit-learn text-vectorizers example
+
### [text-vectorizers](https://github.com/scikit-learn/scikit-learn/blob/master/benchmarks/bench_text_vectorizers.py)
```json
{
@@ -149,9 +77,6 @@ Here're some configuration file examples:
For more details on how to write a job configuration file, please refer to [job tutorial](../../docs/job_tutorial.md#json-config-file-for-job-submission).
-
-## FAQ
-
-### Speed
+### Note
Since PAI runs scikit-learn jobs in Docker, the training speed on PAI should be similar to the speed on the host.
diff --git a/examples/serving/DOCKER.md b/examples/serving/DOCKER.md
new file mode 100644
index 0000000000..f3c179aa9b
--- /dev/null
+++ b/examples/serving/DOCKER.md
@@ -0,0 +1,41 @@
+# Model Serving on PAI docker env
+
+## Contents
+
+1. [Basic environment](#basic-environment)
+
+## Basic environment
+
+First of all, PAI runs all jobs in Docker containers.
+
+[Install Docker-CE](https://docs.docker.com/install/linux/docker-ce/ubuntu/) if you haven't. Register an account at the public Docker registry [Docker Hub](https://hub.docker.com/) if you do not have a private Docker registry.
+
+We use TensorFlow model serving as an example; for how to serve a TensorFlow model, please refer to its [serving tutorial](https://www.tensorflow.org/serving/serving_basic).
+
+You can also skip the build and directly use the [pre-built images](https://hub.docker.com/r/openpai/pai.example.tensorflow-serving/) on Docker Hub.
+
+We need to build a TensorFlow serving image with GPU support to serve a TensorFlow model on PAI; this can be done in two steps:
+
+1. Build a base Docker image for PAI. We prepared a [base Dockerfile](../../job-tutorial/Dockerfiles/cuda9.0-cudnn7/Dockerfile.build.base) which can be built directly.
+
+ ```bash
+ $ cd ../job-tutorial/Dockerfiles/cuda9.0-cudnn7
+ $ sudo docker build -f Dockerfile.build.base \
+ > -t pai.build.base:hadoop2.7.2-cuda9.0-cudnn7-devel-ubuntu16.04 .
+ $ cd -
+ ```
+
+2. Build the TensorFlow serving Docker image for PAI. We use the [TensorFlow serving Dockerfile](./Dockerfile.example.tensorflow-serving) adapted from its tutorial.
+
+   ```bash
+   $ sudo docker build -f Dockerfile.example.tensorflow-serving -t pai.example.tensorflow-serving .
+   ```
+
+ Then push the Docker image to a Docker registry:
+
+ ```bash
+ $ sudo docker tag pai.example.tensorflow-serving USER/pai.example.tensorflow-serving
+ $ sudo docker push USER/pai.example.tensorflow-serving
+ ```
+   *Note: Replace USER with the Docker Hub username you registered; you will be required to log in before pushing the Docker image.*
+
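+Once pushed, a container from this image can serve a model with the `tensorflow_model_server` binary built above. A minimal sketch, assuming a SavedModel has been exported under `/models/mnist` inside the image (the model name and path are placeholders):
+
+```bash
+$ sudo docker run -p 8500:8500 USER/pai.example.tensorflow-serving \
+> tensorflow_model_server --port=8500 --model_name=mnist --model_base_path=/models/mnist
+```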
diff --git a/examples/serving/Dockerfile.example.tensorflow-serving b/examples/serving/Dockerfile.example.tensorflow-serving
new file mode 100644
index 0000000000..3a17a03bbf
--- /dev/null
+++ b/examples/serving/Dockerfile.example.tensorflow-serving
@@ -0,0 +1,143 @@
+# Copyright 2018 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+FROM pai.build.base:hadoop2.7.2-cuda9.0-cudnn7-devel-ubuntu16.04 as base_build
+
+ARG TF_SERVING_VERSION_GIT_BRANCH=master
+ARG TF_SERVING_VERSION_GIT_COMMIT=head
+
+LABEL maintainer=gvasudevan@google.com
+LABEL tensorflow_serving_github_branchtag=${TF_SERVING_VERSION_GIT_BRANCH}
+LABEL tensorflow_serving_github_commit=${TF_SERVING_VERSION_GIT_COMMIT}
+
+ENV NCCL_VERSION=2.2.13
+ENV CUDNN_VERSION=7.1.4.18
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+ automake \
+ build-essential \
+ cuda-command-line-tools-9-0 \
+ cuda-cublas-dev-9-0 \
+ cuda-cudart-dev-9-0 \
+ cuda-cufft-dev-9-0 \
+ cuda-curand-dev-9-0 \
+ cuda-cusolver-dev-9-0 \
+ cuda-cusparse-dev-9-0 \
+ curl \
+ git \
+ libfreetype6-dev \
+ libpng12-dev \
+ libtool \
+ libcudnn7=${CUDNN_VERSION}-1+cuda9.0 \
+ libcudnn7-dev=${CUDNN_VERSION}-1+cuda9.0 \
+ libcurl3-dev \
+ libnccl2=${NCCL_VERSION}-1+cuda9.0 \
+ libnccl-dev=${NCCL_VERSION}-1+cuda9.0 \
+ libzmq3-dev \
+ mlocate \
+ openjdk-8-jdk\
+ openjdk-8-jre-headless \
+ pkg-config \
+ python-dev \
+ software-properties-common \
+ swig \
+ unzip \
+ wget \
+ zip \
+ zlib1g-dev \
+ && \
+ apt-get clean && \
+ rm -rf /var/lib/apt/lists/* && \
+ find /usr/local/cuda-9.0/lib64/ -type f -name 'lib*_static.a' -not -name 'libcudart_static.a' -delete && \
+ rm /usr/lib/x86_64-linux-gnu/libcudnn_static_v7.a
+
+RUN curl -fSsL -O https://bootstrap.pypa.io/get-pip.py && \
+ python get-pip.py && \
+ rm get-pip.py
+
+RUN pip --no-cache-dir install \
+ grpcio \
+ h5py \
+ keras_applications==1.0.4 \
+ keras_preprocessing==1.0.2 \
+ mock \
+ numpy==1.14.5
+
+# Set up Bazel
+# Need >= 0.15.0 so bazel compiles work with docker bind mounts.
+ENV BAZEL_VERSION 0.15.0
+WORKDIR /
+RUN mkdir /bazel && \
+ cd /bazel && \
+ curl -H "User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36" -fSsL -O https://github.com/bazelbuild/bazel/releases/download/$BAZEL_VERSION/bazel-$BAZEL_VERSION-installer-linux-x86_64.sh && \
+ curl -H "User-Agent: Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36" -fSsL -o /bazel/LICENSE.txt https://raw.githubusercontent.com/bazelbuild/bazel/master/LICENSE && \
+ chmod +x bazel-*.sh && \
+ ./bazel-$BAZEL_VERSION-installer-linux-x86_64.sh && \
+ cd / && \
+ rm -f /bazel/bazel-$BAZEL_VERSION-installer-linux-x86_64.sh
+
+# Build TensorFlow with the CUDA configuration
+ENV CI_BUILD_PYTHON python
+ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
+ENV TF_NEED_CUDA 1
+ENV TF_CUDA_COMPUTE_CAPABILITIES=3.0,3.5,5.2,6.0,6.1,7.0
+ENV TF_CUDA_VERSION=9.0
+ENV TF_CUDNN_VERSION=7
+
+# Fix paths so that CUDNN can be found: https://github.com/tensorflow/tensorflow/issues/8264
+WORKDIR /
+RUN mkdir /usr/lib/x86_64-linux-gnu/include/ && \
+  ln -s /usr/include/cudnn.h /usr/lib/x86_64-linux-gnu/include/cudnn.h && \
+ ln -s /usr/include/cudnn.h /usr/local/cuda/include/cudnn.h && \
+ ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so /usr/local/cuda/lib64/libcudnn.so && \
+ ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.${TF_CUDNN_VERSION} /usr/local/cuda/lib64/libcudnn.so.${TF_CUDNN_VERSION}
+
+# NCCL 2.x
+ENV TF_NCCL_VERSION=2
+ENV NCCL_INSTALL_PATH=/usr/lib/nccl/
+
+# Fix paths so that NCCL can be found
+WORKDIR /
+RUN mkdir -p ${NCCL_INSTALL_PATH} && \
+ mkdir ${NCCL_INSTALL_PATH}include/ && \
+ mkdir ${NCCL_INSTALL_PATH}lib/ && \
+ ln -s /usr/include/nccl.h ${NCCL_INSTALL_PATH}include/nccl.h && \
+ ln -s /usr/lib/x86_64-linux-gnu/libnccl.so ${NCCL_INSTALL_PATH}lib/libnccl.so && \
+ ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.${TF_NCCL_VERSION} ${NCCL_INSTALL_PATH}lib/libnccl.so.${TF_NCCL_VERSION}
+
+# Set TMP for nvidia build environment
+ENV TMP="/tmp"
+
+# Download TF Serving sources (optionally at specific commit).
+WORKDIR /tensorflow-serving
+RUN git clone --branch=${TF_SERVING_VERSION_GIT_BRANCH} https://github.com/tensorflow/serving . && \
+ git remote add upstream https://github.com/tensorflow/serving.git && \
+ if [ "${TF_SERVING_VERSION_GIT_COMMIT}" != "head" ]; then git checkout ${TF_SERVING_VERSION_GIT_COMMIT} ; fi
+
+FROM base_build as binary_build
+# Build, and install TensorFlow Serving
+ARG TF_SERVING_BUILD_OPTIONS="--copt=-mavx --cxxopt=-D_GLIBCXX_USE_CXX11_ABI=0"
+ARG TF_SERVING_BAZEL_OPTIONS=""
+RUN ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1 && \
+ LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:${LD_LIBRARY_PATH} \
+ bazel build -c opt --color=yes --curses=yes --config=cuda \
+ ${TF_SERVING_BAZEL_OPTIONS} \
+ --verbose_failures \
+ --output_filter=DONT_MATCH_ANYTHING \
+ ${TF_SERVING_BUILD_OPTIONS} \
+ tensorflow_serving/model_servers:tensorflow_model_server && \
+ cp bazel-bin/tensorflow_serving/model_servers/tensorflow_model_server /usr/local/bin/ && \
+ rm /usr/local/cuda/lib64/stubs/libcuda.so.1 && \
+ bazel build -c opt tensorflow_serving/example:mnist_saved_model
+
+CMD ["/bin/bash"]
\ No newline at end of file
diff --git a/examples/serving/README.md b/examples/serving/README.md
index 42fdf07e5f..9b9de3a177 100644
--- a/examples/serving/README.md
+++ b/examples/serving/README.md
@@ -18,64 +18,17 @@
-->
-# Model Serving on PAI
+# Model Serving on OpenPAI
-This guide introduces how to run model serving workload on PAI.
+This guide introduces how to run a model serving job on OpenPAI.
A serving system for machine learning models is designed for production environments, making it easy to deploy new algorithms and experiments to users.
The following contents show some basic model serving examples; other customized serving code can be run similarly.
-
-## Contents
-
-1. [Basic environment](#basic-environment)
-2. [Serving a TensorFlow model](#serving-a-tensorflow-model)
-
-
-## Basic environment
-
-First of all, PAI runs all jobs in Docker container.
-
-[Install Docker-CE](https://docs.docker.com/install/linux/docker-ce/ubuntu/) if you haven't. Register an account at public Docker registry [Docker Hub](https://hub.docker.com/) if you do not have a private Docker registry.
-
-We use TensorFlow model serving as an example, for how to serve a TensorFlow model, please refer to its [serving tutorial](https://www.tensorflow.org/serving/serving_basic).
-
-You can also jump to [Serving a TensorFlow model](#serving-a-tensorflow-model) using [pre-built images](https://hub.docker.com/r/openpai/pai.example.tensorflow-serving/) on Docker Hub.
-
-We need to build a TensorFlow serving image with GPU support to serve a TensorFlow model on PAI, this can be done in two steps:
-
-1. Build a base Docker image for PAI. We prepared a [base Dockerfile](../../job-tutorial/Dockerfiles/cuda8.0-cudnn6/Dockerfile.build.base) which can be built directly.
-
- ```bash
- $ cd ../job-tutorial/Dockerfiles/cuda8.0-cudnn6
- $ sudo docker build -f Dockerfile.build.base \
- > -t pai.build.base:hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04 .
- $ cd -
- ```
-
-2. Build the TensorFlow serving Docker image for PAI. We use the [TensorFlow serving Dockerfile](https://github.com/tensorflow/serving/blob/master/tensorflow_serving/tools/docker/Dockerfile.devel-gpu) provided in its tutorial.
-
- Download the Dockerfile and built it from the base Docker image.
-
- ```bash
- $ wget --no-check-certificate -O Dockerfile.example.tensorflow-serving https://raw.githubusercontent.com/tensorflow/serving/master/tensorflow_serving/tools/docker/Dockerfile.devel-gpu
- $ sed -i "/FROM/c\FROM pai.build.base:hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04" Dockerfile.example.tensorflow-serving
- $ sudo docker build -f Dockerfile.example.tensorflow-serving .
- ```
-
- Then push the Docker image to a Docker registry:
-
- ```bash
- $ sudo docker tag pai.example.tensorflow-serving USER/pai.example.tensorflow-serving
- $ sudo docker push USER/pai.example.tensorflow-serving
- ```
- *Note: Replace USER with the Docker Hub username you registered, you will be required to login before pushing Docker image.*
-
-
-# Serving a TensorFlow model
+# Serving a TensorFlow MNIST digit recognition model
To run TensorFlow model serving, you need to prepare a job configuration file and submit it through webportal.
-If you have built your image and pushed it to Docker Hub, replace our pre-built image `openpai/pai.example.tensorflow-serving` with your own.
+OpenPAI ships a pre-built Docker image containing the environment this job requires. Refer to [DOCKER.md](./DOCKER.md) to customize this example Docker environment. If you have built a customized image and pushed it to Docker Hub, replace our pre-built image `openpai/pai.example.tensorflow-serving` with your own.
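+
+For example, to serve from your own image, point the `image` field of the job configuration at it (a minimal sketch; `USER` stands for your Docker Hub username):
+
+```js
+  "image": "USER/pai.example.tensorflow-serving",
+```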
Here're some configuration file examples:
@@ -106,3 +59,4 @@ Here're some configuration file examples:
```
For more details on how to write a job configuration file, please refer to [job tutorial](../../docs/job_tutorial.md#json-config-file-for-job-submission).
+
diff --git a/examples/tensorflow/DOCKER.md b/examples/tensorflow/DOCKER.md
new file mode 100644
index 0000000000..1f86926152
--- /dev/null
+++ b/examples/tensorflow/DOCKER.md
@@ -0,0 +1,77 @@
+# TensorFlow on OpenPAI Docker environment
+
+## Contents
+
+1. [Basic environment](#basic-environment)
+2. [Advanced environment](#advanced-environment)
+
+## Basic environment
+
+First of all, OpenPAI runs all jobs in Docker containers.
+
+[Install Docker-CE](https://docs.docker.com/install/linux/docker-ce/ubuntu/) if you haven't. Register an account at public Docker registry [Docker Hub](https://hub.docker.com/) if you do not have a private Docker registry.
+
+You can also skip the build and directly use the [pre-built images](https://hub.docker.com/r/openpai/pai.example.tensorflow/) on Docker Hub.
+
+We need to build a TensorFlow image with GPU support to run TensorFlow workloads on OpenPAI. This can be done in two steps:
+
+1. Build a base Docker image for PAI. We prepared a [base Dockerfile](../../job-tutorial/Dockerfiles/cuda8.0-cudnn6/Dockerfile.build.base) which can be built directly.
+
+ ```bash
+ $ cd ../job-tutorial/Dockerfiles/cuda8.0-cudnn6
+ $ sudo docker build -f Dockerfile.build.base \
+ > -t pai.build.base:hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04 .
+ $ cd -
+ ```
+
+2. Prepare the TensorFlow environment in a [Dockerfile](./Dockerfile.example.tensorflow) using the base image.
+
+ Write a TensorFlow Dockerfile and save it to `Dockerfile.example.tensorflow`:
+
+ ```dockerfile
+ FROM pai.build.base:hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04
+
+ ENV TENSORFLOW_VERSION=1.4.0
+
+ # For how to run TensorFlow on Hadoop,
+ # please refer to https://www.tensorflow.org/deploy/hadoop
+ RUN pip install tensorflow-gpu==${TENSORFLOW_VERSION} && \
+ pip3 install tensorflow-gpu==${TENSORFLOW_VERSION}
+
+ WORKDIR /root
+ ```
+
+ Build the Docker image from `Dockerfile.example.tensorflow`:
+
+ ```bash
+ $ sudo docker build -f Dockerfile.example.tensorflow -t pai.example.tensorflow .
+ ```
+
+ Push the Docker image to a Docker registry:
+
+ ```bash
+ $ sudo docker tag pai.example.tensorflow USER/pai.example.tensorflow
+ $ sudo docker push USER/pai.example.tensorflow
+ ```
+    *Note: Replace USER with the Docker Hub username you registered; you will be required to log in before pushing the Docker image.*
+
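+Before pushing, you can sanity-check the image locally (a quick smoke test; the one-liner just prints the TensorFlow version installed in the image):
+
+```bash
+$ sudo docker run --rm pai.example.tensorflow \
+>     python -c "import tensorflow as tf; print(tf.__version__)"
+```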
+
+## Advanced environment
+
+You can skip this section if you do not need to prepare other dependencies.
+
+You can customize the runtime TensorFlow environment in [Dockerfile.example.tensorflow](./Dockerfile.example.tensorflow), for example, by adding other dependencies in the Dockerfile:
+
+```dockerfile
+FROM pai.build.base:hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04
+
+ENV TENSORFLOW_VERSION=1.4.0
+
+# install other packages using apt-get
+RUN apt-get -y update && apt-get -y install git PACKAGE
+
+# install other packages using pip
+RUN pip install tensorflow-gpu==${TENSORFLOW_VERSION} PACKAGE
+
+WORKDIR /root
+```
\ No newline at end of file
diff --git a/examples/tensorflow/README.md b/examples/tensorflow/README.md
index 0e2892c209..8c823ee14f 100644
--- a/examples/tensorflow/README.md
+++ b/examples/tensorflow/README.md
@@ -18,100 +18,29 @@
-->
-# TensorFlow on PAI
+# TensorFlow on OpenPAI
-This guide introduces how to run [TensorFlow](https://www.tensorflow.org/) workload on PAI.
+This guide introduces how to run a [TensorFlow](https://www.tensorflow.org/) job on OpenPAI.
The following contents show some basic TensorFlow examples, other customized TensorFlow code can be run similarly.
## Contents
-1. [Basic environment](#basic-environment)
-2. [Advanced environment](#advanced-environment)
-3. [TensorFlow examples](#tensorflow-examples)
-
-
-## Basic environment
-
-First of all, PAI runs all jobs in Docker container.
-
-[Install Docker-CE](https://docs.docker.com/install/linux/docker-ce/ubuntu/) if you haven't. Register an account at public Docker registry [Docker Hub](https://hub.docker.com/) if you do not have a private Docker registry.
-
-You can also jump to [TensorFlow examples](#tensorflow-examples) using [pre-built images](https://hub.docker.com/r/openpai/pai.example.tensorflow/) on Docker Hub.
-
-We need to build a TensorFlow image with GPU support to run TensorFlow workload on PAI, this can be done in two steps:
-
-1. Build a base Docker image for PAI. We prepared a [base Dockerfile](../../job-tutorial/Dockerfiles/cuda8.0-cudnn6/Dockerfile.build.base) which can be built directly.
-
- ```bash
- $ cd ../job-tutorial/Dockerfiles/cuda8.0-cudnn6
- $ sudo docker build -f Dockerfile.build.base \
- > -t pai.build.base:hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04 .
- $ cd -
- ```
-
-2. Prepare TensorFlow envoriment in a [Dockerfile](./Dockerfile.example.tensorflow) using the base image.
-
- Write a TensorFlow Dockerfile and save it to `Dockerfile.example.tensorflow`:
-
- ```dockerfile
- FROM pai.build.base:hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04
-
- ENV TENSORFLOW_VERSION=1.4.0
-
- # For how to run TensorFlow on Hadoop,
- # please refer to https://www.tensorflow.org/deploy/hadoop
- RUN pip install tensorflow-gpu==${TENSORFLOW_VERSION} && \
- pip3 install tensorflow-gpu==${TENSORFLOW_VERSION}
-
- WORKDIR /root
- ```
-
- Build the Docker image from `Dockerfile.example.tensorflow`:
-
- ```bash
- $ sudo docker build -f Dockerfile.example.tensorflow -t pai.example.tensorflow .
- ```
-
- Push the Docker image to a Docker registry:
-
- ```bash
- $ sudo docker tag pai.example.tensorflow USER/pai.example.tensorflow
- $ sudo docker push USER/pai.example.tensorflow
- ```
- *Note: Replace USER with the Docker Hub username you registered, you will be required to login before pushing Docker image.*
-
-
-## Advanced environment
-
-You can skip this section if you do not need to prepare other dependencies.
-
-You can customize runtime TensorFlow environment in [Dockerfile.example.tensorflow](./Dockerfile.example.tensorflow), for example, adding other dependeces in Dockerfile:
-
-```dockerfile
-FROM pai.build.base:hadoop2.7.2-cuda8.0-cudnn6-devel-ubuntu16.04
-
-ENV TENSORFLOW_VERSION=1.4.0
-
-# install other packages using apt-get
-RUN apt-get -y update && apt-get -y install git PACKAGE
-
-# install other packages using pip
-RUN pip install tensorflow-gpu==${TENSORFLOW_VERSION} PACKAGE
-
-WORKDIR /root
-```
-
+1. [TensorFlow CIFAR-10 image classification](#tensorflow-cifar-10-image-classification)
+2. [TensorFlow ImageNet image classification](#tensorflow-imagenet-image-classification)
+3. [Distributed TensorFlow CIFAR-10 image classification](#distributed-tensorflow-cifar-10-image-classification)
+4. [TensorFlow Tensorboard](#tensorflow-tensorboard)
# TensorFlow examples
-To run TensorFlow examples in PAI, you need to prepare a job configuration file and submit it through webportal.
+To run TensorFlow examples in OpenPAI, you need to prepare a job configuration file and submit it through webportal.
-If you have built your image and pushed it to Docker Hub, replace our pre-built image `openpai/pai.example.tensorflow` with your own.
+OpenPAI ships a pre-built Docker image containing the environment this job requires. Refer to [DOCKER.md](./DOCKER.md) to customize this example Docker environment. If you have built a customized image and pushed it to Docker Hub, replace our pre-built image `openpai/pai.example.tensorflow` with your own.
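+
+In that case, change the `image` field in each configuration below (a sketch; `USER` stands for your Docker Hub username):
+
+```js
+  "image": "USER/pai.example.tensorflow",
+```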
Here're some configuration file examples:
-### Image classification on CIFAR-10
+### TensorFlow CIFAR-10 image classification
+
```js
{
"jobName": "tensorflow-cifar10",
@@ -133,7 +62,8 @@ Here're some configuration file examples:
}
```
-### Image classification on ImageNet
+### TensorFlow ImageNet image classification
+
```js
{
"jobName": "tensorflow-imagenet",
@@ -159,7 +89,8 @@ Here're some configuration file examples:
}
```
-### Distributed traning on CIFAR-10
+### Distributed TensorFlow CIFAR-10 image classification
+
```js
{
"jobName": "tensorflow-distributed-cifar10",
@@ -195,7 +126,8 @@ Here're some configuration file examples:
}
```
-### Tensorboard
+### TensorFlow Tensorboard
+
```js
{
"jobName": "tensorflow-tensorboard",
@@ -220,3 +152,4 @@ Here're some configuration file examples:
```
For more details on how to write a job configuration file, please refer to [job tutorial](../../docs/job_tutorial.md#json-config-file-for-job-submission).
+
diff --git a/frameworklauncher/bin/start.bat b/frameworklauncher/bin/start.bat
index 6f961c73ac..d4229b848f 100644
--- a/frameworklauncher/bin/start.bat
+++ b/frameworklauncher/bin/start.bat
@@ -23,8 +23,9 @@ pushd %~dp0
if not defined LAUNCHER_LOG_DIR (
set LAUNCHER_LOG_DIR=.\logs
)
+set LAUNCHER_OPTS=%LAUNCHER_OPTS% -DLAUNCHER_LOG_DIR=%LAUNCHER_LOG_DIR%
set PATH=%PATH%;%HADOOP_HOME%\bin;%JAVA_HOME%\bin
for /f %%i in ('hadoop classpath') do set HADOOP_CLASSPATH=%%i
-java -DLAUNCHER_LOG_DIR=%LAUNCHER_LOG_DIR% -cp *;%CLASSPATH%;%HADOOP_CLASSPATH% com.microsoft.frameworklauncher.service.Bootstrap
+java %LAUNCHER_OPTS% -cp *;%CLASSPATH%;%HADOOP_CLASSPATH% com.microsoft.frameworklauncher.service.Bootstrap
popd
\ No newline at end of file
diff --git a/frameworklauncher/bin/start.sh b/frameworklauncher/bin/start.sh
old mode 100644
new mode 100755
index e1d567e768..4cc9e34589
--- a/frameworklauncher/bin/start.sh
+++ b/frameworklauncher/bin/start.sh
@@ -21,9 +21,10 @@ pushd "${0%/*}"
if [ "$LAUNCHER_LOG_DIR" = "" ]; then
export LAUNCHER_LOG_DIR=./logs
fi
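+# LAUNCHER_OPTS may be pre-set by the caller to pass extra JVM flags, e.g. LAUNCHER_OPTS="-Xmx4g" ./start.sh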
+export LAUNCHER_OPTS="$LAUNCHER_OPTS -DLAUNCHER_LOG_DIR=$LAUNCHER_LOG_DIR"
export PATH=$PATH:$HADOOP_HOME/bin:$JAVA_HOME/bin
export LD_LIBRARY_PATH=$HADOOP_HOME/lib/native
export HADOOP_CLASSPATH=$(hadoop classpath)
-java -DLAUNCHER_LOG_DIR=$LAUNCHER_LOG_DIR -cp *:$CLASSPATH:$HADOOP_CLASSPATH com.microsoft.frameworklauncher.service.Bootstrap
+java $LAUNCHER_OPTS -cp *:$CLASSPATH:$HADOOP_CLASSPATH com.microsoft.frameworklauncher.service.Bootstrap
popd
\ No newline at end of file
diff --git a/frameworklauncher/build-internal.bat b/frameworklauncher/build-internal.bat
index a5f299e22f..863ccbf54e 100644
--- a/frameworklauncher/build-internal.bat
+++ b/frameworklauncher/build-internal.bat
@@ -46,7 +46,7 @@ if %exitcode% neq 0 (
if %exitcode% neq 0 (
echo Failed to make Binary Distributions with exitcode %exitcode%
) else (
- echo Succeed to make Binary Distributions with exitcode %exitcode%
+    echo Succeeded in making Binary Distributions with exitcode %exitcode%
)
popd
exit %exitcode%
diff --git a/frameworklauncher/build.sh b/frameworklauncher/build.sh
index 322fd5db85..23ec4b061b 100755
--- a/frameworklauncher/build.sh
+++ b/frameworklauncher/build.sh
@@ -22,7 +22,7 @@ stop() {
if [ $exitcode != 0 ]; then
echo Failed to make Binary Distributions with exitcode $exitcode
else
- echo Succeed to make Binary Distributions with exitcode $exitcode
+    echo Succeeded in making Binary Distributions with exitcode $exitcode
fi
popd
exit $exitcode
diff --git a/frameworklauncher/src/main/java/com/microsoft/frameworklauncher/applicationmaster/ApplicationMaster.java b/frameworklauncher/src/main/java/com/microsoft/frameworklauncher/applicationmaster/ApplicationMaster.java
index 690a2adde5..bcc87985f1 100644
--- a/frameworklauncher/src/main/java/com/microsoft/frameworklauncher/applicationmaster/ApplicationMaster.java
+++ b/frameworklauncher/src/main/java/com/microsoft/frameworklauncher/applicationmaster/ApplicationMaster.java
@@ -473,28 +473,17 @@ private Boolean tryToReleaseContainer(String containerId) {
}
}
- private float getApplicationProgress() throws Exception {
- String requestManagerLogScope = "RequestManager_GetApplicationProgress";
- String statusManagerLogScope = "StatusManager_GetApplicationProgress";
- CHANGE_AWARE_LOGGER.initializeScope(requestManagerLogScope, Level.DEBUG);
- CHANGE_AWARE_LOGGER.initializeScope(statusManagerLogScope, Level.DEBUG);
-
- try {
- return requestManager.getApplicationProgress();
- } catch (Exception reqEx) {
- CHANGE_AWARE_LOGGER.log(requestManagerLogScope,
- "Failed to getApplicationProgress from RequestManager.%s",
- CommonUtils.toString(reqEx));
-
- try {
- return statusManager.getApplicationProgress();
- } catch (Exception statEx) {
- CHANGE_AWARE_LOGGER.log(statusManagerLogScope,
- "Failed to getApplicationProgress from StatusManager. Return 0 Progress.%s",
- CommonUtils.toString(reqEx));
- return 0;
- }
+ private float getApplicationProgress() {
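+    // Fallback chain: prefer the progress overridden through the
+    // RequestManager, then the task completion ratio from the
+    // StatusManager, and finally 0 when neither is available yet.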
+ Float progress = requestManager == null ? null :
+ requestManager.getApplicationProgress();
+ if (progress == null) {
+ progress = statusManager == null ? null :
+ statusManager.getApplicationProgress();
}
+ if (progress == null) {
+ progress = (float) 0;
+ }
+ return progress;
}
private TaskStatus findTask(Container container) {
@@ -1362,16 +1351,22 @@ public void onShutdownRequest() {
"onShutdownRequest called into AM from RM, maybe this Attempt does not exist in RM.");
}
- public float getProgress() throws Exception {
+ public float getProgress() {
// Note queueSystemTask and wait its result here will block the RMClient
// Deliver ApplicationProgress to RM on next heartbeat
- float progress = getApplicationProgress();
+ float progress;
+ try {
+ progress = getApplicationProgress();
+ } catch (Throwable e) {
+ LOGGER.logWarning(e,
+ "Failed to getApplicationProgress. Using default Progress 0");
+ progress = 0;
+ }
String logScope = "getApplicationProgress";
CHANGE_AWARE_LOGGER.initializeScope(logScope, Level.DEBUG);
CHANGE_AWARE_LOGGER.log(logScope,
- "getProgress called into AM from RM: Progress: [%s]", progress);
-
+ "getProgress called into AM from RM: Report Progress: [%s]", progress);
return progress;
}
diff --git a/frameworklauncher/src/main/java/com/microsoft/frameworklauncher/applicationmaster/RMClientCallbackHandler.java b/frameworklauncher/src/main/java/com/microsoft/frameworklauncher/applicationmaster/RMClientCallbackHandler.java
index 5da4b31ed1..16fb9e68da 100644
--- a/frameworklauncher/src/main/java/com/microsoft/frameworklauncher/applicationmaster/RMClientCallbackHandler.java
+++ b/frameworklauncher/src/main/java/com/microsoft/frameworklauncher/applicationmaster/RMClientCallbackHandler.java
@@ -41,11 +41,7 @@ public void onShutdownRequest() {
}
public float getProgress() {
- try {
- return am.getProgress();
- } catch (Exception e) {
- return 0;
- }
+ return am.getProgress();
}
public void onNodesUpdated(List updatedNodes) {
diff --git a/frameworklauncher/src/main/java/com/microsoft/frameworklauncher/applicationmaster/RequestManager.java b/frameworklauncher/src/main/java/com/microsoft/frameworklauncher/applicationmaster/RequestManager.java
index 431db189e6..108e09ce08 100644
--- a/frameworklauncher/src/main/java/com/microsoft/frameworklauncher/applicationmaster/RequestManager.java
+++ b/frameworklauncher/src/main/java/com/microsoft/frameworklauncher/applicationmaster/RequestManager.java
@@ -396,14 +396,9 @@ public Integer getServiceVersion(String taskRoleName) {
return getTaskService(taskRoleName).getVersion();
}
- public Float getApplicationProgress() throws Exception {
- Float progress = overrideApplicationProgressRequest.getApplicationProgress().floatValue();
- if (progress >= 0) {
- return progress;
- } else {
- throw new Exception(String.format(
- "ApplicationProgress %s is not nonnegative", progress));
- }
+ public Float getApplicationProgress() {
+ return overrideApplicationProgressRequest == null ? null :
+ overrideApplicationProgressRequest.getApplicationProgress().floatValue();
}
public Boolean existsLocalVersionFrameworkRequest() {
diff --git a/frameworklauncher/src/main/java/com/microsoft/frameworklauncher/applicationmaster/StatusManager.java b/frameworklauncher/src/main/java/com/microsoft/frameworklauncher/applicationmaster/StatusManager.java
index b84b3c4dc5..0387096fd4 100644
--- a/frameworklauncher/src/main/java/com/microsoft/frameworklauncher/applicationmaster/StatusManager.java
+++ b/frameworklauncher/src/main/java/com/microsoft/frameworklauncher/applicationmaster/StatusManager.java
@@ -757,14 +757,10 @@ public synchronized Boolean isAllTaskInFinalState() {
return (getFinalStateTaskCount() == getTaskCount());
}
- public synchronized float getApplicationProgress() throws Exception {
- float progress = (float) getFinalStateTaskCount() / getTaskCount();
- if (progress >= 0) {
- return progress;
- } else {
- throw new Exception(String.format(
- "ApplicationProgress %s is not nonnegative", progress));
- }
+ public synchronized Float getApplicationProgress() {
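+    // Progress is undefined until at least one task exists;
+    // return null so the caller can fall back.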
+ int totalTaskCount = getTaskCount();
+ return totalTaskCount == 0 ? null :
+ (float) getFinalStateTaskCount() / totalTaskCount;
}
public synchronized ContainerRequest getContainerRequest(TaskStatusLocator locator) {
diff --git a/frameworklauncher/src/main/java/com/microsoft/frameworklauncher/common/log/ChangeAwareLogger.java b/frameworklauncher/src/main/java/com/microsoft/frameworklauncher/common/log/ChangeAwareLogger.java
index 1e34538371..59240007da 100644
--- a/frameworklauncher/src/main/java/com/microsoft/frameworklauncher/common/log/ChangeAwareLogger.java
+++ b/frameworklauncher/src/main/java/com/microsoft/frameworklauncher/common/log/ChangeAwareLogger.java
@@ -49,12 +49,13 @@ public synchronized void initializeScope(String scope, Level changedLogLevel, Le
unchangedLogLevels.put(scope, unchangedLogLevel);
}
- public synchronized void log(String scope, String format, Object... args) throws Exception {
+ public synchronized void log(String scope, String format, Object... args) {
String msg = CommonUtils.formatString(format, args);
if (!changedLogLevels.containsKey(scope)) {
- throw new Exception(String.format(
- "Scope [%s] is not initialized for before log it.", scope));
+ LOGGER.logWarning("Scope [%s] is not initialized for before log it. " +
+ "Using default scope configuration.", scope);
+ changedLogLevels.put(scope, Level.INFO);
}
if (lastLogs.containsKey(scope) && lastLogs.get(scope).equals(msg)) {
diff --git a/frameworklauncher/src/main/java/com/microsoft/frameworklauncher/common/model/LauncherConfiguration.java b/frameworklauncher/src/main/java/com/microsoft/frameworklauncher/common/model/LauncherConfiguration.java
index daff49c82e..5e80809aeb 100644
--- a/frameworklauncher/src/main/java/com/microsoft/frameworklauncher/common/model/LauncherConfiguration.java
+++ b/frameworklauncher/src/main/java/com/microsoft/frameworklauncher/common/model/LauncherConfiguration.java
@@ -116,6 +116,11 @@ public class LauncherConfiguration implements Serializable {
private String webServerAddress = "http://localhost:9086";
private Integer webServerStatusPullIntervalSec = 30;
private Boolean webServerAclEnable = false;
+  // If this feature is enabled, the ACL check is skipped for any Framework that
+  // does not belong to a Namespace.
+  // It should only be enabled for compatibility with existing Frameworks that
+  // were PUT before ACL was enabled.
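+  // Example (frameworklauncher.yml): webServerAclIgnoreWithoutNamespace: true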
+ private Boolean webServerAclIgnoreWithoutNamespace = false;
public String getZkConnectString() {
return zkConnectString;
@@ -412,4 +417,12 @@ public Boolean getWebServerAclEnable() {
public void setWebServerAclEnable(Boolean webServerAclEnable) {
this.webServerAclEnable = webServerAclEnable;
}
+
+ public Boolean getWebServerAclIgnoreWithoutNamespace() {
+ return webServerAclIgnoreWithoutNamespace;
+ }
+
+ public void setWebServerAclIgnoreWithoutNamespace(Boolean webServerAclIgnoreWithoutNamespace) {
+ this.webServerAclIgnoreWithoutNamespace = webServerAclIgnoreWithoutNamespace;
+ }
}
diff --git a/frameworklauncher/src/main/java/com/microsoft/frameworklauncher/webserver/LauncherModule.java b/frameworklauncher/src/main/java/com/microsoft/frameworklauncher/webserver/LauncherModule.java
index 738f085bf0..c45ed3dbdc 100644
--- a/frameworklauncher/src/main/java/com/microsoft/frameworklauncher/webserver/LauncherModule.java
+++ b/frameworklauncher/src/main/java/com/microsoft/frameworklauncher/webserver/LauncherModule.java
@@ -123,7 +123,19 @@ private void checkWritableAccess(
checkNonFrameworkWritableAccess(user, aclConf);
} else {
validateFrameworkUserConsistency(user, frameworkUser);
- String namespace = CommonValidation.validateAndGetNamespace(frameworkName);
+
+ String namespace;
+ try {
+ namespace = CommonValidation.validateAndGetNamespace(frameworkName);
+ } catch (Exception e) {
+ // Framework does not belong to any Namespace
+ if (conf.getWebServerAclIgnoreWithoutNamespace()) {
+ return;
+ } else {
+ throw e;
+ }
+ }
+ // Framework belongs to a Namespace
checkFrameworkWritableAccess(user, namespace, aclConf);
}
}
diff --git a/hadoop-ai/README.md b/hadoop-ai/README.md
index 268e7e6cb3..4ee2c64417 100644
--- a/hadoop-ai/README.md
+++ b/hadoop-ai/README.md
@@ -96,7 +96,6 @@ Usually there will have multiple patch files, the newest one is the last known g
Use hadoop-2.9.0.tar.gz to update the hadoop-binary setting in services-configuration.yaml under your cluster configs path:
custom-hadoop-binary-path: ***/hadoop-dist/target/hadoop-2.9.0.tar.gz
- hadoop-version: 2.9.0
diff --git a/hadoop-ai/hadoop-build/README.md b/hadoop-ai/hadoop-build/README.md
index e98408fba7..a5f85171bf 100644
--- a/hadoop-ai/hadoop-build/README.md
+++ b/hadoop-ai/hadoop-build/README.md
@@ -40,10 +40,9 @@ currently we support two hadoop versions: 2.7.2 and 2.9.0, If you want to switch
2. Change the patch file name in build.sh if necessary.
-3. Change two hadoop-binary settings in services-configuration.yaml under your cluster configs path:
+3. Change the hadoop-binary setting in services-configuration.yaml under your cluster configs path:
custom-hadoop-binary-path: /hadoop-binary/hadoop-2.9.0.tar.gz
- hadoop-version: 2.9.0
4. Change the done file ID in pai-management/paiLibrary/managementTool/ubuntu16.04/hadoop-ai-build.sh
diff --git a/hadoop-ai/hadoop-build/build.sh b/hadoop-ai/hadoop-build/build.sh
index 16b1eade69..1ad556067d 100644
--- a/hadoop-ai/hadoop-build/build.sh
+++ b/hadoop-ai/hadoop-build/build.sh
@@ -30,9 +30,12 @@ cd hadoop
git checkout branch-2.9.0
cp /hadoop-2.9.0.gpu-port.patch /hadoop
+cp /HDFS-13773.patch /hadoop
+cp /docker-executor.patch /hadoop
git apply hadoop-2.9.0.gpu-port.patch
git apply HDFS-13773.patch
+git apply docker-executor.patch
mvn package -Pdist,native -DskipTests -Dmaven.javadoc.skip=true -Dtar
@@ -43,4 +46,5 @@ echo "Successfully build hadoop 2.9.0 AI"
# When Changing the patch id, please update the filename here.
-touch /hadoop-binary/12932984-done
+rm /hadoop-binary/*-done
+touch /hadoop-binary/12932984-12933562-docker_executor-done
diff --git a/hadoop-ai/hadoop-build/docker-executor.patch b/hadoop-ai/hadoop-build/docker-executor.patch
new file mode 100644
index 0000000000..4a93b2b0ee
--- /dev/null
+++ b/hadoop-ai/hadoop-build/docker-executor.patch
@@ -0,0 +1,123 @@
+diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
+index 96f6c57..1b89e90 100644
+--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
++++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
+@@ -1544,6 +1544,14 @@ public static boolean isAclEnabled(Configuration conf) {
+ public static final String NM_DOCKER_CONTAINER_EXECUTOR_IMAGE_NAME =
+ NM_PREFIX + "docker-container-executor.image-name";
+
++ /** The Docker run option(For DockerContainerExecutor).*/
++ public static final String NM_DOCKER_CONTAINER_EXECUTOR_EXEC_OPTION =
++ NM_PREFIX + "docker-container-executor.exec-option";
++
++ /** The command before launch script(For DockerContainerExecutor).*/
++ public static final String NM_DOCKER_CONTAINER_EXECUTOR_SCRIPT_COMMAND =
++ NM_PREFIX + "docker-container-executor.script-command";
++
+ /** The name of the docker executor (For DockerContainerExecutor).*/
+ public static final String NM_DOCKER_CONTAINER_EXECUTOR_EXEC_NAME =
+ NM_PREFIX + "docker-container-executor.exec-name";
+diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DockerContainerExecutor.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DockerContainerExecutor.java
+index a044cb6..819c496 100644
+--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DockerContainerExecutor.java
++++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/DockerContainerExecutor.java
+@@ -98,7 +98,7 @@
+ //containername:0.1 or
+ //containername
+ public static final String DOCKER_IMAGE_PATTERN =
+- "^(([\\w\\.-]+)(:\\d+)*\\/)?[\\w\\.:-]+$";
++ "^(([\\w\\.-]+)(:\\d+)*\\/)?([\\w\\.-]+\\/)?[\\w\\.:-]+$";
+
+ private final FileContext lfs;
+ private final Pattern dockerImagePattern;
+@@ -127,7 +127,12 @@ public void init() throws IOException {
+ String dockerExecutor = getConf().get(
+ YarnConfiguration.NM_DOCKER_CONTAINER_EXECUTOR_EXEC_NAME,
+ YarnConfiguration.NM_DEFAULT_DOCKER_CONTAINER_EXECUTOR_EXEC_NAME);
+- if (!new File(dockerExecutor).exists()) {
++    // /usr/bin/docker -H=tcp://0.0.0.0:xx is also a valid docker executor
++ String[] arr = dockerExecutor.split("\\s");
++ if (LOG.isDebugEnabled()) {
++ LOG.debug("dockerExecutor: " + dockerExecutor);
++ }
++ if (!new File(arr[0]).exists()) {
+ throw new IllegalStateException(
+ "Invalid docker exec path: " + dockerExecutor);
+ }
+@@ -181,8 +186,11 @@ public int launchContainer(ContainerStartContext ctx) throws IOException {
+
+ //Variables for the launch environment can be injected from the command-line
+ //while submitting the application
+- String containerImageName = container.getLaunchContext().getEnvironment()
+- .get(YarnConfiguration.NM_DOCKER_CONTAINER_EXECUTOR_IMAGE_NAME);
++    // Get the image name from configuration rather than from the environment
++ String containerImageName = getConf().get(
++ YarnConfiguration.NM_DOCKER_CONTAINER_EXECUTOR_IMAGE_NAME);
++
++ //
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("containerImageName from launchContext: " + containerImageName);
+ }
+@@ -240,19 +248,27 @@ public int launchContainer(ContainerStartContext ctx) throws IOException {
+ //--net=host allows the container to take on the host's network stack
+ //--name sets the Docker Container name to the YARN containerId string
+ //-v is used to bind mount volumes for local, log and work dirs.
++ //-w sets the work dir inside the container
++ //add docker option
++ String dockerOption = getConf().get(
++ YarnConfiguration.NM_DOCKER_CONTAINER_EXECUTOR_EXEC_OPTION);
+ String commandStr = commands.append(dockerExecutor)
+ .append(" ")
+ .append("run")
+ .append(" ")
+- .append("--rm --net=host")
++ .append("--rm --net=host --pid=host --privileged=true")
++ .append(" ")
++ .append("-w " + containerWorkDir.toUri().getPath().toString())
++ .append(" ")
++ .append(dockerOption)
+ .append(" ")
+ .append(" --name " + containerIdStr)
+- .append(localDirMount)
+- .append(logDirMount)
+- .append(containerWorkDirMount)
+ .append(" ")
+ .append(containerImageName)
+ .toString();
++ if (LOG.isDebugEnabled()) {
++ LOG.debug("Docker run command: " + commandStr);
++ }
+ //Get the pid of the process which has been launched as a docker container
+ //using docker inspect
+ String dockerPidScript = "`" + dockerExecutor +
+@@ -597,13 +613,28 @@ private void writeSessionScript(Path launchDst, Path pidFile)
+ // We need to do a move as writing to a file is not atomic
+ // Process reading a file being written to may get garbled data
+ // hence write pid to tmp file first followed by a mv
++      // Move the docker-pid command to the background to avoid blocking the
++      // docker run command; TODO: improve this with a publisher/subscriber mode
++ // Ref: https://issues.apache.org/jira/browse/YARN-3080
+ pout.println("#!/usr/bin/env bash");
+ pout.println();
++ pout.println("{");
++ pout.println("n=10");
++ pout.println("while [ $n -gt 0 ]; do");
++ pout.println("let n=$n-1");
++ pout.println("sleep 5");
+ pout.println("echo "+ dockerPidScript +" > " + pidFile.toString()
+ + ".tmp");
++ pout.println("[ -n \"$(cat \"" + pidFile.toString()
++ + ".tmp\")\" ] && break");
++ pout.println("done");
+ pout.println("/bin/mv -f " + pidFile.toString() + ".tmp " + pidFile);
+- pout.println(dockerCommand + " bash \"" +
+- launchDst.toUri().getPath().toString() + "\"");
++ pout.println("} &");
++ //Add exec command before launch_script.
++ String scriptCommand = getConf().get(
++ YarnConfiguration.NM_DOCKER_CONTAINER_EXECUTOR_SCRIPT_COMMAND);
++ pout.println(dockerCommand + " bash -c '" + scriptCommand + " && bash \"" +
++ launchDst.toUri().getPath().toString() + "\"'");
+ } finally {
+ IOUtils.cleanupWithLogger(LOG, pout, out);
+ }
diff --git a/hadoop-ai/hadoop-build/dockerfile b/hadoop-ai/hadoop-build/dockerfile
index ceeab8759b..3a9d0709ce 100644
--- a/hadoop-ai/hadoop-build/dockerfile
+++ b/hadoop-ai/hadoop-build/dockerfile
@@ -73,6 +73,8 @@ RUN wget https://github.com/google/protobuf/releases/download/v2.5.0/protobuf-2.
## The build environment of hadoop has been prepared above.
## Copy your build script here. Default script will build our hadoop-ai.
+COPY docker-executor.patch /
+
COPY build.sh /
RUN chmod u+x build.sh
diff --git a/pai-management/bootstrap/prometheus/alert-configmap.yaml.template b/pai-management/bootstrap/alert-manager/alert-configmap.yaml.template
similarity index 100%
rename from pai-management/bootstrap/prometheus/alert-configmap.yaml.template
rename to pai-management/bootstrap/alert-manager/alert-configmap.yaml.template
diff --git a/pai-management/bootstrap/prometheus/alert-deployment.yaml.template b/pai-management/bootstrap/alert-manager/alert-manager.yaml.template
similarity index 97%
rename from pai-management/bootstrap/prometheus/alert-deployment.yaml.template
rename to pai-management/bootstrap/alert-manager/alert-manager.yaml.template
index 2f8e6af7ab..97a51b97b7 100644
--- a/pai-management/bootstrap/prometheus/alert-deployment.yaml.template
+++ b/pai-management/bootstrap/alert-manager/alert-manager.yaml.template
@@ -39,8 +39,6 @@ spec:
spec:
hostNetwork: true
hostPID: true
- nodeSelector:
- alertmanager: "true"
containers:
- name: alertmanager
image: prom/alertmanager:v0.15.1
diff --git a/pai-management/bootstrap/pylon/node-label.sh.template b/pai-management/bootstrap/alert-manager/delete.sh
similarity index 82%
rename from pai-management/bootstrap/pylon/node-label.sh.template
rename to pai-management/bootstrap/alert-manager/delete.sh
index 3769debcfb..1b952d11cc 100644
--- a/pai-management/bootstrap/pylon/node-label.sh.template
+++ b/pai-management/bootstrap/alert-manager/delete.sh
@@ -17,8 +17,10 @@
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-{% for host in machinelist %}
- {% if 'pylon' in machinelist[ host ] and machinelist[ host ][ 'pylon' ] == 'true' %}
-kubectl label --overwrite=true nodes {{ machinelist[ host ][ 'nodename' ] }} pylon=true || exit $?
- {% endif %}
-{% endfor %}
+pushd $(dirname "$0") > /dev/null
+
+echo "Call stop script to stop all service first"
+/bin/bash stop.sh || exit $?
+
+
+popd > /dev/null
\ No newline at end of file
diff --git a/pai-management/bootstrap/grafana/node-label.sh.template b/pai-management/bootstrap/alert-manager/refresh.sh.template
similarity index 81%
rename from pai-management/bootstrap/grafana/node-label.sh.template
rename to pai-management/bootstrap/alert-manager/refresh.sh.template
index c49b32cab2..305833cebe 100644
--- a/pai-management/bootstrap/grafana/node-label.sh.template
+++ b/pai-management/bootstrap/alert-manager/refresh.sh.template
@@ -17,8 +17,5 @@
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-{% for host in machinelist %}
- {% if 'grafana' in machinelist[ host ] and machinelist[ host ][ 'grafana' ] == 'true' %}
-kubectl label --overwrite=true nodes {{ machinelist[ host ][ 'nodename' ] }} grafana=true || exit $?
- {% endif %}
-{% endfor %}
+
+# TODO need to refactor refresh.sh
diff --git a/pai-management/bootstrap/rest-server/node-label.sh.template b/pai-management/bootstrap/alert-manager/service.yaml
similarity index 79%
rename from pai-management/bootstrap/rest-server/node-label.sh.template
rename to pai-management/bootstrap/alert-manager/service.yaml
index 07f9607fd5..da86bf1ab0 100644
--- a/pai-management/bootstrap/rest-server/node-label.sh.template
+++ b/pai-management/bootstrap/alert-manager/service.yaml
@@ -1,5 +1,3 @@
-#!/bin/bash
-
# Copyright (c) Microsoft Corporation
# All rights reserved.
#
@@ -17,8 +15,21 @@
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-{% for host in machinelist %}
- {% if 'restserver' in machinelist[ host ] and machinelist[ host ][ 'restserver' ] == 'true' %}
-kubectl label --overwrite=true nodes {{ machinelist[ host ][ 'nodename' ] }} restserver=true || exit $?
- {% endif %}
-{% endfor %}
+prerequisite:
+ - cluster-configuration
+ - drivers
+
+template-list:
+ - alert-manager.yaml
+ - alert-configmap.yaml
+ - start.sh
+
+start-script: start.sh
+stop-script: stop.sh
+delete-script: delete.sh
+refresh-script: refresh.sh
+upgraded-script: upgraded.sh
+
+
+deploy-rules:
+ in: pai-master
\ No newline at end of file
diff --git a/pai-management/bootstrap/webportal/node-label.sh.template b/pai-management/bootstrap/alert-manager/start.sh.template
similarity index 81%
rename from pai-management/bootstrap/webportal/node-label.sh.template
rename to pai-management/bootstrap/alert-manager/start.sh.template
index 587e4c86de..f1f5b863a7 100644
--- a/pai-management/bootstrap/webportal/node-label.sh.template
+++ b/pai-management/bootstrap/alert-manager/start.sh.template
@@ -1,5 +1,6 @@
#!/bin/bash
+
# Copyright (c) Microsoft Corporation
# All rights reserved.
#
@@ -17,8 +18,11 @@
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-{% for host in machinelist %}
- {% if 'webportal' in machinelist[ host ] and machinelist[ host ][ 'webportal' ] == 'true' %}
-kubectl label --overwrite=true nodes {{ machinelist[ host ][ 'nodename' ] }} webportal=true || exit $?
- {% endif %}
-{% endfor %}
+pushd $(dirname "$0") > /dev/null
+
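+# Deploy alert-manager only when alerting is enabled in the Prometheus configuration.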
+{% if clusterinfo['prometheusinfo']['alerting'] %}
+kubectl apply --overwrite=true -f alert-configmap.yaml || exit $?
+kubectl apply --overwrite=true -f alert-manager.yaml || exit $?
+{% endif %}
+
+popd > /dev/null
\ No newline at end of file
diff --git a/pai-management/bootstrap/frameworklauncher/node-label.sh.template b/pai-management/bootstrap/alert-manager/stop.sh
similarity index 81%
rename from pai-management/bootstrap/frameworklauncher/node-label.sh.template
rename to pai-management/bootstrap/alert-manager/stop.sh
index 2db6f0bde2..6dffaadbec 100644
--- a/pai-management/bootstrap/frameworklauncher/node-label.sh.template
+++ b/pai-management/bootstrap/alert-manager/stop.sh
@@ -1,3 +1,4 @@
#!/bin/bash
+
# Copyright (c) Microsoft Corporation
@@ -17,8 +18,11 @@
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-{% for host in machinelist %}
- {% if 'launcher' in machinelist[ host ] and machinelist[ host ][ 'launcher' ] == 'true' %}
-kubectl label --overwrite=true nodes {{ machinelist[ host ][ 'nodename' ] }} launcher=true || exit $?
- {% endif %}
-{% endfor %}
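+# Delete the alert-manager deployment and configmap; --ignore-not-found makes repeated stops safe.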
+INSTANCES="deployment/alertmanager
+configmap/alertmanager
+"
+
+for instance in ${INSTANCES}; do
+ kubectl delete --ignore-not-found --now ${instance}
+done
+
diff --git a/pai-management/bootstrap/frameworklauncher/frameworklauncher-configuration/frameworklauncher.yml b/pai-management/bootstrap/frameworklauncher/frameworklauncher-configuration/frameworklauncher.yml
index 35261f6d0f..5d6bc6cad8 100644
--- a/pai-management/bootstrap/frameworklauncher/frameworklauncher-configuration/frameworklauncher.yml
+++ b/pai-management/bootstrap/frameworklauncher/frameworklauncher-configuration/frameworklauncher.yml
@@ -21,6 +21,8 @@
zkConnectString: {ZOOKEEPER_ADDRESS}:2181
zkRootDir: /Launcher
hdfsRootDir: /Launcher
+rootAdminUsers: !!set
+ ? {name: root}
# Service Setup
serviceRMResyncIntervalSec: 20
diff --git a/pai-management/bootstrap/frameworklauncher/frameworklauncher.yaml.template b/pai-management/bootstrap/frameworklauncher/frameworklauncher.yaml.template
index 5bf664483f..5ae0ab5a89 100644
--- a/pai-management/bootstrap/frameworklauncher/frameworklauncher.yaml.template
+++ b/pai-management/bootstrap/frameworklauncher/frameworklauncher.yaml.template
@@ -30,8 +30,6 @@ spec:
spec:
hostNetwork: true
hostPID: true
- nodeSelector:
- launcher: "true"
containers:
- name: frameworklauncher
image: {{ clusterinfo['dockerregistryinfo']['prefix'] }}frameworklauncher:{{ clusterinfo['dockerregistryinfo']['docker_tag'] }}
diff --git a/pai-management/bootstrap/frameworklauncher/service.yaml b/pai-management/bootstrap/frameworklauncher/service.yaml
index e2a4100a80..76d1610bdb 100644
--- a/pai-management/bootstrap/frameworklauncher/service.yaml
+++ b/pai-management/bootstrap/frameworklauncher/service.yaml
@@ -21,7 +21,6 @@ prerequisite:
- hadoop-batch-job
template-list:
- - node-label.sh
- frameworklauncher.yaml
- stop.sh
- refresh.sh
@@ -31,4 +30,8 @@ start-script: start.sh
stop-script: stop.sh
delete-script: delete.sh
refresh-script: refresh.sh
-upgraded-script: upgraded.sh
\ No newline at end of file
+upgraded-script: upgraded.sh
+
+
+deploy-rules:
+ in: pai-master
\ No newline at end of file
diff --git a/pai-management/bootstrap/frameworklauncher/start.sh b/pai-management/bootstrap/frameworklauncher/start.sh
index 62c082a135..c8863234eb 100755
--- a/pai-management/bootstrap/frameworklauncher/start.sh
+++ b/pai-management/bootstrap/frameworklauncher/start.sh
@@ -19,10 +19,6 @@
pushd $(dirname "$0") > /dev/null
-#chmod u+x node-label.sh
-
-/bin/bash node-label.sh || exit $?
-
#chmod u+x configmap-create.sh
/bin/bash configmap-create.sh || exit $?
diff --git a/pai-management/bootstrap/grafana/configmap-create.sh b/pai-management/bootstrap/grafana/configmap-create.sh
index 8be975d652..ef10d3fae8 100755
--- a/pai-management/bootstrap/grafana/configmap-create.sh
+++ b/pai-management/bootstrap/grafana/configmap-create.sh
@@ -17,4 +17,14 @@
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-kubectl create configmap grafana-configuration --from-file=grafana-configuration/ --dry-run -o yaml | kubectl apply --overwrite=true -f - || exit $?
+# minify json files
+tmp="$(mktemp)"
+for i in `find grafana-configuration/ -type f -regex ".*json" ` ; do
+ cat $i | python minify.py > $tmp
+ mv $tmp $i
+done
+
+# create configmap
+for i in `find grafana-configuration/ -type f -regex ".*json" ` ; do
+ echo --from-file=$i
+done | xargs kubectl create configmap grafana-configuration --dry-run -o yaml | kubectl apply -f - || exit $?
diff --git a/pai-management/bootstrap/grafana/grafana-configuration/pai-clusterview-dashboard.json.template b/pai-management/bootstrap/grafana/grafana-configuration/pai-clusterview-dashboard.json.template
index b52c7098af..3880f3802b 100644
--- a/pai-management/bootstrap/grafana/grafana-configuration/pai-clusterview-dashboard.json.template
+++ b/pai-management/bootstrap/grafana/grafana-configuration/pai-clusterview-dashboard.json.template
@@ -29,30 +29,102 @@
"colorBackground": false,
"colorValue": true,
"colors": [
- "#d44a3a",
"#299c46",
- "#299c46"
+ "rgba(237, 129, 40, 0.89)",
+ "#d44a3a"
],
- "datasource": "PM",
- "description": "",
+ "datasource": null,
"format": "none",
"gauge": {
"maxValue": 100,
"minValue": 0,
- "show": false,
+ "show": true,
"thresholdLabels": false,
"thresholdMarkers": true
},
- "height": "312px",
- "id": 9,
+ "height": "370",
+ "id": 10,
"interval": null,
- "links": [
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
{
- "dashboard": "PAI Cluster Node List",
- "type": "absolute",
- "url": "{{ clusterinfo['grafanainfo']['grafana_url'] }}:{{ clusterinfo['grafanainfo']['grafana_port'] }}/dashboard/db/pai_clusterview"
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
}
],
+ "span": 6,
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": false
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "100 * (yarn_gpus_used / sum(nvidiasmi_attached_gpus))",
+ "format": "time_series",
+ "hide": false,
+ "instant": false,
+ "intervalFactor": 2,
+ "refId": "A"
+ }
+ ],
+ "thresholds": "10, 90",
+ "title": "Allocated GPUs ( % )",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "0",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": false,
+ "colors": [
+ "rgb(255, 0, 0)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(237, 129, 40, 0.89)"
+ ],
+ "datasource": null,
+ "description": "Nodes available to run jobs.",
+ "format": "none",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "height": "180",
+ "id": 11,
+ "interval": null,
+ "links": [],
"mappingType": 1,
"mappingTypes": [
{
@@ -65,7 +137,7 @@
}
],
"maxDataPoints": 100,
- "minSpan": 3,
+ "minSpan": null,
"nullPointMode": "connected",
"nullText": null,
"postfix": "",
@@ -83,29 +155,27 @@
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": false,
- "lineColor": "rgb(31, 120, 193)",
+ "lineColor": "rgb(31, 220, 13)",
"show": false
},
"tableColumn": "",
"targets": [
{
- "expr": "sum(node_uname_info)",
+ "expr": "yarn_nodes_active",
"format": "time_series",
"hide": false,
- "instant": false,
"intervalFactor": 2,
- "legendFormat": "{sum(node_uname_info)} node",
"refId": "A"
}
],
"thresholds": "1",
- "title": "Detected Node Number",
+ "title": "Worker Nodes",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
{
"op": "=",
- "text": "N/A",
+ "text": "0",
"value": "null"
}
],
@@ -114,23 +184,24 @@
{
"cacheTimeout": null,
"colorBackground": false,
- "colorValue": true,
+ "colorValue": false,
"colors": [
- "#299c46",
+ "rgb(255, 0, 0)",
"rgba(237, 129, 40, 0.89)",
- "#d44a3a"
+ "rgba(237, 129, 40, 0.89)"
],
"datasource": null,
+ "description": "Total nodes of your cluster.",
"format": "none",
"gauge": {
"maxValue": 100,
"minValue": 0,
- "show": true,
+ "show": false,
"thresholdLabels": false,
"thresholdMarkers": true
},
- "height": "312",
- "id": 10,
+ "height": "180",
+ "id": 16,
"interval": null,
"links": [],
"mappingType": 1,
@@ -145,6 +216,7 @@
}
],
"maxDataPoints": 100,
+ "minSpan": null,
"nullPointMode": "connected",
"nullText": null,
"postfix": "",
@@ -158,26 +230,25 @@
"to": "null"
}
],
- "span": 6,
+ "span": 3,
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": false,
- "lineColor": "rgb(31, 120, 193)",
+ "lineColor": "rgb(31, 220, 13)",
"show": false
},
"tableColumn": "",
"targets": [
{
- "expr": "100 * (count(nvidiasmi_utilization_gpu > 0) / sum(nvidiasmi_attached_gpus))",
+ "expr": "count(node_uname_info)",
"format": "time_series",
"hide": false,
- "instant": false,
"intervalFactor": 2,
"refId": "A"
}
],
- "thresholds": "10, 90",
- "title": "Active GPUs ( % )",
+ "thresholds": "1",
+ "title": "Total Nodes",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
@@ -192,13 +263,14 @@
{
"cacheTimeout": null,
"colorBackground": false,
- "colorValue": false,
+ "colorValue": true,
"colors": [
"#299c46",
"rgba(237, 129, 40, 0.89)",
"#d44a3a"
],
"datasource": null,
+ "description": "GPUs allocated by YARN.",
"format": "none",
"gauge": {
"maxValue": 100,
@@ -207,8 +279,8 @@
"thresholdLabels": false,
"thresholdMarkers": true
},
- "height": "150",
- "id": 11,
+ "height": "180",
+ "id": 13,
"interval": null,
"links": [],
"mappingType": 1,
@@ -237,7 +309,7 @@
"to": "null"
}
],
- "span": 3,
+ "span": 2,
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": false,
@@ -247,7 +319,7 @@
"tableColumn": "",
"targets": [
{
- "expr": "count(nvidiasmi_utilization_gpu > 0)",
+ "expr": "yarn_gpus_used",
"format": "time_series",
"hide": false,
"intervalFactor": 2,
@@ -255,7 +327,7 @@
}
],
"thresholds": "50, 90",
- "title": "Active GPUs",
+ "title": "Allocated GPUs",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
@@ -273,10 +345,11 @@
"colorValue": true,
"colors": [
"#d44a3a",
- "#299c46",
+ "rgba(237, 129, 40, 0.89)",
"#299c46"
],
"datasource": null,
+ "description": "GPUs available to use.",
"format": "none",
"gauge": {
"maxValue": 100,
@@ -285,8 +358,8 @@
"thresholdLabels": false,
"thresholdMarkers": true
},
- "height": "150",
- "id": 12,
+ "height": "180",
+ "id": 14,
"interval": null,
"links": [],
"mappingType": 1,
@@ -301,7 +374,7 @@
}
],
"maxDataPoints": 100,
- "minSpan": 3,
+ "minSpan": null,
"nullPointMode": "connected",
"nullText": null,
"postfix": "",
@@ -315,7 +388,86 @@
"to": "null"
}
],
- "span": 3,
+ "span": 2,
+ "sparkline": {
+ "fillColor": "rgba(31, 118, 189, 0.18)",
+ "full": false,
+ "lineColor": "rgb(31, 120, 193)",
+ "show": true
+ },
+ "tableColumn": "",
+ "targets": [
+ {
+ "expr": "yarn_total_gpu_num-yarn_gpus_used",
+ "format": "time_series",
+ "hide": false,
+ "intervalFactor": 2,
+ "refId": "A"
+ }
+ ],
+ "thresholds": "50, 90",
+ "title": "Free GPUs",
+ "type": "singlestat",
+ "valueFontSize": "80%",
+ "valueMaps": [
+ {
+ "op": "=",
+ "text": "0",
+ "value": "null"
+ }
+ ],
+ "valueName": "current"
+ },
+ {
+ "cacheTimeout": null,
+ "colorBackground": false,
+ "colorValue": true,
+ "colors": [
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(237, 129, 40, 0.89)",
+ "rgba(237, 129, 40, 0.89)"
+ ],
+ "datasource": null,
+ "description": "GPUs in your cluster that can't be utilized.",
+ "format": "none",
+ "gauge": {
+ "maxValue": 100,
+ "minValue": 0,
+ "show": false,
+ "thresholdLabels": false,
+ "thresholdMarkers": true
+ },
+ "height": "180",
+ "id": 15,
+ "interval": null,
+ "links": [],
+ "mappingType": 1,
+ "mappingTypes": [
+ {
+ "name": "value to text",
+ "value": 1
+ },
+ {
+ "name": "range to text",
+ "value": 2
+ }
+ ],
+ "maxDataPoints": 100,
+ "minSpan": null,
+ "nullPointMode": "connected",
+ "nullText": null,
+ "postfix": "",
+ "postfixFontSize": "50%",
+ "prefix": "",
+ "prefixFontSize": "50%",
+ "rangeMaps": [
+ {
+ "from": "null",
+ "text": "N/A",
+ "to": "null"
+ }
+ ],
+ "span": 2,
"sparkline": {
"fillColor": "rgba(31, 118, 189, 0.18)",
"full": false,
@@ -325,20 +477,21 @@
"tableColumn": "",
"targets": [
{
- "expr": "sum(nvidiasmi_attached_gpus)",
+ "expr": "sum(nvidiasmi_attached_gpus)-yarn_total_gpu_num",
"format": "time_series",
+ "hide": false,
"intervalFactor": 2,
"refId": "A"
}
],
"thresholds": "1",
- "title": "Total GPUs",
+ "title": "NotAvailable GPUs",
"type": "singlestat",
"valueFontSize": "80%",
"valueMaps": [
{
"op": "=",
- "text": "N/A",
+ "text": "0",
"value": "null"
}
],
@@ -388,7 +541,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "100 - avg (irate(node_cpu_seconds_total{mode=\"idle\"}[{{clusterinfo['prometheusinfo']['scrape_interval'] * 10}}s])) * 100",
+ "expr": "100 - avg (irate(node_cpu_seconds_total{mode=\"idle\"}[{{clusterinfo['prometheusinfo']['scrape_interval']|default(30) * 10}}s])) * 100",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "cpu utilization",
@@ -553,14 +706,14 @@
"steppedLine": false,
"targets": [
{
- "expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo\"}[{{clusterinfo['prometheusinfo']['scrape_interval'] * 10}}s]))",
+ "expr": "sum(rate(node_network_receive_bytes_total{device!~\"lo\"}[{{clusterinfo['prometheusinfo']['scrape_interval']|default(30) * 10}}s]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "in",
"refId": "A"
},
{
- "expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo\"}[{{clusterinfo['prometheusinfo']['scrape_interval'] * 10}}s]))",
+ "expr": "sum(rate(node_network_transmit_bytes_total{device!~\"lo\"}[{{clusterinfo['prometheusinfo']['scrape_interval']|default(30) * 10}}s]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "out",
@@ -647,14 +800,14 @@
"steppedLine": false,
"targets": [
{
- "expr": "sum(rate(node_disk_read_bytes_total[{{clusterinfo['prometheusinfo']['scrape_interval'] * 10}}s]))",
+ "expr": "sum(rate(node_disk_read_bytes_total[{{clusterinfo['prometheusinfo']['scrape_interval']|default(30) * 10}}s]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "read",
"refId": "A"
},
{
- "expr": "sum(rate(node_disk_written_bytes_total[{{clusterinfo['prometheusinfo']['scrape_interval'] * 10}}s]))",
+ "expr": "sum(rate(node_disk_written_bytes_total[{{clusterinfo['prometheusinfo']['scrape_interval']|default(30) * 10}}s]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "write",
@@ -895,4 +1048,4 @@
"timezone": "",
"title": "PAI_ClusterView",
"version": 1
-}}
+}}
\ No newline at end of file
diff --git a/pai-management/bootstrap/grafana/grafana-configuration/pai-jobview-dashboard.json.template b/pai-management/bootstrap/grafana/grafana-configuration/pai-jobview-dashboard.json.template
index b376ad0ebe..4e08510da3 100644
--- a/pai-management/bootstrap/grafana/grafana-configuration/pai-jobview-dashboard.json.template
+++ b/pai-management/bootstrap/grafana/grafana-configuration/pai-jobview-dashboard.json.template
@@ -280,7 +280,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "avg by (container_label_PAI_JOB_NAME) (irate(container_NetIn{container_label_PAI_JOB_NAME=~\"$job\"}[{{clusterinfo['prometheusinfo']['scrape_interval'] * 10}}s])) ",
+ "expr": "avg by (container_label_PAI_JOB_NAME) (irate(container_NetIn{container_label_PAI_JOB_NAME=~\"$job\"}[{{clusterinfo['prometheusinfo']['scrape_interval']|default(30) * 10}}s])) ",
"format": "time_series",
"interval": "",
"intervalFactor": 2,
@@ -291,7 +291,7 @@
"target": ""
},
{
- "expr": "avg by (container_label_PAI_JOB_NAME) (irate(container_NetOut{container_label_PAI_JOB_NAME=~\"$job\"}[{{clusterinfo['prometheusinfo']['scrape_interval'] * 10}}s])) ",
+ "expr": "avg by (container_label_PAI_JOB_NAME) (irate(container_NetOut{container_label_PAI_JOB_NAME=~\"$job\"}[{{clusterinfo['prometheusinfo']['scrape_interval']|default(30) * 10}}s])) ",
"format": "time_series",
"hide": false,
"interval": "",
@@ -387,7 +387,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "avg by (container_label_PAI_JOB_NAME) (irate(container_BlockIn{ container_label_PAI_JOB_NAME=~\"$job\"}[{{clusterinfo['prometheusinfo']['scrape_interval'] * 10}}s]))",
+ "expr": "avg by (container_label_PAI_JOB_NAME) (irate(container_BlockIn{ container_label_PAI_JOB_NAME=~\"$job\"}[{{clusterinfo['prometheusinfo']['scrape_interval']|default(30) * 10}}s]))",
"format": "time_series",
"interval": "",
"intervalFactor": 2,
@@ -398,7 +398,7 @@
"target": ""
},
{
- "expr": "avg by (container_label_PAI_JOB_NAME) (irate(container_BlockOut{ container_label_PAI_JOB_NAME=~\"$job\"}[{{clusterinfo['prometheusinfo']['scrape_interval'] * 10}}s]))",
+ "expr": "avg by (container_label_PAI_JOB_NAME) (irate(container_BlockOut{ container_label_PAI_JOB_NAME=~\"$job\"}[{{clusterinfo['prometheusinfo']['scrape_interval']|default(30) * 10}}s]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Read {{'{{container_label_PAI_JOB_NAME}}'}}",
diff --git a/pai-management/bootstrap/grafana/grafana-configuration/pai-nodeview-dashboard.json.template b/pai-management/bootstrap/grafana/grafana-configuration/pai-nodeview-dashboard.json.template
index 0c87d5386b..a0b13054a4 100644
--- a/pai-management/bootstrap/grafana/grafana-configuration/pai-nodeview-dashboard.json.template
+++ b/pai-management/bootstrap/grafana/grafana-configuration/pai-nodeview-dashboard.json.template
@@ -74,7 +74,7 @@
"refId": "A"
},
{
- "expr": "100 - (avg by (instance)(irate(node_cpu_seconds_total{mode=\"idle\",instance=~\"$node\"}[{{clusterinfo['prometheusinfo']['scrape_interval'] * 10}}s])) * 100)",
+ "expr": "100 - (avg by (instance)(irate(node_cpu_seconds_total{mode=\"idle\",instance=~\"$node\"}[{{clusterinfo['prometheusinfo']['scrape_interval']|default(30) * 10}}s])) * 100)",
"format": "time_series",
"hide": false,
"intervalFactor": 2,
@@ -281,7 +281,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "sum(rate(node_network_receive_bytes_total{instance=~\"$node\"}[{{clusterinfo['prometheusinfo']['scrape_interval'] * 10}}s]))",
+ "expr": "sum(rate(node_network_receive_bytes_total{instance=~\"$node\"}[{{clusterinfo['prometheusinfo']['scrape_interval']|default(30) * 10}}s]))",
"format": "time_series",
"interval": "",
"intervalFactor": 2,
@@ -292,7 +292,7 @@
"target": ""
},
{
- "expr": "sum(rate(node_network_transmit_bytes_total{instance=~\"$node\"}[{{clusterinfo['prometheusinfo']['scrape_interval'] * 10}}s]))",
+ "expr": "sum(rate(node_network_transmit_bytes_total{instance=~\"$node\"}[{{clusterinfo['prometheusinfo']['scrape_interval']|default(30) * 10}}s]))",
"format": "time_series",
"hide": true,
"interval": "",
@@ -388,7 +388,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "sum(rate(node_disk_read_bytes_total{instance=~\"$node\"}[{{clusterinfo['prometheusinfo']['scrape_interval'] * 10}}s]))",
+ "expr": "sum(rate(node_disk_read_bytes_total{instance=~\"$node\"}[{{clusterinfo['prometheusinfo']['scrape_interval']|default(30) * 10}}s]))",
"format": "time_series",
"interval": "",
"intervalFactor": 2,
@@ -399,7 +399,7 @@
"target": ""
},
{
- "expr": "sum(rate(node_disk_written_bytes_total{instance=~\"$node\"}[{{clusterinfo['prometheusinfo']['scrape_interval'] * 10}}s]))",
+ "expr": "sum(rate(node_disk_written_bytes_total{instance=~\"$node\"}[{{clusterinfo['prometheusinfo']['scrape_interval']|default(30) * 10}}s]))",
"format": "time_series",
"hide": false,
"intervalFactor": 2,
diff --git a/pai-management/bootstrap/grafana/grafana-configuration/pai-taskroleview-dashboard.json.template b/pai-management/bootstrap/grafana/grafana-configuration/pai-taskroleview-dashboard.json.template
index 5abe4bbd26..de3584fc42 100644
--- a/pai-management/bootstrap/grafana/grafana-configuration/pai-taskroleview-dashboard.json.template
+++ b/pai-management/bootstrap/grafana/grafana-configuration/pai-taskroleview-dashboard.json.template
@@ -297,7 +297,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "avg by (container_label_PAI_JOB_NAME, container_label_PAI_CURRENT_TASK_ROLE_NAME) (irate(container_NetIn{ container_label_PAI_CURRENT_TASK_ROLE_NAME=~\"$task_role\", container_label_PAI_JOB_NAME=~\"$job\"}[{{clusterinfo['prometheusinfo']['scrape_interval'] * 10}}s])) ",
+ "expr": "avg by (container_label_PAI_JOB_NAME, container_label_PAI_CURRENT_TASK_ROLE_NAME) (irate(container_NetIn{ container_label_PAI_CURRENT_TASK_ROLE_NAME=~\"$task_role\", container_label_PAI_JOB_NAME=~\"$job\"}[{{clusterinfo['prometheusinfo']['scrape_interval']|default(30) * 10}}s])) ",
"interval": "",
"intervalFactor": 2,
"legendFormat": "In {{'{{container_label_PAI_JOB_NAME}}'}}-{{'{{container_label_PAI_CURRENT_TASK_ROLE_NAME}}'}}",
@@ -307,7 +307,7 @@
"target": ""
},
{
- "expr": "avg by (container_label_PAI_JOB_NAME, container_label_PAI_CURRENT_TASK_ROLE_NAME) (irate(container_NetOut{ container_label_PAI_CURRENT_TASK_ROLE_NAME=~\"$task_role\",container_label_PAI_JOB_NAME=~\"$job\"}[{{clusterinfo['prometheusinfo']['scrape_interval'] * 10}}s])) ",
+ "expr": "avg by (container_label_PAI_JOB_NAME, container_label_PAI_CURRENT_TASK_ROLE_NAME) (irate(container_NetOut{ container_label_PAI_CURRENT_TASK_ROLE_NAME=~\"$task_role\",container_label_PAI_JOB_NAME=~\"$job\"}[{{clusterinfo['prometheusinfo']['scrape_interval']|default(30) * 10}}s])) ",
"format": "time_series",
"hide": false,
"interval": "",
@@ -403,7 +403,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "avg by (container_label_PAI_JOB_NAME, container_label_PAI_CURRENT_TASK_ROLE_NAME) (irate(container_BlockIn{container_label_PAI_CURRENT_TASK_ROLE_NAME=~\"$task_role\",container_label_PAI_JOB_NAME=~\"$job\"}[{{clusterinfo['prometheusinfo']['scrape_interval'] * 10}}s]))",
+ "expr": "avg by (container_label_PAI_JOB_NAME, container_label_PAI_CURRENT_TASK_ROLE_NAME) (irate(container_BlockIn{container_label_PAI_CURRENT_TASK_ROLE_NAME=~\"$task_role\",container_label_PAI_JOB_NAME=~\"$job\"}[{{clusterinfo['prometheusinfo']['scrape_interval']|default(30) * 10}}s]))",
"interval": "",
"intervalFactor": 2,
"legendFormat": "Write {{'{{container_label_PAI_JOB_NAME}}'}}-{{'{{container_label_PAI_CURRENT_TASK_ROLE_NAME}}'}}",
@@ -413,7 +413,7 @@
"target": ""
},
{
- "expr": "avg by (container_label_PAI_JOB_NAME, container_label_PAI_CURRENT_TASK_ROLE_NAME) (irate(container_BlockOut{container_label_PAI_CURRENT_TASK_ROLE_NAME=~\"$task_role\",container_label_PAI_JOB_NAME=~\"$job\"}[{{clusterinfo['prometheusinfo']['scrape_interval'] * 10}}s]))",
+ "expr": "avg by (container_label_PAI_JOB_NAME, container_label_PAI_CURRENT_TASK_ROLE_NAME) (irate(container_BlockOut{container_label_PAI_CURRENT_TASK_ROLE_NAME=~\"$task_role\",container_label_PAI_JOB_NAME=~\"$job\"}[{{clusterinfo['prometheusinfo']['scrape_interval']|default(30) * 10}}s]))",
"intervalFactor": 2,
"legendFormat": "Read {{'{{container_label_PAI_JOB_NAME}}'}}-{{'{{container_label_PAI_CURRENT_TASK_ROLE_NAME}}'}}",
"refId": "B"
diff --git a/pai-management/bootstrap/grafana/grafana-configuration/pai-taskview-dashboard.json.template b/pai-management/bootstrap/grafana/grafana-configuration/pai-taskview-dashboard.json.template
index 4921e37288..a110a51e45 100644
--- a/pai-management/bootstrap/grafana/grafana-configuration/pai-taskview-dashboard.json.template
+++ b/pai-management/bootstrap/grafana/grafana-configuration/pai-taskview-dashboard.json.template
@@ -285,7 +285,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "avg by (container_label_PAI_JOB_NAME, container_label_PAI_CURRENT_TASK_ROLE_NAME, container_env_PAI_TASK_INDEX) (irate(container_NetIn{container_env_PAI_TASK_INDEX=~\"$task\",container_label_PAI_CURRENT_TASK_ROLE_NAME=~\"$task_role\", container_label_PAI_JOB_NAME=~\"$job\"}[{{clusterinfo['prometheusinfo']['scrape_interval'] * 10}}s])) ",
+ "expr": "avg by (container_label_PAI_JOB_NAME, container_label_PAI_CURRENT_TASK_ROLE_NAME, container_env_PAI_TASK_INDEX) (irate(container_NetIn{container_env_PAI_TASK_INDEX=~\"$task\",container_label_PAI_CURRENT_TASK_ROLE_NAME=~\"$task_role\", container_label_PAI_JOB_NAME=~\"$job\"}[{{clusterinfo['prometheusinfo']['scrape_interval']|default(30) * 10}}s])) ",
"format": "time_series",
"interval": "",
"intervalFactor": 2,
@@ -296,7 +296,7 @@
"target": ""
},
{
- "expr": "avg by (container_label_PAI_JOB_NAME, container_label_PAI_CURRENT_TASK_ROLE_NAME, container_env_PAI_TASK_INDEX) (irate(container_NetOut{container_env_PAI_TASK_INDEX=~\"$task\",container_label_PAI_CURRENT_TASK_ROLE_NAME=~\"$task_role\", container_label_PAI_JOB_NAME=~\"$job\"}[{{clusterinfo['prometheusinfo']['scrape_interval'] * 10}}s])) ",
+ "expr": "avg by (container_label_PAI_JOB_NAME, container_label_PAI_CURRENT_TASK_ROLE_NAME, container_env_PAI_TASK_INDEX) (irate(container_NetOut{container_env_PAI_TASK_INDEX=~\"$task\",container_label_PAI_CURRENT_TASK_ROLE_NAME=~\"$task_role\", container_label_PAI_JOB_NAME=~\"$job\"}[{{clusterinfo['prometheusinfo']['scrape_interval']|default(30) * 10}}s])) ",
"format": "time_series",
"hide": false,
"interval": "",
@@ -392,7 +392,7 @@
"steppedLine": false,
"targets": [
{
- "expr": "avg by (container_label_PAI_JOB_NAME, container_label_PAI_CURRENT_TASK_ROLE_NAME,container_env_PAI_TASK_INDEX) (irate(container_BlockIn{container_env_PAI_TASK_INDEX=~\"$task\",container_label_PAI_CURRENT_TASK_ROLE_NAME=~\"$task_role\",container_label_PAI_JOB_NAME=~\"$job\"}[{{clusterinfo['prometheusinfo']['scrape_interval'] * 10}}s]))",
+ "expr": "avg by (container_label_PAI_JOB_NAME, container_label_PAI_CURRENT_TASK_ROLE_NAME,container_env_PAI_TASK_INDEX) (irate(container_BlockIn{container_env_PAI_TASK_INDEX=~\"$task\",container_label_PAI_CURRENT_TASK_ROLE_NAME=~\"$task_role\",container_label_PAI_JOB_NAME=~\"$job\"}[{{clusterinfo['prometheusinfo']['scrape_interval']|default(30) * 10}}s]))",
"format": "time_series",
"interval": "",
"intervalFactor": 2,
@@ -403,7 +403,7 @@
"target": ""
},
{
- "expr": "avg by (container_label_PAI_JOB_NAME, container_label_PAI_CURRENT_TASK_ROLE_NAME,container_env_PAI_TASK_INDEX) (irate(container_BlockOut{container_env_PAI_TASK_INDEX=~\"$task\",container_label_PAI_CURRENT_TASK_ROLE_NAME=~\"$task_role\",container_label_PAI_JOB_NAME=~\"$job\"}[{{clusterinfo['prometheusinfo']['scrape_interval'] * 10}}s]))",
+ "expr": "avg by (container_label_PAI_JOB_NAME, container_label_PAI_CURRENT_TASK_ROLE_NAME,container_env_PAI_TASK_INDEX) (irate(container_BlockOut{container_env_PAI_TASK_INDEX=~\"$task\",container_label_PAI_CURRENT_TASK_ROLE_NAME=~\"$task_role\",container_label_PAI_JOB_NAME=~\"$job\"}[{{clusterinfo['prometheusinfo']['scrape_interval']|default(30) * 10}}s]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Read {{'{{container_label_PAI_JOB_NAME}}'}}-{{'{{container_env_PAI_TASK_INDEX}}'}}",
diff --git a/pai-management/bootstrap/grafana/grafana.yaml.template b/pai-management/bootstrap/grafana/grafana.yaml.template
index 0f7583256f..201d37867a 100644
--- a/pai-management/bootstrap/grafana/grafana.yaml.template
+++ b/pai-management/bootstrap/grafana/grafana.yaml.template
@@ -23,18 +23,14 @@ spec:
replicas: 1
selector:
matchLabels:
- task: monitor
- app: grafana
+ app: grafana
template:
metadata:
labels:
- task: monitor
app: grafana
spec:
hostNetwork: true
hostPID: true
- nodeSelector:
- grafana: "true"
volumes:
- name: grafana-confg-volume
configMap:
@@ -55,4 +51,4 @@ spec:
- name: GF_AUTH_ANONYMOUS_ENABLED
value: "true"
imagePullSecrets:
- - name: {{ clusterinfo['dockerregistryinfo']['secretname'] }}
+ - name: {{ clusterinfo['dockerregistryinfo']['secretname'] }}
\ No newline at end of file
diff --git a/pai-management/bootstrap/grafana/minify.py b/pai-management/bootstrap/grafana/minify.py
new file mode 100644
index 0000000000..faa66839d3
--- /dev/null
+++ b/pai-management/bootstrap/grafana/minify.py
@@ -0,0 +1,31 @@
+#!/usr/bin/env python
+
+# Copyright (c) Microsoft Corporation
+# All rights reserved.
+#
+# MIT License
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
+# documentation files (the "Software"), to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
+# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
+# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+import json
+import sys
+
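+# Read a JSON document from stdin and write a compact, single-line form to
+# stdout (e.g. to shrink the Grafana dashboard JSON templates in this directory).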
+def minify(raw):
+ """ minify json file """
+ obj = json.loads(raw)
+ return json.dumps(obj, separators=(",", ":"))
+
+if __name__ == '__main__':
+ sys.stdout.write(minify(sys.stdin.read()))
diff --git a/pai-management/bootstrap/grafana/service.yaml b/pai-management/bootstrap/grafana/service.yaml
index 3d1c6e0c70..0c569bdf63 100644
--- a/pai-management/bootstrap/grafana/service.yaml
+++ b/pai-management/bootstrap/grafana/service.yaml
@@ -20,7 +20,6 @@ prerequisite:
- prometheus
template-list:
- - node-label.sh
- grafana.yaml
- stop.sh
- grafana-configuration/pai-clusterview-dashboard.json
@@ -35,4 +34,7 @@ start-script: start.sh
stop-script: stop.sh
delete-script: delete.sh
refresh-script: refresh.sh
-upgraded-script: upgraded.sh
\ No newline at end of file
+upgraded-script: upgraded.sh
+
+deploy-rules:
+ in: pai-master
\ No newline at end of file
diff --git a/pai-management/bootstrap/grafana/start.sh b/pai-management/bootstrap/grafana/start.sh
index 26b466d100..a82cbeb20f 100755
--- a/pai-management/bootstrap/grafana/start.sh
+++ b/pai-management/bootstrap/grafana/start.sh
@@ -23,10 +23,6 @@ pushd $(dirname "$0") > /dev/null
/bin/bash configmap-create.sh || exit $?
-#chmod u+x node-label.sh
-
-/bin/bash node-label.sh || exit $?
-
i=0
while ! kubectl get configmap | grep -q "grafana-configuration"
diff --git a/pai-management/bootstrap/hadoop-node-manager/hadoop-node-manager-delete/delete.sh b/pai-management/bootstrap/hadoop-node-manager/hadoop-node-manager-delete/delete.sh
new file mode 100644
index 0000000000..2776d68c9d
--- /dev/null
+++ b/pai-management/bootstrap/hadoop-node-manager/hadoop-node-manager-delete/delete.sh
@@ -0,0 +1,48 @@
+#!/bin/bash
+
+# Copyright (c) Microsoft Corporation
+# All rights reserved.
+#
+# MIT License
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
+# documentation files (the "Software"), to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
+# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
+# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+# Clean running job
+
+if which docker > /dev/null && [ -S /var/run/docker.sock ]; then
+
+ echo "Clean hadoop jobs"
+
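+ # The awk pattern matches containers named by the YARN scheme, e.g.
+ # container_e12_1526478123456_0001_01_000001; stop gracefully, then fall back to kill.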
+ docker ps | awk '/container_\w{3}_[0-9]{13}_[0-9]{4}_[0-9]{2}_[0-9]{6}/ { print $NF}' | xargs --no-run-if-empty timeout 30 docker stop || \
+ docker ps | awk '/container_\w{3}_[0-9]{13}_[0-9]{4}_[0-9]{2}_[0-9]{6}/ { print $NF}' | xargs --no-run-if-empty docker kill
+
+fi
+
+
+# Clean data
+
+echo "Clean the hadoop node manager's data on the disk"
+
+if [ -d "/mnt/yarn/node" ]; then
+
+ rm -rf /mnt/yarn/node
+
+fi
+
+if [ -d "/mnt/hadooptmp/nodemanager" ]; then
+
+ rm -rf /mnt/hadooptmp/nodemanager
+
+fi
diff --git a/pai-management/bootstrap/node-exporter/delete.sh b/pai-management/bootstrap/node-exporter/delete.sh
new file mode 100644
index 0000000000..1b952d11cc
--- /dev/null
+++ b/pai-management/bootstrap/node-exporter/delete.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+
+# Copyright (c) Microsoft Corporation
+# All rights reserved.
+#
+# MIT License
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
+# documentation files (the "Software"), to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
+# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
+# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+pushd $(dirname "$0") > /dev/null
+
+echo "Call stop script to stop all service first"
+/bin/bash stop.sh || exit $?
+
+
+popd > /dev/null
\ No newline at end of file
diff --git a/pai-management/bootstrap/prometheus/node-exporter-ds.yaml.template b/pai-management/bootstrap/node-exporter/node-exporter.yaml.template
similarity index 95%
rename from pai-management/bootstrap/prometheus/node-exporter-ds.yaml.template
rename to pai-management/bootstrap/node-exporter/node-exporter.yaml.template
index fdd6cb6038..f8b9d1b8dd 100644
--- a/pai-management/bootstrap/prometheus/node-exporter-ds.yaml.template
+++ b/pai-management/bootstrap/node-exporter/node-exporter.yaml.template
@@ -31,8 +31,6 @@ spec:
app: node-exporter
name: node-exporter
spec:
- nodeSelector:
- node-exporter: "true"
containers:
- image: prom/node-exporter:v0.16.0
imagePullPolicy: Always
@@ -89,8 +87,12 @@ spec:
readinessProbe:
exec:
command:
- - python
- - /usr/local/healthy_check.py
+ - "python"
+ - "/job_exporter/no_older_than.py"
+ - "--delta"
+ - "60"
+ - "/datastorage/prometheus/job_exporter.prom"
+ - "/datastorage/prometheus/gpu_exporter.prom"
initialDelaySeconds: 30
periodSeconds: 30
resources:
@@ -105,8 +107,6 @@ spec:
name: docker-bin
- mountPath: /var/run/docker.sock
name: docker-socket
- - name: docker
- mountPath: /var/lib/docker
- mountPath: /dev
name: device-mount
- mountPath: /var/drivers
@@ -142,9 +142,6 @@ spec:
- name: sys
hostPath:
path: /sys
- - name: docker
- hostPath:
- path: /var/lib/docker
imagePullSecrets:
- name: {{ clusterinfo['dockerregistryinfo']['secretname'] }}
hostNetwork: true
diff --git a/pai-management/bootstrap/node-exporter/refresh.sh.template b/pai-management/bootstrap/node-exporter/refresh.sh.template
new file mode 100644
index 0000000000..305833cebe
--- /dev/null
+++ b/pai-management/bootstrap/node-exporter/refresh.sh.template
@@ -0,0 +1,21 @@
+#!/bin/bash
+
+# Copyright (c) Microsoft Corporation
+# All rights reserved.
+#
+# MIT License
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
+# documentation files (the "Software"), to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
+# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
+# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+
+# TODO need to refactor refresh.sh
diff --git a/pai-management/bootstrap/node-exporter/service.yaml b/pai-management/bootstrap/node-exporter/service.yaml
new file mode 100644
index 0000000000..116b30d3a1
--- /dev/null
+++ b/pai-management/bootstrap/node-exporter/service.yaml
@@ -0,0 +1,34 @@
+# Copyright (c) Microsoft Corporation
+# All rights reserved.
+#
+# MIT License
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
+# documentation files (the "Software"), to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
+# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
+# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+prerequisite:
+ - cluster-configuration
+ - drivers
+
+template-list:
+ - node-exporter.yaml
+
+start-script: start.sh
+stop-script: stop.sh
+delete-script: delete.sh
+refresh-script: refresh.sh
+upgraded-script: upgraded.sh
+
+
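+# deploy-rules: skip nodes marked with the no-nodeexporter rule.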
+deploy-rules:
+ notin: no-nodeexporter
diff --git a/pai-management/bootstrap/node-exporter/start.sh b/pai-management/bootstrap/node-exporter/start.sh
new file mode 100644
index 0000000000..5332846a89
--- /dev/null
+++ b/pai-management/bootstrap/node-exporter/start.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+
+# Copyright (c) Microsoft Corporation
+# All rights reserved.
+#
+# MIT License
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
+# documentation files (the "Software"), to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
+# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
+# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+pushd $(dirname "$0") > /dev/null
+
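+# Apply the node-exporter DaemonSet; node placement is now governed by the
+# service's deploy-rules in service.yaml instead of per-node labels.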
+kubectl apply --overwrite=true -f node-exporter.yaml || exit $?
+
+popd > /dev/null
\ No newline at end of file
diff --git a/pai-management/bootstrap/node-exporter/stop.sh b/pai-management/bootstrap/node-exporter/stop.sh
new file mode 100644
index 0000000000..68b615b29b
--- /dev/null
+++ b/pai-management/bootstrap/node-exporter/stop.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+
+# Copyright (c) Microsoft Corporation
+# All rights reserved.
+#
+# MIT License
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
+# documentation files (the "Software"), to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
+# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
+# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
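+# Deleting the DaemonSet also removes its pods; --ignore-not-found keeps the
+# script idempotent, and --now shortens the termination grace period.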
+INSTANCES="daemonset/node-exporter"
+
+for instance in ${INSTANCES}; do
+ kubectl delete --ignore-not-found --now ${instance}
+done
diff --git a/pai-management/bootstrap/prometheus/prometheus-configmap.yaml.template b/pai-management/bootstrap/prometheus/prometheus-configmap.yaml.template
index f0a4c98555..824ebf4dbb 100644
--- a/pai-management/bootstrap/prometheus/prometheus-configmap.yaml.template
+++ b/pai-management/bootstrap/prometheus/prometheus-configmap.yaml.template
@@ -33,7 +33,7 @@ data:
- "/etc/prometheus-alert/*.rules"
scrape_configs:
- job_name: 'node_exporter'
- scrape_interval: {{clusterinfo['prometheusinfo']['scrape_interval']}}s
+ scrape_interval: {{ clusterinfo['prometheusinfo']['scrape_interval']|default(30) }}s
kubernetes_sd_configs:
- api_server: '{{ clusterinfo['webportalinfo']['k8s_api_server_uri'] }}'
role: node
@@ -64,7 +64,10 @@ data:
target_label: __address__
- source_labels: [__meta_kubernetes_pod_name]
action: replace
- target_label: k8s_pod_name
+ target_label: scraped_from
+ - source_labels: [__meta_kubernetes_pod_label_app]
+ action: replace
+ target_label: pai_service_name
{% if clusterinfo['prometheusinfo']['alerting'] %}
alerting:
alertmanagers:
@@ -73,4 +76,4 @@ data:
{% for host in clusterinfo['prometheusinfo']['alerting']['alert-manager-hosts'] %}
- {{ host }}:{{ port }}
{% endfor %}
-{% endif %}
+{% endif %}
\ No newline at end of file
diff --git a/pai-management/bootstrap/prometheus/prometheus-delete/delete-data.sh b/pai-management/bootstrap/prometheus/prometheus-delete/delete-data.sh
index cfa9de8de9..4f15f65fac 100644
--- a/pai-management/bootstrap/prometheus/prometheus-delete/delete-data.sh
+++ b/pai-management/bootstrap/prometheus/prometheus-delete/delete-data.sh
@@ -21,4 +21,6 @@
echo "Clean the prometheus's data on the disk"
-#TODO: Yanjie should write script to clean all node-exporter, GPU-exporter and prom's data on host.
\ No newline at end of file
+# TODO: Yanjie should write script to clean all node-exporter, GPU-exporter and prom's data on host.
+
+# Note: this script cleans Prometheus data only; node-exporter data is cleaned by its own service's delete script.
\ No newline at end of file
diff --git a/pai-management/bootstrap/prometheus/prometheus-deployment.yaml.template b/pai-management/bootstrap/prometheus/prometheus-deployment.yaml.template
index 75d9ab255d..b90618e98e 100644
--- a/pai-management/bootstrap/prometheus/prometheus-deployment.yaml.template
+++ b/pai-management/bootstrap/prometheus/prometheus-deployment.yaml.template
@@ -32,8 +32,6 @@ spec:
spec:
hostNetwork: true
hostPID: true
- nodeSelector:
- prometheus: "true"
containers:
- name: prometheus
image: prom/prometheus:v2.1.0
diff --git a/pai-management/bootstrap/prometheus/service.yaml b/pai-management/bootstrap/prometheus/service.yaml
index ea0db9afd3..bb33021f81 100644
--- a/pai-management/bootstrap/prometheus/service.yaml
+++ b/pai-management/bootstrap/prometheus/service.yaml
@@ -20,21 +20,17 @@ prerequisite:
- drivers
template-list:
- - node-label.sh
- - node-exporter-ds.yaml
- prometheus-configmap.yaml
- prometheus-deployment.yaml
- - start.sh
- - stop.sh
- refresh.sh
- delete.yaml
- - watchdog-ds.yaml
- - watchdog-configmap.yaml
- - alert-deployment.yaml
- - alert-configmap.yaml
start-script: start.sh
stop-script: stop.sh
delete-script: delete.sh
refresh-script: refresh.sh
upgraded-script: upgraded.sh
+
+
+deploy-rules:
+ in: pai-master
\ No newline at end of file
diff --git a/pai-management/bootstrap/prometheus/start.sh.template b/pai-management/bootstrap/prometheus/start.sh
old mode 100755
new mode 100644
similarity index 75%
rename from pai-management/bootstrap/prometheus/start.sh.template
rename to pai-management/bootstrap/prometheus/start.sh
index 7dcb1be23c..e6bfc9651c
--- a/pai-management/bootstrap/prometheus/start.sh.template
+++ b/pai-management/bootstrap/prometheus/start.sh
@@ -21,20 +21,8 @@
pushd $(dirname "$0") > /dev/null
-#chmod u+x node-label.sh
-
-/bin/bash node-label.sh || exit $?
-
-kubectl apply --overwrite=true -f prometheus-configmap.yaml || exit $?
kubectl create configmap prometheus-alert --from-file=../../../prometheus/prometheus-alert --dry-run -o yaml | kubectl apply --overwrite=true -f - || exit $?
-kubectl apply --overwrite=true -f node-exporter-ds.yaml || exit $?
+kubectl apply --overwrite=true -f prometheus-configmap.yaml || exit $?
kubectl apply --overwrite=true -f prometheus-deployment.yaml || exit $?
-kubectl apply --overwrite=true -f watchdog-configmap.yaml || exit $?
-kubectl apply --overwrite=true -f watchdog-ds.yaml || exit $?
-
-{% if clusterinfo['prometheusinfo']['alerting'] %}
-kubectl apply --overwrite=true -f alert-configmap.yaml || exit $?
-kubectl apply --overwrite=true -f alert-deployment.yaml || exit $?
-{% endif %}
-popd > /dev/null
+popd > /dev/null
\ No newline at end of file
diff --git a/pai-management/bootstrap/prometheus/stop.sh b/pai-management/bootstrap/prometheus/stop.sh
new file mode 100644
index 0000000000..f61f3ee14d
--- /dev/null
+++ b/pai-management/bootstrap/prometheus/stop.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+
+# Copyright (c) Microsoft Corporation
+# All rights reserved.
+#
+# MIT License
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
+# documentation files (the "Software"), to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
+# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
+# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
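+# Only Prometheus resources are torn down here; node-exporter and watchdog
+# are now separate services with their own stop scripts.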
+INSTANCES="deployment/prometheus-deployment
+configmap/prometheus-configmap
+configmap/prometheus-alert
+"
+
+for instance in ${INSTANCES}; do
+ kubectl delete --ignore-not-found --now ${instance}
+done
+
diff --git a/pai-management/bootstrap/prometheus/stop.sh.template b/pai-management/bootstrap/prometheus/stop.sh.template
deleted file mode 100644
index 7d8b9bc08d..0000000000
--- a/pai-management/bootstrap/prometheus/stop.sh.template
+++ /dev/null
@@ -1,49 +0,0 @@
-#!/bin/sh
-
-#!/bin/bash
-
-# Copyright (c) Microsoft Corporation
-# All rights reserved.
-#
-# MIT License
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
-# documentation files (the "Software"), to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
-# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
-# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
-# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-INSTANCES="daemonset/node-exporter
-daemonset/watchdog
-deployment/prometheus-deployment
-configmap/prometheus-configmap
-configmap/prometheus-alert
-configmap/watchdog
-deployment/alertmanager
-configmap/alertmanager
-"
-
-for instance in ${INSTANCES}; do
- kubectl delete --ignore-not-found --now ${instance}
-done
-
-{% for host in machinelist %}
- {% if 'prometheus' in machinelist[ host ] and machinelist[ host ][ 'prometheus' ] == 'true' %}
-kubectl label nodes {{ machinelist[ host ][ 'nodename' ] }} prometheus- || exit $?
- {% endif %}
- {% if 'node-exporter' in machinelist[ host ] and machinelist[ host ][ 'node-exporter' ] == 'true' %}
-kubectl label nodes {{ machinelist[ host ][ 'nodename' ] }} node-exporter- || exit $?
- {% endif %}
- {% if 'watchdog' in machinelist[ host ] and machinelist[ host ][ 'watchdog' ] == 'true' %}
-kubectl label nodes {{ machinelist[ host ][ 'nodename' ] }} watchdog- || exit $?
- {% endif %}
- {% if 'alert-manager' in machinelist[ host ] and machinelist[ host ][ 'alert-manager' ] == 'true' %}
-kubectl label nodes {{ machinelist[ host ][ 'nodename' ] }} alertmanager- || exit $?
- {% endif %}
-{% endfor %}
diff --git a/pai-management/bootstrap/pylon/pylon.yaml.template b/pai-management/bootstrap/pylon/pylon.yaml.template
index 1fbec8c658..303bd22b02 100644
--- a/pai-management/bootstrap/pylon/pylon.yaml.template
+++ b/pai-management/bootstrap/pylon/pylon.yaml.template
@@ -31,8 +31,6 @@ spec:
spec:
hostNetwork: false
hostPID: false
- nodeSelector:
- pylon: "true"
containers:
- name: pylon
image: {{ clusterinfo['dockerregistryinfo']['prefix'] }}pylon:{{ clusterinfo['dockerregistryinfo']['docker_tag'] }}
diff --git a/pai-management/bootstrap/pylon/service.yaml b/pai-management/bootstrap/pylon/service.yaml
index 9b560e1ae8..a08cac85f4 100644
--- a/pai-management/bootstrap/pylon/service.yaml
+++ b/pai-management/bootstrap/pylon/service.yaml
@@ -19,7 +19,6 @@ prerequisite:
- cluster-configuration
template-list:
- - node-label.sh
- pylon.yaml
- stop.sh
- refresh.sh
@@ -28,4 +27,8 @@ start-script: start.sh
stop-script: stop.sh
delete-script: delete.sh
refresh-script: refresh.sh
-upgraded-script: upgraded.sh
\ No newline at end of file
+upgraded-script: upgraded.sh
+
+
+deploy-rules:
+ in: pai-master
\ No newline at end of file
diff --git a/pai-management/bootstrap/pylon/start.sh b/pai-management/bootstrap/pylon/start.sh
index 96d1d7209c..33d3666ed8 100755
--- a/pai-management/bootstrap/pylon/start.sh
+++ b/pai-management/bootstrap/pylon/start.sh
@@ -19,10 +19,8 @@
pushd $(dirname "$0") > /dev/null
-#chmod u+x node-label.sh
-
-/bin/bash node-label.sh || exit $?
kubectl apply --overwrite=true -f pylon.yaml || exit $?
+
popd > /dev/null
diff --git a/pai-management/bootstrap/rest-server/rest-server.yaml.template b/pai-management/bootstrap/rest-server/rest-server.yaml.template
index 99ead432b0..d59b60f97b 100644
--- a/pai-management/bootstrap/rest-server/rest-server.yaml.template
+++ b/pai-management/bootstrap/rest-server/rest-server.yaml.template
@@ -31,8 +31,6 @@ spec:
spec:
hostNetwork: false
hostPID: false
- nodeSelector:
- restserver: "true"
containers:
- name: rest-server
image: {{ clusterinfo['dockerregistryinfo']['prefix'] }}rest-server:{{ clusterinfo['dockerregistryinfo']['docker_tag'] }}
diff --git a/pai-management/bootstrap/rest-server/service.yaml b/pai-management/bootstrap/rest-server/service.yaml
index 6dd29f11f1..da231a8a24 100644
--- a/pai-management/bootstrap/rest-server/service.yaml
+++ b/pai-management/bootstrap/rest-server/service.yaml
@@ -20,7 +20,6 @@ prerequisite:
- frameworklauncher
template-list:
- - node-label.sh
- rest-server.yaml
- stop.sh
- refresh.sh
@@ -29,4 +28,8 @@ start-script: start.sh
stop-script: stop.sh
delete-script: delete.sh
refresh-script: refresh.sh
-upgraded-script: upgraded.sh
\ No newline at end of file
+upgraded-script: upgraded.sh
+
+
+deploy-rules:
+ in: pai-master
\ No newline at end of file
diff --git a/pai-management/bootstrap/rest-server/start.sh b/pai-management/bootstrap/rest-server/start.sh
index 0e307dbadd..31e43bd6dc 100755
--- a/pai-management/bootstrap/rest-server/start.sh
+++ b/pai-management/bootstrap/rest-server/start.sh
@@ -19,10 +19,7 @@
pushd $(dirname "$0") > /dev/null
-#chmod u+x node-label.sh
+kubectl apply --overwrite=true -f rest-server.yaml || exit $?
-/bin/bash node-label.sh || exit $?
-
-kubectl apply --overwrite=true -f rest-server.yaml || exit $?
popd > /dev/null
diff --git a/pai-management/bootstrap/watchdog/delete.sh b/pai-management/bootstrap/watchdog/delete.sh
new file mode 100644
index 0000000000..1b952d11cc
--- /dev/null
+++ b/pai-management/bootstrap/watchdog/delete.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+
+# Copyright (c) Microsoft Corporation
+# All rights reserved.
+#
+# MIT License
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
+# documentation files (the "Software"), to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
+# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
+# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+pushd $(dirname "$0") > /dev/null
+
+echo "Call stop script to stop all service first"
+/bin/bash stop.sh || exit $?
+
+
+popd > /dev/null
\ No newline at end of file
diff --git a/pai-management/bootstrap/watchdog/refresh.sh.template b/pai-management/bootstrap/watchdog/refresh.sh.template
new file mode 100644
index 0000000000..305833cebe
--- /dev/null
+++ b/pai-management/bootstrap/watchdog/refresh.sh.template
@@ -0,0 +1,21 @@
+#!/bin/bash
+
+# Copyright (c) Microsoft Corporation
+# All rights reserved.
+#
+# MIT License
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
+# documentation files (the "Software"), to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
+# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
+# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+
+# TODO need to refactor refresh.sh
diff --git a/pai-management/bootstrap/watchdog/service.yaml b/pai-management/bootstrap/watchdog/service.yaml
new file mode 100644
index 0000000000..d8bbf63f36
--- /dev/null
+++ b/pai-management/bootstrap/watchdog/service.yaml
@@ -0,0 +1,35 @@
+# Copyright (c) Microsoft Corporation
+# All rights reserved.
+#
+# MIT License
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
+# documentation files (the "Software"), to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
+# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
+# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+prerequisite:
+ - cluster-configuration
+ - drivers
+
+template-list:
+ - watchdog-configmap.yaml
+ - watchdog.yaml
+
+start-script: start.sh
+stop-script: stop.sh
+delete-script: delete.sh
+refresh-script: refresh.sh
+upgraded-script: upgraded.sh
+
+
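+# deploy-rules: run the watchdog only on pai-master nodes.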
+deploy-rules:
+ in: pai-master
\ No newline at end of file
diff --git a/pai-management/bootstrap/watchdog/start.sh b/pai-management/bootstrap/watchdog/start.sh
new file mode 100644
index 0000000000..6387a6a13a
--- /dev/null
+++ b/pai-management/bootstrap/watchdog/start.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+# Copyright (c) Microsoft Corporation
+# All rights reserved.
+#
+# MIT License
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
+# documentation files (the "Software"), to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
+# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
+# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
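+# Apply the configmap before the Deployment, since the watchdog pod mounts
+# the "watchdog" configmap at /etc/watchdog.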
+pushd $(dirname "$0") > /dev/null
+
+kubectl apply --overwrite=true -f watchdog-configmap.yaml || exit $?
+kubectl apply --overwrite=true -f watchdog.yaml || exit $?
+
+popd > /dev/null
\ No newline at end of file
diff --git a/pai-management/bootstrap/watchdog/stop.sh b/pai-management/bootstrap/watchdog/stop.sh
new file mode 100644
index 0000000000..e82991d1fa
--- /dev/null
+++ b/pai-management/bootstrap/watchdog/stop.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+
+# Copyright (c) Microsoft Corporation
+# All rights reserved.
+#
+# MIT License
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
+# documentation files (the "Software"), to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
+# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
+# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
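+# daemonset/watchdog covers upgrades from older releases, in which the
+# watchdog ran as a DaemonSet rather than a Deployment.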
+INSTANCES="daemonset/watchdog
+deployment/watchdog
+configmap/watchdog
+"
+
+for instance in ${INSTANCES}; do
+ kubectl delete --ignore-not-found --now ${instance}
+done
\ No newline at end of file
diff --git a/pai-management/bootstrap/prometheus/watchdog-configmap.yaml.template b/pai-management/bootstrap/watchdog/watchdog-configmap.yaml.template
similarity index 100%
rename from pai-management/bootstrap/prometheus/watchdog-configmap.yaml.template
rename to pai-management/bootstrap/watchdog/watchdog-configmap.yaml.template
diff --git a/pai-management/bootstrap/prometheus/watchdog-ds.yaml.template b/pai-management/bootstrap/watchdog/watchdog.yaml.template
similarity index 69%
rename from pai-management/bootstrap/prometheus/watchdog-ds.yaml.template
rename to pai-management/bootstrap/watchdog/watchdog.yaml.template
index 77b625a69f..804bbe4c4f 100644
--- a/pai-management/bootstrap/prometheus/watchdog-ds.yaml.template
+++ b/pai-management/bootstrap/watchdog/watchdog.yaml.template
@@ -16,50 +16,57 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
apiVersion: apps/v1
-kind: DaemonSet
+kind: Deployment
metadata:
- annotations:
- prometheus.io/scrape: 'true'
name: watchdog
spec:
+ replicas: 1
selector:
matchLabels:
app: watchdog
template:
metadata:
+ name: watchdog
labels:
app: watchdog
- name: watchdog
+ annotations:
+ prometheus.io/scrape: "true"
+ prometheus.io/path: "/"
+ prometheus.io/port: "9101"
spec:
- nodeSelector:
- watchdog: "true"
containers:
- - image: {{ clusterinfo['dockerregistryinfo']['prefix'] }}watchdog:{{ clusterinfo['dockerregistryinfo']['docker_tag'] }}
+ - name: watchdog
+ image: {{ clusterinfo["dockerregistryinfo"]["prefix"] }}watchdog:{{ clusterinfo["dockerregistryinfo"]["docker_tag"] }}
imagePullPolicy: Always
+ readinessProbe:
+ httpGet:
+ path: /
+ port: 9101
+ initialDelaySeconds: 30
+ periodSeconds: 30
resources:
limits:
memory: "1Gi"
- securityContext:
- privileged: true
volumeMounts:
- mountPath: /datastorage/prometheus
name: collector-mount
- mountPath: /etc/watchdog
name: config-volume
- name: watchdog
- env:
- - name: K8S_API_SERVER_URI
- value: {{ clusterinfo['webportalinfo']['k8s_api_server_uri'] }}
- command: ['python']
- args: ['/usr/local/watchdog.py', '/datastorage/prometheus', '30']
+ command:
+ - "python"
+ - "/watchdog.py"
+ - "--interval"
+ - "30"
+ - "--port"
+ - "9101"
+ - "{{ clusterinfo['webportalinfo']['k8s_api_server_uri'] }}"
volumes:
- name: collector-mount
hostPath:
- path: {{ clusterinfo[ 'dataPath' ] }}/prometheus
+ path: {{ clusterinfo["dataPath"] }}/prometheus
- name: config-volume
configMap:
name: watchdog
imagePullSecrets:
- - name: {{ clusterinfo['dockerregistryinfo']['secretname'] }}
+ - name: {{ clusterinfo["dockerregistryinfo"]["secretname"] }}
hostNetwork: true
- hostPID: true
diff --git a/pai-management/bootstrap/webportal/service.yaml b/pai-management/bootstrap/webportal/service.yaml
index c1cf2a6e05..afb137342f 100644
--- a/pai-management/bootstrap/webportal/service.yaml
+++ b/pai-management/bootstrap/webportal/service.yaml
@@ -20,7 +20,6 @@ prerequisite:
- rest-server
template-list:
- - node-label.sh
- webportal.yaml
- stop.sh
- refresh.sh
@@ -29,4 +28,8 @@ start-script: start.sh
stop-script: stop.sh
delete-script: delete.sh
refresh-script: refresh.sh
-upgraded-script: upgraded.sh
\ No newline at end of file
+upgraded-script: upgraded.sh
+
+
+deploy-rules:
+ in: pai-master
\ No newline at end of file
diff --git a/pai-management/bootstrap/webportal/start.sh b/pai-management/bootstrap/webportal/start.sh
index 83cae1a8b7..07926cdd12 100755
--- a/pai-management/bootstrap/webportal/start.sh
+++ b/pai-management/bootstrap/webportal/start.sh
@@ -19,9 +19,6 @@
pushd $(dirname "$0") > /dev/null
-#chmod u+x node-label.sh
-
-/bin/bash node-label.sh || exit $?
kubectl apply --overwrite=true -f webportal.yaml || exit $?
diff --git a/pai-management/bootstrap/webportal/webportal.yaml.template b/pai-management/bootstrap/webportal/webportal.yaml.template
index 1ee9eaf271..e99d2e48c2 100644
--- a/pai-management/bootstrap/webportal/webportal.yaml.template
+++ b/pai-management/bootstrap/webportal/webportal.yaml.template
@@ -31,8 +31,6 @@ spec:
spec:
hostNetwork: false
hostPID: false
- nodeSelector:
- webportal: "true"
containers:
- name: webportal
image: {{ clusterinfo['dockerregistryinfo']['prefix'] }}webportal:{{ clusterinfo['dockerregistryinfo']['docker_tag'] }}
diff --git a/pai-management/confStorage/__init__.py b/pai-management/confStorage/__init__.py
new file mode 100644
index 0000000000..d647bb847f
--- /dev/null
+++ b/pai-management/confStorage/__init__.py
@@ -0,0 +1,16 @@
+# Copyright (c) Microsoft Corporation
+# All rights reserved.
+#
+# MIT License
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
+# documentation files (the "Software"), to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
+# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
+# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
\ No newline at end of file
diff --git a/pai-management/confStorage/conf_storage_util.py b/pai-management/confStorage/conf_storage_util.py
new file mode 100644
index 0000000000..bc4466d5d9
--- /dev/null
+++ b/pai-management/confStorage/conf_storage_util.py
@@ -0,0 +1,179 @@
+# Copyright (c) Microsoft Corporation
+# All rights reserved.
+#
+# MIT License
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
+# documentation files (the "Software"), to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
+# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
+# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+
+from __future__ import print_function
+
+import os
+import sys
+import time
+import errno
+import logging
+import logging.config
+
+from pprint import pprint
+
+import kubernetes.client
+from kubernetes.client.rest import ApiException
+from kubernetes import client, config, watch
+
+
+logger = logging.getLogger(__name__)
+
+
+
+def get_subdirectory_list(path):
+
+ return next(os.walk(path))[1]
+
+
+
+def create_path(path):
+
+ if not os.path.exists(path):
+ try:
+ os.makedirs(path)
+
+ except OSError as exc:
+ if exc.errno == errno.EEXIST and os.path.isdir(path):
+ logger.warning("Failed to create path {0}, due to that the path exists.".format(path))
+ else:
+ sys.exit(1)
+
+
+
+def read_file_from_path(file_path):
+
+ with open(file_path, "r") as fin:
+ file_data = fin.read().decode('utf-8')
+
+ return file_data
+
+
+
+def write_generated_file(generated_file, file_path):
+
+ with open(file_path, "w+") as fout:
+ fout.write(generated_file)
+
+
+
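+# Return a dict with "metadata" and "data" for the named configmap, or None if it doesn't exist.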
+def get_configmap(PAI_KUBE_CONFIG_DEFAULT_LOCATION, name, namespace = "default"):
+
+ config.load_kube_config(config_file=PAI_KUBE_CONFIG_DEFAULT_LOCATION)
+ api_instance = kubernetes.client.CoreV1Api()
+ exact = True
+ export = True
+
+ target_configmap_data = None
+ target_configmap_metadata = None
+
+ try:
+ api_response = api_instance.read_namespaced_config_map(name, namespace, exact=exact, export=export)
+ target_configmap_data = api_response.data
+ target_configmap_metadata = api_response.metadata
+
+ except ApiException as e:
+ if e.status == 404:
+ logger.info("Couldn't find configmap named {0}".format(name))
+ return None
+ else:
+ logger.error("Exception when calling CoreV1Api->read_namespaced_config_map: {0}".format(str(e)))
+ sys.exit(1)
+
+ ret = {
+ "metadata" : target_configmap_metadata,
+ "data" : target_configmap_data
+ }
+
+ return ret
+
+
+
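+# Replace the named configmap with data_dict, creating it if it doesn't exist yet.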
+def update_configmap(PAI_KUBE_CONFIG_DEFAULT_LOCATION, name, data_dict, namespace = "default"):
+
+ config.load_kube_config(config_file=PAI_KUBE_CONFIG_DEFAULT_LOCATION)
+ api_instance = kubernetes.client.CoreV1Api()
+
+ meta_data = kubernetes.client.V1ObjectMeta()
+ meta_data.namespace = namespace
+ meta_data.name = name
+ body = kubernetes.client.V1ConfigMap(
+ metadata = meta_data,
+ data = data_dict)
+
+ try:
+ api_response = api_instance.replace_namespaced_config_map(name, namespace, body)
+ logger.info("configmap named {0} is updated.".format(name))
+
+ except ApiException as e:
+
+ if e.status == 404:
+
+ try:
+ logger.info("Couldn't find configmap named {0}. Create a new configmap".format(name))
+ api_response = api_instance.create_namespaced_config_map(namespace, body)
+ logger.info("Configmap named {0} is created".format(name))
+
+ except ApiException as ie:
+ logger.error("Exception when calling CoreV1Api->create_namespaced_config_map: {0}".format(str(e)))
+ sys.exit(1)
+
+ else:
+ logger.error("Exception when calling CoreV1Api->replace_namespaced_config_map: {0}".format(str(e)))
+ sys.exit(1)
+
+
+
+def get_cluster_id(PAI_KUBE_CONFIG_DEFAULT_LOCATION):
+
+ resp = get_configmap(PAI_KUBE_CONFIG_DEFAULT_LOCATION, "pai-cluster-id")
+ if resp is None:
+ return None
+
+ # return a string
+ return resp["data"]["cluster-id"]
+
+
+
+def update_cluster_id(PAI_KUBE_CONFIG_DEFAULT_LOCATION, cluster_id):
+
+ data_dict = dict()
+ data_dict["cluster-id"] = cluster_id
+ update_configmap(PAI_KUBE_CONFIG_DEFAULT_LOCATION, "pai-cluster-id", data_dict)
+
+
+
+def get_conf_configmap(PAI_KUBE_CONFIG_DEFAULT_LOCATION):
+
+ resp = get_configmap(PAI_KUBE_CONFIG_DEFAULT_LOCATION, "pai-configuration")
+ if resp is None:
+ return None
+
+ # return a dict
+ return resp["data"]
+
+
+
+def update_conf_configmap(PAI_KUBE_CONFIG_DEFAULT_LOCATION, conf_data_dict):
+
+ update_configmap(PAI_KUBE_CONFIG_DEFAULT_LOCATION, "pai-configuration", conf_data_dict)
+
+
+
diff --git a/pai-management/confStorage/download.py b/pai-management/confStorage/download.py
new file mode 100644
index 0000000000..23afe2342a
--- /dev/null
+++ b/pai-management/confStorage/download.py
@@ -0,0 +1,83 @@
+# Copyright (c) Microsoft Corporation
+# All rights reserved.
+#
+# MIT License
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
+# documentation files (the "Software"), to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
+# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
+# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+
+import yaml
+import os
+import sys
+import subprocess
+import jinja2
+import argparse
+import logging
+import logging.config
+
+from . import conf_storage_util
+
+
+package_directory_kubeinstall = os.path.dirname(os.path.abspath(__file__))
+
+
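+# Download the cluster configuration stored in the "pai-configuration" configmap
+# into <local_path>/<cluster-id>/, after the operator confirms the cluster-id.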
+class download_configuration:
+
+
+ def __init__(self):
+
+ self.logger = logging.getLogger(__name__)
+ self.KUBE_CONFIG_DEFAULT_LOCATION = os.path.expanduser("~/.kube/config")
+ if os.environ.get('KUBECONFIG', None) is not None:
+ self.KUBE_CONFIG_DEFAULT_LOCATION = os.environ.get('KUBECONFIG', None)
+
+
+
+ def check_cluster_id(self):
+
+ cluster_id = conf_storage_util.get_cluster_id(self.KUBE_CONFIG_DEFAULT_LOCATION)
+
+ if cluster_id is None:
+ self.logger.error("No cluster-id found in your cluster. A cluster-id is created the first time you upload your configuration.")
+ sys.exit(1)
+
+ user_input = raw_input("Please input the cluster-id of the cluster you want to operate on: ")
+ if user_input != cluster_id:
+ self.logger.error("The cluster-id doesn't match. You may be operating on the wrong cluster. Please check your input and the target cluster.")
+ sys.exit(1)
+
+ self.logger.info("Cluster-id check passed.")
+
+
+
+ def download_cluster_configuration(self, local_path):
+
+ cluster_id = conf_storage_util.get_cluster_id(self.KUBE_CONFIG_DEFAULT_LOCATION)
+ configuration_dict = conf_storage_util.get_conf_configmap(self.KUBE_CONFIG_DEFAULT_LOCATION)
+
+ if configuration_dict is None:
+ self.logger.error("The configuration doesn't exist on your cluster. Please upload it first.")
+ sys.exit(1)
+
+ conf_storage_util.create_path("{0}/{1}".format(local_path, cluster_id))
+ for key in configuration_dict:
+ conf_storage_util.write_generated_file(configuration_dict[key], "{0}/{1}/{2}".format(local_path, cluster_id, key))
+
+
+
+ def run(self, local_path = "."):
+
+ self.check_cluster_id()
+ self.download_cluster_configuration(local_path)
diff --git a/pai-management/confStorage/environment.py b/pai-management/confStorage/environment.py
new file mode 100644
index 0000000000..853a525d7b
--- /dev/null
+++ b/pai-management/confStorage/environment.py
@@ -0,0 +1,126 @@
+# Copyright (c) Microsoft Corporation
+# All rights reserved.
+#
+# MIT License
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
+# documentation files (the "Software"), to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
+# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
+# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+
+from __future__ import print_function
+
+import os
+import sys
+import time
+import logging
+import subprocess
+import logging.config
+
+import kubernetes.client
+from kubernetes import client, config, watch
+from kubernetes.client.rest import ApiException
+
+
+package_directory_kubeinstall = os.path.dirname(os.path.abspath(__file__))
+
+
+
+class environment_check:
+
+ def __init__(self):
+
+ self.logger = logging.getLogger(__name__)
+
+ self.KUBE_CONFIG_DEFAULT_LOCATION = os.path.expanduser("~/.kube/config")
+        if os.environ.get('KUBECONFIG') is not None:
+            self.KUBE_CONFIG_DEFAULT_LOCATION = os.environ.get('KUBECONFIG')
+
+
+
+ def execute_shell_return(self, shell_cmd, error_msg):
+
+ try:
+ subprocess.check_call(shell_cmd, shell=True)
+
+ except subprocess.CalledProcessError:
+ self.logger.error(error_msg)
+ return False
+
+ return True
+
+
+    def check_conf_exists(self):
+
+ if not os.path.isfile(self.KUBE_CONFIG_DEFAULT_LOCATION):
+ self.logger.error(
+ "CHECKING FAILED: The path {0} doesn't exist.".format(self.KUBE_CONFIG_DEFAULT_LOCATION)
+ )
+ sys.exit(1)
+
+ self.logger.info(
+ "CHECKING SUCCESSFULLY: Kubeconfig is found."
+ )
+
+
+
+ def check_kubectl(self):
+
+ api_versions_cmd = "kubectl api-versions"
+ error_msg = "Failed to execute the command [ kubectl api-versions ]"
+        if not self.execute_shell_return(api_versions_cmd, error_msg):
+ self.logger.error(
+ "CHECKING FAILED: There is something wrong with kubectl. Please check."
+ )
+ sys.exit(1)
+ self.logger.info(
+ "CHECKING SUCCESSFULLY: Kubectl is found. And execute it successfully."
+ )
+
+
+
+ def check_python_kubernetes(self):
+
+ core_api_instance = client.CoreApi()
+ try_count = 0
+
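+        # Retry the connection up to 3 times, sleeping 5 seconds between failed attempts.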
+ while True:
+
+ try:
+ self.logger.info("Try to access to the target kubernetes cluster")
+ config.load_kube_config(config_file=self.KUBE_CONFIG_DEFAULT_LOCATION)
+ api_response = core_api_instance.get_api_versions()
+ self.logger.info(str(api_response))
+ break
+
+            except ApiException:
+                self.logger.error("Failed to connect to k8s with the python client.")
+ try_count = try_count + 1
+
+ if try_count == 3:
+ self.logger.error("All 3 tries of connecting k8s with python client fails.")
+ sys.exit(1)
+
+ time.sleep(5)
+
+ self.logger.info(
+ "CHECKING SUCCESSFULLY: Successfully access kubernetes through python client. "
+ )
+
+
+
+ def run(self):
+
+        self.check_conf_exists()
+ self.check_kubectl()
+ self.check_python_kubernetes()
diff --git a/pai-management/confStorage/upload.py b/pai-management/confStorage/upload.py
new file mode 100644
index 0000000000..4014e60853
--- /dev/null
+++ b/pai-management/confStorage/upload.py
@@ -0,0 +1,86 @@
+# Copyright (c) Microsoft Corporation
+# All rights reserved.
+#
+# MIT License
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
+# documentation files (the "Software"), to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
+# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
+# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+
+import yaml
+import os
+import sys
+import subprocess
+import jinja2
+import argparse
+import logging
+import logging.config
+
+from . import conf_storage_util
+
+
+package_directory_kubeinstall = os.path.dirname(os.path.abspath(__file__))
+
+
+
+class upload_configuration:
+
+
+ def __init__(self, config_path):
+
+ self.logger = logging.getLogger(__name__)
+ self.KUBE_CONFIG_DEFAULT_LOCATION = os.path.expanduser("~/.kube/config")
+        if os.environ.get('KUBECONFIG') is not None:
+            self.KUBE_CONFIG_DEFAULT_LOCATION = os.environ.get('KUBECONFIG')
+
+ self.config_path = config_path
+
+
+
+ def check_cluster_id(self):
+
+ cluster_id = conf_storage_util.get_cluster_id(self.KUBE_CONFIG_DEFAULT_LOCATION)
+
+        if cluster_id is None:
+            self.logger.warning("No cluster-id found in your cluster.")
+ user_input = raw_input("Please input the cluster-id for your cluster: ")
+ conf_storage_util.update_cluster_id(self.KUBE_CONFIG_DEFAULT_LOCATION, user_input)
+ return False
+
+ user_input = raw_input("Please input the cluster-id which you wanna operate: ")
+ if user_input != cluster_id:
+ self.logger.error("Ops, maybe you find the wrong cluster. Please check your input and the target cluster.")
+ sys.exit(1)
+
+ self.logger.info("Congratulations: Cluster-id checking passed.")
+ return True
+
+
+
+ def upload_latest_configuration(self):
+
+ conf_dict = dict()
+ conf_dict["cluster-configuration.yaml"] = conf_storage_util.read_file_from_path("{0}/cluster-configuration.yaml".format(self.config_path))
+ conf_dict["k8s-role-definition.yaml"] = conf_storage_util.read_file_from_path("{0}/k8s-role-definition.yaml".format(self.config_path))
+ conf_dict["kubernetes-configuration.yaml"] = conf_storage_util.read_file_from_path(
+ "{0}/kubernetes-configuration.yaml".format(self.config_path))
+ conf_dict["serivces-configuration.yaml"] = conf_storage_util.read_file_from_path(
+ "{0}/services-configuration.yaml".format(self.config_path))
+
+ conf_storage_util.update_conf_configmap(self.KUBE_CONFIG_DEFAULT_LOCATION, conf_dict)
+
+
+
+ def run(self):
+
+ self.check_cluster_id()
+ self.upload_latest_configuration()
\ No newline at end of file
diff --git a/pai-management/doc/etcd.md b/pai-management/doc/etcd.md
new file mode 100644
index 0000000000..6a20532555
--- /dev/null
+++ b/pai-management/doc/etcd.md
@@ -0,0 +1,73 @@
+# Etcd maintenance
+
+## Goal
+Etcd is a service used by Kubernetes as a consistent and highly-available key value store for all its backing cluster data.
+
+## Build
+OpenPAI doesn't build the etcd service directly. The service's image is pulled from the docker registry.
+
+## Configuration
+
+The configuration file [kubernetes-configuration.yaml](../../cluster-configuration/kubernetes-configuration.yaml) defines etcd as the kubernetes storage backend and specifies its version.
+```yaml
+storage-backend: etcd3
+etcd-version: 3.2.17
+```
+The etcd nodes can be configured in file [cluster-configuration.yaml](../../cluster-configuration/cluster-configuration.yaml) by adding an `etcdid` label to the machine.
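+For example, a master machine entry carrying the label might look like the following sketch (the
+host values are hypothetical; only the `etcdid` field is etcd specific):
+```yaml
+machine-list:
+  - hostip: 10.0.0.10
+    k8s-role: master
+    etcdid: etcdid1
+```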
+
+## Deployment
+
+Etcd is deployed as a component of Kubernetes when running this command:
+```bash
+python paictl.py cluster k8s-bootup -p ./path/to/cluster/configuration/dir
+```
+After the cluster is up, the etcd cluster members can be retrieved with the command:
+```bash
+etcdctl --endpoints=http://CONFIGURED_ETCD_NODE_ADDRESS:2380 member list
+```
+The `CONFIGURED_ETCD_NODE_ADDRESS` is one of the node addresses you configured to deploy etcd. This command returns all
+the etcd nodes with their status. A node is deployed successfully if its status is `started`.
+
+## Upgrading and rolling back
+
+The etcd data is stored in
+```bash
+/var/etcd/data
+```
+on each node.
+By default the data is kept when cleaning the cluster and upgrading to a new Kubernetes version. So when the cluster of the new version is up,
+all the services can be restored and continue to run.
+If you want to clean the etcd data completely when cleaning the cluster, run this command:
+```bash
+python paictl.py cluster k8s-clean -p /path/to/configuration/directory -f
+```
+For general instructions about upgrading etcd please refer to [upgrading-etcd](https://kubernetes.io/docs/tasks/administer-cluster/configure-upgrade-etcd/#upgrading-and-rolling-back-etcd-clusters).
+
+## Service Monitoring
+
+- Watchdog reports the etcd health metric `etcd_current_status_error`. Please refer to the [watchdog doc](../../prometheus/doc/watchdog-metrics.md) for the detailed metrics.
+- The etcd service can be monitored by Prometheus or Grafana. Please refer to [monitoring](https://coreos.com/etcd/docs/latest/op-guide/monitoring.html) for details.
+- Etcd's status can also be found on the cluster's Kubernetes dashboard. In the Pods view, the pods whose names start with "etcd-server" run the etcd service.
+
+## High Availability
+
+To support high availability, the etcd cluster must be deployed on multiple nodes that replicate the data. The recommended cluster size is at least 3 nodes.
+
+## Fix etcd nodes
+
+Sometimes an etcd node may become unhealthy. It can be repaired with the command:
+```bash
+python paictl.py machine etcd-fix -p /path/to/configuration/directory -l /path/to/your/errornodelist.yaml
+```
+Please follow the instructions in [machine maintenance](./machine-maintenance.md) for the details.
+
+## Data Stored on Etcd
+
+In an OpenPAI cluster the data on etcd comes from two services:
+- Kubernetes: all of the Kubernetes objects, like deployment, pod or service information, are stored in etcd.
+- Rest-Server: the OpenPAI user account information is stored in etcd.
+
+## Reference
+
+- [Etcd document](https://coreos.com/etcd/docs/latest/docs.html#documentation)
+- [Operating etcd for kubernetes](https://kubernetes.io/docs/tasks/administer-cluster/configure-upgrade-etcd/)
diff --git a/pai-management/doc/hdfs.md b/pai-management/doc/hdfs.md
new file mode 100644
index 0000000000..ac96747ff1
--- /dev/null
+++ b/pai-management/doc/hdfs.md
@@ -0,0 +1,215 @@
+# HDFS
+
+This guide provides instructions for operating the HDFS cluster in OpenPAI.
+
+# Table of Contents
+
+- [ Goal ](#Goal)
+- [ Build ](#Build)
+- [ Configuration ](#Configuration)
+ - [ Properties Configuration ](#Properties_Configuration)
+ - [ Storage Path ](#Storage_Path)
+ - [ Name Node ](#Name_Node)
+ - [ Data Node ](#Data_Node)
+- [ Deployment ](#Deployment)
+- [ Upgrading ](#Upgrading)
+- [ Service Monitoring ](#Service_Monitoring)
+ - [ Metrics ](#Metrics)
+ - [ Monitoring ](#Monitoring)
+    - [ Monitoring via Prometheus ](#Monitor_via_Prometheus)
+    - [ Monitoring via HTTP API ](#Monitor_via_HTTP_API)
+- [ High Availability ](#High_Availability)
+- [ Access HDFS Data ](#Access_HDFS_Data)
+ - [ WebHDFS ](#WebHDFS)
+ - [ HDFS Command ](#HDFS_Command)
+ - [ Web Portal ](#Web_Portal)
+ - [ Mountable HDFS ](#Mountable_HDFS)
+ - [ API ](#API)
+ - [ Java API ](#Java_API)
+ - [ C API ](#C_API)
+ - [ Python API](#Python_API)
+- [ Reference ](#Reference)
+
+# Goal
+
+The Hadoop Distributed File System (HDFS) in OpenPAI serves as central storage for both users' applications and data.
+Application logs are also stored in HDFS.
+
+# Build
+
+The HDFS service image can be built together with other services by running this command:
+```bash
+python paictl.py image build -p /path/to/configuration/
+```
+HDFS is part of the hadoop-run image; it can be built separately with the following command:
+```bash
+python paictl.py image build -p /path/to/configuration/ -n hadoop-run
+```
+
+# Configuration
+
+## Properties Configuration
+
+The HDFS name node and data node each have their own configuration files.
+They are located in [name node configuration](../bootstrap/hadoop-name-node/hadoop-name-node-configuration)
+and [data node configuration](../bootstrap/hadoop-data-node/hadoop-data-node-configuration) respectively.
+All the HDFS related properties are in file *core-site.xml* and *hdfs-site.xml*.
+Please refer to [core-site.xml](https://hadoop.apache.org/docs/r2.9.0/hadoop-project-dist/hadoop-common/core-default.xml)
+and [hdfs-site.xml](https://hadoop.apache.org/docs/r2.9.0/hadoop-project-dist/hadoop-hdfs/hdfs-default.xml)
+for the detailed property descriptions.
+
+## Storage Path
+
+HDFS's data storage path on a machine is configured by *cluster.data-path* in
+the file [services-configuration.yaml](../../cluster-configuration/services-configuration.yaml).
+All HDFS related data, on both the name node and the data nodes, is stored under this path.
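+For example, the relevant fragment of the services-configuration file might look like this
+(a sketch; the value is hypothetical):
+```yaml
+cluster:
+  data-path: "/datastorage"
+```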
+
+### Name Node
+
+* Configuration Data: Its path is defined by *hadoop-name-node-configuration* configuration map.
+* Name Data: It is in the *hdfs/name* directory under the storage path.
+* Temp Data: It is in the *hadooptmp/namenode* directory under the storage path.
+
+### Data Node
+
+* Configuration Data: Its path is defined by *hadoop-data-node-configuration* configuration map.
+* Data Storage: It is in the *hdfs/data* directory under the storage path.
+* Host Configuration: Its path is defined by *host-configuration* configuration map.
+* Temp Data: It is in the *hadooptmp/datanode* directory under the storage path.
+
+# Deployment
+
+HDFS is deployed when starting the OpenPAI services with the command:
+```bash
+python paictl.py service start -p /service/configuration/path
+```
+The name node and data node services can be started separately by specifying the service name in the command.
+```bash
+python paictl.py service start -p /service/configuration/path -n hadoop-name-node
+python paictl.py service start -p /service/configuration/path -n hadoop-data-node
+```
+
+# Upgrading
+
+It is recommended to back up the name node data before upgrading the cluster.
+Please refer to [rolling upgrade](https://hadoop.apache.org/docs/r2.9.0/hadoop-project-dist/hadoop-hdfs/HdfsRollingUpgrade.html) for the detailed instructions.
+
+# Service Monitoring
+
+## Metrics
+HDFS exposes various metrics for monitoring and debugging. Please refer to [HDFS Metrics](https://hadoop.apache.org/docs/r2.9.0/hadoop-project-dist/hadoop-common/Metrics.html)
+for all the detailed metrics and their explanations.
+
+## Monitoring
+
+### Monitoring via Prometheus
+
+The Prometheus service will collect these metrics and monitor HDFS in real time. This work is still in progress.
+
+### Monitoring via HTTP API
+
+* Data Node: all the metrics can be retrieved with the command
+```bash
+curl http://DATA_NODE_ADDRESS:50075/jmx
+```
+
+* Name Node: all the metrics can be retrieved with the command
+```bash
+curl http://NAME_NODE_ADDRESS:50070/jmx
+```
+
+# High Availability
+
+Currently the OpenPAI management tool doesn't deploy HDFS in a High Availability (HA) fashion. This will be added in a future release.
+For a solution to enable HA please refer to [HDFS High Availability](https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-hdfs/HDFSHighAvailabilityWithNFS.html).
+
+# Access HDFS Data
+
+Data on HDFS can be accessed in various ways. Users can choose the proper way according to their needs.
+
+## WebHDFS
+
+WebHDFS provides a set of REST APIs and is our recommended way to access data.
+[WebHDFS REST API](http://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-hdfs/WebHDFS.html) contains the detailed instructions of the APIs.
+The REST server URI is http://hdfs-name-node-address:50070. The *hdfs-name-node-address* is the address of the machine with *pai-master* label *true*
+in configuration file [cluster-configuration.yaml](../../cluster-configuration/cluster-configuration.yaml).
+The following are two simple examples showing how the APIs can be used to create and delete a file.
+
+1. Create a File
+Suppose we want to create the file *test_file* under the directory */test*. The first step is to submit a request without following redirection and without data:
+```bash
+curl -i -X PUT "http://hdfs-name-node-address:50070/webhdfs/v1/test/test_file?op=CREATE"
+```
+This command will return the data node where the file should be written. The location URI would look like
+>http://hdfs-data-node-address:50075/webhdfs/v1/test/test_file?op=CREATE&namenoderpcaddress=hdfs-name-node-address:9000&createflag=&createparent=true&overwrite=false
+
+Then run the following command with the returned URI to write the file data:
+```bash
+curl -i -X PUT -T file-data-to-write returned-location-uri
+```
+
+2. Delete a File
+To delete the file created in the above example, run the following command:
+```bash
+curl -i -X DELETE "http://hdfs-name-node-address:50070/webhdfs/v1/test/test_file?op=DELETE"
+```
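+
+These two steps can also be scripted. Below is a minimal sketch using the Python `requests`
+library (the addresses and file content are hypothetical, matching the examples above):
+```python
+import requests
+
+# Step 1: ask the name node where to write; don't follow the redirect.
+resp = requests.put(
+    "http://hdfs-name-node-address:50070/webhdfs/v1/test/test_file?op=CREATE",
+    allow_redirects=False)
+
+# Step 2: write the file content to the data node location returned in step 1.
+requests.put(resp.headers["Location"], data=b"some file content")
+```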
+
+## HDFS Command
+
+The commands are available in the Hadoop package. Please download the version you need from [Hadoop Releases](http://hadoop.apache.org/releases.html).
+Then extract it on your machine by running
+```bash
+tar -zxvf hadoop-package-name
+```
+All commands are located in the *bin* directory.
+Please refer to the [HDFS Command Guide](http://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-hdfs/HDFSCommands.html) for detailed command descriptions.
+Every file in HDFS is identified by a URI following the pattern *hdfs://hdfs-name-node-address:name-node-port/parent/child*.
+Here the *name-node-port* is 9000. The *hdfs-name-node-address* is the address of the machine with *pai-master* label *true* in configuration
+file [cluster-configuration.yaml](../../cluster-configuration/cluster-configuration.yaml).
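+
+For example, to list the files under the */test* directory (the address is hypothetical):
+```bash
+bin/hdfs dfs -ls hdfs://hdfs-name-node-address:9000/test
+```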
+
+## Web Portal
+
+Data on HDFS can be accessed by pointing your web browser to http://hdfs-name-node-address:50070/explorer.html after the cluster is ready.
+The *hdfs-name-node-address* is the address of the machine with *pai-master* label *true*
+in configuration file [cluster-configuration.yaml](../../cluster-configuration/cluster-configuration.yaml).
+From release 2.9.0 users can upload or delete files on the web portal. On earlier releases users can only browse the data.
+
+## Mountable HDFS
+
+The *hadoop-hdfs-fuse* tool can mount HDFS on the local file system so users can access the data with ordinary Linux commands.
+The tool can be installed with the following commands on an Ubuntu system:
+```bash
+# add the CDH5 repository
+wget http://archive.cloudera.com/cdh5/one-click-install/trusty/amd64/cdh5-repository_1.0_all.deb
+sudo dpkg -i cdh5-repository_1.0_all.deb
+# install the hadoop-hdfs-fuse tool
+sudo apt-get update
+sudo apt-get install hadoop-hdfs-fuse
+# mount to local system
+mkdir -p your-mount-directory
+sudo hadoop-fuse-dfs dfs://hdfs-name-node-address:9000 your-mount-directory
+```
+
+## API
+
+### Java API
+
+The Java APIs allow users to access data from Java programs.
+The detailed HDFS API interfaces can be found in the [HDFS API Doc](https://hadoop.apache.org/docs/stable/api/org/apache/hadoop/fs/FileSystem.html).
+
+### C API
+
+The C API is provided by the *libhdfs* library and it only supports a subset of the HDFS operations.
+Please follow the instructions on [C APIs](http://hadoop.apache.org/docs/r2.9.1/hadoop-project-dist/hadoop-hdfs/LibHdfs.html) for details.
+
+### Python API
+
+The Python API can be installed with the command:
+```bash
+pip install hdfs
+```
+Please refer to [HdfsCLI](https://hdfscli.readthedocs.io/en/latest/) for the details.
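+
+A minimal sketch with the HdfsCLI package (the address, user and paths are hypothetical):
+```python
+from hdfs import InsecureClient
+
+# Connect to the WebHDFS endpoint of the name node.
+client = InsecureClient("http://hdfs-name-node-address:50070", user="hadoop")
+
+# Write a small file and list the directory it lives in.
+client.write("/test/test_file", data=b"some file content", overwrite=True)
+print(client.list("/test"))
+```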
+
+# Reference
+
+1. [Hadoop reference doc](https://hadoop.apache.org/docs/r2.9.0/)
\ No newline at end of file
diff --git a/deployment/k8sPaiLibrary/maintaintool/kubernetes-cleanup.sh b/pai-management/k8sPaiLibrary/template/kubernetes-cleanup.sh.template
old mode 100755
new mode 100644
similarity index 92%
rename from deployment/k8sPaiLibrary/maintaintool/kubernetes-cleanup.sh
rename to pai-management/k8sPaiLibrary/template/kubernetes-cleanup.sh.template
index 04ba0fb77b..20421b3e47
--- a/deployment/k8sPaiLibrary/maintaintool/kubernetes-cleanup.sh
+++ b/pai-management/k8sPaiLibrary/template/kubernetes-cleanup.sh.template
@@ -32,9 +32,9 @@ if [ -d "/etc/kubernetes" ]; then
fi
-if [ -d "/var/etcd/data" -a "$option" == "-f" ]; then
+if [ -d "{{ clusterconfig['etcd-data-path'] }}" -a "$option" == "-f" ]; then
- rm -rf /var/etcd/data
+ rm -rf {{ clusterconfig['etcd-data-path'] }}
fi
diff --git a/pai-management/paiLibrary/paiBuild/build_center.py b/pai-management/paiLibrary/paiBuild/build_center.py
new file mode 100644
index 0000000000..7339018f2e
--- /dev/null
+++ b/pai-management/paiLibrary/paiBuild/build_center.py
@@ -0,0 +1,190 @@
+# Copyright (c) Microsoft Corporation
+# All rights reserved.
+#
+# MIT License
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
+# documentation files (the "Software"), to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
+# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
+# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+import os
+import sys
+import logging
+import logging.config
+#
+from . import image_push
+from . import image_build
+from . import image_tag
+from . import hadoop_ai_build
+#
+from ..common import linux_shell
+from ..common import file_handler
+from ..common import directory_handler
+from ..common import docker_handler
+
+
+class build_center:
+
+
+
+ def __init__(self, cluster_object_model, build_target = None, os_type="ubuntu16.04"):
+
+ self.logger = logging.getLogger(__name__)
+
+ self.cluster_object_model = cluster_object_model
+        if build_target is None:
+ self.image_list = self.get_image_list()
+ else:
+ self.image_list = build_target
+
+ self.os_type = os_type
+
+
+
+ def get_image_list(self):
+
+ image_list = list()
+
+ subdir_list = directory_handler.get_subdirectory_list("src/")
+ for subdir in subdir_list:
+
+ image_conf_path = "src/{0}/image.yaml".format(subdir)
+ if file_handler.file_exist_or_not(image_conf_path):
+ image_list.append(subdir)
+
+ self.logger.info("Get the image-list to build : {0}".format(str(image_list)))
+
+ return image_list
+
+ def get_base_image(self, image_name):
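+        # Parse the FROM line of the image's dockerfile; if the base image is itself
+        # one of the images built from src/, return it so that it gets built first.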
+ file_path = "src/{0}/dockerfile".format(image_name)
+ with open(file_path, 'r') as fin:
+ for line in fin:
+ if line.strip().startswith("FROM"):
+ _, image = line.split()
+ if image.split(':')[0] in self.get_image_list():
+ return image
+ break
+ return None
+
+
+ def hadoop_binary_prepare(self):
+
+ custom_hadoop_path = self.cluster_object_model['clusterinfo']['hadoopinfo']['custom_hadoop_binary_path']
+
+ self.logger.info("Begin to prepare the hadoop binary for image building.")
+
+        if not os.path.exists("src/hadoop-run/hadoop"):
+ directory_handler.directory_create("src/hadoop-run/hadoop")
+
+ if custom_hadoop_path != "None":
+ self.logger.info("Customized hadoop path is found.")
+ directory_handler.directory_copy(custom_hadoop_path, "src/hadoop-run/hadoop")
+ else:
+ self.logger.error("None hadoop-ai binary is found.")
+ sys.exit(1)
+
+ self.logger.info("Hadoop binary is prepared.")
+
+
+
+ def hadoop_binary_remove(self):
+
+ self.logger.info("Remove the hadoop binary.")
+
+ binary_path = "src/hadoop-run/hadoop"
+ if os.path.exists(binary_path):
+ directory_handler.directory_delete(binary_path)
+
+
+
+ def build(self, image_name):
+
+        if image_name in self.done_dict and self.done_dict[image_name]:
+ return
+
+ image_conf = file_handler.load_yaml_config("src/{0}/image.yaml".format(image_name))
+
+ image_build_worker = image_build.image_build(
+ image_name,
+ image_conf,
+ self.cluster_object_model,
+ self.docker_cli
+ )
+
+ base_image = self.get_base_image(image_name)
+        if base_image is not None:
+ self.build(base_image)
+
+ self.logger.info("-----------------------------------------------------------")
+ self.logger.info("Begin to build {0}'s image.".format(image_name))
+ image_build_worker.run()
+ self.done_dict[image_name] = True
+ self.logger.info("{0}'s image building is successful.".format(image_name))
+ self.logger.info("-----------------------------------------------------------")
+ self.tag(image_name)
+
+
+
+ def tag(self, image_name):
+
+ image_tag_worker = image_tag.image_tag(
+ image_name,
+ self.cluster_object_model,
+ self.docker_cli)
+ image_tag_worker.run()
+
+
+
+ def hadoop_ai_build(self):
+ hadoop_ai_path = self.cluster_object_model['clusterinfo']['hadoopinfo']['custom_hadoop_binary_path']
+ hadoop_ai_build_instance = hadoop_ai_build.hadoop_ai_build(self.os_type, hadoop_ai_path)
+ hadoop_ai_build_instance.build()
+
+
+
+ def run(self):
+
+ self.hadoop_binary_remove()
+ self.hadoop_ai_build()
+ self.hadoop_binary_prepare()
+
+ self.done_dict = dict()
+ self.docker_cli = docker_handler.docker_handler(
+ docker_registry = self.cluster_object_model['clusterinfo']['dockerregistryinfo']['docker_registry_domain'],
+ docker_namespace = self.cluster_object_model['clusterinfo']['dockerregistryinfo']['docker_namespace'],
+ docker_username = self.cluster_object_model['clusterinfo']['dockerregistryinfo']['docker_username'],
+ docker_password = self.cluster_object_model['clusterinfo']['dockerregistryinfo']['docker_password']
+ )
+
+ for image_name in self.image_list:
+            if not file_handler.file_exist_or_not("src/{0}/image.yaml".format(image_name)):
+                self.logger.warning("image.yaml can't be found in the directory of {0}".format(image_name))
+                self.logger.warning("Please check your source code. The {0}'s image will be skipped.".format(image_name))
+ continue
+            if image_name in self.done_dict and self.done_dict[image_name]:
+ continue
+ self.build(image_name)
+
+ self.hadoop_binary_remove()
+
diff --git a/pai-management/bootstrap/prometheus/node-label.sh.template b/pai-management/paiLibrary/paiBuild/hadoop_ai_build.py
similarity index 50%
rename from pai-management/bootstrap/prometheus/node-label.sh.template
rename to pai-management/paiLibrary/paiBuild/hadoop_ai_build.py
index ed87ffd5e9..1d862b0ffa 100644
--- a/pai-management/bootstrap/prometheus/node-label.sh.template
+++ b/pai-management/paiLibrary/paiBuild/hadoop_ai_build.py
@@ -1,5 +1,3 @@
-#!/bin/bash
-
# Copyright (c) Microsoft Corporation
# All rights reserved.
#
@@ -17,17 +15,37 @@
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-{% for host in machinelist %}
- {% if 'prometheus' in machinelist[ host ] and machinelist[ host ][ 'prometheus' ] == 'true' %}
-kubectl label --overwrite=true nodes {{ machinelist[ host ][ 'nodename' ] }} prometheus=true || exit $?
- {% endif %}
- {% if 'node-exporter' in machinelist[ host ] and machinelist[ host ][ 'node-exporter' ] == 'true' %}
-kubectl label --overwrite=true nodes {{ machinelist[ host ][ 'nodename' ] }} node-exporter=true || exit $?
- {% endif %}
- {% if 'watchdog' in machinelist[ host ] and machinelist[ host ][ 'watchdog' ] == 'true' %}
-kubectl label --overwrite=true nodes {{ machinelist[ host ][ 'nodename' ] }} watchdog=true || exit $?
- {% endif %}
- {% if 'alert-manager' in machinelist[ host ] and machinelist[ host ][ 'alert-manager' ] == 'true' %}
-kubectl label --overwrite=true nodes {{ machinelist[ host ][ 'nodename' ] }} alertmanager=true || exit $?
- {% endif %}
-{% endfor %}
+import logging
+import logging.config
+
+from ..common import linux_shell
+
+
+
+class hadoop_ai_build:
+
+ def __init__(self, os_type = "ubuntu16.04", hadoop_customize_path = None):
+
+ self.logger = logging.getLogger(__name__)
+
+ self.hadoop_customize_path = hadoop_customize_path
+ self.os_type = os_type
+
+
+ def build(self):
+
+ if self.hadoop_customize_path == "None":
+ self.logger.warning("Because the property of custom_hadoop_binary_path in your service-configuration.yaml is None.")
+ self.logger.warning("The process of hadoop-ai build will be skipped.")
+ return
+
+ self.logger.info("Hadoop AI will be built soon.")
+ self.logger.info("The hadoop AI binary will be found at the path [ {0} ]".format(self.hadoop_customize_path))
+
+ commandline = "./paiLibrary/managementTool/{0}/hadoop-ai-build.sh {1}".format(self.os_type, self.hadoop_customize_path)
+ error_msg = "Failed to build hadoop-ai."
+ linux_shell.execute_shell(commandline, error_msg)
+
+ self.logger.info("Successfully. Hadoop AI build finished.")
+
+
diff --git a/pai-management/src/cleaning-image/dockerfile b/pai-management/src/cleaning-image/dockerfile
index cfcef52901..e2f20eefd8 100644
--- a/pai-management/src/cleaning-image/dockerfile
+++ b/pai-management/src/cleaning-image/dockerfile
@@ -17,7 +17,12 @@
FROM base-image
+RUN wget https://download.docker.com/linux/static/stable/x86_64/docker-17.06.2-ce.tgz && \
+ tar xzvf docker-17.06.2-ce.tgz && \
+ mv docker/* /usr/bin/ && \
+ rm docker-17.06.2-ce.tgz
+
COPY start.sh /usr/local/start.sh
RUN chmod a+x /usr/local/start.sh
-CMD ["/usr/local/start.sh"]
\ No newline at end of file
+CMD ["/usr/local/start.sh"]
diff --git a/pai-management/src/end-to-end-test/etc/cntk.sh b/pai-management/src/end-to-end-test/etc/cntk.sh
index 16c946dc4a..3517c78cf5 100755
--- a/pai-management/src/end-to-end-test/etc/cntk.sh
+++ b/pai-management/src/end-to-end-test/etc/cntk.sh
@@ -28,4 +28,10 @@ export OUTPUT_DIR=$(sed -e "s@hdfs://\([0-9]\{1,3\}\.\)\{3\}[0-9]\{1,3\}:[0-9]\{
sed -i "/stderr/s/^/# /" G2P.cntk
sed -i "/maxEpochs/c\maxEpochs = 1" G2P.cntk
-cntk configFile=G2P.cntk DataDir=$DATA_DIR OutDir=$OUTPUT_DIR
\ No newline at end of file
+# Replace train set to accelerate test
+sed -i "s/cmudict-0.7b.train-dev-20-21.ctf/cmudict-0.7b.train-dev-1-21.ctf/g" G2P.cntk
+
+
+cntk configFile=G2P.cntk DataDir=$DATA_DIR OutDir=$OUTPUT_DIR
+
+rm -rf $OUTPUT_DIR
diff --git a/pai-management/src/end-to-end-test/start.sh b/pai-management/src/end-to-end-test/start.sh
index 3370df9dc7..c334129bc1 100755
--- a/pai-management/src/end-to-end-test/start.sh
+++ b/pai-management/src/end-to-end-test/start.sh
@@ -36,7 +36,7 @@ get_auth_token() {
while true; do
printf "\nStarting end to end tests:\n"
- if [ ! -f $token_file ] || [ $(( $(date +%s) - $(stat -c %Y $token_file) )) -gt $expiration ]; then
+ if [ ! -f $token_file ] || [ ! -s $token_file ] || [ $(( $(date +%s) - $(stat -c %Y $token_file) )) -gt $expiration ]; then
get_auth_token
fi
diff --git a/pai-management/src/gpu-exporter/dockerfile b/pai-management/src/gpu-exporter/dockerfile
index c7f3b3016d..22b5944f36 100644
--- a/pai-management/src/gpu-exporter/dockerfile
+++ b/pai-management/src/gpu-exporter/dockerfile
@@ -15,31 +15,19 @@
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-FROM ubuntu:16.04
+FROM python:2.7
-#
-# Preparation
-#
ENV NVIDIA_VERSION=current
ENV NV_DRIVER=/var/drivers/nvidia/$NVIDIA_VERSION
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$NV_DRIVER/lib:$NV_DRIVER/lib64
ENV PATH=$PATH:$NV_DRIVER/bin
-WORKDIR /root/
-
-RUN apt-get update && \
- apt-get -y install wget build-essential python python-pip git pciutils
-
-COPY copied_file/exporter/* /usr/local/
+RUN mkdir -p /job_exporter
+COPY copied_file/exporter/* /job_exporter/
RUN wget https://download.docker.com/linux/static/stable/x86_64/docker-17.06.2-ce.tgz
-RUN cp docker-17.06.2-ce.tgz /usr/local
-RUN tar xzvf /usr/local/docker-17.06.2-ce.tgz -C /usr/local/
+RUN tar xzvf docker-17.06.2-ce.tgz -C /usr/local/
RUN cp -r /usr/local/docker/* /usr/bin/
-#
-# start
-#
-
-CMD python /usr/local/job_exporter.py /datastorage/prometheus 30
+CMD python /job_exporter/job_exporter.py /datastorage/prometheus 30
diff --git a/pai-management/src/watchdog/dockerfile b/pai-management/src/watchdog/dockerfile
index 502843abc8..ef400343b4 100644
--- a/pai-management/src/watchdog/dockerfile
+++ b/pai-management/src/watchdog/dockerfile
@@ -15,18 +15,8 @@
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-FROM ubuntu:16.04
+FROM python:2.7
-#
-# Preparation
-#
-
-WORKDIR /root/
-
-RUN apt-get update && \
- apt-get -y install wget build-essential python python-pip git python-yaml python-jinja2 && \
- pip install requests paramiko python-etcd
+RUN pip install PyYAML requests paramiko prometheus_client
-COPY copied_file/exporter/watchdog.py /usr/local/
-COPY copied_file/exporter/utils.py /usr/local/
-COPY copied_file/common.py /usr/local
+COPY copied_file/exporter/watchdog.py /
diff --git a/pai-management/src/watchdog/image.yaml b/pai-management/src/watchdog/image.yaml
index cfc3764178..c61210bd20 100644
--- a/pai-management/src/watchdog/image.yaml
+++ b/pai-management/src/watchdog/image.yaml
@@ -18,5 +18,3 @@
copy-list:
- src: ../prometheus/exporter
dst: src/watchdog/copied_file
- - src: ../pai-management/k8sPaiLibrary/maintainlib/common.py
- dst: src/watchdog/copied_file
diff --git a/pai-management/src/yarn-exporter/dockerfile b/pai-management/src/yarn-exporter/dockerfile
new file mode 100644
index 0000000000..70a0e80124
--- /dev/null
+++ b/pai-management/src/yarn-exporter/dockerfile
@@ -0,0 +1,30 @@
+# Copyright (c) Microsoft Corporation
+# All rights reserved.
+#
+# MIT License
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
+# documentation files (the "Software"), to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
+# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
+# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+FROM python:3.7
+
+#
+# Preparation
+#
+
+WORKDIR /root/
+
+COPY copied_file/yarn_exporter.py /usr/local/
+COPY ./requirements.txt /usr/local/
+
+RUN pip3 install -r /usr/local/requirements.txt
+
diff --git a/pai-management/src/yarn-exporter/image.yaml b/pai-management/src/yarn-exporter/image.yaml
new file mode 100644
index 0000000000..b48c73e3e3
--- /dev/null
+++ b/pai-management/src/yarn-exporter/image.yaml
@@ -0,0 +1,20 @@
+# Copyright (c) Microsoft Corporation
+# All rights reserved.
+#
+# MIT License
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
+# documentation files (the "Software"), to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
+# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
+# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+copy-list:
+ - src: ../prometheus/exporter/yarn_exporter.py
+ dst: src/yarn-exporter/copied_file
\ No newline at end of file
diff --git a/pai-management/src/yarn-exporter/requirements.txt b/pai-management/src/yarn-exporter/requirements.txt
new file mode 100644
index 0000000000..69cae6460c
--- /dev/null
+++ b/pai-management/src/yarn-exporter/requirements.txt
@@ -0,0 +1,11 @@
+altgraph==0.15
+attrs==18.1.0
+certifi==2018.4.16
+chardet==3.0.4
+future==0.16.0
+idna==2.7
+macholib==1.9
+pefile==2017.11.5
+prometheus-client==0.2.0
+requests==2.19.1
+urllib3==1.23
diff --git a/paictl.py b/paictl.py
index 63a2cf831a..5e6a28ad42 100755
--- a/paictl.py
+++ b/paictl.py
@@ -44,7 +44,6 @@
logger = logging.getLogger(__name__)
-
def setup_logging():
"""
Setup logging configuration.
@@ -54,8 +53,6 @@ def setup_logging():
logging.config.dictConfig(logging_configuration)
-
-
#########
## TODO: Please remove all function following, after cluster_object_model is finsied.
@@ -283,206 +280,263 @@ def kubectl_env_checking(cluster_object_mode):
return True
-def pai_machine_info():
+class SubCmd(object):
+ """ interface class for defining sub-command for paictl """
- logger.error("The command is wrong.")
- logger.error("Add New Machine Node into cluster : paictl.py machine add -p /path/to/configuration/ -l /path/to/nodelist.yaml")
- logger.error("Remove Machine Node from cluster : paictl.py machine remove -p /path/to/configuration/ -l /path/to/nodelist.yaml")
- logger.error("Repair Error Etcd Node in cluster : paictl.py machine etcd-fix -p /path/to/configuration/ -l /path/to/nodelist.yaml")
+ def register(self, parser):
+ """ subclass use this method to register arguments """
+ pass
+ @staticmethod
+ def add_handler(parser, handler, *args, **kwargs):
+ """ helper function for adding sub-command handler """
+ sub_parser = parser.add_parser(*args, **kwargs)
+ sub_parser.set_defaults(handler=handler) # let handler handle this subcmd
+ return sub_parser
+ def run(self, args):
+ """ will call run with expected args, subclass do not have to override this method
+ if subclass use `add_handler` to register handler. """
+ args.handler(args)
-def pai_machine():
- if len(sys.argv) < 2:
- pai_machine_info()
- return
+class Image(SubCmd):
+ def register(self, parser):
+ image_parser = parser.add_subparsers(help="image operations")
- option = sys.argv[1]
- del sys.argv[1]
+ def add_arguments(parser):
+ parser.add_argument("-p", "--config-path", dest="config_path", required=True,
+ help="The path of your configuration directory.")
+ parser.add_argument("-n", "--image-name", dest="image_name", default="all",
+ help="Build and push the target image to the registry")
- if option not in ["add", "remove", "etcd-fix"]:
- pai_machine_info()
- return
+ build_parser = SubCmd.add_handler(image_parser, self.image_build, "build")
+ push_parser = SubCmd.add_handler(image_parser, self.image_push, "push")
- parser = argparse.ArgumentParser()
- parser.add_argument('-p', '--config-path', dest="config_path", required=True,
- help="The path of your configuration directory.")
- parser.add_argument('-l', '--node-list', dest="node_list", required=True,
- help="The node-list to be operator")
- args = parser.parse_args(sys.argv[1:])
+ add_arguments(build_parser)
+ add_arguments(push_parser)
+
+ def process_args(self, args):
+ cluster_object_model = load_cluster_objectModel_service(args.config_path)
+
+ image_list = None
+ if args.image_name != "all":
+ image_list = [args.image_name]
+
+ return cluster_object_model, image_list
+
+ def image_build(self, args):
+ cluster_object_model, image_list = self.process_args(args)
+
+ center = build_center.build_center(cluster_object_model, image_list)
+ center.run()
- config_path = args.config_path
- node_lists_path = args.node_list
+ def image_push(self, args):
+ cluster_object_model, image_list = self.process_args(args)
- cluster_object_model = cluster_object_model_generate_service(config_path)
- cluster_object_model_k8s = cluster_object_model_generate_k8s(config_path)
- node_list = file_handler.load_yaml_config(node_lists_path)
+ center = push_center.push_center(cluster_object_model, image_list)
+ center.run()
- if kubectl_env_checking(cluster_object_model_k8s) == False:
- return
- for host in node_list['machine-list']:
+class Machine(SubCmd):
+ def register(self, parser):
+ machine_parser = parser.add_subparsers(help="machine operations")
- if 'nodename' not in host:
- host['nodename'] = host['hostip']
+ def add_arguments(parser):
+ parser.add_argument("-p", "--config-path", dest="config_path", required=True,
+ help="The path of your configuration directory.")
+ parser.add_argument("-l", "--node-list", dest="node_list", required=True,
+ help="The node-list to be operator")
+ add_parser = SubCmd.add_handler(machine_parser, self.machine_add, "add")
+ remove_parser = SubCmd.add_handler(machine_parser, self.machine_remove, "remove")
+ etcd_parser = SubCmd.add_handler(machine_parser, self.etcd_fix, "etcd-fix")
- if option == "add":
+ add_arguments(add_parser)
+ add_arguments(remove_parser)
+ add_arguments(etcd_parser)
- for host in node_list['machine-list']:
+ def process_args(self, args):
+ cluster_object_model_k8s = cluster_object_model_generate_k8s(args.config_path)
+ node_list = file_handler.load_yaml_config(args.node_list)
+
+ if not kubectl_env_checking(cluster_object_model_k8s):
+ raise RuntimeError("failed to do kubectl checking")
+
+ for host in node_list["machine-list"]:
+ if "nodename" not in host:
+ host["nodename"] = host["hostip"]
+
+ return cluster_object_model_k8s, node_list
+
+ def machine_add(self, args):
+ cluster_object_model_k8s, node_list = self.process_args(args)
+
+ for host in node_list["machine-list"]:
add_worker = k8s_add.add(cluster_object_model_k8s, host, True)
add_worker.run()
- if host['k8s-role'] == 'master':
+ if host["k8s-role"] == "master":
logger.info("Master Node is added, sleep 60s to wait it ready.")
time.sleep(60)
+ def machine_remove(self, args):
+ cluster_object_model_k8s, node_list = self.process_args(args)
- if option == "remove":
-
- for host in node_list['machine-list']:
+ for host in node_list["machine-list"]:
add_worker = k8s_remove.remove(cluster_object_model_k8s, host, True)
add_worker.run()
- if host['k8s-role'] == 'master':
+ if host["k8s-role"] == "master":
logger.info("master node is removed, sleep 60s for etcd cluster's updating")
time.sleep(60)
+ def etcd_fix(self, args):
+ cluster_object_model_k8s, node_list = self.process_args(args)
- if option == "etcd-fix":
-
- if len(node_list['machine-list']) > 1:
+ if len(node_list["machine-list"]) > 1:
logger.error("etcd-fix can't fix more than one machine everytime. Please fix them one by one!")
sys.exit(1)
- for host in node_list['machine-list']:
+ for host in node_list["machine-list"]:
etcd_fix_worker = k8s_etcd_fix.etcdfix(cluster_object_model_k8s, host, True)
etcd_fix_worker.run()
logger.info("Etcd has been fixed.")
+class Service(SubCmd):
+ def register(self, parser):
+ service_parser = parser.add_subparsers(help="service operations")
+ def add_arguments(parser):
+ parser.add_argument("-p", "--config-path", dest="config_path", required=True,
+ help="The path of your configuration directory.")
+ parser.add_argument("-n", "--service-name", dest="service_name", default="all",
+ help="Build and push the target image to the registry")
+ start_parser = SubCmd.add_handler(service_parser, self.service_start, "start")
+ stop_parser = SubCmd.add_handler(service_parser, self.service_stop, "stop")
+ delete_parser = SubCmd.add_handler(service_parser, self.service_delete, "delete")
+ refresh_parser = SubCmd.add_handler(service_parser, self.service_refresh, "refresh")
+ # TODO: Two feature.
+ # Rolling Update Service : paictl.py service update -p /path/to/configuration/ [ -n service-x ]
+ # Rolling back Service : paictl.py service update -p /path/to/configuration/ [ -n service-x ]
-def pai_service_info():
-
- logger.error("The command is wrong.")
- logger.error("Start Service: paictl.py service start -p /path/to/configuration/ [ -n service-x ]")
- logger.error("Stop Service : paictl.py service stop -p /path/to/configuration/ [ -n service-x ]")
- logger.error("Delete Service (Stop Service, Then clean all service's data): paictl.py service delete -p /path/to/configuration/ [ -n service-x ]")
- logger.error("refresh Service (Update Configuration, Update Machine's Label): paictl.py service delete -p /path/to/configuration/ [ -n service-x ]")
- # TODO: Two feature.
- #logger.error("Rolling Update Service : paictl.py service update -p /path/to/configuration/ [ -n service-x ]")
- #logger.error("Rolling back Service : paictl.py service update -p /path/to/configuration/ [ -n service-x ]")
-
+ add_arguments(start_parser)
+ add_arguments(stop_parser)
+ add_arguments(delete_parser)
+ add_arguments(refresh_parser)
+ def process_args(self, args):
+ cluster_object_model = cluster_object_model_generate_service(args.config_path)
+ cluster_object_model_k8s = cluster_object_model_generate_k8s(args.config_path)
-def pai_service():
+ service_list = None
+ if args.service_name != "all":
+ service_list = [args.service_name]
- if len(sys.argv) < 2:
- pai_service_info()
- return
+ # Tricky, re-install kubectl first.
+ # TODO: install kubectl-install here.
+ if not kubectl_env_checking(cluster_object_model_k8s):
+ raise RuntimeError("failed to do kubectl checking")
- option = sys.argv[1]
- del sys.argv[1]
+ return cluster_object_model, service_list
- if option not in ["start", "delete", "stop", "refresh"]:
- pai_service_info()
- return
+ def service_start(self, args):
+ cluster_object_model, service_list = self.process_args(args)
- parser = argparse.ArgumentParser()
- parser.add_argument('-p', '--config-path', dest="config_path", required=True,
- help="The path of your configuration directory.")
- parser.add_argument('-n', '--service-name', dest="service_name", default='all',
- help="Build and push the target image to the registry")
- args = parser.parse_args(sys.argv[1:])
-
- config_path = args.config_path
- service_name = args.service_name
- cluster_object_model = cluster_object_model_generate_service(config_path)
- cluster_object_model_k8s = cluster_object_model_generate_k8s(config_path)
-
- service_list = None
- if service_name != "all":
- service_list = [ service_name ]
-
- # Tricky , re-install kubectl first.
- # TODO: install kubectl-install here.
- if kubectl_env_checking(cluster_object_model_k8s) == False:
- return
-
- if option == "start":
service_management_starter = service_management_start.serivce_management_start(cluster_object_model, service_list)
service_management_starter.run()
- if option == "delete":
- service_management_deleter = service_management_delete.service_management_delete(cluster_object_model, service_list)
- service_management_deleter.run()
- if option == "stop":
+ def service_stop(self, args):
+ cluster_object_model, service_list = self.process_args(args)
+
service_management_stopper = service_management_stop.service_management_stop(cluster_object_model, service_list)
service_management_stopper.run()
- if option == "refresh":
+ def service_delete(self, args):
+ cluster_object_model, service_list = self.process_args(args)
+
+ logger.warning("--------------------------------------------------------")
+ logger.warning("--------------------------------------------------------")
+ logger.warning("---------- Dangerous Operation!!! ---------------")
+ logger.warning("------ The target service will be stopped -------")
+ logger.warning("------ And the persistent data on the disk -------")
+ logger.warning("------- will be deleted --------")
+ logger.warning("--------------------------------------------------------")
+ logger.warning("--------------------------------------------------------")
+ logger.warning("-------- It's an irreversible operation -------")
+ logger.warning("-------- After this operation, -------")
+ logger.warning("------ the deleted service data is unrecoverable -------")
+ logger.warning("--------------------------------------------------------")
+ logger.warning("--------------------------------------------------------")
+ logger.warning("---- Please ensure you wanna do this operator, ------")
+ logger.warning("------- after knowing all risk above. -------")
+ logger.warning("--------------------------------------------------------")
+ logger.warning("--------------------------------------------------------")
+
+ count_input = 0
+ while True:
+ user_input = raw_input("Do you want to continue this operation? (Y/N) ")
+ if user_input == "N":
+ return
+ elif user_input == "Y":
+ break
+ else:
+ print(" Please type Y or N.")
+ count_input = count_input + 1
+ if count_input == 3:
+ logger.warning("3 Times......... Sorry, we will force stopping your operation.")
+ return
+
+ service_management_deleter = service_management_delete.service_management_delete(cluster_object_model, service_list)
+ service_management_deleter.run()
+
+ def service_refresh(self, args):
+ cluster_object_model, service_list = self.process_args(args)
+
service_management_refresher = service_management_refresh.service_management_refresh(cluster_object_model, service_list)
service_management_refresher.run()
+class Cluster(SubCmd):
+ def register(self, parser):
+ image_parser = parser.add_subparsers(help="cluster operations")
-def pai_cluster_info():
+        bootup_parser = SubCmd.add_handler(cluster_parser, self.k8s_bootup, "k8s-bootup")
+        clean_parser = SubCmd.add_handler(cluster_parser, self.k8s_clean, "k8s-clean")
+        generate_parser = SubCmd.add_handler(cluster_parser, self.generate_configuration, "generate-configuration",
+                                             description="Generate configuration files based on a quick-start yaml file.",
+                                             formatter_class=argparse.RawDescriptionHelpFormatter)
+        install_parser = SubCmd.add_handler(cluster_parser, self.install_kubectl, "install-kubectl")
- logger.error("The command is wrong.")
- logger.error("Bootup kubernetes cluster : paictl.py cluster k8s-bootup -p /path/to/cluster-configuration/dir")
- logger.error("Destroy kubernetes cluster: paictl.py cluster k8s-clean -p /path/to/cluster-configuration/dir")
+ bootup_parser.add_argument("-p", "--config-path", dest="config_path", required=True,
+ help="path of cluster configuration file")
+ clean_parser.add_argument("-p", "--config-path", dest="config_path", required=True, help="path of cluster configuration file")
+ clean_parser.add_argument("-f", "--force", dest="force", required=False, action="store_true", help="clean all the data forcefully")
+ generate_parser.add_argument("-i", "--input", dest="quick_start_config_file", required=True,
+ help="the path of the quick-start configuration file (yaml format) as the input")
+ generate_parser.add_argument("-o", "--output", dest="configuration_directory", required=True,
+ help="the path of the directory the configurations will be generated to")
+ generate_parser.add_argument("-f", "--force", dest="force", action="store_true", default=False,
+ help="overwrite existing files")
-def pai_cluster():
- if len(sys.argv) < 2:
- pai_cluster_info()
- return
- option = sys.argv[1]
- del sys.argv[1]
- if option not in ["k8s-bootup", "k8s-clean", "generate-configuration"]:
- pai_cluster_info()
- return
- if option == "k8s-bootup":
- parser = argparse.ArgumentParser()
- parser.add_argument('-p', '--config-path', dest="config_path", required=True,
+ install_parser.add_argument("-p", "--config-path", dest="config_path", required=True,
help="path of cluster configuration file")
- args = parser.parse_args(sys.argv[1:])
- config_path = args.config_path
- cluster_config = cluster_object_model_generate_k8s(config_path)
+
+ def k8s_bootup(self, args):
+ cluster_config = cluster_object_model_generate_k8s(args.config_path)
logger.info("Begin to initialize PAI k8s cluster.")
cluster_util.maintain_cluster_k8s(cluster_config, option_name="deploy", clean=True)
logger.info("Finish initializing PAI k8s cluster.")
- elif option == "generate-configuration":
- parser = argparse.ArgumentParser(
- description="Generate configuration files based on a quick-start yaml file.",
- formatter_class=argparse.RawDescriptionHelpFormatter)
- parser.add_argument('-i', '--input', dest="quick_start_config_file", required=True,
- help="the path of the quick-start configuration file (yaml format) as the input")
- parser.add_argument('-o', '--output', dest="configuration_directory", required=True,
- help="the path of the directory the configurations will be generated to")
- parser.add_argument('-f', '--force', dest='force', action='store_true', required=False,
- help="overwrite existing files")
- parser.set_defaults(force=False)
- args = parser.parse_args()
- cluster_util.generate_configuration(
- args.quick_start_config_file,
- args.configuration_directory,
- args.force)
- elif option == "k8s-clean":
+
+ def k8s_clean(self, args):
# just use 'k8s-clean' for testing temporarily .
- parser = argparse.ArgumentParser()
- parser.add_argument('-p', '--config-path', dest="config_path", required=True, help="path of cluster configuration file")
- parser.add_argument('-f', '--force', dest="force", required=False, action="store_true", help="clean all the data forcefully")
- args = parser.parse_args(sys.argv[1:])
- config_path = args.config_path
- force = args.force
- cluster_config = cluster_object_model_generate_k8s(config_path)
+ cluster_config = cluster_object_model_generate_k8s(args.config_path)
logger.warning("--------------------------------------------------------")
logger.warning("--------------------------------------------------------")
@@ -490,7 +544,7 @@ def pai_cluster():
logger.warning("------ Your k8s Cluster will be destroyed -------")
logger.warning("------ PAI service on k8s will be stopped -------")
logger.warning("--------------------------------------------------------")
- if force:
+ if args.force:
logger.warning("--------------------------------------------------------")
logger.warning("---------- ETCD data will be cleaned. ------------")
logger.warning("----- If you wanna keep pai's user data. ---------")
@@ -520,40 +574,49 @@ def pai_cluster():
return
logger.info("Begin to clean up whole cluster.")
- cluster_util.maintain_cluster_k8s(cluster_config, option_name = "clean", force = force, clean = True)
+ cluster_util.maintain_cluster_k8s(cluster_config, option_name="clean", force=args.force, clean=True)
logger.info("Clean up job finished")
+ def generate_configuration(self, args):
+ cluster_util.generate_configuration(
+ args.quick_start_config_file,
+ args.configuration_directory,
+ args.force)
+ def install_kubectl(self, args):
+ cluster_object_model_k8s = cluster_object_model_generate_k8s(args.config_path)
+ kubectl_install_worker = kubectl_install.kubectl_install(cluster_object_model_k8s)
+ kubectl_install_worker.run()
-def main():
-
- if len(sys.argv) < 2:
- logger.error("You should pass at least one argument")
- return
-
- module = sys.argv[1]
- del sys.argv[1]
-
- if module == "machine":
- pai_machine()
+class Main(SubCmd):
+ def __init__(self, subcmds):
+ self.subcmds = subcmds
- elif module == "service":
+ def register(self, parser):
+ sub_parser = parser.add_subparsers(help="paictl operations")
- pai_service()
+ for name, subcmd in self.subcmds.items():
+ subparser = SubCmd.add_handler(sub_parser, subcmd.run, name)
+ subcmd.register(subparser)
- elif module == "cluster":
+def main(args):
+ parser = argparse.ArgumentParser()
- pai_cluster()
+ main_handler = Main({
+ "image": Image(),
+ "machine": Machine(),
+ "service": Service(),
+ "cluster": Cluster()
+ })
- else:
+ main_handler.register(parser)
- logger.error("Sorry, there is no definition of the argument [{0}]".format(module))
+ args = parser.parse_args(args)
+ args.handler(args)
if __name__ == "__main__":
-
setup_logging()
- main()
-
+ main(sys.argv[1:])
diff --git a/prometheus/doc/alert-manager.md b/prometheus/doc/alert-manager.md
new file mode 100644
index 0000000000..28f19ab2cd
--- /dev/null
+++ b/prometheus/doc/alert-manager.md
@@ -0,0 +1,98 @@
+Building on top of [Prometheus Alertmanager](https://prometheus.io/docs/alerting/alertmanager/),
+OpenPAI supports sending emails about cluster failures and issues to administrators since the 0.7
+release.
+
+# Configuration
+
+To enable Alert Manager, follow these steps:
+* Select a node to deploy Alert Manager; both master and worker nodes can serve as the Alert Manager node, as in the snippet below.
+* In the `cluster-configuration` file, set `alert-manager: "true"` for this node.
+* Configure Alert Manager by adding an `alerting` field under `prometheus` in the services-configuration file.
+
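+For instance, the Alert Manager node's entry in `cluster-configuration.yaml` might look like this
+(a minimal sketch; the host values are illustrative and unrelated fields are elided):
+
+```yaml
+machine-list:
+  - hostname: node1          # illustrative host name
+    hostip: 10.0.0.1         # illustrative ip
+    alert-manager: "true"    # deploy Alert Manager on this node
+```
+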
+Refer to the example [`cluster-configuration`](../../cluster-configuration/cluster-configuration.yaml) and
+[`services-configuration`](../../cluster-configuration/services-configuration.yaml) for more
+information.
+
+The `alerting` field has the following subfields:
+
+| Field Name | Description |
+| --- | --- |
+| alert_manager_port | port for Alert Manager to listen on; make sure this port is not already used on the node |
+| alert_receiver | email address that should receive alert emails |
+| smtp_url | SMTP server url for Alert Manager to connect to |
+| smtp_from | email address that alerting emails are sent from |
+| smtp_auth_username | user name used to log in to the SMTP server. This user should be able to send email as `smtp_from`, and can be the same as `smtp_from` |
+| smtp_auth_password | password used to log in to the SMTP server |
+
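+For example, a minimal `alerting` section in `services-configuration.yaml` might look like this
+(all values below are illustrative placeholders, not defaults):
+
+```yaml
+prometheus:
+  alerting:
+    alert_manager_port: 9093
+    alert_receiver: admin@example.com
+    smtp_url: smtp.example.com:25
+    smtp_from: alert-sender@example.com
+    smtp_auth_username: alert-sender@example.com
+    smtp_auth_password: a-secret-password
+```
+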
+More advanced Alert Manager configurations are not supported in pai; see the official Alert Manager
+[documentation](https://prometheus.io/docs/alerting/configuration/) for more options.
+
+# Alerting rule
+
+To facilitate OpenPAI usage, we have predefined a few alerting rules for OpenPAI.
+Check out the [rule directory](../prometheus-alert) to see the rules we have defined.
+
+The triggering conditions of these rules are:
+
+| Rule name | Triggering condition |
+| --- | --- |
+| k8sApiServerNotOk | response from the api server's healthz page is not ok, or a connection error occurs |
+| k8sEtcdNotOk | response from the etcd server's healthz page is not ok, or a connection error occurs |
+| k8sKubeletNotOk | response from a kubelet's healthz page is not ok, or a connection error occurs |
+| k8sDockerDaemonNotOk | a docker daemon's status is not ok |
+| NodeFilesystemUsage | a node's free file system space is less than 20% |
+| NodeMemoryUsage | a node's free memory is less than 20% |
+| NodeCPUUsage | a node's cpu idle time is less than 20% |
+| NodeDiskPressure | kubernetes indicates a node is under disk pressure |
+| PaiServicePodNotRunning | kubernetes indicates a pai service pod is not in running status |
+| PaiServicePodNotReady | kubernetes indicates a pai service pod is not in ready status |
+
+If you want to add more rules, please reference the syntax
+[here](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/).
+After adding rules, you should stop and start prometheus using paictl:
+
+```
+cd $pai-management
+./paictl.py service stop -p $pai-config -n prometheus
+./paictl.py service start -p $pai-config -n prometheus
+```
+
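+As an illustration, a custom rule file (say `custom.rules`, a hypothetical name) added to the rule
+directory could look like the following; the expression and threshold are examples only:
+
+```yaml
+groups:
+  - name: custom
+    rules:
+      - alert: PaiServicePodPending
+        expr: pai_pod_count{phase="pending"} > 0
+        for: 5m
+        annotations:
+          summary: "{{$labels.name}} in {{$labels.host_ip}} is pending for more than 5 minutes"
+```
+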
+Please fire a pull request if you find any rule useful.
+
+# Muting firing alert
+
+OpenPAI leverages [amtool](https://github.com/prometheus/alertmanager#amtool) to interact with
+Alert Manager. If you're using [dev-box](../../pai-management/doc/cluster-bootup.md#c-step-0) to do
+cluster management, amtool may already be installed for you; you can execute
+`amtool` in your shell to check. If it hasn't been installed, please install it using the method provided
+in the amtool [README](https://github.com/prometheus/alertmanager#install-1).
+
+To silence an alert:
+
+```
+$ amtool silence add alertname=Test_Alert
+b3ede22e-ca14-4aa0-932c-ca2f3445f926
+
+$ amtool silence add alertname="Test_Alert" instance=~".+0"
+e48cb58a-0b17-49ba-b734-3585139b1d25
+```
+
+Please refer to the amtool
+[README](https://github.com/prometheus/alertmanager#examples) for more example commands for managing
+alerts.
+
+Please remember that the script and amtool are only usable when you have Alert Manager up and running.
+You should also have amtool configured correctly.
+To ease configuration, OpenPAI provides a script that generates an amtool config file for you. You can use
+
+```
+python utilities/gen-amtool-config.py -p /cluster-configuration
+```
+
+to generate the config, or use
+
+```
+python utilities/gen-amtool-config.py -p /cluster-configuration -f
+```
+
+to overwrite the existing amtool config.
diff --git a/prometheus/doc/exporter-for-other-services.md b/prometheus/doc/exporter-for-other-services.md
index 0c156b6fe8..86c0c010c6 100644
--- a/prometheus/doc/exporter-for-other-services.md
+++ b/prometheus/doc/exporter-for-other-services.md
@@ -148,4 +148,67 @@ If service decide to leverage node-exporter to expose metrics, service should mo
located in `/datastorage/prometheus` and write file with extension `.prom`.
For expose using port, service should annotate pod with `prometheus.io/scrape`, `prometheus.io/path`
-and `prometheus.io/port`.
+and `prometheus.io/port`. For services with label key `app`, prometheus will generate a `pai_service_name`
+label for the generated metrics.
+
+Prometheus will list all pods, select those whose `prometheus.io/scrape` annotation is true, and then
+issue a request to `http://${pod_host_ip}:${prometheus.io/port}${prometheus.io/path}` to
+get the metrics.
+
+For example, the following deployment:
+
+```yaml
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+ name: foo
+spec:
+ selector:
+ matchLabels:
+ app: foo
+ template:
+ metadata:
+ name: foo
+ labels:
+ app: foo
+ annotations:
+ prometheus.io/scrape: "true"
+ prometheus.io/path: "/"
+ prometheus.io/port: "80"
+        pod_gauge = watchdog.gen_pai_pod_gauge()
+ containers:
+ - name: nginx
+ image: nginx
+ ports:
+ - containerPort: 80
+        self.assertEqual(0, len(container_gauge.samples))
+ - name: workdir
+ mountPath: /usr/share/nginx/html
+ initContainers:
+ - name: install
+ image: busybox
+ command: ["sh", "-c", "echo pai_service_count 1 > /work-dir/index.html"]
+ volumeMounts:
+ - name: workdir
+ mountPath: "/work-dir"
+ volumes:
+ - name: workdir
+ emptyDir: {}
+ hostNetwork: true
+```
+
+will generate metrics like:
+
+`pai_service_count{instance="10.151.40.4:80",job="pai_serivce_exporter",k8s_pod_name="foo-69869cc4d6",pai_service_name="foo"} 1`
+
+The metric labels `k8s_pod_name` and `pai_service_name` are generated from the pod name and the value of the pod label with key `app`.
+
+In the above example, we use nginx to accept requests from prometheus and expose the metrics we wrote in
+initContainers. It is best for service owners to use a
+[prometheus client](https://prometheus.io/docs/instrumenting/clientlibs/) to expose metrics, as sketched below.
+
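+As a minimal sketch of this approach (the metric name is illustrative, and the port must match the
+`prometheus.io/port` annotation):
+
+```python
+import time
+
+from prometheus_client import Gauge, start_http_server
+
+# hypothetical metric for illustration
+foo_health = Gauge("foo_health", "health status of the foo service")
+
+if __name__ == "__main__":
+    start_http_server(80)  # serve /metrics on the annotated port
+    while True:
+        foo_health.set(1)  # update the metric in the service's own loop
+        time.sleep(60)
+```
+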
+Note that pai currently does not assume the
+[kubernetes network model](https://kubernetes.io/docs/concepts/cluster-administration/networking/)
+is enabled, so services have to use hostNetwork. This implies service owners should carefully choose
+port numbers to avoid conflicts; it is best to configure the port through pai service configuration.
\ No newline at end of file
diff --git a/prometheus/doc/watchdog-metrics.md b/prometheus/doc/watchdog-metrics.md
index 91951948b0..19449171af 100644
--- a/prometheus/doc/watchdog-metrics.md
+++ b/prometheus/doc/watchdog-metrics.md
@@ -64,6 +64,17 @@ vi watchdog-xx.log
| k8s_etcd_count | has error key in label, if error != "ok", means etcd is not functioning correctly |
| k8s_kubelet_count | has error key in label, if error != "ok", means kubelet is not functioning correctly |
+## Other Metrics
+| Metric name| Description |
+| ---------- | ----------- |
+| process_error_log_total | count of error/exception logs |
+| k8s_api_healthz_resp_latency_seconds | response latency of the k8s api healthz page |
+| ssh_resp_latency_seconds | latency of sshing into a worker node and executing the docker daemon check command |
+| k8s_etcd_resp_latency_seconds | response latency of the etcd healthz page |
+| k8s_kubelet_resp_latency_seconds | response latency of the kubelet healthz page |
+| k8s_api_list_pods_latency_seconds | latency of listing pods from the k8s api |
+| k8s_api_list_nodes_latency_seconds | latency of listing nodes from the k8s api |
+
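+These latency metrics are Prometheus histograms. For example, the 95th percentile of k8s api healthz
+latency over the past 5 minutes can be queried with:
+
+```
+histogram_quantile(0.95, sum(rate(k8s_api_healthz_resp_latency_seconds_bucket[5m])) by (le))
+```
+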
# Alerting
Alerting rules are under `[prometheus/prometheus-alert](../prometheus-alert)`, we added some basic
healthcheck rules for pai service and node. You can add more alert rule by adding file `*.rules` to
diff --git a/prometheus/exporter/gpu_exporter.py b/prometheus/exporter/gpu_exporter.py
index da81c44c12..64f2484b95 100644
--- a/prometheus/exporter/gpu_exporter.py
+++ b/prometheus/exporter/gpu_exporter.py
@@ -64,6 +64,13 @@ def collect_gpu_info():
else:
logger.exception("command '%s' return with error (code %d): %s",
e.cmd, e.returncode, e.output)
+    except OSError as e:
+        if e.errno == os.errno.ENOENT:
+            logger.warning("nvidia-smi not found")
+        else:
+            logger.exception("failed to execute nvidia-smi")
+
+ return None
+
+
def convert_gpu_info_to_metrics(gpuInfos):
if gpuInfos is None:
diff --git a/prometheus/exporter/healthy_check.py b/prometheus/exporter/healthy_check.py
deleted file mode 100644
index bdb93cd65f..0000000000
--- a/prometheus/exporter/healthy_check.py
+++ /dev/null
@@ -1,80 +0,0 @@
-#!/usr/bin/python
-# Copyright (c) Microsoft Corporation
-# All rights reserved.
-#
-# MIT License
-#
-# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
-# documentation files (the "Software"), to deal in the Software without restriction, including without limitation
-# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
-# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
-# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
-#
-# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
-# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
-# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-import subprocess
-import sys
-import logging
-from logging.handlers import RotatingFileHandler
-import os
-import re
-
-import utils
-
-logger = logging.getLogger(__name__)
-
-def main():
- runTimeException = []
- gpuExists = False
-
- try:
- gpuOutput = utils.check_output(["lspci"])
- r = re.search("[0-9a-fA-F][0-9a-fA-F]:[0-9a-fA-F][0-9a-fA-F].[0-9] (3D|VGA compatible) controller: NVIDIA Corporation.*", gpuOutput, flags=0)
- if r is not None:
- gpuExists = True
- except subprocess.CalledProcessError as e:
- runTimeException.append("lspci")
- logger.error("command '%s' return with error (code %d): %s", e.cmd, e.returncode, e.output)
-
- if gpuExists:
- try:
- smiOutput = utils.check_output(["nvidia-smi", "-q", "-x"])
- except subprocess.CalledProcessError as e:
- runTimeException.append("nvidia-smi")
- logger.error("command '%s' return with error (code %d): %s", e.cmd, e.returncode, e.output)
-
- try:
- dockerDockerInspect = utils.check_output(["docker", "inspect", "--help"])
- except subprocess.CalledProcessError as e:
- runTimeException.append("docker_inspect")
- logger.error("command '%s' return with error (code %d): %s", e.cmd, e.returncode, e.output)
-
- try:
- dockerDockerStats = subprocess.check_output(["docker", "stats", "--no-stream", "--format",
- "table {{.Container}}, {{.CPUPerc}},{{.MemUsage}},{{.NetIO}},{{.BlockIO}},{{.MemPerc}}"])
- except subprocess.CalledProcessError as e:
- runTimeException.append("docker_stats")
- logger.error("command '%s' return with error (code %d): %s", e.cmd, e.returncode, e.output)
-
- if not os.path.exists("/datastorage/prometheus/job_exporter.prom"):
- runTimeException.append(joblogfile)
- logger.error("/datastorage/prometheus/job_exporter.prom does not exists")
-
- if len(runTimeException) > 0:
- exception = "| ".join(runTimeException)
- raise RuntimeError("gpu-exporter readiness probe failed, error component:" + exception)
-
-if __name__ == "__main__":
- rootLogger = logging.getLogger()
- rootLogger.setLevel(logging.INFO)
- fh = RotatingFileHandler("/datastorage/prometheus/node_exporter_probe.log", maxBytes= 1024 * 1024 * 10, backupCount=5)
- fh.setLevel(logging.INFO)
- formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(filename)s:%(lineno)s - %(message)s")
- fh.setFormatter(formatter)
- rootLogger.addHandler(fh)
-
- main()
diff --git a/prometheus/exporter/job_exporter.py b/prometheus/exporter/job_exporter.py
index c3ce3d5e76..c7fc60df44 100644
--- a/prometheus/exporter/job_exporter.py
+++ b/prometheus/exporter/job_exporter.py
@@ -20,7 +20,6 @@
import sys
import time
import logging
-from logging.handlers import RotatingFileHandler
import docker_stats
import docker_inspect
@@ -43,7 +42,6 @@ def parse_from_labels(labels):
else:
otherLabels[key] = val
-
return gpuIds, otherLabels
@@ -83,18 +81,10 @@ def collect_job_metrics(gpuInfos):
return result
def main(argv):
- logDir = argv[0]
- gpuMetricsPath = logDir + "/gpu_exporter.prom"
- jobMetricsPath = logDir + "/job_exporter.prom"
- timeSleep = int(argv[1])
-
- rootLogger = logging.getLogger()
- rootLogger.setLevel(logging.INFO)
- fh = RotatingFileHandler(logDir + "/gpu_exporter.log", maxBytes= 1024 * 1024 * 10, backupCount=5)
- fh.setLevel(logging.INFO)
- formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(filename)s:%(lineno)s - %(message)s")
- fh.setFormatter(formatter)
- rootLogger.addHandler(fh)
+ log_dir = argv[0]
+ gpu_metrics_path = log_dir + "/gpu_exporter.prom"
+ job_metrics_path = log_dir + "/job_exporter.prom"
+ time_sleep_s = int(argv[1])
iter = 0
@@ -104,21 +94,22 @@ def main(argv):
try:
logger.info("job exporter running {0} iteration".format(str(iter)))
iter += 1
- gpuInfos = singleton.try_get()
+ gpu_infos = singleton.try_get()
- gpuMetrics = gpu_exporter.convert_gpu_info_to_metrics(gpuInfos)
- if gpuMetrics is not None:
- utils.export_metrics_to_file(gpuMetricsPath, gpuMetrics)
+ gpu_metrics = gpu_exporter.convert_gpu_info_to_metrics(gpu_infos)
+ utils.export_metrics_to_file(gpu_metrics_path, gpu_metrics)
# join with docker stats metrics and docker inspect labels
- jobMetrics = collect_job_metrics(gpuInfos)
- if jobMetrics is not None:
- utils.export_metrics_to_file(jobMetricsPath, jobMetrics)
+ job_metrics = collect_job_metrics(gpu_infos)
+ utils.export_metrics_to_file(job_metrics_path, job_metrics)
except Exception as e:
logger.exception("exception in job exporter loop")
- time.sleep(timeSleep)
+ time.sleep(time_sleep_s)
if __name__ == "__main__":
+ logging.basicConfig(format="%(asctime)s - %(levelname)s - %(filename)s:%(lineno)s - %(message)s",
+ level=logging.INFO)
+
main(sys.argv[1:])
diff --git a/prometheus/exporter/no_older_than.py b/prometheus/exporter/no_older_than.py
new file mode 100644
index 0000000000..03ccbda354
--- /dev/null
+++ b/prometheus/exporter/no_older_than.py
@@ -0,0 +1,42 @@
+#!/usr/bin/python
+# Copyright (c) Microsoft Corporation
+# All rights reserved.
+#
+# MIT License
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
+# documentation files (the "Software"), to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
+# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
+# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+import argparse
+import datetime
+import os
+
+def check_no_older_than(paths, delta):
+ """ raise RuntimeError exception if any path in paths is older than `now - delta` """
+ now = datetime.datetime.now()
+ delta = datetime.timedelta(seconds=delta)
+ oldest = now - delta
+
+ for path in paths:
+ mtime = os.path.getmtime(path)
+ mtime = datetime.datetime.fromtimestamp(mtime)
+ if oldest > mtime:
+            raise RuntimeError("{} was updated more than {} seconds ago".format(path, delta.total_seconds()))
+
+
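+# example usage (path is illustrative):
+#   python no_older_than.py /datastorage/prometheus/job_exporter.prom --delta 60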
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument("paths", nargs="+", help="file to be checked")
+ parser.add_argument("-d", "--delta", type=int, default=60, help="check file is no older than -d seconds")
+ args = parser.parse_args()
+
+ check_no_older_than(args.paths, args.delta)
diff --git a/prometheus/exporter/utils.py b/prometheus/exporter/utils.py
index 99ac04b845..ad63250d5a 100644
--- a/prometheus/exporter/utils.py
+++ b/prometheus/exporter/utils.py
@@ -51,10 +51,13 @@ def __repr__(self):
return "{}{} {}".format(self.name, labels, self.value)
def export_metrics_to_file(path, metrics):
+    """ even if metrics is None, still open the path to update the file's time stamp;
+    the readiness probe relies on this """
with codecs.open(path, "w", encoding="utf-8") as f:
- for metric in metrics:
- f.write(str(metric))
- f.write("\n")
+ if metrics is not None:
+ for metric in metrics:
+ f.write(str(metric))
+ f.write("\n")
def check_output(*args, **kwargs):
diff --git a/prometheus/exporter/watchdog.py b/prometheus/exporter/watchdog.py
index a7b3348c4b..8ca8424746 100644
--- a/prometheus/exporter/watchdog.py
+++ b/prometheus/exporter/watchdog.py
@@ -16,82 +16,131 @@
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+import argparse
import urlparse
import os
-import subprocess
import json
import sys
import requests
import logging
from logging.handlers import RotatingFileHandler
import time
-import common
-import collections
-import copy
+import threading
-import utils
-from utils import Metric
+import paramiko
+import yaml
+from wsgiref.simple_server import make_server
+from prometheus_client import make_wsgi_app, Counter, Summary, Histogram
+from prometheus_client.core import GaugeMetricFamily, CounterMetricFamily, Summary, REGISTRY
logger = logging.getLogger(__name__)
-class MetricEntity(object):
- """ interface that has one method that can convert this obj into Metric """
- def to_metric(self):
- pass
+##### watchdog will generate following metrics
+# Document about these metrics is in `prometheus/doc/watchdog-metrics.md`
+
+error_counter = Counter("process_error_log_total", "total count of error log", ["type"])
+
+api_healthz_histogram = Histogram("k8s_api_healthz_resp_latency_seconds",
+ "Response latency for requesting k8s api healthz (seconds)")
+
+# use `histogram_quantile(0.95, sum(rate(ssh_resp_latency_seconds_bucket[5m])) by (le))`
+# to get the 95th percentile latency in the past 5 minutes.
+ssh_histogram = Histogram("ssh_resp_latency_seconds",
+ "Response latency for ssh (seconds)")
+etcd_healthz_histogram = Histogram("k8s_etcd_resp_latency_seconds",
+ "Response latency for requesting etcd healthz (seconds)")
-class PaiPod(MetricEntity):
- """ represent a pod count, all fields except condition_map should of type string """
+kubelet_healthz_histogram = Histogram("k8s_kubelet_resp_latency_seconds",
+ "Response latency for requesting kubelet healthz (seconds)")
- def __init__(self, name, phase, host_ip, condition_map):
- self.name = name
- self.phase = phase # should be lower case
- self.host_ip = host_ip # maybe None
- self.condition_map = condition_map
+list_pods_histogram = Histogram("k8s_api_list_pods_latency_seconds",
+ "Response latency for list pods from k8s api (seconds)")
- def to_metric(self):
- label = {"name": self.name, "phase": self.phase}
+list_nodes_histogram = Histogram("k8s_api_list_nodes_latency_seconds",
+ "Response latency for list nodes from k8s api (seconds)")
- if self.host_ip is not None:
- label["host_ip"] = self.host_ip
+def gen_pai_pod_gauge():
+ return GaugeMetricFamily("pai_pod_count", "count of pai pod",
+ labels=["service_name", "name", "phase", "host_ip",
+ "initialized", "pod_scheduled", "ready"])
- for k, v in self.condition_map.items():
- label[k] = v
+def gen_pai_container_gauge():
+ return GaugeMetricFamily("pai_container_count", "count of container pod",
+ labels=["service_name", "pod_name", "name", "state", "host_ip", "ready"])
- return Metric("pai_pod_count", label, 1)
+def gen_pai_node_gauge():
+ return GaugeMetricFamily("pai_node_count", "count of pai node",
+ labels=["name", "disk_pressure", "memory_pressure", "out_of_disk", "ready"])
+def gen_docker_daemon_gauge():
+ return GaugeMetricFamily("docker_daemon_count", "count of docker daemon",
+ labels=["host_ip", "error"])
-class PaiContainer(MetricEntity):
- """ represent a container count, all fields should of type string """
+def gen_k8s_component_gauge():
+ return GaugeMetricFamily("k8s_component_count", "count of k8s component",
+ labels=["service_name", "error", "host_ip"])
- def __init__(self, service_name, name, state, ready):
- self.service_name = service_name
- self.name = name
- self.state = state
- self.ready = ready
+##### watchdog will generate above metrics
- def to_metric(self):
- label = {"service_name": self.service_name, "name": self.name, "state": self.state,
- "ready": self.ready}
- return Metric("pai_container_count", label, 1)
+class AtomicRef(object):
+    """ a thread-safe way to store and get an object; callers should not modify data obtained from this ref """
+ def __init__(self):
+ self.data = None
+ self.lock = threading.RLock()
+ def get_and_set(self, new_data):
+ data = None
+ with self.lock:
+ data, self.data = self.data, new_data
+ return data
-class PaiNode(MetricEntity):
- """ will output metric like
- pai_node_count{name="1.2.3.4", out_of_disk="true", memory_pressure="true"} 1 """
+ def get(self):
+ with self.lock:
+ return self.data
- def __init__(self, name, condition_map):
- self.name = name
- # key should be underscore instead of camel case, value should be lower case
- self.condition_map = condition_map
- def to_metric(self):
- label = {"name": self.name}
- for k, v in self.condition_map.items():
- label[k] = v
+class CustomCollector(object):
+ def __init__(self, atomic_ref):
+ self.atomic_ref = atomic_ref
- return Metric("pai_node_count", label, 1)
+ def collect(self):
+ data = self.atomic_ref.get()
+
+ if data is not None:
+ for datum in data:
+ yield datum
+ else:
+ # https://stackoverflow.com/a/6266586
+ # yield nothing
+ return
+ yield
+
+def ssh_exec(host_config, command, histogram=ssh_histogram):
+    with histogram.time():
+ hostip = str(host_config["hostip"])
+ username = str(host_config["username"])
+ password = str(host_config["password"])
+ port = 22
+ if "sshport" in host_config:
+ port = int(host_config["sshport"])
+
+ ssh = paramiko.SSHClient()
+
+ try:
+ ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
+ ssh.connect(hostname=hostip, port=port, username=username, password=password)
+
+ logger.info("Executing the command on host [{0}]: {1}".format(hostip, command))
+
+ stdin, stdout, stderr = ssh.exec_command(command, get_pty=True)
+
+ out = "".join(map(lambda x: x.encode("utf-8"), stdout))
+ err = "".join(map(lambda x: x.encode("utf-8"), stderr))
+ return out, err
+ finally:
+ ssh.close()
def catch_exception(fn, msg, default, *args, **kwargs):
@@ -99,23 +148,17 @@ def catch_exception(fn, msg, default, *args, **kwargs):
try:
return fn(*args, **kwargs)
except Exception as e:
+ error_counter.labels(type="parse").inc()
logger.exception(msg)
return default
-def keep_not_none(item):
- """ used in filter to keep item that is not None """
- return item is not None
-
-
-def to_metric(metric_entity):
- return metric_entity.to_metric()
-
-
-def parse_pod_item(pod):
- """ return pai_pod and list of pai_container, return None on not pai service
+def parse_pod_item(pai_pod_gauge, pai_container_gauge, pod):
+    """ add metrics to pai_pod_gauge or pai_container_gauge if pod is successfully parsed.
Because we are parsing json outputed by k8s, its format is subjected to change,
we should test if field exists before accessing it to avoid KeyError """
+
+ pod_name = pod["metadata"]["name"]
labels = pod["metadata"].get("labels")
if labels is None or "app" not in labels.keys():
logger.warning("unkown pod %s", pod["metadata"]["name"])
@@ -134,21 +177,28 @@ def parse_pod_item(pod):
if status.get("hostIP") is not None:
host_ip = status["hostIP"]
- condition_map = {}
+ initialized = pod_scheduled = ready = "unknown"
conditions = status.get("conditions")
if conditions is not None:
for cond in conditions:
- cond_t = cond["type"].lower() # Initialized|Ready|PodScheduled
+ cond_t = cond["type"] # Initialized|Ready|PodScheduled
cond_status = cond["status"].lower()
- condition_map[cond_t] = cond_status
+ if cond_t == "Initialized":
+ initialized = cond_status
+ elif cond_t == "PodScheduled":
+ pod_scheduled = cond_status
+ elif cond_t == "Ready":
+ ready = cond_status
+ else:
+ error_counter.labels(type="unknown_pod_cond").inc()
+ logger.error("unexpected condition %s in pod %s", cond_t, pod_name)
- pai_pod = PaiPod(service_name, phase, host_ip, condition_map)
+ pai_pod_gauge.add_metric([service_name, pod_name, phase, host_ip,
+ initialized, pod_scheduled, ready], 1)
# generate pai_containers
- pai_containers = []
-
if status.get("containerStatuses") is not None:
container_statuses = status["containerStatuses"]
@@ -164,60 +214,46 @@ def parse_pod_item(pod):
if container_status.get("state") is not None:
state = container_status["state"]
if len(state) != 1:
+ error_counter.labels(type="unexpected_container_state").inc()
logger.error("unexpected state %s in container %s",
json.dumps(state), container_name)
else:
container_state = state.keys()[0].lower()
- pai_containers.append(PaiContainer(service_name, container_name,
- container_state, str(ready).lower()))
-
- return pai_pod, pai_containers
+ pai_container_gauge.add_metric([service_name, pod_name, container_name,
+ container_state, host_ip, str(ready).lower()], 1)
+ return pai_pod_gauge, pai_container_gauge
-def robust_parse_pod_item(item):
- return catch_exception(parse_pod_item,
- "catch exception when parsing pod item",
- None,
- item)
+def process_pods_status(pai_pod_gauge, pai_container_gauge, podsJsonObject):
+ def _map_fn(item):
+ return catch_exception(parse_pod_item,
+ "catch exception when parsing pod item",
+ None,
+ pai_pod_gauge, pai_container_gauge, item)
-def parse_pods_status(podsJsonObject):
- metrics = []
+ map(_map_fn, podsJsonObject["items"])
- results = filter(keep_not_none,
- map(robust_parse_pod_item, podsJsonObject["items"]))
- pai_pods = map(lambda result: result[0], results)
- pai_containers = map(lambda result: result[1], results)
- pai_containers = [subitem for sublist in pai_containers for subitem in sublist]
-
- pod_metrics = map(to_metric, pai_pods)
- container_metrics = map(to_metric, pai_containers)
-
- return pod_metrics + container_metrics
-
-
-def collect_healthz(metric_name, address, port, url):
- label = {"address": address, "error": "ok"}
-
- try:
- healthy = requests.get("http://{}:{}{}".format(address, port, url)).text
-
- if healthy != "ok":
- label["error"] = healthy
- except Exception as e:
- label["error"] = str(e)
- logger.exception("requesting %s:%d%s failed", address, port, url)
-
- return Metric(metric_name, label, 1)
+def collect_healthz(gauge, histogram, service_name, address, port, url):
+ with histogram.time():
+ error = "ok"
+ try:
+ error = requests.get("http://{}:{}{}".format(address, port, url)).text
+ except Exception as e:
+ error_counter.labels(type="healthz").inc()
+ error = str(e)
+ logger.exception("requesting %s:%d%s failed", address, port, url)
+ gauge.add_metric([service_name, error, address], 1)
-def collect_k8s_componentStaus(api_server_ip, api_server_port, nodesJsonObject):
- metrics = []
- metrics.append(collect_healthz("k8s_api_server_count", api_server_ip, api_server_port, "/healthz"))
- metrics.append(collect_healthz("k8s_etcd_count", api_server_ip, api_server_port, "/healthz/etcd"))
+def collect_k8s_componentStaus(k8s_gauge, api_server_ip, api_server_port, nodesJsonObject):
+ collect_healthz(k8s_gauge, api_healthz_histogram,
+ "k8s_api_server", api_server_ip, api_server_port, "/healthz")
+ collect_healthz(k8s_gauge, etcd_healthz_histogram,
+ "k8s_etcd", api_server_ip, api_server_port, "/healthz/etcd")
# check kubelet
nodeItems = nodesJsonObject["items"]
@@ -225,15 +261,14 @@ def collect_k8s_componentStaus(api_server_ip, api_server_port, nodesJsonObject):
for name in nodeItems:
ip = name["metadata"]["name"]
- metrics.append(collect_healthz("k8s_kubelet_count", ip, 10255, "/healthz"))
-
- return metrics
+ collect_healthz(k8s_gauge, kubelet_healthz_histogram,
+ "k8s_kubelet", ip, 10255, "/healthz")
-def parse_node_item(node):
+def parse_node_item(pai_node_gauge, node):
name = node["metadata"]["name"]
- cond_map = {}
+ disk_pressure = memory_pressure = out_of_disk = ready = "unknown"
if node.get("status") is not None:
status = node["status"]
@@ -242,130 +277,145 @@ def parse_node_item(node):
conditions = status["conditions"]
for cond in conditions:
- cond_t = utils.camel_to_underscore(cond["type"])
+ cond_t = cond["type"]
status = cond["status"].lower()
- cond_map[cond_t] = status
+ if cond_t == "DiskPressure":
+ disk_pressure = status
+ elif cond_t == "MemoryPressure":
+ memory_pressure = status
+ elif cond_t == "OutOfDisk":
+ out_of_disk = status
+ elif cond_t == "Ready":
+ ready = status
+ else:
+ error_counter.labels(type="unknown_node_cond").inc()
+ logger.error("unexpected condition %s in node %s", cond_t, name)
else:
logger.warning("unexpected structure of node %s: %s", name, json.dumps(node))
- return PaiNode(name, cond_map)
-
-
-def robust_parse_node_item(item):
- return catch_exception(parse_node_item,
- "catch exception when parsing node item",
- None,
- item)
+ pai_node_gauge.add_metric([name, disk_pressure, memory_pressure, out_of_disk, ready], 1)
+ return pai_node_gauge
-def parse_nodes_status(nodesJsonObject):
- nodeItems = nodesJsonObject["items"]
- return map(to_metric,
- map(robust_parse_node_item, nodesJsonObject["items"]))
+def process_nodes_status(pai_node_gauge, nodesJsonObject):
+ def _map_fn(item):
+ return catch_exception(parse_node_item,
+ "catch exception when parsing node item",
+ None,
+ pai_node_gauge, item)
+ map(_map_fn, nodesJsonObject["items"])
-def collect_docker_daemon_status(hosts):
- metrics = []
+def collect_docker_daemon_status(docker_daemon_gauge, hosts):
cmd = "sudo systemctl is-active docker | if [ $? -eq 0 ]; then echo \"active\"; else exit 1 ; fi"
for host in hosts:
- label = {"ip": host["hostip"], "error": "ok"}
+ host_ip = host["hostip"]
+ error = "ok"
try:
- flag = common.ssh_shell_paramiko(host, cmd)
- if not flag:
- label["error"] = "config" # configuration is not correct
+ out, err = ssh_exec(host, cmd)
+ if "active" not in out:
+ error = "inactive"
except Exception as e:
- label["error"] = str(e)
- logger.exception("ssh to %s failed", host["hostip"])
+ error_counter.labels(type="docker").inc()
+ error = str(e)
+ logger.exception("ssh to %s failed", host_ip)
- metrics.append(Metric("docker_daemon_count", label, 1))
+ docker_daemon_gauge.add_metric([host_ip, error], 1)
- return metrics
+ return docker_daemon_gauge
-def log_and_export_metrics(path, metrics):
- utils.export_metrics_to_file(path, metrics)
- for metric in metrics:
- logger.info(metric)
+def load_machine_list(configFilePath):
+ with open(configFilePath, "r") as f:
+ return yaml.load(f)["hosts"]
-def load_machine_list(configFilePath):
- cluster_config = common.load_yaml_file(configFilePath)
- return cluster_config['hosts']
-
-
-#####
-# Watchdog generate 7 metrics:
-# * pai_pod_count
-# * pai_container_count
-# * pai_node_count
-# * docker_daemon_count
-# * k8s_api_server_count
-# * k8s_etcd_count
-# * k8s_kubelet_count
-# Document about these metrics is in `prometheus/doc/watchdog-metrics.md`
-#####
+def request_with_histogram(url, histogram):
+ with histogram.time():
+ return requests.get(url).json()
+
+
+def try_remove_old_prom_file(path):
+    """ try to remove the old prom file, since old prom files are exposed by node-exporter;
+    if we do not remove them, node-exporter will keep exposing stale metrics """
+ if os.path.isfile(path):
+ try:
+ os.unlink(path)
+ except Exception as e:
+            logger.warning("can not remove old prom file %s", path)
+
+def main(args):
+ logDir = args.log
-def main(argv):
- logDir = argv[0]
- timeSleep = int(argv[1])
+ try_remove_old_prom_file(logDir + "/watchdog.prom")
- address = os.environ["K8S_API_SERVER_URI"]
+ address = args.k8s_api
parse_result = urlparse.urlparse(address)
api_server_ip = parse_result.hostname
api_server_port = parse_result.port or 80
- hosts = load_machine_list("/etc/watchdog/config.yml")
+ hosts = load_machine_list(args.hosts)
- rootLogger = logging.getLogger()
- rootLogger.setLevel(logging.INFO)
- fh = RotatingFileHandler(logDir + "/watchdog.log", maxBytes= 1024 * 1024 * 100, backupCount=5)
- fh.setLevel(logging.INFO)
- formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(filename)s:%(lineno)s - %(message)s")
- fh.setFormatter(formatter)
- rootLogger.addHandler(fh)
+ list_pods_url = "{}/api/v1/namespaces/default/pods/".format(address)
+ list_nodes_url = "{}/api/v1/nodes/".format(address)
+
+ atomic_ref = AtomicRef()
+
+ REGISTRY.register(CustomCollector(atomic_ref))
+
+ app = make_wsgi_app(REGISTRY)
+ httpd = make_server("", int(args.port), app)
+ t = threading.Thread(target=httpd.serve_forever)
+ t.daemon = True
+ t.start()
while True:
- try:
- metrics = []
+        # these gauges are regenerated on each iteration
+ pai_pod_gauge = gen_pai_pod_gauge()
+ pai_container_gauge = gen_pai_container_gauge()
+ pai_node_gauge = gen_pai_node_gauge()
+ docker_daemon_gauge = gen_docker_daemon_gauge()
+ k8s_gauge = gen_k8s_component_gauge()
+ try:
# 1. check service level status
- podsStatus = requests.get("{}/api/v1/namespaces/default/pods/".format(address)).json()
- pods_metrics = parse_pods_status(podsStatus)
- if pods_metrics is not None:
- metrics.extend(pods_metrics)
+ podsStatus = request_with_histogram(list_pods_url, list_pods_histogram)
+ process_pods_status(pai_pod_gauge, pai_container_gauge, podsStatus)
# 2. check nodes level status
- nodesStatus = requests.get("{}/api/v1/nodes/".format(address)).json()
- nodes_metrics = parse_nodes_status(nodesStatus)
- if nodes_metrics is not None:
- metrics.extend(nodes_metrics)
+ nodesStatus = request_with_histogram(list_nodes_url, list_nodes_histogram)
+ process_nodes_status(pai_node_gauge, nodesStatus)
# 3. check docker deamon status
- docker_daemon_metrics = collect_docker_daemon_status(hosts)
- if docker_daemon_metrics is not None:
- metrics.extend(docker_daemon_metrics)
+ collect_docker_daemon_status(docker_daemon_gauge, hosts)
# 4. check k8s level status
- k8s_metrics = collect_k8s_componentStaus(api_server_ip, api_server_port, nodesStatus)
- if k8s_metrics is not None:
- metrics.extend(k8s_metrics)
-
- # 5. log and export
- log_and_export_metrics(logDir + "/watchdog.prom", metrics)
+ collect_k8s_componentStaus(k8s_gauge, api_server_ip, api_server_port, nodesStatus)
except Exception as e:
+ error_counter.labels(type="unknown").inc()
logger.exception("watchdog failed in one iteration")
- # do not lost metrics due to exception
- log_and_export_metrics(logDir + "/watchdog.prom", metrics)
+ atomic_ref.get_and_set([pai_pod_gauge, pai_container_gauge, pai_node_gauge,
+ docker_daemon_gauge, k8s_gauge])
- time.sleep(timeSleep)
+ time.sleep(float(args.interval))
-# python watch_dog.py /datastorage/prometheus /usr/local/cluster-configuration.yaml 3
-# requires env K8S_API_SERVER_URI set to correct value, eg. http://10.151.40.133:8080
+# python watchdog.py http://10.151.40.133:8080
if __name__ == "__main__":
- main(sys.argv[1:])
+ parser = argparse.ArgumentParser()
+ parser.add_argument("k8s_api", help="kubernetes api uri eg. http://10.151.40.133:8080")
+    parser.add_argument("--log", "-l", help="log directory", default="/datastorage/prometheus")
+    parser.add_argument("--interval", "-i", help="interval between two collections, in seconds", default="30")
+    parser.add_argument("--port", "-p", help="port to expose metrics on", default="9101")
+    parser.add_argument("--hosts", "-m", help="path to a yaml file that contains host info", default="/etc/watchdog/config.yml")
+ args = parser.parse_args()
+
+ logging.basicConfig(format="%(asctime)s - %(levelname)s - %(filename)s:%(lineno)s - %(message)s",
+ level=logging.INFO)
+
+ main(args)
diff --git a/prometheus/exporter/yarn_exporter.py b/prometheus/exporter/yarn_exporter.py
new file mode 100644
index 0000000000..1d116b49cb
--- /dev/null
+++ b/prometheus/exporter/yarn_exporter.py
@@ -0,0 +1,143 @@
+#!/usr/bin/env python3
+# Copyright (c) Microsoft Corporation
+# All rights reserved.
+#
+# MIT License
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
+# documentation files (the "Software"), to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
+# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
+# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+from datetime import datetime, time, timedelta
+from wsgiref.simple_server import make_server
+from collections import defaultdict
+import urllib.parse
+import argparse
+
+from prometheus_client.core import GaugeMetricFamily, CounterMetricFamily, REGISTRY
+from prometheus_client import make_wsgi_app
+import attr
+import requests
+
+class YarnCollector(object):
+ api_path = '/'
+
+ def __init__(self, endpoint, cluster_name='yarn'):
+ self.endpoint = endpoint
+ self.cluster_name = cluster_name
+
+ @property
+ def metric_url(self):
+ return urllib.parse.urljoin(self.endpoint, self.api_path)
+
+ def collect(self):
+        raise NotImplementedError
+
+@attr.s
+class YarnMetric(object):
+ GAUGE = 'gauge'
+ COUNTER = 'counter'
+ supported_type = [GAUGE, COUNTER]
+
+ namespace = "yarn"
+
+ name = attr.ib()
+ metric_type = attr.ib()
+
+ @metric_type.validator
+ def check(self, _, value):
+ if value not in self.supported_type:
+            raise ValueError('Parameter metric_type must be one of {0}, not {1}'.format(self.supported_type, value))
+
+ description = attr.ib()
+ labels = attr.ib(default=attr.Factory(list))
+
+ @property
+ def metric_name(self):
+ return '{0}_{1}'.format(self.namespace, self.name)
+
+ def create_metric(self):
+ if self.metric_type == self.GAUGE:
+ return GaugeMetricFamily(self.metric_name, self.description, labels=self.labels)
+ elif self.metric_type == self.COUNTER:
+ return CounterMetricFamily(self.metric_name, self.description, labels=self.labels)
+ else:
+            raise ValueError('Property metric_type must be one of {0}, not {1}'.format(self.supported_type, self.metric_type))
+
+class YarnMetricCollector(YarnCollector):
+ api_path = '/ws/v1/cluster/metrics'
+
+ def collect(self):
+ response = requests.get(self.metric_url, allow_redirects=True)
+ response.raise_for_status()
+ metric = response.json()['clusterMetrics']
+
+ total_gpu_num = YarnMetric('total_gpu_num', YarnMetric.COUNTER,
+                                   'The total number of GPUs in the cluster', ['cluster']).create_metric()
+ total_gpu_num.add_metric([self.cluster_name], metric['totalGPUs'])
+ yield total_gpu_num
+
+ gpus_used = YarnMetric('gpus_used', YarnMetric.COUNTER,
+                               'The number of allocated GPUs', ['cluster']).create_metric()
+ gpus_used.add_metric([self.cluster_name], metric['allocatedGPUs'])
+ yield gpus_used
+
+ nodes_all = YarnMetric('nodes_all', YarnMetric.GAUGE,
+ 'The total number of nodes', ['cluster']).create_metric()
+ nodes_all.add_metric([self.cluster_name], metric['totalNodes'])
+ yield nodes_all
+
+ nodes_active = YarnMetric('nodes_active', YarnMetric.GAUGE,
+ 'The number of active nodes', ['cluster']).create_metric()
+ nodes_active.add_metric([self.cluster_name], metric['activeNodes'])
+ yield nodes_active
+
+ # nodes_lost = YarnMetric('nodes_lost', YarnMetric.GAUGE,
+ # 'The number of lost nodes', ['cluster']).create_metric()
+ # nodes_lost.add_metric([self.cluster_name], metric['lostNodes'])
+ # yield nodes_lost
+
+ # nodes_unhealthy = YarnMetric('nodes_unhealthy', YarnMetric.GAUGE,
+ # 'The number of unhealthy nodes', ['cluster']).create_metric()
+ # nodes_unhealthy.add_metric([self.cluster_name], metric['unhealthyNodes'])
+ # yield nodes_unhealthy
+
+ # nodes_decommissioned = YarnMetric('nodes_decommissioned', YarnMetric.COUNTER,
+ # 'The number of nodes decommissioned', ['cluster']).create_metric()
+ # nodes_decommissioned.add_metric([self.cluster_name], metric['decommissionedNodes'])
+ # yield nodes_decommissioned
+
+ # nodes_rebooted = YarnMetric('nodes_rebooted', YarnMetric.COUNTER,
+ # 'The number of nodes rebooted', ['cluster']).create_metric()
+ # nodes_rebooted.add_metric([self.cluster_name], metric['rebootedNodes'])
+ # yield nodes_rebooted
+
+def get_parser():
+ parser = argparse.ArgumentParser()
+ parser.add_argument("yarn_url", help="Yarn rest api address, eg: http://127.0.0.1:8088")
+ parser.add_argument("--cluster-name", "-n", help="Yarn cluster name",
+ default="cluster_0")
+    parser.add_argument("--port", "-p", help="Exporter listen port", default="9459")
+ parser.add_argument("--host", "-H", help="Exporter host address", default="0.0.0.0")
+    parser.add_argument("--collected-apps", "-c", nargs="*",
+                        help="Names of applications whose running status should be collected")
+
+ return parser
+
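+# example usage (address is illustrative):
+#   python3 yarn_exporter.py http://127.0.0.1:8088 -n cluster_0 -p 9459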
+if __name__ == "__main__":
+ args = get_parser().parse_args()
+
+ REGISTRY.register(YarnMetricCollector(args.yarn_url + '/metrics', args.cluster_name))
+ app = make_wsgi_app(REGISTRY)
+ httpd = make_server(args.host, int(args.port), app)
+ httpd.serve_forever()
\ No newline at end of file
diff --git a/prometheus/prometheus-alert/pai-services.rules b/prometheus/prometheus-alert/pai-services.rules
index 79ca719645..57c88f3222 100644
--- a/prometheus/prometheus-alert/pai-services.rules
+++ b/prometheus/prometheus-alert/pai-services.rules
@@ -28,6 +28,7 @@ groups:
annotations:
summary: "{{$labels.name}} in {{$labels.host_ip}} not running detected"
description: "{{$labels.name}} in {{$labels.host_ip}} is not running"
+
- alert: PaiServicePodNotReady
expr: pai_pod_count{phase="running", ready="false"} > 0
for: 1m
@@ -36,3 +37,12 @@ groups:
annotations:
summary: "{{$labels.name}} in {{$labels.host_ip}} not ready detected"
description: "{{$labels.name}} in {{$labels.host_ip}} not ready detected"
+
+ - alert: PaiServiceNotUp
+ expr: up != 1
+ for: 1m
+ labels:
+ type: pai_service
+ annotations:
+ summary: "{{$labels.name}} in {{$labels.host_ip}} not up detected"
+ description: "{{$labels.name}} in {{$labels.host_ip}} not up detected"
diff --git a/prometheus/test/test_watchdog.py b/prometheus/test/test_watchdog.py
index 7c21130cba..cf8ce784be 100644
--- a/prometheus/test/test_watchdog.py
+++ b/prometheus/test/test_watchdog.py
@@ -62,20 +62,33 @@ def get_data_test_input(self, path):
def test_parse_pods_status(self):
obj = json.loads(self.get_data_test_input("data/pods_list.json"))
- metrics = watchdog.parse_pods_status(obj)
- self.assertTrue(len(metrics) > 0)
+ pod_gauge = watchdog.gen_pai_node_gauge()
+ container_gauge = watchdog.gen_pai_container_gauge()
- def test_parse_nodes_status(self):
+ watchdog.process_pods_status(pod_gauge, container_gauge, obj)
+
+ self.assertTrue(len(pod_gauge.samples) > 0)
+ self.assertTrue(len(container_gauge.samples) > 0)
+
+ def test_process_nodes_status(self):
obj = json.loads(self.get_data_test_input("data/nodes_list.json"))
- metrics = watchdog.parse_nodes_status(obj)
- self.assertTrue(len(metrics) > 0)
+ gauge = watchdog.gen_pai_node_gauge()
+
+ watchdog.process_nodes_status(gauge, obj)
- def test_parse_pods_with_no_condition(self):
+ self.assertTrue(len(gauge.samples) > 0)
+
+ def test_process_pods_with_no_condition(self):
obj = json.loads(self.get_data_test_input("data/no_condtion_pod.json"))
- metrics = watchdog.parse_pods_status(obj)
- self.assertTrue(len(metrics) > 0)
+ pod_gauge = watchdog.gen_pai_node_gauge()
+ container_gauge = watchdog.gen_pai_container_gauge()
+
+ watchdog.process_pods_status(pod_gauge, container_gauge, obj)
+
+ self.assertTrue(len(pod_gauge.samples) > 0)
+ self.assertEquals(0, len(container_gauge.samples))
if __name__ == '__main__':
unittest.main()
diff --git a/rest-server/src/models/job.js b/rest-server/src/models/job.js
index 9ecc330de3..6d1fe06ba3 100644
--- a/rest-server/src/models/job.js
+++ b/rest-server/src/models/job.js
@@ -137,8 +137,11 @@ class Job {
if (!data.originalData.outputDir) {
data.outputDir = `${launcherConfig.hdfsUri}/Output/${data.userName}/${name}`;
}
+
for (let fsPath of ['authFile', 'dataDir', 'outputDir', 'codeDir']) {
data[fsPath] = data[fsPath].replace('$PAI_DEFAULT_FS_URI', launcherConfig.hdfsUri);
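+      // also expand $PAI_JOB_NAME and $PAI_USER_NAME (legacy alias $PAI_USERNAME); the negative
+      // lookahead (?![\w\d]) keeps longer names (e.g. a hypothetical $PAI_JOB_NAMESPACE) from matching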
+ data[fsPath] = data[fsPath].replace(/\$PAI_JOB_NAME(?![\w\d])/g, name);
+ data[fsPath] = data[fsPath].replace(/(\$PAI_USER_NAME|\$PAI_USERNAME)(?![\w\d])/g, data.userName);
}
userModel.checkUserVc(data.userName, data.virtualCluster, (error, result) => {
if (error) return next(error);
diff --git a/rest-server/src/models/user.js b/rest-server/src/models/user.js
index 34cebcae0e..1a4f6a4f4e 100644
--- a/rest-server/src/models/user.js
+++ b/rest-server/src/models/user.js
@@ -116,6 +116,9 @@ const updateUserVc = (username, virtualClusters, callback) => {
return callback(err);
}
}
+ if (res.get(etcdConfig.userAdminPath(username)) === 'true') {
+ return callback(createError('Forbidden', 'ForbiddenUserError', 'Admin\'s virtual clusters cannot be updated.'));
+ }
VirtualCluster.prototype.getVcList((vcList, err) => {
if (err) {
return callback(err);
@@ -123,9 +126,7 @@ const updateUserVc = (username, virtualClusters, callback) => {
if (!vcList) {
return callback(createError.unknown('There is no virtual clusters.'));
}
- let updateVcList = (res.get(etcdConfig.userAdminPath(username)) === 'true')
- ? Object.keys(vcList)
- : virtualClusters.trim().split(',').filter((updateVc) => (updateVc !== ''));
+ let updateVcList = virtualClusters.trim().split(',').filter((updateVc) => (updateVc !== ''));
let addUserWithInvalidVc = null;
for (let item of updateVcList) {
if (!vcList.hasOwnProperty(item)) {
@@ -242,11 +243,14 @@ if (config.env !== 'test') {
prepareStoragePath();
} else {
logger.info('base storage path exists');
- getUserList((errMsg, res) => {
+ getUserList((errMsg, userInfoList) => {
if (errMsg) {
- logger.warn('get user list failed');
+ logger.warn('get user list failed', errMsg);
} else {
- logger.warn(res);
+ logger.warn('users:', userInfoList);
+ if (userInfoList.length === 0) {
+ setDefaultAdmin();
+ }
}
});
}
diff --git a/rest-server/src/templates/yarnContainerScript.mustache b/rest-server/src/templates/yarnContainerScript.mustache
index a8b4c2e7d3..d33d8d64c3 100644
--- a/rest-server/src/templates/yarnContainerScript.mustache
+++ b/rest-server/src/templates/yarnContainerScript.mustache
@@ -37,12 +37,10 @@ function debug_log()
function exit_handler()
{
+ rc=$?
local handler="Yarn container exit handler"
debug_log "$handler" "EXIT signal received in yarn container, performing clean up action..."
- debug_log "$handler" "clean the container code"
- rm -rf /tmp/pai-root/code/$APP_ID/$CONTAINER_ID 2>/dev/null
-
debug_log "$handler" "trying to kill docker container $docker_name"
pid=$(docker inspect --format={{{ inspectFormat }}} $docker_name 2>/dev/null)
if [ $pid ]; then
@@ -52,8 +50,12 @@ function exit_handler()
else
debug_log "$handler" "docker container $docker_name has already exited"
fi
-
- kill 0
+
+ debug_log "$handler" "write exit code to file"
+ debug_log "$handler" "yarn container exit code: $rc"
+ debug_log "$handler" "exit code file path: /var/lib/hadoopdata/nm-local-dir/nmPrivate/$APP_ID/$CONTAINER_ID/$CONTAINER_ID.pid.exitcode"
+ echo $rc > "/var/lib/hadoopdata/nm-local-dir/nmPrivate/$APP_ID/$CONTAINER_ID/$CONTAINER_ID.pid.exitcode"
+ exit $rc
}
set -x
@@ -205,12 +207,14 @@ export $(grep -v '^#' $ENV_LIST | xargs)
# Prepare docker bootstrap script
-mkdir -p "/tmp/pai-root/bootstrap/$APP_ID/$CONTAINER_ID"
-hdfs dfs -get {{{ hdfsUri }}}/Container/$HADOOP_USER_NAME/$FRAMEWORK_NAME/DockerContainerScripts/{{{ idx }}}.sh /tmp/pai-root/bootstrap/$APP_ID/$CONTAINER_ID/docker_bootstrap.sh \
+bootstrap_dir="tmp/pai-root/bootstrap"
+mkdir -p $bootstrap_dir
+hdfs dfs -get {{{ hdfsUri }}}/Container/$HADOOP_USER_NAME/$FRAMEWORK_NAME/DockerContainerScripts/{{{ idx }}}.sh $bootstrap_dir/docker_bootstrap.sh \
|| { echo "Can not get script from HDFS. HDFS may be down."; exit 1; }
# Prepare user code
-mkdir -p "/tmp/pai-root/code/$APP_ID/$CONTAINER_ID"
+code_dir="tmp/pai-root/code"
+mkdir -p $code_dir
if [[ -n $PAI_CODE_DIR ]]; then
hdfs dfs -stat $PAI_CODE_DIR \
|| { echo "Can not stat $PAI_CODE_DIR"; exit 1; }
@@ -220,15 +224,20 @@ if [[ -n $PAI_CODE_DIR ]]; then
echo "$PAI_CODE_DIR is larger than 200MB, exit."
exit 1
fi
- hdfs dfs -get $PAI_CODE_DIR "/tmp/pai-root/code/$APP_ID/$CONTAINER_ID" \
+ hdfs dfs -get $PAI_CODE_DIR $code_dir \
|| { echo "Can not get code from HDFS. HDFS may be down."; exit 1; }
fi
-
# Prepare docker debug log
mkdir -p "/tmp/pai-root/log/$APP_ID/$CONTAINER_ID"
ln -s /tmp/pai-root/log/$APP_ID/$CONTAINER_ID/DockerContainerDebug.log $LAUNCHER_LOG_DIR/DockerContainerDebug.log
+# retrieve the yarn local dir
+hadoop_tmp_dir="/var/lib/hadoopdata"
+container_id=$(cat /proc/self/cgroup | grep "memory" | awk -F '/' '{print $NF}')
+mounted_path=$(docker inspect $container_id |\
+ jq -r --arg hadoop_tmp_dir "$hadoop_tmp_dir" '.[] | .Mounts | .[] | select(.Destination==$hadoop_tmp_dir) | .Source')
+container_local_dir=$mounted_path/nm-local-dir/usercache/{{{ jobData.userName }}}/appcache/$APP_ID/$CONTAINER_ID
# Pull docker image and run
docker pull {{{ jobData.image }}} \
@@ -247,8 +256,8 @@ docker run --name $docker_name \
--security-opt apparmor:unconfined \
--volume /tmp/pai-root/alive/$APP_ID:/alive \
--volume /tmp/pai-root/log/$APP_ID/$CONTAINER_ID:/pai/log \
- --volume /tmp/pai-root/bootstrap/$APP_ID/$CONTAINER_ID:/pai/bootstrap \
- --volume /tmp/pai-root/code/$APP_ID/$CONTAINER_ID:/pai/code \
+ --volume $container_local_dir/$bootstrap_dir:/pai/bootstrap:rw \
+ --volume $container_local_dir/$code_dir:/pai/code:rw \
--volume /var/drivers/nvidia/current:/usr/local/nvidia:ro \
--volume /etc/hadoop-configuration-for-jobs:/etc/hadoop:ro \
--label GPU_ID=$gpu_id \
@@ -260,3 +269,4 @@ docker run --name $docker_name \
--env-file $ENV_LIST \
--entrypoint '/bin/bash' {{{ jobData.image }}} \
'/pai/bootstrap/docker_bootstrap.sh'
+
diff --git a/rest-server/test/userManagement.js b/rest-server/test/userManagement.js
index e13423acd7..996753a15c 100644
--- a/rest-server/test/userManagement.js
+++ b/rest-server/test/userManagement.js
@@ -15,11 +15,11 @@
// DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-deleteUserTemplate = JSON.stringify({
+const deleteUserTemplate = JSON.stringify({
'username': '{{username}}'
});
-updateUserVcTemplate = JSON.stringify({
+const updateUserVcTemplate = JSON.stringify({
'virtualClusters': '{{virtualClusters}}'
});
@@ -777,20 +777,7 @@ describe('update user virtual cluster : put /api/v1/user/:username/virtualCluste
});
});
- it('Case 2 (Positive): should update admin user with all valid virtual cluster', (done) => {
- global.chai.request(global.server)
- .put('/api/v1/user/test_non_admin_user/virtualClusters')
- .set('Authorization', 'Bearer ' + validToken)
- .send(JSON.parse(global.mustache.render(updateUserVcTemplate, { 'virtualClusters': 'default' })))
- .end((err, res) => {
- global.chai.expect(res, 'status code').to.have.status(201);
- global.chai.expect(res, 'response format').be.json;
- global.chai.expect(res.body.message, 'response message').equal('update user virtual clusters successfully');
- done();
- });
- });
-
- it('Case 3 (Positive): add new user with invalid virtual cluster should add default vc only and throw update vc error', (done) => {
+ it('Case 2 (Positive): add new user with invalid virtual cluster should add default vc only and throw update vc error', (done) => {
global.chai.request(global.server)
.put('/api/v1/user/test_user/virtualClusters')
.set('Authorization', 'Bearer ' + validToken)
@@ -803,7 +790,7 @@ describe('update user virtual cluster : put /api/v1/user/:username/virtualCluste
});
});
- it('Case 4 (Positive): should delete all virtual clusters except default when virtual cluster value sets to be empty ', (done) => {
+ it('Case 3 (Positive): should delete all virtual clusters except default when virtual cluster value sets to be empty ', (done) => {
global.chai.request(global.server)
.put('/api/v1/user/test_delete_user/virtualClusters')
.set('Authorization', 'Bearer ' + validToken)
@@ -859,6 +846,19 @@ describe('update user virtual cluster : put /api/v1/user/:username/virtualCluste
});
});
+ it('Case 4 (Negative): should fail to update admin virtual cluster', (done) => {
+ global.chai.request(global.server)
+ .put('/api/v1/user/test_non_admin_user/virtualClusters')
+ .set('Authorization', 'Bearer ' + validToken)
+ .send(JSON.parse(global.mustache.render(updateUserVcTemplate, { 'virtualClusters': 'default' })))
+ .end((err, res) => {
+ global.chai.expect(res, 'status code').to.have.status(403);
+ global.chai.expect(res, 'response format').be.json;
+ global.chai.expect(res.body.code, 'response code').equal('ForbiddenUserError');
+ done();
+ });
+ });
+
});
diff --git a/src/dev-box/build/dev-box.dockerfile b/src/dev-box/build/dev-box.dockerfile
index 94852b3596..74bb5fa288 100644
--- a/src/dev-box/build/dev-box.dockerfile
+++ b/src/dev-box/build/dev-box.dockerfile
@@ -44,6 +44,7 @@ RUN apt-get -y update && \
openssh-server \
openssh-client \
git \
+ bash-completion \
inotify-tools \
rsync \
realpath \
@@ -52,17 +53,34 @@ RUN apt-get -y update && \
git clone https://github.com/Microsoft/pai.git &&\
pip install python-etcd docker kubernetes
+WORKDIR /tmp
+
ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
+RUN echo "source /usr/share/bash-completion/completions/git" >> ~/.bashrc
+
# Only node manager need this.#
-#COPY docker-17.06.2-ce.tgz /usr/local
RUN wget https://download.docker.com/linux/static/stable/x86_64/docker-17.06.2-ce.tgz
-RUN cp docker-17.06.2-ce.tgz /usr/local
-RUN tar xzvf /usr/local/docker-17.06.2-ce.tgz
+RUN tar xzvf docker-17.06.2-ce.tgz
+RUN mv docker/* /usr/local/bin/
+
+# alert manager tool
+RUN wget https://github.com/prometheus/alertmanager/releases/download/v0.15.2/alertmanager-0.15.2.linux-amd64.tar.gz
+RUN tar xzvf alertmanager-0.15.2.linux-amd64.tar.gz
+RUN mv alertmanager-0.15.2.linux-amd64/amtool /usr/local/bin
+
+RUN wget https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/linux/amd64/kubectl
+RUN chmod +x kubectl
+RUN mv kubectl /usr/local/bin
+
+# reinstall requests, otherwise we will get the error: `cannot import name DependencyWarning`
+RUN echo y | pip uninstall requests && \
+ echo y | pip install requests && \
+ echo y | pip install docopt
COPY build/container-setup.sh /
COPY build/kubectl-install.sh /kubectl-install.sh
RUN /bin/bash kubectl-install.sh
-CMD ["/container-setup.sh"]
\ No newline at end of file
+CMD ["/bin/bash"]
diff --git a/src/drivers/deploy/drivers.yaml.template b/src/drivers/deploy/drivers.yaml.template
index 570ec4d891..598c3e14c5 100644
--- a/src/drivers/deploy/drivers.yaml.template
+++ b/src/drivers/deploy/drivers.yaml.template
@@ -29,9 +29,7 @@ spec:
app: drivers-one-shot
spec:
hostNetwork: true
- hostPID: true
- nodeSelector:
- machinetype: gpu
+ hostPID: true
containers:
- name: nvidia-drivers
image: {{ clusterinfo['dockerregistryinfo']['prefix'] }}drivers:{{ clusterinfo['dockerregistryinfo']['docker_tag'] }}
diff --git a/src/drivers/deploy/service.yaml b/src/drivers/deploy/service.yaml
index 727da1a81c..656eb91f64 100644
--- a/src/drivers/deploy/service.yaml
+++ b/src/drivers/deploy/service.yaml
@@ -19,7 +19,6 @@ prerequisite:
- cluster-configuration
template-list:
- - node-label.sh
- drivers.yaml
- stop.sh
- refresh.sh
@@ -30,4 +29,8 @@ start-script: start.sh
stop-script: stop.sh
delete-script: delete.sh
refresh-script: refresh.sh
-upgraded-script: upgraded.sh
\ No newline at end of file
+upgraded-script: upgraded.sh
+
+
+deploy-rules:
+ notin: no-drivers
\ No newline at end of file
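
The `deploy-rules` stanzas introduced across these service configs replace the old node-label.sh approach: each service now declares which machine role it should (`in`) or should not (`notin`) be deployed on. A hedged sketch of how such a rule might be evaluated against a node's label set; the exact semantics live in paictl, not in this sketch:

```python
# Illustrative in/notin deploy-rule evaluation against node labels.
# The rule semantics here are an assumption for orientation; paictl's
# actual implementation may differ.
def node_matches(rule: dict, node_labels: set) -> bool:
    if "in" in rule:        # deploy only to nodes carrying the label
        return rule["in"] in node_labels
    if "notin" in rule:     # deploy to every node except those carrying it
        return rule["notin"] not in node_labels
    return True             # no rule: deploy everywhere

assert node_matches({"notin": "no-drivers"}, {"pai-worker"})
assert not node_matches({"notin": "no-drivers"}, {"no-drivers"})
assert node_matches({"in": "pai-master"}, {"pai-master", "pai-worker"})
```
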
diff --git a/src/drivers/deploy/start.sh b/src/drivers/deploy/start.sh
index 0e702d6c98..2b8ffe1dbc 100755
--- a/src/drivers/deploy/start.sh
+++ b/src/drivers/deploy/start.sh
@@ -19,8 +19,6 @@
pushd $(dirname "$0") > /dev/null
-#chmod u+x node-label.sh
-/bin/bash node-label.sh || exit $?
kubectl apply --overwrite=true -f drivers.yaml || exit $?
diff --git a/src/hadoop-data-node/deploy/hadoop-data-node.yaml.template b/src/hadoop-data-node/deploy/hadoop-data-node.yaml.template
index 0d43ce6d00..20ca3d6b58 100644
--- a/src/hadoop-data-node/deploy/hadoop-data-node.yaml.template
+++ b/src/hadoop-data-node/deploy/hadoop-data-node.yaml.template
@@ -30,8 +30,6 @@ spec:
spec:
hostNetwork: true
hostPID: true
- nodeSelector:
- hadoop-data-node: "true"
containers:
- name: hadoop-data-node
image: {{ clusterinfo['dockerregistryinfo']['prefix'] }}hadoop-run:{{ clusterinfo['dockerregistryinfo']['docker_tag'] }}
diff --git a/src/hadoop-data-node/deploy/service.yaml b/src/hadoop-data-node/deploy/service.yaml
index cb5b36deb8..c377738d45 100644
--- a/src/hadoop-data-node/deploy/service.yaml
+++ b/src/hadoop-data-node/deploy/service.yaml
@@ -22,7 +22,6 @@ prerequisite:
- hadoop-name-node
template-list:
- - node-label.sh
- hadoop-data-node.yaml
- stop.sh
- refresh.sh
@@ -32,4 +31,8 @@ start-script: start.sh
stop-script: stop.sh
delete-script: delete.sh
refresh-script: refresh.sh
-upgraded-script: upgraded.sh
\ No newline at end of file
+upgraded-script: upgraded.sh
+
+
+deploy-rules:
+ in: pai-worker
\ No newline at end of file
diff --git a/src/hadoop-data-node/deploy/start.sh b/src/hadoop-data-node/deploy/start.sh
index b77cdfec8e..979b2eeb7b 100644
--- a/src/hadoop-data-node/deploy/start.sh
+++ b/src/hadoop-data-node/deploy/start.sh
@@ -19,9 +19,6 @@
pushd $(dirname "$0") > /dev/null
-#chmod u+x node-label.sh
-
-/bin/bash node-label.sh || exit $?
#chmod u+x configmap-create.sh
diff --git a/src/hadoop-jobhistory/deploy/hadoop-jobhistory.yaml.template b/src/hadoop-jobhistory/deploy/hadoop-jobhistory.yaml.template
index eca1fe82ef..f62919d780 100644
--- a/src/hadoop-jobhistory/deploy/hadoop-jobhistory.yaml.template
+++ b/src/hadoop-jobhistory/deploy/hadoop-jobhistory.yaml.template
@@ -30,8 +30,6 @@ spec:
spec:
hostNetwork: true
hostPID: true
- nodeSelector:
- jobhistory: "true"
containers:
- name: hadoop-jobhistory-service
image: {{ clusterinfo['dockerregistryinfo']['prefix'] }}hadoop-run:{{ clusterinfo['dockerregistryinfo']['docker_tag'] }}
diff --git a/src/hadoop-jobhistory/deploy/service.yaml b/src/hadoop-jobhistory/deploy/service.yaml
index f7c1b08a5e..9bc6c6baec 100644
--- a/src/hadoop-jobhistory/deploy/service.yaml
+++ b/src/hadoop-jobhistory/deploy/service.yaml
@@ -25,7 +25,6 @@ prerequisite:
- hadoop-node-manager
template-list:
- - node-label.sh
- hadoop-jobhistory.yaml
- stop.sh
- refresh.sh
@@ -35,4 +34,8 @@ start-script: start.sh
stop-script: stop.sh
delete-script: delete.sh
refresh-script: refresh.sh
-upgraded-script: upgraded.sh
\ No newline at end of file
+upgraded-script: upgraded.sh
+
+
+deploy-rules:
+ in: pai-master
\ No newline at end of file
diff --git a/src/hadoop-jobhistory/deploy/start.sh b/src/hadoop-jobhistory/deploy/start.sh
index 4aa35d5645..79f18923c5 100644
--- a/src/hadoop-jobhistory/deploy/start.sh
+++ b/src/hadoop-jobhistory/deploy/start.sh
@@ -19,9 +19,6 @@
pushd $(dirname "$0") > /dev/null
-#chmod u+x node-label.sh
-
-/bin/bash node-label.sh || exit $?
#chmod u+x configmap-create.sh
diff --git a/src/hadoop-name-node/deploy/hadoop-name-node.yaml.template b/src/hadoop-name-node/deploy/hadoop-name-node.yaml.template
index cb418dc31b..beac4bb863 100644
--- a/src/hadoop-name-node/deploy/hadoop-name-node.yaml.template
+++ b/src/hadoop-name-node/deploy/hadoop-name-node.yaml.template
@@ -30,8 +30,6 @@ spec:
spec:
hostNetwork: true
hostPID: true
- nodeSelector:
- hadoop-name-node: "true"
containers:
- name: hadoop-name-node
image: {{ clusterinfo['dockerregistryinfo']['prefix'] }}hadoop-run:{{ clusterinfo['dockerregistryinfo']['docker_tag'] }}
diff --git a/src/hadoop-name-node/deploy/service.yaml b/src/hadoop-name-node/deploy/service.yaml
index 792ab4b594..1c12a8a118 100644
--- a/src/hadoop-name-node/deploy/service.yaml
+++ b/src/hadoop-name-node/deploy/service.yaml
@@ -21,7 +21,6 @@ prerequisite:
- zookeeper
template-list:
- - node-label.sh
- hadoop-name-node.yaml
- stop.sh
- refresh.sh
@@ -31,4 +30,8 @@ start-script: start.sh
stop-script: stop.sh
delete-script: delete.sh
refresh-script: refresh.sh
-upgraded-script: upgraded.sh
\ No newline at end of file
+upgraded-script: upgraded.sh
+
+
+deploy-rules:
+ in: pai-master
\ No newline at end of file
diff --git a/src/hadoop-name-node/deploy/start.sh b/src/hadoop-name-node/deploy/start.sh
index bc66227af2..b57d9de8ba 100644
--- a/src/hadoop-name-node/deploy/start.sh
+++ b/src/hadoop-name-node/deploy/start.sh
@@ -19,9 +19,6 @@
pushd $(dirname "$0") > /dev/null
-#chmod u+x node-label.sh
-
-/bin/bash node-label.sh || exit $?
#chmod u+x configmap-create.sh
diff --git a/src/hadoop-node-manager/deploy/delete.yaml.template b/src/hadoop-node-manager/deploy/delete.yaml.template
index e72cbbc19a..1fab91b707 100644
--- a/src/hadoop-node-manager/deploy/delete.yaml.template
+++ b/src/hadoop-node-manager/deploy/delete.yaml.template
@@ -39,11 +39,13 @@ spec:
name: data-path
- mountPath: /hadoop-node-manager-delete
name: hadoop-node-manager-delete-config
+ - mountPath: /var/run/docker.sock
+ name: docker-socket
env:
- name: DELETE_CONFIG
value: hadoop-node-manager-delete
- name: WORKER_CONFIG
- value: delete-data.sh
+ value: delete.sh
readinessProbe:
exec:
command:
@@ -60,3 +62,7 @@ spec:
- name: hadoop-node-manager-delete-config
configMap:
name: hadoop-node-manager-delete
+ - name: docker-socket
+ hostPath:
+ path: /var/run/docker.sock
+
diff --git a/src/hadoop-node-manager/deploy/hadoop-node-manager-configuration/nodemanager-generate-script.sh b/src/hadoop-node-manager/deploy/hadoop-node-manager-configuration/nodemanager-generate-script.sh
index bdf2bda6a7..165e42fbb4 100644
--- a/src/hadoop-node-manager/deploy/hadoop-node-manager-configuration/nodemanager-generate-script.sh
+++ b/src/hadoop-node-manager/deploy/hadoop-node-manager-configuration/nodemanager-generate-script.sh
@@ -49,6 +49,9 @@ sed -i "s/{ZOOKEEPER_ADDRESS}/${ZOOKEEPER_ADDRESS}/g" $HADOOP_CONF_DIR/yarn-sit
sed -i "s/{HDFS_ADDRESS}/${HDFS_ADDRESS}/g" $HADOOP_CONF_DIR/yarn-site.xml
sed -i "s/{LOGSERVER_ADDRESS}/${LOGSERVER_ADDRESS}/g" $HADOOP_CONF_DIR/yarn-site.xml
sed -i "s/{TIMELINE_SERVER_ADDRESS}/${TIMELINE_SERVER_ADDRESS}/g" $HADOOP_CONF_DIR/yarn-site.xml
+sed -i "s#{HOST_YARN_NODEMANAGER_STORAGE}#${HOST_YARN_NODEMANAGER_STORAGE}#g" $HADOOP_CONF_DIR/yarn-site.xml
+sed -i "s#{HOST_HADOOP_TMP_STORAGE}#${HOST_HADOOP_TMP_STORAGE}#g" $HADOOP_CONF_DIR/yarn-site.xml
+sed -i "s#{CURRENT_IMAGE_NAME}#${CURRENT_IMAGE_NAME}#g" $HADOOP_CONF_DIR/yarn-site.xml
sed -i "s/{HDFS_ADDRESS}/${HDFS_ADDRESS}/g" $HADOOP_CONF_DIR/core-site.xml
@@ -65,3 +68,4 @@ sed -i "s/{cpu_vcores}/${cpu_vcores}/g" $HADOOP_CONF_DIR/yarn-site.xml
# Backup hadoop configuration for job (Spark) use
cp $HADOOP_CONF_DIR/* /hadoop-configuration-for-jobs/
+
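
The three new substitutions switch the sed delimiter to `#` because the storage paths and the image name contain `/`. A plain-string replacement sidesteps the delimiter question entirely; a minimal Python mirror of the same step, with hypothetical values standing in for the container's environment:

```python
# Python mirror of the new sed lines; the values below are hypothetical
# examples, the real ones come from the pod's environment variables.
subs = {
    "{HOST_YARN_NODEMANAGER_STORAGE}": "/datastorage/yarn/node",
    "{HOST_HADOOP_TMP_STORAGE}": "/datastorage/hadooptmp/nodemanager",
    "{CURRENT_IMAGE_NAME}": "example.azurecr.io/pai/hadoop-run:latest",
}

with open("yarn-site.xml") as f:
    text = f.read()
for placeholder, value in subs.items():
    # str.replace is literal, so slashes in the value need no escaping
    text = text.replace(placeholder, value)
with open("yarn-site.xml", "w") as f:
    f.write(text)
```
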
diff --git a/src/hadoop-node-manager/deploy/hadoop-node-manager-configuration/yarn-site.xml b/src/hadoop-node-manager/deploy/hadoop-node-manager-configuration/yarn-site.xml
index 7ad6734ffb..b476c7d140 100644
--- a/src/hadoop-node-manager/deploy/hadoop-node-manager-configuration/yarn-site.xml
+++ b/src/hadoop-node-manager/deploy/hadoop-node-manager-configuration/yarn-site.xml
@@ -272,5 +272,44 @@
    <name>yarn.resourcemanager.container-tokens.master-key-rolling-interval-secs</name>
    <value>88000</value>
  </property>
+
+  <property>
+    <name>yarn.nodemanager.docker-container-executor.exec-name</name>
+    <value>/usr/bin/docker</value>
+    <description>
+      Name or path to the Docker client. This is a required parameter. If this is empty,
+      the user must pass an image name as part of the job invocation (see below).
+    </description>
+  </property>
+
+  <property>
+    <name>yarn.nodemanager.container-executor.class</name>
+    <value>org.apache.hadoop.yarn.server.nodemanager.DockerContainerExecutor</value>
+    <description>
+      This is the container executor setting that ensures that all
+      jobs are started with the DockerContainerExecutor.
+    </description>
+  </property>
+
+  <property>
+    <name>yarn.nodemanager.docker-container-executor.image-name</name>
+    <value>{CURRENT_IMAGE_NAME}</value>
+  </property>
+
+  <property>
+    <name>yarn.nodemanager.docker-container-executor.exec-option</name>
+    <value>-v /dev:/dev -v /var/run/docker.sock:/var/run/docker.sock -v /var/drivers:/var/drivers -v {HOST_YARN_NODEMANAGER_STORAGE}:/var/lib/yarn -v /tmp/pai-root:/tmp/pai-root -v /etc/hadoop-configuration-for-jobs:/hadoop-configuration-for-jobs -v {HOST_HADOOP_TMP_STORAGE}:/var/lib/hadoopdata -e HDFS_ADDRESS={HDFS_ADDRESS} -e LOGSERVER_ADDRESS={LOGSERVER_ADDRESS} -e TIMELINE_SERVER_ADDRESS={TIMELINE_SERVER_ADDRESS} -e RESOURCEMANAGER_ADDRESS={RESOURCEMANAGER_ADDRESS} -e ZOOKEEPER_ADDRESS={ZOOKEEPER_ADDRESS}</value>
+    <description>
+      Docker run options used when launching the container.
+    </description>
+  </property>
+
+  <property>
+    <name>yarn.nodemanager.docker-container-executor.script-command</name>
+    <value>cp -r /docker/* /usr/bin/ &amp;&amp; cp /hadoop-configuration-for-jobs/* $HADOOP_CONF_DIR/</value>
+    <description>
+      Command executed inside the image before the launch_container script runs.
+    </description>
+  </property>
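
For orientation, the `exec-option` string is simply the extra flags the DockerContainerExecutor forwards to its `docker run` invocation after placeholder substitution. A hedged reconstruction of the resulting command line; every concrete value here is a placeholder, and the executor's exact command assembly belongs to Hadoop, not to this sketch:

```python
# Illustrative only: roughly how the substituted exec-option flags end up
# on a docker run command line. All concrete values are placeholders.
exec_option = (
    "-v /dev:/dev "
    "-v /var/run/docker.sock:/var/run/docker.sock "
    "-v /datastorage/yarn/node:/var/lib/yarn "      # {HOST_YARN_NODEMANAGER_STORAGE}
    "-e HDFS_ADDRESS=10.0.0.10"                     # {HDFS_ADDRESS}
)
image = "example.azurecr.io/pai/hadoop-run:latest"  # {CURRENT_IMAGE_NAME}

docker_cmd = ["/usr/bin/docker", "run"] + exec_option.split() + [image]
print(" ".join(docker_cmd))
```
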
diff --git a/src/hadoop-node-manager/deploy/hadoop-node-manager.yaml.template b/src/hadoop-node-manager/deploy/hadoop-node-manager.yaml.template
index adac20abc5..76ab6b756b 100644
--- a/src/hadoop-node-manager/deploy/hadoop-node-manager.yaml.template
+++ b/src/hadoop-node-manager/deploy/hadoop-node-manager.yaml.template
@@ -30,8 +30,6 @@ spec:
spec:
hostNetwork: true
hostPID: true
- nodeSelector:
- hadoop-node-manager: "true"
containers:
- name: hadoop-node-manager
image: {{ clusterinfo['dockerregistryinfo']['prefix'] }}hadoop-run:{{ clusterinfo['dockerregistryinfo']['docker_tag'] }}
@@ -43,8 +41,6 @@ spec:
name: device-mount
- mountPath: /var/run/docker.sock
name: docker-socket
- - mountPath: /var/lib/docker
- name: docker-var-lib
- mountPath: /var/drivers
name: driver-path
- mountPath: /var/lib/yarn
@@ -57,8 +53,6 @@ spec:
name: hadoop-node-manager-config-volume-for-jobs
- mountPath: /host-configuration
name: host-confg-volume
- - mountPath: /root/.docker
- name: docker-cred-volume
- mountPath: /var/lib/hadoopdata
name: hadoop-tmp-storage
readinessProbe:
@@ -83,6 +77,12 @@ spec:
value: nodemanager-generate-script.sh
- name: START_SERVICE
value: nodemanager-start-service.sh
+ - name: HOST_YARN_NODEMANAGER_STORAGE
+ value: {{ clusterinfo[ 'dataPath' ] }}/yarn/node
+ - name: HOST_HADOOP_TMP_STORAGE
+ value: {{ clusterinfo[ 'dataPath' ] }}/hadooptmp/nodemanager
+ - name: CURRENT_IMAGE_NAME
+ value: {{ clusterinfo['dockerregistryinfo']['prefix'] }}hadoop-run:{{ clusterinfo['dockerregistryinfo']['docker_tag'] }}
imagePullSecrets:
- name: {{ clusterinfo['dockerregistryinfo']['secretname'] }}
volumes:
@@ -92,10 +92,6 @@ spec:
- name: docker-socket
hostPath:
path: /var/run/docker.sock
- - name: docker-var-lib
- hostPath:
- path: /var/lib/docker
- # Docker's storage base directory
- name: driver-path
hostPath:
path: /var/drivers
@@ -114,9 +110,6 @@ spec:
- name: host-confg-volume
configMap:
name: host-configuration
- - name: docker-cred-volume
- configMap:
- name: docker-credentials
- name: hadoop-tmp-storage
hostPath:
path: {{ clusterinfo[ 'dataPath' ] }}/hadooptmp/nodemanager
diff --git a/src/hadoop-node-manager/deploy/service.yaml b/src/hadoop-node-manager/deploy/service.yaml
index 2ff61ea28d..4386216852 100644
--- a/src/hadoop-node-manager/deploy/service.yaml
+++ b/src/hadoop-node-manager/deploy/service.yaml
@@ -24,7 +24,6 @@ prerequisite:
- hadoop-resource-manager
template-list:
- - node-label.sh
- hadoop-node-manager.yaml
- stop.sh
- refresh.sh
@@ -34,4 +33,8 @@ start-script: start.sh
stop-script: stop.sh
delete-script: delete.sh
refresh-script: refresh.sh
-upgraded-script: upgraded.sh
\ No newline at end of file
+upgraded-script: upgraded.sh
+
+
+deploy-rules:
+ in: pai-worker
\ No newline at end of file
diff --git a/src/hadoop-node-manager/deploy/start.sh b/src/hadoop-node-manager/deploy/start.sh
index 807f8ef44b..4b4fdc85b5 100644
--- a/src/hadoop-node-manager/deploy/start.sh
+++ b/src/hadoop-node-manager/deploy/start.sh
@@ -19,9 +19,6 @@
pushd $(dirname "$0") > /dev/null
-#chmod u+x node-label.sh
-
-/bin/bash node-label.sh || exit $?
#chmod u+x configmap-create.sh
diff --git a/src/hadoop-resource-manager/deploy/hadoop-resource-manager-configuration/yarn-site.xml b/src/hadoop-resource-manager/deploy/hadoop-resource-manager-configuration/yarn-site.xml
index 5acb12b97a..281e229592 100644
--- a/src/hadoop-resource-manager/deploy/hadoop-resource-manager-configuration/yarn-site.xml
+++ b/src/hadoop-resource-manager/deploy/hadoop-resource-manager-configuration/yarn-site.xml
@@ -34,6 +34,11 @@
    <description>maximum number of completed applications</description>
  </property>
+
+  <property>
+    <name>yarn.resourcemanager.bind-host</name>
+    <value>0.0.0.0</value>
+  </property>
+
  <property>
    <name>yarn.scheduler.maximum-allocation-mb</name>
    <value>1048576</value>
@@ -230,6 +235,15 @@
    <name>yarn.resourcemanager.container-tokens.master-key-rolling-interval-secs</name>
    <value>88000</value>
  </property>
+
+  <property>
+    <description>
+      The duration (in ms) the YARN client waits for an expected state change
+      to occur. -1 means unlimited wait time.
+    </description>
+    <name>yarn.client.application-client-protocol.poll-timeout-ms</name>
+    <value>900000</value>
+  </property>
  <property>
    <description>The minimum allocation for every container request at the RM,
@@ -247,6 +261,12 @@
    <value>8</value>
  </property>
-</configuration>
+
+  <property>
+    <description>The maximum allocation for every container request at the RM in terms of virtual CPU cores.
+      Requests higher than this will throw an InvalidResourceRequestException.</description>
+    <name>yarn.scheduler.maximum-allocation-vcores</name>
+    <value>32</value>
+  </property>
+
+</configuration>
diff --git a/src/hadoop-resource-manager/deploy/hadoop-resource-manager.yaml.template b/src/hadoop-resource-manager/deploy/hadoop-resource-manager.yaml.template
index 1795339a6a..d674779160 100644
--- a/src/hadoop-resource-manager/deploy/hadoop-resource-manager.yaml.template
+++ b/src/hadoop-resource-manager/deploy/hadoop-resource-manager.yaml.template
@@ -27,11 +27,13 @@ spec:
metadata:
labels:
app: hadoop-resource-manager
+ annotations:
+ prometheus.io/scrape: 'true'
+ prometheus.io/path: '/metrics'
+ prometheus.io/port: '{{ clusterinfo['prometheusinfo']['yarn_exporter_port'] }}'
spec:
hostNetwork: true
hostPID: true
- nodeSelector:
- hadoop-resource-manager: "true"
containers:
- name: hadoop-resource-manager
image: {{ clusterinfo['dockerregistryinfo']['prefix'] }}hadoop-run:{{ clusterinfo['dockerregistryinfo']['docker_tag'] }}
@@ -65,6 +67,21 @@ spec:
value: resourcemanager-generate-script.sh
- name: START_SERVICE
value: resourcemanager-start-service.sh
+ - name: yarn-exporter
+ image: {{ clusterinfo['dockerregistryinfo']['prefix'] }}yarn-exporter:{{ clusterinfo['dockerregistryinfo']['docker_tag'] }}
+ imagePullPolicy: Always
+ ports:
+ - containerPort: {{ clusterinfo['prometheusinfo']['yarn_exporter_port'] }}
+ hostPort: {{ clusterinfo['prometheusinfo']['yarn_exporter_port'] }}
+ name: scrape
+ resources:
+ limits:
+ memory: "1Gi"
+ command:
+ - "python3"
+ - "/usr/local/yarn_exporter.py"
+ - "http://127.0.0.1:8088"
+ - "-p {{ clusterinfo['prometheusinfo']['yarn_exporter_port'] }}"
imagePullSecrets:
- name: {{ clusterinfo['dockerregistryinfo']['secretname'] }}
volumes:
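
The scrape annotations above let Prometheus discover the new yarn-exporter sidecar automatically, but the endpoint can also be probed by hand. A minimal sketch, where the host and port stand in for the resource manager node and the configured yarn_exporter_port:

```python
# Quick manual probe of the yarn-exporter metrics endpoint; the host and
# port below are placeholders for the resource manager node and the
# cluster's configured yarn_exporter_port.
import requests

resp = requests.get("http://10.0.0.1:9459/metrics", timeout=5)
resp.raise_for_status()
print(resp.text[:500])  # Prometheus text exposition format
```
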
diff --git a/src/hadoop-resource-manager/deploy/service.yaml b/src/hadoop-resource-manager/deploy/service.yaml
index d7e2299b60..c7f623cf16 100644
--- a/src/hadoop-resource-manager/deploy/service.yaml
+++ b/src/hadoop-resource-manager/deploy/service.yaml
@@ -23,7 +23,6 @@ prerequisite:
- hadoop-data-node
template-list:
- - node-label.sh
- hadoop-resource-manager.yaml
- stop.sh
- refresh.sh
@@ -34,4 +33,8 @@ start-script: start.sh
stop-script: stop.sh
delete-script: delete.sh
refresh-script: refresh.sh
-upgraded-script: upgraded.sh
\ No newline at end of file
+upgraded-script: upgraded.sh
+
+
+deploy-rules:
+ in: pai-master
\ No newline at end of file
diff --git a/src/hadoop-resource-manager/deploy/start.sh b/src/hadoop-resource-manager/deploy/start.sh
index 4af0c4d6f7..98547510be 100644
--- a/src/hadoop-resource-manager/deploy/start.sh
+++ b/src/hadoop-resource-manager/deploy/start.sh
@@ -19,9 +19,6 @@
pushd $(dirname "$0") > /dev/null
-#chmod u+x node-label.sh
-
-/bin/bash node-label.sh || exit $?
#chmod u+x configmap-create.sh
diff --git a/src/zookeeper/deploy/service.yaml b/src/zookeeper/deploy/service.yaml
index 14bd930ef3..cc4c47b9c2 100644
--- a/src/zookeeper/deploy/service.yaml
+++ b/src/zookeeper/deploy/service.yaml
@@ -19,7 +19,6 @@ prerequisite:
- cluster-configuration
template-list:
- - node-label.sh
- zookeeper.yaml
- stop.sh
- refresh.sh
@@ -29,4 +28,7 @@ start-script: start.sh
stop-script: stop.sh
delete-script: delete.sh
refresh-script: refresh.sh
-upgraded-script: upgraded.sh
\ No newline at end of file
+upgraded-script: upgraded.sh
+
+deploy-rules:
+ in: pai-master
\ No newline at end of file
diff --git a/src/zookeeper/deploy/start.sh b/src/zookeeper/deploy/start.sh
index 9f03eaf791..6f04ea9ecb 100644
--- a/src/zookeeper/deploy/start.sh
+++ b/src/zookeeper/deploy/start.sh
@@ -19,7 +19,6 @@
pushd $(dirname "$0") > /dev/null
-/bin/bash node-label.sh || exit $?
# Zookeeper
kubectl apply --overwrite=true -f zookeeper.yaml || exit $?
diff --git a/src/zookeeper/deploy/zookeeper.yaml.template b/src/zookeeper/deploy/zookeeper.yaml.template
index fa79da3061..4c711f5596 100644
--- a/src/zookeeper/deploy/zookeeper.yaml.template
+++ b/src/zookeeper/deploy/zookeeper.yaml.template
@@ -30,8 +30,6 @@ spec:
spec:
hostNetwork: true
hostPID: true
- nodeSelector:
- zookeeper: "true"
containers:
- name: zookeeper
image: {{ clusterinfo['dockerregistryinfo']['prefix'] }}zookeeper:{{ clusterinfo['dockerregistryinfo']['docker_tag'] }}
diff --git a/utilities/gen-amtool-config.py b/utilities/gen-amtool-config.py
new file mode 100644
index 0000000000..5e92485d02
--- /dev/null
+++ b/utilities/gen-amtool-config.py
@@ -0,0 +1,79 @@
+#!/usr/bin/python
+# Copyright (c) Microsoft Corporation
+# All rights reserved.
+#
+# MIT License
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
+# documentation files (the "Software"), to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
+# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
+# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+import argparse
+import codecs
+import os
+import sys
+
+project_root = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..")
+
+sys.path.append(os.path.join(project_root, "pai-management"))
+
+from paiLibrary.clusterObjectModel import objectModelFactory
+
+amtool_config = """
+# see https://github.com/prometheus/alertmanager#config for detail
+# Define the path that amtool can find your `alertmanager` instance at
+alertmanager.url: "http://{}:{}"
+
+# Override the default author. (unset defaults to your username)
+# author: me@example.com
+
+# Force amtool to give you an error if you don't include a comment on a silence
+comment_required: true
+
+# Set a default output format. (unset defaults to simple)
+output: extended
+"""
+
+def gen_amtool_config(args):
+ model = objectModelFactory.objectModelFactory(args.config_path)
+ service_config = model.objectModelPipeLine()
+
+ try:
+ prometheus_info = service_config["service"]["clusterinfo"]["prometheusinfo"]
+ alerting = prometheus_info["alerting"]
+ port = alerting["alert_manager_port"]
+ alert_manager_hosts = alerting["alert-manager-hosts"]
+        host = alert_manager_hosts[0] # TODO not sure if alert manager with HA works this way
+ except KeyError:
+ sys.stderr.write("no alert manager configured\n")
+ sys.exit(1)
+
+ home = os.path.expanduser("~")
+ amtool_dir = os.path.join(home, ".config/amtool")
+ if not os.path.exists(amtool_dir):
+ os.makedirs(amtool_dir)
+ config = os.path.join(amtool_dir, "config.yml")
+ if os.path.isfile(config) and not args.force:
+        sys.stderr.write("{} already exists, specify -f to overwrite\n".format(config))
+ sys.exit(1)
+
+ with codecs.open(config, "w", "utf-8") as f:
+        f.write(amtool_config.format(host, port))
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+ parser.add_argument("-p", "--config-path", dest="config_path", required=True,
+ help="The path of your configuration directory.")
+ parser.add_argument("-f", "--force", dest="force", default=False, action="store_true",
+                        help="overwrite the existing amtool config file")
+ args = parser.parse_args()
+
+ gen_amtool_config(args)
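
A typical invocation would be `python utilities/gen-amtool-config.py -p /cluster-configuration`, with `-f` to overwrite a previous config. A minimal sanity check of the generated file, assuming PyYAML is available; this check is illustrative and not part of the utility:

```python
# Sanity-check the generated amtool config (illustrative only).
import os
import yaml

path = os.path.expanduser("~/.config/amtool/config.yml")
with open(path) as f:
    cfg = yaml.safe_load(f)

# amtool uses flat dotted keys, so "alertmanager.url" is a literal key
print(cfg["alertmanager.url"])   # e.g. http://<alert-manager-host>:<port>
assert cfg["comment_required"] is True
```
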
diff --git a/webportal/config/webpack.common.js b/webportal/config/webpack.common.js
index 5374c7fe40..383c42fa03 100644
--- a/webportal/config/webpack.common.js
+++ b/webportal/config/webpack.common.js
@@ -38,7 +38,7 @@ const htmlMinifierOptions = {
removeTagWhitespace: true
};
-const config = {
+const config = (env, argv) => ({
entry: {
index: './src/app/index.js',
layout: './src/app/layout/layout.component.js',
@@ -155,11 +155,6 @@ const config = {
new ExtractTextPlugin({
filename: 'styles/[name].bundle.css'
}),
- new UglifyJsPlugin({
- cache: true,
- parallel: true,
- sourceMap: true
- }),
new webpack.ProvidePlugin({
_: 'underscore'
}),
@@ -284,13 +279,17 @@ const config = {
cache: true,
chunks: ['layout', 'docs']
})
- ],
+ ].concat(argv.debug ? [] : [new UglifyJsPlugin({
+ cache: true,
+ parallel: true,
+ sourceMap: true
+ })]),
node: {
global: true,
fs: 'empty',
process: true,
module: false
}
-};
+});
-module.exports = config;
\ No newline at end of file
+module.exports = config;
diff --git a/webportal/src/app/job/job-view/job-view.component.js b/webportal/src/app/job/job-view/job-view.component.js
index 584aae65df..91a76a302e 100644
--- a/webportal/src/app/job/job-view/job-view.component.js
+++ b/webportal/src/app/job/job-view/job-view.component.js
@@ -213,6 +213,9 @@ const loadJobs = (specifiedVc) => {
}
},
},
+ 'rowId'(row) {
+ return row.name;
+ },
'columns': [
{title: 'Job', data: 'name', render(name, type) {
if (type !== 'display') return name;
@@ -271,8 +274,33 @@ const stopJob = (jobName) => {
Authorization: `Bearer ${token}`,
},
success: (data) => {
- window.location.href = 'view.html?jobName=' + jobName;
- loadJobs();
+ const $jobTable = $('#job-table');
+ if ($jobTable.length === 0) {
+ // Detail view: reload current page
+ return window.location.reload(false);
+ } else {
+ // Table view: replace current row
+ const api = $jobTable.dataTable().api();
+ const row = api.row('#' + jobName);
+ const rowData = row.data();
+ $.ajax({
+ url: `${webportalConfig.restServerUri}/api/v1/jobs/${jobName}`,
+ type: 'GET',
+ success: function(data) {
+ rowData.appExitCode = data.jobStatus.appExitCode;
+ rowData.completedTime = data.jobStatus.completedTime;
+ rowData.createdTime = data.jobStatus.createdTime;
+ rowData.executionType = data.jobStatus.executionType;
+ rowData.retries = data.jobStatus.retries;
+ rowData.state = data.jobStatus.state;
+ rowData.subState = data.jobStatus.subState;
+ rowData.username = data.jobStatus.username;
+ rowData.virtualCluster = data.jobStatus.virtualCluster;
+ row.data(rowData);
+ row.invalidate();
+ },
+ });
+ }
},
error: (xhr, textStatus, error) => {
const res = JSON.parse(xhr.responseText);
diff --git a/webportal/src/app/layout/layout.component.ejs b/webportal/src/app/layout/layout.component.ejs
index a76b11a6ab..3a9fd1457c 100644
--- a/webportal/src/app/layout/layout.component.ejs
+++ b/webportal/src/app/layout/layout.component.ejs
@@ -114,6 +114,6 @@
-
+