Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/master' into canwan/update-jenkins
Browse files Browse the repository at this point in the history
# Conflicts:
#	Jenkinsfile
#	deployment/k8sPaiLibrary/maintainconf/clean.yaml
#	deployment/k8sPaiLibrary/maintaintool/kubernetes-cleanup.sh
#	docs/pai-management/doc/add-service.md
#	docs/pai-management/doc/cluster-bootup.md
#	docs/pai-management/doc/how-to-write-pai-configuration.md
#	pai-management/bootstrap/alert-manager/delete.sh
#	pai-management/bootstrap/alert-manager/refresh.sh.template
#	pai-management/bootstrap/alert-manager/service.yaml
#	pai-management/bootstrap/alert-manager/stop.sh
#	pai-management/bootstrap/drivers/node-label.sh.template
#	pai-management/bootstrap/hadoop-jobhistory/node-label.sh.template
#	pai-management/bootstrap/hadoop-node-manager/hadoop-node-manager-delete/delete-data.sh
#	pai-management/container-setup.sh
#	pai-management/k8sPaiLibrary/maintaintool/kubernetes-cleanup.sh
#	pai-management/k8sPaiLibrary/template/kubernetes-cleanup.sh.template
#	pai-management/paiLibrary/paiBuild/build_center.py
#	pai-management/paiLibrary/paiBuild/hadoop_ai_build.py
#	paictl.py
#	prometheus/doc/exporter-for-other-services.md
#	src/dev-box/build/container-setup.sh
#	src/dev-box/build/dev-box.dockerfile
#	src/drivers/deploy/node-label.sh.template
#	src/hadoop-ai/build/build-pre.sh
#	src/hadoop-jobhistory/deploy/node-label.sh.template
#	src/hadoop-name-node/deploy/node-label.sh.template
#	src/hadoop-node-manager/deploy/hadoop-node-manager-delete/delete-data.sh
#	src/hadoop-node-manager/deploy/node-label.sh.template
#	src/hadoop-resource-manager/deploy/node-label.sh.template
#	src/zookeeper/deploy/node-label.sh.template
  • Loading branch information
wangcan0329 committed Sep 10, 2018
2 parents 042324c + 62f53a2 commit e430930
Show file tree
Hide file tree
Showing 212 changed files with 5,171 additions and 2,105 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ matrix:
before_install:
- cd prometheus
install:
- pip install paramiko pyyaml jinja2 python-etcd requests
- pip install paramiko pyyaml jinja2 python-etcd requests prometheus_client
script:
- python -m unittest discover test/
- language: python
Expand Down
9 changes: 9 additions & 0 deletions Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ echo ${labels[0]} > ${WORKSPACE}/BED.txt

sh '''#!/bin/bash
set -ex
echo ${GIT_BRANCH//\\//-}-$(git rev-parse --short HEAD)-${BUILD_ID} > ${WORKSPACE}/IMAGE_TAG.txt
'''
env.IMAGE_TAG = readFile("${WORKSPACE}/IMAGE_TAG.txt").trim()
Expand Down Expand Up @@ -161,8 +162,10 @@ sed -i "42s/.*/ zkid: "1"/" /cluster-configuration/cluster-configuration.yaml
# Step 2. Boot up Kubernetes
# install k8s
./paictl.py cluster k8s-bootup -p /cluster-configuration
# ! TODO wait for cluster ready
sleep 6s
# Step 3. Start all PAI services
# start pai services
./paictl.py service start -p /cluster-configuration
Expand Down Expand Up @@ -251,8 +254,10 @@ sed -i "42s/.*/ zkid: "2"/" /cluster-configuration/cluster-configuration.yaml
# Step 2. Boot up Kubernetes
# install k8s
./paictl.py cluster k8s-bootup -p /cluster-configuration
# ! TODO wait for cluster ready
sleep 6s
# Step 3. Start all PAI services
# start pai services
./paictl.py service start -p /cluster-configuration
Expand Down Expand Up @@ -533,9 +538,11 @@ else
fi
# delete service for next install
./paictl.py service start -p /cluster-configuration -n cluster-configuration
./paictl.py service delete -p /cluster-configuration << EOF
Y
EOF
# clean k8s
./paictl.py cluster k8s-clean -p /cluster-configuration -f << EOF
Y
Expand Down Expand Up @@ -572,9 +579,11 @@ else
fi
# delete service for next install
./paictl.py service start -p /cluster-configuration -n cluster-configuration
./paictl.py service delete -p /cluster-configuration << EOF
Y
EOF
# clean k8s
./paictl.py cluster k8s-clean -p /cluster-configuration -f << EOF
Y
Expand Down
9 changes: 6 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,12 +56,13 @@ Before start, you need to meet the following requirements:
## How to use
### How to train jobs
- How to write PAI jobs
- [Learn from Example Jobs](./examples/README.md)
- [Quick start: how to write and submit a CIFAR-10 job](./examples/README.md#quickstart)
- [Write job from scratch in depth](./docs/job_tutorial.md)
- [Learn more example jobs](./examples/#offtheshelf)
- How to submit PAI jobs
- [Submit a job in Web Portal](./docs/submit_from_webportal.md)
- [Submit a job in Visual Studio](https://github.com/Microsoft/vs-tools-for-ai/blob/master/docs/pai.md)
- [Submit a job in Visual Studio Code](https://github.com/Microsoft/vscode-tools-for-ai/blob/master/docs/quickstart-05-pai.md)
- [Submit a job in web portal](https://github.com/Microsoft/pai/blob/master/job-tutorial/README.md#job-submission)
- How to request on-demand resource for in place training
- [Launch a jupyter notebook and work in it](./examples/jupyter/README.md)

Expand All @@ -71,7 +72,9 @@ Before start, you need to meet the following requirements:
- [Monitoring](./webportal/README.md)

## Resources
The OpenPAI user [documentations](./docs/documentation.md) provides in-depth instructions for using OpenPAI

- The OpenPAI user [documentation](./docs/documentation.md) provides in-depth instructions for using OpenPAI.
- Visit the [release notes](https://github.com/Microsoft/pai/releases) to read about the new features, or download the release today.

## Get Involved
- [StackOverflow:](./docs/stackoverflow.md) If you have questions about OpenPAI, please submit your question on Stack Overflow under the tag: openpai
Expand Down
8 changes: 2 additions & 6 deletions cluster-configuration/cluster-configuration.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -61,9 +61,7 @@ machine-list:
dashboard: "true"
zkid: "1"
pai-master: "true"
watchdog: "true"
alert-manager: "true"



- hostname: hostname
hostip: IP
Expand All @@ -73,8 +71,7 @@ machine-list:
#username: username (Optional)
#password: password (Optional)
k8s-role: master
node-exporter: "true"



- hostname: hostname
hostip: IP
Expand All @@ -84,7 +81,6 @@ machine-list:
#username: username (Optional)
#password: password (Optional)
k8s-role: master
node-exporter: "true"


- hostname: hostname
Expand Down
2 changes: 2 additions & 0 deletions cluster-configuration/kubernetes-configuration.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ kubernetes:
kube-controller-manager-version: v1.9.9
# http://gcr.io/google_containers/kubernetes-dashboard-amd64
dashboard-version: v1.8.3
# The path used to store etcd data.
etcd-data-path: "/var/etcd"



8 changes: 5 additions & 3 deletions cluster-configuration/services-configuration.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ cluster:
# If the docker registry doesn't require authentication, please comment out docker_username and docker_password
#docker-username: your_registry_username
#docker-password: your_registry_password

docker-tag: latest

# The name of the secret in kubernetes will be created in your cluster
Expand All @@ -47,7 +47,6 @@ hadoop:
# More about hadoop-ai please follow the link: https://github.com/Microsoft/pai/tree/master/hadoop-ai.
# Notice: the name should be hadoop-{hadoop-version}.tar.gz
custom-hadoop-binary-path: /pathHadoop/hadoop-2.9.0.tar.gz
hadoop-version: 2.9.0
# Step 1 of 4 to set up Hadoop queues.
# Define all virtual clusters, equivalent concept of Hadoop queues:
# - Each VC will be assigned with (capacity / total_capacity * 100%) of the resources in the system.
Expand Down Expand Up @@ -85,7 +84,7 @@ restserver:
# database admin password
default-pai-admin-password: your_default_pai_admin_password


webportal:
# port for webportal
server-port: 9286
Expand All @@ -101,8 +100,11 @@ prometheus:
prometheus-port: 9091
# port for node exporter
node-exporter-port: 9100
# port for yarn exporter
yarn_exporter_port: 9459
# How frequently to scrape targets
scrape_interval: 30

# If you want alert manager to send alert emails, uncomment the following lines and fill in
# the right values.
# alerting:
Expand Down
4 changes: 2 additions & 2 deletions deployment/k8sPaiLibrary/maintainconf/clean.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@

clean:

file-list:
template-list:
- name: kubernetes-cleanup.sh
src: deployment/k8sPaiLibrary/maintaintool/kubernetes-cleanup.sh
src: k8sPaiLibrary/template/kubernetes-cleanup.sh.template
dst: clean

4 changes: 1 addition & 3 deletions deployment/k8sPaiLibrary/maintainlib/clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ def job_executer(self, node_config):
self.logger.error("Failed to uncompress {0}.tar".format(self.jobname))
sys.exit(1)

commandline = "sudo ./{0}/kubernetes-cleanup.sh".format(self.jobname)
commandline = "sudo /bin/bash {0}/kubernetes-cleanup.sh".format(self.jobname)
if self.force_flag:
commandline += " -f"
if common.ssh_shell_with_password_input_paramiko(node_config, commandline) == False:
Expand Down Expand Up @@ -126,5 +126,3 @@ def run(self):


self.logger.info("The kubernetes has been destroyed, and metadata has been removed")


16 changes: 16 additions & 0 deletions deployment/k8sPaiLibrary/maintainlib/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,22 @@ def execute_shell(shell_cmd, error_msg):



def execute_shell_retry(shell_cmd, error_msg, retry_count):
    """Run a shell command, retrying when it exits with a non-zero status.

    Args:
        shell_cmd: Command line passed to ``subprocess.check_call`` with
            ``shell=True``.
        error_msg: Message logged at error level after each failed attempt.
        retry_count: Maximum number of attempts. Values below 1 are treated
            as 1 so the command is always executed at least once instead of
            being silently skipped.

    Returns:
        None once the command succeeds.

    Exits the process with status 1 (``sys.exit(1)``) when every attempt
    has failed.
    """
    max_attempts = max(retry_count, 1)

    for attempt in range(1, max_attempts + 1):
        try:
            subprocess.check_call(shell_cmd, shell=True)
            return
        except subprocess.CalledProcessError:
            logger.error(error_msg)
            logger.info("run command \" %s \" exception, retrying %d", shell_cmd, attempt)
            if attempt == max_attempts:
                # Out of attempts: give up instead of sleeping again.
                sys.exit(1)
            # Pause before the next attempt.
            time.sleep(5)



def execute_shell_return(shell_cmd, error_msg):

Expand Down
20 changes: 16 additions & 4 deletions deployment/k8sPaiLibrary/maintainlib/deploy.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,9 +118,12 @@ def create_kube_proxy(self):
generated_data = common.generate_from_template_dict(template_data, dict_map)

common.write_generated_file(generated_data, "kube-proxy.yaml")
common.execute_shell(

retry_count = 5
common.execute_shell_retry(
"kubectl apply --overwrite=true -f kube-proxy.yaml",
"Failed to create kube-proxy"
"Failed to create kube-proxy",
retry_count
)

os.remove("kube-proxy.yaml")
Expand All @@ -141,9 +144,12 @@ def create_k8s_dashboard(self):
generated_data = common.generate_from_template_dict(template_data, dict_map)

common.write_generated_file(generated_data, "dashboard-service.yaml")
common.execute_shell(

retry_count = 5
common.execute_shell_retry(
"kubectl apply --overwrite=true -f dashboard-service.yaml",
"Failed to create dashboard-service"
"Failed to create dashboard-service",
retry_count
)

os.remove("dashboard-service.yaml")
Expand Down Expand Up @@ -197,6 +203,12 @@ def run(self):
kubectl_install_instance = kubectl_install.kubectl_install(self.cluster_config)
kubectl_install_instance.run()

# check the registered api resources
common.execute_shell_retry("kubectl api-resources", "kubectl command failed!", 5)

# create kube-proxy only after the daemonset resource is registered
common.execute_shell_retry("kubectl api-resources | grep -q daemonsets", "Controller manager hasn't create daemonset object!", 5)

self.create_kube_proxy()
self.create_k8s_dashboard()

Expand Down
7 changes: 3 additions & 4 deletions deployment/k8sPaiLibrary/maintaintool/kubectl-install.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,11 @@

mkdir -p ~/.kube

[[ -f "/usr/local/bin/kubectl" ]] &&
{
if which kubectl > /dev/null; then
echo "kubectl has been installed."
echo "Skip this precess"
exit 0
}
fi

set -e

Expand All @@ -38,4 +37,4 @@ pathofusllocalbin="/usr/local/bin"
mkdir -p /usr/local/bin
}

mv ./kubectl /usr/local/bin/kubectl
mv ./kubectl /usr/local/bin/kubectl
8 changes: 6 additions & 2 deletions deployment/k8sPaiLibrary/template/config.template
Original file line number Diff line number Diff line change
Expand Up @@ -16,17 +16,21 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

apiVersion: v1
kind: Config
preferences: {}

clusters:
- cluster:
insecure-skip-tls-verify: true
server: http://{{ clusterconfig['api-servers-ip'] }}:8080
name: kubernetes

contexts:
- context:
cluster: kubernetes
user: admin
name: kubernetes

current-context: kubernetes
kind: Config
preferences: {}

users: []
2 changes: 1 addition & 1 deletion deployment/k8sPaiLibrary/template/etcd.yaml.template
Original file line number Diff line number Diff line change
Expand Up @@ -54,5 +54,5 @@ spec:
name: varetcd
volumes:
- hostPath:
path: /var/etcd/data
path: {{ clusterconfig['etcd-data-path'] }}
name: varetcd
9 changes: 8 additions & 1 deletion deployment/k8sPaiLibrary/template/kubelet.sh.template
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ docker run \
--volume=/sys:/sys:rw \
--volume=/dev:/dev:rw \
--volume=/var/lib/docker/:/var/lib/docker:rw \
--volume=/var/lib/kubelet/:/var/lib/kubelet:rw \
--volume=/var/lib/kubelet/:/var/lib/kubelet:rw,shared \
--volume=/etc/resolv.conf:/etc/resolv.conf:rw \
--volume=/var/run:/var/run:rw \
--volume=/var/log:/var/log:rw \
Expand All @@ -46,6 +46,13 @@ docker run \
--pod-manifest-path=/etc/kubernetes/manifests \
--allow-privileged=true \
--logtostderr=true \
{% if 'pai-master' in hostcofig -%}
--node-labels pai-master=true \
{% endif -%}
{% if 'pai-worker' in hostcofig -%}
--node-labels pai-worker=true \
{% endif -%}
--pod-infra-container-image {{ clusterconfig['dockerregistry'] }}/pause-amd64:3.0 \
--eviction-hard="memory.available<5%,nodefs.available<5%,imagefs.available<5%,nodefs.inodesFree<5%,imagefs.inodesFree<5%" \
--image-pull-progress-deadline=10m \
--v=2
63 changes: 63 additions & 0 deletions deployment/k8sPaiLibrary/template/kubernetes-cleanup.template
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
#!/bin/bash

# Copyright (c) Microsoft Corporation
# All rights reserved.
#
# MIT License
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
# documentation files (the "Software"), to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
# to permit persons to whom the Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
# BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
# DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

# Removes the Kubernetes components from this node: the kubelet container,
# every container whose `docker ps -a` line matches "k8s_", and the local
# Kubernetes state directories and deployment artifacts listed below.
#
# The {{ ... }} placeholder is Jinja syntax and is expected to be rendered
# before this script is run.
#
# Usage: kubernetes-cleanup.sh [-f]
#   -f  additionally delete the etcd data directory.
option=${1:-}

# Keep the rendered path in one quoted variable so a value containing spaces
# or glob characters cannot be split into several rm operands.
etcd_data_path="{{ clusterconfig['etcd-data-path'] }}"

apt-get install -y gawk

docker stop kubelet
docker rm kubelet

# Container IDs contain no whitespace, so word-splitting the awk output is
# intentional here. The awk program is single-quoted so that $1 is awk's
# first field rather than this script's first argument.
for container_id in $(docker ps -a | awk '/k8s_/ {print $1}'); do
  docker kill "$container_id"
  docker rm "$container_id"
done

if [[ -d "/etc/kubernetes" ]]; then
  rm -rf /etc/kubernetes
fi

# The etcd data is only removed when the caller passed -f.
if [[ -d "$etcd_data_path" && "$option" == "-f" ]]; then
  rm -rf -- "$etcd_data_path"
fi

if [[ -d "/var/log/pods" ]]; then
  rm -rf /var/log/pods
fi

if [[ -d "/var/lib/kubelet/pods" ]]; then
  rm -rf /var/lib/kubelet/pods
fi

# Relative paths: resolved against the directory the script is started from.
if [[ -d "src" ]]; then
  rm -rf src
fi

if [[ -f "kubernetes.tar" ]]; then
  rm -rf kubernetes.tar
fi
Loading

0 comments on commit e430930

Please sign in to comment.