forked from NVIDIA/gpu-operator
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add shell scripts for common testing tasks
This change introduces a set of shell scripts for performing common end-to-end test functions. These include: * installing the operator * verifying completion of the installation * installing a GPU workload * verifying completion of the workload * uninstalling components Signed-off-by: Evan Lezar <[email protected]>
- Loading branch information
Showing
15 changed files
with
339 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
# GPU operator test utilities | ||
|
||
## Provided scripts / utilities | ||
|
||
* `./tests/scripts/remote.sh [command]`: Execute `[command]` on the remote instance via SSH. This behaves the same as the `ssh` command and if no command is specified an interactive session is started. | ||
* `./tests/scripts/remote.sh -t tmux new-session -A -s ${USER}`: Attach to (or create) a `tmux` session on the remote instance with a name matching the local user. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
#!/bin/bash
# Shared definitions for the test scripts. This file is meant to be sourced:
# it turns on `set -e` (and `set -x` when DEBUG is non-empty) in the sourcing
# shell, resolves the directory layout relative to this file and provides
# defaults for variables that the caller has not already set.
set -e

[[ -z "${DEBUG:-}" ]] || set -x

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
TEST_DIR="$( cd "${SCRIPT_DIR}/.." && pwd )"
PROJECT_DIR="$( cd "${TEST_DIR}/.." && pwd )"

# The terraform command is executed from the TERRAFORM_DIR
TERRAFORM_DIR=${PROJECT_DIR}/aws-kube-ci
# Kept as a plain string: callers expand ${TERRAFORM} unquoted so that it
# splits into the command and its -chdir flag.
TERRAFORM="terraform -chdir=${TERRAFORM_DIR}"

# Set default values if not defined. The expansions are quoted so that the
# resulting values are not subject to globbing or word splitting.
: "${HELM:=helm}"
: "${LOG_DIR:=/tmp/logs}"
: "${PROJECT:=$(basename "${PROJECT_DIR}")}"
: "${TEST_NAMESPACE:=test-operator}"

: "${OPERATOR_IMAGE:=nvcr.io/nvidia/gpu-operator}"
|
||
# Run a command on the remote instance from inside the ${PROJECT} folder.
# Globals:   SCRIPT_DIR (read), PROJECT (read)
# Arguments: the command to run; all arguments are joined with single spaces
#            into one remote command string.
function remote() {
  # "$*" (rather than "$@") keeps "cd ... && <command>" in a single argument
  # instead of spilling the 2nd..Nth words into separate arguments.
  "${SCRIPT_DIR}/remote.sh" "cd ${PROJECT} && $*"
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,93 @@ | ||
#! /bin/bash | ||
|
||
# Poll until every pod carrying the label app=<pod_label> in the
# gpu-operator-resources namespace reports a Ready condition of "True".
# Arguments: $1 - value of the "app" label to select pods by
# Outputs:   progress information on stdout
# Returns:   0 once the pods are ready; exits the shell with status 1 after
#            the elapsed-time counter exceeds 45 minutes.
check_pod_ready() {
  local pod_label=$1
  local current_time=0
  local statuses
  while :; do
    echo "Checking $pod_label pod"
    kubectl get pods -l "app=${pod_label}" -n gpu-operator-resources

    echo "Checking $pod_label pod readiness"
    # One line per matching pod, holding the status of its Ready condition.
    statuses=$(kubectl get pods -l "app=${pod_label}" -n gpu-operator-resources -ojsonpath='{range .items[*]}{.status.conditions[?(@.type=="Ready")].status}{"\n"}{end}')

    # Ready means: at least one pod was found and no line differs from
    # "True". Comparing the whole output against "True" would never succeed
    # when more than one pod matches the label.
    if [[ -n "${statuses}" ]] && ! grep -qvx 'True' <<<"${statuses}"; then
      echo "Pod $pod_label is ready"
      break
    fi

    if (( current_time > 60 * 45 )); then
      echo "timeout reached"
      exit 1
    fi

    # Echo useful information on stdout
    kubectl get pods -n gpu-operator-resources

    echo "Sleeping 5 seconds"
    current_time=$((current_time + 5))
    sleep 5
  done
}
|
||
# This function kills the operator and waits for the operator to be back in a running state
# Timeout is 100 seconds (10 polls, 10 seconds apart)
# Arguments: $1 - namespace the operator pod runs in
# Returns:   0 once the operator pod phase is "Running"; otherwise prints the
#            operator pod logs and exits the shell with status 1.
test_restart_operator() {
  local ns=${1}
  local selector="app.kubernetes.io/component=gpu-operator"
  local container_id state pod_name i

  # The operator is the only container that has the string '"gpu-operator"'
  container_id=$(docker ps --format '{{.ID}} {{.Command}}' | grep "gpu-operator" | cut -f 1 -d ' ')
  docker kill "${container_id}"

  for ((i = 1; i <= 10; i++)); do
    # Sleep a reasonable amount of time for k8s to update the container status to crashing
    sleep 10

    # A failing query counts as "not running yet" instead of aborting the poll.
    state=$(kubectl get pods -n "${ns}" -l "${selector}" \
      -o jsonpath='{.items[0].status.phase}') || state=""

    echo "Checking state of the GPU Operator, it is: '$state'"
    if [[ "$state" == "Running" ]]; then
      return 0
    fi
  done

  echo "Timeout reached, the GPU Operator is still not ready. See below for logs:"
  # Look the pod up with the same namespace and selector that were polled, and
  # read its logs from that namespace.
  pod_name=$(kubectl get pods -n "${ns}" -l "${selector}" \
    -o jsonpath='{.items[0].metadata.name}') || pod_name=""
  if [[ -n "${pod_name}" ]]; then
    kubectl logs -n "${ns}" "${pod_name}" || true
  fi
  exit 1
}
|
||
# Poll until the gpu-operator-test pod reaches phase "Succeeded". While
# waiting, dump `describe` output and container logs of every pod in the
# cluster, plus a cluster-wide pod listing, into the log directory.
# Arguments: $1 - directory that receives <pod>.describe, <pod>.logs and
#                 cluster.logs
# Returns:   0 once the pod succeeded; exits the shell with status 1 after
#            the elapsed-time counter exceeds 45 minutes.
check_gpu_pod_ready() {
  local log_dir=$1
  local current_time=0
  local status pods ns pod

  if [[ -n "${log_dir}" ]]; then
    mkdir -p -- "${log_dir}"
  fi

  while :; do
    # A failing query (e.g. pod not created yet) counts as "not done yet".
    status=$(kubectl get pods gpu-operator-test -o jsonpath='{.status.phase}') || status=""
    if [[ "${status}" == "Succeeded" ]]; then
      echo "GPU pod terminated successfully"
      break
    fi

    if (( current_time > 60 * 45 )); then
      echo "timeout reached"
      exit 1
    fi

    # Echo useful information on stdout
    kubectl get pods --all-namespaces

    # One "<namespace> <name>" pair per line, so each pod is addressed in its
    # own namespace without a second lookup by name.
    pods=$(kubectl get pods --all-namespaces \
      -o jsonpath='{range .items[*]}{.metadata.namespace}{" "}{.metadata.name}{"\n"}{end}') || pods=""

    # The list is read on fd 3 so commands in the loop body cannot consume it.
    while read -r ns pod <&3; do
      [[ -n "${pod}" ]] || continue
      echo "Generating logs for pod: ${pod} ns: ${ns}"
      echo "------------------------------------------------" >> "${log_dir}/${pod}.describe"
      # Best effort: a pod may disappear between listing and describing.
      kubectl -n "${ns}" describe pods "${pod}" >> "${log_dir}/${pod}.describe" || true
      kubectl -n "${ns}" logs "${pod}" --all-containers=true > "${log_dir}/${pod}.logs" || true
    done 3<<<"${pods}"

    echo "Generating cluster logs"
    echo "------------------------------------------------" >> "${log_dir}/cluster.logs"
    kubectl get --all-namespaces pods >> "${log_dir}/cluster.logs"

    echo "Sleeping 5 seconds"
    current_time=$((current_time + 5))
    sleep 5
  done
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
#!/bin/bash
# End-to-end test driver: installs the operator, verifies it, runs and
# verifies a GPU workload, restarts the operator and finally uninstalls
# everything. Sourcing .definitions.sh enables `set -e`, so the first failing
# step aborts the run.

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
source "${SCRIPT_DIR}/.definitions.sh"

# Install the operator and verify the installation (verification is run once;
# it used to be invoked twice back to back)
"${SCRIPT_DIR}/install-operator.sh"
"${SCRIPT_DIR}/verify-operator.sh"

# Install a workload and verify that this works as expected
"${SCRIPT_DIR}/install-workload.sh"
"${SCRIPT_DIR}/verify-workload.sh"

# TODO: This should be reusable
source "${SCRIPT_DIR}/checks.sh"
test_restart_operator "${TEST_NAMESPACE}"

# Uninstall the workload and operator
"${SCRIPT_DIR}/uninstall-workload.sh"
"${SCRIPT_DIR}/uninstall-operator.sh"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
#!/bin/bash
# Install the GPU operator helm chart from the project checkout into the test
# namespace.
# Environment:
#   SKIP_INSTALL               "true" skips the whole script
#   OPERATOR_IMAGE             its dirname is used as operator.repository
#   OPERATOR_VERSION           optional, sets operator.version
#   OPERATOR_OPTIONS           optional, extra helm arguments (space separated)
#   TOOLKIT_CONTAINER_IMAGE    optional, overrides the toolkit image
#   TOOLKIT_CONTAINER_OPTIONS  optional, extra helm arguments (space separated)

if [[ "${SKIP_INSTALL}" == "true" ]]; then
  echo "Skipping install: SKIP_INSTALL=${SKIP_INSTALL}"
  exit 0
fi

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
source "${SCRIPT_DIR}/.definitions.sh"

OPERATOR_REPOSITORY=$(dirname -- "${OPERATOR_IMAGE}")

: "${OPERATOR_OPTIONS:=}"
OPERATOR_OPTIONS="${OPERATOR_OPTIONS} --set operator.repository=${OPERATOR_REPOSITORY}"

if [[ -n "${OPERATOR_VERSION}" ]]; then
  OPERATOR_OPTIONS="${OPERATOR_OPTIONS} --set operator.version=${OPERATOR_VERSION}"
fi

# We set up the options for the toolkit container
: "${TOOLKIT_CONTAINER_OPTIONS:=}"

if [[ -n "${TOOLKIT_CONTAINER_IMAGE}" ]]; then
  # NOTE(review): the escaped double quotes are not removed by the shell when
  # the variable is expanded below, so helm receives them literally. Left
  # unchanged on purpose - confirm whether the chart depends on that.
  TOOLKIT_CONTAINER_OPTIONS="${TOOLKIT_CONTAINER_OPTIONS} --set toolkit.repository=\"\" --set toolkit.version=\"\" --set toolkit.image=\"${TOOLKIT_CONTAINER_IMAGE}\""
fi

# Create the test namespace
kubectl create namespace "${TEST_NAMESPACE}"

# Run the helm install command
# shellcheck disable=SC2086 # the option strings are split into words on purpose
${HELM} install "${PROJECT_DIR}/deployments/gpu-operator" --generate-name \
  -n "${TEST_NAMESPACE}" \
  ${OPERATOR_OPTIONS} \
  ${TOOLKIT_CONTAINER_OPTIONS} \
  --wait
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
#!/bin/bash
# Deploy the GPU test pod described by gpu-pod.yaml in the test directory.
# Environment: SKIP_INSTALL - "true" skips the whole script

if [[ "${SKIP_INSTALL}" == "true" ]]; then
  echo "Skipping install: SKIP_INSTALL=${SKIP_INSTALL}"
  exit 0
fi

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
source "${SCRIPT_DIR}/.definitions.sh"

echo "Deploy GPU pod"
kubectl apply -f "${TEST_DIR}/gpu-pod.yaml"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
#!/bin/bash
# Run terraform plan, apply and output against the terraform directory
# configured in .definitions.sh.
# Environment: SKIP_LAUNCH - "true" skips the whole script

if [[ "${SKIP_LAUNCH}" == "true" ]]; then
  echo "Skipping launch: SKIP_LAUNCH=${SKIP_LAUNCH}"
  exit 0
fi

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
source "${SCRIPT_DIR}/.definitions.sh"

# ${TERRAFORM} holds "terraform -chdir=<dir>" and is expanded unquoted so it
# splits into the command and its flag.
# shellcheck disable=SC2086
${TERRAFORM} plan
# shellcheck disable=SC2086
${TERRAFORM} apply
# shellcheck disable=SC2086
${TERRAFORM} output
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
#!/bin/bash
# Prepare a machine for the tests: create the log directory, load the kernel
# modules, install jq and install Helm 3.
# Environment: SKIP_PREREQUISITES - "true" skips the whole script

if [[ "${SKIP_PREREQUISITES}" == "true" ]]; then
  echo "Skipping prerequisites: SKIP_PREREQUISITES=${SKIP_PREREQUISITES}"
  exit 0
fi

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
source "${SCRIPT_DIR}/.definitions.sh"

echo "Create log dir ${LOG_DIR}"
mkdir -p "${LOG_DIR}"

export DEBIAN_FRONTEND=noninteractive

echo "Load kernel modules i2c_core and ipmi_msghandler"
sudo modprobe -a i2c_core ipmi_msghandler

echo "Install dependencies"
# apt-get is the interface intended for scripts; `apt` targets interactive use.
sudo apt-get update && sudo apt-get install -y jq

echo "Install Helm"
# -f makes curl fail on an HTTP error instead of handing the error page to
# bash; pipefail lets that failure decide the pipeline's exit status.
set -o pipefail
curl -fsSL https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 | bash
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
#!/bin/bash
# Open an SSH connection to the test instance; all arguments are handed to
# ssh after the host name.
# Environment:
#   instance_hostname  optional, defaults to the terraform output of that name
#   private_key        optional, defaults to the terraform output of that name
#                      (passed to `ssh -i`)

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
source "${SCRIPT_DIR}/.definitions.sh"

# Assignment and export are separate statements so that a failing terraform
# call is not hidden behind the exit status of `export`.
if [[ -z "${instance_hostname}" ]]; then
  # shellcheck disable=SC2086 # ${TERRAFORM} is split into words on purpose
  instance_hostname=$(${TERRAFORM} output -raw instance_hostname)
  export instance_hostname
fi

if [[ -z "${private_key}" ]]; then
  # shellcheck disable=SC2086 # ${TERRAFORM} is split into words on purpose
  private_key=$(${TERRAFORM} output -raw private_key)
  export private_key
fi

ssh -i "${private_key}" "${instance_hostname}" "${@}"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
#!/bin/bash
# Copy the project folder to the test instance with rsync over ssh.
# Environment: SKIP_SYNC - "true" skips the whole script

if [[ "${SKIP_SYNC}" == "true" ]]; then
  echo "Skipping sync: SKIP_SYNC=${SKIP_SYNC}"
  exit 0
fi

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
source "${SCRIPT_DIR}/.definitions.sh"

# shellcheck disable=SC2086 # ${TERRAFORM} is split into words on purpose
instance_hostname=$(${TERRAFORM} output -raw instance_hostname)
# shellcheck disable=SC2086
private_key=$(${TERRAFORM} output -raw private_key)

# The tilde is quoted, so it is not expanded locally; it is sent as-is as part
# of the remote path (presumably resolved on the remote host).
# shellcheck disable=SC2088
REMOTE_PROJECT_FOLDER="~/${PROJECT}"
# TODO: Create an exclude file for this instead
# Copy over the contents of the project folder
rsync -e "ssh -i ${private_key} -o StrictHostKeyChecking=no" \
  -avz --delete \
  --exclude="vendor/" --exclude=".git" --exclude="aws-kube-ci" \
  "${PROJECT_DIR}/" \
  "${instance_hostname}:${REMOTE_PROJECT_FOLDER}"
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
#!/bin/bash
# Uninstall the helm release(s) of the GPU operator from the test namespace
# and delete the namespace.
# Environment: SKIP_UNINSTALL - "true" skips the whole script

if [[ "${SKIP_UNINSTALL}" == "true" ]]; then
  echo "Skipping uninstall: SKIP_UNINSTALL=${SKIP_UNINSTALL}"
  exit 0
fi

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
source "${SCRIPT_DIR}/.definitions.sh"

# First column of every `helm list` line that mentions gpu-operator
OPERATOR_NAME=$(${HELM} list -n "${TEST_NAMESPACE}" | awk '/gpu-operator/ {print $1}')

# Run the helm uninstall command (only when a matching release was found)
if [[ -n "${OPERATOR_NAME}" ]]; then
  # shellcheck disable=SC2086 # one release name per word
  ${HELM} uninstall -n "${TEST_NAMESPACE}" ${OPERATOR_NAME}
fi

# Remove the namespace; a failure here is deliberately ignored
kubectl delete namespace "${TEST_NAMESPACE}" || true
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
#!/bin/bash
# Delete the GPU test pod.
# Environment: SKIP_UNINSTALL - "true" skips the whole script

if [[ "${SKIP_UNINSTALL}" == "true" ]]; then
  echo "Skipping uninstall: SKIP_UNINSTALL=${SKIP_UNINSTALL}"
  exit 0
fi

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
source "${SCRIPT_DIR}/.definitions.sh"

# Remove the test pod; a failure here is deliberately ignored
kubectl delete pod gpu-operator-test || true
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
#!/bin/bash
# Uninstall the workload and then the operator.
# Environment:
#   SKIP_UNINSTALL  "true" skips the whole script (same switch as the two
#                   scripts called below)
#   SKIP_INSTALL    "true" also skips; this is the variable this script
#                   originally checked and is still honored

if [[ "${SKIP_UNINSTALL}" == "true" || "${SKIP_INSTALL}" == "true" ]]; then
  echo "Skipping uninstall: SKIP_UNINSTALL=${SKIP_UNINSTALL} SKIP_INSTALL=${SKIP_INSTALL}"
  exit 0
fi

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
source "${SCRIPT_DIR}/.definitions.sh"

"${SCRIPT_DIR}/uninstall-workload.sh"
"${SCRIPT_DIR}/uninstall-operator.sh"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
#!/bin/bash
# Wait for the operator's operand pods to become ready.
# Environment:
#   SKIP_VERIFY  "true" skips the whole script
#   SKIP_VERIY   original (misspelled) name of the same switch; used as the
#                default for SKIP_VERIFY when that is unset or empty

: "${SKIP_VERIFY:=${SKIP_VERIY:-}}"
if [[ "${SKIP_VERIFY}" == "true" ]]; then
  echo "Skipping verify: SKIP_VERIFY=${SKIP_VERIFY}"
  exit 0
fi

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
source "${SCRIPT_DIR}/.definitions.sh"

# Import the check definitions
source "${SCRIPT_DIR}/checks.sh"

# We verify that the pods of the operator have come up
check_pod_ready "nvidia-driver-daemonset"
check_pod_ready "nvidia-container-toolkit-daemonset"
check_pod_ready "nvidia-dcgm-exporter"
check_pod_ready "gpu-feature-discovery"
check_pod_ready "nvidia-operator-validator"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
#!/bin/bash
# Wait for the GPU test pod to finish, collecting logs into LOG_DIR.
# Environment:
#   SKIP_VERIFY  "true" skips the whole script
#   SKIP_VERIY   original (misspelled) name of the same switch; used as the
#                default for SKIP_VERIFY when that is unset or empty

: "${SKIP_VERIFY:=${SKIP_VERIY:-}}"
if [[ "${SKIP_VERIFY}" == "true" ]]; then
  echo "Skipping verify: SKIP_VERIFY=${SKIP_VERIFY}"
  exit 0
fi

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
source "${SCRIPT_DIR}/.definitions.sh"

# Import the check definitions
source "${SCRIPT_DIR}/checks.sh"

check_gpu_pod_ready "${LOG_DIR}"