Skip to content

Commit

Permalink
Add shell scripts for common testing tasks
Browse files Browse the repository at this point in the history
This change introduces a set of shell scripts for performing common
end-to-end test functions. These include:
* installing the operator
* verifying completion of the installation
* installing a GPU workload
* verifying completion of the workload
* uninstalling components

Signed-off-by: Evan Lezar <[email protected]>
  • Loading branch information
elezar committed Jun 8, 2021
1 parent fb3fc47 commit 7bac5fa
Show file tree
Hide file tree
Showing 15 changed files with 339 additions and 0 deletions.
6 changes: 6 additions & 0 deletions tests/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# GPU operator test utilities

## Provided scripts / utilities

* `./tests/scripts/remote.sh [command]`: Execute `[command]` on the remote instance via SSH. This behaves the same as the `ssh` command and if no command is specified an interactive session is started.
* `./tests/scripts/remote.sh -t tmux new-session -A -s ${USER}`: Attach to (or create) a `tmux` session on the remote instance with a name matching the local user.
24 changes: 24 additions & 0 deletions tests/scripts/.definitions.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#!/bin/bash
# Shared definitions for the test scripts. This file is meant to be sourced:
# it enables errexit in the sourcing shell, enables xtrace when DEBUG is
# non-empty, resolves the directory layout and fills in defaults for any
# setting that is unset or empty.
set -e

if [[ -n "${DEBUG}" ]]; then
  set -x
fi

# Absolute paths, derived from the location of this file.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TEST_DIR="$(cd "${SCRIPT_DIR}/.." && pwd)"
PROJECT_DIR="$(cd "${TEST_DIR}/.." && pwd)"

# The terraform command is executed from the TERRAFORM_DIR
TERRAFORM_DIR="${PROJECT_DIR}/aws-kube-ci"
# Kept as a single string because the other scripts expand it unquoted,
# e.g. `${TERRAFORM} plan`.
TERRAFORM="terraform -chdir=${TERRAFORM_DIR}"

# Set default values if not defined. The expansions are quoted so that the
# resulting values are never word-split or glob-expanded by the `:` no-op.
: "${HELM:=helm}"
: "${LOG_DIR:=/tmp/logs}"
: "${PROJECT:=$(basename "${PROJECT_DIR}")}"
: "${TEST_NAMESPACE:=test-operator}"

: "${OPERATOR_IMAGE:=nvcr.io/nvidia/gpu-operator}"

# Run a command through remote.sh after changing into the ${PROJECT} folder.
# Globals:   SCRIPT_DIR (read), PROJECT (read)
# Arguments: the command to run and its arguments
# Returns:   the exit status of remote.sh
remote() {
  # "$*" (not "$@") keeps "cd <project> && <command>" as ONE argument; with
  # "$@" only the first word was attached to the `cd` and the remaining words
  # were passed as separate arguments.
  "${SCRIPT_DIR}/remote.sh" "cd ${PROJECT} && $*"
}
93 changes: 93 additions & 0 deletions tests/scripts/checks.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
#! /bin/bash

# Wait until every pod labelled app=<label> in the gpu-operator-resources
# namespace reports the Ready condition as "True".
# Arguments: $1 - value of the `app` label to select pods by
# Outputs:   progress information on stdout
# Returns:   0 once ready; exits the shell with status 1 after the
#            45 minute timeout has been exceeded
check_pod_ready() {
  local pod_label=$1
  local namespace="gpu-operator-resources"
  local poll_seconds=5
  local timeout_seconds=$((60 * 45))
  local elapsed=0
  local ready_states

  while :; do
    echo "Checking $pod_label pod"
    kubectl get pods -l "app=${pod_label}" -n "${namespace}"

    echo "Checking $pod_label pod readiness"
    # One line per matching pod, holding the status of its Ready condition.
    ready_states=$(kubectl get pods -l "app=${pod_label}" -n "${namespace}" \
      -o jsonpath='{range .items[*]}{.status.conditions[?(@.type=="Ready")].status}{"\n"}{end}')

    # Ready means: at least one pod matched and no line differs from "True".
    # Comparing the whole output with "True" only worked for a single pod.
    if [[ -n "${ready_states}" ]] && ! grep -qvx 'True' <<<"${ready_states}"; then
      echo "Pod $pod_label is ready"
      break
    fi

    if ((elapsed > timeout_seconds)); then
      echo "timeout reached"
      exit 1
    fi

    # Echo useful information on stdout
    kubectl get pods -n "${namespace}"

    echo "Sleeping 5 seconds"
    elapsed=$((elapsed + poll_seconds))
    sleep "${poll_seconds}"
  done
}

# Kill the operator container and wait for the operator pod to report the
# Running phase again. The phase is polled every 10 seconds, 10 times
# (100 seconds in total).
# Arguments: $1 - namespace in which the operator pod is looked up
# Outputs:   progress on stdout; the operator pod logs on timeout
# Returns:   0 once the pod phase is Running; exits the shell with status 1
#            when it is still not Running after the last poll
test_restart_operator() {
  local ns=${1}
  local selector="app.kubernetes.io/component=gpu-operator"
  local attempt state

  # The operator is the only container that has the string '"gpu-operator"'
  docker kill "$(docker ps --format '{{.ID}} {{.Command}}' | grep "gpu-operator" | cut -f 1 -d ' ')"

  for attempt in {1..10}; do
    # Sleep a reasonable amount of time for k8s to update the container status to crashing
    sleep 10

    state=$(kubectl get pods -n "${ns}" -l "${selector}" \
      -o jsonpath='{.items[0].status.phase}')

    echo "Checking state of the GPU Operator, it is: '$state'"
    if [[ "$state" == "Running" ]]; then
      return 0
    fi
  done

  echo "Timeout reached, the GPU Operator is still not ready. See below for logs:"
  # Fetch the logs from the namespace and pod that were being polled; the
  # namespace used to be hard-coded and the pod was the first one of any kind.
  kubectl logs -n "${ns}" "$(kubectl get pods -n "${ns}" -l "${selector}" \
    -o jsonpath='{.items[0].metadata.name}')"
  exit 1
}

# Wait until the gpu-operator-test pod reaches the Succeeded phase. While
# waiting, the description and logs of every pod in the cluster are written
# to the log directory on each poll.
# Arguments: $1 - directory that receives <pod>.describe, <pod>.logs and
#                 cluster.logs
# Outputs:   progress information on stdout
# Returns:   0 once the pod has succeeded; exits the shell with status 1
#            after the 45 minute timeout has been exceeded
check_gpu_pod_ready() {
  local log_dir=$1
  local poll_seconds=5
  local timeout_seconds=$((60 * 45))
  local elapsed=0
  local phase ns pod

  while :; do
    # A failing lookup is treated as "not succeeded yet" and retried.
    phase=$(kubectl get pods gpu-operator-test -o jsonpath='{.status.phase}') || phase=""
    if [[ "${phase}" == "Succeeded" ]]; then
      echo "GPU pod terminated successfully"
      break
    fi

    if ((elapsed > timeout_seconds)); then
      echo "timeout reached"
      exit 1
    fi

    # Echo useful information on stdout
    kubectl get pods --all-namespaces

    # The log files are only written on this path, so create the directory here.
    [[ -z "${log_dir}" ]] || mkdir -p "${log_dir}"

    # Iterate over namespace/name pairs so that every pod is queried in its
    # own namespace, even when the same pod name exists in two namespaces.
    # The list is read from fd 3 to keep stdin untouched for the commands
    # inside the loop.
    while IFS=$'\t' read -r -u 3 ns pod; do
      echo "Generating logs for pod: ${pod} ns: ${ns}"
      echo "------------------------------------------------" >> "${log_dir}/${pod}.describe"
      kubectl -n "${ns}" describe pods "${pod}" >> "${log_dir}/${pod}.describe"
      # Best effort: log retrieval is allowed to fail.
      kubectl -n "${ns}" logs "${pod}" --all-containers=true > "${log_dir}/${pod}.logs" || true
    done 3< <(kubectl get pods --all-namespaces \
      -o jsonpath='{range .items[*]}{.metadata.namespace}{"\t"}{.metadata.name}{"\n"}{end}')

    echo "Generating cluster logs"
    echo "------------------------------------------------" >> "${log_dir}/cluster.logs"
    kubectl get --all-namespaces pods >> "${log_dir}/cluster.logs"

    echo "Sleeping 5 seconds"
    elapsed=$((elapsed + poll_seconds))
    sleep "${poll_seconds}"
  done
}
23 changes: 23 additions & 0 deletions tests/scripts/end-to-end.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#!/bin/bash
# End-to-end test: install the operator, verify it, run a GPU workload,
# restart the operator and finally uninstall everything again.

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "${SCRIPT_DIR}/.definitions.sh"

# Install the operator and ensure that this works as expected
"${SCRIPT_DIR}/install-operator.sh"
"${SCRIPT_DIR}/verify-operator.sh"

# Verify the installation
# NOTE(review): this runs verify-operator.sh a second time directly after the
# call above; it looks redundant - confirm whether it is intentional.
"${SCRIPT_DIR}/verify-operator.sh"

# Install a workload and verify that this works as expected
"${SCRIPT_DIR}/install-workload.sh"
"${SCRIPT_DIR}/verify-workload.sh"

# TODO: This should be reusable
source "${SCRIPT_DIR}/checks.sh"
test_restart_operator "${TEST_NAMESPACE}"

# Uninstall the workload and operator
"${SCRIPT_DIR}/uninstall-workload.sh"
"${SCRIPT_DIR}/uninstall-operator.sh"
35 changes: 35 additions & 0 deletions tests/scripts/install-operator.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#!/bin/bash
# Install the GPU operator Helm chart from deployments/gpu-operator into the
# ${TEST_NAMESPACE} namespace.
# Environment:
#   SKIP_INSTALL              - "true" skips the whole script
#   OPERATOR_IMAGE            - its dirname is passed as operator.repository
#   OPERATOR_VERSION          - optional, passed as operator.version
#   OPERATOR_OPTIONS          - optional extra helm arguments, space separated
#   TOOLKIT_CONTAINER_IMAGE   - optional, passed as toolkit.image with
#                               toolkit.repository and toolkit.version blanked
#   TOOLKIT_CONTAINER_OPTIONS - optional extra helm arguments, space separated

if [[ "${SKIP_INSTALL}" == "true" ]]; then
  echo "Skipping install: SKIP_INSTALL=${SKIP_INSTALL}"
  exit 0
fi

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "${SCRIPT_DIR}/.definitions.sh"

OPERATOR_REPOSITORY=$(dirname "${OPERATOR_IMAGE}")

: "${OPERATOR_OPTIONS:=}"
OPERATOR_OPTIONS="${OPERATOR_OPTIONS} --set operator.repository=${OPERATOR_REPOSITORY}"

if [[ -n "${OPERATOR_VERSION}" ]]; then
  OPERATOR_OPTIONS="${OPERATOR_OPTIONS} --set operator.version=${OPERATOR_VERSION}"
fi

# We set up the options for the toolkit container
: "${TOOLKIT_CONTAINER_OPTIONS:=}"

if [[ -n "${TOOLKIT_CONTAINER_IMAGE}" ]]; then
  # NOTE(review): because the variable is expanded unquoted below, the escaped
  # quotes reach helm as literal characters. Confirm whether the chart relies
  # on that before changing the quoting.
  TOOLKIT_CONTAINER_OPTIONS="${TOOLKIT_CONTAINER_OPTIONS} --set toolkit.repository=\"\" --set toolkit.version=\"\" --set toolkit.image=\"${TOOLKIT_CONTAINER_IMAGE}\""
fi

# Create the test namespace
kubectl create namespace "${TEST_NAMESPACE}"

# Run the helm install command
# The option strings are expanded unquoted on purpose: each of them holds
# several space separated helm arguments.
# shellcheck disable=SC2086
${HELM} install "${PROJECT_DIR}/deployments/gpu-operator" --generate-name \
  -n "${TEST_NAMESPACE}" \
  ${OPERATOR_OPTIONS} \
  ${TOOLKIT_CONTAINER_OPTIONS} \
  --wait
12 changes: 12 additions & 0 deletions tests/scripts/install-workload.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#!/bin/bash
# Deploy the GPU test pod described by gpu-pod.yaml in the test directory.
# Environment: SKIP_INSTALL - "true" skips the whole script

if [[ "${SKIP_INSTALL}" == "true" ]]; then
  echo "Skipping install: SKIP_INSTALL=${SKIP_INSTALL}"
  exit 0
fi

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "${SCRIPT_DIR}/.definitions.sh"

echo "Deploy GPU pod"
kubectl apply -f "${TEST_DIR}/gpu-pod.yaml"
13 changes: 13 additions & 0 deletions tests/scripts/launch.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/bin/bash
# Run terraform plan, apply and output, in that order, using the ${TERRAFORM}
# command line defined in .definitions.sh.
# Environment: SKIP_LAUNCH - "true" skips the whole script

if [[ "${SKIP_LAUNCH}" == "true" ]]; then
  echo "Skipping launch: SKIP_LAUNCH=${SKIP_LAUNCH}"
  exit 0
fi

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "${SCRIPT_DIR}/.definitions.sh"

# ${TERRAFORM} holds the command plus its -chdir argument, so it is expanded
# unquoted on purpose.
# shellcheck disable=SC2086
${TERRAFORM} plan
# shellcheck disable=SC2086
${TERRAFORM} apply
# shellcheck disable=SC2086
${TERRAFORM} output
23 changes: 23 additions & 0 deletions tests/scripts/prerequisites.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#!/bin/bash
# Prepare the machine for the tests: create the log directory, load the
# required kernel modules and install jq and Helm.
# Environment: SKIP_PREREQUISITES - "true" skips the whole script

if [[ "${SKIP_PREREQUISITES}" == "true" ]]; then
  echo "Skipping prerequisites: SKIP_PREREQUISITES=${SKIP_PREREQUISITES}"
  exit 0
fi

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "${SCRIPT_DIR}/.definitions.sh"

echo "Create log dir ${LOG_DIR}"
mkdir -p "${LOG_DIR}"

export DEBIAN_FRONTEND=noninteractive

echo "Load kernel modules i2c_core and ipmi_msghandler"
sudo modprobe -a i2c_core ipmi_msghandler

echo "Install dependencies"
# Two separate commands: in `a && b` a failure of `a` is ignored by errexit,
# so a failed update used to go unnoticed.
sudo apt-get update
sudo apt-get install -y jq

echo "Install Helm"
# Download the installer completely before running it. With `curl | bash` a
# failed or truncated download was still executed and its failure ignored.
helm_installer=$(mktemp)
trap 'rm -f -- "${helm_installer}"' EXIT
curl -fsSL -o "${helm_installer}" \
  https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3
bash "${helm_installer}"
14 changes: 14 additions & 0 deletions tests/scripts/remote.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#!/bin/bash
# Run a command on the remote instance via ssh. All arguments are passed on
# to ssh after the host name; without arguments ssh is started with no command.
# Environment:
#   instance_hostname - ssh destination; read from the terraform output of the
#                       same name when empty
#   private_key       - value for `ssh -i`; read from the terraform output of
#                       the same name when empty

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "${SCRIPT_DIR}/.definitions.sh"

# Assignment and export are separate statements so that a failing terraform
# call is not masked by the exit status of `export`.
if [[ -z "${instance_hostname}" ]]; then
  # shellcheck disable=SC2086 — ${TERRAFORM} is a command line, split on purpose
  instance_hostname=$(${TERRAFORM} output -raw instance_hostname)
  export instance_hostname
fi

if [[ -z "${private_key}" ]]; then
  # shellcheck disable=SC2086
  private_key=$(${TERRAFORM} output -raw private_key)
  export private_key
fi

ssh -i "${private_key}" "${instance_hostname}" "${@}"
22 changes: 22 additions & 0 deletions tests/scripts/sync.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#!/bin/bash
# Copy the project folder to ~/${PROJECT} on the remote instance with rsync.
# Environment: SKIP_SYNC - "true" skips the whole script

if [[ "${SKIP_SYNC}" == "true" ]]; then
  echo "Skipping sync: SKIP_SYNC=${SKIP_SYNC}"
  exit 0
fi

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "${SCRIPT_DIR}/.definitions.sh"

# shellcheck disable=SC2086 — ${TERRAFORM} is a command line, split on purpose
instance_hostname=$(${TERRAFORM} output -raw instance_hostname)
# shellcheck disable=SC2086
private_key=$(${TERRAFORM} output -raw private_key)

# The tilde is deliberately quoted: it is sent to the remote side as is
# instead of being expanded to the local home directory.
# shellcheck disable=SC2088
REMOTE_PROJECT_FOLDER="~/${PROJECT}"
# TODO: Create an exclude file for this instead
# Copy over the contents of the project folder
rsync -e "ssh -i ${private_key} -o StrictHostKeyChecking=no" \
  -avz --delete \
  --exclude="vendor/" --exclude=".git" --exclude="aws-kube-ci" \
  "${PROJECT_DIR}/" \
  "${instance_hostname}:${REMOTE_PROJECT_FOLDER}"

17 changes: 17 additions & 0 deletions tests/scripts/uninstall-operator.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#!/bin/bash
# Uninstall the Helm release(s) whose `helm list` row contains "gpu-operator"
# and delete the ${TEST_NAMESPACE} namespace.
# Environment: SKIP_UNINSTALL - "true" skips the whole script

if [[ "${SKIP_UNINSTALL}" == "true" ]]; then
  echo "Skipping uninstall: SKIP_UNINSTALL=${SKIP_UNINSTALL}"
  exit 0
fi

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "${SCRIPT_DIR}/.definitions.sh"

# First column of every row of `helm list` that mentions gpu-operator.
OPERATOR_NAME=$(${HELM} list -n "${TEST_NAMESPACE}" | awk '/gpu-operator/ {print $1}')

# Run the helm uninstall command
if [[ -n "${OPERATOR_NAME}" ]]; then
  # Expanded unquoted on purpose: the variable holds one release name per
  # line when several rows matched.
  # shellcheck disable=SC2086
  ${HELM} uninstall -n "${TEST_NAMESPACE}" ${OPERATOR_NAME}
fi

# Remove the namespace; best effort, a failure does not fail the script.
kubectl delete namespace "${TEST_NAMESPACE}" || true
12 changes: 12 additions & 0 deletions tests/scripts/uninstall-workload.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#!/bin/bash
# Delete the gpu-operator-test pod.
# Environment: SKIP_UNINSTALL - "true" skips the whole script

if [[ "${SKIP_UNINSTALL}" == "true" ]]; then
  echo "Skipping uninstall: SKIP_UNINSTALL=${SKIP_UNINSTALL}"
  exit 0
fi

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "${SCRIPT_DIR}/.definitions.sh"

# Remove the test pod; best effort, a failure does not fail the script.
kubectl delete pod gpu-operator-test || true
12 changes: 12 additions & 0 deletions tests/scripts/uninstall.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#!/bin/bash
# Uninstall the workload and then the operator.
# Environment: SKIP_UNINSTALL - "true" skips the whole script

# This guard used to test SKIP_INSTALL, which looks like a copy from the
# install scripts; the scripts called below are guarded by SKIP_UNINSTALL.
if [[ "${SKIP_UNINSTALL}" == "true" ]]; then
  echo "Skipping uninstall: SKIP_UNINSTALL=${SKIP_UNINSTALL}"
  exit 0
fi

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "${SCRIPT_DIR}/.definitions.sh"

"${SCRIPT_DIR}/uninstall-workload.sh"
"${SCRIPT_DIR}/uninstall-operator.sh"
19 changes: 19 additions & 0 deletions tests/scripts/verify-operator.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#!/bin/bash
# Wait for the operator-managed pods to become ready, one label at a time.
# Environment: SKIP_VERIFY - "true" skips the whole script

# SKIP_VERIY is the original, misspelled name of the switch; it is still
# honoured when SKIP_VERIFY is unset or empty.
: "${SKIP_VERIFY:=${SKIP_VERIY}}"
if [[ "${SKIP_VERIFY}" == "true" ]]; then
  echo "Skipping verify: SKIP_VERIFY=${SKIP_VERIFY}"
  exit 0
fi

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "${SCRIPT_DIR}/.definitions.sh"

# Import the check definitions
source "${SCRIPT_DIR}/checks.sh"

# We verify that the pods of the operator have come up
check_pod_ready "nvidia-driver-daemonset"
check_pod_ready "nvidia-container-toolkit-daemonset"
check_pod_ready "nvidia-dcgm-exporter"
check_pod_ready "gpu-feature-discovery"
check_pod_ready "nvidia-operator-validator"
14 changes: 14 additions & 0 deletions tests/scripts/verify-workload.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#!/bin/bash
# Wait for the GPU test pod to finish; pod logs are collected in ${LOG_DIR}.
# Environment: SKIP_VERIFY - "true" skips the whole script

# SKIP_VERIY is the original, misspelled name of the switch; it is still
# honoured when SKIP_VERIFY is unset or empty.
: "${SKIP_VERIFY:=${SKIP_VERIY}}"
if [[ "${SKIP_VERIFY}" == "true" ]]; then
  echo "Skipping verify: SKIP_VERIFY=${SKIP_VERIFY}"
  exit 0
fi

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "${SCRIPT_DIR}/.definitions.sh"

# Import the check definitions
source "${SCRIPT_DIR}/checks.sh"

check_gpu_pod_ready "${LOG_DIR}"

0 comments on commit 7bac5fa

Please sign in to comment.