Skip to content

Commit

Permalink
aks hpc-diagnostics and nhc
Browse files Browse the repository at this point in the history
  • Loading branch information
garvct committed Oct 8, 2023
1 parent f9766c2 commit d55cbc7
Show file tree
Hide file tree
Showing 10 changed files with 491 additions and 0 deletions.
26 changes: 26 additions & 0 deletions experimental/aks_ndv4/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:23.03-py3

FROM ${FROM_IMAGE_NAME}

# Install build/InfiniBand/diagnostic tooling in a single layer: the
# `apt-get update` runs in the same RUN as the installs (so the package
# lists are never stale relative to the install step), and the lists are
# removed afterwards to keep the image small.
RUN apt-get update \
    && DEBIAN_FRONTEND=noninteractive apt-get install -y \
        build-essential \
        infiniband-diags \
        openssh-server \
        kmod \
        net-tools \
        pciutils \
        tzdata \
        hwloc \
    && rm -rf /var/lib/apt/lists/*

# Build NVIDIA nccl-tests against the container's MPI.
COPY nccl-tests.sh .
RUN ./nccl-tests.sh

# NCCL topology file for ND96asr_v4 (NDv4) nodes.
COPY ndv4-topo.xml .

# Build the OSU micro-benchmarks.
COPY osu-micro-benchmarks-7.0.1.sh .
RUN ./osu-micro-benchmarks-7.0.1.sh

# Fetch the Azure HPC diagnostics gather script.
COPY hpc-diagnostics.sh .
RUN ./hpc-diagnostics.sh

# Stage the custom NHC tests, then install azurehpc-health-checks.
COPY azure_nccl_allreduce.nhc .
COPY azure_nccl_allreduce_ib_loopback.nhc .
COPY custom-test-setup.sh .
COPY azurehpc-health-checks.sh .
RUN ./azurehpc-health-checks.sh

# NHC configuration for the ND96asr_v4 SKU (trailing slash makes the
# directory destination explicit).
COPY nd96asr_v4.conf azurehpc-health-checks/conf/
69 changes: 69 additions & 0 deletions experimental/aks_ndv4/azure_nccl_allreduce.nhc
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
#!/bin/bash

# Check for NVLink issues by running NCCL allreduce.

# mpirun arguments shared by this check: 8 ranks (one per GPU) on the node,
# NUMA binding, hcoll disabled. Intentionally unquoted at the call site so
# the string word-splits into separate arguments.
MPI_ARGS="-np 8 --map-by ppr:8:node -bind-to numa -mca coll_hcoll_enable 0 --allow-run-as-root"

#######################################
# Run all_reduce_perf once and capture its output.
# Globals:
#   MPI_ARGS (read)
#   nccl_allreduce_out, nccl_allreduce_out_rc,
#   nccl_allreduce_out_lines (written)
# Arguments:
#   $1 - NCCL topology file (exported via NCCL_TOPO_FILE)
#   $2 - message size, used for both -b and -e (single size, -c 1 check)
# Side effects:
#   Calls die (NHC framework) on a non-zero mpirun exit code.
#######################################
function collect_nccl_allreduce_data() {
    TOPOFILE=$1
    MESSAGE_SIZE=$2

    ENVIRON_VARS="-x LD_LIBRARY_PATH=/usr/local/nccl-rdma-sharp-plugins/lib:$LD_LIBRARY_PATH -x NCCL_IB_PCI_RELAXED_ORDERING=1 -x UCX_IB_PCI_RELAXED_ORDERING=on -x UCX_TLS=tcp -x UCX_NET_DEVICES=eth0 -x CUDA_DEVICE_ORDER=PCI_BUS_ID -x NCCL_SOCKET_IFNAME=eth0 -x NCCL_TOPO_FILE=$TOPOFILE"
    NCCL_ARGS="-b $MESSAGE_SIZE -f 2 -g 1 -e $MESSAGE_SIZE -c 1"

    # MPI_ARGS/ENVIRON_VARS/NCCL_ARGS must stay unquoted: they are argument
    # strings that rely on word splitting.
    nccl_allreduce_out=$(/usr/local/mpi/bin/mpirun $MPI_ARGS $ENVIRON_VARS /workspace/nccl-tests/build/all_reduce_perf $NCCL_ARGS)
    nccl_allreduce_out_rc=$?
    if [[ $nccl_allreduce_out_rc != 0 ]]; then
        # Log the captured benchmark output for diagnosis (previously this
        # logged the literal string "nccl_allreduce_freq_out", discarding
        # the actual output).
        log "$nccl_allreduce_out"
        die 1 "$FUNCNAME: nccl_allreduce returned error code $nccl_allreduce_out_rc"
    fi
    # Split the captured output into lines, then restore the default IFS.
    IFS=$'\n'
    nccl_allreduce_out_lines=( $nccl_allreduce_out )
    IFS=$' \t\n'
}

#######################################
# NHC check: measured NCCL allreduce bus bandwidth must meet a threshold.
# Arguments:
#   $1 - expected minimum bus bandwidth in GB/s
#   $2 - number of attempts before failing (default 1)
#   $3 - NCCL topology file
#   $4 - message size
# Returns:
#   0 if any iteration meets the threshold; otherwise calls die (NHC).
#######################################
function check_nccl_allreduce() {

    EXP_NCCL_ALLREDUCE_BW=$1
    REPEATS="${2:-1}"

    TOPOFILE=$3
    MESSAGE_SIZE=$4

    for iter in $(seq 1 $REPEATS)
    do
        collect_nccl_allreduce_data "$TOPOFILE" "$MESSAGE_SIZE"

        # Reset so a parse failure on this iteration cannot silently reuse
        # a value from a previous iteration.
        avg_bus_bw=
        for ((i=0; i<${#nccl_allreduce_out_lines[*]}; i++))
        do
            # nccl-tests prints "... FAILED" when out-of-bounds values occur.
            if [[ "${nccl_allreduce_out_lines[$i]//FAILED}" != "${nccl_allreduce_out_lines[$i]}" ]]
            then
                log "$nccl_allreduce_out"
                die 1 "$FUNCNAME: NCCL allreduce, Out of bounds values failed"
                return 0
            fi
            # The summary line containing "bandwidth" carries the average
            # bus bandwidth in its 6th whitespace-separated field.
            if [[ "${nccl_allreduce_out_lines[$i]//bandwidth}" != "${nccl_allreduce_out_lines[$i]}" ]]
            then
                IFS=$' \t\n'
                nccl_allreduce_out_line=( ${nccl_allreduce_out_lines[$i]} )
                avg_bus_bw=${nccl_allreduce_out_line[5]}
                dbg "Measured Avg NCCL allreduce bus BW $avg_bus_bw GB/s (expected >=$EXP_NCCL_ALLREDUCE_BW GB/s)"
                break
            fi
        done

        if [[ -z "$avg_bus_bw" ]]
        then
            log "$nccl_allreduce_out"
            die 1 "$FUNCNAME: unable to parse average bus bandwidth from nccl-tests output"
            return 0
        fi

        # Numeric (floating point) comparison via awk. The original
        # [[ $a < $b ]] was a lexicographic string comparison, which is
        # wrong for numbers (e.g. "9.5" sorts after "100").
        if (( $(awk -v m="$avg_bus_bw" -v e="$EXP_NCCL_ALLREDUCE_BW" 'BEGIN {print (m < e)}') ))
        then
            dbg "$nccl_allreduce_out"
            log "Iteration ${iter} of ${REPEATS} failed: NCCL allreduce bandwidth $avg_bus_bw GB/s < $EXP_NCCL_ALLREDUCE_BW GB/s"
        else
            return 0
        fi
    done

    die 1 "$FUNCNAME: NCCL allreduce, BUS BW (expected >=$EXP_NCCL_ALLREDUCE_BW GB/s, but measured $avg_bus_bw GB/s)"
    return 0
}
60 changes: 60 additions & 0 deletions experimental/aks_ndv4/azure_nccl_allreduce_ib_loopback.nhc
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
#!/bin/bash

# Check for IB issues by running NCCL allreduce with NCCL shared-memory and
# P2P transports disabled, forcing traffic over the IB loopback path.

#######################################
# Run all_reduce_perf once with SHM/P2P disabled and capture its output.
# Globals:
#   nccl_allreduce_ib_loopback_out, nccl_allreduce_ib_loopback_out_rc,
#   nccl_allreduce_ib_loopback_out_lines (written)
# Arguments:
#   $1 - NCCL topology file (exported via NCCL_TOPO_FILE)
#   $2 - message size, used for both -b and -e
# Side effects:
#   Calls die (NHC framework) on a non-zero mpirun exit code.
#######################################
function collect_nccl_allreduce_ib_loopback_data() {

    TOPOFILE=$1
    MESSAGE_SIZE=$2
    MPI_ARGS="-np 8 --map-by ppr:8:node -bind-to numa -mca coll_hcoll_enable 0 --allow-run-as-root"
    ENVIRON_VARS="-x LD_LIBRARY_PATH=/usr/local/nccl-rdma-sharp-plugins/lib:$LD_LIBRARY_PATH -x NCCL_IB_PCI_RELAXED_ORDERING=1 -x UCX_IB_PCI_RELAXED_ORDERING=on -x UCX_TLS=tcp -x UCX_NET_DEVICES=eth0 -x CUDA_DEVICE_ORDER=PCI_BUS_ID -x NCCL_SOCKET_IFNAME=eth0 -x NCCL_NET_GDR_LEVEL=5 -x NCCL_TOPO_FILE=$TOPOFILE -x NCCL_SHM_DISABLE=1 -x NCCL_P2P_DISABLE=1"
    NCCL_ARGS="-b $MESSAGE_SIZE -f 2 -g 1 -e $MESSAGE_SIZE -c 1"

    # MPI_ARGS/ENVIRON_VARS/NCCL_ARGS rely on word splitting; keep unquoted.
    nccl_allreduce_ib_loopback_out=$(/usr/local/mpi/bin/mpirun $MPI_ARGS $ENVIRON_VARS /workspace/nccl-tests/build/all_reduce_perf $NCCL_ARGS)
    nccl_allreduce_ib_loopback_out_rc=$?
    if [[ $nccl_allreduce_ib_loopback_out_rc != 0 ]]; then
        # Log the captured benchmark output for diagnosis (previously this
        # logged a literal placeholder string, discarding the output).
        log "$nccl_allreduce_ib_loopback_out"
        die 1 "$FUNCNAME: nccl_allreduce (IB loopback) returned error code $nccl_allreduce_ib_loopback_out_rc"
    fi
    # Split the captured output into lines, then restore the default IFS.
    IFS=$'\n'
    nccl_allreduce_ib_loopback_out_lines=( $nccl_allreduce_ib_loopback_out )
    IFS=$' \t\n'

}

#######################################
# NHC check: NCCL allreduce bus bandwidth over IB loopback must meet a
# threshold.
# Arguments:
#   $1 - expected minimum bus bandwidth in GB/s
#   $2 - number of attempts before failing (default 1)
#   $3 - NCCL topology file
#   $4 - message size
# Returns:
#   0 if any iteration meets the threshold; otherwise calls die (NHC).
#######################################
function check_nccl_allreduce_ib_loopback() {

    EXP_NCCL_ALLREDUCE_IB_LOOPBACK_BW=$1
    REPEATS="${2:-1}"
    TOPOFILE=$3
    MESSAGE_SIZE=$4


    for iter in $(seq 1 $REPEATS)
    do
        collect_nccl_allreduce_ib_loopback_data "$TOPOFILE" "$MESSAGE_SIZE"

        # Reset so a parse failure cannot silently reuse a stale value.
        avg_bus_bw=
        for ((i=0; i<${#nccl_allreduce_ib_loopback_out_lines[*]}; i++))
        do
            # The summary line containing "bandwidth" carries the average
            # bus bandwidth in its 6th whitespace-separated field.
            if [[ "${nccl_allreduce_ib_loopback_out_lines[$i]//bandwidth}" != "${nccl_allreduce_ib_loopback_out_lines[$i]}" ]]
            then
                IFS=$' \t\n'
                nccl_allreduce_ib_loopback_out_line=( ${nccl_allreduce_ib_loopback_out_lines[$i]} )
                avg_bus_bw=${nccl_allreduce_ib_loopback_out_line[5]}
                break
            fi
        done

        if [[ -z "$avg_bus_bw" ]]
        then
            log "$nccl_allreduce_ib_loopback_out"
            die 1 "$FUNCNAME: unable to parse average bus bandwidth from nccl-tests output"
            return 0
        fi

        # Numeric (floating point) comparison via awk. The original
        # [[ $a < $b ]] was a lexicographic string comparison, which is
        # wrong for numbers (e.g. "9.5" sorts after "100").
        if (( $(awk -v m="$avg_bus_bw" -v e="$EXP_NCCL_ALLREDUCE_IB_LOOPBACK_BW" 'BEGIN {print (m < e)}') ))
        then
            dbg "$nccl_allreduce_ib_loopback_out"
            log "Iteration ${iter} of ${REPEATS} failed: NCCL allreduce IB loopback bandwidth $avg_bus_bw GB/s < $EXP_NCCL_ALLREDUCE_IB_LOOPBACK_BW GB/s"
        else
            dbg "NCCL allreduce IB loopback bandwidth $avg_bus_bw GB/s"
            return 0
        fi
    done

    die 1 "$FUNCNAME: NCCL allreduce, BUS BW (expected >=$EXP_NCCL_ALLREDUCE_IB_LOOPBACK_BW GB/s, but measured $avg_bus_bw GB/s)"
    return 0
}
16 changes: 16 additions & 0 deletions experimental/aks_ndv4/azurehpc-health-checks.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#!/bin/bash
# Install azurehpc-health-checks (NHC) and stage the custom AKS NDv4 tests.
# Intended to run from /workspace (the image's working directory) during the
# Docker build; any failing step now aborts the build instead of being
# silently ignored.
set -euo pipefail

apt-get install -y sudo
git clone https://github.com/Azure/azurehpc-health-checks.git

# Stage the custom tests where NHC's installer expects them.
cp custom-test-setup.sh /workspace/azurehpc-health-checks/customTests/custom-test-setup.sh
cp azure_nccl_allreduce.nhc /workspace/azurehpc-health-checks/customTests
cp azure_nccl_allreduce_ib_loopback.nhc /workspace/azurehpc-health-checks/customTests

cd /workspace/azurehpc-health-checks
chmod 775 install-nhc.sh \
    distributed_nhc.sb.sh \
    run-health-checks.sh \
    customTests/custom-test-setup.sh

./install-nhc.sh
cd /workspace
139 changes: 139 additions & 0 deletions experimental/aks_ndv4/custom-test-setup.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
#!/bin/bash
# Build and stage the custom NHC test executables.
set -e

# Path to the CUDA compiler used to build the gpu-copy benchmark.
NVCC=/usr/local/cuda/bin/nvcc

# $1 - location of any source files; defaults to the current directory.
SRC_DIR=${1:-.}

# $2 - location where executables are placed. Must match the paths the
# custom NHC tests expect.
EXE_DIR=${2:-/opt/azurehpc/test/nhc}

mkdir -p "$EXE_DIR"

#######################################
# Download and build linux-rdma perftest into ${EXE_DIR}.
# Globals:
#   EXE_DIR (read)
# Arguments:
#   $1 - "cuda" to build with CUDA support; anything else for a plain build
# Returns:
#   1 when the OS is unsupported (build skipped); otherwise the script's
#   set -e aborts on any failing build step.
#######################################
function install_perf_test(){
    local type=$1
    if [[ "$type" == "cuda" ]]; then
        echo -e "Building PerfTest with CUDA"
    else
        echo -e "Building PerfTest"
    fi

    local VERSION=4.5-0.12
    local VERSION_HASH=ge93c538

    # perftest's configure needs libpci headers; package name differs per OS.
    local distro
    distro=$(awk -F= '/^NAME/{print $2}' /etc/os-release)
    if [[ $distro =~ "Ubuntu" ]]; then
        apt-get install -y libpci-dev
    elif [[ $distro =~ "AlmaLinux" ]]; then
        dnf install -y pciutils-devel
    else
        echo "OS version is not supported, Perf-test build skipped. Proceed w/ caution."
        return 1
    fi

    pushd "${EXE_DIR}"
    local perftest_dir="perftest-${VERSION}"
    mkdir -p "${EXE_DIR}/${perftest_dir}"
    local archive_url="https://github.com/linux-rdma/perftest/releases/download/v${VERSION}/perftest-${VERSION}.${VERSION_HASH}.tar.gz"
    # Stream the release tarball straight into the build directory.
    wget -q -O - "$archive_url" | tar -xz --strip=1 -C "${EXE_DIR}/${perftest_dir}"

    pushd "${perftest_dir}"
    if [[ "$type" == "cuda" ]]; then
        # Release tarballs ship a pre-generated configure script.
        ./configure CUDA_H_PATH=/usr/local/cuda/include/cuda.h
    else
        ./autogen.sh
        ./configure
    fi

    make
    popd
    popd
}


# NVIDIA installs
if lspci | grep -iq NVIDIA ; then
    # CUDA BW test setup: if nvcc is installed, compile the gpu-copy
    # benchmark; otherwise just report and continue.
    if test -f "$NVCC"; then
        cufile="$SRC_DIR/gpu-copy.cu"
        outfile="$EXE_DIR/gpu-copy"

        # gpu-copy needs a gcc newer than 6; fall back to the 9.2 toolchain
        # expected in /opt when the default compiler is too old.
        if [ "$(gcc -dumpversion | cut -d. -f1)" -gt 6 ]; then
            $NVCC -lnuma "$cufile" -o "$outfile"
        else
            $NVCC --compiler-bindir /opt/gcc-9.2.0/bin \
                -lnuma "$cufile" -o "$outfile"
        fi
    else
        echo "$NVCC not found. Exiting setup"
    fi

    # install_perf_test "cuda"

else

    install_perf_test

    # Stream: build only when a clang compiler is available (AOCC preferred).
    if command -v /opt/AMD/aocc-compiler-4.0.0/bin/clang &> /dev/null || command -v clang &> /dev/null; then
        echo -e "clang compiler found Building Stream"
        pushd "${SRC_DIR}/stream"
        if ! [[ -f "stream.c" ]]; then
            wget https://www.cs.virginia.edu/stream/FTP/Code/stream.c
        fi

        # Pick the Stream build target from the VM SKU reported by IMDS.
        HB_HX_SKUS="standard_hb176rs_v4|standard_hb176-144rs_v4|standard_hb176-96rs_v4|standard_hb176-48rs_v4|standard_hb176-24rs_v4|standard_hx176rs|standard_hx176-144rs|standard_hx176-96rs|standard_hx176-48rs|standard_hx176-24rs"
        SKU=$( curl -H Metadata:true --max-time 10 -s "http://169.254.169.254/metadata/instance/compute/vmSize?api-version=2021-01-01&format=text")
        SKU=$(echo "$SKU" | tr '[:upper:]' '[:lower:]')

        if [[ "$HB_HX_SKUS" =~ "$SKU" ]]; then
            BUILD=ZEN4
        elif echo "$SKU" | grep "hb120rs_v3"; then
            BUILD=ZEN3
        elif echo "$SKU" | grep "hb120rs_v2"; then
            BUILD=ZEN2
        else
            # Default to the Zen3 build.
            BUILD=ZEN3
        fi

        if command -v /opt/AMD/aocc-compiler-4.0.0/bin/clang &> /dev/null; then
            make $BUILD CC=/opt/AMD/aocc-compiler-4.0.0/bin/clang EXEC_DIR="$EXE_DIR"
        else
            make $BUILD CC=clang EXEC_DIR="$EXE_DIR"
        fi
        popd
    else
        echo "clang command not found. Skipping Stream build. Add clang to PATH ENV variable and rerun script to build Stream"
    fi

fi

# Ensure lstopo-no-graphics is installed for azure_hw_topology_check.nhc.
distro=$(awk -F= '/^NAME/{print $2}' /etc/os-release)
if [[ $distro =~ "Ubuntu" ]]; then
    apt-get install -y hwloc
elif [[ $distro =~ "AlmaLinux" ]]; then
    dnf install -y hwloc
else
    echo "OS version is not supported, azure_hw_topology_check will not work."
    # 'return' is invalid at script top level (it aborts with an error under
    # set -e); exit non-zero instead, matching the apparent intent.
    exit 1
fi

# Copy all custom tests to the NHC scripts dir.
cp "$SRC_DIR"/*.nhc /etc/nhc/scripts
4 changes: 4 additions & 0 deletions experimental/aks_ndv4/hpc-diagnostics.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
#!/bin/bash
# Fetch the Azure HPC diagnostics tooling and make the gather script
# executable. Abort on any failure so a bad clone is not silently ignored
# (previously chmod would run — and fail — even if the clone failed).
set -e

git clone https://github.com/Azure/azhpc-diagnostics.git
chmod 775 azhpc-diagnostics/Linux/src/gather_azhpc_vm_diagnostics.sh
53 changes: 53 additions & 0 deletions experimental/aks_ndv4/hpc-diagnostics.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Volcano batch Job that runs the Azure HPC diagnostics gather script once
# on an AKS NDv4 node. (Indentation reconstructed — the copied source had
# lost all leading whitespace, which is structural in YAML.)
apiVersion: batch.volcano.sh/v1alpha1
kind: Job
metadata:
  name: hpc-diags-job1
spec:
  minAvailable: 1
  schedulerName: volcano
  plugins:
    ssh: []
    svc: []
  tasks:
    - replicas: 1
      name: mpiworker
      template:
        spec:
          containers:
            - command:
                - /bin/bash
                - -c
                - |
                  chmod 775 /workspace/azhpc-diagnostics/Linux/src/gather_azhpc_vm_diagnostics.sh
                  echo -e "y" | /workspace/azhpc-diagnostics/Linux/src/gather_azhpc_vm_diagnostics.sh --dir=/tmp
              image: cgacr2.azurecr.io/pytorch_nccl_tests_2303_2:latest
              securityContext:
                capabilities:
                  # Required for RDMA memory registration.
                  add: ["IPC_LOCK"]
              name: mpiworker
              ports:
                - containerPort: 22
                  name: mpijob-port
              workingDir: /workspace
              resources:
                requests:
                  # All 8 Mellanox NICs of the NDv4 node.
                  nvidia.com/mlnxnics: 8
                limits:
                  nvidia.com/mlnxnics: 8
              volumeMounts:
                - mountPath: /dev/shm
                  name: shm
                - mountPath: /tmp
                  name: tmp
          restartPolicy: OnFailure
          terminationGracePeriodSeconds: 0
          volumes:
            # Large /dev/shm for NCCL/MPI shared-memory transports.
            - name: shm
              emptyDir:
                medium: Memory
                sizeLimit: 8Gi
            # Diagnostics are written to the host's /tmp so they survive
            # the pod.
            - name: tmp
              hostPath:
                path: /tmp
                type: Directory
---
5 changes: 5 additions & 0 deletions experimental/aks_ndv4/nccl-tests.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#!/bin/bash
# Clone and build NVIDIA nccl-tests against the container's MPI install.
# set -e aborts on a failed clone/cd so make can never run in the wrong
# directory (the unchecked cd was ShellCheck SC2164).
set -e

git clone https://github.com/NVIDIA/nccl-tests.git
cd nccl-tests
make MPI=1 MPI_HOME=/usr/local/mpi
Loading

0 comments on commit d55cbc7

Please sign in to comment.