forked from Azure/azurehpc
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
10 changed files
with
491 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
# Diagnostics image: NCCL tests, OSU micro-benchmarks, Azure HPC diagnostics
# and Azure NHC health checks layered on an NVIDIA PyTorch base image.
ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:23.03-py3

FROM ${FROM_IMAGE_NAME}

# Refresh the package index and install in ONE layer so a cached "update"
# layer can never be combined with a later, different install list.
# DEBIAN_FRONTEND=noninteractive keeps tzdata from prompting during the build.
# The apt lists are deliberately left in place: the setup scripts run below
# install further packages without refreshing the index themselves.
RUN apt-get update \
 && DEBIAN_FRONTEND=noninteractive apt-get install -y \
        build-essential \
        hwloc \
        infiniband-diags \
        kmod \
        net-tools \
        openssh-server \
        pciutils \
        tzdata

# NCCL tests
COPY nccl-tests.sh .
RUN ./nccl-tests.sh
COPY ndv4-topo.xml .

# OSU micro-benchmarks
COPY osu-micro-benchmarks-7.0.1.sh .
RUN ./osu-micro-benchmarks-7.0.1.sh

# Azure HPC diagnostics
COPY hpc-diagnostics.sh .
RUN ./hpc-diagnostics.sh

# Azure NHC health checks plus the custom NCCL checks. The .nhc files and
# custom-test-setup.sh must be present before azurehpc-health-checks.sh runs,
# because that script copies them into the cloned repository.
COPY azure_nccl_allreduce.nhc .
COPY azure_nccl_allreduce_ib_loopback.nhc .
COPY custom-test-setup.sh .
COPY azurehpc-health-checks.sh .
RUN ./azurehpc-health-checks.sh

# Trailing slash: always treat the destination as a directory.
COPY nd96asr_v4.conf azurehpc-health-checks/conf/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
#!/bin/bash | ||
|
||
# Check for NVlink issues by running NCCL allreduce. | ||
|
||
# mpirun launch options used by collect_nccl_allreduce_data.
MPI_ARGS="-np 8 --map-by ppr:8:node -bind-to numa -mca coll_hcoll_enable 0 --allow-run-as-root"

#######################################
# Run all_reduce_perf once through mpirun and capture what it prints.
# Globals:
#   MPI_ARGS, LD_LIBRARY_PATH          (read)
#   TOPOFILE, MESSAGE_SIZE,
#   ENVIRON_VARS, NCCL_ARGS            (written)
#   nccl_allreduce_out                 (written) raw stdout of the run
#   nccl_allreduce_out_rc              (written) exit status of mpirun
#   nccl_allreduce_out_lines           (written) stdout split into lines
#   IFS                                (reset to space/tab/newline on success)
# Arguments:
#   $1 - NCCL topology file, exported as NCCL_TOPO_FILE
#   $2 - message size, used as both the -b and -e bound
# Returns:
#   0 on success; 1 if mpirun failed and die (defined outside this file)
#   returned control instead of exiting.
#######################################
function collect_nccl_allreduce_data() {
    TOPOFILE=$1
    MESSAGE_SIZE=$2

    ENVIRON_VARS="-x LD_LIBRARY_PATH=/usr/local/nccl-rdma-sharp-plugins/lib:$LD_LIBRARY_PATH -x NCCL_IB_PCI_RELAXED_ORDERING=1 -x UCX_IB_PCI_RELAXED_ORDERING=on -x UCX_TLS=tcp -x UCX_NET_DEVICES=eth0 -x CUDA_DEVICE_ORDER=PCI_BUS_ID -x NCCL_SOCKET_IFNAME=eth0 -x NCCL_TOPO_FILE=$TOPOFILE"
    NCCL_ARGS="-b $MESSAGE_SIZE -f 2 -g 1 -e $MESSAGE_SIZE -c 1"

    # Start from an empty result so a failed run can never leave the previous
    # run's lines behind for the caller to parse.
    nccl_allreduce_out_lines=()

    # mpirun and all_reduce_perf are called by absolute path (an earlier
    # variant loaded the mpi/hpcx environment module instead).
    # The three option strings are intentionally unquoted: each holds several
    # whitespace-separated arguments.
    # shellcheck disable=SC2086
    nccl_allreduce_out=$(/usr/local/mpi/bin/mpirun $MPI_ARGS $ENVIRON_VARS /workspace/nccl-tests/build/all_reduce_perf $NCCL_ARGS)
    nccl_allreduce_out_rc=$?
    if [[ $nccl_allreduce_out_rc != 0 ]]; then
        # Log what the run actually printed (previously a literal placeholder
        # string was logged, hiding the real output).
        log "$nccl_allreduce_out"
        die 1 "$FUNCNAME: nccl_allreduce returned error code $nccl_allreduce_out_rc"
        return 1
    fi

    # One array element per output line; mapfile does not glob-expand the
    # text the way an unquoted array assignment would.
    if [[ -n "$nccl_allreduce_out" ]]; then
        mapfile -t nccl_allreduce_out_lines <<< "$nccl_allreduce_out"
    fi
    IFS=$' \t\n'
}
|
||
#######################################
# NHC check: run the NCCL allreduce benchmark and require a minimum average
# bus bandwidth, retrying up to REPEATS times.
# Globals:
#   EXP_NCCL_ALLREDUCE_BW, REPEATS, TOPOFILE, MESSAGE_SIZE  (written)
#   avg_bus_bw                                             (written)
#   nccl_allreduce_out, nccl_allreduce_out_lines           (read; filled in
#       by collect_nccl_allreduce_data)
# Arguments:
#   $1 - expected minimum average bus bandwidth (GB/s)
#   $2 - number of attempts (default 1)
#   $3 - NCCL topology file
#   $4 - message size
# Returns:
#   0 in every case; failures are reported through die.
#######################################
function check_nccl_allreduce() {

    EXP_NCCL_ALLREDUCE_BW=$1
    REPEATS="${2:-1}"

    TOPOFILE=$3
    MESSAGE_SIZE=$4

    local iter line
    local -a fields

    for ((iter = 1; iter <= REPEATS; iter++)); do
        # Drop any value left over from a previous attempt or another check,
        # so a run without a bandwidth line cannot pass on stale data.
        avg_bus_bw=""
        collect_nccl_allreduce_data "$TOPOFILE" "$MESSAGE_SIZE"

        for line in "${nccl_allreduce_out_lines[@]}"; do
            if [[ "$line" == *FAILED* ]]; then
                log "$nccl_allreduce_out"
                die 1 "$FUNCNAME: NCCL allreduce, Out of bounds values failed"
                return 0
            fi
            if [[ "$line" == *bandwidth* ]]; then
                # The sixth whitespace-separated field of the first line
                # containing "bandwidth" is taken as the measured value.
                IFS=$' \t\n' read -ra fields <<< "$line"
                avg_bus_bw=${fields[5]}
                dbg "Measured Avg NCCL allreduce bus BW $avg_bus_bw GB/s (expected >=$EXP_NCCL_ALLREDUCE_BW GB/s)"
                break
            fi
        done

        # Compare as numbers. '<' inside [[ ]] compares strings, which ranks
        # "95.2" above "180" and "1000" below "230".
        if [[ -n "$avg_bus_bw" ]] &&
            awk -v got="$avg_bus_bw" -v want="$EXP_NCCL_ALLREDUCE_BW" \
                'BEGIN { exit !(got + 0 >= want + 0) }'; then
            return 0
        fi

        dbg "$nccl_allreduce_out"
        log "Iteration ${iter} of ${REPEATS} failed: NCCL allreduce bandwidth $avg_bus_bw GB/s < $EXP_NCCL_ALLREDUCE_BW GB/s"
    done

    die 1 "$FUNCNAME: NCCL allreduce, BUS BW (expected >=$EXP_NCCL_ALLREDUCE_BW GB/s, but measured $avg_bus_bw GB/s)"
    return 0
}
60 changes: 60 additions & 0 deletions
60
experimental/aks_ndv4/azure_nccl_allreduce_ib_loopback.nhc
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
#!/bin/bash | ||
|
||
# Check for IB issues by running NCCL allreduce disabling NCCL shared memory. | ||
|
||
#######################################
# Run all_reduce_perf once with NCCL shared-memory and P2P transports
# disabled (NCCL_SHM_DISABLE=1, NCCL_P2P_DISABLE=1) and capture its output.
# Globals:
#   LD_LIBRARY_PATH                              (read)
#   TOPOFILE, MESSAGE_SIZE, MPI_ARGS,
#   ENVIRON_VARS, NCCL_ARGS                      (written)
#   nccl_allreduce_ib_loopback_out               (written) raw stdout
#   nccl_allreduce_ib_loopback_out_rc            (written) mpirun exit status
#   nccl_allreduce_ib_loopback_out_lines         (written) stdout as lines
#   IFS                                          (reset to space/tab/newline
#                                                 on success)
# Arguments:
#   $1 - NCCL topology file, exported as NCCL_TOPO_FILE
#   $2 - message size, used as both the -b and -e bound
# Returns:
#   0 on success; 1 if mpirun failed and die (defined outside this file)
#   returned control instead of exiting.
#######################################
function collect_nccl_allreduce_ib_loopback_data() {

    TOPOFILE=$1
    MESSAGE_SIZE=$2
    MPI_ARGS="-np 8 --map-by ppr:8:node -bind-to numa -mca coll_hcoll_enable 0 --allow-run-as-root"
    ENVIRON_VARS="-x LD_LIBRARY_PATH=/usr/local/nccl-rdma-sharp-plugins/lib:$LD_LIBRARY_PATH -x NCCL_IB_PCI_RELAXED_ORDERING=1 -x UCX_IB_PCI_RELAXED_ORDERING=on -x UCX_TLS=tcp -x UCX_NET_DEVICES=eth0 -x CUDA_DEVICE_ORDER=PCI_BUS_ID -x NCCL_SOCKET_IFNAME=eth0 -x NCCL_NET_GDR_LEVEL=5 -x NCCL_TOPO_FILE=$TOPOFILE -x NCCL_SHM_DISABLE=1 -x NCCL_P2P_DISABLE=1"
    NCCL_ARGS="-b $MESSAGE_SIZE -f 2 -g 1 -e $MESSAGE_SIZE -c 1"

    # Start from an empty result so a failed run can never leave the previous
    # run's lines behind for the caller to parse.
    nccl_allreduce_ib_loopback_out_lines=()

    # The three option strings are intentionally unquoted: each holds several
    # whitespace-separated arguments.
    # shellcheck disable=SC2086
    nccl_allreduce_ib_loopback_out=$(/usr/local/mpi/bin/mpirun $MPI_ARGS $ENVIRON_VARS /workspace/nccl-tests/build/all_reduce_perf $NCCL_ARGS)
    nccl_allreduce_ib_loopback_out_rc=$?
    if [[ $nccl_allreduce_ib_loopback_out_rc != 0 ]]; then
        # Log what the run actually printed (previously a literal placeholder
        # string was logged, hiding the real output).
        log "$nccl_allreduce_ib_loopback_out"
        die 1 "$FUNCNAME: nccl_allreduce (IB loopback) returned error code $nccl_allreduce_ib_loopback_out_rc"
        return 1
    fi

    # One array element per output line; mapfile does not glob-expand the
    # text the way an unquoted array assignment would.
    if [[ -n "$nccl_allreduce_ib_loopback_out" ]]; then
        mapfile -t nccl_allreduce_ib_loopback_out_lines <<< "$nccl_allreduce_ib_loopback_out"
    fi
    IFS=$' \t\n'

}
|
||
#######################################
# NHC check: run the IB-loopback NCCL allreduce benchmark and require a
# minimum average bus bandwidth, retrying up to REPEATS times.
# Globals:
#   EXP_NCCL_ALLREDUCE_IB_LOOPBACK_BW, REPEATS,
#   TOPOFILE, MESSAGE_SIZE                           (written)
#   avg_bus_bw                                       (written)
#   nccl_allreduce_ib_loopback_out,
#   nccl_allreduce_ib_loopback_out_lines             (read; filled in by
#       collect_nccl_allreduce_ib_loopback_data)
# Arguments:
#   $1 - expected minimum average bus bandwidth (GB/s)
#   $2 - number of attempts (default 1)
#   $3 - NCCL topology file
#   $4 - message size
# Returns:
#   0 in every case; failures are reported through die.
#######################################
function check_nccl_allreduce_ib_loopback() {

    EXP_NCCL_ALLREDUCE_IB_LOOPBACK_BW=$1
    REPEATS="${2:-1}"
    TOPOFILE=$3
    MESSAGE_SIZE=$4

    local iter line
    local -a fields

    for ((iter = 1; iter <= REPEATS; iter++)); do
        # Drop any value left over from a previous attempt or another check,
        # so a run without a bandwidth line cannot pass on stale data.
        avg_bus_bw=""
        collect_nccl_allreduce_ib_loopback_data "$TOPOFILE" "$MESSAGE_SIZE"

        for line in "${nccl_allreduce_ib_loopback_out_lines[@]}"; do
            if [[ "$line" == *bandwidth* ]]; then
                # The sixth whitespace-separated field of the first line
                # containing "bandwidth" is taken as the measured value.
                IFS=$' \t\n' read -ra fields <<< "$line"
                avg_bus_bw=${fields[5]}
                break
            fi
        done

        # Compare as numbers. '<' inside [[ ]] compares strings, which ranks
        # "9.5" above "18" and "100" below "18".
        if [[ -n "$avg_bus_bw" ]] &&
            awk -v got="$avg_bus_bw" -v want="$EXP_NCCL_ALLREDUCE_IB_LOOPBACK_BW" \
                'BEGIN { exit !(got + 0 >= want + 0) }'; then
            dbg "NCCL allreduce IB loopback bandwidth $avg_bus_bw GB/s"
            return 0
        fi

        dbg "$nccl_allreduce_ib_loopback_out"
        log "Iteration ${iter} of ${REPEATS} failed: NCCL allreduce IB loopback bandwidth $avg_bus_bw GB/s < $EXP_NCCL_ALLREDUCE_IB_LOOPBACK_BW GB/s"
    done

    die 1 "$FUNCNAME: NCCL allreduce, BUS BW (expected >=$EXP_NCCL_ALLREDUCE_IB_LOOPBACK_BW GB/s, but measured $avg_bus_bw GB/s)"
    return 0
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
#!/bin/bash
#
# Clone Azure NHC health checks, drop the custom NCCL tests and the custom
# test setup script into the checkout, and run the installer.
#
# Run from the directory holding custom-test-setup.sh and the two .nhc files;
# the absolute paths below assume that directory is /workspace.

nhc_dir=/workspace/azurehpc-health-checks

# Abort with a message. Without this the script's exit status would always be
# that of the final 'cd', hiding a failed clone or install from the caller.
fail() {
    printf 'azurehpc-health-checks.sh: %s\n' "$*" >&2
    exit 1
}

apt-get install -y sudo || fail "could not install sudo"
git clone https://github.com/Azure/azurehpc-health-checks.git ||
    fail "could not clone azurehpc-health-checks"

# Replace the repository's custom test setup with the local one and add the
# NCCL allreduce checks next to it.
cp custom-test-setup.sh "${nhc_dir}/customTests/custom-test-setup.sh" ||
    fail "could not install custom-test-setup.sh"
cp azure_nccl_allreduce.nhc azure_nccl_allreduce_ib_loopback.nhc \
    "${nhc_dir}/customTests" ||
    fail "could not install custom .nhc tests"

cd "${nhc_dir}" || fail "cannot cd to ${nhc_dir}"

# Best effort: a script missing from the checkout is not fatal here.
chmod 775 install-nhc.sh distributed_nhc.sb.sh run-health-checks.sh \
    customTests/custom-test-setup.sh

./install-nhc.sh || fail "install-nhc.sh failed"
cd /workspace
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,139 @@ | ||
#!/bin/bash
set -e

# Usage: custom-test-setup.sh [SRC_DIR] [EXE_DIR]

NVCC=/usr/local/cuda/bin/nvcc

# Location of any source files; defaults to the current directory.
SRC_DIR=${1:-.}

# Location where the executables are placed. Must match the custom tests.
EXE_DIR=${2:-/opt/azurehpc/test/nhc}

mkdir -p -- "$EXE_DIR"
|
||
#######################################
# Download the perftest release tarball into EXE_DIR and build it.
# Globals:
#   EXE_DIR (read) - directory that receives the perftest-<version> tree
# Arguments:
#   $1 - "cuda" to configure with CUDA_H_PATH; anything else (or nothing)
#        runs autogen.sh followed by a plain configure
# Outputs:
#   progress messages on stdout
# Returns:
#   1 if the distribution is neither Ubuntu nor AlmaLinux, or a directory
#   cannot be entered; otherwise the status of the last command.
#######################################
function install_perf_test(){
    # All working variables are local so nothing leaks into the caller.
    local type=$1
    local version=4.5-0.12
    local version_hash=ge93c538
    local distro perftest_dir archive_url

    # create perf-test executables
    if [[ "$type" == "cuda" ]]; then
        echo -e "Building PerfTest with CUDA"
    else
        echo -e "Building PerfTest"
    fi

    # Install the PCI development package under its distro-specific name.
    distro=$(awk -F= '/^NAME/{print $2}' /etc/os-release)
    if [[ $distro =~ "Ubuntu" ]]; then
        apt-get install -y libpci-dev
    elif [[ $distro =~ "AlmaLinux" ]]; then
        dnf install -y pciutils-devel
    else
        echo "OS version is not supported, Perf-test build skipped. Proceed w/ caution."
        return 1
    fi

    perftest_dir="perftest-${version}"
    archive_url="https://github.com/linux-rdma/perftest/releases/download/v${version}/perftest-${version}.${version_hash}.tar.gz"

    pushd "${EXE_DIR}" || return 1
    mkdir -p "${EXE_DIR}/${perftest_dir}"
    # Stream the tarball straight into the target directory, dropping the
    # archive's top-level directory.
    wget -q -O - "$archive_url" | tar -xz --strip=1 -C "${EXE_DIR}/${perftest_dir}"

    pushd "${perftest_dir}" || { popd; return 1; }
    if [[ "$type" == "cuda" ]]; then
        ./configure CUDA_H_PATH=/usr/local/cuda/include/cuda.h
    else
        ./autogen.sh
        ./configure
    fi

    make
    popd
    popd
}
|
||
|
||
# Nvidia installs: when an NVIDIA PCI device is present build the gpu-copy
# benchmark; otherwise build perftest and the Stream benchmark.
if lspci | grep -iq NVIDIA ; then
    # CUDA BW Test Setup
    # Test if nvcc is installed and if so install gpu-copy test.
    if [[ -f "$NVCC" ]]; then
        # Compile the gpu-copy benchmark.
        cufile="$SRC_DIR/gpu-copy.cu"
        outfile="$EXE_DIR/gpu-copy"

        # Test if the default gcc compiler is new enough to compile gpu-copy.
        # If it is not (or its version cannot be read) then use the 9.2
        # compiler, that should be installed in /opt.
        gcc_major=$(gcc -dumpversion | cut -d. -f1)
        if [[ "$gcc_major" =~ ^[0-9]+$ ]] && (( gcc_major > 6 )); then
            "$NVCC" -lnuma "$cufile" -o "$outfile"
        else
            "$NVCC" --compiler-bindir /opt/gcc-9.2.0/bin \
                -lnuma "$cufile" -o "$outfile"
        fi
    else
        # NOTE: despite the wording, the script carries on after this message.
        echo "$NVCC not found. Exiting setup"
    fi

    # install_perf_test "cuda"

else

    install_perf_test

    # Stream
    aocc_clang=/opt/AMD/aocc-compiler-4.0.0/bin/clang
    if command -v "$aocc_clang" &> /dev/null || command -v clang &> /dev/null; then
        echo -e "clang compiler found Building Stream"
        pushd "${SRC_DIR}/stream"
        if ! [[ -f "stream.c" ]]; then
            wget https://www.cs.virginia.edu/stream/FTP/Code/stream.c
        fi

        HB_HX_SKUS="standard_hb176rs_v4|standard_hb176-144rs_v4|standard_hb176-96rs_v4|standard_hb176-48rs_v4|standard_hb176-24rs_v4|standard_hx176rs|standard_hx176-144rs|standard_hx176-96rs|standard_hx176-48rs|standard_hx176-24rs"

        # Ask the instance metadata endpoint for the VM size. If the endpoint
        # cannot be reached, continue with an empty SKU so the default build
        # below is used instead of aborting under 'set -e'.
        SKU=$(curl -H Metadata:true --max-time 10 -s "http://169.254.169.254/metadata/instance/compute/vmSize?api-version=2021-01-01&format=text") || SKU=""
        SKU=$(tr '[:upper:]' '[:lower:]' <<< "$SKU")

        # Require a whole-entry match against the '|' separated list: a
        # substring test would also accept an empty or partial SKU.
        if [[ -n "$SKU" && "|${HB_HX_SKUS}|" == *"|${SKU}|"* ]]; then
            BUILD=ZEN4
        elif [[ "$SKU" == *hb120rs_v3* ]]; then
            BUILD=ZEN3
        elif [[ "$SKU" == *hb120rs_v2* ]]; then
            BUILD=ZEN2
        else
            #default to zen3 build
            BUILD=ZEN3
        fi

        # Prefer the AOCC clang when it is installed.
        if command -v "$aocc_clang" &> /dev/null; then
            make "$BUILD" CC="$aocc_clang" EXEC_DIR="$EXE_DIR"
        else
            make "$BUILD" CC=clang EXEC_DIR="$EXE_DIR"
        fi
        popd
    else
        echo "clang command not found. Skipping Stream build. Add clang to PATH ENV variable and rerun script to build Stream"
    fi

fi
|
||
# Ensure lstopo-no-graphics is installed for the azure_hw_topology_check.nhc
distro=$(awk -F= '/^NAME/{print $2}' /etc/os-release)
if [[ $distro =~ "Ubuntu" ]]; then
    apt-get install -y hwloc
elif [[ $distro =~ "AlmaLinux" ]]; then
    dnf install -y hwloc
else
    echo "OS version is not supported, azure_hw_topology_check will not work."
    # 'return' is only valid in a function or a sourced script; fall back to
    # 'exit' when this file is executed directly.
    return 1 2>/dev/null || exit 1
fi

# copy all custom test to the nhc scripts dir
cp -- "$SRC_DIR"/*.nhc /etc/nhc/scripts
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
#!/bin/bash

# Fetch the Azure HPC diagnostics repository into the current directory and
# make its collector script executable.
diag_repo_url=https://github.com/Azure/azhpc-diagnostics.git
diag_collector=azhpc-diagnostics/Linux/src/gather_azhpc_vm_diagnostics.sh

git clone "${diag_repo_url}"
chmod 775 "${diag_collector}"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
# Volcano Job that runs the Azure HPC diagnostics collector once.
# NOTE(review): the nesting below was reconstructed from the key order and the
# Kubernetes/Volcano schema because the indentation had been lost; confirm it
# against the original manifest.
apiVersion: batch.volcano.sh/v1alpha1
kind: Job
metadata:
  name: hpc-diags-job1
spec:
  minAvailable: 1
  schedulerName: volcano
  plugins:
    ssh: []
    svc: []
  tasks:
    - replicas: 1
      name: mpiworker
      template:
        spec:
          containers:
            - command:
                - /bin/bash
                - -c
                - |
                  chmod 775 /workspace/azhpc-diagnostics/Linux/src/gather_azhpc_vm_diagnostics.sh
                  echo -e "y" | /workspace/azhpc-diagnostics/Linux/src/gather_azhpc_vm_diagnostics.sh --dir=/tmp
              image: cgacr2.azurecr.io/pytorch_nccl_tests_2303_2:latest
              securityContext:
                capabilities:
                  add: ["IPC_LOCK"]
              name: mpiworker
              ports:
                - containerPort: 22
                  name: mpijob-port
              workingDir: /workspace
              resources:
                requests:
                  nvidia.com/mlnxnics: 8
                limits:
                  nvidia.com/mlnxnics: 8
              volumeMounts:
                - mountPath: /dev/shm
                  name: shm
                # The collector writes to --dir=/tmp, which is the host's /tmp
                # through the hostPath volume below.
                - mountPath: /tmp
                  name: tmp
          restartPolicy: OnFailure
          terminationGracePeriodSeconds: 0
          volumes:
            - name: shm
              emptyDir:
                medium: Memory
                sizeLimit: 8Gi
            - name: tmp
              hostPath:
                path: /tmp
                type: Directory
---
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
#!/bin/bash

# Clone NVIDIA's nccl-tests into the current directory and build it with MPI
# support against the MPI installation under /usr/local/mpi.

git clone https://github.com/NVIDIA/nccl-tests.git || exit 1

# Stop here if the checkout is missing, so make never runs in whatever
# directory the script happened to start in.
cd nccl-tests || exit 1
make MPI=1 MPI_HOME=/usr/local/mpi
Oops, something went wrong.