Skip to content

Commit

Permalink
Added Option to run nhc via slurm prolog
Browse files Browse the repository at this point in the history
  • Loading branch information
garvct committed Feb 21, 2023
1 parent 4564233 commit 2243159
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 15 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ NHC_PROLOG=1
NHC_EPILOG=0
AUTOSCALING=0
PROLOG_NOHOLD_REQUEUE=0
PROLOG_RUN_NHC=0
NHC_EXTRA_TEST_FILES="csc_nvidia_smi.nhc azure_cuda_bandwidth.nhc azure_gpu_app_clocks.nhc azure_gpu_ecc.nhc azure_gpu_persistence.nhc azure_ib_write_bw_gdr.nhc azure_nccl_allreduce_ib_loopback.nhc azure_ib_link_flapping.nhc azure_gpu_clock_throttling.nhc azure_cpu_drop_cache_mem.nhc azure_gpu_xid.nhc azure_nccl_allreduce.nhc azure_raid_health.nhc"

source $CYCLECLOUD_SPEC_PATH/files/common_functions.sh
Expand Down Expand Up @@ -109,7 +110,7 @@ function update_slurm_prolog_epilog() {
echo "Epilog=/sched/scripts/epilog.sh" >> $SLURM_CONF
fi
fi
echo "/sched/scripts/$script" >> /sched/scripts/${prolog_epilog}.sh
echo "/sched/scripts/$script $prolog_epilog $PROLOG_RUN_NHC" >> /sched/scripts/${prolog_epilog}.sh
}


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,13 +29,14 @@ do
done
}

prolog_epilog=$1
exclusive_node
exclusive_node_rc=$?

set_detached_mode 0
NHC_RC=0
if [ $exclusive_node_rc -eq 0 ]; then
echo "[Epilog] execute nhc" >> /var/log/nhc.log
# Tag the log entry with the caller name captured from $1 into prolog_epilog above.
echo "[$prolog_epilog] execute nhc" >> /var/log/nhc.log
sudo /usr/sbin/nhc
NHC_RC=$?
fi
Expand Down
Original file line number Diff line number Diff line change
@@ -1,24 +1,30 @@
#!/bin/bash

# Second positional argument: flag that is compared against 1 further down to
# decide whether this script launches NHC itself. Defaults to 0 when absent.
PROLOG_RUN_NHC=${2:-0}

# Block until the nhc executable exists at /usr/sbin/nhc, polling every
# 2 seconds and appending a log line to /var/log/nhc.log on each iteration.
while [ ! -f /usr/sbin/nhc ];do
sleep 2
echo "[Prolog] waiting for /usr/sbin/nhc" >> /var/log/nhc.log
done

# Take the second whitespace-separated field (the PID column of `ps -ef`) of the
# first process line that mentions /usr/sbin/nhc; the grep process itself is
# filtered out. pid is empty when no matching process is found.
pid=`ps -ef | grep -v grep | grep /usr/sbin/nhc | tr -s ' ' | cut -d ' ' -f2 | head -n 1`

while ps -p $pid > /dev/null 2>&1
do
sleep 10
TIMESTAMP=$(/bin/date '+%Y%m%d %H:%M:%S')
echo "${TIMESTAMP} [prolog] NHC processes still running" >> /var/log/nhc.log
done

TIMESTAMP=$(/bin/date '+%Y%m%d %H:%M:%S')
echo "${TIMESTAMP} [prolog] NHC processes finished and job can start" >> /var/log/nhc.log
if [ -n "$pid" ]; then
while ps -p $pid > /dev/null 2>&1
do
sleep 10
TIMESTAMP=$(/bin/date '+%Y%m%d %H:%M:%S')
echo "${TIMESTAMP} [prolog] NHC processes still running" >> /var/log/nhc.log
done

if [ -f /var/run/nhc/nhc.status ]; then
exit 1
else
exit 0
TIMESTAMP=$(/bin/date '+%Y%m%d %H:%M:%S')
echo "${TIMESTAMP} [prolog] NHC processes finished and job can start" >> /var/log/nhc.log
if [ -f /var/run/nhc/nhc.status ]; then
exit 1
else
exit 0
fi
elif [[ $PROLOG_RUN_NHC == 1 ]]; then
/sched/scripts/run_nhc.sh
fi

0 comments on commit 2243159

Please sign in to comment.