Skip to content

Commit

Permalink
Added RAID health check test
Browse files Browse the repository at this point in the history
  • Loading branch information
vanzod committed Feb 7, 2023
1 parent b70a138 commit 80b9191
Show file tree
Hide file tree
Showing 4 changed files with 54 additions and 3 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#!/bin/bash

#######################################
# Check the health of the RAID array backing a given mount point.
#
# The check passes only when all of the following hold:
#   1. MOUNT_PATH appears as a mount point in /proc/mounts;
#   2. `mdadm --detail` succeeds on the device mounted there;
#   3. mdadm reports the array state as exactly "clean";
#   4. mdadm's "Working Devices" count equals ARRAY_SIZE;
#   5. a temporary file can be created in and removed from MOUNT_PATH.
#
# Relies on the helpers `die` and `dbg`, which are defined outside this file.
#
# Arguments:
#   $1 - mount point to check (e.g. /mnt/resource_nvme)
#   $2 - expected number of working devices in the array
# Returns:
#   0 if every step succeeds; 1 (after calling `die 1 ...`) otherwise.
#######################################
function check_raid()
{
    local MOUNT_PATH=$1
    local ARRAY_SIZE=$2
    local DEVICE RAID_DETAIL RAID_STATE RAID_DEVICES TESTFILE

    # Match the mount-point column exactly rather than grepping for a
    # substring/regex, so that e.g. /data/mnt/x is not mistaken for /mnt/x.
    # If several filesystems are stacked on the same mount point, the last
    # entry listed wins.
    if ! DEVICE=$(awk -v mp="${MOUNT_PATH}" \
            '$2 == mp { dev = $1 } END { if (dev == "") exit 1; print dev }' \
            /proc/mounts); then
        # `die` may return instead of exiting (presumably NHC check-all mode --
        # confirm against the NHC version in use), so always return explicitly
        # instead of chaining `die ... && return`.
        die 1 "${MOUNT_PATH} not a valid mount point"
        return 1
    fi
    dbg "Found device $DEVICE mounted at ${MOUNT_PATH}"

    # Query mdadm once and parse the captured report for every field needed.
    if ! RAID_DETAIL=$(mdadm --detail "${DEVICE}"); then
        die 1 "Device ${DEVICE} does not appear to be a RAID array"
        return 1
    fi

    # NOTE(review): only the exact state "clean" is accepted; any other value
    # (including compound states such as "clean, degraded") fails the check.
    RAID_STATE=$(awk '/State :/ {print $3}' <<< "${RAID_DETAIL}")
    if [[ "${RAID_STATE}" == 'clean' ]]; then
        dbg "RAID device ${DEVICE} is clean"
    else
        die 1 "RAID device ${DEVICE} is not in clean state (${RAID_STATE})"
        return 1
    fi

    RAID_DEVICES=$(awk '/Working Devices :/ {print $4}' <<< "${RAID_DETAIL}")
    if [[ "${RAID_DEVICES}" == "${ARRAY_SIZE}" ]]; then
        dbg "Found ${RAID_DEVICES} RAID devices in array ${DEVICE} "
    else
        die 1 "RAID device ${DEVICE} reported ${RAID_DEVICES} devices (expected ${ARRAY_SIZE})"
        return 1
    fi

    # Write probe: `-p DIR` without `-t` forces the file into MOUNT_PATH.
    # With `-t`, GNU mktemp prefers $TMPDIR over `-p`, which would move the
    # probe off the array whenever TMPDIR is set.
    if TESTFILE=$(mktemp -p "${MOUNT_PATH}" nhc_checkraid_XXXXXXXX.tmp); then
        dbg "File ${TESTFILE} successfully created"
    else
        # TESTFILE is empty on failure, so report the directory instead.
        die 1 "Cannot create file in ${MOUNT_PATH}"
        return 1
    fi

    if rm -- "${TESTFILE}"; then
        dbg "File ${TESTFILE} successfully deleted"
    else
        die 1 "Cannot delete file ${TESTFILE}"
        return 1
    fi

    return 0
}
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ SLURM_HEALTH_CHECK_NODE_STATE=IDLE
NHC_PROLOG=1
NHC_EPILOG=0
AUTOSCALING=0
NHC_EXTRA_TEST_FILES="csc_nvidia_smi.nhc azure_cuda_bandwidth.nhc azure_gpu_app_clocks.nhc azure_gpu_ecc.nhc azure_gpu_persistence.nhc azure_ib_write_bw_gdr.nhc azure_nccl_allreduce_ib_loopback.nhc azure_ib_link_flapping.nhc azure_gpu_clock_throttling.nhc azure_cpu_drop_cache_mem.nhc azure_gpu_xid.nhc azure_nccl_allreduce.nhc"
NHC_EXTRA_TEST_FILES="csc_nvidia_smi.nhc azure_cuda_bandwidth.nhc azure_gpu_app_clocks.nhc azure_gpu_ecc.nhc azure_gpu_persistence.nhc azure_ib_write_bw_gdr.nhc azure_nccl_allreduce_ib_loopback.nhc azure_ib_link_flapping.nhc azure_gpu_clock_throttling.nhc azure_cpu_drop_cache_mem.nhc azure_gpu_xid.nhc azure_nccl_allreduce.nhc azure_raid_health.nhc"

source $CYCLECLOUD_SPEC_PATH/files/common_functions.sh

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@
* || check_fs_iused /sched 100%
* || check_fs_iused /shared 98%
* || check_fs_iused /mnt/resource_nvme 98%

* || check_raid /mnt/resource_nvme 8

#######################################################################
###
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@
* || check_fs_iused /sched 100%
* || check_fs_iused /shared 98%
* || check_fs_iused /mnt/resource_nvme 98%

* || check_raid /mnt/resource_nvme 8

#######################################################################
###
Expand Down

0 comments on commit 80b9191

Please sign in to comment.