Merge pull request Azure#716 from Azure/app_pinning_tool_slurm

App pinning tool (slurm/srun integration)

garvct authored Apr 6, 2023
2 parents 0161fdc + 0440fcb commit 5da70fe
Showing 2 changed files with 147 additions and 9 deletions.
104 changes: 98 additions & 6 deletions experimental/check_app_pinning_tool/check_app_pinning.py
@@ -226,7 +226,7 @@ def one_numa(row_l):


def parse_lstopo():
-    cmd = ["lstopo-no-graphics", "--no-caches", "--taskset"]
+    cmd = ["lstopo-no-graphics", "--no-caches", "--taskset", "--whole-io"]
    try:
        cmdpipe = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except FileNotFoundError:
@@ -256,10 +256,10 @@ def parse_lstopo():
            row_l = row_s.split()
            core_id = re.findall(r'\d+',row_l[-2])[0]
            topo_d["numanode_ids"][numanode]["core_ids"].append(int(core_id))
-            if re.search(r'GPU.*card', row_s):
+            if re.search(r' {10,}GPU.*card', row_s):
                row_l = row_s.split()
                gpu_id = re.findall(r'\d+',row_l[-1])[0]
-                topo_d["numanode_ids"][numanode]["gpu_ids"].append(int(gpu_id))
+                topo_d["numanode_ids"][numanode]["gpu_ids"].append(int(gpu_id)-1)
    cmdpipe.stdout.close()
    cmdpipe.stderr.close()
    return topo_d
@@ -497,6 +497,80 @@ def calc_process_pinning(number_processes_per_vm, num_numa_domains, l3cache_topo
    return (pinning_l, number_processes_per_numa, number_cores_in_l3cache)


def calc_slurm_pinning(number_processes_per_numa, topo_2_d):
    # Select starting core ids round-robin over the L3 caches in each NUMA
    # domain: take the indx-th core of each cache in turn until enough
    # processes have been placed in that domain.
    slurm_pinning_l = []
    for numa_id in topo_2_d["numanode_ids"]:
        numa_pinning_l = []
        indx = 0
        while len(numa_pinning_l) < number_processes_per_numa:
            for l3cache_id in topo_2_d["numanode_ids"][numa_id]["l3cache_ids"]:
                if indx > len(topo_2_d["numanode_ids"][numa_id]["l3cache_ids"][l3cache_id])-1:
                    continue
                if len(numa_pinning_l) < number_processes_per_numa:
                    numa_pinning_l.append(topo_2_d["numanode_ids"][numa_id]["l3cache_ids"][l3cache_id][indx])
                else:
                    break
            indx += 1
        slurm_pinning_l += numa_pinning_l
    return (slurm_pinning_l)
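# Editor's example (hypothetical values): with number_processes_per_numa=2 and
# a NUMA domain whose two L3 caches hold cores [0,1,2,3] and [4,5,6,7], the
# loop above selects [0, 4] -- one starting core per L3 cache, round-robin.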


def calc_slurm_pin_range(slurm_pinning_l, num_threads):
    # Expand each starting core id into an inclusive "start-end" range wide
    # enough to hold num_threads consecutive cores.
    core_id_range_l = []
    for core_id in slurm_pinning_l:
        range_end = core_id + num_threads - 1
        core_id_range = str(core_id) + "-" + str(range_end)
        core_id_range_l.append(core_id_range)
    return core_id_range_l
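# Editor's example (hypothetical values): starting cores [0, 4] with
# num_threads=4 become the ranges ['0-3', '4-7'].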


def execute_cmd(cmd_l):
    # Run a command and return its captured stdout as text.
    proc = subprocess.Popen(cmd_l, stdout=subprocess.PIPE, universal_newlines=True)
    cmd_out, errs = proc.communicate()
    return cmd_out


def convert_range_to_mask(core_id_range_l):
    # Convert each core id range into a taskset-style hexadecimal cpu mask
    # via hwloc-calc, and join the masks into a comma-separated string.
    slurm_mask_str = ""
    for core_id_range in core_id_range_l:
        hwloc_calc_arg = 'core:' + core_id_range
        cmd_l = ['hwloc-calc', "--taskset", hwloc_calc_arg]
        hwloc_calc_out = execute_cmd(cmd_l)
        slurm_mask_str += "," + hwloc_calc_out.rstrip()
    return slurm_mask_str[1:]
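# Editor's example (assuming SMT is disabled, so core id N is CPU bit N):
#   hwloc-calc --taskset core:0-5  ->  0x3f
# so ['0-5', '6-11'] becomes "0x3f,0xfc0".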


def create_gpu_numa_mask_str(topo_d, total_num_gpus):
    # For each GPU id, emit the cpu mask of the NUMA domain hosting that GPU,
    # so a process bound to GPU k is confined to GPU k's NUMA domain.
    gpu_numa_mask_str = ""
    for gpu_id in range(0,total_num_gpus):
        for numa_id in topo_d["numanode_ids"]:
            gpu_ids_l = topo_d["numanode_ids"][numa_id]["gpu_ids"]
            if gpu_id in gpu_ids_l:
                gpu_numa_mask_str += "," + topo_d["numanode_ids"][numa_id]["mask"]
                break
    return gpu_numa_mask_str[1:]


def l3cache_id_in_numa(l3cache_l, numa_core_l):
    # Note: only the first core of the L3 cache is tested; this is sufficient
    # because an L3 cache never spans NUMA domains.
    for core_id in l3cache_l:
        if core_id in numa_core_l:
            return True
        else:
            return False


def create_topo_2_d(topo_d, l3cache_topo_d):
    # Augment the NUMA topology with the L3 caches belonging to each NUMA
    # domain (note: topo_2_d aliases topo_d rather than copying it).
    topo_2_d = topo_d
    for numa_id in topo_2_d["numanode_ids"]:
        topo_2_d["numanode_ids"][numa_id]["l3cache_ids"] = {}
        for l3cache_id in l3cache_topo_d["l3cache_ids"]:
            if l3cache_id_in_numa(l3cache_topo_d["l3cache_ids"][l3cache_id], topo_d["numanode_ids"][numa_id]["core_ids"]):
                topo_2_d["numanode_ids"][numa_id]["l3cache_ids"][l3cache_id] = l3cache_topo_d["l3cache_ids"][l3cache_id]

    return topo_2_d


def check_process_numa_distribution(total_num_processes, total_num_numa_domains, process_d):
    num_numa_domains = min(total_num_processes, total_num_numa_domains)
    numas_l = []
@@ -715,7 +789,7 @@ def check_number_threads_per_l3cache(number_processes_per_vm, number_threads_per
    return have_warning


-def report(app_pattern, print_pinning_syntax, topo_d, process_d, sku_name, l3cache_topo_d, number_cores_per_vm, total_number_vms, number_processes_per_vm, number_threads_per_process, pinning_syntax_l, number_processes_per_numa, number_cores_in_l3cache, mpi_type, have_warning, force, num_numas):
+def report(app_pattern, print_pinning_syntax, topo_d, process_d, sku_name, l3cache_topo_d, number_cores_per_vm, total_number_vms, number_processes_per_vm, number_threads_per_process, pinning_syntax_l, slurm_pinning_l, slurm_mask_str, number_processes_per_numa, number_cores_in_l3cache, mpi_type, have_warning, force, num_numas, total_num_gpus):
    hostname = socket.gethostname()
    print("")
    print("Virtual Machine ({}, {}) Numa topology".format(sku_name, hostname))
@@ -773,6 +847,15 @@ def report(app_pattern, print_pinning_syntax, topo_d, process_d, sku_name, l3cac
        else:
            az_mpi_args = "--bind-to l3cache --map-by ppr:{}:numa -report-bindings".format(number_processes_per_numa)
        print("mpirun -np {} {}".format(total_number_processes, az_mpi_args))
    elif mpi_type == "srun":
        if total_num_gpus == 0 or total_num_gpus != number_processes_per_vm:
            az_mpi_args = "--mpi=pmix --cpu-bind=mask_cpu:{} --ntasks-per-node={}".format(slurm_mask_str, number_processes_per_vm)
            print("core id pinning: {}\n".format(slurm_pinning_l))
            print("srun {}".format(az_mpi_args))
        else:
            gpu_numa_mask_str = create_gpu_numa_mask_str(topo_d, total_num_gpus)
            az_mpi_args = "--mpi=pmix --cpu-bind=mask_cpu:{} --ntasks-per-node={} --gpus-per-node={}".format(gpu_numa_mask_str, number_processes_per_vm, total_num_gpus)
            print("srun {}".format(az_mpi_args))
    elif mpi_type == "intel":
        num_l3cache = len(l3cache_topo_d["l3cache_ids"])
        if number_threads_per_process == 1:
@@ -799,6 +882,8 @@ def main():
    number_processes_per_vm = 0
    number_threads_per_process = 0
    pinning_l = []
    slurm_pinning_l = []
    slurm_mask_str = ""
    process_d = {}
    number_processes_per_numa = 0
    number_cores_in_l3cache = 0
@@ -813,7 +898,7 @@ def main():
parser.add_argument("-nv", "--total_number_vms", dest="total_number_vms", type=int, default=1, help="Total number of VM's (used with -pps)")
parser.add_argument("-nppv", "--number_processes_per_vm", dest="number_processes_per_vm", type=int, help="Total number of MPI processes per VM (used with -pps)")
parser.add_argument("-ntpp", "--number_threads_per_process", dest="number_threads_per_process", type=int, help="Number of threads per process (used with -pps)")
parser.add_argument("-mt", "--mpi_type", dest="mpi_type", type=str, choices=["openmpi","intel","mvapich2"], default="openmpi", help="Select which type of MPI to generate pinning syntax (used with -pps)")
parser.add_argument("-mt", "--mpi_type", dest="mpi_type", type=str, choices=["openmpi","intel","mvapich2","srun"], default="openmpi", help="Select which type of MPI to generate pinning syntax (used with -pps)(select srun when you are using a SLURM scheduler)")
args = parser.parse_args()
force = args.force
if len(sys.argv) > 1 and not args.application_pattern and not args.print_pinning_syntax:
@@ -857,7 +942,14 @@ def main():
        have_warning = check_pinning_syntax(number_processes_per_vm, number_threads_per_process, topo_d, l3cache_topo_d)
        (pinning_l, number_processes_per_numa, number_cores_in_l3cache) = calc_process_pinning(number_processes_per_vm, total_num_numa_domains, l3cache_topo_d)

-        report(args.application_pattern, args.print_pinning_syntax, topo_d, process_d, sku_name, l3cache_topo_d, number_cores_per_vm, total_number_vms, number_processes_per_vm, number_threads_per_process, pinning_l, number_processes_per_numa, number_cores_in_l3cache, mpi_type, have_warning, force, total_num_numa_domains)
        if mpi_type == "srun":
            if total_num_gpus == 0 or total_num_gpus != number_processes_per_vm:
                topo_2_d = create_topo_2_d(topo_d, l3cache_topo_d)
                slurm_pinning_l = calc_slurm_pinning(number_processes_per_numa, topo_2_d)
                slurm_pinning_l = calc_slurm_pin_range(slurm_pinning_l, number_threads_per_process)
                slurm_mask_str = convert_range_to_mask(slurm_pinning_l)

+        report(args.application_pattern, args.print_pinning_syntax, topo_d, process_d, sku_name, l3cache_topo_d, number_cores_per_vm, total_number_vms, number_processes_per_vm, number_threads_per_process, pinning_l, slurm_pinning_l, slurm_mask_str, number_processes_per_numa, number_cores_in_l3cache, mpi_type, have_warning, force, total_num_numa_domains, total_num_gpus)
    check_app(args.application_pattern, total_num_numa_domains, total_num_gpus, topo_d, process_d, l3cache_topo_d)


52 changes: 49 additions & 3 deletions experimental/check_app_pinning_tool/readme.md
@@ -1,7 +1,7 @@
# HPC Application process/thread mapping/pinning checking tool

Correct mapping/pinning of HPC Application processes/threads is critical for optimal performance.
-The HPC Application process/thread mapping/pinning checking tool has three main features, it allows you to quickly verify that the processes/threads associated with your HPC Application are mapped/pinned correctly/optimally, it can generate the MPI process/thread pinning syntax for OpenMPI/HPCX, Intel MPI and Mvapich2 (Currently for HPC VM's based on AMD processors (HB (v1,v2 & v3) and NDv4) and you can use this tool directly in an mpi run script to pass and use the optimal mpi pinning arguments. This tool shows you the virtual machine NUMA topology (i.e location of core id's, GPU's and NUMA domains), where the processes/threads associated with your HPC Application are mapped/pinned and warnings if they are not mapped/pinned optimally.
+The HPC Application process/thread mapping/pinning checking tool has three main features: it allows you to quickly verify that the processes/threads associated with your HPC application are mapped/pinned correctly and optimally; it can generate the MPI process/thread pinning syntax for OpenMPI/HPCX, Intel MPI and MVAPICH2 (currently for HPC VMs based on AMD processors: HB (v1, v2 & v3) and NDv4); and you can use this tool directly in an MPI run script (or slurm/srun) to pass and use the optimal MPI pinning arguments. This tool shows you the virtual machine NUMA topology (i.e. the location of core ids, GPUs and NUMA domains), where the processes/threads associated with your HPC application are mapped/pinned, and warnings if they are not mapped/pinned optimally.

## Prerequisites

@@ -34,9 +34,10 @@ optional arguments:
  -ntpp NUMBER_THREADS_PER_PROCESS, --number_threads_per_process NUMBER_THREADS_PER_PROCESS
                        Number of threads per process (used with -pps)
                        (default: None)
-  -mt {openmpi,intel,mvapich2}, --mpi_type {openmpi,intel,mvapich2}
+  -mt {openmpi,intel,mvapich2,srun}, --mpi_type {openmpi,intel,mvapich2,srun}
                        Select which type of MPI to generate pinning syntax
-                        (used with -pps) (default: openmpi)
+                        (used with -pps) (select srun when you are using a
+                        SLURM scheduler) (default: openmpi)
```
## Examples
You are on a Standard_HB120-64rs_v3 virtual machine and would like to know the correct HPCX pinning syntax to pin 16 MPI
@@ -259,3 +260,48 @@ To run 16 processes and 6 threads using Intel MPI on HB120-96rs_v3, just add -mt
check_app_pinning.py -pps -nv 1 -nppv 16 -ntpp $OMP_NUM_THREADS -mt intel
```
>Note: AZ_MPI_NP=16 and AZ_MPI_ARGS="-genv I_MPI_PIN_DOMAIN 6:compact -genv FI_PROVIDER mlx -genv I_MPI_COLL_EXTERNAL 1 -genv I_MPI_DEBUG 6"

Example of Slurm/srun integration: run 16 processes and 6 threads on HB120-96rs_v3 using srun with a Slurm scheduler.

```
#!/bin/bash
#SBATCH --mem=0
#SBATCH --ntasks-per-node=16
#SBATCH --exclusive
module load gcc-9.2.0
module load mpi/hpcx
export SLURM_CPU_BIND=verbose
export OMP_NUM_THREADS=6
check_app_pinning.py -pps -nv $SLURM_NNODES -nppv $SLURM_NTASKS_PER_NODE -ntpp $OMP_NUM_THREADS -mt srun
AZ_MPI_NP=$(cat AZ_MPI_NP)
AZ_MPI_ARGS=$(cat AZ_MPI_ARGS)
srun $AZ_MPI_ARGS mpi_executable
```
>Note: AZ_MPI_ARGS="--mpi=pmix --cpu-bind=mask_cpu:0x3f,0xfc0,0x3f000,0xfc0000,0x3f000000,0xfc0000000,0x3f000000000,0xfc0000000000,0x3f000000000000,0xfc0000000000000,0x3f000000000000000,0xfc0000000000000000,0x3f000000000000000000,0xfc0000000000000000000,0x3f000000000000000000000,0xfc0000000000000000000000 --ntasks-per-node=16"

Example of Slurm/srun integration: run 8 processes on NDm_A100_v4 using srun (Slurm scheduler).

```
#!/bin/bash
#SBATCH --mem=0
#SBATCH --ntasks-per-node=8
#SBATCH --exclusive
module load gcc-9.2.0
module load mpi/hpcx
export SLURM_CPU_BIND=verbose
export OMP_NUM_THREADS=1
check_app_pinning.py -pps -nv $SLURM_NNODES -nppv $SLURM_NTASKS_PER_NODE -ntpp $OMP_NUM_THREADS -mt srun
AZ_MPI_NP=$(cat AZ_MPI_NP)
AZ_MPI_ARGS=$(cat AZ_MPI_ARGS)
srun $AZ_MPI_ARGS mpi_executable
```
>Note: AZ_MPI_ARGS="--mpi=pmix --cpu-bind=mask_cpu:0xffffff000000,0xffffff000000,0xffffff,0xffffff,0xffffff000000000000000000,0xffffff000000000000000000,0xffffff000000000000,0xffffff000000000000 --ntasks-per-node=8 --gpus-per-node=8"
