Merge pull request Azure#558 from Azure/gpu_monitoring_update3
Update GPU Monitoring scripts
garvct authored Jan 28, 2022
2 parents 3f95eaa + 0f078ef commit d47ebdf
Showing 6 changed files with 163 additions and 37 deletions.
127 changes: 95 additions & 32 deletions experimental/gpu_monitoring/gpu_data_collector.py
@@ -1,3 +1,5 @@
#!/usr/bin/python3

import json
import requests
import datetime
@@ -7,9 +9,13 @@
import subprocess
import socket
import os
import sys
import glob
import struct
import time
import argparse

# Some useful DCGM field IDs
#110: sm_app_clock (expect 1410 on A100, assume MHz)
#111: mem_app_clock (expect 1215 on A100, assume MHz)
#150: gpu_temp (in C)
@@ -20,16 +26,15 @@
#1007: fp32_active
#1008: fp16_active

dcgm_field_ids = '203,252,1004,1006,1007,1008'

# Update the customer ID to your Log Analytics workspace ID
customer_id = 'XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX'

# For the shared key, use either the primary or the secondary Connected Sources client authentication key
shared_key = "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
# You can also use LOG_ANALYTICS_CUSTOMER_ID environmental variable, if you set this variable here the environmental variable
# will be ignored.
#customer_id = 'XXXXXXXXXXX'

# The log type is the name of the event that is being submitted
log_type = 'MYGPUMonitor'
# For the shared key, use either the primary or the secondary Connected Sources client authentication key.
# You can also use LOG_ANALYTICS_SHARED_KEY environmental variable, if you set this variable here the environmental variable
# will be ignored.
#shared_key = 'XXXXXXXXXXXXX'


# Build the API signature
@@ -44,7 +49,7 @@ def build_signature(customer_id, shared_key, date, content_length, method, conte


# Build and send a request to the POST API
def post_data(customer_id, shared_key, body, log_type):
def post_data(customer_id, shared_key, body, name_log_event):
method = 'POST'
content_type = 'application/json'
resource = '/api/logs'
@@ -56,7 +61,7 @@ def post_data(customer_id, shared_key, body, log_type):
headers = {
'content-type': content_type,
'Authorization': signature,
'Log-Type': log_type,
'Log-Type': name_log_event,
'x-ms-date': rfc1123date
}

Expand All @@ -73,7 +78,7 @@ def execute_cmd(cmd_l):
return cmd_out


def find_long_field_name(field_name):
def find_long_field_name(field_name,dcgm_dmon_list_out):
for line in dcgm_dmon_list_out.splitlines():
line_split = line.split()
if field_name in line:
@@ -87,7 +92,7 @@ def num(s):
return float(s)


def create_data_records():
def create_data_records(dcgm_dmon_fields_out,hostname,have_jobid,physicalhostname_val,dcgm_dmon_list_out):
data_l = []
field_name_l = []
for line in dcgm_dmon_fields_out.splitlines():
@@ -98,10 +103,11 @@ def create_data_records():
record_d = {}
record_d['gpu_id'] = int(line_split[1])
record_d['hostname'] = hostname
record_d['slurm_jobid'] = slurm_jobid
if have_jobid:
record_d['slurm_jobid'] = slurm_jobid
record_d['physicalhostname'] = physicalhostname_val
for field_name in field_name_l:
long_field_name = find_long_field_name(field_name)
long_field_name = find_long_field_name(field_name,dcgm_dmon_list_out)
indx = field_name_l.index(field_name) + 2
record_d[long_field_name] = num(line_split[indx])
data_l.append(record_d)
@@ -111,8 +117,11 @@
def get_slurm_jobid():
if os.path.isdir('/sys/fs/cgroup/memory/slurm'):
file_l = glob.glob('/sys/fs/cgroup/memory/slurm/uid_*/job_*')
jobid = int(file_l[0].split("_")[2])
return (True, jobid)
if file_l:
jobid = int(file_l[0].split("_")[2])
return (True, jobid)
else:
return (False, None)
else:
return (False, None)

@@ -128,19 +137,73 @@ def get_physicalhostname():
key = key.split(b'\x00')
value = value.split(b'\x00')
if "PhysicalHostNameFullyQualified" in str(key[0]):
return str(value[0])[2:][:-1]



(have_jobid, slurm_jobid) = get_slurm_jobid()
if have_jobid:
hostname = socket.gethostname()
dcgm_dmon_fields_cmd_l = ['dcgmi', 'dmon', '-e', dcgm_field_ids, '-c', '1']
dcgm_dmon_list_cmd_l = ['dcgmi', 'dmon', '-l']
dcgm_dmon_fields_out = execute_cmd(dcgm_dmon_fields_cmd_l)
dcgm_dmon_list_out = execute_cmd(dcgm_dmon_list_cmd_l)
physicalhostname_val = get_physicalhostname()
data_l = create_data_records()
print(data_l)
body = json.dumps(data_l)
post_data(customer_id, shared_key, body, log_type)
return str(value[0])[2:][:-1]


def read_env_vars():
if 'customer_id' in globals():
customer_id = globals()['customer_id']
else:
if 'LOG_ANALYTICS_CUSTOMER_ID' in os.environ:
customer_id = os.environ['LOG_ANALYTICS_CUSTOMER_ID']
else:
sys.exit("Error: LOG_ANALYTICS_CUSTOMER_ID enviromental variable is not defined")
if 'shared_key' in globals():
shared_key = globals()['shared_key']
else:
if 'LOG_ANALYTICS_SHARED_KEY' in os.environ:
shared_key = os.environ['LOG_ANALYTICS_SHARED_KEY']
else:
sys.exit("Error: LOG_ANALYTICS_SHARED_KEY enviromental variable is not defined")

return (customer_id,shared_key)


def parse_args():
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument("-dfi", "--dcgm_field_ids", dest="dcgm_field_ids", type=str, default="203,252,1004", help="Select the DCGM field ids you would like to monitor (if multiple field ids are desired then separate by commas) [string]")
parser.add_argument("-nle", "--name_log_event", dest="name_log_event", type=str, default="MyGPUMonitor", help="Select a name for the log events you want to monitor")
parser.add_argument("-fgm", "--force_gpu_monitoring", action="store_true", help="Forces data to be sent to log analytics WS even if no SLURM job is running on the node")
parser.add_argument("-uc", "--use_crontab", action="store_true", help="This script will be started by the system contab and the time interval between each data collection will be decided by the system crontab (if crontab is selected then the -tis argument will be ignored).")
parser.add_argument("-tis", "--time_interval_seconds", dest="time_interval_seconds", type=int, default=30, help="The time interval in seconds between each data collection (This option cannot be used with the -uc argument)")
args = parser.parse_args()

if args.use_crontab:
use_crontab = True
else:
use_crontab = False
time_interval_seconds = args.time_interval_seconds
dcgm_field_ids = args.dcgm_field_ids
force_gpu_monitoring = args.force_gpu_monitoring
name_log_event = args.name_log_event

return (use_crontab,time_interval_seconds,dcgm_field_ids,force_gpu_monitoring,name_log_event)


def main():
(use_crontab,time_interval_seconds,dcgm_field_ids,force_gpu_monitoring,name_log_event) = parse_args()
(customer_id,shared_key) = read_env_vars()

while True:
(have_jobid, slurm_jobid) = get_slurm_jobid()

if have_jobid or force_gpu_monitoring:
hostname = socket.gethostname()
dcgm_dmon_fields_cmd_l = ['dcgmi', 'dmon', '-e', dcgm_field_ids, '-c', '1']
dcgm_dmon_list_cmd_l = ['dcgmi', 'dmon', '-l']
dcgm_dmon_fields_out = execute_cmd(dcgm_dmon_fields_cmd_l)
dcgm_dmon_list_out = execute_cmd(dcgm_dmon_list_cmd_l)
physicalhostname_val = get_physicalhostname()
data_l = create_data_records(dcgm_dmon_fields_out,hostname,have_jobid,physicalhostname_val,dcgm_dmon_list_out)
print(data_l)
body = json.dumps(data_l)
post_data(customer_id, shared_key, body, name_log_event)

if use_crontab:
break
else:
time.sleep(time_interval_seconds)


if __name__ == "__main__":
main()
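The body of build_signature is collapsed in this diff. For reference, a minimal sketch following the documented Log Analytics HTTP Data Collector API pattern (HMAC-SHA256 over the request metadata, keyed with the base64-decoded shared key) could look like this; the helper actually committed may differ in its details:

```
import base64
import hashlib
import hmac

def build_signature(customer_id, shared_key, date, content_length, method, content_type, resource):
    # Canonical string defined by the HTTP Data Collector API
    string_to_hash = f"{method}\n{content_length}\n{content_type}\nx-ms-date:{date}\n{resource}"
    # The shared key is base64-encoded; decode it before using it as the HMAC key
    decoded_key = base64.b64decode(shared_key)
    encoded_hash = base64.b64encode(
        hmac.new(decoded_key, string_to_hash.encode('utf-8'), digestmod=hashlib.sha256).digest()
    ).decode('utf-8')
    return f"SharedKey {customer_id}:{encoded_hash}"
```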
47 changes: 46 additions & 1 deletion experimental/gpu_monitoring/readme.md
@@ -3,7 +3,7 @@
GPU Monitoring is essential to get insights into how effectively your application is utilizing the GPU(s) and to monitor the health of the GPUs.

Basic GPU Monitoring is demonstrated utilizing Azure Monitor Log Analytics. The following scripts are provided: collect Data Center GPU Manager (DCGM) dmon metrics and send them to your Log Analytics workspace, start/stop GPU Monitoring (using crontab), and generate a load to test the GPU monitoring.
SLURM job ids are collected, so you can monitor for specific jobids. (Assumes exclusive jobs on nodes)
SLURM job IDs are collected, so you can monitor specific job IDs (this assumes jobs run exclusively on their nodes). The physical hostnames of the hosts on which the VMs are running are also recorded. You can use the system crontab to control the time interval for collecting data, or you can run the Python collection script directly and specify the collection time interval (see the -tis argument below).

## Prerequisites

@@ -14,6 +14,35 @@ SLURM job ids are collected, so you can monitor for specific jobids. (Assumes ex
- View/edit the scripts to ensure all paths are correct, the Log Analytics workspace customer_id/shared_key are updated in the scripts (or provided via environment variables, as shown below), the dmon fields to monitor are updated in the scripts, and the crontab interval is selected (default: every minute)
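
If you prefer not to hard-code the workspace credentials, the collector can also read them from environment variables (the variable names come from the script's read_env_vars helper; the placeholder values below are illustrative):

```
export LOG_ANALYTICS_CUSTOMER_ID="<workspace-id>"
export LOG_ANALYTICS_SHARED_KEY="<primary-or-secondary-key>"
```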

## GPU monitoring script options

```
./gpu_data_collector.py -h
usage: gpu_data_collector.py [-h] [-dfi DCGM_FIELD_IDS] [-nle NAME_LOG_EVENT]
[-fgm] [-uc] [-tis TIME_INTERVAL_SECONDS]
optional arguments:
-h, --help show this help message and exit
-dfi DCGM_FIELD_IDS, --dcgm_field_ids DCGM_FIELD_IDS
Select the DCGM field ids you would like to monitor
(if multiple field ids are desired then separate by commas)
[string] (default: 203,252,1004)
-nle NAME_LOG_EVENT, --name_log_event NAME_LOG_EVENT
Select a name for the log events you want to monitor
(default: MyGPUMonitor)
-fgm, --force_gpu_monitoring
Forces data to be sent to log analytics WS even if no
SLURM job is running on the node (default: False)
-uc, --use_crontab This script will be started by the system crontab and
the time interval between each data collection will be
decided by the system crontab (if crontab is selected
then the -tis argument will be ignored). (default:
False)
-tis TIME_INTERVAL_SECONDS, --time_interval_seconds TIME_INTERVAL_SECONDS
The time interval in seconds between each data
collection (This option cannot be used with the -uc
argument) (default: 30 sec)
```

## Usage
>Note: Please edit all scripts as outlined in the prerequisites
@@ -69,6 +98,8 @@ memory_clock MMCLK 101
etc
```
To start the GPU monitor on a list of nodes, use the start script below. The default collection time interval is 30 sec (the -tis argument) and the default DCGM GPU metrics collected are
GPU utilization (203), GPU memory used (252) and Tensor activity (1004). You can change these options; a direct invocation example follows.
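
For example, a direct run on a node with a 60-second interval and fp16_active (1008) added to the default fields might look like this (a hypothetical command line; the flags come from the script's argument parser):

```
./gpu_data_collector.py -dfi 203,252,1004,1008 -tis 60 -nle MyGPUMonitor
```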

Start the GPU monitor
>Note: Remember to edit the hostfile
@@ -80,6 +111,20 @@ Stop the gpu_monitor
```
./stop_gpu_data_collector.sh
```
>Note: The log file for gpu_data_collector.py is located in /tmp/gpu_data_collector.log.
Similarly, scripts are provided that use the system crontab to start the GPU data collector, with the time interval determined by the crontab parameters; in the case of crontab
the smallest timing interval is 60 sec (start_gpu_data_collector_cron.sh and stop_gpu_data_collector_cron.sh), as shown below.
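
For example (script names as provided in this repository; run them from the directory containing your hostlist):

```
./start_gpu_data_collector_cron.sh    # back up /etc/crontab and append the collection entry on each node
./stop_gpu_data_collector_cron.sh     # restore the original /etc/crontab on each node
```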

Go to your Log Analytics workspace to monitor your GPUs and generate dashboards.

A simple Log Analytics query to chart the average GPU utilization for a particular SLURM job would be:

```
MYGPUMonitor_CL
| where gpu_id_d in (0,1,2,3,4,5,6,7) and slurm_jobid_d == 17
| summarize avg(gpu_utilization_d) by bin(TimeGenerated, 5m)
| render timechart
```

![Alt text1](/experimental/gpu_monitoring/images/gpu-dash.png?raw=true "gpu-dash")
7 changes: 4 additions & 3 deletions experimental/gpu_monitoring/start_gpu_data_collector.sh
@@ -1,10 +1,11 @@
#!/bin/bash

HOSTLIST=hostlist
INTERVAL_MINS=1
INTERVAL_SECS=30
DCGM_FIELD_IDS="203,252,1004"
SCRIPT_PATH=~
EXE_PATH="python3 ${SCRIPT_PATH}/gpu_data_collector.py \>\> /tmp/gpu_data_collector.log"
EXE_PATH="${SCRIPT_PATH}/gpu_data_collector.py -tis $INTERVAL_SECS -dfi $DCGM_FIELD_IDS \>\> /tmp/gpu_data_collector.log"
PDSH_RCMD_TYPE=ssh


WCOLL=$HOSTLIST pdsh "if ! [ -f /etc/crontab.orig ]; then sudo cp /etc/crontab /etc/crontab.orig; echo "\*/$INTERVAL_MINS \\* \\* \\* \\* root $EXE_PATH" 2>&1 | sudo tee -a /etc/crontab;fi"
WCOLL=$HOSTLIST pdsh sudo $EXE_PATH
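pdsh reads the target nodes from the file named by HOSTLIST (here, hostlist) via the WCOLL environment variable; it is a plain text file with one hostname per line, for example (hypothetical node names):

```
gpu-vm-000001
gpu-vm-000002
```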
10 changes: 10 additions & 0 deletions experimental/gpu_monitoring/start_gpu_data_collector_cron.sh
@@ -0,0 +1,10 @@
#!/bin/bash

HOSTLIST=hostlist
INTERVAL_MINS=1
SCRIPT_PATH=~
EXE_PATH="${SCRIPT_PATH}/gpu_data_collector.py -uc \>\> /tmp/gpu_data_collector.log"
PDSH_RCMD_TYPE=ssh


WCOLL=$HOSTLIST pdsh "if ! [ -f /etc/crontab.orig ]; then sudo cp /etc/crontab /etc/crontab.orig; echo "\*/$INTERVAL_MINS \\* \\* \\* \\* root $EXE_PATH" 2>&1 | sudo tee -a /etc/crontab;fi"
2 changes: 1 addition & 1 deletion experimental/gpu_monitoring/stop_gpu_data_collector.sh
@@ -4,4 +4,4 @@ HOSTLIST=hostlist
PDSH_RCMD_TYPE=ssh


WCOLL=$HOSTLIST pdsh sudo mv /etc/crontab.orig /etc/crontab
WCOLL=$HOSTLIST pdsh sudo pkill gpu_data_collector
7 changes: 7 additions & 0 deletions experimental/gpu_monitoring/stop_gpu_data_collector_cron.sh
@@ -0,0 +1,7 @@
#!/bin/bash

HOSTLIST=hostlist
PDSH_RCMD_TYPE=ssh


WCOLL=$HOSTLIST pdsh sudo mv /etc/crontab.orig /etc/crontab
