Merge pull request Azure#558 from Azure/gpu_monitoring_update3
Update GPU Monitoring scripts
garvct authored Jan 28, 2022
2 parents 3f95eaa + 0f078ef commit d47ebdf
Showing 6 changed files with 163 additions and 37 deletions.
127 changes: 95 additions & 32 deletions experimental/gpu_monitoring/gpu_data_collector.py
@@ -1,3 +1,5 @@
#!/usr/bin/python3

import json
import requests
import datetime
@@ -7,9 +9,13 @@
import subprocess
import socket
import os
import sys
import glob
import struct
import time
import argparse

# Some useful DCGM field IDs
#110: sm_app_clock (expect 1410 on A100, assume MHz)
#111: mem_app_clock (expect 1215 on A100, assume MHz)
#150: gpu_temp (in C)
@@ -20,16 +26,15 @@
#1007: fp32_active
#1008: fp16_active

dcgm_field_ids = '203,252,1004,1006,1007,1008'

# Update the customer ID to your Log Analytics workspace ID
customer_id = 'XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX'

# For the shared key, use either the primary or the secondary Connected Sources client authentication key
shared_key = "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
# You can also use LOG_ANALYTICS_CUSTOMER_ID environmental variable, if you set this variable here the environmental variable
# will be ignored.
#customer_id = 'XXXXXXXXXXX'

# The log type is the name of the event that is being submitted
log_type = 'MYGPUMonitor'
# For the shared key, use either the primary or the secondary Connected Sources client authentication key.
# You can also use LOG_ANALYTICS_SHARED_KEY environmental variable, if you set this variable here the environmental variable
# will be ignored.
#shared_key = 'XXXXXXXXXXXXX'


# Build the API signature
@@ -44,7 +49,7 @@ def build_signature(customer_id, shared_key, date, content_length, method, conte


# Build and send a request to the POST API
def post_data(customer_id, shared_key, body, log_type):
def post_data(customer_id, shared_key, body, name_log_event):
method = 'POST'
content_type = 'application/json'
resource = '/api/logs'
@@ -56,7 +61,7 @@ def post_data(customer_id, shared_key, body, log_type):
headers = {
'content-type': content_type,
'Authorization': signature,
'Log-Type': log_type,
'Log-Type': name_log_event,
'x-ms-date': rfc1123date
}

Expand All @@ -73,7 +78,7 @@ def execute_cmd(cmd_l):
return cmd_out


def find_long_field_name(field_name):
def find_long_field_name(field_name,dcgm_dmon_list_out):
for line in dcgm_dmon_list_out.splitlines():
line_split = line.split()
if field_name in line:
@@ -87,7 +92,7 @@ def num(s):
return float(s)


def create_data_records():
def create_data_records(dcgm_dmon_fields_out,hostname,have_jobid,physicalhostname_val,dcgm_dmon_list_out):
data_l = []
field_name_l = []
for line in dcgm_dmon_fields_out.splitlines():
@@ -98,10 +103,11 @@ def create_data_records():
record_d = {}
record_d['gpu_id'] = int(line_split[1])
record_d['hostname'] = hostname
record_d['slurm_jobid'] = slurm_jobid
if have_jobid:
record_d['slurm_jobid'] = slurm_jobid
record_d['physicalhostname'] = physicalhostname_val
for field_name in field_name_l:
long_field_name = find_long_field_name(field_name)
long_field_name = find_long_field_name(field_name,dcgm_dmon_list_out)
indx = field_name_l.index(field_name) + 2
record_d[long_field_name] = num(line_split[indx])
data_l.append(record_d)
@@ -111,8 +117,11 @@
def get_slurm_jobid():
if os.path.isdir('/sys/fs/cgroup/memory/slurm'):
file_l = glob.glob('/sys/fs/cgroup/memory/slurm/uid_*/job_*')
jobid = int(file_l[0].split("_")[2])
return (True, jobid)
if file_l:
jobid = int(file_l[0].split("_")[2])
return (True, jobid)
else:
return (False, None)
else:
return (False, None)

@@ -128,19 +137,73 @@ def get_physicalhostname():
key = key.split(b'\x00')
value = value.split(b'\x00')
if "PhysicalHostNameFullyQualified" in str(key[0]):
return str(value[0])[2:][:-1]



(have_jobid, slurm_jobid) = get_slurm_jobid()
if have_jobid:
hostname = socket.gethostname()
dcgm_dmon_fields_cmd_l = ['dcgmi', 'dmon', '-e', dcgm_field_ids, '-c', '1']
dcgm_dmon_list_cmd_l = ['dcgmi', 'dmon', '-l']
dcgm_dmon_fields_out = execute_cmd(dcgm_dmon_fields_cmd_l)
dcgm_dmon_list_out = execute_cmd(dcgm_dmon_list_cmd_l)
physicalhostname_val = get_physicalhostname()
data_l = create_data_records()
print(data_l)
body = json.dumps(data_l)
post_data(customer_id, shared_key, body, log_type)
return str(value[0])[2:][:-1]


def read_env_vars():
if 'customer_id' in globals():
customer_id = globals()['customer_id']
else:
if 'LOG_ANALYTICS_CUSTOMER_ID' in os.environ:
customer_id = os.environ['LOG_ANALYTICS_CUSTOMER_ID']
else:
sys.exit("Error: LOG_ANALYTICS_CUSTOMER_ID enviromental variable is not defined")
if 'shared_key' in globals():
shared_key = globals()['shared_key']
else:
if 'LOG_ANALYTICS_SHARED_KEY' in os.environ:
shared_key = os.environ['LOG_ANALYTICS_SHARED_KEY']
else:
sys.exit("Error: LOG_ANALYTICS_SHARED_KEY enviromental variable is not defined")

return (customer_id,shared_key)


def parse_args():
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument("-dfi", "--dcgm_field_ids", dest="dcgm_field_ids", type=str, default="203,252,1004", help="Select the DCGM field ids you would like to monitor (if multiple field ids are desired then separate by commas) [string]")
parser.add_argument("-nle", "--name_log_event", dest="name_log_event", type=str, default="MyGPUMonitor", help="Select a name for the log events you want to monitor")
parser.add_argument("-fgm", "--force_gpu_monitoring", action="store_true", help="Forces data to be sent to log analytics WS even if no SLURM job is running on the node")
parser.add_argument("-uc", "--use_crontab", action="store_true", help="This script will be started by the system contab and the time interval between each data collection will be decided by the system crontab (if crontab is selected then the -tis argument will be ignored).")
parser.add_argument("-tis", "--time_interval_seconds", dest="time_interval_seconds", type=int, default=30, help="The time interval in seconds between each data collection (This option cannot be used with the -uc argument)")
args = parser.parse_args()

if args.use_crontab:
use_crontab = True
else:
use_crontab = False
time_interval_seconds = args.time_interval_seconds
dcgm_field_ids = args.dcgm_field_ids
force_gpu_monitoring = args.force_gpu_monitoring
name_log_event = args.name_log_event

return (use_crontab,time_interval_seconds,dcgm_field_ids,force_gpu_monitoring,name_log_event)


def main():
(use_crontab,time_interval_seconds,dcgm_field_ids,force_gpu_monitoring,name_log_event) = parse_args()
(customer_id,shared_key) = read_env_vars()

while True:
(have_jobid, slurm_jobid) = get_slurm_jobid()

if have_jobid or force_gpu_monitoring:
hostname = socket.gethostname()
dcgm_dmon_fields_cmd_l = ['dcgmi', 'dmon', '-e', dcgm_field_ids, '-c', '1']
dcgm_dmon_list_cmd_l = ['dcgmi', 'dmon', '-l']
dcgm_dmon_fields_out = execute_cmd(dcgm_dmon_fields_cmd_l)
dcgm_dmon_list_out = execute_cmd(dcgm_dmon_list_cmd_l)
physicalhostname_val = get_physicalhostname()
data_l = create_data_records(dcgm_dmon_fields_out,hostname,have_jobid,physicalhostname_val,dcgm_dmon_list_out)
print(data_l)
body = json.dumps(data_l)
post_data(customer_id, shared_key, body, name_log_event)

if use_crontab:
break
else:
time.sleep(time_interval_seconds)


if __name__ == "__main__":
main()
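The body of build_signature is collapsed in this diff. For reference, a minimal sketch following the documented Log Analytics HTTP Data Collector API pattern (HMAC-SHA256 over the request metadata, keyed with the base64-decoded shared key) could look like this; the helper actually committed may differ in its details:

```
import base64
import hashlib
import hmac

def build_signature(customer_id, shared_key, date, content_length, method, content_type, resource):
    # Canonical string defined by the HTTP Data Collector API
    string_to_hash = f"{method}\n{content_length}\n{content_type}\nx-ms-date:{date}\n{resource}"
    # The shared key is base64-encoded; decode it before using it as the HMAC key
    decoded_key = base64.b64decode(shared_key)
    encoded_hash = base64.b64encode(
        hmac.new(decoded_key, string_to_hash.encode('utf-8'), digestmod=hashlib.sha256).digest()
    ).decode('utf-8')
    return f"SharedKey {customer_id}:{encoded_hash}"
```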
47 changes: 46 additions & 1 deletion experimental/gpu_monitoring/readme.md
@@ -3,7 +3,7 @@
GPU Monitoring is essential to get insights into how effectively your application is utilizing the GPU(s) and to monitor the health of the GPUs.

Basic GPU Monitoring is demonstrated utilizing Azure Monitor Log Analytics. The following scripts are provided: collect Data Center GPU Manager (DCGM) dmon metrics and send them to your Log Analytics workspace, start/stop GPU Monitoring (using crontab), and generate a load to test the GPU monitoring.
SLURM job ids are collected, so you can monitor for specific jobids. (Assumes exclusive jobs on nodes)
SLURM job IDs are collected, so you can monitor specific job IDs (this assumes jobs run exclusively on their nodes). The physical hostnames of the hosts on which the VMs are running are also recorded. You can use the system crontab to control the time interval for collecting data, or you can run the Python collection script directly and specify the collection time interval (see the -tis argument below).

## Prerequisites

@@ -14,6 +14,35 @@ SLURM job ids are collected, so you can monitor for specific jobids. (Assumes ex
- View/edit the scripts to ensure all paths are correct, the Log Analytics workspace customer_id/shared_key are updated in the scripts (or provided via environment variables, as shown below), the dmon fields to monitor are updated in the scripts, and the crontab interval is selected (default: every minute)
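
If you prefer not to hard-code the workspace credentials, the collector can also read them from environment variables (the variable names come from the script's read_env_vars helper; the placeholder values below are illustrative):

```
export LOG_ANALYTICS_CUSTOMER_ID="<workspace-id>"
export LOG_ANALYTICS_SHARED_KEY="<primary-or-secondary-key>"
```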

## GPU monitoring script options

```
./gpu_data_collector.py -h
usage: gpu_data_collector.py [-h] [-dfi DCGM_FIELD_IDS] [-nle NAME_LOG_EVENT]
[-fgm] [-uc] [-tis TIME_INTERVAL_SECONDS]
optional arguments:
-h, --help show this help message and exit
-dfi DCGM_FIELD_IDS, --dcgm_field_ids DCGM_FIELD_IDS
Select the DCGM field ids you would like to monitor
(if multiple field ids are desired then separate by commas)
[string] (default: 203,252,1004)
-nle NAME_LOG_EVENT, --name_log_event NAME_LOG_EVENT
Select a name for the log events you want to monitor
(default: MyGPUMonitor)
-fgm, --force_gpu_monitoring
Forces data to be sent to log analytics WS even if no
SLURM job is running on the node (default: False)
-uc, --use_crontab This script will be started by the system crontab and
the time interval between each data collection will be
decided by the system crontab (if crontab is selected
then the -tis argument will be ignored). (default:
False)
-tis TIME_INTERVAL_SECONDS, --time_interval_seconds TIME_INTERVAL_SECONDS
The time interval in seconds between each data
collection (This option cannot be used with the -uc
argument) (default: 30 sec)
```

## Usage
>Note: Please edit all scripts as outlined in the prerequisites
@@ -69,6 +98,8 @@ memory_clock MMCLK 101
etc
```
To start the GPU monitor on a list of nodes, use the start script below. The default collection time interval is 30 sec (the -tis argument) and the default DCGM GPU metrics collected are
GPU utilization (203), GPU memory used (252) and Tensor activity (1004). You can change these options; a direct invocation example follows.
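
For example, a direct run on a node with a 60-second interval and fp16_active (1008) added to the default fields might look like this (a hypothetical command line; the flags come from the script's argument parser):

```
./gpu_data_collector.py -dfi 203,252,1004,1008 -tis 60 -nle MyGPUMonitor
```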

Start the GPU monitor
>Note: Remember to edit the hostfile
@@ -80,6 +111,20 @@ Stop the gpu_monitor
```
./stop_gpu_data_collector.sh
```
>Note: The log file for gpu_data_collector.py is located in /tmp/gpu_data_collector.log.
Similarly, scripts are provided that use the system crontab to start the GPU data collector, with the time interval determined by the crontab parameters; in the case of crontab
the smallest timing interval is 60 sec (start_gpu_data_collector_cron.sh and stop_gpu_data_collector_cron.sh), as shown below.
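
For example (script names as provided in this repository; run them from the directory containing your hostlist):

```
./start_gpu_data_collector_cron.sh    # back up /etc/crontab and append the collection entry on each node
./stop_gpu_data_collector_cron.sh     # restore the original /etc/crontab on each node
```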

Go to your Log Analytics workspace to monitor your GPUs and generate dashboards.

A simple Log Analytics query to chart the average GPU utilization for a particular SLURM job would be:

```
MYGPUMonitor_CL
| where gpu_id_d in (0,1,2,3,4,5,6,7) and slurm_jobid_d == 17
| summarize avg(gpu_utilization_d) by bin(TimeGenerated, 5m)
| render timechart
```

![Alt text1](/experimental/gpu_monitoring/images/gpu-dash.png?raw=true "gpu-dash")
7 changes: 4 additions & 3 deletions experimental/gpu_monitoring/start_gpu_data_collector.sh
@@ -1,10 +1,11 @@
#!/bin/bash

HOSTLIST=hostlist
INTERVAL_MINS=1
INTERVAL_SECS=30
DCGM_FIELD_IDS="203,252,1004"
SCRIPT_PATH=~
EXE_PATH="python3 ${SCRIPT_PATH}/gpu_data_collector.py \>\> /tmp/gpu_data_collector.log"
EXE_PATH="${SCRIPT_PATH}/gpu_data_collector.py -tis $INTERVAL_SECS -dfi $DCGM_FIELD_IDS \>\> /tmp/gpu_data_collector.log"
PDSH_RCMD_TYPE=ssh


WCOLL=$HOSTLIST pdsh "if ! [ -f /etc/crontab.orig ]; then sudo cp /etc/crontab /etc/crontab.orig; echo "\*/$INTERVAL_MINS \\* \\* \\* \\* root $EXE_PATH" 2>&1 | sudo tee -a /etc/crontab;fi"
WCOLL=$HOSTLIST pdsh sudo $EXE_PATH
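pdsh reads the target nodes from the file named by HOSTLIST (here, hostlist) via the WCOLL environment variable; it is a plain text file with one hostname per line, for example (hypothetical node names):

```
gpu-vm-000001
gpu-vm-000002
```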
10 changes: 10 additions & 0 deletions experimental/gpu_monitoring/start_gpu_data_collector_cron.sh
@@ -0,0 +1,10 @@
#!/bin/bash

HOSTLIST=hostlist
INTERVAL_MINS=1
SCRIPT_PATH=~
EXE_PATH="${SCRIPT_PATH}/gpu_data_collector.py -uc \>\> /tmp/gpu_data_collector.log"
PDSH_RCMD_TYPE=ssh


WCOLL=$HOSTLIST pdsh "if ! [ -f /etc/crontab.orig ]; then sudo cp /etc/crontab /etc/crontab.orig; echo "\*/$INTERVAL_MINS \\* \\* \\* \\* root $EXE_PATH" 2>&1 | sudo tee -a /etc/crontab;fi"
2 changes: 1 addition & 1 deletion experimental/gpu_monitoring/stop_gpu_data_collector.sh
@@ -4,4 +4,4 @@ HOSTLIST=hostlist
PDSH_RCMD_TYPE=ssh


WCOLL=$HOSTLIST pdsh sudo mv /etc/crontab.orig /etc/crontab
WCOLL=$HOSTLIST pdsh sudo pkill gpu_data_collector
7 changes: 7 additions & 0 deletions experimental/gpu_monitoring/stop_gpu_data_collector_cron.sh
@@ -0,0 +1,7 @@
#!/bin/bash

HOSTLIST=hostlist
PDSH_RCMD_TYPE=ssh


WCOLL=$HOSTLIST pdsh sudo mv /etc/crontab.orig /etc/crontab
