[Profiler] Add API for Dynamic Activity Toggling [2/n] (#133035)
Summary: During PT2 there are many GPU/CPU events that are unnecessary to profile between steps. To remedy this, we can add an API that takes in a list of activities and a flag indicating whether to toggle those activities on or off. For this diff we are adding the profiler API and propagating it down to Kineto (and, in the future, the collection.cpp logic). Subsequent diffs will add CPU toggling and end-to-end testing.

Test Plan: Tested by toggling backward gpu traces off and got following trace: https://www.internalfb.com/intern/perfdoctor/trace_view?filepath=tree/traces/dynocli/devvm2185.cco0.facebook.com/rank-0.Jul_31_13_40_55.3251726.pt.trace.json.gz&bucket=gpu_traces
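For illustration, a minimal sketch of the intended usage of the new API (the module and input below are placeholders, not part of this diff):

import torch
from torch.profiler import profile, ProfilerActivity

model = torch.nn.Linear(64, 64).cuda()  # placeholder module
inp = torch.randn(8, 64, device="cuda")

with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
    out = model(inp)
    # Stop collecting CUDA events for the backward pass...
    prof.toggle_collection_dynamic(False, [ProfilerActivity.CUDA])
    out.sum().backward()
    # ...then resume CUDA collection for subsequent work.
    prof.toggle_collection_dynamic(True, [ProfilerActivity.CUDA])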

Reviewed By: aaronenyeshi

Differential Revision: D60541767

Pull Request resolved: pytorch/pytorch#133035
Approved by: https://github.com/aaronenyeshi
sraikund16 authored and pytorchmergebot committed Aug 9, 2024
1 parent b0b4723 commit d2ecdcb
Showing 11 changed files with 72 additions and 2 deletions.
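For orientation, the new API threads through the stack as follows (all within this diff): torch.profiler.profile.toggle_collection_dynamic (torch/profiler/profiler.py) calls torch.autograd.profiler.profile.toggle_collection_dynamic (torch/autograd/profiler.py), which calls the torch._C._autograd._toggle_collection_dynamic binding (torch/csrc/autograd/init.cpp), which dispatches to toggleCollectionDynamic (torch/csrc/autograd/profiler_kineto.cpp), then to the Kineto shim (torch/csrc/profiler/kineto_shim.cpp), and finally to libkineto::api().activityProfiler().toggleCollectionDynamic.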
4 changes: 4 additions & 0 deletions torch/_C/_autograd.pyi
@@ -94,6 +94,10 @@ def _prepare_profiler(
config: ProfilerConfig,
activities: set[ProfilerActivity],
) -> None: ...
def _toggle_collection_dynamic(
enable: bool,
activities: set[ProfilerActivity],
) -> None: ...
def _disable_profiler() -> _ProfilerResult: ...
def _profiler_enabled() -> bool: ...
def _add_metadata_json(key: str, value: str) -> None: ...
1 change: 1 addition & 0 deletions torch/autograd/__init__.py
@@ -571,6 +571,7 @@ def variable(*args, **kwargs): # noqa: D103
_record_function_with_args_exit,
_set_empty_test_observer,
_supported_activities,
_toggle_collection_dynamic,
DeviceType,
kineto_available,
ProfilerEvent,
11 changes: 10 additions & 1 deletion torch/autograd/profiler.py
@@ -2,7 +2,7 @@
from collections import defaultdict
from dataclasses import dataclass
from time import perf_counter_ns
from typing import Any, Dict, List, Optional
from typing import Any, Dict, Iterable, List, Optional
from warnings import warn

import torch
@@ -16,6 +16,7 @@
_prepare_profiler,
_ProfilerResult,
_supported_activities,
_toggle_collection_dynamic,
DeviceType,
kineto_available,
ProfilerActivity,
@@ -440,6 +441,14 @@ def export_stacks(self, path: str, metric: str = "self_cpu_time_total"):
assert self.with_stack, "export_stacks() requires with_stack=True"
return self.function_events.export_stacks(path, metric)

def toggle_collection_dynamic(
self, enabled: bool, activities: Iterable[ProfilerActivity]
):
"""
Toggles the collection of activities for the current profiler instance.
"""
return _toggle_collection_dynamic(enabled, set(activities))

def key_averages(self, group_by_input_shape=False, group_by_stack_n=0):
self._check_finish()
assert self.function_events is not None, "Expected profiling results"
4 changes: 4 additions & 0 deletions torch/csrc/autograd/init.cpp
@@ -311,6 +311,10 @@ PyObject* THPAutograd_initExtension(PyObject* _unused, PyObject* unused) {
"_prepare_profiler",
prepareProfiler,
py::call_guard<py::gil_scoped_release>());
m.def(
"_toggle_collection_dynamic",
toggleCollectionDynamic,
py::call_guard<py::gil_scoped_release>());
m.def("_add_metadata_json", addMetadataJson); // Only if `USE_KINETO` is set
m.def("_kineto_step", profilerStep); // Only if `USE_KINETO` is set
m.def("kineto_available", []() { return torch::profiler::kKinetoAvailable; });
16 changes: 16 additions & 0 deletions torch/csrc/autograd/profiler_kineto.cpp
@@ -610,6 +610,22 @@ void prepareProfiler(
}
}

void toggleCollectionDynamic(
const bool enable,
const std::set<torch::profiler::impl::ActivityType>& activities) {
// TODO: CPU toggling should be done in this file to interface with collection,
// similar to the enableProfiler call; GPU toggling is called in impl::kineto as is.
for (auto act : activities) {
if (act != torch::autograd::profiler::ActivityType::CUDA) {
LOG(WARNING)
<< "Dynamic toggle is only supported for GPU activity, skipping toggling of "
<< actToString(act);
continue;
}
torch::profiler::impl::kineto::toggleCollectionDynamic(enable);
}
}

void enableProfilerWithEventPostProcess(
const torch::profiler::impl::ProfilerConfig& config,
const std::set<torch::profiler::impl::ActivityType>& activities,
4 changes: 4 additions & 0 deletions torch/csrc/autograd/profiler_kineto.h
@@ -180,6 +180,10 @@ TORCH_API void prepareProfiler(
const torch::profiler::impl::ProfilerConfig& config,
const std::set<torch::profiler::impl::ActivityType>& activities);

TORCH_API void toggleCollectionDynamic(
const bool enable,
const std::set<torch::profiler::impl::ActivityType>& activities);

/**
* When a C++ thread really has no control over how the profiler was enabled,
* for example, by some unreachable Python code, it can call these functions
10 changes: 10 additions & 0 deletions torch/csrc/profiler/kineto_shim.cpp
@@ -275,6 +275,16 @@ void prepareTrace(
#endif // USE_KINETO
}

void toggleCollectionDynamic(const bool enable) {
#ifdef USE_KINETO
// TODO: We may want to consider adding another input arg for this function
// if we want to support turning off certain devices and keeping others on.
// For now, we keep it simple and have it turn off all tracing of "CUDA"
// devices.
libkineto::api().activityProfiler().toggleCollectionDynamic(enable);
#endif // USE_KINETO
}

void startTrace() {
#ifdef USE_KINETO
libkineto::api().activityProfiler().startTrace();
2 changes: 2 additions & 0 deletions torch/csrc/profiler/kineto_shim.h
@@ -117,6 +117,8 @@ void prepareTrace(
const bool cpuOnly,
const ActivitySet& activities,
const torch::profiler::impl::ExperimentalConfig& config);

void toggleCollectionDynamic(const bool enable);
void startTrace();
ActivityTraceWrapper stopTrace();
void pushCorrelationId(uint64_t correlation_id);
6 changes: 6 additions & 0 deletions torch/csrc/profiler/orchestration/observer.h
@@ -21,6 +21,12 @@ enum class C10_API_ENUM ActivityType {
NUM_KINETO_ACTIVITIES, // must be the last one
};

inline std::string actToString(ActivityType t) {
const std::string ActivityTypeNames[] = {
"CPU", "XPU", "CUDA", "MTIA", "PrivateUse1"};
return ActivityTypeNames[static_cast<int>(t)];
}

enum class C10_API_ENUM ProfilerState {
Disabled = 0,
CPU, // CPU-only profiling
14 changes: 14 additions & 0 deletions torch/profiler/profiler.py
@@ -239,6 +239,20 @@ def export_stacks(self, path: str, metric: str = "self_cpu_time_total"):
assert self.profiler
return self.profiler.export_stacks(path, metric)

def toggle_collection_dynamic(
self, enable: bool, activities: Iterable[ProfilerActivity]
):
"""Toggle collection of activities on/off
Args:
activities (iterable): list of activity groups (CPU, CUDA) to use in profiling, supported values:
``torch.profiler.ProfilerActivity.CPU``, ``torch.profiler.ProfilerActivity.CUDA``,
``torch.profiler.ProfilerActivity.XPU``.
"""
if not self.profiler:
return
self.profiler.toggle_collection_dynamic(enable, activities)

def key_averages(
self, group_by_input_shape: bool = False, group_by_stack_n: int = 0
):
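Note: as implemented in profiler_kineto.cpp above, only CUDA toggling currently takes effect; other activities are skipped with a warning. A sketch of that behavior:

prof.toggle_collection_dynamic(False, [torch.profiler.ProfilerActivity.CPU])
# logs: Dynamic toggle is only supported for GPU activity, skipping toggling of CPU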
