Skip to content

Commit 43c428a

Browse files
OOM error with allocation information.
PiperOrigin-RevId: 175637128
1 parent 2c26c98 commit 43c428a

15 files changed

+213
-5
lines changed

tensorflow/core/common_runtime/direct_session.cc

+2-1
Original file line numberDiff line numberDiff line change
@@ -548,7 +548,8 @@ Status DirectSession::Run(const RunOptions& run_options,
548548
((measure_step_count + 1) % build_cost_model_every == 0);
549549
}
550550
}
551-
if (do_trace || update_cost_model) {
551+
if (do_trace || update_cost_model ||
552+
run_options.report_tensor_allocations_upon_oom()) {
552553
run_state.collector.reset(
553554
new StepStatsCollector(run_metadata->mutable_step_stats()));
554555
args.stats_collector = run_state.collector.get();

tensorflow/core/common_runtime/executor.cc

+15
Original file line numberDiff line numberDiff line change
@@ -1804,6 +1804,21 @@ Status ExecutorState::ProcessOutputs(const NodeItem& item, OpKernelContext* ctx,
18041804
LOG(WARNING) << this << " Compute status: " << s;
18051805
DumpState();
18061806
}
1807+
if (s.code() == error::RESOURCE_EXHAUSTED) {
1808+
if (stats_collector_) {
1809+
string err = stats_collector_->ReportAllocsOnResourceExhausted(
1810+
s.error_message());
1811+
s = Status(s.code(), strings::StrCat(s.error_message(), err));
1812+
} else {
1813+
s = Status(
1814+
s.code(),
1815+
strings::StrCat(
1816+
s.error_message(),
1817+
"\nHint: If you want to see a list of allocated tensors when "
1818+
"OOM happens, add report_tensor_allocations_upon_oom "
1819+
"to RunOptions for current allocation info.\n"));
1820+
}
1821+
}
18071822
return s;
18081823
}
18091824

tensorflow/core/common_runtime/step_stats_collector.cc

+90
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,21 @@ limitations under the License.
2020
#include "tensorflow/core/framework/tracking_allocator.h"
2121
#include "tensorflow/core/graph/costmodel.h"
2222
#include "tensorflow/core/lib/core/stringpiece.h"
23+
#include "tensorflow/core/lib/strings/numbers.h"
2324
#include "tensorflow/core/lib/strings/scanner.h"
2425
#include "tensorflow/core/platform/logging.h"
2526

2627
namespace tensorflow {
28+
namespace {
29+
const int kMaxAllocReportNodes = 100;
30+
const float kMaxAllocReportFraction = 0.99;
31+
32+
struct AllocStats {
33+
std::map<int64, std::vector<string>> nodes_by_size;
34+
int64 total_bytes = 0;
35+
int64 total_nodes = 0;
36+
};
37+
} // namespace
2738

2839
NodeExecStatsWrapper::NodeExecStatsWrapper()
2940
: NodeExecStatsWrapper(new NodeExecStats) {}
@@ -267,6 +278,85 @@ void StepStatsCollector::Save(const string& device,
267278
}
268279
}
269280

281+
string StepStatsCollector::ReportAllocsOnResourceExhausted(const string& err) {
282+
mutex_lock l(mu_);
283+
if (err.find("OOM") == err.npos) {
284+
return "";
285+
}
286+
// <device, allocator> -> AllocStats
287+
std::map<std::pair<string, string>, AllocStats> allocs_map;
288+
string report = "\n";
289+
for (const auto& dev_stat : dev_stats_) {
290+
const string& device = dev_stat.first;
291+
// Only print the device that has OOM.
292+
// TODO(xpan): Extract device from err first to speed it up.
293+
if (err.find(device) == err.npos) {
294+
continue;
295+
}
296+
// NodeExecStatsWrapper*
297+
for (const auto& stats : dev_stat.second) {
298+
// std::pair<AllocatorMemoryUsed*, TrackingAllocator*>
299+
for (const auto& alloc : stats->allocations_) {
300+
// Only print the allocator that has OOM.
301+
// TODO(xpan): Extract device from err first to speed it up.
302+
if (err.find(alloc.first->allocator_name()) == err.npos) {
303+
continue;
304+
}
305+
auto dev_allocator =
306+
std::make_pair(dev_stat.first, alloc.first->allocator_name());
307+
AllocStats& dev_allocs_stats = allocs_map[dev_allocator];
308+
TrackingAllocator* tracking_alloc = alloc.second;
309+
gtl::InlinedVector<AllocRecord, 4> cur_records =
310+
tracking_alloc->GetCurrentRecords();
311+
int64 cur_bytes = 0;
312+
for (const auto& r : cur_records) {
313+
cur_bytes += r.alloc_bytes;
314+
}
315+
if (cur_bytes > 0) {
316+
dev_allocs_stats.total_bytes += cur_bytes;
317+
dev_allocs_stats.total_nodes++;
318+
dev_allocs_stats.nodes_by_size[cur_bytes].push_back(
319+
stats->stats()->node_name());
320+
}
321+
}
322+
}
323+
}
324+
325+
for (const auto& dev_allocs_it : allocs_map) {
326+
const auto& dev = dev_allocs_it.first;
327+
const AllocStats& dev_allocs_stats = dev_allocs_it.second;
328+
int64 reported_bytes = 0;
329+
int64 reported_nodes = 0;
330+
bool done = false;
331+
strings::StrAppend(&report, "\nCurrent usage from device: ", dev.first,
332+
", allocator: ", dev.second, "\n");
333+
// Print allocations stats of the <device, allocator> pair.
334+
for (auto it = dev_allocs_stats.nodes_by_size.rbegin();
335+
it != dev_allocs_stats.nodes_by_size.rend(); ++it) {
336+
for (const string& node_name : it->second) {
337+
reported_bytes += it->first;
338+
strings::StrAppend(&report, " ",
339+
strings::HumanReadableNumBytes(it->first), " from ",
340+
node_name, "\n");
341+
if (++reported_nodes > kMaxAllocReportNodes ||
342+
reported_bytes >=
343+
dev_allocs_stats.total_bytes * kMaxAllocReportFraction) {
344+
done = true;
345+
break;
346+
}
347+
}
348+
if (done) break;
349+
}
350+
int64 remain_nodes = dev_allocs_stats.total_nodes - reported_nodes;
351+
int64 remain_bytes = dev_allocs_stats.total_bytes - reported_bytes;
352+
if (remain_nodes > 0) {
353+
strings::StrAppend(&report, " Remaining ", remain_nodes, " nodes with ",
354+
strings::HumanReadableNumBytes(remain_bytes), "\n");
355+
}
356+
}
357+
return report;
358+
}
359+
270360
void StepStatsCollector::Finalize() {
271361
mutex_lock l(mu_);
272362
FinalizeInternal();

tensorflow/core/common_runtime/step_stats_collector.h

+7
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,13 @@ class StepStatsCollector {
8282
void Save(const string& device, NodeExecStats* nt);
8383
void Save(const string& device, NodeExecStatsWrapper* stats);
8484

85+
// Generates a string reporting the currently used memory based
86+
// on ResourceExhausted OOM `err` message.
87+
// `err` message needs to contain device name and allocator name, E.g.:
88+
// "ResourceExhaustedError: OOM when allocating tensor ...
89+
// on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc"
90+
string ReportAllocsOnResourceExhausted(const string& err);
91+
8592
// The following 2 Finalize methods populate the StepStats passed
8693
// from the constructor. Calling it more than once won't have any effect.
8794
// User shouldn't call Save() methods after Finalize.

tensorflow/core/distributed_runtime/master_session.cc

+7-1
Original file line numberDiff line numberDiff line change
@@ -498,6 +498,9 @@ Status MasterSession::ReffedClientGraph::RunPartitions(
498498

499499
// Collect execution cost stats on a smoothly decreasing frequency.
500500
ExecutorOpts exec_opts;
501+
if (pss->report_tensor_allocations_upon_oom) {
502+
exec_opts.set_report_tensor_allocations_upon_oom(true);
503+
}
501504
if (pss->collect_costs) {
502505
exec_opts.set_record_costs(true);
503506
}
@@ -1368,6 +1371,8 @@ Status MasterSession::DoPartialRun(CallOptions* opts,
13681371
const auto count = run_state->count;
13691372
pss.collect_timeline =
13701373
req.options().trace_level() == RunOptions::FULL_TRACE;
1374+
pss.report_tensor_allocations_upon_oom =
1375+
req.options().report_tensor_allocations_upon_oom();
13711376

13721377
// Build the cost model every 'build_cost_model_every' steps after skipping
13731378
// an
@@ -1528,7 +1533,8 @@ Status MasterSession::DoRunWithLocalExecution(
15281533
TRACEPRINTF("stepid %llu", step_id);
15291534

15301535
pss.collect_timeline = req.options().trace_level() == RunOptions::FULL_TRACE;
1531-
1536+
pss.report_tensor_allocations_upon_oom =
1537+
req.options().report_tensor_allocations_upon_oom();
15321538
// Build the cost model every 'build_cost_model_every' steps after skipping an
15331539
// initial 'build_cost_model_after' steps.
15341540
const int64 build_cost_model_after =

tensorflow/core/distributed_runtime/master_session.h

+1
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,7 @@ class MasterSession : public core::RefCounted {
146146
bool collect_timeline = false;
147147
bool collect_rpcs = false;
148148
bool collect_partition_graphs = false;
149+
bool report_tensor_allocations_upon_oom = false;
149150
Microseconds start_micros = Microseconds(0);
150151
Microseconds end_micros = Microseconds(0);
151152
std::vector<StepStats> step_stats; // per partition

tensorflow/core/distributed_runtime/worker.cc

+2-1
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,8 @@ void Worker::DoRunGraph(CallOptions* opts, RunGraphRequestWrapper* request,
132132
return;
133133
}
134134
StepStatsCollector* collector = nullptr;
135-
if (request->exec_opts().record_timeline() ||
135+
if (request->exec_opts().report_tensor_allocations_upon_oom() ||
136+
request->exec_opts().record_timeline() ||
136137
request->exec_opts().record_costs()) {
137138
collector = new StepStatsCollector(response->mutable_step_stats());
138139
// TODO(mrry,pbar): GPU tracing for distributed steps.

tensorflow/core/framework/op_kernel.cc

+4-2
Original file line numberDiff line numberDiff line change
@@ -622,8 +622,10 @@ Status OpKernelContext::allocate_tensor(
622622
Tensor new_tensor(a, type, shape, logged_attr);
623623

624624
if (!new_tensor.IsInitialized()) {
625-
return errors::ResourceExhausted("OOM when allocating tensor with shape",
626-
shape.DebugString());
625+
return errors::ResourceExhausted(
626+
"OOM when allocating tensor with shape", shape.DebugString(),
627+
" and type ", DataTypeString(type), " on ", params_->device->name(),
628+
" by allocator ", a->Name());
627629
}
628630
if (params_->log_memory) {
629631
LogMemory::RecordTensorAllocation(params_->op_kernel->name(),

tensorflow/core/framework/tracking_allocator.cc

+11
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,17 @@ gtl::InlinedVector<AllocRecord, 4> TrackingAllocator::GetRecordsAndUnRef() {
183183
return allocations;
184184
}
185185

186+
gtl::InlinedVector<AllocRecord, 4> TrackingAllocator::GetCurrentRecords() {
187+
gtl::InlinedVector<AllocRecord, 4> allocations;
188+
{
189+
mutex_lock lock(mu_);
190+
for (const AllocRecord& alloc : allocations_) {
191+
allocations.push_back(alloc);
192+
}
193+
}
194+
return allocations;
195+
}
196+
186197
bool TrackingAllocator::UnRef() {
187198
CHECK_GE(ref_, 1);
188199
--ref_;

tensorflow/core/framework/tracking_allocator.h

+2
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,8 @@ class TrackingAllocator : public Allocator {
8585
// deallocated. After this call completes and all allocated pointers
8686
// have been deallocated the wrapper will delete itself.
8787
gtl::InlinedVector<AllocRecord, 4> GetRecordsAndUnRef();
88+
// Returns a copy of allocation records collected so far.
89+
gtl::InlinedVector<AllocRecord, 4> GetCurrentRecords();
8890

8991
protected:
9092
~TrackingAllocator() override {}

tensorflow/core/protobuf/config.proto

+7
Original file line numberDiff line numberDiff line change
@@ -331,6 +331,13 @@ message RunOptions {
331331
// EXPERIMENTAL. Options used to initialize DebuggerState, if enabled.
332332
DebugOptions debug_options = 6;
333333

334+
// When enabled, causes tensor allocation information to be included in
335+
// the error message when the Run() call fails because the allocator ran
336+
// out of memory (OOM).
337+
//
338+
// Enabling this option can slow down the Run() call.
339+
bool report_tensor_allocations_upon_oom = 7;
340+
334341
reserved 4;
335342
}
336343

tensorflow/core/protobuf/worker.proto

+1
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,7 @@ message ExecutorOpts {
169169
bool record_costs = 1;
170170
bool record_timeline = 3;
171171
bool record_partition_graphs = 4;
172+
bool report_tensor_allocations_upon_oom = 5;
172173
};
173174

174175
message RunGraphRequest {

tensorflow/python/profiler/BUILD

+1
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ cuda_py_test(
5353
"//tensorflow/python:client",
5454
"//tensorflow/python:client_testlib",
5555
"//tensorflow/python:framework_for_generated_wrappers",
56+
"//tensorflow/python:distributed_framework_test_lib",
5657
"//tensorflow/python:platform",
5758
"//tensorflow/python:variables",
5859
],

tensorflow/python/profiler/model_analyzer_test.py

+59
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@
2828
from tensorflow.core.protobuf import rewriter_config_pb2
2929
from tensorflow.python.client import session
3030
from tensorflow.python.framework import ops
31+
from tensorflow.python.framework import test_util
32+
from tensorflow.python.ops import random_ops
3133
from tensorflow.python.ops import variables
3234
from tensorflow.python.platform import gfile
3335
from tensorflow.python.platform import test
@@ -635,6 +637,63 @@ def testAutoProfiling(self):
635637
self._trainLoop(x, 10, time_dir, time_steps,
636638
memory_dir, memory_steps, profile_dir, dump_steps)
637639

640+
def testOOM(self):
641+
if not test.is_gpu_available():
642+
return
643+
ops.reset_default_graph()
644+
with ops.device('/device:GPU:0'):
645+
a = random_ops.random_normal([1, 10000, 20000], name='test_random1')
646+
b = random_ops.random_normal([30000, 10000, 1], name='test_random2')
647+
c = a * b
648+
649+
try:
650+
with session.Session() as sess:
651+
sess.run(c, options=config_pb2.RunOptions(
652+
report_tensor_allocations_upon_oom=True))
653+
except Exception as e: # pylint: disable=broad-except
654+
exception_str = '%s' % e
655+
# This trace reports allocations for the two random tensors.
656+
self.assertTrue(
657+
'OOM when allocating tensor with shape[30000,10000,20000]' in
658+
exception_str)
659+
mat = re.search('(.*)GiB from test_random2/RandomStandardNormal',
660+
exception_str)
661+
self.assertGreater(float(mat.group(1)), 0.0)
662+
mat = re.search('(.*)MiB from test_random1/RandomStandardNormal',
663+
exception_str)
664+
self.assertGreater(float(mat.group(1)), 0.0)
665+
666+
def testDistributedOOM(self):
667+
if not test.is_gpu_available():
668+
return
669+
ops.reset_default_graph()
670+
671+
workers, _ = test_util.create_local_cluster(2, 0)
672+
673+
with ops.device('/job:worker/replica:0/task:0/gpu:0'):
674+
a = random_ops.random_normal([1, 10000, 20000], name='test_random1')
675+
with ops.device('/job:worker/replica:0/task:1/gpu:0'):
676+
b = random_ops.random_normal([30000, 10000, 1], name='test_random2')
677+
c = a * b
678+
679+
try:
680+
with session.Session(workers[1].target) as sess:
681+
sess.run(c, options=config_pb2.RunOptions(
682+
report_tensor_allocations_upon_oom=True))
683+
except Exception as e: # pylint: disable=broad-except
684+
exception_str = '%s' % e
685+
# test_random2 is reported because it's allocated in worker 1.
686+
self.assertTrue('Current usage from device: '
687+
'/job:worker/replica:0/task:1/device:GPU:0, '
688+
'allocator: GPU_0_bfc' in exception_str)
689+
mat = re.search('(.*)GiB from test_random2/RandomStandardNormal',
690+
exception_str)
691+
self.assertGreater(float(mat.group(1)), 0.0)
692+
# test_random1 is not reported because it's allocated in worker 0.
693+
mat = re.search('(.*)MiB from test_random1/RandomStandardNormal',
694+
exception_str)
695+
self.assertTrue(mat is None)
696+
638697

639698
if __name__ == '__main__':
640699
test.main()

tensorflow/tools/api/golden/tensorflow.-run-options.pbtxt

+4
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,10 @@ tf_class {
3434
name: "OUTPUT_PARTITION_GRAPHS_FIELD_NUMBER"
3535
mtype: "<type \'int\'>"
3636
}
37+
member {
38+
name: "REPORT_TENSOR_ALLOCATIONS_UPON_OOM_FIELD_NUMBER"
39+
mtype: "<type \'int\'>"
40+
}
3741
member {
3842
name: "SOFTWARE_TRACE"
3943
mtype: "<type \'int\'>"

0 commit comments

Comments
 (0)