Skip to content

Commit 43c428a

Browse files
OOM error with allocation information.
PiperOrigin-RevId: 175637128
1 parent 2c26c98 commit 43c428a

15 files changed

+213
-5
lines changed

tensorflow/core/common_runtime/direct_session.cc

+2-1
Original file line numberDiff line numberDiff line change
@@ -548,7 +548,8 @@ Status DirectSession::Run(const RunOptions& run_options,
548548
((measure_step_count + 1) % build_cost_model_every == 0);
549549
}
550550
}
551-
if (do_trace || update_cost_model) {
551+
if (do_trace || update_cost_model ||
552+
run_options.report_tensor_allocations_upon_oom()) {
552553
run_state.collector.reset(
553554
new StepStatsCollector(run_metadata->mutable_step_stats()));
554555
args.stats_collector = run_state.collector.get();

tensorflow/core/common_runtime/executor.cc

+15
Original file line numberDiff line numberDiff line change
@@ -1804,6 +1804,21 @@ Status ExecutorState::ProcessOutputs(const NodeItem& item, OpKernelContext* ctx,
18041804
LOG(WARNING) << this << " Compute status: " << s;
18051805
DumpState();
18061806
}
1807+
if (s.code() == error::RESOURCE_EXHAUSTED) {
1808+
if (stats_collector_) {
1809+
string err = stats_collector_->ReportAllocsOnResourceExhausted(
1810+
s.error_message());
1811+
s = Status(s.code(), strings::StrCat(s.error_message(), err));
1812+
} else {
1813+
s = Status(
1814+
s.code(),
1815+
strings::StrCat(
1816+
s.error_message(),
1817+
"\nHint: If you want to see a list of allocated tensors when "
1818+
"OOM happens, add report_tensor_allocations_upon_oom "
1819+
"to RunOptions for current allocation info.\n"));
1820+
}
1821+
}
18071822
return s;
18081823
}
18091824

tensorflow/core/common_runtime/step_stats_collector.cc

+90
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,21 @@ limitations under the License.
2020
#include "tensorflow/core/framework/tracking_allocator.h"
2121
#include "tensorflow/core/graph/costmodel.h"
2222
#include "tensorflow/core/lib/core/stringpiece.h"
23+
#include "tensorflow/core/lib/strings/numbers.h"
2324
#include "tensorflow/core/lib/strings/scanner.h"
2425
#include "tensorflow/core/platform/logging.h"
2526

2627
namespace tensorflow {
28+
namespace {
29+
const int kMaxAllocReportNodes = 100;
30+
const float kMaxAllocReportFraction = 0.99;
31+
32+
struct AllocStats {
33+
std::map<int64, std::vector<string>> nodes_by_size;
34+
int64 total_bytes = 0;
35+
int64 total_nodes = 0;
36+
};
37+
} // namespace
2738

2839
NodeExecStatsWrapper::NodeExecStatsWrapper()
2940
: NodeExecStatsWrapper(new NodeExecStats) {}
@@ -267,6 +278,85 @@ void StepStatsCollector::Save(const string& device,
267278
}
268279
}
269280

281+
string StepStatsCollector::ReportAllocsOnResourceExhausted(const string& err) {
282+
mutex_lock l(mu_);
283+
if (err.find("OOM") == err.npos) {
284+
return "";
285+
}
286+
// <device, allocator> -> AllocStats
287+
std::map<std::pair<string, string>, AllocStats> allocs_map;
288+
string report = "\n";
289+
for (const auto& dev_stat : dev_stats_) {
290+
const string& device = dev_stat.first;
291+
// Only print the device that has OOM.
292+
// TODO(xpan): Extract device from err first to speed it up.
293+
if (err.find(device) == err.npos) {
294+
continue;
295+
}
296+
// NodeExecStatsWrapper*
297+
for (const auto& stats : dev_stat.second) {
298+
// std::pair<AllocatorMemoryUsed*, TrackingAllocator*>
299+
for (const auto& alloc : stats->allocations_) {
300+
// Only print the allocator that has OOM.
301+
// TODO(xpan): Extract device from err first to speed it up.
302+
if (err.find(alloc.first->allocator_name()) == err.npos) {
303+
continue;
304+
}
305+
auto dev_allocator =
306+
std::make_pair(dev_stat.first, alloc.first->allocator_name());
307+
AllocStats& dev_allocs_stats = allocs_map[dev_allocator];
308+
TrackingAllocator* tracking_alloc = alloc.second;
309+
gtl::InlinedVector<AllocRecord, 4> cur_records =
310+
tracking_alloc->GetCurrentRecords();
311+
int64 cur_bytes = 0;
312+
for (const auto& r : cur_records) {
313+
cur_bytes += r.alloc_bytes;
314+
}
315+
if (cur_bytes > 0) {
316+
dev_allocs_stats.total_bytes += cur_bytes;
317+
dev_allocs_stats.total_nodes++;
318+
dev_allocs_stats.nodes_by_size[cur_bytes].push_back(
319+
stats->stats()->node_name());
320+
}
321+
}
322+
}
323+
}
324+
325+
for (const auto& dev_allocs_it : allocs_map) {
326+
const auto& dev = dev_allocs_it.first;
327+
const AllocStats& dev_allocs_stats = dev_allocs_it.second;
328+
int64 reported_bytes = 0;
329+
int64 reported_nodes = 0;
330+
bool done = false;
331+
strings::StrAppend(&report, "\nCurrent usage from device: ", dev.first,
332+
", allocator: ", dev.second, "\n");
333+
// Print allocations stats of the <device, allocator> pair.
334+
for (auto it = dev_allocs_stats.nodes_by_size.rbegin();
335+
it != dev_allocs_stats.nodes_by_size.rend(); ++it) {
336+
for (const string& node_name : it->second) {
337+
reported_bytes += it->first;
338+
strings::StrAppend(&report, " ",
339+
strings::HumanReadableNumBytes(it->first), " from ",
340+
node_name, "\n");
341+
if (++reported_nodes > kMaxAllocReportNodes ||
342+
reported_bytes >=
343+
dev_allocs_stats.total_bytes * kMaxAllocReportFraction) {
344+
done = true;
345+
break;
346+
}
347+
}
348+
if (done) break;
349+
}
350+
int64 remain_nodes = dev_allocs_stats.total_nodes - reported_nodes;
351+
int64 remain_bytes = dev_allocs_stats.total_bytes - reported_bytes;
352+
if (remain_nodes > 0) {
353+
strings::StrAppend(&report, " Remaining ", remain_nodes, " nodes with ",
354+
strings::HumanReadableNumBytes(remain_bytes), "\n");
355+
}
356+
}
357+
return report;
358+
}
359+
270360
void StepStatsCollector::Finalize() {
271361
mutex_lock l(mu_);
272362
FinalizeInternal();

tensorflow/core/common_runtime/step_stats_collector.h

+7
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,13 @@ class StepStatsCollector {
8282
void Save(const string& device, NodeExecStats* nt);
8383
void Save(const string& device, NodeExecStatsWrapper* stats);
8484

85+
// Generates a string reporting the currently used memory based
86+
// on ResourceExhausted OOM `err` message.
87+
// `err` message needs to contain device name and allocator name, E.g.:
88+
// "ResourceExhaustedError: OOM when allocating tensor ...
89+
// on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc"
90+
string ReportAllocsOnResourceExhausted(const string& err);
91+
8592
// The following 2 Finalize methods populate the StepStats passed
8693
// from the constructor. Calling it more than once won't have any effect.
8794
// User shouldn't call Save() methods after Finalize.

tensorflow/core/distributed_runtime/master_session.cc

+7-1
Original file line numberDiff line numberDiff line change
@@ -498,6 +498,9 @@ Status MasterSession::ReffedClientGraph::RunPartitions(
498498

499499
// Collect execution cost stats on a smoothly decreasing frequency.
500500
ExecutorOpts exec_opts;
501+
if (pss->report_tensor_allocations_upon_oom) {
502+
exec_opts.set_report_tensor_allocations_upon_oom(true);
503+
}
501504
if (pss->collect_costs) {
502505
exec_opts.set_record_costs(true);
503506
}
@@ -1368,6 +1371,8 @@ Status MasterSession::DoPartialRun(CallOptions* opts,
13681371
const auto count = run_state->count;
13691372
pss.collect_timeline =
13701373
req.options().trace_level() == RunOptions::FULL_TRACE;
1374+
pss.report_tensor_allocations_upon_oom =
1375+
req.options().report_tensor_allocations_upon_oom();
13711376

13721377
// Build the cost model every 'build_cost_model_every' steps after skipping
13731378
// an
@@ -1528,7 +1533,8 @@ Status MasterSession::DoRunWithLocalExecution(
15281533
TRACEPRINTF("stepid %llu", step_id);
15291534

15301535
pss.collect_timeline = req.options().trace_level() == RunOptions::FULL_TRACE;
1531-
1536+
pss.report_tensor_allocations_upon_oom =
1537+
req.options().report_tensor_allocations_upon_oom();
15321538
// Build the cost model every 'build_cost_model_every' steps after skipping an
15331539
// initial 'build_cost_model_after' steps.
15341540
const int64 build_cost_model_after =

tensorflow/core/distributed_runtime/master_session.h

+1
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,7 @@ class MasterSession : public core::RefCounted {
146146
bool collect_timeline = false;
147147
bool collect_rpcs = false;
148148
bool collect_partition_graphs = false;
149+
bool report_tensor_allocations_upon_oom = false;
149150
Microseconds start_micros = Microseconds(0);
150151
Microseconds end_micros = Microseconds(0);
151152
std::vector<StepStats> step_stats; // per partition

tensorflow/core/distributed_runtime/worker.cc

+2-1
Original file line numberDiff line numberDiff line change
@@ -132,7 +132,8 @@ void Worker::DoRunGraph(CallOptions* opts, RunGraphRequestWrapper* request,
132132
return;
133133
}
134134
StepStatsCollector* collector = nullptr;
135-
if (request->exec_opts().record_timeline() ||
135+
if (request->exec_opts().report_tensor_allocations_upon_oom() ||
136+
request->exec_opts().record_timeline() ||
136137
request->exec_opts().record_costs()) {
137138
collector = new StepStatsCollector(response->mutable_step_stats());
138139
// TODO(mrry,pbar): GPU tracing for distributed steps.

tensorflow/core/framework/op_kernel.cc

+4-2
Original file line numberDiff line numberDiff line change
@@ -622,8 +622,10 @@ Status OpKernelContext::allocate_tensor(
622622
Tensor new_tensor(a, type, shape, logged_attr);
623623

624624
if (!new_tensor.IsInitialized()) {
625-
return errors::ResourceExhausted("OOM when allocating tensor with shape",
626-
shape.DebugString());
625+
return errors::ResourceExhausted(
626+
"OOM when allocating tensor with shape", shape.DebugString(),
627+
" and type ", DataTypeString(type), " on ", params_->device->name(),
628+
" by allocator ", a->Name());
627629
}
628630
if (params_->log_memory) {
629631
LogMemory::RecordTensorAllocation(params_->op_kernel->name(),

tensorflow/core/framework/tracking_allocator.cc

+11
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,17 @@ gtl::InlinedVector<AllocRecord, 4> TrackingAllocator::GetRecordsAndUnRef() {
183183
return allocations;
184184
}
185185

186+
gtl::InlinedVector<AllocRecord, 4> TrackingAllocator::GetCurrentRecords() {
187+
gtl::InlinedVector<AllocRecord, 4> allocations;
188+
{
189+
mutex_lock lock(mu_);
190+
for (const AllocRecord& alloc : allocations_) {
191+
allocations.push_back(alloc);
192+
}
193+
}
194+
return allocations;
195+
}
196+
186197
bool TrackingAllocator::UnRef() {
187198
CHECK_GE(ref_, 1);
188199
--ref_;

tensorflow/core/framework/tracking_allocator.h

+2
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,8 @@ class TrackingAllocator : public Allocator {
8585
// deallocated. After this call completes and all allocated pointers
8686
// have been deallocated the wrapper will delete itself.
8787
gtl::InlinedVector<AllocRecord, 4> GetRecordsAndUnRef();
88+
// Returns a copy of allocation records collected so far.
89+
gtl::InlinedVector<AllocRecord, 4> GetCurrentRecords();
8890

8991
protected:
9092
~TrackingAllocator() override {}

tensorflow/core/protobuf/config.proto

+7
Original file line numberDiff line numberDiff line change
@@ -331,6 +331,13 @@ message RunOptions {
331331
// EXPERIMENTAL. Options used to initialize DebuggerState, if enabled.
332332
DebugOptions debug_options = 6;
333333

334+
// When enabled, causes tensor allocation information to be included in
335+
// the error message when the Run() call fails because the allocator ran
336+
// out of memory (OOM).
337+
//
338+
// Enabling this option can slow down the Run() call.
339+
bool report_tensor_allocations_upon_oom = 7;
340+
334341
reserved 4;
335342
}
336343

tensorflow/core/protobuf/worker.proto

+1
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,7 @@ message ExecutorOpts {
169169
bool record_costs = 1;
170170
bool record_timeline = 3;
171171
bool record_partition_graphs = 4;
172+
bool report_tensor_allocations_upon_oom = 5;
172173
};
173174

174175
message RunGraphRequest {

tensorflow/python/profiler/BUILD

+1
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ cuda_py_test(
5353
"//tensorflow/python:client",
5454
"//tensorflow/python:client_testlib",
5555
"//tensorflow/python:framework_for_generated_wrappers",
56+
"//tensorflow/python:distributed_framework_test_lib",
5657
"//tensorflow/python:platform",
5758
"//tensorflow/python:variables",
5859
],

tensorflow/python/profiler/model_analyzer_test.py

+59
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@
2828
from tensorflow.core.protobuf import rewriter_config_pb2
2929
from tensorflow.python.client import session
3030
from tensorflow.python.framework import ops
31+
from tensorflow.python.framework import test_util
32+
from tensorflow.python.ops import random_ops
3133
from tensorflow.python.ops import variables
3234
from tensorflow.python.platform import gfile
3335
from tensorflow.python.platform import test
@@ -635,6 +637,63 @@ def testAutoProfiling(self):
635637
self._trainLoop(x, 10, time_dir, time_steps,
636638
memory_dir, memory_steps, profile_dir, dump_steps)
637639

640+
def testOOM(self):
641+
if not test.is_gpu_available():
642+
return
643+
ops.reset_default_graph()
644+
with ops.device('/device:GPU:0'):
645+
a = random_ops.random_normal([1, 10000, 20000], name='test_random1')
646+
b = random_ops.random_normal([30000, 10000, 1], name='test_random2')
647+
c = a * b
648+
649+
try:
650+
with session.Session() as sess:
651+
sess.run(c, options=config_pb2.RunOptions(
652+
report_tensor_allocations_upon_oom=True))
653+
except Exception as e: # pylint: disable=broad-except
654+
exception_str = '%s' % e
655+
# This trace reports allocations for the two random tensors.
656+
self.assertTrue(
657+
'OOM when allocating tensor with shape[30000,10000,20000]' in
658+
exception_str)
659+
mat = re.search('(.*)GiB from test_random2/RandomStandardNormal',
660+
exception_str)
661+
self.assertGreater(float(mat.group(1)), 0.0)
662+
mat = re.search('(.*)MiB from test_random1/RandomStandardNormal',
663+
exception_str)
664+
self.assertGreater(float(mat.group(1)), 0.0)
665+
666+
def testDistributedOOM(self):
667+
if not test.is_gpu_available():
668+
return
669+
ops.reset_default_graph()
670+
671+
workers, _ = test_util.create_local_cluster(2, 0)
672+
673+
with ops.device('/job:worker/replica:0/task:0/gpu:0'):
674+
a = random_ops.random_normal([1, 10000, 20000], name='test_random1')
675+
with ops.device('/job:worker/replica:0/task:1/gpu:0'):
676+
b = random_ops.random_normal([30000, 10000, 1], name='test_random2')
677+
c = a * b
678+
679+
try:
680+
with session.Session(workers[1].target) as sess:
681+
sess.run(c, options=config_pb2.RunOptions(
682+
report_tensor_allocations_upon_oom=True))
683+
except Exception as e: # pylint: disable=broad-except
684+
exception_str = '%s' % e
685+
# test_random2 is reported because it's allocated in worker 1.
686+
self.assertTrue('Current usage from device: '
687+
'/job:worker/replica:0/task:1/device:GPU:0, '
688+
'allocator: GPU_0_bfc' in exception_str)
689+
mat = re.search('(.*)GiB from test_random2/RandomStandardNormal',
690+
exception_str)
691+
self.assertGreater(float(mat.group(1)), 0.0)
692+
# test_random1 is not reported because it's allocated in worker 0.
693+
mat = re.search('(.*)MiB from test_random1/RandomStandardNormal',
694+
exception_str)
695+
self.assertTrue(mat is None)
696+
638697

639698
if __name__ == '__main__':
640699
test.main()

tensorflow/tools/api/golden/tensorflow.-run-options.pbtxt

+4
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,10 @@ tf_class {
3434
name: "OUTPUT_PARTITION_GRAPHS_FIELD_NUMBER"
3535
mtype: "<type \'int\'>"
3636
}
37+
member {
38+
name: "REPORT_TENSOR_ALLOCATIONS_UPON_OOM_FIELD_NUMBER"
39+
mtype: "<type \'int\'>"
40+
}
3741
member {
3842
name: "SOFTWARE_TRACE"
3943
mtype: "<type \'int\'>"

0 commit comments

Comments
 (0)