#include <c10/core/impl/alloc_cpu.h>
#include <c10/mobile/CPUProfilingAllocator.h>
#include <c10/util/irange.h>

#include <algorithm>
#include <limits>
#include <map>
#include <set>
namespace c10 {
namespace {
thread_local AllocationPlanner* allocation_planner{nullptr};
thread_local CPUProfilingAllocator* profiling_allocator{nullptr};
struct MemBlock {
uint64_t start_offset, end_offset;
MemBlock(uint64_t s, uint64_t e) : start_offset(s), end_offset(e) {}
bool operator<(const MemBlock& other) const {
return start_offset < other.start_offset;
}
};
enum class EventType { Allocate = 0, Free, Invalid };
struct MemEvent {
uint64_t time;
uint64_t allocation_id;
uint64_t size;
EventType type{EventType::Invalid};
MemEvent(uint64_t t, uint64_t id, uint64_t s, EventType e)
: time(t), allocation_id(id), size(s), type(e) {}
};
bool overlaps(const MemBlock& a, const MemBlock& b) {
  // Two blocks don't overlap if one ends at or before the other starts:
  // |---a--------|--------------b--------|
  // start_a  end_a <= start_b            end_b
return !(
(a.end_offset <= b.start_offset) || (b.end_offset <= a.start_offset));
}
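// Illustrative example (not from the original source): blocks [0, 64) and
// [64, 96) do not overlap, since the first ends exactly where the second
// starts, whereas [0, 64) and [32, 96) do overlap.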
bool validate_allocation_plan(
const std::vector<MemEvent>& alloc_events,
const std::vector<uint64_t>& allocation_offsets) {
std::set<MemBlock> allocations;
for (const auto& event : alloc_events) {
auto alloc_id = event.allocation_id;
// Skip allocations not managed by AllocationPlan
if (allocation_offsets[alloc_id] == std::numeric_limits<uint64_t>::max()) {
continue;
}
auto start_offset = allocation_offsets[alloc_id];
auto end_offset = allocation_offsets[alloc_id] + event.size;
MemBlock mem_block(start_offset, end_offset);
if (event.type == EventType::Allocate) {
auto it = allocations.lower_bound(mem_block);
if (it != allocations.end()) {
auto next_block = *it;
if (overlaps(next_block, mem_block)) {
return false;
}
}
if (it != allocations.begin()) {
auto prev_block = *(--it);
if (overlaps(prev_block, mem_block)) {
return false;
}
}
allocations.emplace(mem_block);
    } else if (event.type == EventType::Free) {
      auto it = allocations.find(mem_block);
      // Check for existence before dereferencing the iterator.
      TORCH_CHECK(
          it != allocations.end(),
          "ProfilingAllocator: Allocate event "
          "must have preceded deallocate event.");
      TORCH_CHECK(
          it->end_offset == end_offset,
          "End offset of allocation being freed must match the one recorded.");
      allocations.erase(it);
} else {
TORCH_CHECK(false, "ProfilingAllocator: Invalid event type.");
}
}
return true;
}
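// Illustrative example (hypothetical offsets): a plan that places two
// allocations that are live at the same time at [0, 64) and [32, 96) fails
// validation above because the blocks overlap; placing the second one at
// [64, 96) instead would pass.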
std::vector<MemEvent> create_and_sort_mem_events(
const std::vector<uint64_t>& allocation_sizes,
const std::vector<uint64_t>& allocation_lifetimes) {
std::vector<MemEvent> events;
for (uint64_t i = 0; i < allocation_sizes.size(); ++i) {
    // If an observed allocation is freed outside the scope of
    // observation, then it is not managed by the
    // AllocationPlan.
if (allocation_lifetimes[i] == std::numeric_limits<uint64_t>::max()) {
continue;
}
events.emplace_back(i, i, allocation_sizes[i], EventType::Allocate);
events.emplace_back(
allocation_lifetimes[i], i, allocation_sizes[i], EventType::Free);
}
std::sort(
events.begin(),
events.end(),
[](const MemEvent& a, const MemEvent& b) -> bool {
return a.time < b.time;
});
return events;
}
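// Illustrative sketch of what the greedy planner below computes (hypothetical
// inputs, not a real trace): with allocation_sizes = {64, 32, 64} and
// lifetimes such that allocation 0 is freed before allocation 2 is made, the
// planner can assign offsets {0, 64, 0}; allocation 2 reuses the block freed
// by allocation 0, so the arena needs 96 bytes instead of 160.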
std::vector<uint64_t> formulate_greedy_allocation_plan(
const std::vector<uint64_t>& allocation_sizes,
const std::vector<uint64_t>& allocation_lifetimes) {
  // Step 1. Construct all allocation/free events.
  //         Sort these events by timestamp.
  // Step 2. Iterate through all events.
  //   2.1 If allocate event:
  //       Find candidates of sufficient size in the free_size_to_offset map
  //       and greedily pick the first (smallest) one.
  //       Remove that entry from the free_size_to_offset map.
  //       new_offset = offset + request_size
  //       new_size = size - request_size
  //       If new_size > 0, add the remainder back to both maps.
  //   2.2 If free event:
  //       Check if the freed block merges with adjacent free chunks.
  //       If so, merge until no more merging is possible.
  //       If it does not merge, just insert it as a new free chunk.
  // lower_bound on this map will get all candidates of
  // the right size for the allocation.
  std::map<uint64_t, uint64_t> free_size_to_offset;
  // These provide fast lookup when we want to insert a freed block
  // back, especially when we want to merge blocks.
ska::flat_hash_map<uint64_t, std::map<uint64_t, uint64_t>::iterator>
free_start_offset_to_size_iter;
ska::flat_hash_map<uint64_t, std::map<uint64_t, uint64_t>::iterator>
free_end_offset_to_size_iter;
  // Upon free, end_ptr = offset + size.
  // If end_ptr exists as the start of another free block, merge the freed
  // allocation with it: find the corresponding offset in free_size_to_offset,
  // remove that entry, and re-insert the merged block with its new size and
  // offset. If end_ptr does not exist, just insert (offset, size) into the
  // map and, correspondingly, (size, offset) into the other map.
  // Merging is done in both directions, so free chunks stay maximally
  // coalesced; after the last free we should have only one entry left in
  // these maps.
std::vector<uint64_t> allocation_offsets(
allocation_sizes.size(), std::numeric_limits<uint64_t>::max());
auto mem_events =
create_and_sort_mem_events(allocation_sizes, allocation_lifetimes);
uint64_t max_offset{0};
for (const auto& mem_event : mem_events) {
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
uint64_t alloc_offset;
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
uint64_t new_offset, new_size;
if (mem_event.type == EventType::Allocate) {
auto it = free_size_to_offset.lower_bound(mem_event.size);
      if (it == free_size_to_offset.end()) {
        // If there is no free block large enough for the request,
        // allocate a new one at the end of the arena.
        alloc_offset = max_offset;
        max_offset += mem_event.size;
      } else {
        // We have found a free block of sufficient size:
        // 1. Allocate out of it.
        //    1.1 Erase the block from free_size_to_offset.
        //    1.2 Erase the corresponding reverse-map entries.
        // 2. If the block still has space left, insert the remainder back
        //    into the map, including the reverse-map entries.
        alloc_offset = it->second;
        new_offset = alloc_offset + mem_event.size;
        new_size = it->first - mem_event.size;
        // Erase the reverse-map entries before erasing `it`, since erasing
        // from free_size_to_offset invalidates the iterator.
        free_start_offset_to_size_iter.erase(alloc_offset);
        free_end_offset_to_size_iter.erase(alloc_offset + it->first);
        free_size_to_offset.erase(it);
        if (new_size > 0) {
          auto ref_it = free_size_to_offset.emplace(new_size, new_offset).first;
          free_start_offset_to_size_iter.emplace(new_offset, ref_it);
          free_end_offset_to_size_iter.emplace(new_offset + new_size, ref_it);
        }
      }
allocation_offsets[mem_event.allocation_id] = alloc_offset;
} else {
// 1. Check if freed block is adjacent to an existing free block
// at its end boundary. This is done by checking
// free_end_offset_to_size_iter.
// If we find such a block, remove it and adjust size of
// the block being freed.
// 2. Similarly check if freed block is adjacent to an existing
// free block at start boundary. This is done by checking
// free_start_offset_to_size_iter.
// If we find such a block, remove it and adjust size of
// the block being freed.
// 3. Insert the freed block in map.
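      // Illustrative example: if [0, 64) and [96, 128) are already free and
      // [64, 96) is freed now, both merges below fire and the three chunks
      // coalesce into a single free block [0, 128).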
auto freed_offset = allocation_offsets[mem_event.allocation_id];
auto freed_size = mem_event.size;
auto end_offset = freed_offset + freed_size;
      // Merge when another free block exists at the end of this block.
auto end_it = free_start_offset_to_size_iter.find(end_offset);
if (end_it != free_start_offset_to_size_iter.end()) {
auto merge_block_iter = end_it->second;
auto merge_block_size = merge_block_iter->first;
freed_size += merge_block_size;
free_size_to_offset.erase(merge_block_iter);
free_start_offset_to_size_iter.erase(end_it);
// If the block is being merged then also remove it from
// free_end_offset_to_size_iter
free_end_offset_to_size_iter.erase(end_offset + merge_block_size);
}
      // Merge when the freed block is at the end of another free block.
auto start_it = free_end_offset_to_size_iter.find(freed_offset);
if (start_it != free_end_offset_to_size_iter.end()) {
auto merge_block_iter = start_it->second;
auto merge_block_size = merge_block_iter->first;
freed_size += merge_block_size;
freed_offset -= merge_block_size;
free_size_to_offset.erase(merge_block_iter);
free_end_offset_to_size_iter.erase(start_it);
// If the block is being merged then also remove it from
// free_start_offset_to_size_iter
free_start_offset_to_size_iter.erase(freed_offset);
}
auto freed_block_it =
free_size_to_offset.emplace(freed_size, freed_offset).first;
free_start_offset_to_size_iter.emplace(freed_offset, freed_block_it);
free_end_offset_to_size_iter.emplace(
freed_offset + freed_size, freed_block_it);
}
}
TORCH_CHECK(
validate_allocation_plan(mem_events, allocation_offsets),
"ProfilingAllocator: Allocation plan invalid.");
return allocation_offsets;
}
} // namespace
void AllocationPlan::clear() {
allocation_sizes.clear();
allocation_lifetimes.clear();
allocation_offsets.clear();
}
void AllocationPlanner::record_allocation(
const uint64_t size,
const void* ptr) {
if (validation_mode_) {
validation_success = validation_success && validate_allocation(size, ptr);
return;
}
allocation_plan_->allocation_sizes.push_back(size);
allocation_plan_->allocation_lifetimes.push_back(
std::numeric_limits<uint64_t>::max());
allocation_ptr_to_id_[ptr] = allocation_id_;
allocation_id_++;
}
void AllocationPlanner::record_free(const void* ptr) {
if (validation_mode_) {
validation_success = validation_success && validate_free(ptr);
return;
}
auto it = allocation_ptr_to_id_.find(ptr);
if (it == allocation_ptr_to_id_.end()) {
    // Free being recorded was allocated outside of WithProfileAllocationsGuard
return;
}
auto id = it->second;
TORCH_CHECK(
id < allocation_plan_->allocation_lifetimes.size(),
"Allocation must have been recorded during record_allocation.");
allocation_plan_->allocation_lifetimes[id] = allocation_id_;
}
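// Illustrative trace (hypothetical): record_allocation(A), record_allocation(B),
// record_free(A), record_allocation(C) leaves three entries in
// allocation_sizes and allocation_lifetimes = {2, max, max}: A's lifetime is
// the value of the allocation counter (2) when it was freed, while B and C
// are freed outside the guard and therefore stay unmanaged.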
bool AllocationPlanner::validate_allocation(
const uint64_t size,
const void* ptr) {
  if (allocation_id_ >= allocation_plan_->allocation_sizes.size()) {
    // Do not index allocation_sizes with an out-of-range id.
    TORCH_WARN(
        "Allocation request does not match plan:",
        " Allocation id:",
        allocation_id_,
        " exceeds the number of recorded allocations:",
        allocation_plan_->allocation_sizes.size());
    return false;
  }
  if (allocation_plan_->allocation_sizes[allocation_id_] != size) {
    TORCH_WARN(
        "Allocation request does not match plan:",
        " Allocation id:",
        allocation_id_,
        ", Recorded size of the requested allocation:",
        allocation_plan_->allocation_sizes[allocation_id_],
        ", but got:",
        size);
    return false;
  }
allocation_ptr_to_id_[ptr] = allocation_id_;
allocation_id_++;
return true;
}
bool AllocationPlanner::validate_free(const void* ptr) {
auto it = allocation_ptr_to_id_.find(ptr);
if (it == allocation_ptr_to_id_.end()) {
// Allocation that was made outside the validation scope is being freed here
return true;
}
auto id = (*it).second;
TORCH_CHECK(
id < allocation_plan_->allocation_lifetimes.size(),
"Allocation must have been recorded during validate_allocation.");
auto lifetime_id = allocation_plan_->allocation_lifetimes[id];
return (lifetime_id == allocation_id_);
}
void AllocationPlanner::formulate_plan() {
allocation_plan_->allocation_offsets = formulate_greedy_allocation_plan(
allocation_plan_->allocation_sizes,
allocation_plan_->allocation_lifetimes);
allocation_plan_->total_size = 0;
for (const auto i : c10::irange(allocation_plan_->allocation_sizes.size())) {
if (allocation_plan_->allocation_lifetimes[i] ==
std::numeric_limits<uint64_t>::max()) {
continue;
}
auto limit = allocation_plan_->allocation_offsets[i] +
allocation_plan_->allocation_sizes[i];
allocation_plan_->total_size =
std::max(allocation_plan_->total_size, limit);
}
}
void AllocationPlanner::clear() {
allocation_plan_->clear();
allocation_ptr_to_id_.clear();
}
void CPUProfilingAllocator::set_plan(const AllocationPlan* plan) {
TORCH_CHECK(plan != nullptr, "Allocation plan is nullptr.");
plan_ = plan;
allocation_id_ = 0;
allocation_ptr_to_id_.clear();
if (current_size_ < plan->total_size) {
// Free existing memory and reallocate for larger size.
c10::free_cpu(blob_);
blob_ = c10::alloc_cpu(plan->total_size);
current_size_ = plan->total_size;
}
}
void CPUProfilingAllocator::unset_plan() {
allocation_id_ = 0;
allocation_ptr_to_id_.clear();
plan_ = nullptr;
}
void* CPUProfilingAllocator::allocate(const size_t bytes) {
TORCH_CHECK(
bytes == plan_->allocation_sizes[allocation_id_],
"Got allocation request that does not match with the plan.");
if (plan_->allocation_lifetimes[allocation_id_] ==
std::numeric_limits<uint64_t>::max()) {
// This allocation is not managed by ProfilingAllocator.
allocation_id_++;
return c10::alloc_cpu(bytes);
}
void* ptr = reinterpret_cast<uint8_t*>(blob_) +
plan_->allocation_offsets[allocation_id_];
allocation_ptr_to_id_[ptr] = allocation_id_;
allocation_id_++;
return ptr;
}
void CPUProfilingAllocator::free(void* const ptr) {
auto it = allocation_ptr_to_id_.find(ptr);
if (it == allocation_ptr_to_id_.end()) {
    // Either
    // 1. an allocation that was made outside the profiling scope is being
    //    freed here, or
    // 2. an allocation that is not managed by the profiling allocator is
    //    being freed. Example of the second type:
    //      Tensor out;
    //      for (....) {
    //        {
    //          // CPUProfilingAllocator active via WithProfilingAllocatorGuard
    //          out = ...some op (this also frees the previous memory held by out)
    //        }
    //        out is used..
    //      }
c10::free_cpu(ptr);
return;
}
auto id = it->second;
TORCH_CHECK(
id < plan_->allocation_lifetimes.size(),
"Freeing allocation that is not accordingly to the plan.");
auto lifetime_id = plan_->allocation_lifetimes[id];
TORCH_CHECK(
lifetime_id == allocation_id_,
"Lifetime of allocations do not match: allocation_id ",
id,
", expected:",
lifetime_id,
", got:",
allocation_id_);
}
CPUProfilingAllocator::~CPUProfilingAllocator() {
c10::free_cpu(blob_);
}
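// A minimal usage sketch of the guards defined below (illustrative only; the
// comment bodies stand for whatever workload is being profiled, and
// CPUProfilingAllocator is assumed to be default-constructed):
//
//   AllocationPlan plan;
//   {
//     WithProfileAllocationsGuard profile_guard(&plan);
//     // run the workload once to record allocation sizes and lifetimes
//   }
//   bool plan_valid{false};
//   {
//     WithValidateAllocationPlanGuard validate_guard(&plan, &plan_valid);
//     // run the workload again to check that the plan is reproducible
//   }
//   CPUProfilingAllocator profiling_allocator;
//   {
//     WithProfilingAllocatorGuard alloc_guard(&profiling_allocator, &plan);
//     // subsequent runs allocate out of the plan's preallocated blob
//   }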
WithProfileAllocationsGuard::WithProfileAllocationsGuard(AllocationPlan* plan) {
// Nesting of allocation profiling does not seem meaningful.
TORCH_CHECK(
allocation_planner == nullptr,
"Nesting profiling allocations is not supported.");
planner_ = std::make_unique<AllocationPlanner>(plan);
planner_->clear();
allocation_planner = planner_.get();
}
WithProfileAllocationsGuard::~WithProfileAllocationsGuard() {
planner_->formulate_plan();
allocation_planner = nullptr;
}
WithValidateAllocationPlanGuard::WithValidateAllocationPlanGuard(
AllocationPlan* plan,
bool* success) {
// Nesting of allocation profiling does not seem meaningful.
TORCH_CHECK(
allocation_planner == nullptr,
"Nesting profiling allocations is not supported.");
planner_ = std::make_unique<AllocationPlanner>(plan, true);
success_ = success;
allocation_planner = planner_.get();
}
WithValidateAllocationPlanGuard::~WithValidateAllocationPlanGuard() {
*success_ = planner_->validation_success;
allocation_planner = nullptr;
}
AllocationPlanner* GetThreadLocalAllocationPlanner() {
return allocation_planner;
}
WithProfilingAllocatorGuard::WithProfilingAllocatorGuard(
CPUProfilingAllocator* allocator,
const AllocationPlan* plan) {
// Nesting of profiling allocator is not supported.
TORCH_CHECK(
profiling_allocator == nullptr,
"Nesting profiling allocators is not supported.");
profiling_allocator = allocator;
profiling_allocator->set_plan(plan);
}
WithProfilingAllocatorGuard::~WithProfilingAllocatorGuard() {
profiling_allocator->unset_plan();
profiling_allocator = nullptr;
}
CPUProfilingAllocator* GetThreadLocalProfilingAllocator() {
return profiling_allocator;
}
} // namespace c10