forked from Rajeevveera24/pytorch-copy
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[Reland] Refactor caching device allocator utils (#130923)
# Motivation Following [[RFC] Intel GPU Runtime Upstreaming for Allocator](pytorch/pytorch#116322), this PR refactors the caching device allocator utilities to improve code reuse. This is the first PR in the series; follow-up PRs will continue refactoring the device caching allocator. Pull Request resolved: pytorch/pytorch#130923 Approved by: https://github.com/EikanWang, https://github.com/gujinghui, https://github.com/albanD, https://github.com/eqy
- Loading branch information
1 parent
d7c97e7
commit 6c1da66
Showing
7 changed files
with
202 additions
and
190 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,131 @@ | ||
#pragma once

#include <c10/core/Allocator.h>
#include <c10/util/irange.h>

#include <algorithm>
#include <array>
#include <cstddef>
#include <cstdint>
#include <sstream>
#include <string>
|
||
namespace c10::CachingDeviceAllocator { | ||
|
||
struct Stat { | ||
void increase(size_t amount) { | ||
current += static_cast<int64_t>(amount); | ||
peak = std::max(current, peak); | ||
allocated += static_cast<int64_t>(amount); | ||
} | ||
|
||
void decrease(size_t amount) { | ||
current -= static_cast<int64_t>(amount); | ||
TORCH_INTERNAL_ASSERT_DEBUG_ONLY( | ||
current >= 0, | ||
"Negative tracked stat in device allocator (likely logic error)."); | ||
freed += static_cast<int64_t>(amount); | ||
} | ||
|
||
void reset_accumulated() { | ||
allocated = 0; | ||
freed = 0; | ||
} | ||
|
||
void reset_peak() { | ||
peak = current; | ||
} | ||
|
||
int64_t current = 0; | ||
int64_t peak = 0; | ||
int64_t allocated = 0; | ||
int64_t freed = 0; | ||
}; | ||
|
||
// Categories under which each statistic is tracked. The enumerator values are
// used as array indices, so they must stay dense and start at zero.
enum struct StatType : uint64_t {
  AGGREGATE = 0,
  SMALL_POOL = 1,
  LARGE_POOL = 2,
  NUM_TYPES = 3 // remember to update this whenever a new stat type is added
};
|
||
// One Stat per StatType, indexed by the enumerator's integer value.
using StatArray = std::array<Stat, static_cast<size_t>(StatType::NUM_TYPES)>;
// One flag per StatType; a true entry marks that stat type as selected.
using StatTypes = std::array<bool, static_cast<size_t>(StatType::NUM_TYPES)>;
|
||
template <typename Func> | ||
void for_each_selected_stat_type(const StatTypes& stat_types, Func f) { | ||
for (const auto stat_type : c10::irange(stat_types.size())) { | ||
if (stat_types[stat_type]) { | ||
f(stat_type); | ||
} | ||
} | ||
} | ||
|
||
// Struct containing memory allocator summary statistics for a device. | ||
struct DeviceStats { | ||
// COUNT: allocations requested by client code | ||
StatArray allocation; | ||
// COUNT: number of allocated segments from device memory allocation. | ||
StatArray segment; | ||
// COUNT: number of active memory blocks (allocated or used by stream) | ||
StatArray active; | ||
// COUNT: number of inactive, split memory blocks (unallocated but can't be | ||
// released via device memory deallocation) | ||
StatArray inactive_split; | ||
|
||
// SUM: bytes allocated by this memory alocator | ||
StatArray allocated_bytes; | ||
// SUM: bytes reserved by this memory allocator (both free and used) | ||
StatArray reserved_bytes; | ||
// SUM: bytes within active memory blocks | ||
StatArray active_bytes; | ||
// SUM: bytes within inactive, split memory blocks | ||
StatArray inactive_split_bytes; | ||
// SUM: bytes requested by client code | ||
StatArray requested_bytes; | ||
|
||
// COUNT: total number of failed calls to device malloc necessitating cache | ||
// flushes. | ||
int64_t num_alloc_retries = 0; | ||
|
||
// COUNT: total number of OOMs (i.e. failed calls to device memory allocation | ||
// after cache flush) | ||
int64_t num_ooms = 0; | ||
|
||
// COUNT: total number of oversize blocks allocated from pool | ||
Stat oversize_allocations; | ||
|
||
// COUNT: total number of oversize blocks requiring malloc | ||
Stat oversize_segments; | ||
|
||
// COUNT: total number of synchronize_and_free_events() calls | ||
int64_t num_sync_all_streams = 0; | ||
|
||
// COUNT: total number of device memory allocation calls. This includes both | ||
// mapped and malloced memory. | ||
int64_t num_device_alloc = 0; | ||
|
||
// COUNT: total number of device memory deallocation calls. This includes both | ||
// un-mapped and free memory. | ||
int64_t num_device_free = 0; | ||
|
||
// SIZE: maximum block size that is allowed to be split. | ||
int64_t max_split_size = 0; | ||
}; | ||
|
||
// Size pretty-printer.
//
// Renders a byte count as a human-readable string using binary units:
//   size <= 1024        -> "<size> bytes" (printed as an integer)
//   size <= 1048576     -> "<size / 1024> KiB"
//   size <= 1073741824  -> "<size / 1048576> MiB"
//   otherwise           -> "<size / 1073741824> GiB"
// Scaled values are printed in fixed notation with two decimals. The upper
// bound of each range is inclusive, so exactly 1024 prints as "1024 bytes"
// and exactly 1048576 prints as "1024.00 KiB".
inline std::string format_size(uint64_t size) {
  std::ostringstream os;
  os.precision(2);
  os << std::fixed;
  if (size <= 1024) {
    os << size << " bytes";
  } else if (size <= 1048576) {
    os << (static_cast<double>(size) / 1024.0);
    os << " KiB";
  } else if (size <= 1073741824ULL) {
    os << static_cast<double>(size) / 1048576.0;
    os << " MiB";
  } else {
    os << static_cast<double>(size) / 1073741824.0;
    os << " GiB";
  }
  return os.str();
}
|
||
} // namespace c10::CachingDeviceAllocator |
Oops, something went wrong.