use cuda caching allocator from pytorch (NVIDIA#1180)
xwang233 authored Sep 30, 2021
1 parent 2a559c5 commit bdac244
Showing 4 changed files with 14 additions and 17 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -4,3 +4,5 @@ build
 docs/build
 *~
 __pycache__
+*.so
+.vscode
14 changes: 6 additions & 8 deletions apex/contrib/csrc/groupbn/batch_norm.cu
@@ -1,6 +1,7 @@
 #include <ATen/ATen.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <THC/THCNumerics.cuh>
+#include <c10/cuda/CUDACachingAllocator.h>
 
 #include "THC/THC.h"
 
@@ -26,23 +27,20 @@ static size_t round_up_to_multiple(size_t x, int multiple) {
   return ((x + multiple - 1) / multiple) * multiple;
 }
 
-// TODO: Stop manually allocating CUDA memory; allocate an ATen byte
-// tensor instead.
 struct Workspace {
   Workspace(size_t size) : size(size), data(NULL) {
-    data = THCudaMalloc(at::globalContext().lazyInitCUDA(), size);
+    auto& allocator = *::c10::cuda::CUDACachingAllocator::get();
+    dataPtr = allocator.allocate(size);
+    data = dataPtr.get();
   }
   Workspace(const Workspace&) = delete;
   Workspace(Workspace&&) = default;
   Workspace& operator=(Workspace&&) = default;
-  ~Workspace() {
-    if (data) {
-      THCudaFree(at::globalContext().lazyInitCUDA(), data);
-    }
-  }
+  ~Workspace() = default;
 
   size_t size;
   void* data;
+  c10::DataPtr dataPtr;
 };
 
 // Return {y}
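For readers porting similar THC-era code, here is a hedged, self-contained sketch of the pattern this hunk introduces. The function name run_with_scratch and its parameter are hypothetical illustrations; only the allocator calls mirror the new code above.

#include <cstddef>

#include <c10/cuda/CUDACachingAllocator.h>

// Hypothetical call site (not from this commit) showing the RAII pattern
// the new Workspace struct relies on: the c10::DataPtr returned by
// allocate() owns the buffer and returns it to PyTorch's memory pool
// when it goes out of scope, so no explicit free is needed.
void run_with_scratch(size_t nbytes) {
  auto& allocator = *::c10::cuda::CUDACachingAllocator::get();
  c10::DataPtr scratch = allocator.allocate(nbytes);  // pooled device memory
  void* raw = scratch.get();  // raw pointer to hand to kernels / cuDNN
  // ... launch work that uses `raw` as workspace ...
}  // ~DataPtr runs here and returns the block to the caching allocator

This is also why the diff can replace the explicit THCudaFree destructor with ~Workspace() = default: ownership now lives in the c10::DataPtr member.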
14 changes: 6 additions & 8 deletions apex/contrib/csrc/groupbn/batch_norm_add_relu.cu
@@ -1,6 +1,7 @@
 #include <ATen/ATen.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <THC/THCNumerics.cuh>
+#include <c10/cuda/CUDACachingAllocator.h>
 
 #include "THC/THC.h"
 
@@ -27,23 +28,20 @@ static size_t round_up_to_multiple(size_t x, int multiple) {
   return ((x + multiple - 1) / multiple) * multiple;
 }
 
-// TODO: Stop manually allocating CUDA memory; allocate an ATen byte
-// tensor instead.
 struct Workspace {
   Workspace(size_t size) : size(size), data(NULL) {
-    data = THCudaMalloc(at::globalContext().lazyInitCUDA(), size);
+    auto& allocator = *::c10::cuda::CUDACachingAllocator::get();
+    dataPtr = allocator.allocate(size);
+    data = dataPtr.get();
   }
   Workspace(const Workspace&) = delete;
   Workspace(Workspace&&) = default;
   Workspace& operator=(Workspace&&) = default;
-  ~Workspace() {
-    if (data) {
-      THCudaFree(at::globalContext().lazyInitCUDA(), data);
-    }
-  }
+  ~Workspace() = default;
 
   size_t size;
   void* data;
+  c10::DataPtr dataPtr;
 };
 
 // Return {y}
1 change: 0 additions & 1 deletion apex/contrib/csrc/xentropy/xentropy_kernel.cu
@@ -78,7 +78,6 @@
 
 #include <THC/THC.h>
 #include <THC/THCGeneral.h>
-#include <THC/THCThrustAllocator.cuh>
 
 #include "type_shim.h"
 #include "compat.h"
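The THC/THCThrustAllocator.cuh include is dropped here because this file no longer uses it. For code that still routes Thrust temporaries through an allocator, below is a hedged sketch of playing the same role on top of the c10 caching allocator; the CachingThrustAllocator type and the usage lines are my illustration, not code from this commit.

#include <cstddef>
#include <unordered_map>

#include <c10/cuda/CUDACachingAllocator.h>
#include <thrust/execution_policy.h>
#include <thrust/sort.h>
#include <thrust/system/cuda/execution_policy.h>

// Hypothetical adapter (illustration only): satisfies Thrust's allocator
// protocol (value_type / allocate / deallocate), which is the same shape
// THCThrustAllocator had, while drawing memory from PyTorch's caching
// allocator instead of THCudaMalloc.
struct CachingThrustAllocator {
  using value_type = char;

  char* allocate(std::ptrdiff_t n) {
    auto& alloc = *::c10::cuda::CUDACachingAllocator::get();
    c10::DataPtr p = alloc.allocate(n);
    char* raw = static_cast<char*>(p.get());
    live_.emplace(raw, std::move(p));  // keep the DataPtr (and buffer) alive
    return raw;
  }

  void deallocate(char* p, size_t /*n*/) {
    live_.erase(p);  // dropping the DataPtr returns the block to the pool
  }

 private:
  std::unordered_map<char*, c10::DataPtr> live_;
};

// Usage sketch: hand the adapter to Thrust's execution policy so its
// scratch allocations go through the caching allocator.
//   CachingThrustAllocator alloc;
//   thrust::sort(thrust::cuda::par(alloc).on(stream), keys, keys + n);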
