diff --git a/apex/mlp/__init__.py b/apex/mlp/__init__.py
new file mode 100644
index 000000000..f2f30f7d1
--- /dev/null
+++ b/apex/mlp/__init__.py
@@ -0,0 +1 @@
+from .mlp import *
diff --git a/apex/mlp/mlp.py b/apex/mlp/mlp.py
new file mode 100644
index 000000000..50d2dc1df
--- /dev/null
+++ b/apex/mlp/mlp.py
@@ -0,0 +1,70 @@
+from copy import copy
+import math
+import torch
+from torch import nn
+import mlp_cuda
+from .. import amp
+
+class MlpFunction(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, *args):
+        output = mlp_cuda.forward(args)
+        ctx.save_for_backward(*args)
+        ctx.outputs = output
+        return output[0]
+
+    @staticmethod
+    def backward(ctx, grad_o):
+        grads = mlp_cuda.backward(grad_o, ctx.outputs, ctx.saved_tensors)
+        del ctx.outputs
+        return tuple(grads)
+
+mlp_function = amp.half_function(MlpFunction.apply)
+
+class MLP(torch.nn.Module):
+    """Launch MLP in C++
+
+    Args:
+        mlp_sizes (list of int): MLP sizes. Example: [1024,1024,1024] will create 2 MLP layers with shape 1024x1024
+        bias (bool): Default True
+        relu (bool): Default True
+    """
+    def __init__(self, mlp_sizes, bias=True, relu=True):
+        if not (bias and relu):
+            raise TypeError("bias and relu must both be True.")
+        super(MLP, self).__init__()
+        self.num_layers = len(mlp_sizes) - 1
+        self.mlp_sizes = copy(mlp_sizes)
+        self.bias = bias
+        self.relu = relu
+
+        # bias=False is ignored for now.
+        self.weights = []
+        self.biases = []
+        for i in range(self.num_layers):
+            w = torch.nn.Parameter(torch.empty(mlp_sizes[i + 1], mlp_sizes[i]))
+            self.weights.append(w)
+            name = 'weight_{}'.format(i)
+            setattr(self, name, w)
+            b = torch.nn.Parameter(torch.empty(mlp_sizes[i + 1]))
+            self.biases.append(b)
+            name = 'bias_{}'.format(i)
+            setattr(self, name, b)
+
+        self.reset_parameters()
+
+    def reset_parameters(self):
+        for weight in self.weights:
+            dimsum = weight.size(0) + weight.size(1)
+            std = math.sqrt(2. / float(dimsum))
+            nn.init.normal_(weight, 0., std)
+        for bias in self.biases:
+            std = math.sqrt(1. / float(bias.size(0)))
+            nn.init.normal_(bias, 0., std)
+
+    def forward(self, input):
+        return mlp_function(input, *self.weights, *self.biases)
+
+    def extra_repr(self):
+        s = F"MLP sizes: {self.mlp_sizes}, Bias={self.bias}, ReLU={self.relu}"
+        return s
diff --git a/csrc/mlp.cpp b/csrc/mlp.cpp
new file mode 100644
index 000000000..cda1a707a
--- /dev/null
+++ b/csrc/mlp.cpp
@@ -0,0 +1,139 @@
+#include <torch/extension.h>
+#include <torch/torch.h>
+#include <vector>
+
+#include <stdio.h>
+
+size_t get_mlp_reserved_space(int batch_size, int num_layers, const int* output_features);
+
+template <typename T>
+size_t get_mlp_bp_workspace_in_bytes(int batch_size, int num_layers, const int* output_features);
+
+template <typename T>
+int mlp_fp(
+    T* X,
+    int input_features,
+    int batch_size,
+    T** WPtr,
+    int num_layers,
+    int* output_features,
+    T** BPtr,
+    T* Y,
+    T* reserved_space);
+
+template <typename T>
+int mlp_bp(
+    T* X,
+    T* Y,
+    int input_features,
+    int batch_size,
+    T** WPtr,
+    int num_layers,
+    int* output_features,
+    T* dY,
+    T* reserved_space,
+    T* work_space,
+    T* dX,
+    T** dwPtr,
+    T** dbPtr);
+
+std::vector<at::Tensor> mlp_forward(std::vector<at::Tensor> inputs) {
+  // inputs contains (input, weights, biases)
+  auto num_layers = (inputs.size() - 1) / 2;
+  auto batch_size = inputs[0].size(0);
+  auto input_features = inputs[0].size(1);
+
+  std::vector<int> output_features;
+  for (int i = 0; i < num_layers; i++) {
+    output_features.push_back(inputs[i + 1].size(0));
+  }
+
+  auto reserved_size = get_mlp_reserved_space(batch_size, num_layers, output_features.data());
+
+  // create output/workspace tensor
+  // TODO(deyuf): just get buffer?
+  auto out = at::empty({batch_size, output_features.back()}, inputs[0].type());
+  auto reserved_space = at::empty({reserved_size}, inputs[0].type());
+
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(inputs[0].type(), "mlp_forward", [&] {
+    std::vector<scalar_t*> w_ptr;
+    std::vector<scalar_t*> b_ptr;
+    for (int i = 0; i < num_layers; i++) {
+      w_ptr.push_back(inputs[i + 1].data_ptr<scalar_t>());
+      b_ptr.push_back(inputs[i + 1 + num_layers].data_ptr<scalar_t>());
+    }
+    auto result = mlp_fp<scalar_t>(
+        inputs[0].data_ptr<scalar_t>(),
+        input_features,
+        batch_size,
+        w_ptr.data(),
+        num_layers,
+        output_features.data(),
+        b_ptr.data(),
+        out.data_ptr<scalar_t>(),
+        reserved_space.data_ptr<scalar_t>());
+  });
+
+  return {out, reserved_space};
+}
+
+std::vector<at::Tensor> mlp_backward(
+    at::Tensor grad_o,
+    std::vector<at::Tensor> fprop_outputs,
+    std::vector<at::Tensor> inputs) {
+  // same code to get sizes and W pointers
+  auto num_layers = (inputs.size() - 1) / 2;
+  auto batch_size = inputs[0].size(0);
+  auto input_features = inputs[0].size(1);
+
+  std::vector<int> output_features;
+  for (int i = 0; i < num_layers; i++) {
+    output_features.push_back(inputs[i + 1].size(0));
+  }
+  // create outputs, length of inputs
+  std::vector<at::Tensor> outputs;
+  for (int i = 0; i < inputs.size(); i++) {
+    outputs.push_back(at::empty(inputs[i].sizes(), inputs[i].type()));  // clone for testing now
+  }
+
+  AT_DISPATCH_FLOATING_TYPES_AND_HALF(inputs[0].type(), "mlp_backward", [&] {
+    std::vector<scalar_t*> w_ptr;
+    std::vector<scalar_t*> b_ptr;
+    for (int i = 0; i < num_layers; i++) {
+      w_ptr.push_back(inputs[i + 1].data_ptr<scalar_t>());
+      b_ptr.push_back(inputs[i + 1 + num_layers].data_ptr<scalar_t>());
+    }
+    std::vector<scalar_t*> outputs_ptr;
+    for (int i = 0; i < inputs.size(); i++) {
+      outputs_ptr.push_back(outputs[i].data_ptr<scalar_t>());
+    }
+
+    auto work_size =
+        get_mlp_bp_workspace_in_bytes<scalar_t>(batch_size, num_layers, output_features.data());
+
+    // auto work_space = at::empty({work_size*4}, at::kByte);
+    auto work_space = at::empty({work_size / sizeof(scalar_t)}, inputs[0].type());
+
+    auto result = mlp_bp<scalar_t>(
+        inputs[0].data_ptr<scalar_t>(),
+        fprop_outputs[0].data_ptr<scalar_t>(),
+        input_features,
+        batch_size,
+        w_ptr.data(),
+        num_layers,
+        output_features.data(),
+        grad_o.contiguous().data_ptr<scalar_t>(),
+        fprop_outputs[1].data_ptr<scalar_t>(),
+        work_space.data_ptr<scalar_t>(),
+        outputs_ptr[0],
+        outputs_ptr.data() + 1,
+        outputs_ptr.data() + 1 + num_layers);
+  });
+
+  return outputs;
+}
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+  m.def("forward", &mlp_forward, "MLP forward");
+  m.def("backward", &mlp_backward, "MLP backward");
+}
diff --git a/csrc/mlp_cuda.cu b/csrc/mlp_cuda.cu
new file mode 100644
index 000000000..e14d62c0e
--- /dev/null
+++ b/csrc/mlp_cuda.cu
@@ -0,0 +1,884 @@
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <torch/torch.h>
+
+/* Includes, cuda */
+#include <cublas_v2.h>
+#include <cuda_runtime.h>
+
+#define BIASADDRELU_FPROP_NUM_THREADS 128
+#define BIASADDRELU_BPROP_NUM_THREADS 128
+
+// move to a header later on
+#define ILP 4
+template <typename T>
+__host__ __device__ __forceinline__ bool is_aligned(T* p){
+  return ((uint64_t)p) % (ILP*sizeof(T)) == 0;
+}
+
+template <typename T>
+__device__ __forceinline__ void load_store(T* dst, T* src, int dst_offset, int src_offset){
+  typedef typename std::aligned_storage<ILP*sizeof(T), ILP*alignof(T)>::type LT;
+  ((LT*)dst)[dst_offset] = ((LT*)src)[src_offset];
+}
+template <typename T>
+__device__ __forceinline__ void load_store(T* dst, volatile T* src, int dst_offset, int src_offset){
+  typedef typename std::aligned_storage<ILP*sizeof(T), ILP*alignof(T)>::type LT;
+  ((LT*)dst)[dst_offset] = ((LT*)src)[src_offset];
+}
+template <typename T>
+__device__ __forceinline__ void load_store(volatile T* dst, T* src, int dst_offset, int src_offset){
+  typedef typename std::aligned_storage<ILP*sizeof(T), ILP*alignof(T)>::type LT;
+  ((LT*)dst)[dst_offset] = ((LT*)src)[src_offset];
+}
+
+// Keep ReLU in float only. When using half, cast to float before calling.
+__device__ __inline__ float relu(float a) {
+  float retf = max(a, 0.f);
+  return (retf);
+}
+
+// FP64 Wrapper around cublas GEMMEx
+cublasStatus_t mlp_gemm(
+    cublasHandle_t handle,
+    cublasOperation_t transa,
+    cublasOperation_t transb,
+    int m,
+    int n,
+    int k,
+    float* alpha,
+    const double* A,
+    int lda,
+    const double* B,
+    int ldb,
+    const float* beta,
+    double* C,
+    int ldc) {
+  return cublasGemmEx(
+      handle,
+      transa,
+      transb,
+      m,
+      n,
+      k,
+      alpha,
+      A,
+      CUDA_R_64F,
+      lda,
+      B,
+      CUDA_R_64F,
+      ldb,
+      beta,
+      C,
+      CUDA_R_64F,
+      ldc,
+      CUDA_R_64F,
+      CUBLAS_GEMM_DEFAULT);
+}
+
+// FP32 Wrapper around cublas GEMMEx
+cublasStatus_t mlp_gemm(
+    cublasHandle_t handle,
+    cublasOperation_t transa,
+    cublasOperation_t transb,
+    int m,
+    int n,
+    int k,
+    float* alpha,
+    const float* A,
+    int lda,
+    const float* B,
+    int ldb,
+    const float* beta,
+    float* C,
+    int ldc) {
+  return cublasGemmEx(
+      handle,
+      transa,
+      transb,
+      m,
+      n,
+      k,
+      alpha,
+      A,
+      CUDA_R_32F,
+      lda,
+      B,
+      CUDA_R_32F,
+      ldb,
+      beta,
+      C,
+      CUDA_R_32F,
+      ldc,
+      CUDA_R_32F,
+      CUBLAS_GEMM_DEFAULT);
+}
+
+// FP16 Tensor core wrapper around cublas GEMMEx
+cublasStatus_t mlp_gemm(
+    cublasHandle_t handle,
+    cublasOperation_t transa,
+    cublasOperation_t transb,
+    int m,
+    int n,
+    int k,
+    float* alpha,
+    const at::Half* A,
+    int lda,
+    const at::Half* B,
+    int ldb,
+    float* beta,
+    at::Half* C,
+    int ldc) {
+  return cublasGemmEx(
+      handle,
+      transa,
+      transb,
+      m,
+      n,
+      k,
+      alpha,
+      A,
+      CUDA_R_16F,
+      lda,
+      B,
+      CUDA_R_16F,
+      ldb,
+      beta,
+      C,
+      CUDA_R_16F,
+      ldc,
+      CUDA_R_32F,
+      CUBLAS_GEMM_DEFAULT_TENSOR_OP);
+}
+
+// Bias ADD + ReLU. Assume input X is [features x batch size], assume column major.
+// Bias is one 'features' long vector, with implicit broadcast.
+// Currently, activation supports fused ReLU. Safe to call in-place.
+template <typename T>
+__global__ void biasAddRelu_fprop(T *X, T *b, uint batch_size, uint features) {
+  T r_x[ILP];
+  T r_b[ILP];
+  if(is_aligned(X) && is_aligned(b) && features % ILP == 0) {
+    int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    for (; tid*ILP < features * batch_size; tid += blockDim.x * gridDim.x) {
+      int row = tid % (features / ILP);
+      load_store(r_x, X, 0, tid);
+      load_store(r_b, b, 0, row);
+#pragma unroll
+      for(int ii = 0; ii < ILP; ii++) {
+        float bias_sum = static_cast<float>(r_x[ii]) + static_cast<float>(r_b[ii]);
+        r_x[ii] = relu(bias_sum);
+      }
+      load_store(X, r_x, tid, 0);
+    }
+  } else {
+    int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    for (; tid < features * batch_size; tid += ILP * blockDim.x * gridDim.x) {
+#pragma unroll
+      for(int ii = 0; ii < ILP; ii++) {
+        int idx = tid + ii * blockDim.x * gridDim.x;
+        if(idx < features * batch_size) {
+          int row = idx % features;
+          r_x[ii] = X[idx];
+          r_b[ii] = b[row];
+        }
+      }
+#pragma unroll
+      for(int ii = 0; ii < ILP; ii++) {
+        float bias_sum = static_cast<float>(r_x[ii]) + static_cast<float>(r_b[ii]);
+        r_x[ii] = relu(bias_sum);
+      }
+#pragma unroll
+      for(int ii = 0; ii < ILP; ii++) {
+        int idx = tid + ii * blockDim.x * gridDim.x;
+        if(idx < features * batch_size) {
+          X[idx] = r_x[ii];
+        }
+      }
+    }
+  }
+}
+
+// Compute grid size for pointwise backward kernel.
+// Some intelligence needed to determine number of splits for reduction.
+void get_biasAddRelu_bprop_grid_size(
+    int yfeat,
+    int threadsPerBlock,
+    int batch_size,
+    int* grid_x,
+    int* grid_y) {
+  // Get number of SMs for efficient reduction.
+  int num_SMs = at::cuda::getCurrentDeviceProperties()->multiProcessorCount;
+  // First preference, whole reduction in 1 CTA
+  int nBlocks = (yfeat + threadsPerBlock - 1) / threadsPerBlock;
+
+  // Figure out how many splits to divide reduction into. At least 32 elements per CTA.
+  // We want grid_y close to sqrt(batch_size).
+  int nRedSplits = std::sqrt(batch_size);
+  // For batch_size < 64, just use 1 block.
+  if(batch_size < 64) nRedSplits = 1;
+  // No need to go over occupancy.
+  nRedSplits = min((8*num_SMs)/nBlocks, nRedSplits);
+
+  *grid_x = nBlocks;
+  *grid_y = nRedSplits;
+  return;
+}
+
+// Addition done deterministically via a 2-pass approach. Each CTA writes out partial
+// sum, and the last CTA in grid Y dimension accumulates partials serially and writes to result.
+template <typename T, int UNROLL_FACTOR>
+__global__ void biasAddRelu_bprop(
+    T* Y,
+    T* dY,
+    int features,
+    int batch_size,
+    T* dX,
+    volatile float* intermediate,
+    int* semaphores,
+    T* db) {
+  // The feature that this thread is responsible for
+  int f = blockIdx.x * blockDim.x + threadIdx.x;
+
+  // Compute the batch span this thread is responsible for
+  int chunkSize = (batch_size + gridDim.y - 1) / gridDim.y;
+  int nStart = blockIdx.y * chunkSize;
+  int nSpan = min(batch_size, nStart + chunkSize) - nStart;
+  volatile float* out = intermediate + blockIdx.y * features;
+
+  // Flag to trigger last reduction.
+  __shared__ bool isLastBlock;
+
+  // Accumulate db in FP32 always
+  float db_local = 0;
+  if (f < features) {
+    int nidx = 0;
+    // Handle non-multiple of UNROLL_FACTOR residue
+    for (; nidx < nSpan % UNROLL_FACTOR; nidx++) {
+      int row, col, flat_idx;
+      row = f;
+      col = nStart + nidx;
+      flat_idx = col * features + row;
+      T y_val = Y[flat_idx];
+      T dy_val = dY[flat_idx];
+      T dx_val;
+      if ((float)y_val > 0.f)
+        dx_val = dy_val;
+      else
+        dx_val = 0;
+      dX[flat_idx] = dx_val;
+      db_local += (float)dx_val;
+    }
+
+    // Handle meat of work
+    for (; (nidx + UNROLL_FACTOR - 1) < nSpan; nidx += UNROLL_FACTOR) {
+      int row, col, flat_idx;
+      row = f;
+      col = nStart + nidx;
+      flat_idx = col * features + row;
+#pragma unroll 4
+      for (int u = 0; u < UNROLL_FACTOR; u++) {
+        T y_val = Y[flat_idx];
+        T dy_val = dY[flat_idx];
+        T dx_val;
+        if ((float)y_val > 0.f)
+          dx_val = dy_val;
+        else
+          dx_val = 0;
+        dX[flat_idx] = dx_val;
+        db_local += (float)dx_val;
+        flat_idx += features;
+      }
+    }
+
+    // Write out partial result
+    out[f] = db_local;
+  }
+  __threadfence();
+  __syncthreads();
+
+  // Increment semaphore and check if this is the last CTA in
+  // the grid_y dimension.
+  if (threadIdx.x == 0 && f < features) {
+    unsigned int sum_idx;
+    sum_idx = atomicAdd(&(semaphores[blockIdx.x]), 1);
+    isLastBlock = (sum_idx == (gridDim.y - 1));
+  }
+  __syncthreads();
+
+  db_local = 0;
+  if (isLastBlock && f < features) {
+    for (int n = 0; n < gridDim.y; n++) {
+      int row, col;
+      row = f;
+      col = n;
+      db_local += (float)(intermediate[col * features + row]);
+    }
+    db[f] = (T)db_local;
+  }
+}
+
+// Addition done deterministically via a 2-pass approach. Each CTA writes out partial
+// sum, and the last CTA in grid Y dimension accumulates partials serially and writes to result.
+template <typename T, int UNROLL_FACTOR>
+__global__ void biasAddRelu_bprop_aligned(
+    T* Y,
+    T* dY,
+    int features,
+    int batch_size,
+    T* dX,
+    volatile float* intermediate,
+    int* semaphores,
+    T* db) {
+  // The feature that this thread is responsible for
+  int f = blockIdx.x * blockDim.x + threadIdx.x;
+
+  // Compute the batch span this thread is responsible for
+  int chunkSize = (batch_size + gridDim.y - 1) / gridDim.y;
+  int nStart = blockIdx.y * chunkSize;
+  int nSpan = min(batch_size, nStart + chunkSize) - nStart;
+  volatile float* out = intermediate + blockIdx.y * features;
+
+  // Flag to trigger last reduction.
+  __shared__ bool isLastBlock;
+
+  // Accumulate db in FP32 always
+  float db_local[ILP];
+  T r_y[ILP];
+  T r_dy[ILP];
+#pragma unroll
+  for(int ii = 0; ii < ILP; ii++) db_local[ii] = 0.f;
+
+template <typename T>
+size_t get_mlp_bp_workspace_in_bytes(int batch_size, int num_layers, const int* output_features) {
+  size_t work_space = 0;
+
+  // Store each intermediate dY explicitly. Need 2 dYs per MLP layer (one for o/p
+  // of biasReLU_bp and one for o/p of dgrad GEMM).
+  work_space += 2 * get_all_activations_size(batch_size, num_layers, output_features) * sizeof(T);
+  work_space +=
+      get_reduction_scratch_space(batch_size, num_layers, output_features) * sizeof(float);
+  work_space += get_semaphores_size(num_layers, output_features) * sizeof(int);
+
+  return work_space;
+}
+
+// Returns pointers to each segment of the workspace
+template <typename T>
+void partition_mlp_bp_workspace(
+    int batch_size,
+    int num_layers,
+    const int* output_features,
+    void* work_space,
+    T** dy_gemms,
+    T** dx_gemms,
+    float** db_scratch,
+    int** semaphores) {
+  /*
+    Workspace is partitioned as
+    DY_GEMMs : DX_GEMMs : DB_SCRATCH : SEMAPHORES
+  */
+  // Start address where dy_gemm tensors are stored
+  *dy_gemms = reinterpret_cast<T*>(work_space);
+  // Start address where dx_gemm tensors are stored
+  *dx_gemms = *dy_gemms + get_all_activations_size(batch_size, num_layers, output_features);
+  // Start address where db intermediate tensors are stored
+  *db_scratch = reinterpret_cast<float*>(
+      *dx_gemms + get_all_activations_size(batch_size, num_layers, output_features));
+  // Start address of semaphores
+  *semaphores = reinterpret_cast<int*>(
+      *db_scratch + get_reduction_scratch_space(batch_size, num_layers, output_features));
+
+  return;
+}
+
+// Does a simple MLP fprop (GEMM+bias+ReLU).
+// Can handle num_layers number of layers, each with its own shape. Output of layer i is assumed
+// to be input of layer i+1. output_features, WPtr and BPtr are arrays of length num_layers, and
+// must be in the same order, i.e. WPtr[i] and BPtr[i] are respectively the weight and bias of layer
+// 'i'.
+template <typename T>
+int mlp_fp(
+    T* X,
+    int input_features,
+    int batch_size,
+    T** WPtr,
+    int num_layers,
+    int* output_features,
+    T** BPtr,
+    T* Y,
+    T* reserved_space) {
+  T *weight, *input, *output, *bias;
+  T *reserved_space_x, *reserved_space_y;
+  reserved_space_x = NULL;
+  reserved_space_y = reserved_space;
+
+  // Get cublas handle from Pytorch
+  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
+  // Get the stream from cublas handle to reuse for biasReLU kernel.
+  cudaStream_t stream;
+  cublasGetStream(handle, &stream);
+
+  for (int layer = 0; layer < num_layers; layer++) {
+    weight = WPtr[layer];
+    input = (layer == 0) ? X : reserved_space_x;
+    output = (layer == num_layers - 1) ? Y : reserved_space_y;
+    bias = BPtr[layer];
+    int ifeat = (layer == 0) ? input_features : output_features[layer - 1];
+    int ofeat = output_features[layer];
+
+    float one = 1.f;
+    float zero = 0.f;
+
+    cublasStatus_t cublas_status;
+    // Call GEMM: fprop is Y = W'X
+    cublas_status = mlp_gemm(
+        handle,
+        CUBLAS_OP_T,
+        CUBLAS_OP_N,
+        ofeat,
+        batch_size,
+        ifeat,
+        &one,
+        weight,
+        ifeat,
+        input,
+        ifeat,
+        &zero,
+        output,
+        ofeat);
+
+    if (cublas_status != CUBLAS_STATUS_SUCCESS) {
+      printf("GEMM fprop failed with %d\n", cublas_status);
+      return 1;
+    }
+
+    // Call biasReLU
+    const uint &input_size = ofeat;
+    int num_blocks = 0;
+    int num_SMs = at::cuda::getCurrentDeviceProperties()->multiProcessorCount;
+    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks, biasAddRelu_fprop<T>, BIASADDRELU_FPROP_NUM_THREADS, 0);
+    biasAddRelu_fprop<<<num_SMs*num_blocks, BIASADDRELU_FPROP_NUM_THREADS, 0, stream>>>(output, bias, batch_size, input_size);
+
+    // Set current output as next layer input
+    reserved_space_x = reserved_space_y;
+    // Set next layer output
+    reserved_space_y += ofeat * batch_size;
+  }
+
+  return 0;
+}
+
+// Does a simple MLP bprop (GEMM+bias+ReLU).
+// Needs reserved space to come back exactly as it was populated in fprop.
+// Does dgrad and wgrad sequentially.
+template <typename T>
+int mlp_bp(
+    T* X,
+    T* Y,
+    int input_features,
+    int batch_size,
+    T** WPtr,
+    int num_layers,
+    int* output_features,
+    T* dY,
+    T* reserved_space,
+    T* work_space,
+    T* dX,
+    T** dwPtr,
+    T** dbPtr) {
+  T* weight;
+  T *dweight, *dx, *dy, *dbias;
+  T *x, *y;
+
+  // Where the dx of the biasReLU (== dy of gemm) is stored. Can be thrown away
+  // after bp call.
+  T* dy_gemm_base;
+  // Where the dx after GEMM is stored.
+  T* dx_gemm_base;
+  // Where partial reduction results are stored.
+  float* db_scratch;
+  // Semaphores for reduction.
+  int* semaphores;
+
+  partition_mlp_bp_workspace<T>(
+      batch_size,
+      num_layers,
+      output_features,
+      work_space,
+      &dy_gemm_base,
+      &dx_gemm_base,
+      &db_scratch,
+      &semaphores);
+
+  size_t semaphore_size = get_semaphores_size(num_layers, output_features) * sizeof(int);
+
+  // Get cublas handle from Pytorch
+  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
+  // Get the stream from cublas handle to reuse for biasReLU kernel.
+  cudaStream_t stream;
+  cublasGetStream(handle, &stream);
+
+  int* y_offsets = (int*)malloc(num_layers * sizeof(int));
+  get_y_offsets(batch_size, num_layers, output_features, y_offsets);
+
+  for (int layer = num_layers - 1; layer >= 0; layer--) {
+    weight = WPtr[layer];
+    dweight = dwPtr[layer];
+
+    // x is read from reserved space
+    x = (layer == 0) ? X : reserved_space + y_offsets[layer - 1];
+    // dx is written in workspace for all but layer==0
+    dx = (layer == 0) ? dX : dx_gemm_base + y_offsets[layer - 1];
+
+    // y is read from reserved space
+    y = (layer == num_layers - 1) ? Y : reserved_space + y_offsets[layer];
+    // dy is the dx computed by layer+1
+    dy = (layer == num_layers - 1) ? dY : dx_gemm_base + y_offsets[layer];
+    // dy_gemm is written to and read immediately
+    T* dy_gemm = dy_gemm_base + y_offsets[layer];
+
+    dbias = dbPtr[layer];
+    int xfeat = (layer == 0) ? input_features : output_features[layer - 1];
+    int yfeat = output_features[layer];
+
+    float one = 1.f;
+    float zero = 0.f;
+
+    // Call bias ReLU backprop - first implementation, 1 thread per bias element
+    int threadsPerBlock = BIASADDRELU_BPROP_NUM_THREADS;
+    int grid_x, grid_y;
+    get_biasAddRelu_bprop_grid_size(yfeat, threadsPerBlock, batch_size, &grid_x, &grid_y);
+
+    dim3 block(threadsPerBlock);
+
+    cudaMemsetAsync(semaphores, 0, semaphore_size, stream);
+
+    if(yfeat % (ILP * threadsPerBlock) == 0 &&
+       is_aligned(y) &&
+       is_aligned(dy) &&
+       is_aligned(dy_gemm) &&
+       is_aligned(dbias)){
+      dim3 grid(grid_x/ILP, grid_y);
+      biasAddRelu_bprop_aligned<T, 4><<<grid, block, 0, stream>>>(
+          y, dy, yfeat, batch_size, dy_gemm, db_scratch, semaphores, dbias);
+    } else {
+      dim3 grid(grid_x, grid_y);
+      biasAddRelu_bprop<T, 4><<<grid, block, 0, stream>>>(
+          y, dy, yfeat, batch_size, dy_gemm, db_scratch, semaphores, dbias);
+    }
+
+    cublasStatus_t cublas_status;
+    // Call GEMM dgrad
+    cublas_status = mlp_gemm(
+        handle,
+        CUBLAS_OP_N,
+        CUBLAS_OP_N,
+        xfeat,
+        batch_size,
+        yfeat,
+        &one,
+        weight,
+        xfeat,
+        dy_gemm,
+        yfeat,
+        &zero,
+        dx,
+        xfeat);
+
+    if (cublas_status != CUBLAS_STATUS_SUCCESS) {
+      printf("GEMM dgrad failed with %d\n", cublas_status);
+      return 1;
+    }
+
+    // Call GEMM wgrad
+    cublas_status = mlp_gemm(
+        handle,
+        CUBLAS_OP_N,
+        CUBLAS_OP_T,
+        xfeat,
+        yfeat,
+        batch_size,
+        &one,
+        x,
+        xfeat,
+        dy_gemm,
+        yfeat,
+        &zero,
+        dweight,
+        xfeat);
+
+    if (cublas_status != CUBLAS_STATUS_SUCCESS) {
+      printf("GEMM wgrad failed with %d\n", cublas_status);
+      return 1;
+    }
+  }
+
+  return 0;
+}
+
+// Instantiate for floating point types
+template int mlp_fp<float>(
+    float* X,
+    int input_features,
+    int batch_size,
+    float** WPtr,
+    int num_layers,
+    int* output_features,
+    float** BPtr,
+    float* Y,
+    float* reserved_space);
+
+template int mlp_bp<float>(
+    float* X,
+    float* Y,
+    int input_features,
+    int batch_size,
+    float** WPtr,
+    int num_layers,
+    int* output_features,
+    float* dY,
+    float* reserved_space,
+    float* work_space,
+    float* dX,
+    float** dwPtr,
+    float** dbPtr);
+
+template int mlp_fp<at::Half>(
+    at::Half* X,
+    int input_features,
+    int batch_size,
+    at::Half** WPtr,
+    int num_layers,
+    int* output_features,
+    at::Half** BPtr,
+    at::Half* Y,
+    at::Half* reserved_space);
+
+template int mlp_bp<at::Half>(
+    at::Half* X,
+    at::Half* Y,
+    int input_features,
+    int batch_size,
+    at::Half** WPtr,
+    int num_layers,
+    int* output_features,
+    at::Half* dY,
+    at::Half* reserved_space,
+    at::Half* work_space,
+    at::Half* dX,
+    at::Half** dwPtr,
+    at::Half** dbPtr);
+
+template int mlp_fp<double>(
+    double* X,
+    int input_features,
+    int batch_size,
+    double** WPtr,
+    int num_layers,
+    int* output_features,
+    double** BPtr,
+    double* Y,
+    double* reserved_space);
+
+template int mlp_bp<double>(
+    double* X,
+    double* Y,
+    int input_features,
+    int batch_size,
+    double** WPtr,
+    int num_layers,
+    int* output_features,
+    double* dY,
+    double* reserved_space,
+    double* work_space,
+    double* dX,
+    double** dwPtr,
+    double** dbPtr);
+
+template size_t get_mlp_bp_workspace_in_bytes<float>(
+    int batch_size,
+    int num_layers,
+    const int* output_features);
+template size_t get_mlp_bp_workspace_in_bytes<at::Half>(
+    int batch_size,
+    int num_layers,
+    const int* output_features);
+template size_t get_mlp_bp_workspace_in_bytes<double>(
+    int batch_size,
+    int num_layers,
+    const int* output_features);
diff --git a/setup.py b/setup.py
index b64f999f2..7cc2a5d80 100644
--- a/setup.py
+++ b/setup.py
@@ -138,6 +138,13 @@ def check_cuda_torch_binary_vs_bare_metal(cuda_dir):
                                                        '-O3',
                                                        '--use_fast_math'] + version_dependent_macros}))
 
+    ext_modules.append(
+        CUDAExtension(name='mlp_cuda',
+                      sources=['csrc/mlp.cpp',
+                               'csrc/mlp_cuda.cu'],
+                      extra_compile_args={'cxx': ['-O3'] + version_dependent_macros,
+                                          'nvcc':['-O3'] + version_dependent_macros}))
+
 if "--bnp" in sys.argv:
     from torch.utils.cpp_extension import CUDAExtension
     sys.argv.remove("--bnp")
diff --git a/tests/L0/run_mlp/test_mlp.py b/tests/L0/run_mlp/test_mlp.py
new file mode 100644
index 000000000..4cf41beca
--- /dev/null
+++ b/tests/L0/run_mlp/test_mlp.py
@@ -0,0 +1,108 @@
+"""Tests for c++ MLP"""
+import unittest
+from time import time
+import numpy as np
+
+import torch
+from torch import nn
+
+from apex.mlp import MLP
+
+batch_size = 1024
+mlp_sizes = [480, 1024, 1024, 512, 256, 1]
+num_iters = 10
+
+class TestMLP(unittest.TestCase):
+
+    def test_creation(self):
+        MLP(mlp_sizes)
+
+    def test_numeric(self):
+        mlp = MLP(mlp_sizes).cuda()
+
+        mlp_layers = []
+        for i in range(mlp.num_layers):
+            linear = nn.Linear(mlp_sizes[i], mlp_sizes[i + 1])
+            mlp.weights[i].data.copy_(linear.weight)
+            mlp.biases[i].data.copy_(linear.bias)
+            mlp_layers.append(linear)
+            mlp_layers.append(nn.ReLU(inplace=True))
+
+        ref_mlp = nn.Sequential(*mlp_layers).cuda()
+
+        test_input = torch.empty(batch_size, mlp_sizes[0], device="cuda").uniform_(-1., 1.).requires_grad_()
+        ref_input = test_input.clone().detach().requires_grad_()
+        mlp_out = mlp(test_input)
+        ref_out = ref_mlp(ref_input)
+        np.testing.assert_allclose(
+            mlp_out.detach().cpu().numpy(),
+            ref_out.detach().cpu().numpy(),
+            atol=1e-7, rtol=1e-5)
+
+        # Use the mean as a scalar loss. Multiply by 10 so the gradients are not vanishingly small.
+        mlp_out.mean().mul(10.).backward()
+        ref_out.mean().mul(10.).backward()
+        np.testing.assert_allclose(
+            test_input.grad.detach().cpu().numpy(),
+            ref_input.grad.detach().cpu().numpy(),
+            atol=0, rtol=1e-5)
+        np.testing.assert_allclose(
+            mlp.biases[0].grad.detach().cpu().numpy(),
+            ref_mlp[0].bias.grad.detach().cpu().numpy(),
+            atol=1e-7, rtol=1e-5)
+
+    def test_performance_half(self):
+        mlp = MLP(mlp_sizes).cuda().half()
+
+        mlp_layers = []
+        for i in range(mlp.num_layers):
+            linear = nn.Linear(mlp_sizes[i], mlp_sizes[i + 1])
+            mlp.weights[i].data.copy_(linear.weight)
+            mlp.biases[i].data.copy_(linear.bias)
+            mlp_layers.append(linear)
+            mlp_layers.append(nn.ReLU(inplace=True))
+
+        ref_mlp = nn.Sequential(*mlp_layers).cuda().half()
+
+        test_input = torch.empty(
+            batch_size, mlp_sizes[0], device="cuda", dtype=torch.half).fill_(10.).requires_grad_()
+        ref_input = torch.empty(
+            batch_size, mlp_sizes[0], device="cuda", dtype=torch.half).fill_(10.).requires_grad_()
+
+        # Warm up GPU
+        for _ in range(100):
+            ref_out = ref_mlp(ref_input)
+            ref_loss = ref_out.mean()
+            ref_mlp.zero_grad()
+            ref_loss.backward()
+            mlp_out = mlp(test_input)
+            test_loss = mlp_out.mean()
+            mlp.zero_grad()
+            test_loss.backward()
+
+        torch.cuda.profiler.start()
+        torch.cuda.synchronize()
+        start_time = time()
+        for _ in range(num_iters):
+            ref_out = ref_mlp(ref_input)
+            ref_loss = ref_out.mean()
+            ref_mlp.zero_grad()
+            ref_loss.backward()
+        torch.cuda.synchronize()
+        stop_time = time()
+        print(F"\nPytorch MLP time {(stop_time - start_time) * 1000. / num_iters:.4f} ms")
+
+        torch.cuda.synchronize()
+        start_time = time()
+        for _ in range(num_iters):
+            mlp_out = mlp(test_input)
+            test_loss = mlp_out.mean()
+            mlp.zero_grad()
+            test_loss.backward()
+        torch.cuda.synchronize()
+        stop_time = time()
+        print(F"C++ MLP time {(stop_time - start_time) * 1000. / num_iters:.4f} ms")
/ num_iters:.4f} ms") + torch.cuda.profiler.stop() + +if __name__ == '__main__': + unittest.main() diff --git a/tests/L0/run_test.py b/tests/L0/run_test.py index 6d3b96187..8a4135d5f 100644 --- a/tests/L0/run_test.py +++ b/tests/L0/run_test.py @@ -1,7 +1,7 @@ import unittest import sys -test_dirs = ["run_amp", "run_fp16util", "run_optimizers", "run_fused_layer_norm", "run_pyprof_nvtx", "run_pyprof_data"] +test_dirs = ["run_amp", "run_fp16util", "run_optimizers", "run_fused_layer_norm", "run_pyprof_nvtx", "run_pyprof_data", "run_mlp"] runner = unittest.TextTestRunner(verbosity=2)