Skip to content

Commit

Permalink
fix max grid limitation for scan kernel
Browse files Browse the repository at this point in the history
  • Loading branch information
syurkevi authored and pavanky committed Aug 29, 2017
1 parent 7315c2b commit df95ec6
Show file tree
Hide file tree
Showing 3 changed files with 71 additions and 8 deletions.
16 changes: 12 additions & 4 deletions src/backend/cuda/kernel/scan_dim.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,9 @@ namespace kernel
const int tid = tidy * THREADS_X + tidx;

const int zid = blockIdx.x / blocks_x;
const int wid = blockIdx.y / blocks_y;
const int wid = (blockIdx.y + blockIdx.z * gridDim.y) / blocks_y;
const int blockIdx_x = blockIdx.x - (blocks_x) * zid;
const int blockIdx_y = blockIdx.y - (blocks_y) * wid;
const int blockIdx_y = (blockIdx.y + blockIdx.z * gridDim.y) - (blocks_y) * wid;
const int xid = blockIdx_x * blockDim.x + tidx;
const int yid = blockIdx_y; // yid of output. updated for input later.

Expand Down Expand Up @@ -141,9 +141,9 @@ namespace kernel
const int tidy = threadIdx.y;

const int zid = blockIdx.x / blocks_x;
const int wid = blockIdx.y / blocks_y;
const int wid = (blockIdx.y + blockIdx.z * gridDim.y) / blocks_y;
const int blockIdx_x = blockIdx.x - (blocks_x) * zid;
const int blockIdx_y = blockIdx.y - (blocks_y) * wid;
const int blockIdx_y = (blockIdx.y + blockIdx.z * gridDim.y) - (blocks_y) * wid;
const int xid = blockIdx_x * blockDim.x + tidx;
const int yid = blockIdx_y; // yid of output. updated for input later.

Expand Down Expand Up @@ -198,6 +198,10 @@ namespace kernel
dim3 blocks(blocks_all[0] * blocks_all[2],
blocks_all[1] * blocks_all[3]);

const int maxBlocksY = cuda::getDeviceProp(cuda::getActiveDeviceId()).maxGridSize[1];
blocks.z = divup(blocks.y, maxBlocksY);
blocks.y = divup(blocks.y, blocks.z);

uint lim = divup(out.dims[dim], (threads_y * blocks_all[dim]));

switch (threads_y) {
Expand Down Expand Up @@ -232,6 +236,10 @@ namespace kernel
dim3 blocks(blocks_all[0] * blocks_all[2],
blocks_all[1] * blocks_all[3]);

const int maxBlocksY = cuda::getDeviceProp(cuda::getActiveDeviceId()).maxGridSize[1];
blocks.z = divup(blocks.y, maxBlocksY);
blocks.y = divup(blocks.y, blocks.z);

uint lim = divup(out.dims[dim], (threads_y * blocks_all[dim]));

CUDA_LAUNCH((bcast_dim_kernel<To, op, dim>), blocks, threads,
Expand Down
16 changes: 12 additions & 4 deletions src/backend/cuda/kernel/scan_first.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,9 @@ namespace kernel
const int tidy = threadIdx.y;

const int zid = blockIdx.x / blocks_x;
const int wid = blockIdx.y / blocks_y;
const int wid = (blockIdx.y + blockIdx.z * gridDim.y) / blocks_y;
const int blockIdx_x = blockIdx.x - (blocks_x) * zid;
const int blockIdx_y = blockIdx.y - (blocks_y) * wid;
const int blockIdx_y = (blockIdx.y + blockIdx.z * gridDim.y) - (blocks_y) * wid;
const int xid = blockIdx_x * blockDim.x * lim + tidx;
const int yid = blockIdx_y * blockDim.y + tidy;

Expand Down Expand Up @@ -125,9 +125,9 @@ namespace kernel
const int tidy = threadIdx.y;

const int zid = blockIdx.x / blocks_x;
const int wid = blockIdx.y / blocks_y;
const int wid = (blockIdx.y + blockIdx.z * gridDim.y) / blocks_y;
const int blockIdx_x = blockIdx.x - (blocks_x) * zid;
const int blockIdx_y = blockIdx.y - (blocks_y) * wid;
const int blockIdx_y = (blockIdx.y + blockIdx.z * gridDim.y) - (blocks_y) * wid;
const int xid = blockIdx_x * blockDim.x * lim + tidx;
const int yid = blockIdx_y * blockDim.y + tidy;

Expand Down Expand Up @@ -167,6 +167,10 @@ namespace kernel
dim3 blocks(blocks_x * out.dims[2],
blocks_y * out.dims[3]);

const int maxBlocksY = cuda::getDeviceProp(cuda::getActiveDeviceId()).maxGridSize[1];
blocks.z = divup(blocks.y, maxBlocksY);
blocks.y = divup(blocks.y, blocks.z);

uint lim = divup(out.dims[0], (threads_x * blocks_x));

switch (threads_x) {
Expand Down Expand Up @@ -201,6 +205,10 @@ namespace kernel
dim3 blocks(blocks_x * out.dims[2],
blocks_y * out.dims[3]);

const int maxBlocksY = cuda::getDeviceProp(cuda::getActiveDeviceId()).maxGridSize[1];
blocks.z = divup(blocks.y, maxBlocksY);
blocks.y = divup(blocks.y, blocks.z);

uint lim = divup(out.dims[0], (threads_x * blocks_x));

CUDA_LAUNCH((bcast_first_kernel<To, op>), blocks, threads, out, tmp, blocks_x, blocks_y, lim);
Expand Down
47 changes: 47 additions & 0 deletions test/scan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -158,3 +158,50 @@ TEST(Accum, CPP)
delete[] outData;
}
}

TEST(Accum, MaxDim)
{
const size_t largeDim = 65535 * 32 + 1;

//first dimension kernel tests
af::array input = af::constant(0, 2, largeDim, 2, 2);
input(af::span, af::seq(0, 9999), af::span, af::span) = 1;

af::array gold_first = af::constant(0, 2, largeDim, 2, 2);
gold_first(af::span, af::seq(0, 9999), af::span, af::span) = af::range(2, 10000, 2, 2) + 1;

af::array output_first = af::accum(input, 0);
ASSERT_TRUE(af::allTrue<bool>(output_first == gold_first));


input = af::constant(0, 2, 2, 2, largeDim);
input(af::span, af::span, af::span, af::seq(0, 9999)) = 1;

gold_first = af::constant(0, 2, 2, 2, largeDim);
gold_first(af::span, af::span, af::span, af::seq(0, 9999)) = af::range(2, 2, 2, 10000) + 1;

output_first = af::accum(input, 0);
ASSERT_TRUE(af::allTrue<bool>(output_first == gold_first));


//other dimension kernel tests
input = af::constant(0, 2, largeDim, 2, 2);
input(af::span, af::seq(0, 9999), af::span, af::span) = 1;

af::array gold_dim = af::constant(10000, 2, largeDim, 2, 2);
gold_dim(af::span, af::seq(0, 9999), af::span, af::span) = af::range(af::dim4(2, 10000, 2, 2), 1) + 1;

af::array output_dim = af::accum(input, 1);
ASSERT_TRUE(af::allTrue<bool>(output_dim == gold_dim));


input = af::constant(0, 2, 2, 2, largeDim);
input(af::span, af::span, af::span, af::seq(0, 9999)) = 1;

gold_dim = af::constant(0, 2, 2, 2, largeDim);
gold_dim(af::span, af::span, af::span, af::seq(0, 9999)) = af::range(af::dim4(2, 2, 2, 10000), 1) + 1;

output_dim = af::accum(input, 1);
ASSERT_TRUE(af::allTrue<bool>(output_dim == gold_dim));

}

0 comments on commit df95ec6

Please sign in to comment.