fix max grid limitation for scan kernel

jgqwhucs · Aug 29, 2017 · df95ec6 · df95ec6
1 parent 7315c2b
commit df95ec6
Show file tree

Hide file tree

Showing 3 changed files with 71 additions and 8 deletions.
diff --git a/src/backend/cuda/kernel/scan_dim.hpp b/src/backend/cuda/kernel/scan_dim.hpp
@@ -37,9 +37,9 @@ namespace kernel
         const int tid  = tidy * THREADS_X + tidx;
 
         const int zid = blockIdx.x / blocks_x;
-        const int wid = blockIdx.y / blocks_y;
+        const int wid = (blockIdx.y + blockIdx.z * gridDim.y) / blocks_y;
         const int blockIdx_x = blockIdx.x - (blocks_x) * zid;
-        const int blockIdx_y = blockIdx.y - (blocks_y) * wid;
+        const int blockIdx_y = (blockIdx.y + blockIdx.z * gridDim.y) - (blocks_y) * wid;
         const int xid = blockIdx_x * blockDim.x + tidx;
         const int yid = blockIdx_y; // yid  of output. updated for input later.
 
@@ -141,9 +141,9 @@ namespace kernel
         const int tidy = threadIdx.y;
 
         const int zid = blockIdx.x / blocks_x;
-        const int wid = blockIdx.y / blocks_y;
+        const int wid = (blockIdx.y + blockIdx.z * gridDim.y) / blocks_y;
         const int blockIdx_x = blockIdx.x - (blocks_x) * zid;
-        const int blockIdx_y = blockIdx.y - (blocks_y) * wid;
+        const int blockIdx_y = (blockIdx.y + blockIdx.z * gridDim.y) - (blocks_y) * wid;
         const int xid = blockIdx_x * blockDim.x + tidx;
         const int yid = blockIdx_y; // yid  of output. updated for input later.
 
@@ -198,6 +198,10 @@ namespace kernel
         dim3 blocks(blocks_all[0] * blocks_all[2],
                     blocks_all[1] * blocks_all[3]);
 
+        const int maxBlocksY = cuda::getDeviceProp(cuda::getActiveDeviceId()).maxGridSize[1];
+        blocks.z = divup(blocks.y, maxBlocksY);
+        blocks.y = divup(blocks.y, blocks.z);
+
         uint lim = divup(out.dims[dim], (threads_y * blocks_all[dim]));
 
         switch (threads_y) {
@@ -232,6 +236,10 @@ namespace kernel
         dim3 blocks(blocks_all[0] * blocks_all[2],
                     blocks_all[1] * blocks_all[3]);
 
+        const int maxBlocksY = cuda::getDeviceProp(cuda::getActiveDeviceId()).maxGridSize[1];
+        blocks.z = divup(blocks.y, maxBlocksY);
+        blocks.y = divup(blocks.y, blocks.z);
+
         uint lim = divup(out.dims[dim], (threads_y * blocks_all[dim]));
 
         CUDA_LAUNCH((bcast_dim_kernel<To, op, dim>), blocks, threads,

diff --git a/src/backend/cuda/kernel/scan_first.hpp b/src/backend/cuda/kernel/scan_first.hpp
@@ -34,9 +34,9 @@ namespace kernel
         const int tidy = threadIdx.y;
 
         const int zid = blockIdx.x / blocks_x;
-        const int wid = blockIdx.y / blocks_y;
+        const int wid = (blockIdx.y + blockIdx.z * gridDim.y) / blocks_y;
         const int blockIdx_x = blockIdx.x - (blocks_x) * zid;
-        const int blockIdx_y = blockIdx.y - (blocks_y) * wid;
+        const int blockIdx_y = (blockIdx.y + blockIdx.z * gridDim.y) - (blocks_y) * wid;
         const int xid = blockIdx_x * blockDim.x * lim + tidx;
         const int yid = blockIdx_y * blockDim.y + tidy;
 
@@ -125,9 +125,9 @@ namespace kernel
         const int tidy = threadIdx.y;
 
         const int zid = blockIdx.x / blocks_x;
-        const int wid = blockIdx.y / blocks_y;
+        const int wid = (blockIdx.y + blockIdx.z * gridDim.y) / blocks_y;
         const int blockIdx_x = blockIdx.x - (blocks_x) * zid;
-        const int blockIdx_y = blockIdx.y - (blocks_y) * wid;
+        const int blockIdx_y = (blockIdx.y + blockIdx.z * gridDim.y) - (blocks_y) * wid;
         const int xid = blockIdx_x * blockDim.x * lim + tidx;
         const int yid = blockIdx_y * blockDim.y + tidy;
 
@@ -167,6 +167,10 @@ namespace kernel
         dim3 blocks(blocks_x * out.dims[2],
                     blocks_y * out.dims[3]);
 
+        const int maxBlocksY = cuda::getDeviceProp(cuda::getActiveDeviceId()).maxGridSize[1];
+        blocks.z = divup(blocks.y, maxBlocksY);
+        blocks.y = divup(blocks.y, blocks.z);
+
         uint lim = divup(out.dims[0], (threads_x * blocks_x));
 
         switch (threads_x) {
@@ -201,6 +205,10 @@ namespace kernel
         dim3 blocks(blocks_x * out.dims[2],
                     blocks_y * out.dims[3]);
 
+        const int maxBlocksY = cuda::getDeviceProp(cuda::getActiveDeviceId()).maxGridSize[1];
+        blocks.z = divup(blocks.y, maxBlocksY);
+        blocks.y = divup(blocks.y, blocks.z);
+
         uint lim = divup(out.dims[0], (threads_x * blocks_x));
 
         CUDA_LAUNCH((bcast_first_kernel<To, op>), blocks, threads, out, tmp, blocks_x, blocks_y, lim);

diff --git a/test/scan.cpp b/test/scan.cpp
@@ -158,3 +158,50 @@ TEST(Accum, CPP)
         delete[] outData;
     }
 }
+
+TEST(Accum, MaxDim)
+{
+    const size_t largeDim = 65535 * 32 + 1;
+
+    //first dimension kernel tests
+    af::array input = af::constant(0, 2, largeDim, 2, 2);
+    input(af::span, af::seq(0, 9999), af::span, af::span) = 1;
+
+    af::array gold_first = af::constant(0, 2, largeDim, 2, 2);
+    gold_first(af::span, af::seq(0, 9999), af::span, af::span) = af::range(2, 10000, 2, 2) + 1;
+
+    af::array output_first = af::accum(input, 0);
+    ASSERT_TRUE(af::allTrue<bool>(output_first == gold_first));
+
+
+    input = af::constant(0, 2, 2, 2, largeDim);
+    input(af::span, af::span, af::span, af::seq(0, 9999)) = 1;
+
+    gold_first = af::constant(0, 2, 2, 2, largeDim);
+    gold_first(af::span, af::span, af::span, af::seq(0, 9999)) = af::range(2, 2, 2, 10000) + 1;
+
+    output_first = af::accum(input, 0);
+    ASSERT_TRUE(af::allTrue<bool>(output_first == gold_first));
+
+
+    //other dimension kernel tests
+    input = af::constant(0, 2, largeDim, 2, 2);
+    input(af::span, af::seq(0, 9999), af::span, af::span) = 1;
+
+    af::array gold_dim = af::constant(10000, 2, largeDim, 2, 2);
+    gold_dim(af::span, af::seq(0, 9999), af::span, af::span) = af::range(af::dim4(2, 10000, 2, 2), 1) + 1;
+
+    af::array output_dim = af::accum(input, 1);
+    ASSERT_TRUE(af::allTrue<bool>(output_dim == gold_dim));
+
+
+    input = af::constant(0, 2, 2, 2, largeDim);
+    input(af::span, af::span, af::span, af::seq(0, 9999)) = 1;
+
+    gold_dim = af::constant(0, 2, 2, 2, largeDim);
+    gold_dim(af::span, af::span, af::span, af::seq(0, 9999)) = af::range(af::dim4(2, 2, 2, 10000), 1) + 1;
+
+    output_dim = af::accum(input, 1);
+    ASSERT_TRUE(af::allTrue<bool>(output_dim == gold_dim));
+
+}