fix max grid dimension for join kernel

jgqwhucs · Aug 29, 2017 · 4fa5f48 · 4fa5f48
1 parent 4933315
commit 4fa5f48
Show file tree

Hide file tree

Showing 2 changed files with 42 additions and 11 deletions.
diff --git a/src/backend/cuda/kernel/join.hpp b/src/backend/cuda/kernel/join.hpp
@@ -12,6 +12,7 @@
 #include <Param.hpp>
 #include <err_cuda.hpp>
 #include <debug_cuda.hpp>
+#include <iostream>
 
 namespace cuda
 {
@@ -29,21 +30,20 @@ namespace cuda
                          const int o0, const int o1, const int o2, const int o3,
                          const int blocksPerMatX, const int blocksPerMatY)
         {
-            const int iz = blockIdx.x / blocksPerMatX;
-            const int iw = blockIdx.y / blocksPerMatY;
+            const int incy = blocksPerMatY * blockDim.y;
+            const int incx = blocksPerMatX * blockDim.x;
 
+            const int iz = blockIdx.x / blocksPerMatX;
             const int blockIdx_x = blockIdx.x - iz * blocksPerMatX;
-            const int blockIdx_y = blockIdx.y - iw * blocksPerMatY;
-
             const int xx = threadIdx.x + blockIdx_x * blockDim.x;
-            const int yy = threadIdx.y + blockIdx_y * blockDim.y;
 
-            const int incy = blocksPerMatY * blockDim.y;
-            const int incx = blocksPerMatX * blockDim.x;
-
-            To *d_out = out.ptr;
+            To *d_out      = out.ptr;
             Ti const *d_in = in.ptr;
 
+            const int iw = (blockIdx.y + (blockIdx.z * gridDim.y)) / blocksPerMatY;
+            const int blockIdx_y = (blockIdx.y + (blockIdx.z * gridDim.y)) - iw * blocksPerMatY;
+            const int yy = threadIdx.y + blockIdx_y * blockDim.y;
+
             if(iz < in.dims[2] && iw < in.dims[3]) {
                 d_out = d_out + (iz + o2) * out.strides[2] + (iw + o3) * out.strides[3];
                 d_in  = d_in  + iz * in.strides[2] + iw * in.strides[3];
@@ -69,10 +69,17 @@ namespace cuda
 
             int blocksPerMatX = divup(X.dims[0], TILEX);
             int blocksPerMatY = divup(X.dims[1], TILEY);
+
             dim3 blocks(blocksPerMatX * X.dims[2],
                         blocksPerMatY * X.dims[3],
                         1);
 
+            const int maxBlocksY   = cuda::getDeviceProp(cuda::getActiveDeviceId()).maxGridSize[1];
+            const int blocksPerMatZ = divup(blocks.y, maxBlocksY);
+            if(blocksPerMatZ > 1) {
+                blocks.y = maxBlocksY;
+                blocks.z = blocksPerMatZ;
+            }
             CUDA_LAUNCH((join_kernel<To, Tx, dim>), blocks, threads,
                        out, X, offset[0], offset[1], offset[2], offset[3],
                        blocksPerMatX, blocksPerMatY);

diff --git a/test/join.cpp b/test/join.cpp
@@ -114,6 +114,32 @@ void joinTest(string pTestFile, const unsigned dim, const unsigned in0, const un
     JOIN_INIT(JoinSmall1, join_small, 1, 0, 2, 1);
     JOIN_INIT(JoinSmall2, join_small, 2, 0, 3, 2);
 
+TEST(Join, JoinLargeDim)
+{
+    //const int nx = 32;
+    const int nx = 1;
+    const int ny = 4 * 1024 * 1024;
+    const int nw = 4 * 1024 * 1024;
+
+    af::deviceGC();
+    {
+        af::array in = af::randu(nx, ny, u8);
+        af::array joined = af::join(0, in, in);
+        af::dim4 in_dims = in.dims();
+        af::dim4 joined_dims = joined.dims();
+
+        ASSERT_EQ(2*in_dims[0], joined_dims[0]);
+        //todo: uncomment as assert
+        //printf("%f\n", af::sum<float>((joined(0, af::span) - joined(1, af::span)).as(f32)));
+
+        af::array in2 = af::constant(1, (dim_t)nx, (dim_t)ny, (dim_t)2, (dim_t)nw, u8);
+        joined = af::join(3, in, in);
+        in_dims = in.dims();
+        joined_dims = joined.dims();
+        ASSERT_EQ(2*in_dims[3], joined_dims[3]);
+    }
+}
+
 ///////////////////////////////// CPP ////////////////////////////////////
 //
 TEST(Join, CPP)
@@ -161,7 +187,6 @@ TEST(JoinMany0, CPP)
     af::array output = af::join(0, a0, a1, a2);
     af::array gold = af::join(0, a0, af::join(0, a1, a2));
 
-
     ASSERT_EQ(af::sum<float>(output - gold), 0);
 }
 
@@ -177,6 +202,5 @@ TEST(JoinMany1, CPP)
     int dim = 1;
     af::array output = af::join(dim, a0, a1, a2, a3);
     af::array gold = af::join(dim, a0, af::join(dim, a1, af::join(dim, a2, a3)));
-
     ASSERT_EQ(af::sum<float>(output - gold), 0);
 }