Skip to content

Commit

Permalink
fix max grid dimension for join kernel
Browse files Browse the repository at this point in the history
  • Loading branch information
syurkevi authored and pavanky committed Aug 29, 2017
1 parent 4933315 commit 4fa5f48
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 11 deletions.
25 changes: 16 additions & 9 deletions src/backend/cuda/kernel/join.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include <Param.hpp>
#include <err_cuda.hpp>
#include <debug_cuda.hpp>
#include <iostream>

namespace cuda
{
Expand All @@ -29,21 +30,20 @@ namespace cuda
const int o0, const int o1, const int o2, const int o3,
const int blocksPerMatX, const int blocksPerMatY)
{
const int iz = blockIdx.x / blocksPerMatX;
const int iw = blockIdx.y / blocksPerMatY;
const int incy = blocksPerMatY * blockDim.y;
const int incx = blocksPerMatX * blockDim.x;

const int iz = blockIdx.x / blocksPerMatX;
const int blockIdx_x = blockIdx.x - iz * blocksPerMatX;
const int blockIdx_y = blockIdx.y - iw * blocksPerMatY;

const int xx = threadIdx.x + blockIdx_x * blockDim.x;
const int yy = threadIdx.y + blockIdx_y * blockDim.y;

const int incy = blocksPerMatY * blockDim.y;
const int incx = blocksPerMatX * blockDim.x;

To *d_out = out.ptr;
To *d_out = out.ptr;
Ti const *d_in = in.ptr;

const int iw = (blockIdx.y + (blockIdx.z * gridDim.y)) / blocksPerMatY;
const int blockIdx_y = (blockIdx.y + (blockIdx.z * gridDim.y)) - iw * blocksPerMatY;
const int yy = threadIdx.y + blockIdx_y * blockDim.y;

if(iz < in.dims[2] && iw < in.dims[3]) {
d_out = d_out + (iz + o2) * out.strides[2] + (iw + o3) * out.strides[3];
d_in = d_in + iz * in.strides[2] + iw * in.strides[3];
Expand All @@ -69,10 +69,17 @@ namespace cuda

int blocksPerMatX = divup(X.dims[0], TILEX);
int blocksPerMatY = divup(X.dims[1], TILEY);

dim3 blocks(blocksPerMatX * X.dims[2],
blocksPerMatY * X.dims[3],
1);

const int maxBlocksY = cuda::getDeviceProp(cuda::getActiveDeviceId()).maxGridSize[1];
const int blocksPerMatZ = divup(blocks.y, maxBlocksY);
if(blocksPerMatZ > 1) {
blocks.y = maxBlocksY;
blocks.z = blocksPerMatZ;
}
CUDA_LAUNCH((join_kernel<To, Tx, dim>), blocks, threads,
out, X, offset[0], offset[1], offset[2], offset[3],
blocksPerMatX, blocksPerMatY);
Expand Down
28 changes: 26 additions & 2 deletions test/join.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,32 @@ void joinTest(string pTestFile, const unsigned dim, const unsigned in0, const un
JOIN_INIT(JoinSmall1, join_small, 1, 0, 2, 1);
JOIN_INIT(JoinSmall2, join_small, 2, 0, 3, 2);

TEST(Join, JoinLargeDim)
{
//const int nx = 32;
const int nx = 1;
const int ny = 4 * 1024 * 1024;
const int nw = 4 * 1024 * 1024;

af::deviceGC();
{
af::array in = af::randu(nx, ny, u8);
af::array joined = af::join(0, in, in);
af::dim4 in_dims = in.dims();
af::dim4 joined_dims = joined.dims();

ASSERT_EQ(2*in_dims[0], joined_dims[0]);
//todo: uncomment as assert
//printf("%f\n", af::sum<float>((joined(0, af::span) - joined(1, af::span)).as(f32)));

af::array in2 = af::constant(1, (dim_t)nx, (dim_t)ny, (dim_t)2, (dim_t)nw, u8);
joined = af::join(3, in, in);
in_dims = in.dims();
joined_dims = joined.dims();
ASSERT_EQ(2*in_dims[3], joined_dims[3]);
}
}

///////////////////////////////// CPP ////////////////////////////////////
//
TEST(Join, CPP)
Expand Down Expand Up @@ -161,7 +187,6 @@ TEST(JoinMany0, CPP)
af::array output = af::join(0, a0, a1, a2);
af::array gold = af::join(0, a0, af::join(0, a1, a2));


ASSERT_EQ(af::sum<float>(output - gold), 0);
}

Expand All @@ -177,6 +202,5 @@ TEST(JoinMany1, CPP)
int dim = 1;
af::array output = af::join(dim, a0, a1, a2, a3);
af::array gold = af::join(dim, a0, af::join(dim, a1, af::join(dim, a2, a3)));

ASSERT_EQ(af::sum<float>(output - gold), 0);
}

0 comments on commit 4fa5f48

Please sign in to comment.