Skip to content

Commit

Permalink
Merge pull request opencv#3002 from vbystricky:oclopt_pyrdown
Browse files Browse the repository at this point in the history
  • Loading branch information
alalek committed Jul 31, 2014
2 parents 17a6b8c + e49d148 commit f8aecb2
Show file tree
Hide file tree
Showing 2 changed files with 108 additions and 119 deletions.
225 changes: 107 additions & 118 deletions modules/imgproc/src/opencl/pyr_down.cl
Original file line number Diff line number Diff line change
Expand Up @@ -89,154 +89,125 @@
#define MAD(x,y,z) mad((x),(y),(z))
#endif

#define LOAD_LOCAL(col_gl, col_lcl) \
sum0 = co3* SRC(col_gl, EXTRAPOLATE_(src_y - 2, src_rows)); \
sum0 = MAD(co2, SRC(col_gl, EXTRAPOLATE_(src_y - 1, src_rows)), sum0); \
temp = SRC(col_gl, EXTRAPOLATE_(src_y, src_rows)); \
sum0 = MAD(co1, temp, sum0); \
sum1 = co3 * temp; \
temp = SRC(col_gl, EXTRAPOLATE_(src_y + 1, src_rows)); \
sum0 = MAD(co2, temp, sum0); \
sum1 = MAD(co2, temp, sum1); \
temp = SRC(col_gl, EXTRAPOLATE_(src_y + 2, src_rows)); \
sum0 = MAD(co3, temp, sum0); \
sum1 = MAD(co1, temp, sum1); \
smem[0][col_lcl] = sum0; \
sum1 = MAD(co2, SRC(col_gl, EXTRAPOLATE_(src_y + 3, src_rows)), sum1); \
sum1 = MAD(co3, SRC(col_gl, EXTRAPOLATE_(src_y + 4, src_rows)), sum1); \
smem[1][col_lcl] = sum1;


#if kercn == 4
#define LOAD_LOCAL4(col_gl, col_lcl) \
sum40 = co3* SRC4(col_gl, EXTRAPOLATE_(src_y - 2, src_rows)); \
sum40 = MAD(co2, SRC4(col_gl, EXTRAPOLATE_(src_y - 1, src_rows)), sum40); \
temp4 = SRC4(col_gl, EXTRAPOLATE_(src_y, src_rows)); \
sum40 = MAD(co1, temp4, sum40); \
sum41 = co3 * temp4; \
temp4 = SRC4(col_gl, EXTRAPOLATE_(src_y + 1, src_rows)); \
sum40 = MAD(co2, temp4, sum40); \
sum41 = MAD(co2, temp4, sum41); \
temp4 = SRC4(col_gl, EXTRAPOLATE_(src_y + 2, src_rows)); \
sum40 = MAD(co3, temp4, sum40); \
sum41 = MAD(co1, temp4, sum41); \
vstore4(sum40, col_lcl, (__local float*) &smem[0][2]); \
sum41 = MAD(co2, SRC4(col_gl, EXTRAPOLATE_(src_y + 3, src_rows)), sum41); \
sum41 = MAD(co3, SRC4(col_gl, EXTRAPOLATE_(src_y + 4, src_rows)), sum41); \
vstore4(sum41, col_lcl, (__local float*) &smem[1][2]);
#endif

#define noconvert

__kernel void pyrDown(__global const uchar * src, int src_step, int src_offset, int src_rows, int src_cols,
__global uchar * dst, int dst_step, int dst_offset, int dst_rows, int dst_cols)
{
const int x = get_global_id(0)*kercn;
const int y = get_group_id(1);
const int y = 2*get_global_id(1);

__local FT smem[LOCAL_SIZE + 4];
__local FT smem[2][LOCAL_SIZE + 4];
__global uchar * dstData = dst + dst_offset;
__global const uchar * srcData = src + src_offset;

FT sum;
FT sum0, sum1, temp;
FT co1 = 0.375f;
FT co2 = 0.25f;
FT co3 = 0.0625f;

const int src_y = 2*y;
int col;

if (src_y >= 2 && src_y < src_rows - 2)
if (src_y >= 2 && src_y < src_rows - 4)
{
#define EXTRAPOLATE_(val, maxVal) val
#if kercn == 1
col = EXTRAPOLATE(x, src_cols);

sum = co3* SRC(col, src_y - 2);
sum = MAD(co2, SRC(col, src_y - 1), sum);
sum = MAD(co1, SRC(col, src_y ), sum);
sum = MAD(co2, SRC(col, src_y + 1), sum);
sum = MAD(co3, SRC(col, src_y + 2), sum);

smem[2 + get_local_id(0)] = sum;
LOAD_LOCAL(col, 2 + get_local_id(0))
#else
if (x < src_cols-4)
{
float4 sum4;
sum4 = co3* SRC4(x, src_y - 2);
sum4 = MAD(co2, SRC4(x, src_y - 1), sum4);
sum4 = MAD(co1, SRC4(x, src_y ), sum4);
sum4 = MAD(co2, SRC4(x, src_y + 1), sum4);
sum4 = MAD(co3, SRC4(x, src_y + 2), sum4);

vstore4(sum4, get_local_id(0), (__local float*) &smem[2]);
float4 sum40, sum41, temp4;
LOAD_LOCAL4(x, get_local_id(0))
}
else
{
for (int i=0; i<4; i++)
{
col = EXTRAPOLATE(x+i, src_cols);
sum = co3* SRC(col, src_y - 2);
sum = MAD(co2, SRC(col, src_y - 1), sum);
sum = MAD(co1, SRC(col, src_y ), sum);
sum = MAD(co2, SRC(col, src_y + 1), sum);
sum = MAD(co3, SRC(col, src_y + 2), sum);

smem[2 + 4*get_local_id(0)+i] = sum;
LOAD_LOCAL(col, 2 + 4 * get_local_id(0) + i)
}
}
#endif
if (get_local_id(0) < 2)
{
col = EXTRAPOLATE((int)(get_group_id(0)*LOCAL_SIZE + get_local_id(0) - 2), src_cols);

sum = co3* SRC(col, src_y - 2);
sum = MAD(co2, SRC(col, src_y - 1), sum);
sum = MAD(co1, SRC(col, src_y ), sum);
sum = MAD(co2, SRC(col, src_y + 1), sum);
sum = MAD(co3, SRC(col, src_y + 2), sum);

smem[get_local_id(0)] = sum;
LOAD_LOCAL(col, get_local_id(0))
}

if (get_local_id(0) > 1 && get_local_id(0) < 4)
else if (get_local_id(0) < 4)
{
col = EXTRAPOLATE((int)((get_group_id(0)+1)*LOCAL_SIZE + get_local_id(0) - 2), src_cols);

sum = co3* SRC(col, src_y - 2);
sum = MAD(co2, SRC(col, src_y - 1), sum);
sum = MAD(co1, SRC(col, src_y ), sum);
sum = MAD(co2, SRC(col, src_y + 1), sum);
sum = MAD(co3, SRC(col, src_y + 2), sum);

smem[LOCAL_SIZE + get_local_id(0)] = sum;
LOAD_LOCAL(col, LOCAL_SIZE + get_local_id(0))
}
}
else // need extrapolate y
{
#define EXTRAPOLATE_(val, maxVal) EXTRAPOLATE(val, maxVal)
#if kercn == 1
col = EXTRAPOLATE(x, src_cols);

sum = co3* SRC(col, EXTRAPOLATE(src_y - 2, src_rows));
sum = MAD(co2, SRC(col, EXTRAPOLATE(src_y - 1, src_rows)), sum);
sum = MAD(co1, SRC(col, EXTRAPOLATE(src_y , src_rows)), sum);
sum = MAD(co2, SRC(col, EXTRAPOLATE(src_y + 1, src_rows)), sum);
sum = MAD(co3, SRC(col, EXTRAPOLATE(src_y + 2, src_rows)), sum);

smem[2 + get_local_id(0)] = sum;
LOAD_LOCAL(col, 2 + get_local_id(0))
#else
if (x < src_cols-4)
{
float4 sum4;
sum4 = co3* SRC4(x, EXTRAPOLATE(src_y - 2, src_rows));
sum4 = MAD(co2, SRC4(x, EXTRAPOLATE(src_y - 1, src_rows)), sum4);
sum4 = MAD(co1, SRC4(x, EXTRAPOLATE(src_y , src_rows)), sum4);
sum4 = MAD(co2, SRC4(x, EXTRAPOLATE(src_y + 1, src_rows)), sum4);
sum4 = MAD(co3, SRC4(x, EXTRAPOLATE(src_y + 2, src_rows)), sum4);

vstore4(sum4, get_local_id(0), (__local float*) &smem[2]);
float4 sum40, sum41, temp4;
LOAD_LOCAL4(x, get_local_id(0))
}
else
{
for (int i=0; i<4; i++)
{
col = EXTRAPOLATE(x+i, src_cols);
sum = co3* SRC(col, EXTRAPOLATE(src_y - 2, src_rows));
sum = MAD(co2, SRC(col, EXTRAPOLATE(src_y - 1, src_rows)), sum);
sum = MAD(co1, SRC(col, EXTRAPOLATE(src_y , src_rows)), sum);
sum = MAD(co2, SRC(col, EXTRAPOLATE(src_y + 1, src_rows)), sum);
sum = MAD(co3, SRC(col, EXTRAPOLATE(src_y + 2, src_rows)), sum);

smem[2 + 4*get_local_id(0)+i] = sum;
LOAD_LOCAL(col, 2 + 4*get_local_id(0) + i)
}
}
#endif
if (get_local_id(0) < 2)
{
col = EXTRAPOLATE((int)(get_group_id(0)*LOCAL_SIZE + get_local_id(0) - 2), src_cols);

sum = co3* SRC(col, EXTRAPOLATE(src_y - 2, src_rows));
sum = MAD(co2, SRC(col, EXTRAPOLATE(src_y - 1, src_rows)), sum);
sum = MAD(co1, SRC(col, EXTRAPOLATE(src_y , src_rows)), sum);
sum = MAD(co2, SRC(col, EXTRAPOLATE(src_y + 1, src_rows)), sum);
sum = MAD(co3, SRC(col, EXTRAPOLATE(src_y + 2, src_rows)), sum);

smem[get_local_id(0)] = sum;
LOAD_LOCAL(col, get_local_id(0))
}

if (get_local_id(0) > 1 && get_local_id(0) < 4)
else if (get_local_id(0) < 4)
{
col = EXTRAPOLATE((int)((get_group_id(0)+1)*LOCAL_SIZE + get_local_id(0) - 2), src_cols);

sum = co3* SRC(col, EXTRAPOLATE(src_y - 2, src_rows));
sum = MAD(co2, SRC(col, EXTRAPOLATE(src_y - 1, src_rows)), sum);
sum = MAD(co1, SRC(col, EXTRAPOLATE(src_y , src_rows)), sum);
sum = MAD(co2, SRC(col, EXTRAPOLATE(src_y + 1, src_rows)), sum);
sum = MAD(co3, SRC(col, EXTRAPOLATE(src_y + 2, src_rows)), sum);

smem[LOCAL_SIZE + get_local_id(0)] = sum;
LOAD_LOCAL(col, LOCAL_SIZE + get_local_id(0))
}
}

Expand All @@ -247,50 +218,68 @@ __kernel void pyrDown(__global const uchar * src, int src_step, int src_offset,
{
const int tid2 = get_local_id(0) * 2;

sum = 0.f;
const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2;

if (dst_x < dst_cols)
{
for (int yin = y, y1 = min(dst_rows, y + 2); yin < y1; yin++)
{
#if cn == 1
#if fdepth <= 5
sum = sum + dot(vload4(0, (__local float*) (&smem)+tid2), (float4)(co3, co2, co1, co2));
FT sum = dot(vload4(0, (__local float*) (&smem) + tid2 + (yin - y) * (LOCAL_SIZE + 4)), (float4)(co3, co2, co1, co2));
#else
sum = sum + dot(vload4(0, (__local double*) (&smem)+tid2), (double4)(co3, co2, co1, co2));
FT sum = dot(vload4(0, (__local double*) (&smem) + tid2 + (yin - y) * (LOCAL_SIZE + 4)), (double4)(co3, co2, co1, co2));
#endif
#else
sum = MAD(co3, smem[2 + tid2 - 2], sum);
sum = MAD(co2, smem[2 + tid2 - 1], sum);
sum = MAD(co1, smem[2 + tid2 ], sum);
sum = MAD(co2, smem[2 + tid2 + 1], sum);
FT sum = co3 * smem[yin - y][2 + tid2 - 2];
sum = MAD(co2, smem[yin - y][2 + tid2 - 1], sum);
sum = MAD(co1, smem[yin - y][2 + tid2 ], sum);
sum = MAD(co2, smem[yin - y][2 + tid2 + 1], sum);
#endif
sum = MAD(co3, smem[2 + tid2 + 2], sum);

const int dst_x = (get_group_id(0) * get_local_size(0) + tid2) / 2;

if (dst_x < dst_cols)
storepix(convertToT(sum), dstData + y * dst_step + dst_x * PIXSIZE);
sum = MAD(co3, smem[yin - y][2 + tid2 + 2], sum);
storepix(convertToT(sum), dstData + yin * dst_step + dst_x * PIXSIZE);
}
}
}
#else
int tid4 = get_local_id(0) * 4;

sum = co3* smem[2 + tid4 + 2];
sum = MAD(co3, smem[2 + tid4 - 2], sum);
sum = MAD(co2, smem[2 + tid4 - 1], sum);
sum = MAD(co1, smem[2 + tid4 ], sum);
sum = MAD(co2, smem[2 + tid4 + 1], sum);

int dst_x = (get_group_id(0) * LOCAL_SIZE + tid4) / 2;
if (dst_x < dst_cols - 1)
{
for (int yin = y, y1 = min(dst_rows, y + 2); yin < y1; yin++)
{

if (dst_x < dst_cols)
storepix(convertToT(sum), dstData + mad24(y, dst_step, dst_x * PIXSIZE));

tid4 += 2;
dst_x += 1;
FT sum = co3* smem[yin - y][2 + tid4 + 2];
sum = MAD(co3, smem[yin - y][2 + tid4 - 2], sum);
sum = MAD(co2, smem[yin - y][2 + tid4 - 1], sum);
sum = MAD(co1, smem[yin - y][2 + tid4 ], sum);
sum = MAD(co2, smem[yin - y][2 + tid4 + 1], sum);
storepix(convertToT(sum), dstData + mad24(yin, dst_step, dst_x * PIXSIZE));

dst_x ++;
sum = co3* smem[yin - y][2 + tid4 + 4];
sum = MAD(co3, smem[yin - y][2 + tid4 ], sum);
sum = MAD(co2, smem[yin - y][2 + tid4 + 1], sum);
sum = MAD(co1, smem[yin - y][2 + tid4 + 2], sum);
sum = MAD(co2, smem[yin - y][2 + tid4 + 3], sum);
storepix(convertToT(sum), dstData + mad24(yin, dst_step, dst_x * PIXSIZE));
dst_x --;
}

sum = co3* smem[2 + tid4 + 2];
sum = MAD(co3, smem[2 + tid4 - 2], sum);
sum = MAD(co2, smem[2 + tid4 - 1], sum);
sum = MAD(co1, smem[2 + tid4 ], sum);
sum = MAD(co2, smem[2 + tid4 + 1], sum);
}
else if (dst_x < dst_cols)
{
for (int yin = y, y1 = min(dst_rows, y + 2); yin < y1; yin++)
{
FT sum = co3* smem[yin - y][2 + tid4 + 2];
sum = MAD(co3, smem[yin - y][2 + tid4 - 2], sum);
sum = MAD(co2, smem[yin - y][2 + tid4 - 1], sum);
sum = MAD(co1, smem[yin - y][2 + tid4 ], sum);
sum = MAD(co2, smem[yin - y][2 + tid4 + 1], sum);

if (dst_x < dst_cols)
storepix(convertToT(sum), dstData + mad24(y, dst_step, dst_x * PIXSIZE));
storepix(convertToT(sum), dstData + mad24(yin, dst_step, dst_x * PIXSIZE));
}
}
#endif

}
2 changes: 1 addition & 1 deletion modules/imgproc/src/pyramids.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -445,7 +445,7 @@ static bool ocl_pyrDown( InputArray _src, OutputArray _dst, const Size& _dsz, in
k.args(ocl::KernelArg::ReadOnly(src), ocl::KernelArg::WriteOnly(dst));

size_t localThreads[2] = { local_size/kercn, 1 };
size_t globalThreads[2] = { (src.cols + (kercn-1))/kercn, dst.rows };
size_t globalThreads[2] = { (src.cols + (kercn-1))/kercn, (dst.rows + 1) / 2 };
return k.run(2, globalThreads, localThreads, false);
}

Expand Down

0 comments on commit f8aecb2

Please sign in to comment.