Skip to content

Commit

Permalink
Merge pull request opencv#13329 from elatkin:el/gapi_perf_medblur
Browse files Browse the repository at this point in the history
GAPI (fluid): Median blur optimization (opencv#13329)

* GAPI (fluid): Median blur optimization: reference 3x3

* GAPI (fluid): Median blur optimization: CPU dispatcher

* GAPI (fluid): Median blur optimization: manual CV_SIMD
  • Loading branch information
Evgeny Latkin authored and alalek committed Nov 29, 2018
1 parent 6374b99 commit ab430b8
Show file tree
Hide file tree
Showing 4 changed files with 251 additions and 14 deletions.
39 changes: 25 additions & 14 deletions modules/gapi/src/backends/fluid/gfluidimgproc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1442,7 +1442,9 @@ static void run_medianblur( Buffer& dst,
const View & src,
int ksize)
{
static const int kmax = 9;
static_assert(std::is_same<DST, SRC>::value, "unsupported combination of types");

constexpr int kmax = 9;
GAPI_Assert(ksize <= kmax);

const SRC *in[ kmax ];
Expand All @@ -1460,24 +1462,33 @@ static void run_medianblur( Buffer& dst,
int width = dst.length();
int chan = dst.meta().chan;

for (int w=0; w < width; w++)
// optimized: if 3x3

if (3 == ksize)
{
// TODO: make this cycle innermost
for (int c=0; c < chan; c++)
{
SRC neighbours[kmax * kmax];
run_medblur3x3_impl(out, in, width, chan);
return;
}

for (int i=0; i < ksize; i++)
for (int j=0; j < ksize; j++)
{
neighbours[i*ksize + j] = in[i][(w + j - border)*chan + c];
}
// reference: any ksize

int length = width * chan;
int klength = ksize * ksize;
int klenhalf = klength / 2;

int length = ksize * ksize;
std::nth_element(neighbours, neighbours + length/2, neighbours + length);
for (int l=0; l < length; l++)
{
SRC neighbours[kmax * kmax];

out[w*chan + c] = saturate<DST>(neighbours[length/2], rintf);
for (int i=0; i < ksize; i++)
for (int j=0; j < ksize; j++)
{
neighbours[i*ksize + j] = in[i][l + (j - border)*chan];
}

std::nth_element(neighbours, neighbours + klenhalf, neighbours + klength);

out[l] = saturate<DST>(neighbours[klenhalf], rintf);
}
}

Expand Down
20 changes: 20 additions & 0 deletions modules/gapi/src/backends/fluid/gfluidimgproc_func.dispatch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,26 @@ RUN_MORPHOLOGY3X3_IMPL( float)

#undef RUN_MORPHOLOGY3X3_IMPL

//---------------------------
//
// Fluid kernels: Median blur
//
//---------------------------

// Stamps out one run_medblur3x3_impl() overload per pixel type.
// Each overload does nothing but hand its arguments to CV_CPU_DISPATCH,
// with CV_CPU_DISPATCH_MODES_ALL as the list of candidate CPU modes.
#define RUN_MEDBLUR3X3_IMPL(TYPE)                                            \
void run_medblur3x3_impl(TYPE out[], const TYPE *in[], int width, int chan)  \
{                                                                            \
    CV_CPU_DISPATCH(run_medblur3x3_impl, (out, in, width, chan),             \
                    CV_CPU_DISPATCH_MODES_ALL);                              \
}

RUN_MEDBLUR3X3_IMPL(uchar)
RUN_MEDBLUR3X3_IMPL(ushort)
RUN_MEDBLUR3X3_IMPL(short)
RUN_MEDBLUR3X3_IMPL(float)

#undef RUN_MEDBLUR3X3_IMPL

} // namespace fluid
} // namespace gapi
} // namespace cv
Expand Down
16 changes: 16 additions & 0 deletions modules/gapi/src/backends/fluid/gfluidimgproc_func.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,22 @@ RUN_MORPHOLOGY3X3_IMPL( float)

#undef RUN_MORPHOLOGY3X3_IMPL

//---------------------------
//
// Fluid kernels: Median blur
//
//---------------------------

// Prototypes of the 3x3 median blur row kernel, one overload per pixel type
// (uchar, ushort, short, float). The helper macro is local to this section
// and is undefined again right after the last use.
#define RUN_MEDBLUR3X3_IMPL(TYPE) \
    void run_medblur3x3_impl(TYPE out[], const TYPE *in[], int width, int chan);

RUN_MEDBLUR3X3_IMPL(uchar)
RUN_MEDBLUR3X3_IMPL(ushort)
RUN_MEDBLUR3X3_IMPL(short)
RUN_MEDBLUR3X3_IMPL(float)

#undef RUN_MEDBLUR3X3_IMPL

} // namespace fluid
} // namespace gapi
} // namespace cv
Expand Down
190 changes: 190 additions & 0 deletions modules/gapi/src/backends/fluid/gfluidimgproc_func.simd.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,22 @@ RUN_MORPHOLOGY3X3_IMPL( float)

#undef RUN_MORPHOLOGY3X3_IMPL

//---------------------------
//
// Fluid kernels: Median blur
//
//---------------------------

// Forward declarations of run_medblur3x3_impl() for every supported pixel
// type (uchar, ushort, short, float); the matching bodies are generated by a
// macro of the same name further down in this file.
#define RUN_MEDBLUR3X3_IMPL(TYPE) \
    void run_medblur3x3_impl(TYPE out[], const TYPE *in[], int width, int chan);

RUN_MEDBLUR3X3_IMPL(uchar)
RUN_MEDBLUR3X3_IMPL(ushort)
RUN_MEDBLUR3X3_IMPL(short)
RUN_MEDBLUR3X3_IMPL(float)

#undef RUN_MEDBLUR3X3_IMPL

//----------------------------------------------------------------------

#ifndef CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
Expand Down Expand Up @@ -1580,6 +1596,180 @@ RUN_MORPHOLOGY3X3_IMPL( float)

#undef RUN_MORPHOLOGY3X3_IMPL

//---------------------------
//
// Fluid kernels: Median blur
//
//---------------------------

// Scalar 3x3 median filter for a single output row.
//
//   out   - destination row; width * chan elements are written
//   in    - three source row pointers; for every output index l the code
//           reads in[r][l - chan], in[r][l] and in[r][l + chan], so each row
//           must be addressable one pixel before its start and one pixel
//           past its end
//   width - number of output pixels
//   chan  - number of interleaved channels per pixel
//
// The median of the 9 samples is found with a fixed sequence of
// compare-exchange steps built from min/max (no data-dependent branching
// in this function itself).
template<typename T>
static void run_medblur3x3_reference(T out[], const T *in[], int width, int chan)
{
    constexpr int ksize  = 3;
    constexpr int border = (ksize - 1) / 2;

    const int total  = width * chan;   // elements in the output row
    const int stride = border * chan;  // distance to the left/right neighbour

    // compare-exchange: afterwards 'lo' holds the smaller, 'hi' the larger value
    const auto order = [](T& lo, T& hi)
    {
        const T x = lo, y = hi;
        lo = (std::min)(x, y);
        hi = (std::max)(x, y);
    };

    for (int i = 0; i < total; ++i)
    {
        T w[ksize][ksize];

        // gather the 3x3 window around element i
        for (int r = 0; r < ksize; ++r)
        {
            const T *centre = in[r] + i;
            w[r][0] = centre[-stride];
            w[r][1] = centre[0];
            w[r][2] = centre[stride];
        }

        // fully sort every row: w[r][0] <= w[r][1] <= w[r][2]
        for (auto& row : w)
        {
            order(row[0], row[1]);
            order(row[1], row[2]);
            order(row[0], row[1]);
        }

        // column 0 (row minima): only the largest is needed, it ends up in w[2][0]
        order(w[0][0], w[1][0]);
        order(w[1][0], w[2][0]);

        // column 1 (row medians): full sort, the middle one ends up in w[1][1]
        order(w[0][1], w[1][1]);
        order(w[1][1], w[2][1]);
        order(w[0][1], w[1][1]);

        // column 2 (row maxima): only the smallest is needed, it ends up in w[0][2]
        order(w[1][2], w[2][2]);
        order(w[0][2], w[1][2]);

        // anti-diagonal w[2][0], w[1][1], w[0][2]: sort the three candidates,
        // leaving their median (the median of all nine) in the centre
        order(w[1][1], w[0][2]);
        order(w[2][0], w[1][1]);
        order(w[1][1], w[0][2]);

        out[i] = w[1][1];
    }
}

#if CV_SIMD
// Vectorized 3x3 median filter for a single output row.
//
// VT is the vector type processed per step (VT::nlanes elements at once),
// T the element type of the rows. Indexing matches the scalar reference:
// for output index l the kernel reads in[r][l - chan], in[r][l] and
// in[r][l + chan] for r = 0..2.
//
// The row is covered in chunks of nlanes elements. If a remainder is left,
// the position is moved back to length - nlanes so that the last vector ends
// exactly at the end of the row; the overlapped elements are recomputed.
// NOTE(review): this relies on length >= nlanes, which is only checked by a
// debug assertion here - callers are expected to guarantee it.
template<typename VT, typename T>
static void run_medblur3x3_simd(T out[], const T *in[], int width, int chan)
{
    constexpr int ksize  = 3;
    constexpr int border = (ksize - 1) / 2;
    constexpr int nlanes = VT::nlanes;

    const int length = width * chan;
    const int shift  = border * chan;

    // lane-wise compare-exchange: 'lo' receives the minima, 'hi' the maxima
    const auto order = [](VT& lo, VT& hi)
    {
        const VT x = lo, y = hi;
        lo = v_min(x, y);
        hi = v_max(x, y);
    };

    int l = 0;
    while (l < length)
    {
        // main part of output row
        for (; l <= length - nlanes; l += nlanes)
        {
            // 3x3 neighbourhood: rows a/b/c, columns left/centre/right
            VT a0 = vx_load(&in[0][l - shift]);
            VT a1 = vx_load(&in[0][l        ]);
            VT a2 = vx_load(&in[0][l + shift]);

            VT b0 = vx_load(&in[1][l - shift]);
            VT b1 = vx_load(&in[1][l        ]);
            VT b2 = vx_load(&in[1][l + shift]);

            VT c0 = vx_load(&in[2][l - shift]);
            VT c1 = vx_load(&in[2][l        ]);
            VT c2 = vx_load(&in[2][l + shift]);

            // sort each row of three
            order(a0, a1); order(a1, a2); order(a0, a1);
            order(b0, b1); order(b1, b2); order(b0, b1);
            order(c0, c1); order(c1, c2); order(c0, c1);

            // column of row minima: largest goes to c0
            order(a0, b0);
            order(b0, c0);

            // column of row medians: full sort, middle goes to b1
            order(a1, b1);
            order(b1, c1);
            order(a1, b1);

            // column of row maxima: smallest goes to a2
            order(b2, c2);
            order(a2, b2);

            // sort the three candidates c0, b1, a2; the median lands in b1
            order(b1, a2);
            order(c0, b1);
            order(b1, a2);

            v_store(&out[l], b1);
        }

        // tail (if any): step back so the final vector ends at 'length'
        if (l < length)
        {
            GAPI_DbgAssert(length >= nlanes);
            l = length - nlanes;
        }
    }
}
#endif

// Chooses the implementation of the 3x3 median blur for element type T.
//
// When CV_SIMD is enabled and the row (width * chan elements) holds at least
// one full vector of the type matching T, the vectorized kernel is used;
// in every other case the scalar reference kernel runs.
//
// The reinterpret_casts convert between identical types in the branch that
// is actually taken (guarded by std::is_same); they are there so that the
// branches for the other element types still compile.
template<typename T>
static void run_medblur3x3_code(T out[], const T *in[], int width, int chan)
{
#if CV_SIMD
    const int length = width * chan;

    // silences "unused variable" when none of the conditions below reads it
    (void) length;

    if (std::is_same<T, uchar>::value && v_uint8::nlanes <= length)
    {
        run_medblur3x3_simd<v_uint8>(reinterpret_cast<uchar*>(out),
                                     reinterpret_cast<const uchar**>(in),
                                     width, chan);
        return;
    }

    if (std::is_same<T, ushort>::value && v_uint16::nlanes <= length)
    {
        run_medblur3x3_simd<v_uint16>(reinterpret_cast<ushort*>(out),
                                      reinterpret_cast<const ushort**>(in),
                                      width, chan);
        return;
    }

    if (std::is_same<T, short>::value && v_int16::nlanes <= length)
    {
        run_medblur3x3_simd<v_int16>(reinterpret_cast<short*>(out),
                                     reinterpret_cast<const short**>(in),
                                     width, chan);
        return;
    }

    if (std::is_same<T, float>::value && v_float32::nlanes <= length)
    {
        run_medblur3x3_simd<v_float32>(reinterpret_cast<float*>(out),
                                       reinterpret_cast<const float**>(in),
                                       width, chan);
        return;
    }
#endif

    // scalar fallback: no SIMD, or the row is shorter than one vector
    run_medblur3x3_reference(out, in, width, chan);
}

// Non-template run_medblur3x3_impl() overloads, one per supported pixel type.
// Each one simply forwards to the run_medblur3x3_code<> template.
#define RUN_MEDBLUR3X3_IMPL(TYPE)                                            \
void run_medblur3x3_impl(TYPE out[], const TYPE *in[], int width, int chan)  \
{                                                                            \
    run_medblur3x3_code(out, in, width, chan);                               \
}

RUN_MEDBLUR3X3_IMPL(uchar)
RUN_MEDBLUR3X3_IMPL(ushort)
RUN_MEDBLUR3X3_IMPL(short)
RUN_MEDBLUR3X3_IMPL(float)

#undef RUN_MEDBLUR3X3_IMPL

//------------------------------------------------------------------------------

#endif // CV_CPU_OPTIMIZATION_DECLARATIONS_ONLY
Expand Down

0 comments on commit ab430b8

Please sign in to comment.