Skip to content

Commit

Permalink
* added depth-wise convolution; gives ~20-30% performance improvement…
Browse files Browse the repository at this point in the history
… in MobileSSD networks

* hopefully, eliminated compile warnings, errors, as well as failure in one test

* * fixed a few typos
* decreased buffer size in some cases
* added more optimal im2row branch in the case of 1x1 convolutions
* tuned fastConv to reduce the number of passes over arrays

backport of commit 77b01de
  • Loading branch information
vpisarev authored and alalek committed Aug 4, 2020
1 parent 5b5c42d commit 1537ecd
Show file tree
Hide file tree
Showing 2 changed files with 499 additions and 36 deletions.
261 changes: 234 additions & 27 deletions modules/dnn/src/layers/convolution_layer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -648,6 +648,7 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl
bool useAVX;
bool useAVX2;
bool useAVX512;
int blk_size_cn;

ParallelConv()
: input_(0), weights_(0), output_(0), ngroups_(0), nstripes_(0),
Expand Down Expand Up @@ -704,12 +705,17 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl
p.useAVX2 = checkHardwareSupport(CPU_AVX2) && isConv2D;
p.useAVX512 = CV_CPU_HAS_SUPPORT_AVX512_SKX && isConv2D;

int ncn = std::min(inpCn, (int)BLK_SIZE_CN);

int kernel_d = !isConv2D? kernel_size[0] : 1;
int kernel_h = kernel_size[kernel_size.size() - 2];
int kernel_w = kernel_size.back();

int blk_size_cn0 = cvCeil(800./(kernel_w*kernel_h));
int ncn = 16;
while (ncn*2 < blk_size_cn0 && ncn < inpCn)
ncn *= 2;
ncn = std::min(ncn, inpCn);
p.blk_size_cn = ncn;

int dil_d = !isConv2D? dilations[0] : 1;
int dil_h = dilations[dilations.size() - 2];
int dil_w = dilations.back();
Expand Down Expand Up @@ -777,18 +783,26 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl
int dilation_w = dilations.back();

int i, j, k, d;
size_t inpPlaneSize = input_->total(2);
size_t outPlaneSize = output_->total(2);
int inpPlaneSize = (int)input_->total(2);
int outPlaneSize = (int)output_->total(2);
bool is1x1 = is1x1_;

int stripesPerSample;
size_t stripeSize;
int stripeSize;
Range r = r0;

if( nstripes >= batchSize*2 )
bool depthWiseConvolution = !is1x1 && isConv2D && ngroups > 1 && inpCn == 1 &&
outCn == 1 && kernel_d == 1 && dilation_d == 1 && stride_d == 0 && pad_d == 0 &&
width >= 16 + dilation_w*(kernel_w - 1);
// for now only 3x3 depth-wise convolutions are supported
depthWiseConvolution = depthWiseConvolution && kernel_w == 3 && kernel_h == 3 &&
// computing at most 1 pixel from each side can involve padding
max(stride_w, dilation_w) >= pad_l && max(stride_h, dilation_h) >= pad_t &&
pad_l <= 1 && pad_t <= 1;

if( !depthWiseConvolution && nstripes >= batchSize*2 )
{
stripesPerSample = nstripes/batchSize;
stripeSize = alignSize((outPlaneSize + stripesPerSample - 1)/stripesPerSample, valign);
stripeSize = (int)alignSize((outPlaneSize + stripesPerSample - 1)/stripesPerSample, valign);
stripeSize = std::min(stripeSize, outPlaneSize);
}
else
Expand All @@ -807,20 +821,29 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl
const float* biasptr_ = &biasvec_->at(0);
const float* reluptr_ = reluslope_->empty() ? 0 : &reluslope_->at(0);
float* data_out0_ = output_->ptr<float>();
size_t rowbufsz = (size_t)karea*BLK_SIZE_CN*BLK_SIZE;
AutoBuffer<float> rowbuf0_(rowbufsz + valign);
float* rowbuf0 = alignPtr(rowbuf0_.data(), (int)(valign*sizeof(float)));

// we clear the buffer once; ultimately, it lets us to avoid
// tail processing after running the unrolled/vectorized loop.
// the main idea is to make sure that the tail (a.k.a. padding) of each row
// (i.e. the elements with indices between vsz=karea*ncn and vsz_a)
// does not contain NaNs or Infs. Because the padding in the weights
// matrix is explicitly initialized with 0's, we handle all other
// cases nicely, i.e. we can skip expliciting re-initialization
// of the padding - we just retain elements from the previous iteration
// of the loop over channels (cn0).
memset(rowbuf0, 0, rowbufsz*sizeof(rowbuf0[0]) );
AutoBuffer<float> rowbuf0_;
float* rowbuf0 = 0;
bool use_rowbuf = !depthWiseConvolution;
int blk_size = depthWiseConvolution ? outPlaneSize : min((int)BLK_SIZE, stripeSize);

// im2row buffer is not used for depth-wise convolution
if(use_rowbuf)
{
size_t rowbufsz = alignSize(karea*blk_size_cn, valign)*min((int)BLK_SIZE, blk_size);
//printf("karea=%d, blk_size_cn=%d, rowbufsz=%d, stripeSize=%d\n", karea, blk_size_cn, (int)rowbufsz, stripeSize);
rowbuf0_.allocate(rowbufsz + valign);
rowbuf0 = alignPtr(rowbuf0_.data(), (int)(valign*sizeof(float)));
// we clear the buffer once; ultimately, it lets us to avoid
// tail processing after running the unrolled/vectorized loop.
// the main idea is to make sure that the tail (a.k.a. padding) of each row
// (i.e. the elements with indices between vsz=karea*ncn and vsz_a)
// does not contain NaNs or Infs. Because the padding in the weights
// matrix is explicitly initialized with 0's, we handle all other
// cases nicely, i.e. we can skip expliciting re-initialization
// of the padding - we just retain elements from the previous iteration
// of the loop over channels (cn0).
memset(rowbuf0, 0, rowbufsz*sizeof(rowbuf0[0]) );
}

for( int stripe = r.start; stripe < r.end; stripe++ )
{
Expand All @@ -835,28 +858,213 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl
const float* wptr_orig = wptr_orig_ + wstep*startOutCn;
const float* biasptr = biasptr_ + startOutCn;

for( int cn0 = 0; cn0 < inpCn; cn0 += BLK_SIZE_CN )
for( int cn0 = 0; cn0 < inpCn; cn0 += blk_size_cn )
{
int cn1 = std::min(cn0 + BLK_SIZE_CN, inpCn);
int cn1 = std::min(cn0 + blk_size_cn, inpCn);
int ncn = cn1 - cn0, vsz = karea*ncn;
int vsz_a = (int)alignSize(vsz, valign);
const float* wptr = wptr_orig + cn0*karea;
// we apply [Channels][P]ReLU (if any) during the final pass only.
const float* relu = cn1 == inpCn && reluptr_ ? reluptr_ + startOutCn : 0;

for( int ofs0 = stripeStart; ofs0 < stripeEnd; ofs0 += BLK_SIZE )
for( int ofs0 = stripeStart; ofs0 < stripeEnd; ofs0 += blk_size )
{
int ofs, ofs1 = std::min(ofs0 + BLK_SIZE, stripeEnd);
int ofs, ofs1 = std::min(ofs0 + blk_size, stripeEnd);
int bsz = ofs1 - ofs0;

int out_d = ofs0 / (outH * outW);
int out_i = (ofs0 - out_d * outH * outW) / outW;
int out_j = ofs0 % outW;

if (depthWiseConvolution)
{
CV_Assert(out_i == 0 && out_j == 0);
int in_d = out_d * stride_d - pad_d;
const float* inptr_ = data_inp0 + (cn0*depth*height + in_d*height)*width;
float* outptr_ = data_out0 + ofs0;

#if CV_TRY_AVX2
if(useAVX2)
opt_AVX2::fastDepthwiseConv(wptr, kernel_h, kernel_w,
stride_h, stride_w, dilation_h, dilation_w, pad_t, pad_l,
biasptr, relu, inptr_, height, width, outptr_, out_d, outH, outW);
else
#endif
#if CV_TRY_AVX
if(useAVX)
opt_AVX::fastDepthwiseConv(wptr, kernel_h, kernel_w,
stride_h, stride_w, dilation_h, dilation_w, pad_t, pad_l,
biasptr, relu, inptr_, height, width, outptr_, out_d, outH, outW);
else
#endif
{
const float w00_ = wptr[0], w01_ = wptr[1], w02_ = wptr[2],
w10 = wptr[3], w11 = wptr[4], w12 = wptr[5],
w20_ = wptr[6], w21_ = wptr[7], w22_ = wptr[8];
int outW1 = min(outW, (width - dilation_w*(kernel_w - 1) + pad_l)/stride_w);
float relu_coeff = relu ? relu[out_d] : 1.f, bias = biasptr[out_d];

for (int out_i = 0; out_i < outH; out_i++)
{
int in_i = out_i * stride_h - pad_t, out_j = 0;
const float* imgptr0 = inptr_ + in_i*width;
const float* imgptr1 = imgptr0 + dilation_h*width;
const float* imgptr2 = imgptr0 + (dilation_h*2)*width;
float out, w00 = w00_, w01 = w01_, w02 = w02_;
float w20 = w20_, w21 = w21_, w22 = w22_;
if (in_i < 0)
{
w00 = w01 = w02 = 0.f;
imgptr0 = imgptr1;
}
else if (in_i + dilation_h*(kernel_h-1) >= height)
{
w20 = w21 = w22 = 0.f;
imgptr2 = imgptr1;
}
float* outptr = outptr_ + out_i*outW;
if (pad_l > 0)
{
out = imgptr0[0]*w01 + imgptr0[dilation_w]*w02 +
imgptr1[0]*w11 + imgptr1[dilation_w]*w12 +
imgptr2[0]*w21 + imgptr2[dilation_w]*w22 + bias;
if (relu)
out = out > 0.f ? out : out*relu_coeff;
outptr[0] = out;
out_j = 1;
}

#if CV_SIMD
// maybe with AVX or AVX512 strided depthwise convolution
// can be accelerated with vector code, but with 4xfloat vectors
// it's hardly the case
if( stride_w == 1 )
{
const int VECSZ = v_float32::nlanes;
const int out_delta = VECSZ/stride_w;
v_float32 vw00 = vx_setall_f32(w00), vw01 = vx_setall_f32(w01), vw02 = vx_setall_f32(w02),
vw10 = vx_setall_f32(w10), vw11 = vx_setall_f32(w11), vw12 = vx_setall_f32(w12),
vw20 = vx_setall_f32(w20), vw21 = vx_setall_f32(w21), vw22 = vx_setall_f32(w22);
v_float32 z = vx_setzero_f32(), vbias = vx_setall_f32(bias), vrc = vx_setall_f32(relu_coeff);
for( ; out_j < outW1; out_j += out_delta )
{
if (out_j + out_delta > outW1)
{
if (out_j <= pad_l)
break;
out_j = outW1 - out_delta;
}
int in_j = out_j * stride_w - pad_l;
v_float32 v00 = vx_load(imgptr0 + in_j),
v01 = vx_load(imgptr0 + in_j + dilation_w),
v02 = vx_load(imgptr0 + in_j + dilation_w*2),
v10 = vx_load(imgptr1 + in_j),
v11 = vx_load(imgptr1 + in_j + dilation_w),
v12 = vx_load(imgptr1 + in_j + dilation_w*2),
v20 = vx_load(imgptr2 + in_j),
v21 = vx_load(imgptr2 + in_j + dilation_w),
v22 = vx_load(imgptr2 + in_j + dilation_w*2);

v_float32 vout = v00*vw00 + v01*vw01 + v02*vw02 +
v10*vw10 + v11*vw11 + v12*vw12 +
v20*vw20 + v21*vw21 + v22*vw22 + vbias;
if (relu)
vout = v_select(vout > z, vout, vout*vrc);
vx_store(outptr + out_j, vout);
}
}
#endif
for (; out_j < outW1; out_j++)
{
int in_j = out_j * stride_w - pad_l;
out = imgptr0[in_j]*w00 + imgptr0[in_j + dilation_w]*w01 + imgptr0[in_j + dilation_w*2]*w02 +
imgptr1[in_j]*w10 + imgptr1[in_j + dilation_w]*w11 + imgptr1[in_j + dilation_w*2]*w12 +
imgptr2[in_j]*w20 + imgptr2[in_j + dilation_w]*w21 + imgptr2[in_j + dilation_w*2]*w22 + bias;
if (relu)
out = out > 0.f ? out : out*relu_coeff;
outptr[out_j] = out;
}

for (; out_j < outW; out_j++ )
{
int in_j0 = out_j * stride_w - pad_l, in_j1 = in_j0 + dilation_w, in_j2 = in_j0 + dilation_w*2;
float s0 = 1.f, s1 = 1.f, s2 = 1.f;
if (in_j0 >= width)
{
in_j0 = 0;
s0 = 0.f;
}
if (in_j1 >= width)
{
in_j1 = 0;
s1 = 0.f;
}
if (in_j2 >= width)
{
in_j2 = 0;
s2 = 0.f;
}
out = imgptr0[in_j0]*w00*s0 + imgptr0[in_j1]*w01*s1 + imgptr0[in_j2]*w02*s2 +
imgptr1[in_j0]*w10*s0 + imgptr1[in_j1]*w11*s1 + imgptr1[in_j2]*w12*s2 +
imgptr2[in_j0]*w20*s0 + imgptr2[in_j1]*w21*s1 + imgptr2[in_j2]*w22*s2 + bias;
if (relu)
out = out > 0.f ? out : out*relu_coeff;
outptr[out_j] = out;
}
}
}
continue;
}

// do im2row for a part of input tensor
float* rowbuf = rowbuf0;

if (isConv2D)
{
if( is1x1 && stride_w == 1 && stride_h == 1 )
{
const float* imgptr = data_inp0 + (cn0*height + out_i)*width + out_j;
for( int j = 0; j < bsz; j++, rowbuf += vsz_a )
{
if( j + 4 <= bsz )
{
k = 0;
#if CV_SIMD128
for( ; k <= vsz - 4; k += 4 )
{
const float* inp = imgptr + j + k*inpPlaneSize;
v_float32x4 p0 = v_load(inp), p1 = v_load(inp + inpPlaneSize);
v_float32x4 p2 = v_load(inp + inpPlaneSize*2), p3 = v_load(inp + inpPlaneSize*3);
v_float32x4 r0, r1, r2, r3;
v_transpose4x4(p0, p1, p2, p3, r0, r1, r2, r3);
v_store(rowbuf + k, r0);
v_store(rowbuf + k + vsz_a, r1);
v_store(rowbuf + k + vsz_a*2, r2);
v_store(rowbuf + k + vsz_a*3, r3);
}
#endif
for( ; k < vsz; k++ )
{
const float* inp = imgptr + j + k*inpPlaneSize;
float v0 = inp[0], v1 = inp[1], v2 = inp[2], v3 = inp[3];
rowbuf[k] = v0;
rowbuf[k + vsz_a] = v1;
rowbuf[k + vsz_a*2] = v2;
rowbuf[k + vsz_a*3] = v3;
}
j += 3;
rowbuf += vsz_a*3;
}
else
{
for( k = 0; k < vsz; k++ )
{
rowbuf[k] = imgptr[j + k*inpPlaneSize];
}
}
}
}
else
for( ofs = ofs0; ofs < ofs1; out_j = 0, ++out_i )
{
int delta = std::min(ofs1 - ofs, outW - out_j);
Expand Down Expand Up @@ -976,7 +1184,6 @@ class ConvolutionLayerImpl CV_FINAL : public BaseConvolutionLayerImpl

// now compute dot product of the weights
// and im2row-transformed part of the tensor
int bsz = ofs1 - ofs0;
#if CV_TRY_AVX512_SKX
/* AVX512 convolution requires an alignment of 16, and ROI is only there for larger vector sizes */
if(useAVX512)
Expand Down
Loading

0 comments on commit 1537ecd

Please sign in to comment.