Skip to content

Commit

Permalink
Merge pull request opencv#17863 from nglee:dev_cudaDetachOutput
Browse files Browse the repository at this point in the history
  • Loading branch information
mshabunin committed Jul 22, 2020
2 parents 5444a6b + 9411cd6 commit 0fa06b1
Show file tree
Hide file tree
Showing 2 changed files with 56 additions and 1 deletion.
44 changes: 43 additions & 1 deletion modules/cudaarithm/src/core.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,34 @@ namespace
cudaSafeCall( cudaDeviceSynchronize() );
}
};

template <int DEPTH> struct NppMirrorIFunc
{
typedef typename NppTypeTraits<DEPTH>::npp_t npp_t;

typedef NppStatus (*func_t)(npp_t* pSrcDst, int nSrcDstStep, NppiSize oROI, NppiAxis flip);
};

template <int DEPTH, typename NppMirrorIFunc<DEPTH>::func_t func> struct NppMirrorI
{
typedef typename NppMirrorIFunc<DEPTH>::npp_t npp_t;

static void call(GpuMat& srcDst, int flipCode, cudaStream_t stream)
{
NppStreamHandler h(stream);

NppiSize sz;
sz.width = srcDst.cols;
sz.height = srcDst.rows;

nppSafeCall( func(srcDst.ptr<npp_t>(), static_cast<int>(srcDst.step),
sz,
(flipCode == 0 ? NPP_HORIZONTAL_AXIS : (flipCode > 0 ? NPP_VERTICAL_AXIS : NPP_BOTH_AXIS))) );

if (stream == 0)
cudaSafeCall( cudaDeviceSynchronize() );
}
};
}

void cv::cuda::flip(InputArray _src, OutputArray _dst, int flipCode, Stream& stream)
Expand All @@ -117,6 +145,17 @@ void cv::cuda::flip(InputArray _src, OutputArray _dst, int flipCode, Stream& str
{NppMirror<CV_32F, nppiMirror_32f_C1R>::call, 0, NppMirror<CV_32F, nppiMirror_32f_C3R>::call, NppMirror<CV_32F, nppiMirror_32f_C4R>::call}
};

typedef void (*ifunc_t)(GpuMat& srcDst, int flipCode, cudaStream_t stream);
static const ifunc_t ifuncs[6][4] =
{
{NppMirrorI<CV_8U, nppiMirror_8u_C1IR>::call, 0, NppMirrorI<CV_8U, nppiMirror_8u_C3IR>::call, NppMirrorI<CV_8U, nppiMirror_8u_C4IR>::call},
{0,0,0,0},
{NppMirrorI<CV_16U, nppiMirror_16u_C1IR>::call, 0, NppMirrorI<CV_16U, nppiMirror_16u_C3IR>::call, NppMirrorI<CV_16U, nppiMirror_16u_C4IR>::call},
{0,0,0,0},
{NppMirrorI<CV_32S, nppiMirror_32s_C1IR>::call, 0, NppMirrorI<CV_32S, nppiMirror_32s_C3IR>::call, NppMirrorI<CV_32S, nppiMirror_32s_C4IR>::call},
{NppMirrorI<CV_32F, nppiMirror_32f_C1IR>::call, 0, NppMirrorI<CV_32F, nppiMirror_32f_C3IR>::call, NppMirrorI<CV_32F, nppiMirror_32f_C4IR>::call}
};

GpuMat src = getInputMat(_src, stream);

CV_Assert(src.depth() == CV_8U || src.depth() == CV_16U || src.depth() == CV_32S || src.depth() == CV_32F);
Expand All @@ -125,7 +164,10 @@ void cv::cuda::flip(InputArray _src, OutputArray _dst, int flipCode, Stream& str
_dst.create(src.size(), src.type());
GpuMat dst = getOutputMat(_dst, src.size(), src.type(), stream);

funcs[src.depth()][src.channels() - 1](src, dst, flipCode, StreamAccessor::getStream(stream));
if (src.refcount != dst.refcount)
funcs[src.depth()][src.channels() - 1](src, dst, flipCode, StreamAccessor::getStream(stream));
else // in-place
ifuncs[src.depth()][src.channels() - 1](src, flipCode, StreamAccessor::getStream(stream));

syncOutput(dst, _dst, stream);
}
Expand Down
13 changes: 13 additions & 0 deletions modules/cudaarithm/test/test_core.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,19 @@ CUDA_TEST_P(Flip, Accuracy)
EXPECT_MAT_NEAR(dst_gold, dst, 0.0);
}

CUDA_TEST_P(Flip, AccuracyInplace)
{
cv::Mat src = randomMat(size, type);

cv::cuda::GpuMat srcDst = loadMat(src, useRoi);
cv::cuda::flip(srcDst, srcDst, flip_code);

cv::Mat dst_gold;
cv::flip(src, dst_gold, flip_code);

EXPECT_MAT_NEAR(dst_gold, srcDst, 0.0);
}

INSTANTIATE_TEST_CASE_P(CUDA_Arithm, Flip, testing::Combine(
ALL_DEVICES,
DIFFERENT_SIZES,
Expand Down

0 comments on commit 0fa06b1

Please sign in to comment.