diff --git a/3rdparty/tbb/CMakeLists.txt b/3rdparty/tbb/CMakeLists.txt index a05d0d3c1ae4..a3c0a812ce0e 100644 --- a/3rdparty/tbb/CMakeLists.txt +++ b/3rdparty/tbb/CMakeLists.txt @@ -5,19 +5,14 @@ if (WIN32 AND NOT ARM) message(FATAL_ERROR "BUILD_TBB option supports Windows on ARM only!\nUse regular official TBB build instead of the BUILD_TBB option!") endif() -set(tbb_filename "2018_U1.tar.gz") -set(tbb_subdir "tbb-2018_U1") -set(tbb_md5 "b2f2fa09adf44a22f4024049907f774b") - -ocv_warnings_disable(CMAKE_CXX_FLAGS /wd4702 - -Wshadow - -Wunused-parameter - -Wmissing-prototypes # MacOSX -) +ocv_update(OPENCV_TBB_RELEASE "2019_U8") +ocv_update(OPENCV_TBB_RELEASE_MD5 "7c371d0f62726154d2c568a85697a0ad") +ocv_update(OPENCV_TBB_FILENAME "${OPENCV_TBB_RELEASE}.tar.gz") +ocv_update(OPENCV_TBB_SUBDIR "tbb-${OPENCV_TBB_RELEASE}") set(tbb_src_dir "${OpenCV_BINARY_DIR}/3rdparty/tbb") -ocv_download(FILENAME ${tbb_filename} - HASH ${tbb_md5} +ocv_download(FILENAME ${OPENCV_TBB_FILENAME} + HASH ${OPENCV_TBB_RELEASE_MD5} URL "${OPENCV_TBB_URL}" "$ENV{OPENCV_TBB_URL}" @@ -29,7 +24,7 @@ ocv_download(FILENAME ${tbb_filename} if(NOT res) return() endif() -set(tbb_src_dir "${tbb_src_dir}/${tbb_subdir}") +set(tbb_src_dir "${tbb_src_dir}/${OPENCV_TBB_SUBDIR}") ocv_include_directories("${tbb_src_dir}/include" "${tbb_src_dir}/src/" @@ -82,18 +77,19 @@ endif() if(ANDROID_COMPILER_IS_CLANG) add_definitions(-D__TBB_GCC_BUILTIN_ATOMICS_PRESENT=1) - ocv_warnings_disable(CMAKE_CXX_FLAGS -Wmissing-prototypes) endif() -set(TBB_SOURCE_FILES ${lib_srcs} ${lib_hdrs}) +ocv_warnings_disable(CMAKE_CXX_FLAGS + /wd4702 + -Wshadow + -Wunused-parameter + -Wclass-memaccess # TBB 2018 under GCC 8+ + -Wimplicit-fallthrough # TBB 2018 under GCC 7+ + -Wmissing-prototypes # MacOSX, Android/Clang + -Wundef -Wmissing-declarations # TBB 2019 +) -if (ARM AND NOT WIN32) - if (NOT ANDROID) - set(TBB_SOURCE_FILES ${TBB_SOURCE_FILES} "${CMAKE_CURRENT_SOURCE_DIR}/arm_linux_stub.cpp") - endif() - set(TBB_SOURCE_FILES ${TBB_SOURCE_FILES} "${CMAKE_CURRENT_SOURCE_DIR}/android_additional.h") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -include \"${CMAKE_CURRENT_SOURCE_DIR}/android_additional.h\"") -endif() +set(TBB_SOURCE_FILES ${lib_srcs} ${lib_hdrs}) set(tbb_version_file "version_string.ver") configure_file("${CMAKE_CURRENT_SOURCE_DIR}/${tbb_version_file}.cmakein" "${CMAKE_CURRENT_BINARY_DIR}/${tbb_version_file}" @ONLY) @@ -122,8 +118,6 @@ else() target_link_libraries(tbb c m dl) endif() -ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef -Wmissing-declarations) - # filter out flags that are not handled well by the TBB code foreach(var CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_DEBUG) string(REPLACE "-Werror=non-virtual-dtor" "" ${var} "${${var}}") diff --git a/3rdparty/tbb/android_additional.h b/3rdparty/tbb/android_additional.h deleted file mode 100644 index 2faa49503234..000000000000 --- a/3rdparty/tbb/android_additional.h +++ /dev/null @@ -1,41 +0,0 @@ -#include - -static inline int getPossibleCPUs() -{ - FILE* cpuPossible = fopen("/sys/devices/system/cpu/possible", "r"); - if(!cpuPossible) - return 1; - - char buf[2000]; //big enough for 1000 CPUs in worst possible configuration - char* pbuf = fgets(buf, sizeof(buf), cpuPossible); - fclose(cpuPossible); - if(!pbuf) - return 1; - - //parse string of form "0-1,3,5-7,10,13-15" - int cpusAvailable = 0; - - while(*pbuf) - { - const char* pos = pbuf; - bool range = false; - while(*pbuf && *pbuf != ',') - { - if(*pbuf == '-') range = true; - ++pbuf; - } - if(*pbuf) *pbuf++ = 0; - if(!range) - 
++cpusAvailable; - else - { - int rstart = 0, rend = 0; - sscanf(pos, "%d-%d", &rstart, &rend); - cpusAvailable += rend - rstart + 1; - } - - } - return cpusAvailable ? cpusAvailable : 1; -} - -#define __TBB_HardwareConcurrency() getPossibleCPUs() diff --git a/3rdparty/tbb/arm_linux_stub.cpp b/3rdparty/tbb/arm_linux_stub.cpp deleted file mode 100644 index 93cc336be407..000000000000 --- a/3rdparty/tbb/arm_linux_stub.cpp +++ /dev/null @@ -1,11 +0,0 @@ -#include "tbb/tbb_misc.h" - -namespace tbb { -namespace internal { - -void affinity_helper::protect_affinity_mask(bool) {} -affinity_helper::~affinity_helper() {} -void destroy_process_mask() {} - -} -} diff --git a/cmake/OpenCVCompilerOptimizations.cmake b/cmake/OpenCVCompilerOptimizations.cmake index 11c13b288672..9e4691760c0e 100644 --- a/cmake/OpenCVCompilerOptimizations.cmake +++ b/cmake/OpenCVCompilerOptimizations.cmake @@ -329,7 +329,7 @@ elseif(ARM OR AARCH64) ocv_update(CPU_VFPV3_FLAGS_ON "-mfpu=vfpv3") ocv_update(CPU_NEON_FLAGS_ON "-mfpu=neon") ocv_update(CPU_NEON_FLAGS_CONFLICT "-mfpu=[^ ]*") - ocv_update(CPU_FP16_FLAGS_ON "-mfpu=neon-fp16") + ocv_update(CPU_FP16_FLAGS_ON "-mfpu=neon-fp16 -mfp16-format=ieee") ocv_update(CPU_FP16_FLAGS_CONFLICT "-mfpu=[^ ]*") endif() ocv_update(CPU_FP16_IMPLIES "NEON") @@ -617,9 +617,6 @@ macro(ocv_compiler_optimization_options) if(ENABLE_POWERPC) add_extra_compiler_option("-mcpu=G3 -mtune=G5") endif() - if(ARM) - add_extra_compiler_option("-mfp16-format=ieee") - endif(ARM) endmacro() macro(ocv_compiler_optimization_options_finalize) diff --git a/cmake/OpenCVCompilerOptions.cmake b/cmake/OpenCVCompilerOptions.cmake index a5e6f5fd412b..e3ae62fae70d 100644 --- a/cmake/OpenCVCompilerOptions.cmake +++ b/cmake/OpenCVCompilerOptions.cmake @@ -398,3 +398,23 @@ macro(ocv_add_modules_compiler_options) add_definitions(-DOPENCV_ENABLE_MEMORY_SANITIZER=1) endif() endmacro() + +# adjust -Wl,-rpath-link +if(CMAKE_SKIP_RPATH) + if((NOT CMAKE_CROSSCOMPILING OR OPENCV_ENABLE_LINKER_RPATH_LINK_ORIGIN) AND NOT OPENCV_SKIP_LINKER_RPATH_LINK_ORIGIN) + if(DEFINED CMAKE_SHARED_LIBRARY_RPATH_ORIGIN_TOKEN) + list(APPEND CMAKE_PLATFORM_RUNTIME_PATH "${CMAKE_SHARED_LIBRARY_RPATH_ORIGIN_TOKEN}") + else() + list(APPEND CMAKE_PLATFORM_RUNTIME_PATH "\$ORIGIN") + endif() + elseif(NOT OPENCV_SKIP_LINKER_RPATH_LINK_BINARY_LIB) + list(APPEND CMAKE_PLATFORM_RUNTIME_PATH "${LIBRARY_OUTPUT_PATH}") + endif() +endif() +if(OPENCV_EXTRA_RPATH_LINK_PATH) + string(REPLACE ":" ";" OPENCV_EXTRA_RPATH_LINK_PATH_ "${OPENCV_EXTRA_RPATH_LINK_PATH}") + list(APPEND CMAKE_PLATFORM_RUNTIME_PATH ${OPENCV_EXTRA_RPATH_LINK_PATH_}) + if(NOT CMAKE_EXECUTABLE_RPATH_LINK_CXX_FLAG) + message(WARNING "OPENCV_EXTRA_RPATH_LINK_PATH may not work properly because CMAKE_EXECUTABLE_RPATH_LINK_CXX_FLAG is not defined (not supported)") + endif() +endif() diff --git a/cmake/OpenCVGenConfig.cmake b/cmake/OpenCVGenConfig.cmake index f452678adb89..cef9d0bbcafe 100644 --- a/cmake/OpenCVGenConfig.cmake +++ b/cmake/OpenCVGenConfig.cmake @@ -68,7 +68,11 @@ configure_file("${OpenCV_SOURCE_DIR}/cmake/templates/OpenCVConfig-version.cmake. 
# Part 2/3: ${BIN_DIR}/unix-install/OpenCVConfig.cmake -> For use *with* "make install" # ------------------------------------------------------------------------------------------- file(RELATIVE_PATH OpenCV_INSTALL_PATH_RELATIVE_CONFIGCMAKE "${CMAKE_INSTALL_PREFIX}/${OPENCV_CONFIG_INSTALL_PATH}/" ${CMAKE_INSTALL_PREFIX}) -set(OpenCV_INCLUDE_DIRS_CONFIGCMAKE "\"\${OpenCV_INSTALL_PATH}/${OPENCV_INCLUDE_INSTALL_PATH}\"") +if (IS_ABSOLUTE ${OPENCV_INCLUDE_INSTALL_PATH}) + set(OpenCV_INCLUDE_DIRS_CONFIGCMAKE "\"${OPENCV_INCLUDE_INSTALL_PATH}\"") +else() + set(OpenCV_INCLUDE_DIRS_CONFIGCMAKE "\"\${OpenCV_INSTALL_PATH}/${OPENCV_INCLUDE_INSTALL_PATH}\"") +endif() if(USE_IPPICV) file(RELATIVE_PATH IPPICV_INSTALL_PATH_RELATIVE_CONFIGCMAKE "${CMAKE_INSTALL_PREFIX}" "${IPPICV_INSTALL_PATH}") diff --git a/cmake/templates/OpenCVConfig.cmake.in b/cmake/templates/OpenCVConfig.cmake.in index c255fc267728..ef299a451534 100644 --- a/cmake/templates/OpenCVConfig.cmake.in +++ b/cmake/templates/OpenCVConfig.cmake.in @@ -106,7 +106,21 @@ set(OpenCV_SHARED @BUILD_SHARED_LIBS@) set(OpenCV_USE_MANGLED_PATHS @OpenCV_USE_MANGLED_PATHS_CONFIGCMAKE@) set(OpenCV_LIB_COMPONENTS @OPENCV_MODULES_CONFIGCMAKE@) -set(OpenCV_INCLUDE_DIRS @OpenCV_INCLUDE_DIRS_CONFIGCMAKE@) +set(__OpenCV_INCLUDE_DIRS @OpenCV_INCLUDE_DIRS_CONFIGCMAKE@) + +set(OpenCV_INCLUDE_DIRS "") +foreach(d ${__OpenCV_INCLUDE_DIRS}) + get_filename_component(__d "${d}" REALPATH) + if(NOT EXISTS "${__d}") + if(NOT OpenCV_FIND_QUIETLY) + message(WARNING "OpenCV: Include directory doesn't exist: '${d}'. OpenCV installation may be broken. Skip...") + endif() + else() + list(APPEND OpenCV_INCLUDE_DIRS "${__d}") + endif() +endforeach() +unset(__d) + if(NOT TARGET opencv_core) include(${CMAKE_CURRENT_LIST_DIR}/OpenCVModules${OpenCV_MODULES_SUFFIX}.cmake) diff --git a/modules/calib3d/src/ptsetreg.cpp b/modules/calib3d/src/ptsetreg.cpp index 275112b7e98a..7ccf86f75f93 100644 --- a/modules/calib3d/src/ptsetreg.cpp +++ b/modules/calib3d/src/ptsetreg.cpp @@ -99,55 +99,60 @@ class RANSACPointSetRegistrator : public PointSetRegistrator return nz; } - bool getSubset( const Mat& m1, const Mat& m2, - Mat& ms1, Mat& ms2, RNG& rng, - int maxAttempts=1000 ) const + bool getSubset( const Mat& m1, const Mat& m2, Mat& ms1, Mat& ms2, RNG& rng, int maxAttempts=1000 ) const { cv::AutoBuffer _idx(modelPoints); int* idx = _idx.data(); - int i = 0, j, k, iters = 0; - int d1 = m1.channels() > 1 ? m1.channels() : m1.cols; - int d2 = m2.channels() > 1 ? m2.channels() : m2.cols; - int esz1 = (int)m1.elemSize1()*d1, esz2 = (int)m2.elemSize1()*d2; - int count = m1.checkVector(d1), count2 = m2.checkVector(d2); - const int *m1ptr = m1.ptr(), *m2ptr = m2.ptr(); - - ms1.create(modelPoints, 1, CV_MAKETYPE(m1.depth(), d1)); - ms2.create(modelPoints, 1, CV_MAKETYPE(m2.depth(), d2)); - int *ms1ptr = ms1.ptr(), *ms2ptr = ms2.ptr(); + const int d1 = m1.channels() > 1 ? m1.channels() : m1.cols; + const int d2 = m2.channels() > 1 ? 
m2.channels() : m2.cols; - CV_Assert( count >= modelPoints && count == count2 ); - CV_Assert( (esz1 % sizeof(int)) == 0 && (esz2 % sizeof(int)) == 0 ); + int esz1 = (int)m1.elemSize1() * d1; + int esz2 = (int)m2.elemSize1() * d2; + CV_Assert((esz1 % sizeof(int)) == 0 && (esz2 % sizeof(int)) == 0); esz1 /= sizeof(int); esz2 /= sizeof(int); - for(; iters < maxAttempts; iters++) + const int count = m1.checkVector(d1); + const int count2 = m2.checkVector(d2); + CV_Assert(count >= modelPoints && count == count2); + + const int *m1ptr = m1.ptr(); + const int *m2ptr = m2.ptr(); + + ms1.create(modelPoints, 1, CV_MAKETYPE(m1.depth(), d1)); + ms2.create(modelPoints, 1, CV_MAKETYPE(m2.depth(), d2)); + + int *ms1ptr = ms1.ptr(); + int *ms2ptr = ms2.ptr(); + + for( int iters = 0; iters < maxAttempts; ++iters ) { - for( i = 0; i < modelPoints && iters < maxAttempts; ) + int i; + + for( i = 0; i < modelPoints; ++i ) { - int idx_i = 0; - for(;;) - { - idx_i = idx[i] = rng.uniform(0, count); - for( j = 0; j < i; j++ ) - if( idx_i == idx[j] ) - break; - if( j == i ) - break; - } - for( k = 0; k < esz1; k++ ) + int idx_i; + + for ( idx_i = rng.uniform(0, count); + std::find(idx, idx + i, idx_i) != idx + i; + idx_i = rng.uniform(0, count) ) + {} + + idx[i] = idx_i; + + for( int k = 0; k < esz1; ++k ) ms1ptr[i*esz1 + k] = m1ptr[idx_i*esz1 + k]; - for( k = 0; k < esz2; k++ ) + + for( int k = 0; k < esz2; ++k ) ms2ptr[i*esz2 + k] = m2ptr[idx_i*esz2 + k]; - i++; } - if( i == modelPoints && !cb->checkSubset(ms1, ms2, i) ) - continue; - break; + + if( cb->checkSubset(ms1, ms2, i) ) + return true; } - return i == modelPoints && iters < maxAttempts; + return false; } bool run(InputArray _m1, InputArray _m2, OutputArray _model, OutputArray _mask) const CV_OVERRIDE diff --git a/modules/calib3d/src/stereobm.cpp b/modules/calib3d/src/stereobm.cpp index 0257fd572bc7..64a7071ca2fd 100644 --- a/modules/calib3d/src/stereobm.cpp +++ b/modules/calib3d/src/stereobm.cpp @@ -534,12 +534,12 @@ static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right, v_expand(sad8, sad4_l, sad4_h); mask4 = thresh4 > sad4_l; mask4 = mask4 & ((d1 > d4) | (d4 > d2)); - if( v_signmask(mask4) ) + if( v_check_any(mask4) ) break; d4 += dd_4; mask4 = thresh4 > sad4_h; mask4 = mask4 & ((d1 > d4) | (d4 > d2)); - if( v_signmask(mask4) ) + if( v_check_any(mask4) ) break; d4 += dd_4; } diff --git a/modules/calib3d/src/stereosgbm.cpp b/modules/calib3d/src/stereosgbm.cpp index afc57c4cb229..88b28ff5982a 100644 --- a/modules/calib3d/src/stereosgbm.cpp +++ b/modules/calib3d/src/stereosgbm.cpp @@ -2013,14 +2013,14 @@ void SGBM3WayMainLoop::operator () (const Range& range) const mask = cost1 < thresh_reg; mask = mask & ( (cur_dd2) ); - if( v_signmask(mask) ) + if( v_check_any(mask) ) break; cur_d = cur_d+eight_reg; mask = cost2 < thresh_reg; mask = mask & ( (cur_dd2) ); - if( v_signmask(mask) ) + if( v_check_any(mask) ) break; cur_d = cur_d+eight_reg; diff --git a/modules/core/include/opencv2/core/base.hpp b/modules/core/include/opencv2/core/base.hpp index 5d76f524949e..a3a3e51e0475 100644 --- a/modules/core/include/opencv2/core/base.hpp +++ b/modules/core/include/opencv2/core/base.hpp @@ -301,8 +301,8 @@ CV_EXPORTS CV_NORETURN void error(int _code, const String& _err, const char* _fu // In practice, some macro are not processed correctly (noreturn is not detected). // We need to use simplified definition for them. -#define CV_Error(...) 
do { abort(); } while (0) -#define CV_Error_( code, args ) do { cv::format args; abort(); } while (0) +#define CV_Error(code, msg) do { (void)(code); (void)(msg); abort(); } while (0) +#define CV_Error_(code, args) do { (void)(code); (void)(cv::format args); abort(); } while (0) #define CV_Assert( expr ) do { if (!(expr)) abort(); } while (0) #else // CV_STATIC_ANALYSIS diff --git a/modules/core/include/opencv2/core/hal/intrin.hpp b/modules/core/include/opencv2/core/hal/intrin.hpp index adce1b3fb17b..a96cfbdfb6fd 100644 --- a/modules/core/include/opencv2/core/hal/intrin.hpp +++ b/modules/core/include/opencv2/core/hal/intrin.hpp @@ -55,6 +55,34 @@ #define OPENCV_HAL_NOP(a) (a) #define OPENCV_HAL_1ST(a, b) (a) +namespace { +inline unsigned int trailingZeros32(unsigned int value) { +#if defined(_MSC_VER) +#if (_MSC_VER < 1700) || defined(_M_ARM) + unsigned long index = 0; + _BitScanForward(&index, value); + return (unsigned int)index; +#elif defined(__clang__) + // clang-cl doesn't export _tzcnt_u32 for non BMI systems + return value ? __builtin_ctz(value) : 32; +#else + return _tzcnt_u32(value); +#endif +#elif defined(__GNUC__) || defined(__GNUG__) + return __builtin_ctz(value); +#elif defined(__ICC) || defined(__INTEL_COMPILER) + return _bit_scan_forward(value); +#elif defined(__clang__) + return llvm.cttz.i32(value, true); +#else + static const int MultiplyDeBruijnBitPosition[32] = { + 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8, + 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9 }; + return MultiplyDeBruijnBitPosition[((uint32_t)((value & -value) * 0x077CB531U)) >> 27]; +#endif +} +} + // unlike HAL API, which is in cv::hal, // we put intrinsics into cv namespace to make its // access from within opencv code more accessible @@ -419,32 +447,6 @@ namespace CV__SIMD_NAMESPACE { using namespace CV__SIMD_NAMESPACE; #endif -inline unsigned int trailingZeros32(unsigned int value) { -#if defined(_MSC_VER) -#if (_MSC_VER < 1700) || defined(_M_ARM) - unsigned long index = 0; - _BitScanForward(&index, value); - return (unsigned int)index; -#elif defined(__clang__) - // clang-cl doesn't export _tzcnt_u32 for non BMI systems - return value ? 
__builtin_ctz(value) : 32; -#else - return _tzcnt_u32(value); -#endif -#elif defined(__GNUC__) || defined(__GNUG__) - return __builtin_ctz(value); -#elif defined(__ICC) || defined(__INTEL_COMPILER) - return _bit_scan_forward(value); -#elif defined(__clang__) - return llvm.cttz.i32(value, true); -#else - static const int MultiplyDeBruijnBitPosition[32] = { - 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8, - 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9 }; - return MultiplyDeBruijnBitPosition[((uint32_t)((value & -value) * 0x077CB531U)) >> 27]; -#endif -} - #ifndef CV_DOXYGEN CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END #endif diff --git a/modules/core/include/opencv2/core/hal/intrin_avx.hpp b/modules/core/include/opencv2/core/hal/intrin_avx.hpp index 15ec47f7ef82..24e2a5289338 100644 --- a/modules/core/include/opencv2/core/hal/intrin_avx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_avx.hpp @@ -1244,6 +1244,17 @@ inline int v_signmask(const v_float32x8& a) inline int v_signmask(const v_float64x4& a) { return _mm256_movemask_pd(a.val); } +inline int v_scan_forward(const v_int8x32& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); } +inline int v_scan_forward(const v_uint8x32& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); } +inline int v_scan_forward(const v_int16x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; } +inline int v_scan_forward(const v_uint16x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; } +inline int v_scan_forward(const v_int32x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; } +inline int v_scan_forward(const v_uint32x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; } +inline int v_scan_forward(const v_float32x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; } +inline int v_scan_forward(const v_int64x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; } +inline int v_scan_forward(const v_uint64x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; } +inline int v_scan_forward(const v_float64x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; } + /** Checks **/ #define OPENCV_HAL_IMPL_AVX_CHECK(_Tpvec, and_op, allmask) \ inline bool v_check_all(const _Tpvec& a) \ diff --git a/modules/core/include/opencv2/core/hal/intrin_avx512.hpp b/modules/core/include/opencv2/core/hal/intrin_avx512.hpp index 190d435001dc..d4edf0cdd10a 100644 --- a/modules/core/include/opencv2/core/hal/intrin_avx512.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_avx512.hpp @@ -2719,7 +2719,7 @@ OPENCV_HAL_IMPL_AVX512_LOADSTORE_INTERLEAVE(v_float64x8, double, f64, v_uint64x8 ////////// Mask and checks ///////// /** Mask **/ -inline int64 v_signmask(const v_int8x64& a) { return (int64)_mm512_cmp_epi8_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); } +inline int64 v_signmask(const v_int8x64& a) { return (int64)_mm512_movepi8_mask(a.val); } inline int v_signmask(const v_int16x32& a) { return (int)_mm512_cmp_epi16_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); } inline int v_signmask(const v_int32x16& a) { return (int)_mm512_cmp_epi32_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); } inline int v_signmask(const v_int64x8& a) { return (int)_mm512_cmp_epi64_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); } @@ -2733,7 +2733,7 @@ inline int v_signmask(const v_float64x8& a) { return v_signmask(v_reinterpret_as /** Checks **/ inline bool v_check_all(const 
v_int8x64& a) { return !(bool)_mm512_cmp_epi8_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_NLT); } -inline bool v_check_any(const v_int8x64& a) { return (bool)_mm512_cmp_epi8_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); } +inline bool v_check_any(const v_int8x64& a) { return (bool)_mm512_movepi8_mask(a.val); } inline bool v_check_all(const v_int16x32& a) { return !(bool)_mm512_cmp_epi16_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_NLT); } inline bool v_check_any(const v_int16x32& a) { return (bool)_mm512_cmp_epi16_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_LT); } inline bool v_check_all(const v_int32x16& a) { return !(bool)_mm512_cmp_epi32_mask(a.val, _mm512_setzero_si512(), _MM_CMPINT_NLT); } @@ -2754,6 +2754,22 @@ inline bool v_check_any(const v_uint16x32& a) { return v_check_any(v_reinterpret inline bool v_check_any(const v_uint32x16& a) { return v_check_any(v_reinterpret_as_s32(a)); } inline bool v_check_any(const v_uint64x8& a) { return v_check_any(v_reinterpret_as_s64(a)); } +inline int v_scan_forward(const v_int8x64& a) +{ + int64 mask = _mm512_movepi8_mask(a.val); + int mask32 = (int)mask; + return mask != 0 ? mask32 != 0 ? trailingZeros32(mask32) : 32 + trailingZeros32((int)(mask >> 32)) : 0; +} +inline int v_scan_forward(const v_uint8x64& a) { return v_scan_forward(v_reinterpret_as_s8(a)); } +inline int v_scan_forward(const v_int16x32& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))); } +inline int v_scan_forward(const v_uint16x32& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))); } +inline int v_scan_forward(const v_int32x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 2; } +inline int v_scan_forward(const v_uint32x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 2; } +inline int v_scan_forward(const v_float32x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 2; } +inline int v_scan_forward(const v_int64x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 4; } +inline int v_scan_forward(const v_uint64x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 4; } +inline int v_scan_forward(const v_float64x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s16(a))) / 4; } + inline void v512_cleanup() { _mm256_zeroall(); } CV_CPU_OPTIMIZATION_HAL_NAMESPACE_END diff --git a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp index 81fb63378665..884de8029409 100644 --- a/modules/core/include/opencv2/core/hal/intrin_cpp.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_cpp.hpp @@ -1072,6 +1072,7 @@ template inline typename V_TypeTraits< typename V_TypeTrait } /** @brief Get negative values mask +@deprecated v_signmask depends on a lane count heavily and therefore isn't universal enough Returned value is a bit mask with bits set to 1 on places corresponding to negative packed values indexes. 
 Example:
@@ -1088,6 +1089,23 @@ template<typename _Tp, int n> inline int v_signmask(const v_reg<_Tp, n>& a)
     return mask;
 }
 
+/** @brief Get first negative lane index
+
+Returned value is an index of first negative lane (undefined for input of all positive values)
+Example:
+@code{.cpp}
+v_int32x4 r; // set to {0, 0, -1, -1}
+int idx = v_scan_forward(r); // idx = 2
+@endcode
+*/
+template<typename _Tp, int n> inline int v_scan_forward(const v_reg<_Tp, n>& a)
+{
+    for (int i = 0; i < n; i++)
+        if(V_TypeTraits<_Tp>::reinterpret_int(a.s[i]) < 0)
+            return i;
+    return 0;
+}
+
 /** @brief Check if all packed values are less than zero
 
 Unsigned values will be casted to signed: `uchar 254 => char -2`.
diff --git a/modules/core/include/opencv2/core/hal/intrin_neon.hpp b/modules/core/include/opencv2/core/hal/intrin_neon.hpp
index 46d347d23456..5617bc24e64d 100644
--- a/modules/core/include/opencv2/core/hal/intrin_neon.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_neon.hpp
@@ -1096,17 +1096,32 @@ inline int v_signmask(const v_int32x4& a)
 { return v_signmask(v_reinterpret_as_u32(a)); }
 inline int v_signmask(const v_float32x4& a)
 { return v_signmask(v_reinterpret_as_u32(a)); }
-#if CV_SIMD128_64F
 inline int v_signmask(const v_uint64x2& a)
 {
     int64x1_t m0 = vdup_n_s64(0);
     uint64x2_t v0 = vshlq_u64(vshrq_n_u64(a.val, 63), vcombine_s64(m0, m0));
     return (int)vgetq_lane_u64(v0, 0) + ((int)vgetq_lane_u64(v0, 1) << 1);
 }
+inline int v_signmask(const v_int64x2& a)
+{ return v_signmask(v_reinterpret_as_u64(a)); }
+#if CV_SIMD128_64F
 inline int v_signmask(const v_float64x2& a)
 { return v_signmask(v_reinterpret_as_u64(a)); }
 #endif
 
+inline int v_scan_forward(const v_int8x16& a) { return trailingZeros32(v_signmask(a)); }
+inline int v_scan_forward(const v_uint8x16& a) { return trailingZeros32(v_signmask(a)); }
+inline int v_scan_forward(const v_int16x8& a) { return trailingZeros32(v_signmask(a)); }
+inline int v_scan_forward(const v_uint16x8& a) { return trailingZeros32(v_signmask(a)); }
+inline int v_scan_forward(const v_int32x4& a) { return trailingZeros32(v_signmask(a)); }
+inline int v_scan_forward(const v_uint32x4& a) { return trailingZeros32(v_signmask(a)); }
+inline int v_scan_forward(const v_float32x4& a) { return trailingZeros32(v_signmask(a)); }
+inline int v_scan_forward(const v_int64x2& a) { return trailingZeros32(v_signmask(a)); }
+inline int v_scan_forward(const v_uint64x2& a) { return trailingZeros32(v_signmask(a)); }
+#if CV_SIMD128_64F
+inline int v_scan_forward(const v_float64x2& a) { return trailingZeros32(v_signmask(a)); }
+#endif
+
 #define OPENCV_HAL_IMPL_NEON_CHECK_ALLANY(_Tpvec, suffix, shift) \
 inline bool v_check_all(const v_##_Tpvec& a) \
 { \
diff --git a/modules/core/include/opencv2/core/hal/intrin_sse.hpp b/modules/core/include/opencv2/core/hal/intrin_sse.hpp
index a01c99fa99b9..e172d45a9fd5 100644
--- a/modules/core/include/opencv2/core/hal/intrin_sse.hpp
+++ b/modules/core/include/opencv2/core/hal/intrin_sse.hpp
@@ -1617,6 +1617,17 @@ OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_int32x4, epi8, v_packq_epi32, OPENCV_HAL_AND,
 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float32x4, ps, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 15, 15)
 OPENCV_HAL_IMPL_SSE_CHECK_SIGNS(v_float64x2, pd, OPENCV_HAL_NOP, OPENCV_HAL_1ST, 3, 3)
 
+inline int v_scan_forward(const v_int8x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); }
+inline int v_scan_forward(const v_uint8x16& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))); }
+inline int v_scan_forward(const v_int16x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; }
+inline int v_scan_forward(const v_uint16x8& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 2; } +inline int v_scan_forward(const v_int32x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; } +inline int v_scan_forward(const v_uint32x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; } +inline int v_scan_forward(const v_float32x4& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 4; } +inline int v_scan_forward(const v_int64x2& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; } +inline int v_scan_forward(const v_uint64x2& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; } +inline int v_scan_forward(const v_float64x2& a) { return trailingZeros32(v_signmask(v_reinterpret_as_s8(a))) / 8; } + #if CV_SSE4_1 #define OPENCV_HAL_IMPL_SSE_SELECT(_Tpvec, cast_ret, cast, suffix) \ inline _Tpvec v_select(const _Tpvec& mask, const _Tpvec& a, const _Tpvec& b) \ diff --git a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp index 4d98809a3493..a4d2c29d348e 100644 --- a/modules/core/include/opencv2/core/hal/intrin_vsx.hpp +++ b/modules/core/include/opencv2/core/hal/intrin_vsx.hpp @@ -891,6 +891,17 @@ inline int v_signmask(const v_uint64x2& a) inline int v_signmask(const v_float64x2& a) { return v_signmask(v_reinterpret_as_s64(a)); } +inline int v_scan_forward(const v_int8x16& a) { return trailingZeros32(v_signmask(a)); } +inline int v_scan_forward(const v_uint8x16& a) { return trailingZeros32(v_signmask(a)); } +inline int v_scan_forward(const v_int16x8& a) { return trailingZeros32(v_signmask(a)); } +inline int v_scan_forward(const v_uint16x8& a) { return trailingZeros32(v_signmask(a)); } +inline int v_scan_forward(const v_int32x4& a) { return trailingZeros32(v_signmask(a)); } +inline int v_scan_forward(const v_uint32x4& a) { return trailingZeros32(v_signmask(a)); } +inline int v_scan_forward(const v_float32x4& a) { return trailingZeros32(v_signmask(a)); } +inline int v_scan_forward(const v_int64x2& a) { return trailingZeros32(v_signmask(a)); } +inline int v_scan_forward(const v_uint64x2& a) { return trailingZeros32(v_signmask(a)); } +inline int v_scan_forward(const v_float64x2& a) { return trailingZeros32(v_signmask(a)); } + template inline bool v_check_all(const _Tpvec& a) { return vec_all_lt(a.val, _Tpvec().val); } diff --git a/modules/core/include/opencv2/core/matx.hpp b/modules/core/include/opencv2/core/matx.hpp index 320a6fef7ee0..de34da45a499 100644 --- a/modules/core/include/opencv2/core/matx.hpp +++ b/modules/core/include/opencv2/core/matx.hpp @@ -385,6 +385,10 @@ template class Vec : public Matx<_Tp, cn, 1> const _Tp& operator ()(int i) const; _Tp& operator ()(int i); +#ifdef CV_CXX11 + Vec<_Tp, cn>& operator=(const Vec<_Tp, cn>& rhs) = default; +#endif + Vec(const Matx<_Tp, cn, 1>& a, const Matx<_Tp, cn, 1>& b, Matx_AddOp); Vec(const Matx<_Tp, cn, 1>& a, const Matx<_Tp, cn, 1>& b, Matx_SubOp); template Vec(const Matx<_Tp, cn, 1>& a, _T2 alpha, Matx_ScaleOp); diff --git a/modules/dnn/test/test_backends.cpp b/modules/dnn/test/test_backends.cpp index da5a47b787bf..41d302f86e3a 100644 --- a/modules/dnn/test/test_backends.cpp +++ b/modules/dnn/test/test_backends.cpp @@ -383,7 +383,8 @@ TEST_P(DNNTestNetwork, DenseNet_121) l1 = 0.1; lInf = 0.6; } processNet("dnn/DenseNet_121.caffemodel", "dnn/DenseNet_121.prototxt", Size(224, 224), "", "", l1, lInf); - expectNoFallbacksFromIE(net); + if (target != DNN_TARGET_MYRIAD || 
getInferenceEngineVPUType() != CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X) + expectNoFallbacksFromIE(net); } TEST_P(DNNTestNetwork, FastNeuralStyle_eccv16) diff --git a/modules/dnn/test/test_caffe_importer.cpp b/modules/dnn/test/test_caffe_importer.cpp index c2d467387601..a5cae50621fa 100644 --- a/modules/dnn/test/test_caffe_importer.cpp +++ b/modules/dnn/test/test_caffe_importer.cpp @@ -286,19 +286,22 @@ TEST_P(Reproducibility_MobileNet_SSD, Accuracy) zerosOut = zerosOut.reshape(1, zerosOut.total() / 7); const int numDetections = zerosOut.rows; - ASSERT_NE(numDetections, 0); - for (int i = 0; i < numDetections; ++i) + // TODO: fix it + if (targetId != DNN_TARGET_MYRIAD || + getInferenceEngineVPUType() != CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X) { - float confidence = zerosOut.ptr(i)[2]; - ASSERT_EQ(confidence, 0); + ASSERT_NE(numDetections, 0); + for (int i = 0; i < numDetections; ++i) + { + float confidence = zerosOut.ptr(i)[2]; + ASSERT_EQ(confidence, 0); + } } - // There is something wrong with Reshape layer in Myriad plugin and - // regression with DLIE/OCL_FP16 target. + // There is something wrong with Reshape layer in Myriad plugin. if (backendId == DNN_BACKEND_INFERENCE_ENGINE) { - if ((targetId == DNN_TARGET_MYRIAD && getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_2) || - targetId == DNN_TARGET_OPENCL_FP16) + if (targetId == DNN_TARGET_MYRIAD || targetId == DNN_TARGET_OPENCL_FP16) return; } @@ -465,7 +468,7 @@ TEST_P(Test_Caffe_nets, Colorization) double lInf = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 5.3 : 3e-3; if (target == DNN_TARGET_MYRIAD && getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X) { - l1 = 0.6; lInf = 15; + l1 = 0.5; lInf = 11; } normAssert(out, ref, "", l1, lInf); expectNoFallbacksFromIE(net); @@ -500,7 +503,8 @@ TEST_P(Test_Caffe_nets, DenseNet_121) l1 = 0.11; lInf = 0.5; } normAssert(out, ref, "", l1, lInf); - expectNoFallbacksFromIE(net); + if (target != DNN_TARGET_MYRIAD || getInferenceEngineVPUType() != CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X) + expectNoFallbacksFromIE(net); } TEST(Test_Caffe, multiple_inputs) diff --git a/modules/dnn/test/test_layers.cpp b/modules/dnn/test/test_layers.cpp index 18f47c0a2e2b..02d33b4c36cf 100644 --- a/modules/dnn/test/test_layers.cpp +++ b/modules/dnn/test/test_layers.cpp @@ -141,8 +141,6 @@ TEST_P(Test_Caffe_layers, Convolution) TEST_P(Test_Caffe_layers, DeConvolution) { - if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_CPU) - applyTestTag(CV_TEST_TAG_DNN_SKIP_IE); // TODO IE_CPU testLayerUsingCaffeModels("layer_deconvolution", true, false); } @@ -246,15 +244,8 @@ TEST_P(Test_Caffe_layers, Concat) TEST_P(Test_Caffe_layers, Fused_Concat) { -#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_GE(2019010000) - if (backend == DNN_BACKEND_INFERENCE_ENGINE) // Test is disabled for DLIE due negative_slope parameter - applyTestTag(CV_TEST_TAG_DNN_SKIP_IE, CV_TEST_TAG_DNN_SKIP_IE_2019R1, CV_TEST_TAG_DNN_SKIP_IE_2019R1_1); -#endif - -#if defined(INF_ENGINE_RELEASE) if (backend == DNN_BACKEND_INFERENCE_ENGINE && (target == DNN_TARGET_OPENCL || target == DNN_TARGET_OPENCL_FP16)) applyTestTag(target == DNN_TARGET_OPENCL ? 
CV_TEST_TAG_DNN_SKIP_IE_OPENCL : CV_TEST_TAG_DNN_SKIP_IE_OPENCL_FP16); -#endif checkBackend(); @@ -319,26 +310,6 @@ TEST_P(Test_Caffe_layers, layer_prelu_fc) testLayerUsingCaffeModels("layer_prelu_fc", true, false, l1, lInf); } -//template -//static void test_Layer_Concat() -//{ -// Matx21f a(1.f, 1.f), b(2.f, 2.f), c(3.f, 3.f); -// std::vector res(1), src = { Blob(XMat(a)), Blob(XMat(b)), Blob(XMat(c)) }; -// Blob ref(XMat(Matx23f(1.f, 2.f, 3.f, 1.f, 2.f, 3.f))); -// -// runLayer(ConcatLayer::create(1), src, res); -// normAssert(ref, res[0]); -//} -//TEST(Layer_Concat, Accuracy) -//{ -// test_Layer_Concat()); -//} -//OCL_TEST(Layer_Concat, Accuracy) -//{ -// OCL_ON(test_Layer_Concat()); -// ); -//} - TEST_P(Test_Caffe_layers, Reshape_Split_Slice) { if (backend == DNN_BACKEND_INFERENCE_ENGINE) @@ -774,9 +745,8 @@ TEST_P(Test_Caffe_layers, Average_pooling_kernel_area) // Test PriorBoxLayer in case of no aspect ratios (just squared proposals). TEST_P(Test_Caffe_layers, PriorBox_squares) { - if (backend == DNN_BACKEND_INFERENCE_ENGINE) + if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD) applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD); - LayerParams lp; lp.name = "testPriorBox"; lp.type = "PriorBox"; diff --git a/modules/dnn/test/test_onnx_importer.cpp b/modules/dnn/test/test_onnx_importer.cpp index 05fa79dcf114..c99b8cf43110 100644 --- a/modules/dnn/test/test_onnx_importer.cpp +++ b/modules/dnn/test/test_onnx_importer.cpp @@ -225,7 +225,7 @@ TEST_P(Test_ONNX_layers, Multiplication) TEST_P(Test_ONNX_layers, Constant) { -#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_LE(2018050000) +#if defined(INF_ENGINE_RELEASE) && INF_ENGINE_VER_MAJOR_EQ(2018050000) if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD && getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X) applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_X, CV_TEST_TAG_DNN_SKIP_IE_2018R5); diff --git a/modules/dnn/test/test_tf_importer.cpp b/modules/dnn/test/test_tf_importer.cpp index 7b311fa294af..dd5d871d71f5 100644 --- a/modules/dnn/test/test_tf_importer.cpp +++ b/modules/dnn/test/test_tf_importer.cpp @@ -151,12 +151,6 @@ TEST_P(Test_TensorFlow_layers, padding) TEST_P(Test_TensorFlow_layers, padding_same) { -#if defined(INF_ENGINE_RELEASE) - if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD - && getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X - ) - applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_X); -#endif // Reference output values are in range [0.0006, 2.798] runTensorFlowNet("padding_same"); } @@ -432,14 +426,6 @@ TEST_P(Test_TensorFlow_nets, Inception_v2_SSD) TEST_P(Test_TensorFlow_nets, MobileNet_v1_SSD) { checkBackend(); - -#if defined(INF_ENGINE_RELEASE) - if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD - && getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X - ) - applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_X); -#endif - std::string proto = findDataFile("dnn/ssd_mobilenet_v1_coco_2017_11_17.pbtxt"); std::string model = findDataFile("dnn/ssd_mobilenet_v1_coco_2017_11_17.pb", false); @@ -456,7 +442,17 @@ TEST_P(Test_TensorFlow_nets, MobileNet_v1_SSD) Mat ref = blobFromNPY(findDataFile("dnn/tensorflow/ssd_mobilenet_v1_coco_2017_11_17.detection_out.npy")); float scoreDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 7e-3 : 1.5e-5; float iouDiff = (target == DNN_TARGET_OPENCL_FP16 || target == DNN_TARGET_MYRIAD) ? 
0.012 : 1e-3; - normAssertDetections(ref, out, "", 0.3, scoreDiff, iouDiff); + float detectionConfThresh = (target == DNN_TARGET_MYRIAD) ? 0.35 : 0.3; + +#if defined(INF_ENGINE_RELEASE) + if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD + && getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X + ) + scoreDiff = 0.061; + iouDiff = 0.12; + detectionConfThresh = 0.36; +#endif + normAssertDetections(ref, out, "", detectionConfThresh, scoreDiff, iouDiff); expectNoFallbacksFromIE(net); } @@ -648,15 +644,8 @@ TEST_P(Test_TensorFlow_layers, fp16_weights) TEST_P(Test_TensorFlow_layers, fp16_padding_same) { -#if defined(INF_ENGINE_RELEASE) - if (backend == DNN_BACKEND_INFERENCE_ENGINE && target == DNN_TARGET_MYRIAD - && getInferenceEngineVPUType() == CV_DNN_INFERENCE_ENGINE_VPU_TYPE_MYRIAD_X - ) - applyTestTag(CV_TEST_TAG_DNN_SKIP_IE_MYRIAD_X); -#endif - // Reference output values are in range [-3.504, -0.002] - runTensorFlowNet("fp16_padding_same", false, 6e-4, 4e-3); + runTensorFlowNet("fp16_padding_same", false, 7e-4, 4e-3); } TEST_P(Test_TensorFlow_layers, defun) diff --git a/modules/features2d/src/blobdetector.cpp b/modules/features2d/src/blobdetector.cpp index 403a8974cc9a..c973b09764b4 100644 --- a/modules/features2d/src/blobdetector.cpp +++ b/modules/features2d/src/blobdetector.cpp @@ -338,7 +338,7 @@ void SimpleBlobDetectorImpl::detect(InputArray image, std::vector& centers[j].push_back(curCenters[i]); size_t k = centers[j].size() - 1; - while( k > 0 && centers[j][k].radius < centers[j][k-1].radius ) + while( k > 0 && curCenters[i].radius < centers[j][k-1].radius ) { centers[j][k] = centers[j][k-1]; k--; diff --git a/modules/features2d/src/fast.cpp b/modules/features2d/src/fast.cpp index 4380b1ce53e9..81c0c8b9204b 100644 --- a/modules/features2d/src/fast.cpp +++ b/modules/features2d/src/fast.cpp @@ -132,10 +132,9 @@ void FAST_t(InputArray _img, std::vector& keypoints, int threshold, bo m1 = m1 | ((x3 < v1) & (x0 < v1)); m0 = m0 | m1; - int mask = v_signmask(m0); - if( mask == 0 ) + if( !v_check_any(m0) ) continue; - if( (mask & 255) == 0 ) + if( !v_check_any(v_combine_low(m0, m0)) ) { j -= 8; ptr -= 8; @@ -159,16 +158,36 @@ void FAST_t(InputArray _img, std::vector& keypoints, int threshold, bo max1 = v_max(max1, v_reinterpret_as_u8(c1)); } - max0 = v_max(max0, max1); - int m = v_signmask(K16 < max0); + max0 = K16 < v_max(max0, max1); + int m = -v_reduce_sum(v_reinterpret_as_s8(max0)); + uchar mflag[16]; + v_store(mflag, max0); - for( k = 0; m > 0 && k < 16; k++, m >>= 1 ) + for( k = 0; m > 0 && k < 16; k++ ) { - if(m & 1) + if(mflag[k]) { + --m; cornerpos[ncorners++] = j+k; if(nonmax_suppression) - curr[j+k] = (uchar)cornerScore(ptr+k, pixel, threshold); + { + short d[25]; + for (int _k = 0; _k < 25; _k++) + d[_k] = (short)(ptr[k] - ptr[k + pixel[_k]]); + + v_int16x8 a0, b0, a1, b1; + a0 = b0 = a1 = b1 = v_load(d + 8); + for(int shift = 0; shift < 8; ++shift) + { + v_int16x8 v_nms = v_load(d + shift); + a0 = v_min(a0, v_nms); + b0 = v_max(b0, v_nms); + v_nms = v_load(d + 9 + shift); + a1 = v_min(a1, v_nms); + b1 = v_max(b1, v_nms); + } + curr[j + k] = (uchar)(v_reduce_max(v_max(v_max(a0, a1), v_setzero_s16() - v_min(b0, b1))) - 1); + } } } } diff --git a/modules/imgproc/src/canny.cpp b/modules/imgproc/src/canny.cpp index e2baf25edd70..50aafdbec864 100644 --- a/modules/imgproc/src/canny.cpp +++ b/modules/imgproc/src/canny.cpp @@ -47,10 +47,6 @@ #include "opencv2/core/openvx/ovx_defs.hpp" -#if CV_SIMD128 -#define CV_MALLOC_SIMD128 16 -#endif - 
namespace cv { @@ -296,18 +292,11 @@ static bool ocl_Canny(InputArray _src, const UMat& dx_, const UMat& dy_, OutputA #define CANNY_PUSH(map, stack) *map = 2, stack.push_back(map) -#define CANNY_CHECK_SIMD(m, high, map, stack) \ - if (m > high) \ - CANNY_PUSH(map, stack); \ - else \ - *map = 0 - #define CANNY_CHECK(m, high, map, stack) \ if (m > high) \ CANNY_PUSH(map, stack); \ else \ - *map = 0; \ - continue + *map = 0 class parallelCanny : public ParallelLoopBody { @@ -317,9 +306,14 @@ class parallelCanny : public ParallelLoopBody src(_src), src2(_src), map(_map), _borderPeaksParallel(borderPeaksParallel), low(_low), high(_high), aperture_size(_aperture_size), L2gradient(_L2gradient) { -#if CV_SIMD128 +#if CV_SIMD + for(int i = 0; i < v_int8::nlanes; ++i) + { + smask[i] = 0; + smask[i + v_int8::nlanes] = (schar)-1; + } if (true) - _map.create(src.rows + 2, (int)alignSize((size_t)(src.cols + CV_MALLOC_SIMD128 + 1), CV_MALLOC_SIMD128), CV_8UC1); + _map.create(src.rows + 2, (int)alignSize((size_t)(src.cols + CV_SIMD_WIDTH + 1), CV_SIMD_WIDTH), CV_8UC1); else #endif _map.create(src.rows + 2, src.cols + 2, CV_8UC1); @@ -336,9 +330,14 @@ class parallelCanny : public ParallelLoopBody src(_dx), src2(_dy), map(_map), _borderPeaksParallel(borderPeaksParallel), low(_low), high(_high), aperture_size(0), L2gradient(_L2gradient) { -#if CV_SIMD128 +#if CV_SIMD + for(int i = 0; i < v_int8::nlanes; ++i) + { + smask[i] = 0; + smask[i + v_int8::nlanes] = (schar)-1; + } if (true) - _map.create(src.rows + 2, (int)alignSize((size_t)(src.cols + CV_MALLOC_SIMD128 + 1), CV_MALLOC_SIMD128), CV_8UC1); + _map.create(src.rows + 2, (int)alignSize((size_t)(src.cols + CV_SIMD_WIDTH + 1), CV_SIMD_WIDTH), CV_8UC1); else #endif _map.create(src.rows + 2, src.cols + 2, CV_8UC1); @@ -397,11 +396,11 @@ class parallelCanny : public ParallelLoopBody } // _mag_p: previous row, _mag_a: actual row, _mag_n: next row -#if CV_SIMD128 - AutoBuffer buffer(3 * (mapstep * cn + CV_MALLOC_SIMD128)); - _mag_p = alignPtr(buffer.data() + 1, CV_MALLOC_SIMD128); - _mag_a = alignPtr(_mag_p + mapstep * cn, CV_MALLOC_SIMD128); - _mag_n = alignPtr(_mag_a + mapstep * cn, CV_MALLOC_SIMD128); +#if CV_SIMD + AutoBuffer buffer(3 * (mapstep * cn + CV_SIMD_WIDTH)); + _mag_p = alignPtr(buffer.data() + 1, CV_SIMD_WIDTH); + _mag_a = alignPtr(_mag_p + mapstep * cn, CV_SIMD_WIDTH); + _mag_n = alignPtr(_mag_a + mapstep * cn, CV_SIMD_WIDTH); #else AutoBuffer buffer(3 * (mapstep * cn)); _mag_p = buffer.data() + 1; @@ -437,21 +436,19 @@ class parallelCanny : public ParallelLoopBody if (L2gradient) { int j = 0, width = src.cols * cn; -#if CV_SIMD128 +#if CV_SIMD + for ( ; j <= width - v_int16::nlanes; j += v_int16::nlanes) { - for ( ; j <= width - 8; j += 8) - { - v_int16x8 v_dx = v_load((const short*)(_dx + j)); - v_int16x8 v_dy = v_load((const short*)(_dy + j)); + v_int16 v_dx = vx_load((const short*)(_dx + j)); + v_int16 v_dy = vx_load((const short*)(_dy + j)); - v_int32x4 v_dxp_low, v_dxp_high; - v_int32x4 v_dyp_low, v_dyp_high; - v_expand(v_dx, v_dxp_low, v_dxp_high); - v_expand(v_dy, v_dyp_low, v_dyp_high); + v_int32 v_dxp_low, v_dxp_high; + v_int32 v_dyp_low, v_dyp_high; + v_expand(v_dx, v_dxp_low, v_dxp_high); + v_expand(v_dy, v_dyp_low, v_dyp_high); - v_store_aligned((int *)(_mag_n + j), v_dxp_low*v_dxp_low+v_dyp_low*v_dyp_low); - v_store_aligned((int *)(_mag_n + j + 4), v_dxp_high*v_dxp_high+v_dyp_high*v_dyp_high); - } + v_store_aligned((int *)(_mag_n + j), v_dxp_low*v_dxp_low+v_dyp_low*v_dyp_low); + v_store_aligned((int *)(_mag_n + j + v_int32::nlanes), 
v_dxp_high*v_dxp_high+v_dyp_high*v_dyp_high); } #endif for ( ; j < width; ++j) @@ -460,23 +457,21 @@ class parallelCanny : public ParallelLoopBody else { int j = 0, width = src.cols * cn; -#if CV_SIMD128 +#if CV_SIMD + for(; j <= width - v_int16::nlanes; j += v_int16::nlanes) { - for(; j <= width - 8; j += 8) - { - v_int16x8 v_dx = v_load((const short *)(_dx + j)); - v_int16x8 v_dy = v_load((const short *)(_dy + j)); + v_int16 v_dx = vx_load((const short *)(_dx + j)); + v_int16 v_dy = vx_load((const short *)(_dy + j)); - v_dx = v_reinterpret_as_s16(v_abs(v_dx)); - v_dy = v_reinterpret_as_s16(v_abs(v_dy)); + v_dx = v_reinterpret_as_s16(v_abs(v_dx)); + v_dy = v_reinterpret_as_s16(v_abs(v_dy)); - v_int32x4 v_dx_ml, v_dy_ml, v_dx_mh, v_dy_mh; - v_expand(v_dx, v_dx_ml, v_dx_mh); - v_expand(v_dy, v_dy_ml, v_dy_mh); + v_int32 v_dx_ml, v_dy_ml, v_dx_mh, v_dy_mh; + v_expand(v_dx, v_dx_ml, v_dx_mh); + v_expand(v_dy, v_dy_ml, v_dy_mh); - v_store_aligned((int *)(_mag_n + j), v_dx_ml + v_dy_ml); - v_store_aligned((int *)(_mag_n + j + 4), v_dx_mh + v_dy_mh); - } + v_store_aligned((int *)(_mag_n + j), v_dx_ml + v_dy_ml); + v_store_aligned((int *)(_mag_n + j + v_int32::nlanes), v_dx_mh + v_dy_mh); } #endif for ( ; j < width; ++j) @@ -520,9 +515,9 @@ class parallelCanny : public ParallelLoopBody // From here actual src row is (i - 1) // Set left and right border to 1 -#if CV_SIMD128 +#if CV_SIMD if (true) - _pmap = map.ptr(i) + CV_MALLOC_SIMD128; + _pmap = map.ptr(i) + CV_SIMD_WIDTH; else #endif _pmap = map.ptr(i) + 1; @@ -542,166 +537,59 @@ class parallelCanny : public ParallelLoopBody const int TG22 = 13573; int j = 0; -#if CV_SIMD128 +#if CV_SIMD { - const v_int32x4 v_low = v_setall_s32(low); - const v_int8x16 v_one = v_setall_s8(1); + const v_int32 v_low = vx_setall_s32(low); + const v_int8 v_one = vx_setall_s8(1); - for (; j <= src.cols - 32; j += 32) + for (; j <= src.cols - v_int8::nlanes; j += v_int8::nlanes) { - v_int32x4 v_m1 = v_load_aligned((const int*)(_mag_a + j)); - v_int32x4 v_m2 = v_load_aligned((const int*)(_mag_a + j + 4)); - v_int32x4 v_m3 = v_load_aligned((const int*)(_mag_a + j + 8)); - v_int32x4 v_m4 = v_load_aligned((const int*)(_mag_a + j + 12)); - - v_int32x4 v_cmp1 = v_m1 > v_low; - v_int32x4 v_cmp2 = v_m2 > v_low; - v_int32x4 v_cmp3 = v_m3 > v_low; - v_int32x4 v_cmp4 = v_m4 > v_low; - - v_m1 = v_load_aligned((const int*)(_mag_a + j + 16)); - v_m2 = v_load_aligned((const int*)(_mag_a + j + 20)); - v_m3 = v_load_aligned((const int*)(_mag_a + j + 24)); - v_m4 = v_load_aligned((const int*)(_mag_a + j + 28)); - v_store_aligned((signed char*)(_pmap + j), v_one); - v_store_aligned((signed char*)(_pmap + j + 16), v_one); - - v_int16x8 v_cmp80 = v_pack(v_cmp1, v_cmp2); - v_int16x8 v_cmp81 = v_pack(v_cmp3, v_cmp4); - - v_cmp1 = v_m1 > v_low; - v_cmp2 = v_m2 > v_low; - v_cmp3 = v_m3 > v_low; - v_cmp4 = v_m4 > v_low; - - v_int8x16 v_cmp = v_pack(v_cmp80, v_cmp81); - - v_cmp80 = v_pack(v_cmp1, v_cmp2); - v_cmp81 = v_pack(v_cmp3, v_cmp4); - - unsigned int mask = v_signmask(v_cmp); - - v_cmp = v_pack(v_cmp80, v_cmp81); - mask |= v_signmask(v_cmp) << 16; - - if (mask) + v_int8 v_cmp = v_pack(v_pack(vx_load_aligned((const int*)(_mag_a + j )) > v_low, + vx_load_aligned((const int*)(_mag_a + j + v_int32::nlanes)) > v_low), + v_pack(vx_load_aligned((const int*)(_mag_a + j + 2*v_int32::nlanes)) > v_low, + vx_load_aligned((const int*)(_mag_a + j + 3*v_int32::nlanes)) > v_low)); + while (v_check_any(v_cmp)) { - int k = j; + int l = v_scan_forward(v_cmp); + v_cmp &= vx_load(smask + v_int8::nlanes - 1 - 
l); + int k = j + l; - do - { - int l = trailingZeros32(mask); - k += l; - mask >>= l; + int m = _mag_a[k]; + short xs = _dx[k]; + short ys = _dy[k]; + int x = (int)std::abs(xs); + int y = (int)std::abs(ys) << 15; - int m = _mag_a[k]; - short xs = _dx[k]; - short ys = _dy[k]; - int x = (int)std::abs(xs); - int y = (int)std::abs(ys) << 15; + int tg22x = x * TG22; - int tg22x = x * TG22; - - if (y < tg22x) - { - if (m > _mag_a[k - 1] && m >= _mag_a[k + 1]) - { - CANNY_CHECK_SIMD(m, high, (_pmap+k), stack); - } - } - else + if (y < tg22x) + { + if (m > _mag_a[k - 1] && m >= _mag_a[k + 1]) { - int tg67x = tg22x + (x << 16); - if (y > tg67x) - { - if (m > _mag_p[k] && m >= _mag_n[k]) - { - CANNY_CHECK_SIMD(m, high, (_pmap+k), stack); - } - } - else - { - int s = (xs ^ ys) < 0 ? -1 : 1; - if(m > _mag_p[k - s] && m > _mag_n[k + s]) - { - CANNY_CHECK_SIMD(m, high, (_pmap+k), stack); - } - } + CANNY_CHECK(m, high, (_pmap+k), stack); } - ++k; - } while((mask >>= 1)); - } - } - - if (j <= src.cols - 16) - { - v_int32x4 v_m1 = v_load_aligned((const int*)(_mag_a + j)); - v_int32x4 v_m2 = v_load_aligned((const int*)(_mag_a + j + 4)); - v_int32x4 v_m3 = v_load_aligned((const int*)(_mag_a + j + 8)); - v_int32x4 v_m4 = v_load_aligned((const int*)(_mag_a + j + 12)); - - v_store_aligned((signed char*)(_pmap + j), v_one); - - v_int32x4 v_cmp1 = v_m1 > v_low; - v_int32x4 v_cmp2 = v_m2 > v_low; - v_int32x4 v_cmp3 = v_m3 > v_low; - v_int32x4 v_cmp4 = v_m4 > v_low; - - v_int16x8 v_cmp80 = v_pack(v_cmp1, v_cmp2); - v_int16x8 v_cmp81 = v_pack(v_cmp3, v_cmp4); - - v_int8x16 v_cmp = v_pack(v_cmp80, v_cmp81); - unsigned int mask = v_signmask(v_cmp); - - if (mask) - { - int k = j; - - do + } + else { - int l = trailingZeros32(mask); - k += l; - mask >>= l; - - int m = _mag_a[k]; - short xs = _dx[k]; - short ys = _dy[k]; - int x = (int)std::abs(xs); - int y = (int)std::abs(ys) << 15; - - int tg22x = x * TG22; - - if (y < tg22x) + int tg67x = tg22x + (x << 16); + if (y > tg67x) { - if (m > _mag_a[k - 1] && m >= _mag_a[k + 1]) + if (m > _mag_p[k] && m >= _mag_n[k]) { - CANNY_CHECK_SIMD(m, high, (_pmap+k), stack); + CANNY_CHECK(m, high, (_pmap+k), stack); } } else { - int tg67x = tg22x + (x << 16); - if (y > tg67x) + int s = (xs ^ ys) < 0 ? -1 : 1; + if(m > _mag_p[k - s] && m > _mag_n[k + s]) { - if (m > _mag_p[k] && m >= _mag_n[k]) - { - CANNY_CHECK_SIMD(m, high, (_pmap+k), stack); - } - } - else - { - int s = (xs ^ ys) < 0 ? 
-1 : 1; - if(m > _mag_p[k - s] && m > _mag_n[k + s]) - { - CANNY_CHECK_SIMD(m, high, (_pmap+k), stack); - } + CANNY_CHECK(m, high, (_pmap+k), stack); } } - ++k; - } while((mask >>= 1)); + } } - j += 16; } } #endif @@ -723,6 +611,7 @@ class parallelCanny : public ParallelLoopBody if (m > _mag_a[j - 1] && m >= _mag_a[j + 1]) { CANNY_CHECK(m, high, (_pmap+j), stack); + continue; } } else @@ -733,6 +622,7 @@ class parallelCanny : public ParallelLoopBody if (m > _mag_p[j] && m >= _mag_n[j]) { CANNY_CHECK(m, high, (_pmap+j), stack); + continue; } } else @@ -741,6 +631,7 @@ class parallelCanny : public ParallelLoopBody if(m > _mag_p[j - s] && m > _mag_n[j + s]) { CANNY_CHECK(m, high, (_pmap+j), stack); + continue; } } } @@ -802,6 +693,9 @@ class parallelCanny : public ParallelLoopBody ptrdiff_t mapstep; int cn; mutable Mutex mutex; +#if CV_SIMD + schar smask[2*v_int8::nlanes]; +#endif }; class finalPass : public ParallelLoopBody @@ -824,31 +718,31 @@ class finalPass : public ParallelLoopBody int j = 0; uchar *pdst = dst.ptr(i); const uchar *pmap = map.ptr(i + 1); -#if CV_SIMD128 +#if CV_SIMD if (true) - pmap += CV_MALLOC_SIMD128; + pmap += CV_SIMD_WIDTH; else #endif pmap += 1; -#if CV_SIMD128 +#if CV_SIMD { - const v_uint8x16 v_zero = v_setzero_u8(); - const v_uint8x16 v_ff = ~v_zero; - const v_uint8x16 v_two(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2); + const v_uint8 v_zero = vx_setzero_u8(); + const v_uint8 v_ff = ~v_zero; + const v_uint8 v_two = vx_setall_u8(2); - for (; j <= dst.cols - 16; j += 16) + for (; j <= dst.cols - v_uint8::nlanes; j += v_uint8::nlanes) { - v_uint8x16 v_pmap = v_load_aligned((const unsigned char*)(pmap + j)); + v_uint8 v_pmap = vx_load_aligned((const unsigned char*)(pmap + j)); v_pmap = v_select(v_pmap == v_two, v_ff, v_zero); v_store((pdst + j), v_pmap); } - if (j <= dst.cols - 8) + if (j <= dst.cols - v_uint8::nlanes/2) { - v_uint8x16 v_pmap = v_load_low((const unsigned char*)(pmap + j)); + v_uint8 v_pmap = vx_load_low((const unsigned char*)(pmap + j)); v_pmap = v_select(v_pmap == v_two, v_ff, v_zero); v_store_low((pdst + j), v_pmap); - j += 8; + j += v_uint8::nlanes/2; } } #endif diff --git a/modules/imgproc/src/connectedcomponents.cpp b/modules/imgproc/src/connectedcomponents.cpp index 10095842b2e6..9241c6c09ee2 100644 --- a/modules/imgproc/src/connectedcomponents.cpp +++ b/modules/imgproc/src/connectedcomponents.cpp @@ -2542,7 +2542,8 @@ namespace cv{ //Array used to store info and labeled pixel by each thread. 
//Different threads affect different memory location of chunksSizeAndLabels - int *chunksSizeAndLabels = (int *)cv::fastMalloc(h * sizeof(int)); + const int chunksSizeAndLabelsSize = h + 1; + int *chunksSizeAndLabels = (int *)cv::fastMalloc(chunksSizeAndLabelsSize * sizeof(int)); //Tree of labels LabelT *P = (LabelT *)cv::fastMalloc(Plength * sizeof(LabelT)); @@ -2561,6 +2562,7 @@ namespace cv{ LabelT nLabels = 1; for (int i = 0; i < h; i = chunksSizeAndLabels[i]){ + CV_Assert(i + 1 < chunksSizeAndLabelsSize); flattenL(P, LabelT((i + 1) / 2) * LabelT((w + 1) / 2) + 1, chunksSizeAndLabels[i + 1], nLabels); } diff --git a/modules/imgproc/src/contours.cpp b/modules/imgproc/src/contours.cpp index 8325b2414bbd..e06caf576434 100644 --- a/modules/imgproc/src/contours.cpp +++ b/modules/imgproc/src/contours.cpp @@ -1061,19 +1061,13 @@ cvFindNextContour( CvContourScanner scanner ) } else { -#if CV_SIMD_WIDTH > 16 - v_uint8 vx_prev = vx_setall_u8((uchar)prev); - while (x <= width - v_uint8::nlanes && - v_check_all(vx_load((uchar*)(img + x)) == vx_prev)) - x += v_uint8::nlanes; -#endif - v_uint8x16 v_prev = v_setall_u8((uchar)prev); - for (; x <= width - v_uint8x16::nlanes; x += v_uint8x16::nlanes) + v_uint8 v_prev = vx_setall_u8((uchar)prev); + for (; x <= width - v_uint8::nlanes; x += v_uint8::nlanes) { - unsigned int mask = (unsigned int)v_signmask(v_load((uchar*)(img + x)) != v_prev); - if (mask) + v_uint8 vmask = (vx_load((uchar*)(img + x)) != v_prev); + if (v_check_any(vmask)) { - p = img[(x += cv::trailingZeros32(mask))]; + p = img[(x += v_scan_forward(vmask))]; goto _next_contour; } } @@ -1334,19 +1328,13 @@ CvLinkedRunPoint; inline int findStartContourPoint(uchar *src_data, CvSize img_size, int j) { #if CV_SIMD -#if CV_SIMD_WIDTH > 16 - v_uint8 vx_zero = vx_setzero_u8(); - while (j <= img_size.width - v_uint8::nlanes && - v_check_all(vx_load((uchar*)(src_data + j)) == vx_zero)) - j += v_uint8::nlanes; -#endif - v_uint8x16 v_zero = v_setzero_u8(); - for (; j <= img_size.width - v_uint8x16::nlanes; j += v_uint8x16::nlanes) + v_uint8 v_zero = vx_setzero_u8(); + for (; j <= img_size.width - v_uint8::nlanes; j += v_uint8::nlanes) { - unsigned int mask = (unsigned int)v_signmask(v_load((uchar*)(src_data + j)) != v_zero); - if (mask) + v_uint8 vmask = (vx_load((uchar*)(src_data + j)) != v_zero); + if (v_check_any(vmask)) { - j += cv::trailingZeros32(mask); + j += v_scan_forward(vmask); return j; } } @@ -1365,19 +1353,13 @@ inline int findEndContourPoint(uchar *src_data, CvSize img_size, int j) } else { -#if CV_SIMD_WIDTH > 16 - v_uint8 vx_zero = vx_setzero_u8(); - while (j <= img_size.width - v_uint8::nlanes && - v_check_all(vx_load((uchar*)(src_data + j)) != vx_zero)) - j += v_uint8::nlanes; -#endif - v_uint8x16 v_zero = v_setzero_u8(); + v_uint8 v_zero = vx_setzero_u8(); for (; j <= img_size.width - v_uint8::nlanes; j += v_uint8::nlanes) { - unsigned int mask = (unsigned int)v_signmask(v_load((uchar*)(src_data + j)) == v_zero); - if (mask) + v_uint8 vmask = (vx_load((uchar*)(src_data + j)) == v_zero); + if (v_check_any(vmask)) { - j += cv::trailingZeros32(mask); + j += v_scan_forward(vmask); return j; } } diff --git a/modules/imgproc/src/filter.dispatch.cpp b/modules/imgproc/src/filter.dispatch.cpp index 24e1a74e8854..c21efe181cda 100644 --- a/modules/imgproc/src/filter.dispatch.cpp +++ b/modules/imgproc/src/filter.dispatch.cpp @@ -1160,9 +1160,7 @@ static bool dftFilter2D(int stype, int dtype, int kernel_type, corrDepth = ddepth == CV_64F ? 
CV_64F : CV_32F; temp.create(Size(width, height), CV_MAKETYPE(corrDepth, dst_channels)); } - crossCorr(src, kernel, temp, src.size(), - CV_MAKETYPE(corrDepth, src_channels), - anchor, 0, borderType); + crossCorr(src, kernel, temp, anchor, 0, borderType); add(temp, delta, temp); if (temp.data != dst_data) { temp.convertTo(dst, dst.type()); @@ -1172,9 +1170,7 @@ static bool dftFilter2D(int stype, int dtype, int kernel_type, temp = Mat(Size(width, height), dtype, dst_data, dst_step); else temp.create(Size(width, height), dtype); - crossCorr(src, kernel, temp, src.size(), - CV_MAKETYPE(ddepth, src_channels), - anchor, delta, borderType); + crossCorr(src, kernel, temp, anchor, delta, borderType); if (temp.data != dst_data) temp.copyTo(dst); } diff --git a/modules/imgproc/src/filterengine.hpp b/modules/imgproc/src/filterengine.hpp index 019c1d5d2dd2..9ec0b6e8b156 100644 --- a/modules/imgproc/src/filterengine.hpp +++ b/modules/imgproc/src/filterengine.hpp @@ -366,7 +366,6 @@ static inline Point normalizeAnchor( Point anchor, Size ksize ) void preprocess2DKernel( const Mat& kernel, std::vector& coords, std::vector& coeffs ); void crossCorr( const Mat& src, const Mat& templ, Mat& dst, - Size corrsize, int ctype, Point anchor=Point(0,0), double delta=0, int borderType=BORDER_REFLECT_101 ); diff --git a/modules/imgproc/src/hough.cpp b/modules/imgproc/src/hough.cpp index 5862319738b2..6b18b17b56dc 100644 --- a/modules/imgproc/src/hough.cpp +++ b/modules/imgproc/src/hough.cpp @@ -1139,32 +1139,23 @@ class HoughCirclesAccumInvoker : public ParallelLoopBody for(; x < numCols; ++x ) { -#if CV_SIMD128 +#if CV_SIMD { - v_uint8x16 v_zero = v_setzero_u8(); + v_uint8 v_zero = vx_setzero_u8(); - for(; x <= numCols - 32; x += 32) { - v_uint8x16 v_edge1 = v_load(edgeData + x); - v_uint8x16 v_edge2 = v_load(edgeData + x + 16); + for(; x <= numCols - 2*v_uint8::nlanes; x += 2*v_uint8::nlanes) { + v_uint8 v_edge1 = (vx_load(edgeData + x ) != v_zero); + v_uint8 v_edge2 = (vx_load(edgeData + x + v_uint8::nlanes) != v_zero); - v_uint8x16 v_cmp1 = (v_edge1 == v_zero); - v_uint8x16 v_cmp2 = (v_edge2 == v_zero); - - unsigned int mask1 = v_signmask(v_cmp1); - unsigned int mask2 = v_signmask(v_cmp2); - - mask1 ^= 0x0000ffff; - mask2 ^= 0x0000ffff; - - if(mask1) + if(v_check_any(v_edge1)) { - x += trailingZeros32(mask1); + x += v_scan_forward(v_edge1); goto _next_step; } - if(mask2) + if(v_check_any(v_edge2)) { - x += trailingZeros32(mask2 << 16); + x += v_uint8::nlanes + v_scan_forward(v_edge2); goto _next_step; } } @@ -1175,7 +1166,7 @@ class HoughCirclesAccumInvoker : public ParallelLoopBody if(x == numCols) continue; -#if CV_SIMD128 +#if CV_SIMD _next_step: #endif float vx, vy; @@ -1506,36 +1497,35 @@ inline int HoughCircleEstimateRadiusInvoker::filterCircles(const Po int nzCount = 0; const Point* nz_ = &nz[0]; int j = 0; -#if CV_SIMD128 +#if CV_SIMD { - const v_float32x4 v_minRadius2 = v_setall_f32(minRadius2); - const v_float32x4 v_maxRadius2 = v_setall_f32(maxRadius2); + const v_float32 v_minRadius2 = vx_setall_f32(minRadius2); + const v_float32 v_maxRadius2 = vx_setall_f32(maxRadius2); - v_float32x4 v_curCenterX = v_setall_f32(curCenter.x); - v_float32x4 v_curCenterY = v_setall_f32(curCenter.y); + v_float32 v_curCenterX = vx_setall_f32(curCenter.x); + v_float32 v_curCenterY = vx_setall_f32(curCenter.y); - float CV_DECL_ALIGNED(16) rbuf[4]; - for(; j <= nzSz - 4; j += 4) + float CV_DECL_ALIGNED(CV_SIMD_WIDTH) rbuf[v_float32::nlanes]; + int CV_DECL_ALIGNED(CV_SIMD_WIDTH) rmask[v_int32::nlanes]; + for(; j <= nzSz - 
v_float32::nlanes; j += v_float32::nlanes) { - v_float32x4 v_nzX, v_nzY; + v_float32 v_nzX, v_nzY; v_load_deinterleave((const float*)&nz_[j], v_nzX, v_nzY); // FIXIT use proper datatype - v_float32x4 v_x = v_cvt_f32(v_reinterpret_as_s32(v_nzX)); - v_float32x4 v_y = v_cvt_f32(v_reinterpret_as_s32(v_nzY)); + v_float32 v_x = v_cvt_f32(v_reinterpret_as_s32(v_nzX)); + v_float32 v_y = v_cvt_f32(v_reinterpret_as_s32(v_nzY)); - v_float32x4 v_dx = v_x - v_curCenterX; - v_float32x4 v_dy = v_y - v_curCenterY; + v_float32 v_dx = v_x - v_curCenterX; + v_float32 v_dy = v_y - v_curCenterY; - v_float32x4 v_r2 = (v_dx * v_dx) + (v_dy * v_dy); - v_float32x4 vmask = (v_minRadius2 <= v_r2) & (v_r2 <= v_maxRadius2); - unsigned int mask = v_signmask(vmask); - if (mask) + v_float32 v_r2 = (v_dx * v_dx) + (v_dy * v_dy); + v_float32 vmask = (v_minRadius2 <= v_r2) & (v_r2 <= v_maxRadius2); + if (v_check_any(vmask)) { + v_store_aligned(rmask, v_reinterpret_as_s32(vmask)); v_store_aligned(rbuf, v_r2); - if (mask & 1) ddata[nzCount++] = rbuf[0]; - if (mask & 2) ddata[nzCount++] = rbuf[1]; - if (mask & 4) ddata[nzCount++] = rbuf[2]; - if (mask & 8) ddata[nzCount++] = rbuf[3]; + for (int i = 0; i < v_int32::nlanes; ++i) + if (rmask[i]) ddata[nzCount++] = rbuf[i]; } } } @@ -1566,12 +1556,13 @@ inline int HoughCircleEstimateRadiusInvoker<NZPointSet>::filterCircles(const Poi const Range xOuter = Range(std::max(int(curCenter.x - rOuter), 0), std::min(int(curCenter.x + rOuter), positions.cols)); const Range yOuter = Range(std::max(int(curCenter.y - rOuter), 0), std::min(int(curCenter.y + rOuter), positions.rows)); -#if CV_SIMD128 - const int numSIMDPoints = 4; - - const v_float32x4 v_minRadius2 = v_setall_f32(minRadius2); - const v_float32x4 v_maxRadius2 = v_setall_f32(maxRadius2); - const v_float32x4 v_curCenterX_0123 = v_setall_f32(curCenter.x) - v_float32x4(0.0f, 1.0f, 2.0f, 3.0f); +#if CV_SIMD + float v_seq[v_float32::nlanes]; + for (int i = 0; i < v_float32::nlanes; ++i) + v_seq[i] = (float)i; + const v_float32 v_minRadius2 = vx_setall_f32(minRadius2); + const v_float32 v_maxRadius2 = vx_setall_f32(maxRadius2); + const v_float32 v_curCenterX_0123 = vx_setall_f32(curCenter.x) - vx_load(v_seq); #endif for (int y = yOuter.start; y < yOuter.end; y++) @@ -1581,29 +1572,28 @@ inline int HoughCircleEstimateRadiusInvoker<NZPointSet>::filterCircles(const Poi float dy2 = dy * dy; int x = xOuter.start; -#if CV_SIMD128 +#if CV_SIMD { - const v_float32x4 v_dy2 = v_setall_f32(dy2); - const v_uint32x4 v_zero_u32 = v_setall_u32(0); - float CV_DECL_ALIGNED(16) rbuf[4]; - for (; x <= xOuter.end - 4; x += numSIMDPoints) + const v_float32 v_dy2 = vx_setall_f32(dy2); + const v_uint32 v_zero_u32 = vx_setall_u32(0); + float CV_DECL_ALIGNED(CV_SIMD_WIDTH) rbuf[v_float32::nlanes]; + int CV_DECL_ALIGNED(CV_SIMD_WIDTH) rmask[v_int32::nlanes]; + for (; x <= xOuter.end - v_float32::nlanes; x += v_float32::nlanes) { - v_uint32x4 v_mask = v_load_expand_q(ptr + x); + v_uint32 v_mask = vx_load_expand_q(ptr + x); v_mask = v_mask != v_zero_u32; - v_float32x4 v_x = v_cvt_f32(v_setall_s32(x)); - v_float32x4 v_dx = v_x - v_curCenterX_0123; + v_float32 v_x = v_cvt_f32(vx_setall_s32(x)); + v_float32 v_dx = v_x - v_curCenterX_0123; - v_float32x4 v_r2 = (v_dx * v_dx) + v_dy2; - v_float32x4 vmask = (v_minRadius2 <= v_r2) & (v_r2 <= v_maxRadius2) & v_reinterpret_as_f32(v_mask); - unsigned int mask = v_signmask(vmask); - if (mask) + v_float32 v_r2 = (v_dx * v_dx) + v_dy2; + v_float32 vmask = (v_minRadius2 <= v_r2) & (v_r2 <= v_maxRadius2) & v_reinterpret_as_f32(v_mask); + if 
(v_check_any(vmask)) { + v_store_aligned(rmask, v_reinterpret_as_s32(vmask)); v_store_aligned(rbuf, v_r2); - if (mask & 1) ddata[nzCount++] = rbuf[0]; - if (mask & 2) ddata[nzCount++] = rbuf[1]; - if (mask & 4) ddata[nzCount++] = rbuf[2]; - if (mask & 8) ddata[nzCount++] = rbuf[3]; + for (int i = 0; i < v_int32::nlanes; ++i) + if (rmask[i]) ddata[nzCount++] = rbuf[i]; } } } diff --git a/modules/imgproc/src/templmatch.cpp b/modules/imgproc/src/templmatch.cpp index 27f38a833b77..e84d9c961174 100644 --- a/modules/imgproc/src/templmatch.cpp +++ b/modules/imgproc/src/templmatch.cpp @@ -564,7 +564,6 @@ static bool ocl_matchTemplate( InputArray _img, InputArray _templ, OutputArray _ #include "opencv2/core/hal/hal.hpp" void crossCorr( const Mat& img, const Mat& _templ, Mat& corr, - Size corrsize, int ctype, Point anchor, double delta, int borderType ) { const double blockScale = 4.5; @@ -574,7 +573,7 @@ void crossCorr( const Mat& img, const Mat& _templ, Mat& corr, Mat templ = _templ; int depth = img.depth(), cn = img.channels(); int tdepth = templ.depth(), tcn = templ.channels(); - int cdepth = CV_MAT_DEPTH(ctype), ccn = CV_MAT_CN(ctype); + int cdepth = corr.depth(), ccn = corr.channels(); CV_Assert( img.dims <= 2 && templ.dims <= 2 && corr.dims <= 2 ); @@ -585,13 +584,11 @@ void crossCorr( const Mat& img, const Mat& _templ, Mat& corr, } CV_Assert( depth == tdepth || tdepth == CV_32F); - CV_Assert( corrsize.height <= img.rows + templ.rows - 1 && - corrsize.width <= img.cols + templ.cols - 1 ); + CV_Assert( corr.rows <= img.rows + templ.rows - 1 && + corr.cols <= img.cols + templ.cols - 1 ); CV_Assert( ccn == 1 || delta == 0 ); - corr.create(corrsize, ctype); - int maxDepth = depth > CV_8S ? CV_64F : std::max(std::max(CV_32F, tdepth), cdepth); Size blocksize, dftsize; @@ -815,8 +812,8 @@ static void matchTemplateMask( InputArray _img, InputArray _templ, OutputArray _ Mat mask2_templ = templ.mul(mask2); Mat corr(corrSize, CV_32F); - crossCorr( img, mask2_templ, corr, corr.size(), corr.type(), Point(0,0), 0, 0 ); - crossCorr( img2, mask, result, result.size(), result.type(), Point(0,0), 0, 0 ); + crossCorr( img, mask2_templ, corr, Point(0,0), 0, 0 ); + crossCorr( img2, mask, result, Point(0,0), 0, 0 ); result -= corr * 2; result += templSum2; @@ -830,8 +827,8 @@ static void matchTemplateMask( InputArray _img, InputArray _templ, OutputArray _ } Mat corr(corrSize, CV_32F); - crossCorr( img2, mask2, corr, corr.size(), corr.type(), Point(0,0), 0, 0 ); - crossCorr( img, mask_templ, result, result.size(), result.type(), Point(0,0), 0, 0 ); + crossCorr( img2, mask2, corr, Point(0,0), 0, 0 ); + crossCorr( img, mask_templ, result, Point(0,0), 0, 0 ); sqrt(corr, corr); result = result.mul(1/corr); @@ -1125,7 +1122,7 @@ void cv::matchTemplate( InputArray _img, InputArray _templ, OutputArray _result, CV_IPP_RUN_FAST(ipp_matchTemplate(img, templ, result, method)) - crossCorr( img, templ, result, result.size(), result.type(), Point(0,0), 0, 0); + crossCorr( img, templ, result, Point(0,0), 0, 0); common_matchTemplate(img, templ, result, method, cn); } diff --git a/modules/imgproc/test/test_connectedcomponents.cpp b/modules/imgproc/test/test_connectedcomponents.cpp index abd6fd43b461..3817f6d172a4 100644 --- a/modules/imgproc/test/test_connectedcomponents.cpp +++ b/modules/imgproc/test/test_connectedcomponents.cpp @@ -136,4 +136,18 @@ void CV_ConnectedComponentsTest::run( int /* start_from */) TEST(Imgproc_ConnectedComponents, regression) { CV_ConnectedComponentsTest test; test.safe_run(); } 
+TEST(Imgproc_ConnectedComponents, grana_buffer_overflow) +{ + cv::Mat darkMask; + darkMask.create(31, 87, CV_8U); + darkMask = 0; + + cv::Mat labels; + cv::Mat stats; + cv::Mat centroids; + + int nbComponents = cv::connectedComponentsWithStats(darkMask, labels, stats, centroids, 8, CV_32S, cv::CCL_GRANA); + EXPECT_EQ(1, nbComponents); +} + }} // namespace diff --git a/samples/python/tutorial_code/ImgTrans/LaPlace/laplace_demo.py b/samples/python/tutorial_code/ImgTrans/LaPlace/laplace_demo.py index a90af4da1db9..e3b13ca2e6ca 100644 --- a/samples/python/tutorial_code/ImgTrans/LaPlace/laplace_demo.py +++ b/samples/python/tutorial_code/ImgTrans/LaPlace/laplace_demo.py @@ -40,7 +40,7 @@ def main(argv): # [laplacian] # Apply Laplace function - dst = cv.Laplacian(src_gray, ddepth, kernel_size) + dst = cv.Laplacian(src_gray, ddepth, ksize=kernel_size) # [laplacian] # [convert]
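
Note on the SIMD rewrite above: the contours.cpp and hough.cpp hunks drop the fixed-width v_uint8x16/v_float32x4 types and the v_signmask/trailingZeros32 bit bookkeeping in favour of width-agnostic universal intrinsics (vx_load, v_check_any, v_scan_forward), so a single loop serves 128-, 256- and 512-bit builds and the old "#if CV_SIMD_WIDTH > 16" pre-loops disappear. The sketch below shows that scan pattern in isolation; the helper name firstNonZero and its standalone setup are illustrative only and are not part of this patch.

// Minimal sketch (not part of the patch): find the first non-zero byte with
// width-agnostic universal intrinsics, the same pattern used by
// findStartContourPoint()/cvFindNextContour() after this change.
#include <opencv2/core.hpp>
#include <opencv2/core/hal/intrin.hpp>

using namespace cv;

static int firstNonZero(const uchar* data, int len)   // hypothetical helper
{
    int j = 0;
#if CV_SIMD
    v_uint8 v_zero = vx_setzero_u8();
    for (; j <= len - v_uint8::nlanes; j += v_uint8::nlanes)
    {
        // Lane-wise compare: every non-zero input lane becomes all-ones in vmask.
        v_uint8 vmask = (vx_load(data + j) != v_zero);
        if (v_check_any(vmask))
            return j + v_scan_forward(vmask);  // index of the first set lane
    }
#endif
    for (; j < len; ++j)                       // scalar tail
        if (data[j])
            return j;
    return len;                                // no non-zero byte found
}

Because v_uint8::nlanes reflects whatever CV_SIMD_WIDTH the translation unit is compiled with, the same source covers SSE, AVX2 and AVX-512 without the per-width special cases the previous code carried.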