From d4595c8cfd9d34bbb26326c4fb9f1f139442c9e1 Mon Sep 17 00:00:00 2001 From: songkuangshi Date: Fri, 22 Mar 2019 14:24:05 +0800 Subject: [PATCH] fix avx align error --- LightCTR/common/avx.h | 20 ++++++++++---------- LightCTR/common/time.h | 2 +- build_ring.sh | 2 +- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/LightCTR/common/avx.h b/LightCTR/common/avx.h index 9aef337..6dc467c 100644 --- a/LightCTR/common/avx.h +++ b/LightCTR/common/avx.h @@ -22,7 +22,7 @@ inline void avx_vecAdd(const float* x, const float* y, float* res, size_t len) { if (len > 7) { for (; len > 7; len -= 8) { __m256 t = _mm256_add_ps(_mm256_loadu_ps(x), _mm256_loadu_ps(y)); - _mm256_store_ps(res, t); + _mm256_storeu_ps(res, t); x += 8; y += 8; res += 8; @@ -42,7 +42,7 @@ inline void avx_vecAdd(const float* x, const float const_delta, float* res, size if (len > 7) { for (; len > 7; len -= 8) { __m256 t = _mm256_add_ps(_mm256_loadu_ps(x), delta); - _mm256_store_ps(res, t); + _mm256_storeu_ps(res, t); x += 8; res += 8; } @@ -62,7 +62,7 @@ inline void avx_vecScalerAdd(const float* x, const float* y, float* res, for (; len > 7; len -= 8) { __m256 t = _mm256_add_ps(_mm256_loadu_ps(x), _mm256_mul_ps(_mm256_loadu_ps(y), _scalar)); - _mm256_store_ps(res, t); + _mm256_storeu_ps(res, t); x += 8; y += 8; res += 8; @@ -83,7 +83,7 @@ inline void avx_vecScalerAdd(const float* x, const float* y, float* res, __m256 t = _mm256_add_ps(_mm256_loadu_ps(x), _mm256_mul_ps(_mm256_loadu_ps(y), _mm256_loadu_ps(y_scalar))); - _mm256_store_ps(res, t); + _mm256_storeu_ps(res, t); x += 8; y += 8; y_scalar += 8; @@ -162,7 +162,7 @@ inline void avx_vecSqrt(const float* x, float *res, size_t len) { if (len > 7) { for (; len > 7; len -= 8) { __m256 t = _mm256_sqrt_ps(_mm256_loadu_ps(x)); - _mm256_store_ps(res, t); + _mm256_storeu_ps(res, t); x += 8; res += 8; } @@ -178,7 +178,7 @@ inline void avx_vecRsqrt(const float* x, float *res, size_t len) { if (len > 7) { for (; len > 7; len -= 8) { __m256 t = _mm256_rsqrt_ps(_mm256_loadu_ps(x)); - _mm256_store_ps(res, t); + _mm256_storeu_ps(res, t); x += 8; res += 8; } @@ -194,7 +194,7 @@ inline void avx_vecRcp(const float* x, float *res, size_t len) { if (len > 7) { for (; len > 7; len -= 8) { __m256 t = _mm256_rcp_ps(_mm256_loadu_ps(x)); - _mm256_store_ps(res, t); + _mm256_storeu_ps(res, t); x += 8; res += 8; } @@ -211,7 +211,7 @@ inline void avx_vecScale(const float* x, float *res, size_t len, const float sca if (len > 7) { for (; len > 7; len -= 8) { __m256 t = _mm256_mul_ps(_mm256_loadu_ps(x), _scalar); - _mm256_store_ps(res, t); + _mm256_storeu_ps(res, t); x += 8; res += 8; } @@ -227,7 +227,7 @@ inline void avx_vecScale(const float* x, float *res, size_t len, const float* sc if (len > 7) { for (; len > 7; len -= 8) { __m256 t = _mm256_mul_ps(_mm256_loadu_ps(x), _mm256_loadu_ps(scalar)); - _mm256_store_ps(res, t); + _mm256_storeu_ps(res, t); x += 8; scalar += 8; res += 8; @@ -245,7 +245,7 @@ inline void avx_vecDiv(const float* x, const float* y, float* res, size_t len) { if (len > 7) { for (; len > 7; len -= 8) { __m256 t = _mm256_div_ps(_mm256_loadu_ps(x), _mm256_loadu_ps(y)); - _mm256_store_ps(res, t); + _mm256_storeu_ps(res, t); x += 8; y += 8; res += 8; diff --git a/LightCTR/common/time.h b/LightCTR/common/time.h index 1888339..dd15528 100644 --- a/LightCTR/common/time.h +++ b/LightCTR/common/time.h @@ -42,7 +42,7 @@ inline time_t __must_inline__ get_now_s(void) { inline time_t __must_inline__ gettickspan(uint64_t old_tick = get_now_ms()) { update_tv(); - int64_t cur_tick = get_now_ms(); + uint64_t cur_tick = get_now_ms(); if (old_tick > cur_tick) { return 0; } diff --git a/build_ring.sh b/build_ring.sh index 2f66b0b..f12146b 100755 --- a/build_ring.sh +++ b/build_ring.sh @@ -20,7 +20,7 @@ wait echo echo echo "[Build Success]" -echo "Please copy different BIN file to corresponding machine, DON'T forget expert LightCTR_PS_NUM, LightCTR_WORKER_NUM and LightCTR_MASTER_ADDR, run Master first" +echo "Please copy different BIN file to corresponding machine, DON'T forget expert LightCTR_WORKER_NUM and LightCTR_MASTER_ADDR, run Master first" echo echo "[or] Press any key to run clunster on standalone mode" read -n 1