Configure QD8_F32 8-bit AVX GEMMs to match 4-bit GEMM sizes
PiperOrigin-RevId: 657768155
fbarchard authored and xnnpack-bot committed Jul 30, 2024
1 parent ed5850c commit d3806ed
Showing 4 changed files with 365 additions and 29 deletions.
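In brief: this commit widens the QD8_F32 QC8W AVX2 GEMM and IGEMM microkernels from a 3-row to a 4-row tile (mr <= 3 becomes mr <= 4), so the 8-bit kernels use the same tile sizes as the 4-bit kernels. Each kernel gains a fourth input/output row pair (a3/c3), a fourth bank of int32 accumulators (vacc3x01..vacc3x67), and, in the GEMM, a fourth per-row zero point and scale from quantization_params[3]; the IGEMM now steps its indirection buffer by 4 pointers per iteration instead of 3.

The new a3/c3 setup follows the usual XNNPACK row-clamping pattern: each successive row pointer is derived from the previous one by a stride, then aliased back to the previous row when mr is too small, so out-of-range rows are computed redundantly but never touch memory out of bounds. Below is a minimal standalone sketch of that pattern; the helper name setup_rows and the fixed tile height of 4 are illustrative, not part of this commit.

#include <stddef.h>
#include <stdint.h>

/* Sketch of the mr-clamping pattern that the diff extends to a 4th row. */
static void setup_rows(const int8_t* a0, float* c0, size_t mr,
                       size_t a_stride, size_t cm_stride,
                       const int8_t* a[4], float* c[4]) {
  a[0] = a0;
  c[0] = c0;
  for (size_t i = 1; i < 4; i++) {
    // Advance by one row, then clamp: rows at index >= mr alias the previous
    // row, mirroring "if XNN_UNPREDICTABLE(mr != 4) { a3 = a2; c3 = c2; }".
    a[i] = (const int8_t*) ((uintptr_t) a[i - 1] + a_stride);
    c[i] = (float*) ((uintptr_t) c[i - 1] + cm_stride);
    if (i >= mr) {
      a[i] = a[i - 1];
      c[i] = c[i - 1];
    }
  }
}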
93 changes: 86 additions & 7 deletions src/amalgam/gen/avx2.c
@@ -5789,7 +5789,7 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x8c8__avx2(
} while (nc != 0);
}

- void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x8c8__avx2(
+ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x8c8__avx2(
size_t mr,
size_t nc,
size_t kc,
@@ -5803,7 +5803,7 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x8c8__avx2(
const struct xnn_qd8_quantization_params quantization_params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
assert(mr != 0);
- assert(mr <= 3);
+ assert(mr <= 4);
assert(nc != 0);
assert(kc != 0);
assert(kc % sizeof(int8_t) == 0);
@@ -5826,6 +5826,12 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x8c8__avx2(
a2 = a1;
c2 = c1;
}
const int8_t* a3 = (const int8_t*) ((uintptr_t) a2 + a_stride);
float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
if XNN_UNPREDICTABLE(mr != 4) {
a3 = a2;
c3 = c2;
}

do {
const __m128i vinit0 = _mm_cvtsi32_si128(((const int*) w)[0]);
@@ -5855,6 +5861,11 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x8c8__avx2(
__m256i vacc2x23 = _mm256_mullo_epi32(vinit23, vinput_zero_point2);
__m256i vacc2x45 = _mm256_mullo_epi32(vinit45, vinput_zero_point2);
__m256i vacc2x67 = _mm256_mullo_epi32(vinit67, vinput_zero_point2);
const __m256i vinput_zero_point3 = _mm256_set1_epi32((int) quantization_params[3].zero_point);
__m256i vacc3x01 = _mm256_mullo_epi32(vinit01, vinput_zero_point3);
__m256i vacc3x23 = _mm256_mullo_epi32(vinit23, vinput_zero_point3);
__m256i vacc3x45 = _mm256_mullo_epi32(vinit45, vinput_zero_point3);
__m256i vacc3x67 = _mm256_mullo_epi32(vinit67, vinput_zero_point3);
w = (const int32_t*) w + 8;

size_t k = kc;
@@ -5869,27 +5880,34 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x8c8__avx2(
const __m128i va2 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a2));
const __m256i vxa2 = _mm256_cvtepi8_epi16(va2);
a2 += 8;
const __m128i va3 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a3));
const __m256i vxa3 = _mm256_cvtepi8_epi16(va3);
a3 += 8;

const __m256i vxb01 = _mm256_cvtepi8_epi16(_mm_load_si128((const __m128i*) w));

vacc0x01 = _mm256_add_epi32(vacc0x01, _mm256_madd_epi16(vxa0, vxb01));
vacc1x01 = _mm256_add_epi32(vacc1x01, _mm256_madd_epi16(vxa1, vxb01));
vacc2x01 = _mm256_add_epi32(vacc2x01, _mm256_madd_epi16(vxa2, vxb01));
vacc3x01 = _mm256_add_epi32(vacc3x01, _mm256_madd_epi16(vxa3, vxb01));
const __m256i vxb23 = _mm256_cvtepi8_epi16(_mm_load_si128((const __m128i*) ((const int8_t*) w + 16)));

vacc0x23 = _mm256_add_epi32(vacc0x23, _mm256_madd_epi16(vxa0, vxb23));
vacc1x23 = _mm256_add_epi32(vacc1x23, _mm256_madd_epi16(vxa1, vxb23));
vacc2x23 = _mm256_add_epi32(vacc2x23, _mm256_madd_epi16(vxa2, vxb23));
vacc3x23 = _mm256_add_epi32(vacc3x23, _mm256_madd_epi16(vxa3, vxb23));
const __m256i vxb45 = _mm256_cvtepi8_epi16(_mm_load_si128((const __m128i*) ((const int8_t*) w + 32)));

vacc0x45 = _mm256_add_epi32(vacc0x45, _mm256_madd_epi16(vxa0, vxb45));
vacc1x45 = _mm256_add_epi32(vacc1x45, _mm256_madd_epi16(vxa1, vxb45));
vacc2x45 = _mm256_add_epi32(vacc2x45, _mm256_madd_epi16(vxa2, vxb45));
vacc3x45 = _mm256_add_epi32(vacc3x45, _mm256_madd_epi16(vxa3, vxb45));
const __m256i vxb67 = _mm256_cvtepi8_epi16(_mm_load_si128((const __m128i*) ((const int8_t*) w + 48)));

vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67));
vacc1x67 = _mm256_add_epi32(vacc1x67, _mm256_madd_epi16(vxa1, vxb67));
vacc2x67 = _mm256_add_epi32(vacc2x67, _mm256_madd_epi16(vxa2, vxb67));
vacc3x67 = _mm256_add_epi32(vacc3x67, _mm256_madd_epi16(vxa3, vxb67));

w = (const int8_t*) w + 64;
k -= 8 * sizeof(int8_t);
@@ -5901,43 +5919,53 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x8c8__avx2(
const __m256i vacc1x4657 = _mm256_hadd_epi32(vacc1x45, vacc1x67);
const __m256i vacc2x0213 = _mm256_hadd_epi32(vacc2x01, vacc2x23);
const __m256i vacc2x4657 = _mm256_hadd_epi32(vacc2x45, vacc2x67);
const __m256i vacc3x0213 = _mm256_hadd_epi32(vacc3x01, vacc3x23);
const __m256i vacc3x4657 = _mm256_hadd_epi32(vacc3x45, vacc3x67);

const __m256i vacc0x02461357 = _mm256_hadd_epi32(vacc0x0213, vacc0x4657);
const __m256i vacc1x02461357 = _mm256_hadd_epi32(vacc1x0213, vacc1x4657);
const __m256i vacc2x02461357 = _mm256_hadd_epi32(vacc2x0213, vacc2x4657);
const __m256i vacc3x02461357 = _mm256_hadd_epi32(vacc3x0213, vacc3x4657);

const __m256i vpermute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
__m256i vacc0x01234567 = _mm256_permutevar8x32_epi32(vacc0x02461357, vpermute_mask);
__m256i vacc1x01234567 = _mm256_permutevar8x32_epi32(vacc1x02461357, vpermute_mask);
__m256i vacc2x01234567 = _mm256_permutevar8x32_epi32(vacc2x02461357, vpermute_mask);
__m256i vacc3x01234567 = _mm256_permutevar8x32_epi32(vacc3x02461357, vpermute_mask);

__m256 vout0x01234567 = _mm256_cvtepi32_ps(vacc0x01234567);
const __m256 vinput_scale0 = _mm256_broadcast_ss(&quantization_params[0].inv_scale);
__m256 vout1x01234567 = _mm256_cvtepi32_ps(vacc1x01234567);
const __m256 vinput_scale1 = _mm256_broadcast_ss(&quantization_params[1].inv_scale);
__m256 vout2x01234567 = _mm256_cvtepi32_ps(vacc2x01234567);
const __m256 vinput_scale2 = _mm256_broadcast_ss(&quantization_params[2].inv_scale);
__m256 vout3x01234567 = _mm256_cvtepi32_ps(vacc3x01234567);
const __m256 vinput_scale3 = _mm256_broadcast_ss(&quantization_params[3].inv_scale);

vout0x01234567 = _mm256_mul_ps(vout0x01234567, vinput_scale0);
vout1x01234567 = _mm256_mul_ps(vout1x01234567, vinput_scale1);
vout2x01234567 = _mm256_mul_ps(vout2x01234567, vinput_scale2);
vout3x01234567 = _mm256_mul_ps(vout3x01234567, vinput_scale3);

const __m256 vfilter_output_scale01234567 = _mm256_load_ps((const float*) w);
const __m256 vbias01234567 = _mm256_load_ps((const float*) w + 8);
w = (const float*) w + 16;
vout0x01234567 = _mm256_fmadd_ps(vout0x01234567, vfilter_output_scale01234567, vbias01234567);
vout1x01234567 = _mm256_fmadd_ps(vout1x01234567, vfilter_output_scale01234567, vbias01234567);
vout2x01234567 = _mm256_fmadd_ps(vout2x01234567, vfilter_output_scale01234567, vbias01234567);
vout3x01234567 = _mm256_fmadd_ps(vout3x01234567, vfilter_output_scale01234567, vbias01234567);

const __m256 vmin = _mm256_load_ps(params->avx.min);
vout0x01234567 = _mm256_max_ps(vout0x01234567, vmin);
vout1x01234567 = _mm256_max_ps(vout1x01234567, vmin);
vout2x01234567 = _mm256_max_ps(vout2x01234567, vmin);
vout3x01234567 = _mm256_max_ps(vout3x01234567, vmin);

const __m256 vmax = _mm256_load_ps(params->avx.max);
vout0x01234567 = _mm256_min_ps(vout0x01234567, vmax);
vout1x01234567 = _mm256_min_ps(vout1x01234567, vmax);
vout2x01234567 = _mm256_min_ps(vout2x01234567, vmax);
vout3x01234567 = _mm256_min_ps(vout3x01234567, vmax);

if XNN_LIKELY(nc >= 8) {
_mm256_storeu_ps(c0, vout0x01234567);
@@ -5946,46 +5974,57 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x8c8__avx2(
c1 = (float*) ((uintptr_t) c1 + cn_stride);
_mm256_storeu_ps(c2, vout2x01234567);
c2 = (float*) ((uintptr_t) c2 + cn_stride);
_mm256_storeu_ps(c3, vout3x01234567);
c3 = (float*) ((uintptr_t) c3 + cn_stride);

a0 = (const int8_t*) ((uintptr_t) a0 - kc);
a1 = (const int8_t*) ((uintptr_t) a1 - kc);
a2 = (const int8_t*) ((uintptr_t) a2 - kc);
a3 = (const int8_t*) ((uintptr_t) a3 - kc);

nc -= 8;
} else {
__m128 vout0x0123 = _mm256_castps256_ps128(vout0x01234567);
__m128 vout1x0123 = _mm256_castps256_ps128(vout1x01234567);
__m128 vout2x0123 = _mm256_castps256_ps128(vout2x01234567);
__m128 vout3x0123 = _mm256_castps256_ps128(vout3x01234567);
if (nc & 4) {
_mm_storeu_ps(c0, vout0x0123);
_mm_storeu_ps(c1, vout1x0123);
_mm_storeu_ps(c2, vout2x0123);
_mm_storeu_ps(c3, vout3x0123);

vout0x0123 = _mm256_extractf128_ps(vout0x01234567, 1);
vout1x0123 = _mm256_extractf128_ps(vout1x01234567, 1);
vout2x0123 = _mm256_extractf128_ps(vout2x01234567, 1);
vout3x0123 = _mm256_extractf128_ps(vout3x01234567, 1);

c0 += 4;
c1 += 4;
c2 += 4;
c3 += 4;
}
if (nc & 2) {
_mm_storel_pi((__m64*) c0, vout0x0123);
_mm_storel_pi((__m64*) c1, vout1x0123);
_mm_storel_pi((__m64*) c2, vout2x0123);
_mm_storel_pi((__m64*) c3, vout3x0123);

vout0x0123 = _mm_movehl_ps(vout0x0123, vout0x0123);
vout1x0123 = _mm_movehl_ps(vout1x0123, vout1x0123);
vout2x0123 = _mm_movehl_ps(vout2x0123, vout2x0123);
vout3x0123 = _mm_movehl_ps(vout3x0123, vout3x0123);

c0 += 2;
c1 += 2;
c2 += 2;
c3 += 2;
}
if (nc & 1) {
_mm_store_ss(c0, vout0x0123);
_mm_store_ss(c1, vout1x0123);
_mm_store_ss(c2, vout2x0123);
_mm_store_ss(c3, vout3x0123);
}
nc = 0;
}
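For reference, each vaccNxXY register above holds four int32 partial sums for output column X in its low 128-bit lane and four for column Y in its high lane. The two _mm256_hadd_epi32 passes collapse these to one sum per column, leaving the columns ordered 0,2,4,6 in the low lane and 1,3,5,7 in the high lane, and the cross-lane _mm256_permutevar8x32_epi32 restores natural order 0..7 before the float epilogue (scale by the per-row input inv_scale, fused multiply-add with the per-channel filter scale and bias, then clamp to [min, max]). Below is a standalone AVX2 demo of the lane bookkeeping, with made-up values chosen so that column c sums to c; it assumes an AVX2 toolchain (e.g. compile with -mavx2) and is not part of this commit.

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  // Fake partial sums: the four partials of column c are c, 0, 0, 0.
  __m256i v01 = _mm256_setr_epi32(0, 0, 0, 0, 1, 0, 0, 0);  // columns 0 | 1
  __m256i v23 = _mm256_setr_epi32(2, 0, 0, 0, 3, 0, 0, 0);  // columns 2 | 3
  __m256i v45 = _mm256_setr_epi32(4, 0, 0, 0, 5, 0, 0, 0);  // columns 4 | 5
  __m256i v67 = _mm256_setr_epi32(6, 0, 0, 0, 7, 0, 0, 0);  // columns 6 | 7

  // In-lane pairwise adds: slot ownership becomes columns 0,0,2,2 | 1,1,3,3
  // (and 4,4,6,6 | 5,5,7,7), hence the 0213/4657 names in the kernel.
  const __m256i v0213 = _mm256_hadd_epi32(v01, v23);
  const __m256i v4657 = _mm256_hadd_epi32(v45, v67);
  // Second pass: one sum per column, ordered 0,2,4,6 | 1,3,5,7.
  const __m256i v02461357 = _mm256_hadd_epi32(v0213, v4657);
  // Cross-lane permute (source indices 0,4,1,5,2,6,3,7, low element first)
  // restores column order 0..7.
  const __m256i vmask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
  const __m256i vout = _mm256_permutevar8x32_epi32(v02461357, vmask);

  int out[8];
  _mm256_storeu_si256((__m256i*) out, vout);
  for (int i = 0; i < 8; i++) {
    printf("%d ", out[i]);  // prints: 0 1 2 3 4 5 6 7
  }
  printf("\n");
  return 0;
}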
@@ -6135,7 +6174,7 @@ void xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x8c8__avx2(
} while (nc != 0);
}

- void xnn_qd8_f32_qc8w_igemm_minmax_ukernel_3x8c8__avx2(
+ void xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x8c8__avx2(
size_t mr,
size_t nc,
size_t kc,
@@ -6152,11 +6191,11 @@ void xnn_qd8_f32_qc8w_igemm_minmax_ukernel_3x8c8__avx2(
const struct xnn_qd8_quantization_params quantization_params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
assert(mr != 0);
- assert(mr <= 3);
+ assert(mr <= 4);
assert(nc != 0);
assert(kc != 0);
assert(ks != 0);
- assert(ks % (3 * sizeof(void*)) == 0);
+ assert(ks % (4 * sizeof(void*)) == 0);
assert(a_offset % sizeof(int8_t) == 0);
assert(a != NULL);
assert(w != NULL);
@@ -6172,6 +6211,10 @@ void xnn_qd8_f32_qc8w_igemm_minmax_ukernel_3x8c8__avx2(
if XNN_UNPREDICTABLE(mr <= 2) {
c2 = c1;
}
float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
if XNN_UNPREDICTABLE(mr != 4) {
c3 = c2;
}

const __m256i vinput_zero_point = _mm256_set1_epi32((int) quantization_params->zero_point);
const __m256 vinput_scale = _mm256_broadcast_ss(&quantization_params->inv_scale);
@@ -6200,6 +6243,10 @@ void xnn_qd8_f32_qc8w_igemm_minmax_ukernel_3x8c8__avx2(
__m256i vacc2x23 = vacc0x23;
__m256i vacc2x45 = vacc0x45;
__m256i vacc2x67 = vacc0x67;
__m256i vacc3x01 = vacc0x01;
__m256i vacc3x23 = vacc0x23;
__m256i vacc3x45 = vacc0x45;
__m256i vacc3x67 = vacc0x67;
w = (const int32_t*) w + 8;

size_t p = ks;
@@ -6222,7 +6269,13 @@ void xnn_qd8_f32_qc8w_igemm_minmax_ukernel_3x8c8__avx2(
} else {
a2 = zero_data;
}
- a += 3;
const int8_t* restrict a3 = a[3];
if XNN_UNPREDICTABLE(a3 != zero) {
a3 = (const int8_t*) ((uintptr_t) a3 + a_offset);
} else {
a3 = zero_data;
}
+ a += 4;

size_t k = 0;
while (k < kc) {
@@ -6235,36 +6288,43 @@ void xnn_qd8_f32_qc8w_igemm_minmax_ukernel_3x8c8__avx2(
const __m128i va2 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a2));
const __m256i vxa2 = _mm256_cvtepi8_epi16(va2);
a2 += 8;
const __m128i va3 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a3));
const __m256i vxa3 = _mm256_cvtepi8_epi16(va3);
a3 += 8;

const __m128i vb01 = _mm_load_si128((const __m128i*) w);
const __m256i vxb01 = _mm256_cvtepi8_epi16(vb01);

vacc0x01 = _mm256_add_epi32(vacc0x01, _mm256_madd_epi16(vxa0, vxb01));
vacc1x01 = _mm256_add_epi32(vacc1x01, _mm256_madd_epi16(vxa1, vxb01));
vacc2x01 = _mm256_add_epi32(vacc2x01, _mm256_madd_epi16(vxa2, vxb01));
vacc3x01 = _mm256_add_epi32(vacc3x01, _mm256_madd_epi16(vxa3, vxb01));
const __m128i vb23 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 16));
const __m256i vxb23 = _mm256_cvtepi8_epi16(vb23);

vacc0x23 = _mm256_add_epi32(vacc0x23, _mm256_madd_epi16(vxa0, vxb23));
vacc1x23 = _mm256_add_epi32(vacc1x23, _mm256_madd_epi16(vxa1, vxb23));
vacc2x23 = _mm256_add_epi32(vacc2x23, _mm256_madd_epi16(vxa2, vxb23));
vacc3x23 = _mm256_add_epi32(vacc3x23, _mm256_madd_epi16(vxa3, vxb23));
const __m128i vb45 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 32));
const __m256i vxb45 = _mm256_cvtepi8_epi16(vb45);

vacc0x45 = _mm256_add_epi32(vacc0x45, _mm256_madd_epi16(vxa0, vxb45));
vacc1x45 = _mm256_add_epi32(vacc1x45, _mm256_madd_epi16(vxa1, vxb45));
vacc2x45 = _mm256_add_epi32(vacc2x45, _mm256_madd_epi16(vxa2, vxb45));
vacc3x45 = _mm256_add_epi32(vacc3x45, _mm256_madd_epi16(vxa3, vxb45));
const __m128i vb67 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 48));
const __m256i vxb67 = _mm256_cvtepi8_epi16(vb67);

vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67));
vacc1x67 = _mm256_add_epi32(vacc1x67, _mm256_madd_epi16(vxa1, vxb67));
vacc2x67 = _mm256_add_epi32(vacc2x67, _mm256_madd_epi16(vxa2, vxb67));
vacc3x67 = _mm256_add_epi32(vacc3x67, _mm256_madd_epi16(vxa3, vxb67));

w = (const void*) ((const int8_t*) w + 64);
k += 8 * sizeof(int8_t);
}
- p -= 3 * sizeof(void*);
+ p -= 4 * sizeof(void*);
} while (p != 0);
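A note on the indirection handling above: unlike the GEMM, which reads per-row quantization_params[0..3], the IGEMM applies a single input zero point and inv_scale to all rows, and it fetches its row pointers from the indirection buffer a, now four per iteration. Entries equal to the zero sentinel are redirected to zero_data so padding rows accumulate nothing, while real pointers are rebased by a_offset. A one-function sketch of that convention follows; resolve_row is an illustrative name, not part of this commit.

#include <stddef.h>
#include <stdint.h>

static const int8_t* resolve_row(const int8_t* p, const int8_t* zero,
                                 const int8_t* zero_data, size_t a_offset) {
  // Real rows are rebased by a_offset; the zero sentinel maps to the shared
  // zero buffer so padding taps contribute zeros to the accumulators.
  return (p != zero) ? (const int8_t*) ((uintptr_t) p + a_offset) : zero_data;
}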

const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23);
@@ -6273,42 +6333,53 @@ void xnn_qd8_f32_qc8w_igemm_minmax_ukernel_3x8c8__avx2(
const __m256i vacc1x4657 = _mm256_hadd_epi32(vacc1x45, vacc1x67);
const __m256i vacc2x0213 = _mm256_hadd_epi32(vacc2x01, vacc2x23);
const __m256i vacc2x4657 = _mm256_hadd_epi32(vacc2x45, vacc2x67);
const __m256i vacc3x0213 = _mm256_hadd_epi32(vacc3x01, vacc3x23);
const __m256i vacc3x4657 = _mm256_hadd_epi32(vacc3x45, vacc3x67);

const __m256i vacc0x02461357 = _mm256_hadd_epi32(vacc0x0213, vacc0x4657);
const __m256i vacc1x02461357 = _mm256_hadd_epi32(vacc1x0213, vacc1x4657);
const __m256i vacc2x02461357 = _mm256_hadd_epi32(vacc2x0213, vacc2x4657);
const __m256i vacc3x02461357 = _mm256_hadd_epi32(vacc3x0213, vacc3x4657);

const __m256i vpermute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
__m256i vacc0x01234567 = _mm256_permutevar8x32_epi32(vacc0x02461357, vpermute_mask);
__m256i vacc1x01234567 = _mm256_permutevar8x32_epi32(vacc1x02461357, vpermute_mask);
__m256i vacc2x01234567 = _mm256_permutevar8x32_epi32(vacc2x02461357, vpermute_mask);
__m256i vacc3x01234567 = _mm256_permutevar8x32_epi32(vacc3x02461357, vpermute_mask);

__m256 vout0x01234567 = _mm256_cvtepi32_ps(vacc0x01234567);
__m256 vout1x01234567 = _mm256_cvtepi32_ps(vacc1x01234567);
__m256 vout2x01234567 = _mm256_cvtepi32_ps(vacc2x01234567);
__m256 vout3x01234567 = _mm256_cvtepi32_ps(vacc3x01234567);

vout0x01234567 = _mm256_mul_ps(vout0x01234567, vinput_scale);
vout1x01234567 = _mm256_mul_ps(vout1x01234567, vinput_scale);
vout2x01234567 = _mm256_mul_ps(vout2x01234567, vinput_scale);
vout3x01234567 = _mm256_mul_ps(vout3x01234567, vinput_scale);

const __m256 vfilter_output_scale01234567 = _mm256_load_ps((const float*) w);
const __m256 vbias01234567 = _mm256_load_ps((const float*) w + 8);
w = (const float*) w + 16;
vout0x01234567 = _mm256_fmadd_ps(vout0x01234567, vfilter_output_scale01234567, vbias01234567);
vout1x01234567 = _mm256_fmadd_ps(vout1x01234567, vfilter_output_scale01234567, vbias01234567);
vout2x01234567 = _mm256_fmadd_ps(vout2x01234567, vfilter_output_scale01234567, vbias01234567);
vout3x01234567 = _mm256_fmadd_ps(vout3x01234567, vfilter_output_scale01234567, vbias01234567);

const __m256 vmin = _mm256_load_ps(params->avx.min);
vout0x01234567 = _mm256_max_ps(vout0x01234567, vmin);
vout1x01234567 = _mm256_max_ps(vout1x01234567, vmin);
vout2x01234567 = _mm256_max_ps(vout2x01234567, vmin);
vout3x01234567 = _mm256_max_ps(vout3x01234567, vmin);

const __m256 vmax = _mm256_load_ps(params->avx.max);
vout0x01234567 = _mm256_min_ps(vout0x01234567, vmax);
vout1x01234567 = _mm256_min_ps(vout1x01234567, vmax);
vout2x01234567 = _mm256_min_ps(vout2x01234567, vmax);
vout3x01234567 = _mm256_min_ps(vout3x01234567, vmax);

if XNN_LIKELY(nc >= 8) {
_mm256_storeu_ps(c3, vout3x01234567);
c3 = (float*) ((uintptr_t) c3 + cn_stride);
_mm256_storeu_ps(c2, vout2x01234567);
c2 = (float*) ((uintptr_t) c2 + cn_stride);
_mm256_storeu_ps(c1, vout1x01234567);
@@ -6319,36 +6390,44 @@ void xnn_qd8_f32_qc8w_igemm_minmax_ukernel_3x8c8__avx2(
a = (const int8_t**restrict) ((uintptr_t) a - ks);
nc -= 8;
} else {
__m128 vout3x0123 = _mm256_castps256_ps128(vout3x01234567);
__m128 vout2x0123 = _mm256_castps256_ps128(vout2x01234567);
__m128 vout1x0123 = _mm256_castps256_ps128(vout1x01234567);
__m128 vout0x0123 = _mm256_castps256_ps128(vout0x01234567);
if (nc & 4) {
_mm_storeu_ps(c3, vout3x0123);
_mm_storeu_ps(c2, vout2x0123);
_mm_storeu_ps(c1, vout1x0123);
_mm_storeu_ps(c0, vout0x0123);

vout3x0123 = _mm256_extractf128_ps(vout3x01234567, 1);
vout2x0123 = _mm256_extractf128_ps(vout2x01234567, 1);
vout1x0123 = _mm256_extractf128_ps(vout1x01234567, 1);
vout0x0123 = _mm256_extractf128_ps(vout0x01234567, 1);

c3 += 4;
c2 += 4;
c1 += 4;
c0 += 4;
}
if (nc & 2) {
_mm_storel_pi((__m64*) c3, vout3x0123);
_mm_storel_pi((__m64*) c2, vout2x0123);
_mm_storel_pi((__m64*) c1, vout1x0123);
_mm_storel_pi((__m64*) c0, vout0x0123);

vout3x0123 = _mm_movehl_ps(vout3x0123, vout3x0123);
vout2x0123 = _mm_movehl_ps(vout2x0123, vout2x0123);
vout1x0123 = _mm_movehl_ps(vout1x0123, vout1x0123);
vout0x0123 = _mm_movehl_ps(vout0x0123, vout0x0123);

c3 += 2;
c2 += 2;
c1 += 2;
c0 += 2;
}
if (nc & 1) {
_mm_store_ss(c3, vout3x0123);
_mm_store_ss(c2, vout2x0123);
_mm_store_ss(c1, vout1x0123);
_mm_store_ss(c0, vout0x0123);