Configure QD8_F32 8-bit AVX GEMMs to match 4-bit GEMM sizes
PiperOrigin-RevId: 657768155
fbarchard authored and xnnpack-bot committed Jul 30, 2024
1 parent ed5850c commit d3806ed
Showing 4 changed files with 365 additions and 29 deletions.
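In brief: this commit widens the QD8_F32 QC8W AVX2 GEMM and IGEMM microkernels from a 3-row to a 4-row tile (mr <= 3 becomes mr <= 4), so the 8-bit kernels use the same tile sizes as the 4-bit kernels. Each kernel gains a fourth input/output row pair (a3/c3), a fourth bank of int32 accumulators (vacc3x01..vacc3x67), and, in the GEMM, a fourth per-row zero point and scale from quantization_params[3]; the IGEMM now steps its indirection buffer by 4 pointers per iteration instead of 3.

The new a3/c3 setup follows the usual XNNPACK row-clamping pattern: each successive row pointer is derived from the previous one by a stride, then aliased back to the previous row when mr is too small, so out-of-range rows are computed redundantly but never touch memory out of bounds. Below is a minimal standalone sketch of that pattern; the helper name setup_rows and the fixed tile height of 4 are illustrative, not part of this commit.

#include <stddef.h>
#include <stdint.h>

/* Sketch of the mr-clamping pattern that the diff extends to a 4th row. */
static void setup_rows(const int8_t* a0, float* c0, size_t mr,
                       size_t a_stride, size_t cm_stride,
                       const int8_t* a[4], float* c[4]) {
  a[0] = a0;
  c[0] = c0;
  for (size_t i = 1; i < 4; i++) {
    // Advance by one row, then clamp: rows at index >= mr alias the previous
    // row, mirroring "if XNN_UNPREDICTABLE(mr != 4) { a3 = a2; c3 = c2; }".
    a[i] = (const int8_t*) ((uintptr_t) a[i - 1] + a_stride);
    c[i] = (float*) ((uintptr_t) c[i - 1] + cm_stride);
    if (i >= mr) {
      a[i] = a[i - 1];
      c[i] = c[i - 1];
    }
  }
}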
93 changes: 86 additions & 7 deletions src/amalgam/gen/avx2.c
@@ -5789,7 +5789,7 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_1x8c8__avx2(
} while (nc != 0);
}

- void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x8c8__avx2(
+ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_4x8c8__avx2(
size_t mr,
size_t nc,
size_t kc,
@@ -5803,7 +5803,7 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x8c8__avx2(
const struct xnn_qd8_quantization_params quantization_params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
assert(mr != 0);
- assert(mr <= 3);
+ assert(mr <= 4);
assert(nc != 0);
assert(kc != 0);
assert(kc % sizeof(int8_t) == 0);
@@ -5826,6 +5826,12 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x8c8__avx2(
a2 = a1;
c2 = c1;
}
const int8_t* a3 = (const int8_t*) ((uintptr_t) a2 + a_stride);
float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
if XNN_UNPREDICTABLE(mr != 4) {
a3 = a2;
c3 = c2;
}

do {
const __m128i vinit0 = _mm_cvtsi32_si128(((const int*) w)[0]);
@@ -5855,6 +5861,11 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x8c8__avx2(
__m256i vacc2x23 = _mm256_mullo_epi32(vinit23, vinput_zero_point2);
__m256i vacc2x45 = _mm256_mullo_epi32(vinit45, vinput_zero_point2);
__m256i vacc2x67 = _mm256_mullo_epi32(vinit67, vinput_zero_point2);
const __m256i vinput_zero_point3 = _mm256_set1_epi32((int) quantization_params[3].zero_point);
__m256i vacc3x01 = _mm256_mullo_epi32(vinit01, vinput_zero_point3);
__m256i vacc3x23 = _mm256_mullo_epi32(vinit23, vinput_zero_point3);
__m256i vacc3x45 = _mm256_mullo_epi32(vinit45, vinput_zero_point3);
__m256i vacc3x67 = _mm256_mullo_epi32(vinit67, vinput_zero_point3);
w = (const int32_t*) w + 8;

size_t k = kc;
@@ -5869,27 +5880,34 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x8c8__avx2(
const __m128i va2 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a2));
const __m256i vxa2 = _mm256_cvtepi8_epi16(va2);
a2 += 8;
const __m128i va3 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a3));
const __m256i vxa3 = _mm256_cvtepi8_epi16(va3);
a3 += 8;

const __m256i vxb01 = _mm256_cvtepi8_epi16(_mm_load_si128((const __m128i*) w));

vacc0x01 = _mm256_add_epi32(vacc0x01, _mm256_madd_epi16(vxa0, vxb01));
vacc1x01 = _mm256_add_epi32(vacc1x01, _mm256_madd_epi16(vxa1, vxb01));
vacc2x01 = _mm256_add_epi32(vacc2x01, _mm256_madd_epi16(vxa2, vxb01));
vacc3x01 = _mm256_add_epi32(vacc3x01, _mm256_madd_epi16(vxa3, vxb01));
const __m256i vxb23 = _mm256_cvtepi8_epi16(_mm_load_si128((const __m128i*) ((const int8_t*) w + 16)));

vacc0x23 = _mm256_add_epi32(vacc0x23, _mm256_madd_epi16(vxa0, vxb23));
vacc1x23 = _mm256_add_epi32(vacc1x23, _mm256_madd_epi16(vxa1, vxb23));
vacc2x23 = _mm256_add_epi32(vacc2x23, _mm256_madd_epi16(vxa2, vxb23));
vacc3x23 = _mm256_add_epi32(vacc3x23, _mm256_madd_epi16(vxa3, vxb23));
const __m256i vxb45 = _mm256_cvtepi8_epi16(_mm_load_si128((const __m128i*) ((const int8_t*) w + 32)));

vacc0x45 = _mm256_add_epi32(vacc0x45, _mm256_madd_epi16(vxa0, vxb45));
vacc1x45 = _mm256_add_epi32(vacc1x45, _mm256_madd_epi16(vxa1, vxb45));
vacc2x45 = _mm256_add_epi32(vacc2x45, _mm256_madd_epi16(vxa2, vxb45));
vacc3x45 = _mm256_add_epi32(vacc3x45, _mm256_madd_epi16(vxa3, vxb45));
const __m256i vxb67 = _mm256_cvtepi8_epi16(_mm_load_si128((const __m128i*) ((const int8_t*) w + 48)));

vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67));
vacc1x67 = _mm256_add_epi32(vacc1x67, _mm256_madd_epi16(vxa1, vxb67));
vacc2x67 = _mm256_add_epi32(vacc2x67, _mm256_madd_epi16(vxa2, vxb67));
vacc3x67 = _mm256_add_epi32(vacc3x67, _mm256_madd_epi16(vxa3, vxb67));

w = (const int8_t*) w + 64;
k -= 8 * sizeof(int8_t);
@@ -5901,43 +5919,53 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x8c8__avx2(
const __m256i vacc1x4657 = _mm256_hadd_epi32(vacc1x45, vacc1x67);
const __m256i vacc2x0213 = _mm256_hadd_epi32(vacc2x01, vacc2x23);
const __m256i vacc2x4657 = _mm256_hadd_epi32(vacc2x45, vacc2x67);
const __m256i vacc3x0213 = _mm256_hadd_epi32(vacc3x01, vacc3x23);
const __m256i vacc3x4657 = _mm256_hadd_epi32(vacc3x45, vacc3x67);

const __m256i vacc0x02461357 = _mm256_hadd_epi32(vacc0x0213, vacc0x4657);
const __m256i vacc1x02461357 = _mm256_hadd_epi32(vacc1x0213, vacc1x4657);
const __m256i vacc2x02461357 = _mm256_hadd_epi32(vacc2x0213, vacc2x4657);
const __m256i vacc3x02461357 = _mm256_hadd_epi32(vacc3x0213, vacc3x4657);

const __m256i vpermute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
__m256i vacc0x01234567 = _mm256_permutevar8x32_epi32(vacc0x02461357, vpermute_mask);
__m256i vacc1x01234567 = _mm256_permutevar8x32_epi32(vacc1x02461357, vpermute_mask);
__m256i vacc2x01234567 = _mm256_permutevar8x32_epi32(vacc2x02461357, vpermute_mask);
__m256i vacc3x01234567 = _mm256_permutevar8x32_epi32(vacc3x02461357, vpermute_mask);

__m256 vout0x01234567 = _mm256_cvtepi32_ps(vacc0x01234567);
const __m256 vinput_scale0 = _mm256_broadcast_ss(&quantization_params[0].inv_scale);
__m256 vout1x01234567 = _mm256_cvtepi32_ps(vacc1x01234567);
const __m256 vinput_scale1 = _mm256_broadcast_ss(&quantization_params[1].inv_scale);
__m256 vout2x01234567 = _mm256_cvtepi32_ps(vacc2x01234567);
const __m256 vinput_scale2 = _mm256_broadcast_ss(&quantization_params[2].inv_scale);
__m256 vout3x01234567 = _mm256_cvtepi32_ps(vacc3x01234567);
const __m256 vinput_scale3 = _mm256_broadcast_ss(&quantization_params[3].inv_scale);

vout0x01234567 = _mm256_mul_ps(vout0x01234567, vinput_scale0);
vout1x01234567 = _mm256_mul_ps(vout1x01234567, vinput_scale1);
vout2x01234567 = _mm256_mul_ps(vout2x01234567, vinput_scale2);
vout3x01234567 = _mm256_mul_ps(vout3x01234567, vinput_scale3);

const __m256 vfilter_output_scale01234567 = _mm256_load_ps((const float*) w);
const __m256 vbias01234567 = _mm256_load_ps((const float*) w + 8);
w = (const float*) w + 16;
vout0x01234567 = _mm256_fmadd_ps(vout0x01234567, vfilter_output_scale01234567, vbias01234567);
vout1x01234567 = _mm256_fmadd_ps(vout1x01234567, vfilter_output_scale01234567, vbias01234567);
vout2x01234567 = _mm256_fmadd_ps(vout2x01234567, vfilter_output_scale01234567, vbias01234567);
vout3x01234567 = _mm256_fmadd_ps(vout3x01234567, vfilter_output_scale01234567, vbias01234567);

const __m256 vmin = _mm256_load_ps(params->avx.min);
vout0x01234567 = _mm256_max_ps(vout0x01234567, vmin);
vout1x01234567 = _mm256_max_ps(vout1x01234567, vmin);
vout2x01234567 = _mm256_max_ps(vout2x01234567, vmin);
vout3x01234567 = _mm256_max_ps(vout3x01234567, vmin);

const __m256 vmax = _mm256_load_ps(params->avx.max);
vout0x01234567 = _mm256_min_ps(vout0x01234567, vmax);
vout1x01234567 = _mm256_min_ps(vout1x01234567, vmax);
vout2x01234567 = _mm256_min_ps(vout2x01234567, vmax);
vout3x01234567 = _mm256_min_ps(vout3x01234567, vmax);

if XNN_LIKELY(nc >= 8) {
_mm256_storeu_ps(c0, vout0x01234567);
@@ -5946,46 +5974,57 @@ void xnn_qd8_f32_qc8w_gemm_minmax_ukernel_3x8c8__avx2(
c1 = (float*) ((uintptr_t) c1 + cn_stride);
_mm256_storeu_ps(c2, vout2x01234567);
c2 = (float*) ((uintptr_t) c2 + cn_stride);
_mm256_storeu_ps(c3, vout3x01234567);
c3 = (float*) ((uintptr_t) c3 + cn_stride);

a0 = (const int8_t*) ((uintptr_t) a0 - kc);
a1 = (const int8_t*) ((uintptr_t) a1 - kc);
a2 = (const int8_t*) ((uintptr_t) a2 - kc);
a3 = (const int8_t*) ((uintptr_t) a3 - kc);

nc -= 8;
} else {
__m128 vout0x0123 = _mm256_castps256_ps128(vout0x01234567);
__m128 vout1x0123 = _mm256_castps256_ps128(vout1x01234567);
__m128 vout2x0123 = _mm256_castps256_ps128(vout2x01234567);
__m128 vout3x0123 = _mm256_castps256_ps128(vout3x01234567);
if (nc & 4) {
_mm_storeu_ps(c0, vout0x0123);
_mm_storeu_ps(c1, vout1x0123);
_mm_storeu_ps(c2, vout2x0123);
_mm_storeu_ps(c3, vout3x0123);

vout0x0123 = _mm256_extractf128_ps(vout0x01234567, 1);
vout1x0123 = _mm256_extractf128_ps(vout1x01234567, 1);
vout2x0123 = _mm256_extractf128_ps(vout2x01234567, 1);
vout3x0123 = _mm256_extractf128_ps(vout3x01234567, 1);

c0 += 4;
c1 += 4;
c2 += 4;
c3 += 4;
}
if (nc & 2) {
_mm_storel_pi((__m64*) c0, vout0x0123);
_mm_storel_pi((__m64*) c1, vout1x0123);
_mm_storel_pi((__m64*) c2, vout2x0123);
_mm_storel_pi((__m64*) c3, vout3x0123);

vout0x0123 = _mm_movehl_ps(vout0x0123, vout0x0123);
vout1x0123 = _mm_movehl_ps(vout1x0123, vout1x0123);
vout2x0123 = _mm_movehl_ps(vout2x0123, vout2x0123);
vout3x0123 = _mm_movehl_ps(vout3x0123, vout3x0123);

c0 += 2;
c1 += 2;
c2 += 2;
c3 += 2;
}
if (nc & 1) {
_mm_store_ss(c0, vout0x0123);
_mm_store_ss(c1, vout1x0123);
_mm_store_ss(c2, vout2x0123);
_mm_store_ss(c3, vout3x0123);
}
nc = 0;
}
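For reference, each vaccNxXY register above holds four int32 partial sums for output column X in its low 128-bit lane and four for column Y in its high lane. The two _mm256_hadd_epi32 passes collapse these to one sum per column, leaving the columns ordered 0,2,4,6 in the low lane and 1,3,5,7 in the high lane, and the cross-lane _mm256_permutevar8x32_epi32 restores natural order 0..7 before the float epilogue (scale by the per-row input inv_scale, fused multiply-add with the per-channel filter scale and bias, then clamp to [min, max]). Below is a standalone AVX2 demo of the lane bookkeeping, with made-up values chosen so that column c sums to c; it assumes an AVX2 toolchain (e.g. compile with -mavx2) and is not part of this commit.

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  // Fake partial sums: the four partials of column c are c, 0, 0, 0.
  __m256i v01 = _mm256_setr_epi32(0, 0, 0, 0, 1, 0, 0, 0);  // columns 0 | 1
  __m256i v23 = _mm256_setr_epi32(2, 0, 0, 0, 3, 0, 0, 0);  // columns 2 | 3
  __m256i v45 = _mm256_setr_epi32(4, 0, 0, 0, 5, 0, 0, 0);  // columns 4 | 5
  __m256i v67 = _mm256_setr_epi32(6, 0, 0, 0, 7, 0, 0, 0);  // columns 6 | 7

  // In-lane pairwise adds: slot ownership becomes columns 0,0,2,2 | 1,1,3,3
  // (and 4,4,6,6 | 5,5,7,7), hence the 0213/4657 names in the kernel.
  const __m256i v0213 = _mm256_hadd_epi32(v01, v23);
  const __m256i v4657 = _mm256_hadd_epi32(v45, v67);
  // Second pass: one sum per column, ordered 0,2,4,6 | 1,3,5,7.
  const __m256i v02461357 = _mm256_hadd_epi32(v0213, v4657);
  // Cross-lane permute (source indices 0,4,1,5,2,6,3,7, low element first)
  // restores column order 0..7.
  const __m256i vmask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
  const __m256i vout = _mm256_permutevar8x32_epi32(v02461357, vmask);

  int out[8];
  _mm256_storeu_si256((__m256i*) out, vout);
  for (int i = 0; i < 8; i++) {
    printf("%d ", out[i]);  // prints: 0 1 2 3 4 5 6 7
  }
  printf("\n");
  return 0;
}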
@@ -6135,7 +6174,7 @@ void xnn_qd8_f32_qc8w_igemm_minmax_ukernel_1x8c8__avx2(
} while (nc != 0);
}

- void xnn_qd8_f32_qc8w_igemm_minmax_ukernel_3x8c8__avx2(
+ void xnn_qd8_f32_qc8w_igemm_minmax_ukernel_4x8c8__avx2(
size_t mr,
size_t nc,
size_t kc,
@@ -6152,11 +6191,11 @@ void xnn_qd8_f32_qc8w_igemm_minmax_ukernel_3x8c8__avx2(
const struct xnn_qd8_quantization_params quantization_params[restrict XNN_MIN_ELEMENTS(1)]) XNN_OOB_READS
{
assert(mr != 0);
- assert(mr <= 3);
+ assert(mr <= 4);
assert(nc != 0);
assert(kc != 0);
assert(ks != 0);
- assert(ks % (3 * sizeof(void*)) == 0);
+ assert(ks % (4 * sizeof(void*)) == 0);
assert(a_offset % sizeof(int8_t) == 0);
assert(a != NULL);
assert(w != NULL);
@@ -6172,6 +6211,10 @@ void xnn_qd8_f32_qc8w_igemm_minmax_ukernel_3x8c8__avx2(
if XNN_UNPREDICTABLE(mr <= 2) {
c2 = c1;
}
float* c3 = (float*) ((uintptr_t) c2 + cm_stride);
if XNN_UNPREDICTABLE(mr != 4) {
c3 = c2;
}

const __m256i vinput_zero_point = _mm256_set1_epi32((int) quantization_params->zero_point);
const __m256 vinput_scale = _mm256_broadcast_ss(&quantization_params->inv_scale);
@@ -6200,6 +6243,10 @@ void xnn_qd8_f32_qc8w_igemm_minmax_ukernel_3x8c8__avx2(
__m256i vacc2x23 = vacc0x23;
__m256i vacc2x45 = vacc0x45;
__m256i vacc2x67 = vacc0x67;
__m256i vacc3x01 = vacc0x01;
__m256i vacc3x23 = vacc0x23;
__m256i vacc3x45 = vacc0x45;
__m256i vacc3x67 = vacc0x67;
w = (const int32_t*) w + 8;

size_t p = ks;
@@ -6222,7 +6269,13 @@ void xnn_qd8_f32_qc8w_igemm_minmax_ukernel_3x8c8__avx2(
} else {
a2 = zero_data;
}
- a += 3;
const int8_t* restrict a3 = a[3];
if XNN_UNPREDICTABLE(a3 != zero) {
a3 = (const int8_t*) ((uintptr_t) a3 + a_offset);
} else {
a3 = zero_data;
}
+ a += 4;

size_t k = 0;
while (k < kc) {
@@ -6235,36 +6288,43 @@ void xnn_qd8_f32_qc8w_igemm_minmax_ukernel_3x8c8__avx2(
const __m128i va2 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a2));
const __m256i vxa2 = _mm256_cvtepi8_epi16(va2);
a2 += 8;
const __m128i va3 = _mm_broadcastq_epi64(_mm_loadl_epi64((const __m128i*) a3));
const __m256i vxa3 = _mm256_cvtepi8_epi16(va3);
a3 += 8;

const __m128i vb01 = _mm_load_si128((const __m128i*) w);
const __m256i vxb01 = _mm256_cvtepi8_epi16(vb01);

vacc0x01 = _mm256_add_epi32(vacc0x01, _mm256_madd_epi16(vxa0, vxb01));
vacc1x01 = _mm256_add_epi32(vacc1x01, _mm256_madd_epi16(vxa1, vxb01));
vacc2x01 = _mm256_add_epi32(vacc2x01, _mm256_madd_epi16(vxa2, vxb01));
vacc3x01 = _mm256_add_epi32(vacc3x01, _mm256_madd_epi16(vxa3, vxb01));
const __m128i vb23 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 16));
const __m256i vxb23 = _mm256_cvtepi8_epi16(vb23);

vacc0x23 = _mm256_add_epi32(vacc0x23, _mm256_madd_epi16(vxa0, vxb23));
vacc1x23 = _mm256_add_epi32(vacc1x23, _mm256_madd_epi16(vxa1, vxb23));
vacc2x23 = _mm256_add_epi32(vacc2x23, _mm256_madd_epi16(vxa2, vxb23));
vacc3x23 = _mm256_add_epi32(vacc3x23, _mm256_madd_epi16(vxa3, vxb23));
const __m128i vb45 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 32));
const __m256i vxb45 = _mm256_cvtepi8_epi16(vb45);

vacc0x45 = _mm256_add_epi32(vacc0x45, _mm256_madd_epi16(vxa0, vxb45));
vacc1x45 = _mm256_add_epi32(vacc1x45, _mm256_madd_epi16(vxa1, vxb45));
vacc2x45 = _mm256_add_epi32(vacc2x45, _mm256_madd_epi16(vxa2, vxb45));
vacc3x45 = _mm256_add_epi32(vacc3x45, _mm256_madd_epi16(vxa3, vxb45));
const __m128i vb67 = _mm_load_si128((const __m128i*) ((const int8_t*) w + 48));
const __m256i vxb67 = _mm256_cvtepi8_epi16(vb67);

vacc0x67 = _mm256_add_epi32(vacc0x67, _mm256_madd_epi16(vxa0, vxb67));
vacc1x67 = _mm256_add_epi32(vacc1x67, _mm256_madd_epi16(vxa1, vxb67));
vacc2x67 = _mm256_add_epi32(vacc2x67, _mm256_madd_epi16(vxa2, vxb67));
vacc3x67 = _mm256_add_epi32(vacc3x67, _mm256_madd_epi16(vxa3, vxb67));

w = (const void*) ((const int8_t*) w + 64);
k += 8 * sizeof(int8_t);
}
- p -= 3 * sizeof(void*);
+ p -= 4 * sizeof(void*);
} while (p != 0);
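A note on the indirection handling above: unlike the GEMM, which reads per-row quantization_params[0..3], the IGEMM applies a single input zero point and inv_scale to all rows, and it fetches its row pointers from the indirection buffer a, now four per iteration. Entries equal to the zero sentinel are redirected to zero_data so padding rows accumulate nothing, while real pointers are rebased by a_offset. A one-function sketch of that convention follows; resolve_row is an illustrative name, not part of this commit.

#include <stddef.h>
#include <stdint.h>

static const int8_t* resolve_row(const int8_t* p, const int8_t* zero,
                                 const int8_t* zero_data, size_t a_offset) {
  // Real rows are rebased by a_offset; the zero sentinel maps to the shared
  // zero buffer so padding taps contribute zeros to the accumulators.
  return (p != zero) ? (const int8_t*) ((uintptr_t) p + a_offset) : zero_data;
}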

const __m256i vacc0x0213 = _mm256_hadd_epi32(vacc0x01, vacc0x23);
@@ -6273,42 +6333,53 @@ void xnn_qd8_f32_qc8w_igemm_minmax_ukernel_3x8c8__avx2(
const __m256i vacc1x4657 = _mm256_hadd_epi32(vacc1x45, vacc1x67);
const __m256i vacc2x0213 = _mm256_hadd_epi32(vacc2x01, vacc2x23);
const __m256i vacc2x4657 = _mm256_hadd_epi32(vacc2x45, vacc2x67);
const __m256i vacc3x0213 = _mm256_hadd_epi32(vacc3x01, vacc3x23);
const __m256i vacc3x4657 = _mm256_hadd_epi32(vacc3x45, vacc3x67);

const __m256i vacc0x02461357 = _mm256_hadd_epi32(vacc0x0213, vacc0x4657);
const __m256i vacc1x02461357 = _mm256_hadd_epi32(vacc1x0213, vacc1x4657);
const __m256i vacc2x02461357 = _mm256_hadd_epi32(vacc2x0213, vacc2x4657);
const __m256i vacc3x02461357 = _mm256_hadd_epi32(vacc3x0213, vacc3x4657);

const __m256i vpermute_mask = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
__m256i vacc0x01234567 = _mm256_permutevar8x32_epi32(vacc0x02461357, vpermute_mask);
__m256i vacc1x01234567 = _mm256_permutevar8x32_epi32(vacc1x02461357, vpermute_mask);
__m256i vacc2x01234567 = _mm256_permutevar8x32_epi32(vacc2x02461357, vpermute_mask);
__m256i vacc3x01234567 = _mm256_permutevar8x32_epi32(vacc3x02461357, vpermute_mask);

__m256 vout0x01234567 = _mm256_cvtepi32_ps(vacc0x01234567);
__m256 vout1x01234567 = _mm256_cvtepi32_ps(vacc1x01234567);
__m256 vout2x01234567 = _mm256_cvtepi32_ps(vacc2x01234567);
__m256 vout3x01234567 = _mm256_cvtepi32_ps(vacc3x01234567);

vout0x01234567 = _mm256_mul_ps(vout0x01234567, vinput_scale);
vout1x01234567 = _mm256_mul_ps(vout1x01234567, vinput_scale);
vout2x01234567 = _mm256_mul_ps(vout2x01234567, vinput_scale);
vout3x01234567 = _mm256_mul_ps(vout3x01234567, vinput_scale);

const __m256 vfilter_output_scale01234567 = _mm256_load_ps((const float*) w);
const __m256 vbias01234567 = _mm256_load_ps((const float*) w + 8);
w = (const float*) w + 16;
vout0x01234567 = _mm256_fmadd_ps(vout0x01234567, vfilter_output_scale01234567, vbias01234567);
vout1x01234567 = _mm256_fmadd_ps(vout1x01234567, vfilter_output_scale01234567, vbias01234567);
vout2x01234567 = _mm256_fmadd_ps(vout2x01234567, vfilter_output_scale01234567, vbias01234567);
vout3x01234567 = _mm256_fmadd_ps(vout3x01234567, vfilter_output_scale01234567, vbias01234567);

const __m256 vmin = _mm256_load_ps(params->avx.min);
vout0x01234567 = _mm256_max_ps(vout0x01234567, vmin);
vout1x01234567 = _mm256_max_ps(vout1x01234567, vmin);
vout2x01234567 = _mm256_max_ps(vout2x01234567, vmin);
vout3x01234567 = _mm256_max_ps(vout3x01234567, vmin);

const __m256 vmax = _mm256_load_ps(params->avx.max);
vout0x01234567 = _mm256_min_ps(vout0x01234567, vmax);
vout1x01234567 = _mm256_min_ps(vout1x01234567, vmax);
vout2x01234567 = _mm256_min_ps(vout2x01234567, vmax);
vout3x01234567 = _mm256_min_ps(vout3x01234567, vmax);

if XNN_LIKELY(nc >= 8) {
_mm256_storeu_ps(c3, vout3x01234567);
c3 = (float*) ((uintptr_t) c3 + cn_stride);
_mm256_storeu_ps(c2, vout2x01234567);
c2 = (float*) ((uintptr_t) c2 + cn_stride);
_mm256_storeu_ps(c1, vout1x01234567);
@@ -6319,36 +6390,44 @@ void xnn_qd8_f32_qc8w_igemm_minmax_ukernel_3x8c8__avx2(
a = (const int8_t**restrict) ((uintptr_t) a - ks);
nc -= 8;
} else {
__m128 vout3x0123 = _mm256_castps256_ps128(vout3x01234567);
__m128 vout2x0123 = _mm256_castps256_ps128(vout2x01234567);
__m128 vout1x0123 = _mm256_castps256_ps128(vout1x01234567);
__m128 vout0x0123 = _mm256_castps256_ps128(vout0x01234567);
if (nc & 4) {
_mm_storeu_ps(c3, vout3x0123);
_mm_storeu_ps(c2, vout2x0123);
_mm_storeu_ps(c1, vout1x0123);
_mm_storeu_ps(c0, vout0x0123);

vout3x0123 = _mm256_extractf128_ps(vout3x01234567, 1);
vout2x0123 = _mm256_extractf128_ps(vout2x01234567, 1);
vout1x0123 = _mm256_extractf128_ps(vout1x01234567, 1);
vout0x0123 = _mm256_extractf128_ps(vout0x01234567, 1);

c3 += 4;
c2 += 4;
c1 += 4;
c0 += 4;
}
if (nc & 2) {
_mm_storel_pi((__m64*) c3, vout3x0123);
_mm_storel_pi((__m64*) c2, vout2x0123);
_mm_storel_pi((__m64*) c1, vout1x0123);
_mm_storel_pi((__m64*) c0, vout0x0123);

vout3x0123 = _mm_movehl_ps(vout3x0123, vout3x0123);
vout2x0123 = _mm_movehl_ps(vout2x0123, vout2x0123);
vout1x0123 = _mm_movehl_ps(vout1x0123, vout1x0123);
vout0x0123 = _mm_movehl_ps(vout0x0123, vout0x0123);

c3 += 2;
c2 += 2;
c1 += 2;
c0 += 2;
}
if (nc & 1) {
_mm_store_ss(c3, vout3x0123);
_mm_store_ss(c2, vout2x0123);
_mm_store_ss(c1, vout1x0123);
_mm_store_ss(c0, vout0x0123);