/*
 * Copyright (c) Facebook, Inc. and its affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <immintrin.h>

#include <qnnpack/q8gemm_sparse.h>
#include <requantization/runtime-sse2.h>

#include "8x4c1x4-packed-sse2.h"

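// CONVERT_TO_FP_AND_TRANSPOSE takes four __m128i registers (a, b, c, d), each
// holding four 32-bit accumulators, converts them to single precision and
// transposes the resulting 4x4 tile with _mm_shuffle_ps, writing the four
// transposed rows into t_a..t_d. The caller is expected to declare the
// a_ps..d_ps and tmp0..tmp3 temporaries used by the macro.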
#define CONVERT_TO_FP_AND_TRANSPOSE(a, b, c, d, t_a, t_b, t_c, t_d)  \
  a_ps = _mm_cvtepi32_ps(a);                                         \
  b_ps = _mm_cvtepi32_ps(b);                                         \
  c_ps = _mm_cvtepi32_ps(c);                                         \
  d_ps = _mm_cvtepi32_ps(d);                                         \
  tmp0 = _mm_shuffle_ps(a_ps, b_ps, _MM_SHUFFLE(1, 0, 1, 0));        \
  tmp1 = _mm_shuffle_ps(a_ps, b_ps, _MM_SHUFFLE(3, 2, 3, 2));        \
  tmp2 = _mm_shuffle_ps(c_ps, d_ps, _MM_SHUFFLE(1, 0, 1, 0));        \
  tmp3 = _mm_shuffle_ps(c_ps, d_ps, _MM_SHUFFLE(3, 2, 3, 2));        \
  t_a = _mm_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(2, 0, 2, 0));         \
  t_b = _mm_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(3, 1, 3, 1));         \
  t_c = _mm_shuffle_ps(tmp1, tmp3, _MM_SHUFFLE(2, 0, 2, 0));         \
  t_d = _mm_shuffle_ps(tmp1, tmp3, _MM_SHUFFLE(3, 1, 3, 1));

// KERNEL_NAME and W_INDEX_DTYPE macros are defined in
// https://github.com/pytorch/pytorch/blob/master/aten/src/ATen/native/quantized/cpu/qnnpack/src/q8gemm_sparse/8x4c1x4-dq-packedA-sse2.c
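//
// The weight matrix is stored in a compressed (CSR-style) block-sparse layout:
// packed_w holds the nonzero 1x4 weight blocks back to back,
// w_row_ptr[n]..w_row_ptr[n+1] delimit the blocks belonging to output
// channel n, and w_block_ids_ptr gives the column-block index of each stored
// block. For each row m (< mr) and output channel n (< nr) this kernel
// in effect computes
//   c[m][n] = b[n] + multipliers[output_channel_index + n] *
//       sum over the stored blocks of row n of
//           (a[k][m] - input_zero_point) *
//           (w[n][k] - kernel_zero_points[output_channel_index + n])
// and writes the dequantized float result to c.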
void KERNEL_NAME(
    size_t mr,
    size_t nr,
    const uint8_t* a_packed,
    const uint8_t* packed_w,
    const W_INDEX_DTYPE* w_row_ptr,
    const W_INDEX_DTYPE* w_block_ids_ptr,
    const float* b,
    float* c,
    size_t c_stride,
    size_t output_channel_index,
    const struct pytorch_qnnp_conv_dynamic_quantization_params
        quantization_params[RESTRICT_STATIC 1]) {
  const __m128i va_zero_point = _mm_set1_epi16(quantization_params->input_zero_point);
  const __m128 vbias = _mm_load_ps(b);
  const __m128i vzero = _mm_setzero_si128();

  // Packed A format.
  // 8m x 4k blocks: all the blocks for a given group of 8 rows (8m) are
  // placed in contiguous memory.
  // Original A
  // --------- K -----------          -- (K + 4 - 1) / 4 --
  // |                     |          |                   |
  // |                     |        (M + 8 - 1)/8         |
  // |                     | Packed   |                   |
  // M                     |  =>      |-------------------|
  // |                     |        Thus Packed A has (K + 4 - 1)/4 * (M + 8 - 1)/8 blocks
  // |                     |
  // |---------------------|
  //
  // Each 8 x 4 block is transposed and stored.
  // Each of the (K + 4 - 1)/4 blocks for a given group of 8 m rows
  // is stored adjacent in memory.
  // Thus, each block:
  // |----8m-----|----8m-----|
  // 4k          |           | .....
  // |-----------|-----------|
  // This locality helps in loading 8k x 8m blocks of activations.
  // Note: when M is not a multiple of 8, the remainder can contain arbitrary
  // data in packed A, as we will not be writing those values out.
  // This is taken care of by just copying the appropriate valid data.

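  // With this layout, the 8 activation values for the k-th row of column
  // block `col_block_id` live at
  //   a_packed + col_block_id * PACKED_A_BLOCK_SIZE + MR * k
  // where MR, COL_BLOCK_SIZE and PACKED_A_BLOCK_SIZE come from
  // 8x4c1x4-packed-sse2.h (8 rows of m and 4 of k per block for this kernel).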
  __m128i vacc_low[4];
  __m128i vacc_high[4];
  const __m128 vmultiplier =
      _mm_loadu_ps(&quantization_params->multipliers[output_channel_index]);
  for (int32_t n = 0; n < nr; n++) {
    vacc_low[n] = _mm_setzero_si128();
    vacc_high[n] = _mm_setzero_si128();
    const int16_t b_zero_point =
      (int16_t)(uint16_t)quantization_params->kernel_zero_points[
      output_channel_index + n];

    int32_t num_blocks = w_row_ptr[n+1] - w_row_ptr[n];
    // Offset into the compressed values.
    // w_row_ptr[n] is the block offset in the compressed values
    // where the corresponding row of the weight matrix starts.
    const uint8_t* temp_packed_w = packed_w + w_row_ptr[n] * COL_BLOCK_SIZE;
    // Similarly, w_row_ptr[n] is also the block offset where the
    // corresponding row's block column ids start.
    // Per row, # of block column ids = # of block values.
    const W_INDEX_DTYPE* temp_w_block_ids_ptr = w_block_ids_ptr + w_row_ptr[n];
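    // Main loop: consume two 1x4 column blocks (8 k values) per iteration;
    // a single trailing block, if any, is handled after the loop.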
    while (num_blocks > 1) {
      // Load two 1x4 uint8 blocks (8 weight values, i.e. 2 ints).
      const uint8_t* b_ptr = temp_packed_w;
      // This is not perf optimal since it will result in
      // register spills. We should probably work with an output block
      // of 1x4 instead of 1x8.
      // But we do it this way because this is mostly how we will
      // do it for ARM, and this reference code helps establish
      // the baseline for functional correctness.
      const int16_t b_0 = (int16_t)((uint16_t)(b_ptr[0]));
      const int16_t b_1 = (int16_t)((uint16_t)(b_ptr[1]));
      const int16_t b_2 = (int16_t)((uint16_t)(b_ptr[2]));
      const int16_t b_3 = (int16_t)((uint16_t)(b_ptr[3]));
      const int16_t b_4 = (int16_t)((uint16_t)(b_ptr[4]));
      const int16_t b_5 = (int16_t)((uint16_t)(b_ptr[5]));
      const int16_t b_6 = (int16_t)((uint16_t)(b_ptr[6]));
      const int16_t b_7 = (int16_t)((uint16_t)(b_ptr[7]));
      // Now broadcast each of the 8 weight values (8k x 1) into a register.
      const __m128i vxb0 = _mm_set1_epi16((b_0 - b_zero_point));
      const __m128i vxb1 = _mm_set1_epi16((b_1 - b_zero_point));
      const __m128i vxb2 = _mm_set1_epi16((b_2 - b_zero_point));
      const __m128i vxb3 = _mm_set1_epi16((b_3 - b_zero_point));
      const __m128i vxb4 = _mm_set1_epi16((b_4 - b_zero_point));
      const __m128i vxb5 = _mm_set1_epi16((b_5 - b_zero_point));
      const __m128i vxb6 = _mm_set1_epi16((b_6 - b_zero_point));
      const __m128i vxb7 = _mm_set1_epi16((b_7 - b_zero_point));

      // Load activation blocks. In this kernel we assume
      // the A matrix is already transposed: K x M.
      // 1. Load 8 1x8 registers = 8k x 8m

      // Load column id of the first 1x4 block
      int32_t col_block_id_0 = temp_w_block_ids_ptr[0];
      // Load column id of the second 1x4 block
      int32_t col_block_id_1 = temp_w_block_ids_ptr[1];
      const __m128i va0 =
        _mm_loadl_epi64((const __m128i*) (a_packed +
            col_block_id_0 * PACKED_A_BLOCK_SIZE + MR * 0));
      const __m128i va1 =
        _mm_loadl_epi64((const __m128i*) (a_packed +
            col_block_id_0 * PACKED_A_BLOCK_SIZE + MR * 1));
      const __m128i va2 =
        _mm_loadl_epi64((const __m128i*) (a_packed +
            col_block_id_0 * PACKED_A_BLOCK_SIZE + MR * 2));
      const __m128i va3 =
        _mm_loadl_epi64((const __m128i*) (a_packed +
            col_block_id_0 * PACKED_A_BLOCK_SIZE + MR * 3));
      const __m128i va4 =
        _mm_loadl_epi64((const __m128i*) (a_packed +
            col_block_id_1 * PACKED_A_BLOCK_SIZE + MR * 0));
      const __m128i va5 =
        _mm_loadl_epi64((const __m128i*) (a_packed +
            col_block_id_1 * PACKED_A_BLOCK_SIZE + MR * 1));
      const __m128i va6 =
        _mm_loadl_epi64((const __m128i*) (a_packed +
            col_block_id_1 * PACKED_A_BLOCK_SIZE + MR * 2));
      const __m128i va7 =
        _mm_loadl_epi64((const __m128i*) (a_packed +
            col_block_id_1 * PACKED_A_BLOCK_SIZE + MR * 3));

      const __m128i vxa0 =
          sub_zero_point(_mm_unpacklo_epi8(va0, vzero), va_zero_point);
      const __m128i vxa1 =
          sub_zero_point(_mm_unpacklo_epi8(va1, vzero), va_zero_point);
      const __m128i vxa2 =
          sub_zero_point(_mm_unpacklo_epi8(va2, vzero), va_zero_point);
      const __m128i vxa3 =
          sub_zero_point(_mm_unpacklo_epi8(va3, vzero), va_zero_point);
      const __m128i vxa4 =
          sub_zero_point(_mm_unpacklo_epi8(va4, vzero), va_zero_point);
      const __m128i vxa5 =
          sub_zero_point(_mm_unpacklo_epi8(va5, vzero), va_zero_point);
      const __m128i vxa6 =
          sub_zero_point(_mm_unpacklo_epi8(va6, vzero), va_zero_point);
      const __m128i vxa7 =
          sub_zero_point(_mm_unpacklo_epi8(va7, vzero), va_zero_point);

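      // Multiply-accumulate. The 16-bit products are widened to 32 bits:
      // _mm_mullo_epi16/_mm_mulhi_epi16 produce the low and high halves of
      // each signed 16x16 product, and unpacklo/unpackhi interleave them back
      // into 32-bit values. vacc_low[n] accumulates lanes m = 0..3 and
      // vacc_high[n] accumulates lanes m = 4..7.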
      // acc += a0 * b0;
      __m128i vacc_low_16bits = _mm_mullo_epi16(vxa0, vxb0);
      __m128i vacc_high_16bits = _mm_mulhi_epi16(vxa0, vxb0);
      vacc_low[n] = _mm_add_epi32(vacc_low[n],
        _mm_unpacklo_epi16(vacc_low_16bits, vacc_high_16bits));
      vacc_high[n] = _mm_add_epi32(vacc_high[n],
        _mm_unpackhi_epi16(vacc_low_16bits, vacc_high_16bits));
      // acc += a1 * b1;
      vacc_low_16bits = _mm_mullo_epi16(vxa1, vxb1);
      vacc_high_16bits = _mm_mulhi_epi16(vxa1, vxb1);
      vacc_low[n] = _mm_add_epi32(vacc_low[n],
        _mm_unpacklo_epi16(vacc_low_16bits, vacc_high_16bits));
      vacc_high[n] = _mm_add_epi32(vacc_high[n],
        _mm_unpackhi_epi16(vacc_low_16bits, vacc_high_16bits));
      // acc += a2 * b2;
      vacc_low_16bits = _mm_mullo_epi16(vxa2, vxb2);
      vacc_high_16bits = _mm_mulhi_epi16(vxa2, vxb2);
      vacc_low[n] = _mm_add_epi32(vacc_low[n],
        _mm_unpacklo_epi16(vacc_low_16bits, vacc_high_16bits));
      vacc_high[n] = _mm_add_epi32(vacc_high[n],
        _mm_unpackhi_epi16(vacc_low_16bits, vacc_high_16bits));
      // acc += a3 * b3;
      vacc_low_16bits = _mm_mullo_epi16(vxa3, vxb3);
      vacc_high_16bits = _mm_mulhi_epi16(vxa3, vxb3);
      vacc_low[n] = _mm_add_epi32(vacc_low[n],
        _mm_unpacklo_epi16(vacc_low_16bits, vacc_high_16bits));
      vacc_high[n] = _mm_add_epi32(vacc_high[n],
        _mm_unpackhi_epi16(vacc_low_16bits, vacc_high_16bits));
      // acc += a4 * b4;
      vacc_low_16bits = _mm_mullo_epi16(vxa4, vxb4);
      vacc_high_16bits = _mm_mulhi_epi16(vxa4, vxb4);
      vacc_low[n] = _mm_add_epi32(vacc_low[n],
        _mm_unpacklo_epi16(vacc_low_16bits, vacc_high_16bits));
      vacc_high[n] = _mm_add_epi32(vacc_high[n],
        _mm_unpackhi_epi16(vacc_low_16bits, vacc_high_16bits));
      // acc += a5 * b5;
      vacc_low_16bits = _mm_mullo_epi16(vxa5, vxb5);
      vacc_high_16bits = _mm_mulhi_epi16(vxa5, vxb5);
      vacc_low[n] = _mm_add_epi32(vacc_low[n],
        _mm_unpacklo_epi16(vacc_low_16bits, vacc_high_16bits));
      vacc_high[n] = _mm_add_epi32(vacc_high[n],
        _mm_unpackhi_epi16(vacc_low_16bits, vacc_high_16bits));
      // acc += a6 * b6;
      vacc_low_16bits = _mm_mullo_epi16(vxa6, vxb6);
      vacc_high_16bits = _mm_mulhi_epi16(vxa6, vxb6);
      vacc_low[n] = _mm_add_epi32(vacc_low[n],
        _mm_unpacklo_epi16(vacc_low_16bits, vacc_high_16bits));
      vacc_high[n] = _mm_add_epi32(vacc_high[n],
        _mm_unpackhi_epi16(vacc_low_16bits, vacc_high_16bits));
      // acc += a7 * b7;
      vacc_low_16bits = _mm_mullo_epi16(vxa7, vxb7);
      vacc_high_16bits = _mm_mulhi_epi16(vxa7, vxb7);
      vacc_low[n] = _mm_add_epi32(vacc_low[n],
        _mm_unpacklo_epi16(vacc_low_16bits, vacc_high_16bits));
      vacc_high[n] = _mm_add_epi32(vacc_high[n],
        _mm_unpackhi_epi16(vacc_low_16bits, vacc_high_16bits));

      // Now we have 1x8m accumulated 32-bit values: 4 in vacc_low[n] and 4 in vacc_high[n].

      temp_packed_w = temp_packed_w + COL_BLOCK_SIZE * 2;
      temp_w_block_ids_ptr += 2;
      num_blocks -= 2;
    }
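    // Remainder: at most one 1x4 block of this row is left, so only a
    // 4k x 8m accumulation is needed here.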
    if (num_blocks > 0) {
      // Load one 1x4 uint8 block (4 weight values, i.e. 1 int).
      const uint8_t* b_ptr = temp_packed_w;
      const int16_t b_0 = (int16_t)((uint16_t)(b_ptr[0]));
      const int16_t b_1 = (int16_t)((uint16_t)(b_ptr[1]));
      const int16_t b_2 = (int16_t)((uint16_t)(b_ptr[2]));
      const int16_t b_3 = (int16_t)((uint16_t)(b_ptr[3]));
      // Now broadcast each of the 4 weight values (4k x 1) into a register.
      const __m128i vxb0 = _mm_set1_epi16((b_0 - b_zero_point));
      const __m128i vxb1 = _mm_set1_epi16((b_1 - b_zero_point));
      const __m128i vxb2 = _mm_set1_epi16((b_2 - b_zero_point));
      const __m128i vxb3 = _mm_set1_epi16((b_3 - b_zero_point));

      // Then load the transposed activation blocks:
      // 1. Load 4 1x8 registers = 4k x 8m
      // Thus we have 4x8 (4k x 8m) activations a0, a1, a2, a3,
      // each containing 8 m values.

      // Load column id of the first 1x4 block
      int32_t col_block_id_0 = temp_w_block_ids_ptr[0];
      const __m128i va0 =
        _mm_loadl_epi64((const __m128i*) (a_packed +
            col_block_id_0 * PACKED_A_BLOCK_SIZE + MR * 0));
      const __m128i va1 =
        _mm_loadl_epi64((const __m128i*) (a_packed +
            col_block_id_0 * PACKED_A_BLOCK_SIZE + MR * 1));
      const __m128i va2 =
        _mm_loadl_epi64((const __m128i*) (a_packed +
            col_block_id_0 * PACKED_A_BLOCK_SIZE + MR * 2));
      const __m128i va3 =
        _mm_loadl_epi64((const __m128i*) (a_packed +
            col_block_id_0 * PACKED_A_BLOCK_SIZE + MR * 3));
      const __m128i vxa0 =
          sub_zero_point(_mm_unpacklo_epi8(va0, vzero), va_zero_point);
      const __m128i vxa1 =
          sub_zero_point(_mm_unpacklo_epi8(va1, vzero), va_zero_point);
      const __m128i vxa2 =
          sub_zero_point(_mm_unpacklo_epi8(va2, vzero), va_zero_point);
      const __m128i vxa3 =
          sub_zero_point(_mm_unpacklo_epi8(va3, vzero), va_zero_point);

      // acc += a0 * b0;
      __m128i vacc_low_16bits = _mm_mullo_epi16(vxa0, vxb0);
      __m128i vacc_high_16bits = _mm_mulhi_epi16(vxa0, vxb0);
      vacc_low[n] = _mm_add_epi32(vacc_low[n],
        _mm_unpacklo_epi16(vacc_low_16bits, vacc_high_16bits));
      vacc_high[n] = _mm_add_epi32(vacc_high[n],
        _mm_unpackhi_epi16(vacc_low_16bits, vacc_high_16bits));
      // acc += a1 * b1;
      vacc_low_16bits = _mm_mullo_epi16(vxa1, vxb1);
      vacc_high_16bits = _mm_mulhi_epi16(vxa1, vxb1);
      vacc_low[n] = _mm_add_epi32(vacc_low[n],
        _mm_unpacklo_epi16(vacc_low_16bits, vacc_high_16bits));
      vacc_high[n] = _mm_add_epi32(vacc_high[n],
        _mm_unpackhi_epi16(vacc_low_16bits, vacc_high_16bits));
      // acc += a2 * b2;
      vacc_low_16bits = _mm_mullo_epi16(vxa2, vxb2);
      vacc_high_16bits = _mm_mulhi_epi16(vxa2, vxb2);
      vacc_low[n] = _mm_add_epi32(vacc_low[n],
        _mm_unpacklo_epi16(vacc_low_16bits, vacc_high_16bits));
      vacc_high[n] = _mm_add_epi32(vacc_high[n],
        _mm_unpackhi_epi16(vacc_low_16bits, vacc_high_16bits));
      // acc += a3 * b3;
      vacc_low_16bits = _mm_mullo_epi16(vxa3, vxb3);
      vacc_high_16bits = _mm_mulhi_epi16(vxa3, vxb3);
      vacc_low[n] = _mm_add_epi32(vacc_low[n],
        _mm_unpacklo_epi16(vacc_low_16bits, vacc_high_16bits));
      vacc_high[n] = _mm_add_epi32(vacc_high[n],
        _mm_unpackhi_epi16(vacc_low_16bits, vacc_high_16bits));

      // Now we have 1x8m accumulated 32-bit values: 4 in vacc_low[n] and 4 in vacc_high[n].
    }
  }

  __m128 vout[8];
  __m128 a_ps, b_ps, c_ps, d_ps, tmp0, tmp1, tmp2, tmp3;

  // Transform low half of 4x8 result
  // That is 4x4 block (4n x 4m)
  // Convert to FP and transpose: 4m x 4n
  CONVERT_TO_FP_AND_TRANSPOSE(vacc_low[0],
                              vacc_low[1],
                              vacc_low[2],
                              vacc_low[3],
                              vout[0],
                              vout[1],
                              vout[2],
                              vout[3])
  CONVERT_TO_FP_AND_TRANSPOSE(vacc_high[0],
                              vacc_high[1],
                              vacc_high[2],
                              vacc_high[3],
                              vout[4],
                              vout[5],
                              vout[6],
                              vout[7])

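  // Dequantize: after the transpose each vout[i] holds the 4 output channels
  // for row i of the tile; scale by the per-channel multipliers and add the
  // per-channel bias to produce float outputs.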
  vout[0] = _mm_mul_ps(vmultiplier, vout[0]);
  vout[1] = _mm_mul_ps(vmultiplier, vout[1]);
  vout[2] = _mm_mul_ps(vmultiplier, vout[2]);
  vout[3] = _mm_mul_ps(vmultiplier, vout[3]);
  vout[4] = _mm_mul_ps(vmultiplier, vout[4]);
  vout[5] = _mm_mul_ps(vmultiplier, vout[5]);
  vout[6] = _mm_mul_ps(vmultiplier, vout[6]);
  vout[7] = _mm_mul_ps(vmultiplier, vout[7]);

  vout[0] = _mm_add_ps(vout[0], vbias);
  vout[1] = _mm_add_ps(vout[1], vbias);
  vout[2] = _mm_add_ps(vout[2], vbias);
  vout[3] = _mm_add_ps(vout[3], vbias);
  vout[4] = _mm_add_ps(vout[4], vbias);
  vout[5] = _mm_add_ps(vout[5], vbias);
  vout[6] = _mm_add_ps(vout[6], vbias);
  vout[7] = _mm_add_ps(vout[7], vbias);

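  // Set up one output row pointer per m. When mr < 8, the unused pointers
  // alias c0 and their vout is duplicated from vout[0], so the extra stores
  // below are harmless.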
  float* c0 = c;
  float* c1 = c0 + c_stride;
  if (mr < 2) {
    c1 = c0;
    vout[1] = vout[0];
  }
  float* c2 = c1 + c_stride;
  if (mr < 3) {
    c2 = c0;
    vout[2] = vout[0];
  }
  float* c3 = c2 + c_stride;
  if (mr < 4) {
    c3 = c0;
    vout[3] = vout[0];
  }
  float* c4 = c3 + c_stride;
  if (mr < 5) {
    c4 = c0;
    vout[4] = vout[0];
  }
  float* c5 = c4 + c_stride;
  if (mr < 6) {
    c5 = c0;
    vout[5] = vout[0];
  }
  float* c6 = c5 + c_stride;
  if (mr < 7) {
    c6 = c0;
    vout[6] = vout[0];
  }
  float* c7 = c6 + c_stride;
  if (mr < 8) {
    c7 = c0;
    vout[7] = vout[0];
  }

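  // Store up to 4 output channels per row: a full 4-wide store when nr == 4,
  // otherwise a 2-element store followed by a 1-element store as needed.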
  if (nr == 4) {
    _mm_storeu_ps(c0, vout[0]);
    _mm_storeu_ps(c1, vout[1]);
    _mm_storeu_ps(c2, vout[2]);
    _mm_storeu_ps(c3, vout[3]);
    _mm_storeu_ps(c4, vout[4]);
    _mm_storeu_ps(c5, vout[5]);
    _mm_storeu_ps(c6, vout[6]);
    _mm_storeu_ps(c7, vout[7]);
  } else {
    if (nr >= 2) {
      _mm_storel_pi((__m64*)c0, vout[0]);
      _mm_storel_pi((__m64*)c1, vout[1]);
      _mm_storel_pi((__m64*)c2, vout[2]);
      _mm_storel_pi((__m64*)c3, vout[3]);
      _mm_storel_pi((__m64*)c4, vout[4]);
      _mm_storel_pi((__m64*)c5, vout[5]);
      _mm_storel_pi((__m64*)c6, vout[6]);
      _mm_storel_pi((__m64*)c7, vout[7]);

      nr -= 2;

      c0 += 2;
      c1 += 2;
      c2 += 2;
      c3 += 2;
      c4 += 2;
      c5 += 2;
      c6 += 2;
      c7 += 2;
      vout[0] = _mm_shuffle_ps(vout[0], vout[0], _MM_SHUFFLE(2, 2, 2, 2));
      vout[1] = _mm_shuffle_ps(vout[1], vout[1], _MM_SHUFFLE(2, 2, 2, 2));
      vout[2] = _mm_shuffle_ps(vout[2], vout[2], _MM_SHUFFLE(2, 2, 2, 2));
      vout[3] = _mm_shuffle_ps(vout[3], vout[3], _MM_SHUFFLE(2, 2, 2, 2));
      vout[4] = _mm_shuffle_ps(vout[4], vout[4], _MM_SHUFFLE(2, 2, 2, 2));
      vout[5] = _mm_shuffle_ps(vout[5], vout[5], _MM_SHUFFLE(2, 2, 2, 2));
      vout[6] = _mm_shuffle_ps(vout[6], vout[6], _MM_SHUFFLE(2, 2, 2, 2));
      vout[7] = _mm_shuffle_ps(vout[7], vout[7], _MM_SHUFFLE(2, 2, 2, 2));
    }
    if (nr != 0) {
      *c0 = _mm_cvtss_f32(vout[0]);
      *c1 = _mm_cvtss_f32(vout[1]);
      *c2 = _mm_cvtss_f32(vout[2]);
      *c3 = _mm_cvtss_f32(vout[3]);
      *c4 = _mm_cvtss_f32(vout[4]);
      *c5 = _mm_cvtss_f32(vout[5]);
      *c6 = _mm_cvtss_f32(vout[6]);
      *c7 = _mm_cvtss_f32(vout[7]);
    }
  }
}