#include "fht.h"
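/*
 * Explanatory note (added): this file implements in-place, unnormalized
 * Walsh-Hadamard transforms over power-of-two-sized float buffers using
 * AVX inline assembly; helper_float_k transforms a buffer of 2^k floats.
 * The heavily unrolled, repetitive structure suggests the code is
 * machine-generated. A minimal scalar sketch of the transform each helper
 * computes follows, behind FHT_REFERENCE_SKETCH -- a hypothetical guard
 * added for illustration, not part of the original build.
 */
#ifdef FHT_REFERENCE_SKETCH
/* Scalar reference: one butterfly pass per level, combining elements
   that sit "step" apart. Unused by the AVX helpers below. */
static void fht_reference(float *buf, int log_n) {
  for (int step = 1; step < (1 << log_n); step <<= 1) {
    for (int j = 0; j < (1 << log_n); j += step << 1) {
      for (int k = 0; k < step; ++k) {
        float u = buf[j + k];
        float v = buf[j + k + step];
        buf[j + k] = u + v;        /* butterfly sum */
        buf[j + k + step] = u - v; /* butterfly difference */
      }
    }
  }
}
#endif /* FHT_REFERENCE_SKETCH */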
static inline void helper_float_1(float *buf);
static inline void helper_float_1(float *buf) {
  for (int j = 0; j < 2; j += 2) {
    for (int k = 0; k < 1; ++k) {
      float u = buf[j + k];
      float v = buf[j + k + 1];
      buf[j + k] = u + v;
      buf[j + k + 1] = u - v;
    }
  }
}
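/* helper_float_2 (added note): the same butterfly as helper_float_1,
   applied first at stride 1 and then at stride 2 -- a complete 4-point
   transform in plain scalar code. */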
static inline void helper_float_2(float *buf);
static inline void helper_float_2(float *buf) {
  for (int j = 0; j < 4; j += 2) {
    for (int k = 0; k < 1; ++k) {
      float u = buf[j + k];
      float v = buf[j + k + 1];
      buf[j + k] = u + v;
      buf[j + k + 1] = u - v;
    }
  }
  for (int j = 0; j < 4; j += 4) {
    for (int k = 0; k < 2; ++k) {
      float u = buf[j + k];
      float v = buf[j + k + 2];
      buf[j + k] = u + v;
      buf[j + k + 2] = u - v;
    }
  }
}
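/* helper_float_3 (added note): an 8-point transform done entirely inside
   one ymm register. Reading the asm: vpermilps $160/$245 splits each
   4-float lane into (a0,a0,a2,a2)/(a1,a1,a3,a3) and vaddsubps (fed the
   negated copy) yields (a0+a1, a0-a1, ...), the stride-1 butterflies;
   vpermilps $68/$238 plus vblendps $204 forms the stride-2 butterflies;
   vperm2f128 combines the two 128-bit halves for the stride-4 level. */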
static inline void helper_float_3(float *buf);
static inline void helper_float_3(float *buf) {
  for (int j = 0; j < 8; j += 8) {
    __asm__ volatile (
      "vmovups (%0), %%ymm0\n"
      "vpermilps $160, %%ymm0, %%ymm8\n"
      "vpermilps $245, %%ymm0, %%ymm9\n"
      "vxorps %%ymm10, %%ymm10, %%ymm10\n"
      "vsubps %%ymm9, %%ymm10, %%ymm11\n"
      "vaddsubps %%ymm11, %%ymm8, %%ymm0\n"
      "vpermilps $68, %%ymm0, %%ymm8\n"
      "vpermilps $238, %%ymm0, %%ymm9\n"
      "vxorps %%ymm10, %%ymm10, %%ymm10\n"
      "vsubps %%ymm9, %%ymm10, %%ymm11\n"
      "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
      "vaddps %%ymm8, %%ymm12, %%ymm0\n"
      "vxorps %%ymm8, %%ymm8, %%ymm8\n"
      "vsubps %%ymm0, %%ymm8, %%ymm9\n"
      "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n"
      "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n"
      "vaddps %%ymm10, %%ymm11, %%ymm0\n"
      "vmovups %%ymm0, (%0)\n"
      :: "r"(buf + j) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
    );
  }
}
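/* helper_float_4 (added note): two registers each undergo the in-register
   8-point transform of helper_float_3; a final vaddps/vsubps pair then
   performs the stride-8 butterflies across the two registers. */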
static inline void helper_float_4(float *buf);
static inline void helper_float_4(float *buf) {
  for (int j = 0; j < 16; j += 16) {
    for (int k = 0; k < 8; k += 8) {
      __asm__ volatile (
        "vmovups (%0), %%ymm0\n"
        "vmovups (%1), %%ymm1\n"
        "vpermilps $160, %%ymm0, %%ymm8\n"
        "vpermilps $245, %%ymm0, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubps %%ymm11, %%ymm8, %%ymm0\n"
        "vpermilps $160, %%ymm1, %%ymm8\n"
        "vpermilps $245, %%ymm1, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubps %%ymm11, %%ymm8, %%ymm1\n"
        "vpermilps $68, %%ymm0, %%ymm8\n"
        "vpermilps $238, %%ymm0, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
        "vaddps %%ymm8, %%ymm12, %%ymm0\n"
        "vpermilps $68, %%ymm1, %%ymm8\n"
        "vpermilps $238, %%ymm1, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
        "vaddps %%ymm8, %%ymm12, %%ymm1\n"
        "vxorps %%ymm8, %%ymm8, %%ymm8\n"
        "vsubps %%ymm0, %%ymm8, %%ymm9\n"
        "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n"
        "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n"
        "vaddps %%ymm10, %%ymm11, %%ymm0\n"
        "vxorps %%ymm8, %%ymm8, %%ymm8\n"
        "vsubps %%ymm1, %%ymm8, %%ymm9\n"
        "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n"
        "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n"
        "vaddps %%ymm10, %%ymm11, %%ymm1\n"
        "vaddps %%ymm1, %%ymm0, %%ymm8\n"
        "vsubps %%ymm1, %%ymm0, %%ymm9\n"
        "vmovups %%ymm8, (%0)\n"
        "vmovups %%ymm9, (%1)\n"
        :: "r"(buf + j + k + 0), "r"(buf + j + k + 8) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
      );
    }
  }
}
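/* helper_float_5 (added note): the same pattern with four registers,
   followed by two levels of cross-register butterflies (strides 8 and 16)
   for a full 32-point transform. */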
static inline void helper_float_5(float *buf);
static inline void helper_float_5(float *buf) {
  for (int j = 0; j < 32; j += 32) {
    for (int k = 0; k < 8; k += 8) {
      __asm__ volatile (
        "vmovups (%0), %%ymm0\n"
        "vmovups (%1), %%ymm1\n"
        "vmovups (%2), %%ymm2\n"
        "vmovups (%3), %%ymm3\n"
        "vpermilps $160, %%ymm0, %%ymm8\n"
        "vpermilps $245, %%ymm0, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubps %%ymm11, %%ymm8, %%ymm0\n"
        "vpermilps $160, %%ymm1, %%ymm8\n"
        "vpermilps $245, %%ymm1, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubps %%ymm11, %%ymm8, %%ymm1\n"
        "vpermilps $160, %%ymm2, %%ymm8\n"
        "vpermilps $245, %%ymm2, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubps %%ymm11, %%ymm8, %%ymm2\n"
        "vpermilps $160, %%ymm3, %%ymm8\n"
        "vpermilps $245, %%ymm3, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubps %%ymm11, %%ymm8, %%ymm3\n"
        "vpermilps $68, %%ymm0, %%ymm8\n"
        "vpermilps $238, %%ymm0, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
        "vaddps %%ymm8, %%ymm12, %%ymm0\n"
        "vpermilps $68, %%ymm1, %%ymm8\n"
        "vpermilps $238, %%ymm1, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
        "vaddps %%ymm8, %%ymm12, %%ymm1\n"
        "vpermilps $68, %%ymm2, %%ymm8\n"
        "vpermilps $238, %%ymm2, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
        "vaddps %%ymm8, %%ymm12, %%ymm2\n"
        "vpermilps $68, %%ymm3, %%ymm8\n"
        "vpermilps $238, %%ymm3, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
        "vaddps %%ymm8, %%ymm12, %%ymm3\n"
        "vxorps %%ymm8, %%ymm8, %%ymm8\n"
        "vsubps %%ymm0, %%ymm8, %%ymm9\n"
        "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n"
        "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n"
        "vaddps %%ymm10, %%ymm11, %%ymm0\n"
        "vxorps %%ymm8, %%ymm8, %%ymm8\n"
        "vsubps %%ymm1, %%ymm8, %%ymm9\n"
        "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n"
        "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n"
        "vaddps %%ymm10, %%ymm11, %%ymm1\n"
        "vxorps %%ymm8, %%ymm8, %%ymm8\n"
        "vsubps %%ymm2, %%ymm8, %%ymm9\n"
        "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n"
        "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n"
        "vaddps %%ymm10, %%ymm11, %%ymm2\n"
        "vxorps %%ymm8, %%ymm8, %%ymm8\n"
        "vsubps %%ymm3, %%ymm8, %%ymm9\n"
        "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n"
        "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n"
        "vaddps %%ymm10, %%ymm11, %%ymm3\n"
        "vaddps %%ymm1, %%ymm0, %%ymm8\n"
        "vsubps %%ymm1, %%ymm0, %%ymm9\n"
        "vaddps %%ymm3, %%ymm2, %%ymm10\n"
        "vsubps %%ymm3, %%ymm2, %%ymm11\n"
        "vaddps %%ymm10, %%ymm8, %%ymm0\n"
        "vsubps %%ymm10, %%ymm8, %%ymm2\n"
        "vaddps %%ymm11, %%ymm9, %%ymm1\n"
        "vsubps %%ymm11, %%ymm9, %%ymm3\n"
        "vmovups %%ymm0, (%0)\n"
        "vmovups %%ymm1, (%1)\n"
        "vmovups %%ymm2, (%2)\n"
        "vmovups %%ymm3, (%3)\n"
        :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
      );
    }
  }
}
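/* helper_float_6 (added note): eight registers cover all 64 floats, so the
   whole 64-point transform, including the stride-8/16/32 levels, runs in
   registers with one load and one store per element. */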
static inline void helper_float_6(float *buf);
static inline void helper_float_6(float *buf) {
  for (int j = 0; j < 64; j += 64) {
    for (int k = 0; k < 8; k += 8) {
      __asm__ volatile (
        "vmovups (%0), %%ymm0\n"
        "vmovups (%1), %%ymm1\n"
        "vmovups (%2), %%ymm2\n"
        "vmovups (%3), %%ymm3\n"
        "vmovups (%4), %%ymm4\n"
        "vmovups (%5), %%ymm5\n"
        "vmovups (%6), %%ymm6\n"
        "vmovups (%7), %%ymm7\n"
        "vpermilps $160, %%ymm0, %%ymm8\n"
        "vpermilps $245, %%ymm0, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubps %%ymm11, %%ymm8, %%ymm0\n"
        "vpermilps $160, %%ymm1, %%ymm8\n"
        "vpermilps $245, %%ymm1, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubps %%ymm11, %%ymm8, %%ymm1\n"
        "vpermilps $160, %%ymm2, %%ymm8\n"
        "vpermilps $245, %%ymm2, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubps %%ymm11, %%ymm8, %%ymm2\n"
        "vpermilps $160, %%ymm3, %%ymm8\n"
        "vpermilps $245, %%ymm3, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubps %%ymm11, %%ymm8, %%ymm3\n"
        "vpermilps $160, %%ymm4, %%ymm8\n"
        "vpermilps $245, %%ymm4, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubps %%ymm11, %%ymm8, %%ymm4\n"
        "vpermilps $160, %%ymm5, %%ymm8\n"
        "vpermilps $245, %%ymm5, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubps %%ymm11, %%ymm8, %%ymm5\n"
        "vpermilps $160, %%ymm6, %%ymm8\n"
        "vpermilps $245, %%ymm6, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubps %%ymm11, %%ymm8, %%ymm6\n"
        "vpermilps $160, %%ymm7, %%ymm8\n"
        "vpermilps $245, %%ymm7, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubps %%ymm11, %%ymm8, %%ymm7\n"
        "vpermilps $68, %%ymm0, %%ymm8\n"
        "vpermilps $238, %%ymm0, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
        "vaddps %%ymm8, %%ymm12, %%ymm0\n"
        "vpermilps $68, %%ymm1, %%ymm8\n"
        "vpermilps $238, %%ymm1, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
        "vaddps %%ymm8, %%ymm12, %%ymm1\n"
        "vpermilps $68, %%ymm2, %%ymm8\n"
        "vpermilps $238, %%ymm2, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
        "vaddps %%ymm8, %%ymm12, %%ymm2\n"
        "vpermilps $68, %%ymm3, %%ymm8\n"
        "vpermilps $238, %%ymm3, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
        "vaddps %%ymm8, %%ymm12, %%ymm3\n"
        "vpermilps $68, %%ymm4, %%ymm8\n"
        "vpermilps $238, %%ymm4, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
        "vaddps %%ymm8, %%ymm12, %%ymm4\n"
        "vpermilps $68, %%ymm5, %%ymm8\n"
        "vpermilps $238, %%ymm5, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
        "vaddps %%ymm8, %%ymm12, %%ymm5\n"
        "vpermilps $68, %%ymm6, %%ymm8\n"
        "vpermilps $238, %%ymm6, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
        "vaddps %%ymm8, %%ymm12, %%ymm6\n"
        "vpermilps $68, %%ymm7, %%ymm8\n"
        "vpermilps $238, %%ymm7, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
        "vaddps %%ymm8, %%ymm12, %%ymm7\n"
        "vxorps %%ymm8, %%ymm8, %%ymm8\n"
        "vsubps %%ymm0, %%ymm8, %%ymm9\n"
        "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n"
        "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n"
        "vaddps %%ymm10, %%ymm11, %%ymm0\n"
        "vxorps %%ymm8, %%ymm8, %%ymm8\n"
        "vsubps %%ymm1, %%ymm8, %%ymm9\n"
        "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n"
        "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n"
        "vaddps %%ymm10, %%ymm11, %%ymm1\n"
        "vxorps %%ymm8, %%ymm8, %%ymm8\n"
        "vsubps %%ymm2, %%ymm8, %%ymm9\n"
        "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n"
        "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n"
        "vaddps %%ymm10, %%ymm11, %%ymm2\n"
        "vxorps %%ymm8, %%ymm8, %%ymm8\n"
        "vsubps %%ymm3, %%ymm8, %%ymm9\n"
        "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n"
        "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n"
        "vaddps %%ymm10, %%ymm11, %%ymm3\n"
        "vxorps %%ymm8, %%ymm8, %%ymm8\n"
        "vsubps %%ymm4, %%ymm8, %%ymm9\n"
        "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n"
        "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n"
        "vaddps %%ymm10, %%ymm11, %%ymm4\n"
        "vxorps %%ymm8, %%ymm8, %%ymm8\n"
        "vsubps %%ymm5, %%ymm8, %%ymm9\n"
        "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n"
        "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n"
        "vaddps %%ymm10, %%ymm11, %%ymm5\n"
        "vxorps %%ymm8, %%ymm8, %%ymm8\n"
        "vsubps %%ymm6, %%ymm8, %%ymm9\n"
        "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n"
        "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n"
        "vaddps %%ymm10, %%ymm11, %%ymm6\n"
        "vxorps %%ymm8, %%ymm8, %%ymm8\n"
        "vsubps %%ymm7, %%ymm8, %%ymm9\n"
        "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n"
        "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n"
        "vaddps %%ymm10, %%ymm11, %%ymm7\n"
        "vaddps %%ymm1, %%ymm0, %%ymm8\n"
        "vsubps %%ymm1, %%ymm0, %%ymm9\n"
        "vaddps %%ymm3, %%ymm2, %%ymm10\n"
        "vsubps %%ymm3, %%ymm2, %%ymm11\n"
        "vaddps %%ymm5, %%ymm4, %%ymm12\n"
        "vsubps %%ymm5, %%ymm4, %%ymm13\n"
        "vaddps %%ymm7, %%ymm6, %%ymm14\n"
        "vsubps %%ymm7, %%ymm6, %%ymm15\n"
        "vaddps %%ymm10, %%ymm8, %%ymm0\n"
        "vsubps %%ymm10, %%ymm8, %%ymm2\n"
        "vaddps %%ymm11, %%ymm9, %%ymm1\n"
        "vsubps %%ymm11, %%ymm9, %%ymm3\n"
        "vaddps %%ymm14, %%ymm12, %%ymm4\n"
        "vsubps %%ymm14, %%ymm12, %%ymm6\n"
        "vaddps %%ymm15, %%ymm13, %%ymm5\n"
        "vsubps %%ymm15, %%ymm13, %%ymm7\n"
        "vaddps %%ymm4, %%ymm0, %%ymm8\n"
        "vsubps %%ymm4, %%ymm0, %%ymm12\n"
        "vaddps %%ymm5, %%ymm1, %%ymm9\n"
        "vsubps %%ymm5, %%ymm1, %%ymm13\n"
        "vaddps %%ymm6, %%ymm2, %%ymm10\n"
        "vsubps %%ymm6, %%ymm2, %%ymm14\n"
        "vaddps %%ymm7, %%ymm3, %%ymm11\n"
        "vsubps %%ymm7, %%ymm3, %%ymm15\n"
        "vmovups %%ymm8, (%0)\n"
        "vmovups %%ymm9, (%1)\n"
        "vmovups %%ymm10, (%2)\n"
        "vmovups %%ymm11, (%3)\n"
        "vmovups %%ymm12, (%4)\n"
        "vmovups %%ymm13, (%5)\n"
        "vmovups %%ymm14, (%6)\n"
        "vmovups %%ymm15, (%7)\n"
        :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
      );
    }
  }
}
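/* helper_float_7_recursive (added note): despite the name, the 128-point
   case is handled directly: two 64-point blocks as in helper_float_6,
   followed by one stride-64 butterfly pass over memory. */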
void helper_float_7_recursive(float *buf, int depth);
void helper_float_7_recursive(float *buf, int depth) {
  if (depth == 7) {
    for (int j = 0; j < 128; j += 64) {
      for (int k = 0; k < 8; k += 8) {
        __asm__ volatile (
          "vmovups (%0), %%ymm0\n"
          "vmovups (%1), %%ymm1\n"
          "vmovups (%2), %%ymm2\n"
          "vmovups (%3), %%ymm3\n"
          "vmovups (%4), %%ymm4\n"
          "vmovups (%5), %%ymm5\n"
          "vmovups (%6), %%ymm6\n"
          "vmovups (%7), %%ymm7\n"
          "vpermilps $160, %%ymm0, %%ymm8\n"
          "vpermilps $245, %%ymm0, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm0\n"
          "vpermilps $160, %%ymm1, %%ymm8\n"
          "vpermilps $245, %%ymm1, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm1\n"
          "vpermilps $160, %%ymm2, %%ymm8\n"
          "vpermilps $245, %%ymm2, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm2\n"
          "vpermilps $160, %%ymm3, %%ymm8\n"
          "vpermilps $245, %%ymm3, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm3\n"
          "vpermilps $160, %%ymm4, %%ymm8\n"
          "vpermilps $245, %%ymm4, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm4\n"
          "vpermilps $160, %%ymm5, %%ymm8\n"
          "vpermilps $245, %%ymm5, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm5\n"
          "vpermilps $160, %%ymm6, %%ymm8\n"
          "vpermilps $245, %%ymm6, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm6\n"
          "vpermilps $160, %%ymm7, %%ymm8\n"
          "vpermilps $245, %%ymm7, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm7\n"
          "vpermilps $68, %%ymm0, %%ymm8\n"
          "vpermilps $238, %%ymm0, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm0\n"
          "vpermilps $68, %%ymm1, %%ymm8\n"
          "vpermilps $238, %%ymm1, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm1\n"
          "vpermilps $68, %%ymm2, %%ymm8\n"
          "vpermilps $238, %%ymm2, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm2\n"
          "vpermilps $68, %%ymm3, %%ymm8\n"
          "vpermilps $238, %%ymm3, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm3\n"
          "vpermilps $68, %%ymm4, %%ymm8\n"
          "vpermilps $238, %%ymm4, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm4\n"
          "vpermilps $68, %%ymm5, %%ymm8\n"
          "vpermilps $238, %%ymm5, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm5\n"
          "vpermilps $68, %%ymm6, %%ymm8\n"
          "vpermilps $238, %%ymm6, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm6\n"
          "vpermilps $68, %%ymm7, %%ymm8\n"
          "vpermilps $238, %%ymm7, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm7\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm0, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm0\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm1, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm1\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm2, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm2\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm3, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm3\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm4, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm4\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm5, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm5\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm6, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm6\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm7, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm7\n"
          "vaddps %%ymm1, %%ymm0, %%ymm8\n"
          "vsubps %%ymm1, %%ymm0, %%ymm9\n"
          "vaddps %%ymm3, %%ymm2, %%ymm10\n"
          "vsubps %%ymm3, %%ymm2, %%ymm11\n"
          "vaddps %%ymm5, %%ymm4, %%ymm12\n"
          "vsubps %%ymm5, %%ymm4, %%ymm13\n"
          "vaddps %%ymm7, %%ymm6, %%ymm14\n"
          "vsubps %%ymm7, %%ymm6, %%ymm15\n"
          "vaddps %%ymm10, %%ymm8, %%ymm0\n"
          "vsubps %%ymm10, %%ymm8, %%ymm2\n"
          "vaddps %%ymm11, %%ymm9, %%ymm1\n"
          "vsubps %%ymm11, %%ymm9, %%ymm3\n"
          "vaddps %%ymm14, %%ymm12, %%ymm4\n"
          "vsubps %%ymm14, %%ymm12, %%ymm6\n"
          "vaddps %%ymm15, %%ymm13, %%ymm5\n"
          "vsubps %%ymm15, %%ymm13, %%ymm7\n"
          "vaddps %%ymm4, %%ymm0, %%ymm8\n"
          "vsubps %%ymm4, %%ymm0, %%ymm12\n"
          "vaddps %%ymm5, %%ymm1, %%ymm9\n"
          "vsubps %%ymm5, %%ymm1, %%ymm13\n"
          "vaddps %%ymm6, %%ymm2, %%ymm10\n"
          "vsubps %%ymm6, %%ymm2, %%ymm14\n"
          "vaddps %%ymm7, %%ymm3, %%ymm11\n"
          "vsubps %%ymm7, %%ymm3, %%ymm15\n"
          "vmovups %%ymm8, (%0)\n"
          "vmovups %%ymm9, (%1)\n"
          "vmovups %%ymm10, (%2)\n"
          "vmovups %%ymm11, (%3)\n"
          "vmovups %%ymm12, (%4)\n"
          "vmovups %%ymm13, (%5)\n"
          "vmovups %%ymm14, (%6)\n"
          "vmovups %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    for (int j = 0; j < 128; j += 128) {
      for (int k = 0; k < 64; k += 8) {
        __asm__ volatile (
          "vmovups (%0), %%ymm0\n"
          "vmovups (%1), %%ymm1\n"
          "vaddps %%ymm1, %%ymm0, %%ymm8\n"
          "vsubps %%ymm1, %%ymm0, %%ymm9\n"
          "vmovups %%ymm8, (%0)\n"
          "vmovups %%ymm9, (%1)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 64) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
}
void helper_float_7(float *buf);
void helper_float_7(float *buf) {
  helper_float_7_recursive(buf, 7);
}
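/* helper_float_8_recursive (added note): the 256-point case splits into
   four 64-point sub-transforms (depth 6) and finishes with a radix-4
   combine pass, i.e. the stride-64 and stride-128 butterfly levels fused
   into one loop. */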
void helper_float_8_recursive(float *buf, int depth);
void helper_float_8_recursive(float *buf, int depth) {
  if (depth == 6) {
    for (int j = 0; j < 64; j += 64) {
      for (int k = 0; k < 8; k += 8) {
        __asm__ volatile (
          "vmovups (%0), %%ymm0\n"
          "vmovups (%1), %%ymm1\n"
          "vmovups (%2), %%ymm2\n"
          "vmovups (%3), %%ymm3\n"
          "vmovups (%4), %%ymm4\n"
          "vmovups (%5), %%ymm5\n"
          "vmovups (%6), %%ymm6\n"
          "vmovups (%7), %%ymm7\n"
          "vpermilps $160, %%ymm0, %%ymm8\n"
          "vpermilps $245, %%ymm0, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm0\n"
          "vpermilps $160, %%ymm1, %%ymm8\n"
          "vpermilps $245, %%ymm1, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm1\n"
          "vpermilps $160, %%ymm2, %%ymm8\n"
          "vpermilps $245, %%ymm2, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm2\n"
          "vpermilps $160, %%ymm3, %%ymm8\n"
          "vpermilps $245, %%ymm3, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm3\n"
          "vpermilps $160, %%ymm4, %%ymm8\n"
          "vpermilps $245, %%ymm4, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm4\n"
          "vpermilps $160, %%ymm5, %%ymm8\n"
          "vpermilps $245, %%ymm5, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm5\n"
          "vpermilps $160, %%ymm6, %%ymm8\n"
          "vpermilps $245, %%ymm6, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm6\n"
          "vpermilps $160, %%ymm7, %%ymm8\n"
          "vpermilps $245, %%ymm7, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm7\n"
          "vpermilps $68, %%ymm0, %%ymm8\n"
          "vpermilps $238, %%ymm0, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm0\n"
          "vpermilps $68, %%ymm1, %%ymm8\n"
          "vpermilps $238, %%ymm1, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm1\n"
          "vpermilps $68, %%ymm2, %%ymm8\n"
          "vpermilps $238, %%ymm2, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm2\n"
          "vpermilps $68, %%ymm3, %%ymm8\n"
          "vpermilps $238, %%ymm3, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm3\n"
          "vpermilps $68, %%ymm4, %%ymm8\n"
          "vpermilps $238, %%ymm4, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm4\n"
          "vpermilps $68, %%ymm5, %%ymm8\n"
          "vpermilps $238, %%ymm5, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm5\n"
          "vpermilps $68, %%ymm6, %%ymm8\n"
          "vpermilps $238, %%ymm6, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm6\n"
          "vpermilps $68, %%ymm7, %%ymm8\n"
          "vpermilps $238, %%ymm7, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm7\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm0, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm0\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm1, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm1\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm2, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm2\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm3, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm3\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm4, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm4\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm5, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm5\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm6, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm6\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm7, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm7\n"
          "vaddps %%ymm1, %%ymm0, %%ymm8\n"
          "vsubps %%ymm1, %%ymm0, %%ymm9\n"
          "vaddps %%ymm3, %%ymm2, %%ymm10\n"
          "vsubps %%ymm3, %%ymm2, %%ymm11\n"
          "vaddps %%ymm5, %%ymm4, %%ymm12\n"
          "vsubps %%ymm5, %%ymm4, %%ymm13\n"
          "vaddps %%ymm7, %%ymm6, %%ymm14\n"
          "vsubps %%ymm7, %%ymm6, %%ymm15\n"
          "vaddps %%ymm10, %%ymm8, %%ymm0\n"
          "vsubps %%ymm10, %%ymm8, %%ymm2\n"
          "vaddps %%ymm11, %%ymm9, %%ymm1\n"
          "vsubps %%ymm11, %%ymm9, %%ymm3\n"
          "vaddps %%ymm14, %%ymm12, %%ymm4\n"
          "vsubps %%ymm14, %%ymm12, %%ymm6\n"
          "vaddps %%ymm15, %%ymm13, %%ymm5\n"
          "vsubps %%ymm15, %%ymm13, %%ymm7\n"
          "vaddps %%ymm4, %%ymm0, %%ymm8\n"
          "vsubps %%ymm4, %%ymm0, %%ymm12\n"
          "vaddps %%ymm5, %%ymm1, %%ymm9\n"
          "vsubps %%ymm5, %%ymm1, %%ymm13\n"
          "vaddps %%ymm6, %%ymm2, %%ymm10\n"
          "vsubps %%ymm6, %%ymm2, %%ymm14\n"
          "vaddps %%ymm7, %%ymm3, %%ymm11\n"
          "vsubps %%ymm7, %%ymm3, %%ymm15\n"
          "vmovups %%ymm8, (%0)\n"
          "vmovups %%ymm9, (%1)\n"
          "vmovups %%ymm10, (%2)\n"
          "vmovups %%ymm11, (%3)\n"
          "vmovups %%ymm12, (%4)\n"
          "vmovups %%ymm13, (%5)\n"
          "vmovups %%ymm14, (%6)\n"
          "vmovups %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 8) {
    helper_float_8_recursive(buf + 0, 6);
    helper_float_8_recursive(buf + 64, 6);
    helper_float_8_recursive(buf + 128, 6);
    helper_float_8_recursive(buf + 192, 6);
    for (int j = 0; j < 256; j += 256) {
      for (int k = 0; k < 64; k += 8) {
        __asm__ volatile (
          "vmovups (%0), %%ymm0\n"
          "vmovups (%1), %%ymm1\n"
          "vmovups (%2), %%ymm2\n"
          "vmovups (%3), %%ymm3\n"
          "vaddps %%ymm1, %%ymm0, %%ymm8\n"
          "vsubps %%ymm1, %%ymm0, %%ymm9\n"
          "vaddps %%ymm3, %%ymm2, %%ymm10\n"
          "vsubps %%ymm3, %%ymm2, %%ymm11\n"
          "vaddps %%ymm10, %%ymm8, %%ymm0\n"
          "vsubps %%ymm10, %%ymm8, %%ymm2\n"
          "vaddps %%ymm11, %%ymm9, %%ymm1\n"
          "vsubps %%ymm11, %%ymm9, %%ymm3\n"
          "vmovups %%ymm0, (%0)\n"
          "vmovups %%ymm1, (%1)\n"
          "vmovups %%ymm2, (%2)\n"
          "vmovups %%ymm3, (%3)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
}
void helper_float_8(float *buf);
void helper_float_8(float *buf) {
  helper_float_8_recursive(buf, 8);
}
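/* helper_float_9 (added note): eight 64-point blocks followed by a radix-8
   combine pass that fuses the stride-64, stride-128, and stride-256
   butterfly levels into a single sweep over memory. */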
static inline void helper_float_9(float *buf);
static inline void helper_float_9(float *buf) {
  for (int j = 0; j < 512; j += 64) {
    for (int k = 0; k < 8; k += 8) {
      __asm__ volatile (
        "vmovups (%0), %%ymm0\n"
        "vmovups (%1), %%ymm1\n"
        "vmovups (%2), %%ymm2\n"
        "vmovups (%3), %%ymm3\n"
        "vmovups (%4), %%ymm4\n"
        "vmovups (%5), %%ymm5\n"
        "vmovups (%6), %%ymm6\n"
        "vmovups (%7), %%ymm7\n"
        "vpermilps $160, %%ymm0, %%ymm8\n"
        "vpermilps $245, %%ymm0, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubps %%ymm11, %%ymm8, %%ymm0\n"
        "vpermilps $160, %%ymm1, %%ymm8\n"
        "vpermilps $245, %%ymm1, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubps %%ymm11, %%ymm8, %%ymm1\n"
        "vpermilps $160, %%ymm2, %%ymm8\n"
        "vpermilps $245, %%ymm2, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubps %%ymm11, %%ymm8, %%ymm2\n"
        "vpermilps $160, %%ymm3, %%ymm8\n"
        "vpermilps $245, %%ymm3, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubps %%ymm11, %%ymm8, %%ymm3\n"
        "vpermilps $160, %%ymm4, %%ymm8\n"
        "vpermilps $245, %%ymm4, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubps %%ymm11, %%ymm8, %%ymm4\n"
        "vpermilps $160, %%ymm5, %%ymm8\n"
        "vpermilps $245, %%ymm5, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubps %%ymm11, %%ymm8, %%ymm5\n"
        "vpermilps $160, %%ymm6, %%ymm8\n"
        "vpermilps $245, %%ymm6, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubps %%ymm11, %%ymm8, %%ymm6\n"
        "vpermilps $160, %%ymm7, %%ymm8\n"
        "vpermilps $245, %%ymm7, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubps %%ymm11, %%ymm8, %%ymm7\n"
        "vpermilps $68, %%ymm0, %%ymm8\n"
        "vpermilps $238, %%ymm0, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
        "vaddps %%ymm8, %%ymm12, %%ymm0\n"
        "vpermilps $68, %%ymm1, %%ymm8\n"
        "vpermilps $238, %%ymm1, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
        "vaddps %%ymm8, %%ymm12, %%ymm1\n"
        "vpermilps $68, %%ymm2, %%ymm8\n"
        "vpermilps $238, %%ymm2, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
        "vaddps %%ymm8, %%ymm12, %%ymm2\n"
        "vpermilps $68, %%ymm3, %%ymm8\n"
        "vpermilps $238, %%ymm3, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
        "vaddps %%ymm8, %%ymm12, %%ymm3\n"
        "vpermilps $68, %%ymm4, %%ymm8\n"
        "vpermilps $238, %%ymm4, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
        "vaddps %%ymm8, %%ymm12, %%ymm4\n"
        "vpermilps $68, %%ymm5, %%ymm8\n"
        "vpermilps $238, %%ymm5, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
        "vaddps %%ymm8, %%ymm12, %%ymm5\n"
        "vpermilps $68, %%ymm6, %%ymm8\n"
        "vpermilps $238, %%ymm6, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
        "vaddps %%ymm8, %%ymm12, %%ymm6\n"
        "vpermilps $68, %%ymm7, %%ymm8\n"
        "vpermilps $238, %%ymm7, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
        "vaddps %%ymm8, %%ymm12, %%ymm7\n"
        "vxorps %%ymm8, %%ymm8, %%ymm8\n"
        "vsubps %%ymm0, %%ymm8, %%ymm9\n"
        "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n"
        "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n"
        "vaddps %%ymm10, %%ymm11, %%ymm0\n"
        "vxorps %%ymm8, %%ymm8, %%ymm8\n"
        "vsubps %%ymm1, %%ymm8, %%ymm9\n"
        "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n"
        "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n"
        "vaddps %%ymm10, %%ymm11, %%ymm1\n"
        "vxorps %%ymm8, %%ymm8, %%ymm8\n"
        "vsubps %%ymm2, %%ymm8, %%ymm9\n"
        "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n"
        "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n"
        "vaddps %%ymm10, %%ymm11, %%ymm2\n"
        "vxorps %%ymm8, %%ymm8, %%ymm8\n"
        "vsubps %%ymm3, %%ymm8, %%ymm9\n"
        "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n"
        "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n"
        "vaddps %%ymm10, %%ymm11, %%ymm3\n"
        "vxorps %%ymm8, %%ymm8, %%ymm8\n"
        "vsubps %%ymm4, %%ymm8, %%ymm9\n"
        "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n"
        "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n"
        "vaddps %%ymm10, %%ymm11, %%ymm4\n"
        "vxorps %%ymm8, %%ymm8, %%ymm8\n"
        "vsubps %%ymm5, %%ymm8, %%ymm9\n"
        "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n"
        "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n"
        "vaddps %%ymm10, %%ymm11, %%ymm5\n"
        "vxorps %%ymm8, %%ymm8, %%ymm8\n"
        "vsubps %%ymm6, %%ymm8, %%ymm9\n"
        "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n"
        "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n"
        "vaddps %%ymm10, %%ymm11, %%ymm6\n"
        "vxorps %%ymm8, %%ymm8, %%ymm8\n"
        "vsubps %%ymm7, %%ymm8, %%ymm9\n"
        "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n"
        "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n"
        "vaddps %%ymm10, %%ymm11, %%ymm7\n"
        "vaddps %%ymm1, %%ymm0, %%ymm8\n"
        "vsubps %%ymm1, %%ymm0, %%ymm9\n"
        "vaddps %%ymm3, %%ymm2, %%ymm10\n"
        "vsubps %%ymm3, %%ymm2, %%ymm11\n"
        "vaddps %%ymm5, %%ymm4, %%ymm12\n"
        "vsubps %%ymm5, %%ymm4, %%ymm13\n"
        "vaddps %%ymm7, %%ymm6, %%ymm14\n"
        "vsubps %%ymm7, %%ymm6, %%ymm15\n"
        "vaddps %%ymm10, %%ymm8, %%ymm0\n"
        "vsubps %%ymm10, %%ymm8, %%ymm2\n"
        "vaddps %%ymm11, %%ymm9, %%ymm1\n"
        "vsubps %%ymm11, %%ymm9, %%ymm3\n"
        "vaddps %%ymm14, %%ymm12, %%ymm4\n"
        "vsubps %%ymm14, %%ymm12, %%ymm6\n"
        "vaddps %%ymm15, %%ymm13, %%ymm5\n"
        "vsubps %%ymm15, %%ymm13, %%ymm7\n"
        "vaddps %%ymm4, %%ymm0, %%ymm8\n"
        "vsubps %%ymm4, %%ymm0, %%ymm12\n"
        "vaddps %%ymm5, %%ymm1, %%ymm9\n"
        "vsubps %%ymm5, %%ymm1, %%ymm13\n"
        "vaddps %%ymm6, %%ymm2, %%ymm10\n"
        "vsubps %%ymm6, %%ymm2, %%ymm14\n"
        "vaddps %%ymm7, %%ymm3, %%ymm11\n"
        "vsubps %%ymm7, %%ymm3, %%ymm15\n"
        "vmovups %%ymm8, (%0)\n"
        "vmovups %%ymm9, (%1)\n"
        "vmovups %%ymm10, (%2)\n"
        "vmovups %%ymm11, (%3)\n"
        "vmovups %%ymm12, (%4)\n"
        "vmovups %%ymm13, (%5)\n"
        "vmovups %%ymm14, (%6)\n"
        "vmovups %%ymm15, (%7)\n"
        :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
      );
    }
  }
  for (int j = 0; j < 512; j += 512) {
    for (int k = 0; k < 64; k += 8) {
      __asm__ volatile (
        "vmovups (%0), %%ymm0\n"
        "vmovups (%1), %%ymm1\n"
        "vmovups (%2), %%ymm2\n"
        "vmovups (%3), %%ymm3\n"
        "vmovups (%4), %%ymm4\n"
        "vmovups (%5), %%ymm5\n"
        "vmovups (%6), %%ymm6\n"
        "vmovups (%7), %%ymm7\n"
        "vaddps %%ymm1, %%ymm0, %%ymm8\n"
        "vsubps %%ymm1, %%ymm0, %%ymm9\n"
        "vaddps %%ymm3, %%ymm2, %%ymm10\n"
        "vsubps %%ymm3, %%ymm2, %%ymm11\n"
        "vaddps %%ymm5, %%ymm4, %%ymm12\n"
        "vsubps %%ymm5, %%ymm4, %%ymm13\n"
        "vaddps %%ymm7, %%ymm6, %%ymm14\n"
        "vsubps %%ymm7, %%ymm6, %%ymm15\n"
        "vaddps %%ymm10, %%ymm8, %%ymm0\n"
        "vsubps %%ymm10, %%ymm8, %%ymm2\n"
        "vaddps %%ymm11, %%ymm9, %%ymm1\n"
        "vsubps %%ymm11, %%ymm9, %%ymm3\n"
        "vaddps %%ymm14, %%ymm12, %%ymm4\n"
        "vsubps %%ymm14, %%ymm12, %%ymm6\n"
        "vaddps %%ymm15, %%ymm13, %%ymm5\n"
        "vsubps %%ymm15, %%ymm13, %%ymm7\n"
        "vaddps %%ymm4, %%ymm0, %%ymm8\n"
        "vsubps %%ymm4, %%ymm0, %%ymm12\n"
        "vaddps %%ymm5, %%ymm1, %%ymm9\n"
        "vsubps %%ymm5, %%ymm1, %%ymm13\n"
        "vaddps %%ymm6, %%ymm2, %%ymm10\n"
        "vsubps %%ymm6, %%ymm2, %%ymm14\n"
        "vaddps %%ymm7, %%ymm3, %%ymm11\n"
        "vsubps %%ymm7, %%ymm3, %%ymm15\n"
        "vmovups %%ymm8, (%0)\n"
        "vmovups %%ymm9, (%1)\n"
        "vmovups %%ymm10, (%2)\n"
        "vmovups %%ymm11, (%3)\n"
        "vmovups %%ymm12, (%4)\n"
        "vmovups %%ymm13, (%5)\n"
        "vmovups %%ymm14, (%6)\n"
        "vmovups %%ymm15, (%7)\n"
        :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
      );
    }
  }
}
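/* helper_float_10_recursive (added note): the 1024-point case applies the
   64-point in-register transform to each 64-float block, then combines
   blocks with radix-8 butterfly passes over each 512-float half. */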
1012 void helper_float_10_recursive(float *buf, int depth);
helper_float_10_recursive(float * buf,int depth)1013 void helper_float_10_recursive(float *buf, int depth) {
1014   if (depth == 10) {
1015     for (int j = 0; j < 1024; j += 64) {
1016       for (int k = 0; k < 8; k += 8) {
1017         __asm__ volatile (
1018           "vmovups (%0), %%ymm0\n"
1019           "vmovups (%1), %%ymm1\n"
1020           "vmovups (%2), %%ymm2\n"
1021           "vmovups (%3), %%ymm3\n"
1022           "vmovups (%4), %%ymm4\n"
1023           "vmovups (%5), %%ymm5\n"
1024           "vmovups (%6), %%ymm6\n"
1025           "vmovups (%7), %%ymm7\n"
1026           "vpermilps $160, %%ymm0, %%ymm8\n"
1027           "vpermilps $245, %%ymm0, %%ymm9\n"
1028           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1029           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1030           "vaddsubps %%ymm11, %%ymm8, %%ymm0\n"
1031           "vpermilps $160, %%ymm1, %%ymm8\n"
1032           "vpermilps $245, %%ymm1, %%ymm9\n"
1033           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1034           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1035           "vaddsubps %%ymm11, %%ymm8, %%ymm1\n"
1036           "vpermilps $160, %%ymm2, %%ymm8\n"
1037           "vpermilps $245, %%ymm2, %%ymm9\n"
1038           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1039           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1040           "vaddsubps %%ymm11, %%ymm8, %%ymm2\n"
1041           "vpermilps $160, %%ymm3, %%ymm8\n"
1042           "vpermilps $245, %%ymm3, %%ymm9\n"
1043           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1044           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1045           "vaddsubps %%ymm11, %%ymm8, %%ymm3\n"
1046           "vpermilps $160, %%ymm4, %%ymm8\n"
1047           "vpermilps $245, %%ymm4, %%ymm9\n"
1048           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1049           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1050           "vaddsubps %%ymm11, %%ymm8, %%ymm4\n"
1051           "vpermilps $160, %%ymm5, %%ymm8\n"
1052           "vpermilps $245, %%ymm5, %%ymm9\n"
1053           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1054           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1055           "vaddsubps %%ymm11, %%ymm8, %%ymm5\n"
1056           "vpermilps $160, %%ymm6, %%ymm8\n"
1057           "vpermilps $245, %%ymm6, %%ymm9\n"
1058           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1059           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1060           "vaddsubps %%ymm11, %%ymm8, %%ymm6\n"
1061           "vpermilps $160, %%ymm7, %%ymm8\n"
1062           "vpermilps $245, %%ymm7, %%ymm9\n"
1063           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1064           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1065           "vaddsubps %%ymm11, %%ymm8, %%ymm7\n"
1066           "vpermilps $68, %%ymm0, %%ymm8\n"
1067           "vpermilps $238, %%ymm0, %%ymm9\n"
1068           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1069           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1070           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
1071           "vaddps %%ymm8, %%ymm12, %%ymm0\n"
1072           "vpermilps $68, %%ymm1, %%ymm8\n"
1073           "vpermilps $238, %%ymm1, %%ymm9\n"
1074           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1075           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1076           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
1077           "vaddps %%ymm8, %%ymm12, %%ymm1\n"
1078           "vpermilps $68, %%ymm2, %%ymm8\n"
1079           "vpermilps $238, %%ymm2, %%ymm9\n"
1080           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1081           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1082           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
1083           "vaddps %%ymm8, %%ymm12, %%ymm2\n"
1084           "vpermilps $68, %%ymm3, %%ymm8\n"
1085           "vpermilps $238, %%ymm3, %%ymm9\n"
1086           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1087           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1088           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
1089           "vaddps %%ymm8, %%ymm12, %%ymm3\n"
1090           "vpermilps $68, %%ymm4, %%ymm8\n"
1091           "vpermilps $238, %%ymm4, %%ymm9\n"
1092           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1093           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1094           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
1095           "vaddps %%ymm8, %%ymm12, %%ymm4\n"
1096           "vpermilps $68, %%ymm5, %%ymm8\n"
1097           "vpermilps $238, %%ymm5, %%ymm9\n"
1098           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1099           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1100           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
1101           "vaddps %%ymm8, %%ymm12, %%ymm5\n"
1102           "vpermilps $68, %%ymm6, %%ymm8\n"
1103           "vpermilps $238, %%ymm6, %%ymm9\n"
1104           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1105           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1106           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
1107           "vaddps %%ymm8, %%ymm12, %%ymm6\n"
1108           "vpermilps $68, %%ymm7, %%ymm8\n"
1109           "vpermilps $238, %%ymm7, %%ymm9\n"
1110           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1111           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1112           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
1113           "vaddps %%ymm8, %%ymm12, %%ymm7\n"
1114           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
1115           "vsubps %%ymm0, %%ymm8, %%ymm9\n"
1116           "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n"
1117           "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n"
1118           "vaddps %%ymm10, %%ymm11, %%ymm0\n"
1119           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
1120           "vsubps %%ymm1, %%ymm8, %%ymm9\n"
1121           "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n"
1122           "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n"
1123           "vaddps %%ymm10, %%ymm11, %%ymm1\n"
1124           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
1125           "vsubps %%ymm2, %%ymm8, %%ymm9\n"
1126           "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n"
1127           "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n"
1128           "vaddps %%ymm10, %%ymm11, %%ymm2\n"
1129           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
1130           "vsubps %%ymm3, %%ymm8, %%ymm9\n"
1131           "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n"
1132           "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n"
1133           "vaddps %%ymm10, %%ymm11, %%ymm3\n"
1134           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
1135           "vsubps %%ymm4, %%ymm8, %%ymm9\n"
1136           "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n"
1137           "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n"
1138           "vaddps %%ymm10, %%ymm11, %%ymm4\n"
1139           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
1140           "vsubps %%ymm5, %%ymm8, %%ymm9\n"
1141           "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n"
1142           "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n"
1143           "vaddps %%ymm10, %%ymm11, %%ymm5\n"
1144           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
1145           "vsubps %%ymm6, %%ymm8, %%ymm9\n"
1146           "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n"
1147           "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n"
1148           "vaddps %%ymm10, %%ymm11, %%ymm6\n"
1149           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
1150           "vsubps %%ymm7, %%ymm8, %%ymm9\n"
1151           "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n"
1152           "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n"
1153           "vaddps %%ymm10, %%ymm11, %%ymm7\n"
1154           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
1155           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
1156           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
1157           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
1158           "vaddps %%ymm5, %%ymm4, %%ymm12\n"
1159           "vsubps %%ymm5, %%ymm4, %%ymm13\n"
1160           "vaddps %%ymm7, %%ymm6, %%ymm14\n"
1161           "vsubps %%ymm7, %%ymm6, %%ymm15\n"
1162           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
1163           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
1164           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
1165           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
1166           "vaddps %%ymm14, %%ymm12, %%ymm4\n"
1167           "vsubps %%ymm14, %%ymm12, %%ymm6\n"
1168           "vaddps %%ymm15, %%ymm13, %%ymm5\n"
1169           "vsubps %%ymm15, %%ymm13, %%ymm7\n"
1170           "vaddps %%ymm4, %%ymm0, %%ymm8\n"
1171           "vsubps %%ymm4, %%ymm0, %%ymm12\n"
1172           "vaddps %%ymm5, %%ymm1, %%ymm9\n"
1173           "vsubps %%ymm5, %%ymm1, %%ymm13\n"
1174           "vaddps %%ymm6, %%ymm2, %%ymm10\n"
1175           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
1176           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
1177           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
1178           "vmovups %%ymm8, (%0)\n"
1179           "vmovups %%ymm9, (%1)\n"
1180           "vmovups %%ymm10, (%2)\n"
1181           "vmovups %%ymm11, (%3)\n"
1182           "vmovups %%ymm12, (%4)\n"
1183           "vmovups %%ymm13, (%5)\n"
1184           "vmovups %%ymm14, (%6)\n"
1185           "vmovups %%ymm15, (%7)\n"
1186           :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
1187         );
1188       }
1189     }
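    /* Second pass: each 64-float block above is now fully transformed;
       combine eight such blocks at a time, covering strides 64, 128 and
       256 within every 512-float group. */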
1190     for (int j = 0; j < 1024; j += 512) {
1191       for (int k = 0; k < 64; k += 8) {
1192         __asm__ volatile (
1193           "vmovups (%0), %%ymm0\n"
1194           "vmovups (%1), %%ymm1\n"
1195           "vmovups (%2), %%ymm2\n"
1196           "vmovups (%3), %%ymm3\n"
1197           "vmovups (%4), %%ymm4\n"
1198           "vmovups (%5), %%ymm5\n"
1199           "vmovups (%6), %%ymm6\n"
1200           "vmovups (%7), %%ymm7\n"
1201           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
1202           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
1203           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
1204           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
1205           "vaddps %%ymm5, %%ymm4, %%ymm12\n"
1206           "vsubps %%ymm5, %%ymm4, %%ymm13\n"
1207           "vaddps %%ymm7, %%ymm6, %%ymm14\n"
1208           "vsubps %%ymm7, %%ymm6, %%ymm15\n"
1209           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
1210           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
1211           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
1212           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
1213           "vaddps %%ymm14, %%ymm12, %%ymm4\n"
1214           "vsubps %%ymm14, %%ymm12, %%ymm6\n"
1215           "vaddps %%ymm15, %%ymm13, %%ymm5\n"
1216           "vsubps %%ymm15, %%ymm13, %%ymm7\n"
1217           "vaddps %%ymm4, %%ymm0, %%ymm8\n"
1218           "vsubps %%ymm4, %%ymm0, %%ymm12\n"
1219           "vaddps %%ymm5, %%ymm1, %%ymm9\n"
1220           "vsubps %%ymm5, %%ymm1, %%ymm13\n"
1221           "vaddps %%ymm6, %%ymm2, %%ymm10\n"
1222           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
1223           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
1224           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
1225           "vmovups %%ymm8, (%0)\n"
1226           "vmovups %%ymm9, (%1)\n"
1227           "vmovups %%ymm10, (%2)\n"
1228           "vmovups %%ymm11, (%3)\n"
1229           "vmovups %%ymm12, (%4)\n"
1230           "vmovups %%ymm13, (%5)\n"
1231           "vmovups %%ymm14, (%6)\n"
1232           "vmovups %%ymm15, (%7)\n"
1233           :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
1234         );
1235       }
1236     }
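    /* Final pass: one stride-512 radix-2 butterfly completes the
       1024-point transform. */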
1237     for (int j = 0; j < 1024; j += 1024) {
1238       for (int k = 0; k < 512; k += 8) {
1239         __asm__ volatile (
1240           "vmovups (%0), %%ymm0\n"
1241           "vmovups (%1), %%ymm1\n"
1242           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
1243           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
1244           "vmovups %%ymm8, (%0)\n"
1245           "vmovups %%ymm9, (%1)\n"
1246           :: "r"(buf + j + k + 0), "r"(buf + j + k + 512) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
1247         );
1248       }
1249     }
1250     return;
1251   }
1252 }
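/* helper_float_10: in-place 1024-point transform. The "recursive" kernel
   above never actually recurses at this size; the wrapper just invokes it
   at its terminal depth. */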
1253 void helper_float_10(float *buf);
1254 void helper_float_10(float *buf) {
1255   helper_float_10_recursive(buf, 10);
1256 }
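/* Expository sketch (not part of the generated code): every helper in this
   file computes the same thing as the plain scalar loop below, the in-place
   unnormalized Walsh-Hadamard transform of n = 1 << logn floats. Each pass
   pairs elements at distance h and replaces them with their sum and
   difference; the AVX kernels fuse the h = 1, 2 and 4 passes into register
   shuffles, run the h = 8..32 passes as ladders over eight registers, and
   split the largest sizes recursively. The function name is illustrative
   only. */
static inline void fht_float_reference(float *buf, int logn) {
  int n = 1 << logn;
  for (int h = 1; h < n; h <<= 1) {   /* butterfly stride doubles each pass */
    for (int j = 0; j < n; j += h << 1) {
      for (int k = j; k < j + h; ++k) {
        float u = buf[k];             /* same update as helper_float_1/2 */
        float v = buf[k + h];
        buf[k] = u + v;
        buf[k + h] = u - v;
      }
    }
  }
}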
1257 void helper_float_11_recursive(float *buf, int depth);
1258 void helper_float_11_recursive(float *buf, int depth) {
1259   if (depth == 11) {
1260     for (int j = 0; j < 2048; j += 64) {
1261       for (int k = 0; k < 8; k += 8) {
1262         __asm__ volatile (
1263           "vmovups (%0), %%ymm0\n"
1264           "vmovups (%1), %%ymm1\n"
1265           "vmovups (%2), %%ymm2\n"
1266           "vmovups (%3), %%ymm3\n"
1267           "vmovups (%4), %%ymm4\n"
1268           "vmovups (%5), %%ymm5\n"
1269           "vmovups (%6), %%ymm6\n"
1270           "vmovups (%7), %%ymm7\n"
1271           "vpermilps $160, %%ymm0, %%ymm8\n"
1272           "vpermilps $245, %%ymm0, %%ymm9\n"
1273           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1274           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1275           "vaddsubps %%ymm11, %%ymm8, %%ymm0\n"
1276           "vpermilps $160, %%ymm1, %%ymm8\n"
1277           "vpermilps $245, %%ymm1, %%ymm9\n"
1278           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1279           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1280           "vaddsubps %%ymm11, %%ymm8, %%ymm1\n"
1281           "vpermilps $160, %%ymm2, %%ymm8\n"
1282           "vpermilps $245, %%ymm2, %%ymm9\n"
1283           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1284           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1285           "vaddsubps %%ymm11, %%ymm8, %%ymm2\n"
1286           "vpermilps $160, %%ymm3, %%ymm8\n"
1287           "vpermilps $245, %%ymm3, %%ymm9\n"
1288           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1289           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1290           "vaddsubps %%ymm11, %%ymm8, %%ymm3\n"
1291           "vpermilps $160, %%ymm4, %%ymm8\n"
1292           "vpermilps $245, %%ymm4, %%ymm9\n"
1293           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1294           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1295           "vaddsubps %%ymm11, %%ymm8, %%ymm4\n"
1296           "vpermilps $160, %%ymm5, %%ymm8\n"
1297           "vpermilps $245, %%ymm5, %%ymm9\n"
1298           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1299           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1300           "vaddsubps %%ymm11, %%ymm8, %%ymm5\n"
1301           "vpermilps $160, %%ymm6, %%ymm8\n"
1302           "vpermilps $245, %%ymm6, %%ymm9\n"
1303           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1304           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1305           "vaddsubps %%ymm11, %%ymm8, %%ymm6\n"
1306           "vpermilps $160, %%ymm7, %%ymm8\n"
1307           "vpermilps $245, %%ymm7, %%ymm9\n"
1308           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1309           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1310           "vaddsubps %%ymm11, %%ymm8, %%ymm7\n"
1311           "vpermilps $68, %%ymm0, %%ymm8\n"
1312           "vpermilps $238, %%ymm0, %%ymm9\n"
1313           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1314           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1315           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
1316           "vaddps %%ymm8, %%ymm12, %%ymm0\n"
1317           "vpermilps $68, %%ymm1, %%ymm8\n"
1318           "vpermilps $238, %%ymm1, %%ymm9\n"
1319           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1320           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1321           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
1322           "vaddps %%ymm8, %%ymm12, %%ymm1\n"
1323           "vpermilps $68, %%ymm2, %%ymm8\n"
1324           "vpermilps $238, %%ymm2, %%ymm9\n"
1325           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1326           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1327           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
1328           "vaddps %%ymm8, %%ymm12, %%ymm2\n"
1329           "vpermilps $68, %%ymm3, %%ymm8\n"
1330           "vpermilps $238, %%ymm3, %%ymm9\n"
1331           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1332           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1333           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
1334           "vaddps %%ymm8, %%ymm12, %%ymm3\n"
1335           "vpermilps $68, %%ymm4, %%ymm8\n"
1336           "vpermilps $238, %%ymm4, %%ymm9\n"
1337           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1338           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1339           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
1340           "vaddps %%ymm8, %%ymm12, %%ymm4\n"
1341           "vpermilps $68, %%ymm5, %%ymm8\n"
1342           "vpermilps $238, %%ymm5, %%ymm9\n"
1343           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1344           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1345           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
1346           "vaddps %%ymm8, %%ymm12, %%ymm5\n"
1347           "vpermilps $68, %%ymm6, %%ymm8\n"
1348           "vpermilps $238, %%ymm6, %%ymm9\n"
1349           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1350           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1351           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
1352           "vaddps %%ymm8, %%ymm12, %%ymm6\n"
1353           "vpermilps $68, %%ymm7, %%ymm8\n"
1354           "vpermilps $238, %%ymm7, %%ymm9\n"
1355           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1356           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1357           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
1358           "vaddps %%ymm8, %%ymm12, %%ymm7\n"
1359           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
1360           "vsubps %%ymm0, %%ymm8, %%ymm9\n"
1361           "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n"
1362           "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n"
1363           "vaddps %%ymm10, %%ymm11, %%ymm0\n"
1364           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
1365           "vsubps %%ymm1, %%ymm8, %%ymm9\n"
1366           "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n"
1367           "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n"
1368           "vaddps %%ymm10, %%ymm11, %%ymm1\n"
1369           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
1370           "vsubps %%ymm2, %%ymm8, %%ymm9\n"
1371           "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n"
1372           "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n"
1373           "vaddps %%ymm10, %%ymm11, %%ymm2\n"
1374           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
1375           "vsubps %%ymm3, %%ymm8, %%ymm9\n"
1376           "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n"
1377           "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n"
1378           "vaddps %%ymm10, %%ymm11, %%ymm3\n"
1379           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
1380           "vsubps %%ymm4, %%ymm8, %%ymm9\n"
1381           "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n"
1382           "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n"
1383           "vaddps %%ymm10, %%ymm11, %%ymm4\n"
1384           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
1385           "vsubps %%ymm5, %%ymm8, %%ymm9\n"
1386           "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n"
1387           "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n"
1388           "vaddps %%ymm10, %%ymm11, %%ymm5\n"
1389           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
1390           "vsubps %%ymm6, %%ymm8, %%ymm9\n"
1391           "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n"
1392           "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n"
1393           "vaddps %%ymm10, %%ymm11, %%ymm6\n"
1394           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
1395           "vsubps %%ymm7, %%ymm8, %%ymm9\n"
1396           "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n"
1397           "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n"
1398           "vaddps %%ymm10, %%ymm11, %%ymm7\n"
1399           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
1400           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
1401           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
1402           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
1403           "vaddps %%ymm5, %%ymm4, %%ymm12\n"
1404           "vsubps %%ymm5, %%ymm4, %%ymm13\n"
1405           "vaddps %%ymm7, %%ymm6, %%ymm14\n"
1406           "vsubps %%ymm7, %%ymm6, %%ymm15\n"
1407           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
1408           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
1409           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
1410           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
1411           "vaddps %%ymm14, %%ymm12, %%ymm4\n"
1412           "vsubps %%ymm14, %%ymm12, %%ymm6\n"
1413           "vaddps %%ymm15, %%ymm13, %%ymm5\n"
1414           "vsubps %%ymm15, %%ymm13, %%ymm7\n"
1415           "vaddps %%ymm4, %%ymm0, %%ymm8\n"
1416           "vsubps %%ymm4, %%ymm0, %%ymm12\n"
1417           "vaddps %%ymm5, %%ymm1, %%ymm9\n"
1418           "vsubps %%ymm5, %%ymm1, %%ymm13\n"
1419           "vaddps %%ymm6, %%ymm2, %%ymm10\n"
1420           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
1421           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
1422           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
1423           "vmovups %%ymm8, (%0)\n"
1424           "vmovups %%ymm9, (%1)\n"
1425           "vmovups %%ymm10, (%2)\n"
1426           "vmovups %%ymm11, (%3)\n"
1427           "vmovups %%ymm12, (%4)\n"
1428           "vmovups %%ymm13, (%5)\n"
1429           "vmovups %%ymm14, (%6)\n"
1430           "vmovups %%ymm15, (%7)\n"
1431           :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
1432         );
1433       }
1434     }
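    /* Same three-pass pattern as the 1024-point kernel, scaled to 2048
       floats: the stride 64-256 combine comes next, then a final radix-4
       pass over strides 512 and 1024. */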
1435     for (int j = 0; j < 2048; j += 512) {
1436       for (int k = 0; k < 64; k += 8) {
1437         __asm__ volatile (
1438           "vmovups (%0), %%ymm0\n"
1439           "vmovups (%1), %%ymm1\n"
1440           "vmovups (%2), %%ymm2\n"
1441           "vmovups (%3), %%ymm3\n"
1442           "vmovups (%4), %%ymm4\n"
1443           "vmovups (%5), %%ymm5\n"
1444           "vmovups (%6), %%ymm6\n"
1445           "vmovups (%7), %%ymm7\n"
1446           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
1447           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
1448           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
1449           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
1450           "vaddps %%ymm5, %%ymm4, %%ymm12\n"
1451           "vsubps %%ymm5, %%ymm4, %%ymm13\n"
1452           "vaddps %%ymm7, %%ymm6, %%ymm14\n"
1453           "vsubps %%ymm7, %%ymm6, %%ymm15\n"
1454           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
1455           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
1456           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
1457           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
1458           "vaddps %%ymm14, %%ymm12, %%ymm4\n"
1459           "vsubps %%ymm14, %%ymm12, %%ymm6\n"
1460           "vaddps %%ymm15, %%ymm13, %%ymm5\n"
1461           "vsubps %%ymm15, %%ymm13, %%ymm7\n"
1462           "vaddps %%ymm4, %%ymm0, %%ymm8\n"
1463           "vsubps %%ymm4, %%ymm0, %%ymm12\n"
1464           "vaddps %%ymm5, %%ymm1, %%ymm9\n"
1465           "vsubps %%ymm5, %%ymm1, %%ymm13\n"
1466           "vaddps %%ymm6, %%ymm2, %%ymm10\n"
1467           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
1468           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
1469           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
1470           "vmovups %%ymm8, (%0)\n"
1471           "vmovups %%ymm9, (%1)\n"
1472           "vmovups %%ymm10, (%2)\n"
1473           "vmovups %%ymm11, (%3)\n"
1474           "vmovups %%ymm12, (%4)\n"
1475           "vmovups %%ymm13, (%5)\n"
1476           "vmovups %%ymm14, (%6)\n"
1477           "vmovups %%ymm15, (%7)\n"
1478           :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
1479         );
1480       }
1481     }
1482     for (int j = 0; j < 2048; j += 2048) {
1483       for (int k = 0; k < 512; k += 8) {
1484         __asm__ volatile (
1485           "vmovups (%0), %%ymm0\n"
1486           "vmovups (%1), %%ymm1\n"
1487           "vmovups (%2), %%ymm2\n"
1488           "vmovups (%3), %%ymm3\n"
1489           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
1490           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
1491           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
1492           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
1493           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
1494           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
1495           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
1496           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
1497           "vmovups %%ymm0, (%0)\n"
1498           "vmovups %%ymm1, (%1)\n"
1499           "vmovups %%ymm2, (%2)\n"
1500           "vmovups %%ymm3, (%3)\n"
1501           :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
1502         );
1503       }
1504     }
1505     return;
1506   }
1507 }
1508 void helper_float_11(float *buf);
1509 void helper_float_11(float *buf) {
1510   helper_float_11_recursive(buf, 11);
1511 }
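/* helper_float_12: 4096-point transform, fully unrolled into three
   iterative passes (no recursion), ending with a radix-8 combine over
   strides 512, 1024 and 2048. */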
1512 static inline void helper_float_12(float *buf);
1513 static inline void helper_float_12(float *buf) {
1514   for (int j = 0; j < 4096; j += 64) {
1515     for (int k = 0; k < 8; k += 8) {
1516       __asm__ volatile (
1517         "vmovups (%0), %%ymm0\n"
1518         "vmovups (%1), %%ymm1\n"
1519         "vmovups (%2), %%ymm2\n"
1520         "vmovups (%3), %%ymm3\n"
1521         "vmovups (%4), %%ymm4\n"
1522         "vmovups (%5), %%ymm5\n"
1523         "vmovups (%6), %%ymm6\n"
1524         "vmovups (%7), %%ymm7\n"
1525         "vpermilps $160, %%ymm0, %%ymm8\n"
1526         "vpermilps $245, %%ymm0, %%ymm9\n"
1527         "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1528         "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1529         "vaddsubps %%ymm11, %%ymm8, %%ymm0\n"
1530         "vpermilps $160, %%ymm1, %%ymm8\n"
1531         "vpermilps $245, %%ymm1, %%ymm9\n"
1532         "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1533         "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1534         "vaddsubps %%ymm11, %%ymm8, %%ymm1\n"
1535         "vpermilps $160, %%ymm2, %%ymm8\n"
1536         "vpermilps $245, %%ymm2, %%ymm9\n"
1537         "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1538         "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1539         "vaddsubps %%ymm11, %%ymm8, %%ymm2\n"
1540         "vpermilps $160, %%ymm3, %%ymm8\n"
1541         "vpermilps $245, %%ymm3, %%ymm9\n"
1542         "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1543         "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1544         "vaddsubps %%ymm11, %%ymm8, %%ymm3\n"
1545         "vpermilps $160, %%ymm4, %%ymm8\n"
1546         "vpermilps $245, %%ymm4, %%ymm9\n"
1547         "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1548         "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1549         "vaddsubps %%ymm11, %%ymm8, %%ymm4\n"
1550         "vpermilps $160, %%ymm5, %%ymm8\n"
1551         "vpermilps $245, %%ymm5, %%ymm9\n"
1552         "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1553         "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1554         "vaddsubps %%ymm11, %%ymm8, %%ymm5\n"
1555         "vpermilps $160, %%ymm6, %%ymm8\n"
1556         "vpermilps $245, %%ymm6, %%ymm9\n"
1557         "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1558         "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1559         "vaddsubps %%ymm11, %%ymm8, %%ymm6\n"
1560         "vpermilps $160, %%ymm7, %%ymm8\n"
1561         "vpermilps $245, %%ymm7, %%ymm9\n"
1562         "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1563         "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1564         "vaddsubps %%ymm11, %%ymm8, %%ymm7\n"
1565         "vpermilps $68, %%ymm0, %%ymm8\n"
1566         "vpermilps $238, %%ymm0, %%ymm9\n"
1567         "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1568         "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1569         "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
1570         "vaddps %%ymm8, %%ymm12, %%ymm0\n"
1571         "vpermilps $68, %%ymm1, %%ymm8\n"
1572         "vpermilps $238, %%ymm1, %%ymm9\n"
1573         "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1574         "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1575         "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
1576         "vaddps %%ymm8, %%ymm12, %%ymm1\n"
1577         "vpermilps $68, %%ymm2, %%ymm8\n"
1578         "vpermilps $238, %%ymm2, %%ymm9\n"
1579         "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1580         "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1581         "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
1582         "vaddps %%ymm8, %%ymm12, %%ymm2\n"
1583         "vpermilps $68, %%ymm3, %%ymm8\n"
1584         "vpermilps $238, %%ymm3, %%ymm9\n"
1585         "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1586         "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1587         "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
1588         "vaddps %%ymm8, %%ymm12, %%ymm3\n"
1589         "vpermilps $68, %%ymm4, %%ymm8\n"
1590         "vpermilps $238, %%ymm4, %%ymm9\n"
1591         "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1592         "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1593         "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
1594         "vaddps %%ymm8, %%ymm12, %%ymm4\n"
1595         "vpermilps $68, %%ymm5, %%ymm8\n"
1596         "vpermilps $238, %%ymm5, %%ymm9\n"
1597         "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1598         "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1599         "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
1600         "vaddps %%ymm8, %%ymm12, %%ymm5\n"
1601         "vpermilps $68, %%ymm6, %%ymm8\n"
1602         "vpermilps $238, %%ymm6, %%ymm9\n"
1603         "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1604         "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1605         "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
1606         "vaddps %%ymm8, %%ymm12, %%ymm6\n"
1607         "vpermilps $68, %%ymm7, %%ymm8\n"
1608         "vpermilps $238, %%ymm7, %%ymm9\n"
1609         "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1610         "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1611         "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
1612         "vaddps %%ymm8, %%ymm12, %%ymm7\n"
1613         "vxorps %%ymm8, %%ymm8, %%ymm8\n"
1614         "vsubps %%ymm0, %%ymm8, %%ymm9\n"
1615         "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n"
1616         "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n"
1617         "vaddps %%ymm10, %%ymm11, %%ymm0\n"
1618         "vxorps %%ymm8, %%ymm8, %%ymm8\n"
1619         "vsubps %%ymm1, %%ymm8, %%ymm9\n"
1620         "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n"
1621         "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n"
1622         "vaddps %%ymm10, %%ymm11, %%ymm1\n"
1623         "vxorps %%ymm8, %%ymm8, %%ymm8\n"
1624         "vsubps %%ymm2, %%ymm8, %%ymm9\n"
1625         "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n"
1626         "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n"
1627         "vaddps %%ymm10, %%ymm11, %%ymm2\n"
1628         "vxorps %%ymm8, %%ymm8, %%ymm8\n"
1629         "vsubps %%ymm3, %%ymm8, %%ymm9\n"
1630         "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n"
1631         "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n"
1632         "vaddps %%ymm10, %%ymm11, %%ymm3\n"
1633         "vxorps %%ymm8, %%ymm8, %%ymm8\n"
1634         "vsubps %%ymm4, %%ymm8, %%ymm9\n"
1635         "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n"
1636         "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n"
1637         "vaddps %%ymm10, %%ymm11, %%ymm4\n"
1638         "vxorps %%ymm8, %%ymm8, %%ymm8\n"
1639         "vsubps %%ymm5, %%ymm8, %%ymm9\n"
1640         "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n"
1641         "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n"
1642         "vaddps %%ymm10, %%ymm11, %%ymm5\n"
1643         "vxorps %%ymm8, %%ymm8, %%ymm8\n"
1644         "vsubps %%ymm6, %%ymm8, %%ymm9\n"
1645         "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n"
1646         "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n"
1647         "vaddps %%ymm10, %%ymm11, %%ymm6\n"
1648         "vxorps %%ymm8, %%ymm8, %%ymm8\n"
1649         "vsubps %%ymm7, %%ymm8, %%ymm9\n"
1650         "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n"
1651         "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n"
1652         "vaddps %%ymm10, %%ymm11, %%ymm7\n"
1653         "vaddps %%ymm1, %%ymm0, %%ymm8\n"
1654         "vsubps %%ymm1, %%ymm0, %%ymm9\n"
1655         "vaddps %%ymm3, %%ymm2, %%ymm10\n"
1656         "vsubps %%ymm3, %%ymm2, %%ymm11\n"
1657         "vaddps %%ymm5, %%ymm4, %%ymm12\n"
1658         "vsubps %%ymm5, %%ymm4, %%ymm13\n"
1659         "vaddps %%ymm7, %%ymm6, %%ymm14\n"
1660         "vsubps %%ymm7, %%ymm6, %%ymm15\n"
1661         "vaddps %%ymm10, %%ymm8, %%ymm0\n"
1662         "vsubps %%ymm10, %%ymm8, %%ymm2\n"
1663         "vaddps %%ymm11, %%ymm9, %%ymm1\n"
1664         "vsubps %%ymm11, %%ymm9, %%ymm3\n"
1665         "vaddps %%ymm14, %%ymm12, %%ymm4\n"
1666         "vsubps %%ymm14, %%ymm12, %%ymm6\n"
1667         "vaddps %%ymm15, %%ymm13, %%ymm5\n"
1668         "vsubps %%ymm15, %%ymm13, %%ymm7\n"
1669         "vaddps %%ymm4, %%ymm0, %%ymm8\n"
1670         "vsubps %%ymm4, %%ymm0, %%ymm12\n"
1671         "vaddps %%ymm5, %%ymm1, %%ymm9\n"
1672         "vsubps %%ymm5, %%ymm1, %%ymm13\n"
1673         "vaddps %%ymm6, %%ymm2, %%ymm10\n"
1674         "vsubps %%ymm6, %%ymm2, %%ymm14\n"
1675         "vaddps %%ymm7, %%ymm3, %%ymm11\n"
1676         "vsubps %%ymm7, %%ymm3, %%ymm15\n"
1677         "vmovups %%ymm8, (%0)\n"
1678         "vmovups %%ymm9, (%1)\n"
1679         "vmovups %%ymm10, (%2)\n"
1680         "vmovups %%ymm11, (%3)\n"
1681         "vmovups %%ymm12, (%4)\n"
1682         "vmovups %%ymm13, (%5)\n"
1683         "vmovups %%ymm14, (%6)\n"
1684         "vmovups %%ymm15, (%7)\n"
1685         :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
1686       );
1687     }
1688   }
1689   for (int j = 0; j < 4096; j += 512) {
1690     for (int k = 0; k < 64; k += 8) {
1691       __asm__ volatile (
1692         "vmovups (%0), %%ymm0\n"
1693         "vmovups (%1), %%ymm1\n"
1694         "vmovups (%2), %%ymm2\n"
1695         "vmovups (%3), %%ymm3\n"
1696         "vmovups (%4), %%ymm4\n"
1697         "vmovups (%5), %%ymm5\n"
1698         "vmovups (%6), %%ymm6\n"
1699         "vmovups (%7), %%ymm7\n"
1700         "vaddps %%ymm1, %%ymm0, %%ymm8\n"
1701         "vsubps %%ymm1, %%ymm0, %%ymm9\n"
1702         "vaddps %%ymm3, %%ymm2, %%ymm10\n"
1703         "vsubps %%ymm3, %%ymm2, %%ymm11\n"
1704         "vaddps %%ymm5, %%ymm4, %%ymm12\n"
1705         "vsubps %%ymm5, %%ymm4, %%ymm13\n"
1706         "vaddps %%ymm7, %%ymm6, %%ymm14\n"
1707         "vsubps %%ymm7, %%ymm6, %%ymm15\n"
1708         "vaddps %%ymm10, %%ymm8, %%ymm0\n"
1709         "vsubps %%ymm10, %%ymm8, %%ymm2\n"
1710         "vaddps %%ymm11, %%ymm9, %%ymm1\n"
1711         "vsubps %%ymm11, %%ymm9, %%ymm3\n"
1712         "vaddps %%ymm14, %%ymm12, %%ymm4\n"
1713         "vsubps %%ymm14, %%ymm12, %%ymm6\n"
1714         "vaddps %%ymm15, %%ymm13, %%ymm5\n"
1715         "vsubps %%ymm15, %%ymm13, %%ymm7\n"
1716         "vaddps %%ymm4, %%ymm0, %%ymm8\n"
1717         "vsubps %%ymm4, %%ymm0, %%ymm12\n"
1718         "vaddps %%ymm5, %%ymm1, %%ymm9\n"
1719         "vsubps %%ymm5, %%ymm1, %%ymm13\n"
1720         "vaddps %%ymm6, %%ymm2, %%ymm10\n"
1721         "vsubps %%ymm6, %%ymm2, %%ymm14\n"
1722         "vaddps %%ymm7, %%ymm3, %%ymm11\n"
1723         "vsubps %%ymm7, %%ymm3, %%ymm15\n"
1724         "vmovups %%ymm8, (%0)\n"
1725         "vmovups %%ymm9, (%1)\n"
1726         "vmovups %%ymm10, (%2)\n"
1727         "vmovups %%ymm11, (%3)\n"
1728         "vmovups %%ymm12, (%4)\n"
1729         "vmovups %%ymm13, (%5)\n"
1730         "vmovups %%ymm14, (%6)\n"
1731         "vmovups %%ymm15, (%7)\n"
1732         :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
1733       );
1734     }
1735   }
1736   for (int j = 0; j < 4096; j += 4096) {
1737     for (int k = 0; k < 512; k += 8) {
1738       __asm__ volatile (
1739         "vmovups (%0), %%ymm0\n"
1740         "vmovups (%1), %%ymm1\n"
1741         "vmovups (%2), %%ymm2\n"
1742         "vmovups (%3), %%ymm3\n"
1743         "vmovups (%4), %%ymm4\n"
1744         "vmovups (%5), %%ymm5\n"
1745         "vmovups (%6), %%ymm6\n"
1746         "vmovups (%7), %%ymm7\n"
1747         "vaddps %%ymm1, %%ymm0, %%ymm8\n"
1748         "vsubps %%ymm1, %%ymm0, %%ymm9\n"
1749         "vaddps %%ymm3, %%ymm2, %%ymm10\n"
1750         "vsubps %%ymm3, %%ymm2, %%ymm11\n"
1751         "vaddps %%ymm5, %%ymm4, %%ymm12\n"
1752         "vsubps %%ymm5, %%ymm4, %%ymm13\n"
1753         "vaddps %%ymm7, %%ymm6, %%ymm14\n"
1754         "vsubps %%ymm7, %%ymm6, %%ymm15\n"
1755         "vaddps %%ymm10, %%ymm8, %%ymm0\n"
1756         "vsubps %%ymm10, %%ymm8, %%ymm2\n"
1757         "vaddps %%ymm11, %%ymm9, %%ymm1\n"
1758         "vsubps %%ymm11, %%ymm9, %%ymm3\n"
1759         "vaddps %%ymm14, %%ymm12, %%ymm4\n"
1760         "vsubps %%ymm14, %%ymm12, %%ymm6\n"
1761         "vaddps %%ymm15, %%ymm13, %%ymm5\n"
1762         "vsubps %%ymm15, %%ymm13, %%ymm7\n"
1763         "vaddps %%ymm4, %%ymm0, %%ymm8\n"
1764         "vsubps %%ymm4, %%ymm0, %%ymm12\n"
1765         "vaddps %%ymm5, %%ymm1, %%ymm9\n"
1766         "vsubps %%ymm5, %%ymm1, %%ymm13\n"
1767         "vaddps %%ymm6, %%ymm2, %%ymm10\n"
1768         "vsubps %%ymm6, %%ymm2, %%ymm14\n"
1769         "vaddps %%ymm7, %%ymm3, %%ymm11\n"
1770         "vsubps %%ymm7, %%ymm3, %%ymm15\n"
1771         "vmovups %%ymm8, (%0)\n"
1772         "vmovups %%ymm9, (%1)\n"
1773         "vmovups %%ymm10, (%2)\n"
1774         "vmovups %%ymm11, (%3)\n"
1775         "vmovups %%ymm12, (%4)\n"
1776         "vmovups %%ymm13, (%5)\n"
1777         "vmovups %%ymm14, (%6)\n"
1778         "vmovups %%ymm15, (%7)\n"
1779         :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
1780       );
1781     }
1782   }
1783 }
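/* From 2^13 upward the generator switches to genuine recursion, presumably
   to keep each sub-transform working on a cache-resident chunk: four
   2048-point sub-transforms (the depth == 11 base case below) followed by
   a single stride-2048 radix-4 combine pass. */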
1784 void helper_float_13_recursive(float *buf, int depth);
1785 void helper_float_13_recursive(float *buf, int depth) {
1786   if (depth == 11) {
1787     for (int j = 0; j < 2048; j += 64) {
1788       for (int k = 0; k < 8; k += 8) {
1789         __asm__ volatile (
1790           "vmovups (%0), %%ymm0\n"
1791           "vmovups (%1), %%ymm1\n"
1792           "vmovups (%2), %%ymm2\n"
1793           "vmovups (%3), %%ymm3\n"
1794           "vmovups (%4), %%ymm4\n"
1795           "vmovups (%5), %%ymm5\n"
1796           "vmovups (%6), %%ymm6\n"
1797           "vmovups (%7), %%ymm7\n"
1798           "vpermilps $160, %%ymm0, %%ymm8\n"
1799           "vpermilps $245, %%ymm0, %%ymm9\n"
1800           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1801           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1802           "vaddsubps %%ymm11, %%ymm8, %%ymm0\n"
1803           "vpermilps $160, %%ymm1, %%ymm8\n"
1804           "vpermilps $245, %%ymm1, %%ymm9\n"
1805           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1806           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1807           "vaddsubps %%ymm11, %%ymm8, %%ymm1\n"
1808           "vpermilps $160, %%ymm2, %%ymm8\n"
1809           "vpermilps $245, %%ymm2, %%ymm9\n"
1810           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1811           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1812           "vaddsubps %%ymm11, %%ymm8, %%ymm2\n"
1813           "vpermilps $160, %%ymm3, %%ymm8\n"
1814           "vpermilps $245, %%ymm3, %%ymm9\n"
1815           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1816           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1817           "vaddsubps %%ymm11, %%ymm8, %%ymm3\n"
1818           "vpermilps $160, %%ymm4, %%ymm8\n"
1819           "vpermilps $245, %%ymm4, %%ymm9\n"
1820           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1821           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1822           "vaddsubps %%ymm11, %%ymm8, %%ymm4\n"
1823           "vpermilps $160, %%ymm5, %%ymm8\n"
1824           "vpermilps $245, %%ymm5, %%ymm9\n"
1825           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1826           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1827           "vaddsubps %%ymm11, %%ymm8, %%ymm5\n"
1828           "vpermilps $160, %%ymm6, %%ymm8\n"
1829           "vpermilps $245, %%ymm6, %%ymm9\n"
1830           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1831           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1832           "vaddsubps %%ymm11, %%ymm8, %%ymm6\n"
1833           "vpermilps $160, %%ymm7, %%ymm8\n"
1834           "vpermilps $245, %%ymm7, %%ymm9\n"
1835           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1836           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1837           "vaddsubps %%ymm11, %%ymm8, %%ymm7\n"
1838           "vpermilps $68, %%ymm0, %%ymm8\n"
1839           "vpermilps $238, %%ymm0, %%ymm9\n"
1840           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1841           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1842           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
1843           "vaddps %%ymm8, %%ymm12, %%ymm0\n"
1844           "vpermilps $68, %%ymm1, %%ymm8\n"
1845           "vpermilps $238, %%ymm1, %%ymm9\n"
1846           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1847           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1848           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
1849           "vaddps %%ymm8, %%ymm12, %%ymm1\n"
1850           "vpermilps $68, %%ymm2, %%ymm8\n"
1851           "vpermilps $238, %%ymm2, %%ymm9\n"
1852           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1853           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1854           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
1855           "vaddps %%ymm8, %%ymm12, %%ymm2\n"
1856           "vpermilps $68, %%ymm3, %%ymm8\n"
1857           "vpermilps $238, %%ymm3, %%ymm9\n"
1858           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1859           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1860           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
1861           "vaddps %%ymm8, %%ymm12, %%ymm3\n"
1862           "vpermilps $68, %%ymm4, %%ymm8\n"
1863           "vpermilps $238, %%ymm4, %%ymm9\n"
1864           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1865           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1866           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
1867           "vaddps %%ymm8, %%ymm12, %%ymm4\n"
1868           "vpermilps $68, %%ymm5, %%ymm8\n"
1869           "vpermilps $238, %%ymm5, %%ymm9\n"
1870           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1871           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1872           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
1873           "vaddps %%ymm8, %%ymm12, %%ymm5\n"
1874           "vpermilps $68, %%ymm6, %%ymm8\n"
1875           "vpermilps $238, %%ymm6, %%ymm9\n"
1876           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1877           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1878           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
1879           "vaddps %%ymm8, %%ymm12, %%ymm6\n"
1880           "vpermilps $68, %%ymm7, %%ymm8\n"
1881           "vpermilps $238, %%ymm7, %%ymm9\n"
1882           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1883           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1884           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
1885           "vaddps %%ymm8, %%ymm12, %%ymm7\n"
1886           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
1887           "vsubps %%ymm0, %%ymm8, %%ymm9\n"
1888           "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n"
1889           "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n"
1890           "vaddps %%ymm10, %%ymm11, %%ymm0\n"
1891           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
1892           "vsubps %%ymm1, %%ymm8, %%ymm9\n"
1893           "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n"
1894           "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n"
1895           "vaddps %%ymm10, %%ymm11, %%ymm1\n"
1896           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
1897           "vsubps %%ymm2, %%ymm8, %%ymm9\n"
1898           "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n"
1899           "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n"
1900           "vaddps %%ymm10, %%ymm11, %%ymm2\n"
1901           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
1902           "vsubps %%ymm3, %%ymm8, %%ymm9\n"
1903           "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n"
1904           "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n"
1905           "vaddps %%ymm10, %%ymm11, %%ymm3\n"
1906           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
1907           "vsubps %%ymm4, %%ymm8, %%ymm9\n"
1908           "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n"
1909           "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n"
1910           "vaddps %%ymm10, %%ymm11, %%ymm4\n"
1911           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
1912           "vsubps %%ymm5, %%ymm8, %%ymm9\n"
1913           "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n"
1914           "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n"
1915           "vaddps %%ymm10, %%ymm11, %%ymm5\n"
1916           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
1917           "vsubps %%ymm6, %%ymm8, %%ymm9\n"
1918           "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n"
1919           "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n"
1920           "vaddps %%ymm10, %%ymm11, %%ymm6\n"
1921           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
1922           "vsubps %%ymm7, %%ymm8, %%ymm9\n"
1923           "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n"
1924           "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n"
1925           "vaddps %%ymm10, %%ymm11, %%ymm7\n"
1926           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
1927           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
1928           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
1929           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
1930           "vaddps %%ymm5, %%ymm4, %%ymm12\n"
1931           "vsubps %%ymm5, %%ymm4, %%ymm13\n"
1932           "vaddps %%ymm7, %%ymm6, %%ymm14\n"
1933           "vsubps %%ymm7, %%ymm6, %%ymm15\n"
1934           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
1935           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
1936           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
1937           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
1938           "vaddps %%ymm14, %%ymm12, %%ymm4\n"
1939           "vsubps %%ymm14, %%ymm12, %%ymm6\n"
1940           "vaddps %%ymm15, %%ymm13, %%ymm5\n"
1941           "vsubps %%ymm15, %%ymm13, %%ymm7\n"
1942           "vaddps %%ymm4, %%ymm0, %%ymm8\n"
1943           "vsubps %%ymm4, %%ymm0, %%ymm12\n"
1944           "vaddps %%ymm5, %%ymm1, %%ymm9\n"
1945           "vsubps %%ymm5, %%ymm1, %%ymm13\n"
1946           "vaddps %%ymm6, %%ymm2, %%ymm10\n"
1947           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
1948           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
1949           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
1950           "vmovups %%ymm8, (%0)\n"
1951           "vmovups %%ymm9, (%1)\n"
1952           "vmovups %%ymm10, (%2)\n"
1953           "vmovups %%ymm11, (%3)\n"
1954           "vmovups %%ymm12, (%4)\n"
1955           "vmovups %%ymm13, (%5)\n"
1956           "vmovups %%ymm14, (%6)\n"
1957           "vmovups %%ymm15, (%7)\n"
1958           :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
1959         );
1960       }
1961     }
1962     for (int j = 0; j < 2048; j += 512) {
1963       for (int k = 0; k < 64; k += 8) {
1964         __asm__ volatile (
1965           "vmovups (%0), %%ymm0\n"
1966           "vmovups (%1), %%ymm1\n"
1967           "vmovups (%2), %%ymm2\n"
1968           "vmovups (%3), %%ymm3\n"
1969           "vmovups (%4), %%ymm4\n"
1970           "vmovups (%5), %%ymm5\n"
1971           "vmovups (%6), %%ymm6\n"
1972           "vmovups (%7), %%ymm7\n"
1973           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
1974           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
1975           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
1976           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
1977           "vaddps %%ymm5, %%ymm4, %%ymm12\n"
1978           "vsubps %%ymm5, %%ymm4, %%ymm13\n"
1979           "vaddps %%ymm7, %%ymm6, %%ymm14\n"
1980           "vsubps %%ymm7, %%ymm6, %%ymm15\n"
1981           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
1982           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
1983           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
1984           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
1985           "vaddps %%ymm14, %%ymm12, %%ymm4\n"
1986           "vsubps %%ymm14, %%ymm12, %%ymm6\n"
1987           "vaddps %%ymm15, %%ymm13, %%ymm5\n"
1988           "vsubps %%ymm15, %%ymm13, %%ymm7\n"
1989           "vaddps %%ymm4, %%ymm0, %%ymm8\n"
1990           "vsubps %%ymm4, %%ymm0, %%ymm12\n"
1991           "vaddps %%ymm5, %%ymm1, %%ymm9\n"
1992           "vsubps %%ymm5, %%ymm1, %%ymm13\n"
1993           "vaddps %%ymm6, %%ymm2, %%ymm10\n"
1994           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
1995           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
1996           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
1997           "vmovups %%ymm8, (%0)\n"
1998           "vmovups %%ymm9, (%1)\n"
1999           "vmovups %%ymm10, (%2)\n"
2000           "vmovups %%ymm11, (%3)\n"
2001           "vmovups %%ymm12, (%4)\n"
2002           "vmovups %%ymm13, (%5)\n"
2003           "vmovups %%ymm14, (%6)\n"
2004           "vmovups %%ymm15, (%7)\n"
2005           :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
2006         );
2007       }
2008     }
2009     for (int j = 0; j < 2048; j += 2048) {
2010       for (int k = 0; k < 512; k += 8) {
2011         __asm__ volatile (
2012           "vmovups (%0), %%ymm0\n"
2013           "vmovups (%1), %%ymm1\n"
2014           "vmovups (%2), %%ymm2\n"
2015           "vmovups (%3), %%ymm3\n"
2016           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
2017           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
2018           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
2019           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
2020           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
2021           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
2022           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
2023           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
2024           "vmovups %%ymm0, (%0)\n"
2025           "vmovups %%ymm1, (%1)\n"
2026           "vmovups %%ymm2, (%2)\n"
2027           "vmovups %%ymm3, (%3)\n"
2028           :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
2029         );
2030       }
2031     }
2032     return;
2033   }
2034   if (depth == 13) {
2035     helper_float_13_recursive(buf + 0, 11);
2036     helper_float_13_recursive(buf + 2048, 11);
2037     helper_float_13_recursive(buf + 4096, 11);
2038     helper_float_13_recursive(buf + 6144, 11);
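    /* Combine the four transformed quarters: radix-4 butterflies at
       strides 2048 and 4096 across the full 8192-float buffer. */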
2039     for (int j = 0; j < 8192; j += 8192) {
2040       for (int k = 0; k < 2048; k += 8) {
2041         __asm__ volatile (
2042           "vmovups (%0), %%ymm0\n"
2043           "vmovups (%1), %%ymm1\n"
2044           "vmovups (%2), %%ymm2\n"
2045           "vmovups (%3), %%ymm3\n"
2046           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
2047           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
2048           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
2049           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
2050           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
2051           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
2052           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
2053           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
2054           "vmovups %%ymm0, (%0)\n"
2055           "vmovups %%ymm1, (%1)\n"
2056           "vmovups %%ymm2, (%2)\n"
2057           "vmovups %%ymm3, (%3)\n"
2058           :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
2059         );
2060       }
2061     }
2062     return;
2063   }
2064 }
2065 void helper_float_13(float *buf);
2066 void helper_float_13(float *buf) {
2067   helper_float_13_recursive(buf, 13);
2068 }
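/* helper_float_14: the same recursive scheme one level up: four 4096-point
   sub-transforms (depth == 12 base case) plus a stride-4096 radix-4
   combine. */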
2069 void helper_float_14_recursive(float *buf, int depth);
2070 void helper_float_14_recursive(float *buf, int depth) {
2071   if (depth == 12) {
2072     for (int j = 0; j < 4096; j += 64) {
2073       for (int k = 0; k < 8; k += 8) {
2074         __asm__ volatile (
2075           "vmovups (%0), %%ymm0\n"
2076           "vmovups (%1), %%ymm1\n"
2077           "vmovups (%2), %%ymm2\n"
2078           "vmovups (%3), %%ymm3\n"
2079           "vmovups (%4), %%ymm4\n"
2080           "vmovups (%5), %%ymm5\n"
2081           "vmovups (%6), %%ymm6\n"
2082           "vmovups (%7), %%ymm7\n"
2083           "vpermilps $160, %%ymm0, %%ymm8\n"
2084           "vpermilps $245, %%ymm0, %%ymm9\n"
2085           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
2086           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
2087           "vaddsubps %%ymm11, %%ymm8, %%ymm0\n"
2088           "vpermilps $160, %%ymm1, %%ymm8\n"
2089           "vpermilps $245, %%ymm1, %%ymm9\n"
2090           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
2091           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
2092           "vaddsubps %%ymm11, %%ymm8, %%ymm1\n"
2093           "vpermilps $160, %%ymm2, %%ymm8\n"
2094           "vpermilps $245, %%ymm2, %%ymm9\n"
2095           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
2096           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
2097           "vaddsubps %%ymm11, %%ymm8, %%ymm2\n"
2098           "vpermilps $160, %%ymm3, %%ymm8\n"
2099           "vpermilps $245, %%ymm3, %%ymm9\n"
2100           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
2101           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
2102           "vaddsubps %%ymm11, %%ymm8, %%ymm3\n"
2103           "vpermilps $160, %%ymm4, %%ymm8\n"
2104           "vpermilps $245, %%ymm4, %%ymm9\n"
2105           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
2106           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
2107           "vaddsubps %%ymm11, %%ymm8, %%ymm4\n"
2108           "vpermilps $160, %%ymm5, %%ymm8\n"
2109           "vpermilps $245, %%ymm5, %%ymm9\n"
2110           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
2111           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
2112           "vaddsubps %%ymm11, %%ymm8, %%ymm5\n"
2113           "vpermilps $160, %%ymm6, %%ymm8\n"
2114           "vpermilps $245, %%ymm6, %%ymm9\n"
2115           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
2116           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
2117           "vaddsubps %%ymm11, %%ymm8, %%ymm6\n"
2118           "vpermilps $160, %%ymm7, %%ymm8\n"
2119           "vpermilps $245, %%ymm7, %%ymm9\n"
2120           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
2121           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
2122           "vaddsubps %%ymm11, %%ymm8, %%ymm7\n"
2123           "vpermilps $68, %%ymm0, %%ymm8\n"
2124           "vpermilps $238, %%ymm0, %%ymm9\n"
2125           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
2126           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
2127           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
2128           "vaddps %%ymm8, %%ymm12, %%ymm0\n"
2129           "vpermilps $68, %%ymm1, %%ymm8\n"
2130           "vpermilps $238, %%ymm1, %%ymm9\n"
2131           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
2132           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
2133           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
2134           "vaddps %%ymm8, %%ymm12, %%ymm1\n"
2135           "vpermilps $68, %%ymm2, %%ymm8\n"
2136           "vpermilps $238, %%ymm2, %%ymm9\n"
2137           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
2138           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
2139           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
2140           "vaddps %%ymm8, %%ymm12, %%ymm2\n"
2141           "vpermilps $68, %%ymm3, %%ymm8\n"
2142           "vpermilps $238, %%ymm3, %%ymm9\n"
2143           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
2144           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
2145           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
2146           "vaddps %%ymm8, %%ymm12, %%ymm3\n"
2147           "vpermilps $68, %%ymm4, %%ymm8\n"
2148           "vpermilps $238, %%ymm4, %%ymm9\n"
2149           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
2150           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
2151           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
2152           "vaddps %%ymm8, %%ymm12, %%ymm4\n"
2153           "vpermilps $68, %%ymm5, %%ymm8\n"
2154           "vpermilps $238, %%ymm5, %%ymm9\n"
2155           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
2156           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
2157           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
2158           "vaddps %%ymm8, %%ymm12, %%ymm5\n"
2159           "vpermilps $68, %%ymm6, %%ymm8\n"
2160           "vpermilps $238, %%ymm6, %%ymm9\n"
2161           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
2162           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
2163           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
2164           "vaddps %%ymm8, %%ymm12, %%ymm6\n"
2165           "vpermilps $68, %%ymm7, %%ymm8\n"
2166           "vpermilps $238, %%ymm7, %%ymm9\n"
2167           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
2168           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
2169           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
2170           "vaddps %%ymm8, %%ymm12, %%ymm7\n"
2171           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
2172           "vsubps %%ymm0, %%ymm8, %%ymm9\n"
2173           "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n"
2174           "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n"
2175           "vaddps %%ymm10, %%ymm11, %%ymm0\n"
2176           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
2177           "vsubps %%ymm1, %%ymm8, %%ymm9\n"
2178           "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n"
2179           "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n"
2180           "vaddps %%ymm10, %%ymm11, %%ymm1\n"
2181           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
2182           "vsubps %%ymm2, %%ymm8, %%ymm9\n"
2183           "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n"
2184           "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n"
2185           "vaddps %%ymm10, %%ymm11, %%ymm2\n"
2186           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
2187           "vsubps %%ymm3, %%ymm8, %%ymm9\n"
2188           "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n"
2189           "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n"
2190           "vaddps %%ymm10, %%ymm11, %%ymm3\n"
2191           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
2192           "vsubps %%ymm4, %%ymm8, %%ymm9\n"
2193           "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n"
2194           "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n"
2195           "vaddps %%ymm10, %%ymm11, %%ymm4\n"
2196           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
2197           "vsubps %%ymm5, %%ymm8, %%ymm9\n"
2198           "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n"
2199           "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n"
2200           "vaddps %%ymm10, %%ymm11, %%ymm5\n"
2201           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
2202           "vsubps %%ymm6, %%ymm8, %%ymm9\n"
2203           "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n"
2204           "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n"
2205           "vaddps %%ymm10, %%ymm11, %%ymm6\n"
2206           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
2207           "vsubps %%ymm7, %%ymm8, %%ymm9\n"
2208           "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n"
2209           "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n"
2210           "vaddps %%ymm10, %%ymm11, %%ymm7\n"
2211           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
2212           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
2213           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
2214           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
2215           "vaddps %%ymm5, %%ymm4, %%ymm12\n"
2216           "vsubps %%ymm5, %%ymm4, %%ymm13\n"
2217           "vaddps %%ymm7, %%ymm6, %%ymm14\n"
2218           "vsubps %%ymm7, %%ymm6, %%ymm15\n"
2219           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
2220           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
2221           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
2222           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
2223           "vaddps %%ymm14, %%ymm12, %%ymm4\n"
2224           "vsubps %%ymm14, %%ymm12, %%ymm6\n"
2225           "vaddps %%ymm15, %%ymm13, %%ymm5\n"
2226           "vsubps %%ymm15, %%ymm13, %%ymm7\n"
2227           "vaddps %%ymm4, %%ymm0, %%ymm8\n"
2228           "vsubps %%ymm4, %%ymm0, %%ymm12\n"
2229           "vaddps %%ymm5, %%ymm1, %%ymm9\n"
2230           "vsubps %%ymm5, %%ymm1, %%ymm13\n"
2231           "vaddps %%ymm6, %%ymm2, %%ymm10\n"
2232           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
2233           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
2234           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
2235           "vmovups %%ymm8, (%0)\n"
2236           "vmovups %%ymm9, (%1)\n"
2237           "vmovups %%ymm10, (%2)\n"
2238           "vmovups %%ymm11, (%3)\n"
2239           "vmovups %%ymm12, (%4)\n"
2240           "vmovups %%ymm13, (%5)\n"
2241           "vmovups %%ymm14, (%6)\n"
2242           "vmovups %%ymm15, (%7)\n"
2243           :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
2244         );
2245       }
2246     }
2247     for (int j = 0; j < 4096; j += 512) {
2248       for (int k = 0; k < 64; k += 8) {
2249         __asm__ volatile (
2250           "vmovups (%0), %%ymm0\n"
2251           "vmovups (%1), %%ymm1\n"
2252           "vmovups (%2), %%ymm2\n"
2253           "vmovups (%3), %%ymm3\n"
2254           "vmovups (%4), %%ymm4\n"
2255           "vmovups (%5), %%ymm5\n"
2256           "vmovups (%6), %%ymm6\n"
2257           "vmovups (%7), %%ymm7\n"
2258           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
2259           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
2260           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
2261           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
2262           "vaddps %%ymm5, %%ymm4, %%ymm12\n"
2263           "vsubps %%ymm5, %%ymm4, %%ymm13\n"
2264           "vaddps %%ymm7, %%ymm6, %%ymm14\n"
2265           "vsubps %%ymm7, %%ymm6, %%ymm15\n"
2266           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
2267           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
2268           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
2269           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
2270           "vaddps %%ymm14, %%ymm12, %%ymm4\n"
2271           "vsubps %%ymm14, %%ymm12, %%ymm6\n"
2272           "vaddps %%ymm15, %%ymm13, %%ymm5\n"
2273           "vsubps %%ymm15, %%ymm13, %%ymm7\n"
2274           "vaddps %%ymm4, %%ymm0, %%ymm8\n"
2275           "vsubps %%ymm4, %%ymm0, %%ymm12\n"
2276           "vaddps %%ymm5, %%ymm1, %%ymm9\n"
2277           "vsubps %%ymm5, %%ymm1, %%ymm13\n"
2278           "vaddps %%ymm6, %%ymm2, %%ymm10\n"
2279           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
2280           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
2281           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
2282           "vmovups %%ymm8, (%0)\n"
2283           "vmovups %%ymm9, (%1)\n"
2284           "vmovups %%ymm10, (%2)\n"
2285           "vmovups %%ymm11, (%3)\n"
2286           "vmovups %%ymm12, (%4)\n"
2287           "vmovups %%ymm13, (%5)\n"
2288           "vmovups %%ymm14, (%6)\n"
2289           "vmovups %%ymm15, (%7)\n"
2290           :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
2291         );
2292       }
2293     }
2294     for (int j = 0; j < 4096; j += 4096) {
2295       for (int k = 0; k < 512; k += 8) {
2296         __asm__ volatile (
2297           "vmovups (%0), %%ymm0\n"
2298           "vmovups (%1), %%ymm1\n"
2299           "vmovups (%2), %%ymm2\n"
2300           "vmovups (%3), %%ymm3\n"
2301           "vmovups (%4), %%ymm4\n"
2302           "vmovups (%5), %%ymm5\n"
2303           "vmovups (%6), %%ymm6\n"
2304           "vmovups (%7), %%ymm7\n"
2305           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
2306           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
2307           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
2308           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
2309           "vaddps %%ymm5, %%ymm4, %%ymm12\n"
2310           "vsubps %%ymm5, %%ymm4, %%ymm13\n"
2311           "vaddps %%ymm7, %%ymm6, %%ymm14\n"
2312           "vsubps %%ymm7, %%ymm6, %%ymm15\n"
2313           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
2314           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
2315           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
2316           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
2317           "vaddps %%ymm14, %%ymm12, %%ymm4\n"
2318           "vsubps %%ymm14, %%ymm12, %%ymm6\n"
2319           "vaddps %%ymm15, %%ymm13, %%ymm5\n"
2320           "vsubps %%ymm15, %%ymm13, %%ymm7\n"
2321           "vaddps %%ymm4, %%ymm0, %%ymm8\n"
2322           "vsubps %%ymm4, %%ymm0, %%ymm12\n"
2323           "vaddps %%ymm5, %%ymm1, %%ymm9\n"
2324           "vsubps %%ymm5, %%ymm1, %%ymm13\n"
2325           "vaddps %%ymm6, %%ymm2, %%ymm10\n"
2326           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
2327           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
2328           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
2329           "vmovups %%ymm8, (%0)\n"
2330           "vmovups %%ymm9, (%1)\n"
2331           "vmovups %%ymm10, (%2)\n"
2332           "vmovups %%ymm11, (%3)\n"
2333           "vmovups %%ymm12, (%4)\n"
2334           "vmovups %%ymm13, (%5)\n"
2335           "vmovups %%ymm14, (%6)\n"
2336           "vmovups %%ymm15, (%7)\n"
2337           :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
2338         );
2339       }
2340     }
2341     return;
2342   }
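  /* depth == 14: split the 2^14 = 16384-float problem into four 2^12
     sub-transforms, then stitch the quarters together with one radix-4
     add/sub pass at stride 4096. The recursion presumably exists to keep
     each sub-transform's working set cache-resident. */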
2343   if (depth == 14) {
2344     helper_float_14_recursive(buf + 0, 12);
2345     helper_float_14_recursive(buf + 4096, 12);
2346     helper_float_14_recursive(buf + 8192, 12);
2347     helper_float_14_recursive(buf + 12288, 12);
2348     for (int j = 0; j < 16384; j += 16384) {
2349       for (int k = 0; k < 4096; k += 8) {
2350         __asm__ volatile (
2351           "vmovups (%0), %%ymm0\n"
2352           "vmovups (%1), %%ymm1\n"
2353           "vmovups (%2), %%ymm2\n"
2354           "vmovups (%3), %%ymm3\n"
2355           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
2356           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
2357           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
2358           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
2359           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
2360           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
2361           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
2362           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
2363           "vmovups %%ymm0, (%0)\n"
2364           "vmovups %%ymm1, (%1)\n"
2365           "vmovups %%ymm2, (%2)\n"
2366           "vmovups %%ymm3, (%3)\n"
2367           :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
2368         );
2369       }
2370     }
2371     return;
2372   }
2373 }
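/* Public entry point: a full 2^14-point Walsh-Hadamard transform over
   buf[0 .. 16383]. Only adds and subtracts are performed, so no
   normalization is applied in these helpers. */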
2374 void helper_float_14(float *buf);
2375 void helper_float_14(float *buf) {
2376   helper_float_14_recursive(buf, 14);
2377 }
2378 void helper_float_15_recursive(float *buf, int depth);
2379 void helper_float_15_recursive(float *buf, int depth) {
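  /* depth == 13 base case: a 2^13 = 8192-point transform in four passes.
     First an in-register pass applies a full 64-point Hadamard transform to
     each aligned block of 64 floats (vpermilps/vaddsubps for the size-2
     level, vpermilps/vblendps for the size-4 level, vperm2f128 across the
     two 128-bit lanes for size 8, then three add/sub levels across eight
     registers). Radix-8 passes at strides 64 and 512 and a final radix-2
     pass at stride 4096 finish the job (64 * 8 * 8 * 2 = 8192). */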
2380   if (depth == 13) {
2381     for (int j = 0; j < 8192; j += 64) {
2382       for (int k = 0; k < 8; k += 8) {
2383         __asm__ volatile (
2384           "vmovups (%0), %%ymm0\n"
2385           "vmovups (%1), %%ymm1\n"
2386           "vmovups (%2), %%ymm2\n"
2387           "vmovups (%3), %%ymm3\n"
2388           "vmovups (%4), %%ymm4\n"
2389           "vmovups (%5), %%ymm5\n"
2390           "vmovups (%6), %%ymm6\n"
2391           "vmovups (%7), %%ymm7\n"
2392           "vpermilps $160, %%ymm0, %%ymm8\n"
2393           "vpermilps $245, %%ymm0, %%ymm9\n"
2394           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
2395           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
2396           "vaddsubps %%ymm11, %%ymm8, %%ymm0\n"
2397           "vpermilps $160, %%ymm1, %%ymm8\n"
2398           "vpermilps $245, %%ymm1, %%ymm9\n"
2399           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
2400           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
2401           "vaddsubps %%ymm11, %%ymm8, %%ymm1\n"
2402           "vpermilps $160, %%ymm2, %%ymm8\n"
2403           "vpermilps $245, %%ymm2, %%ymm9\n"
2404           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
2405           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
2406           "vaddsubps %%ymm11, %%ymm8, %%ymm2\n"
2407           "vpermilps $160, %%ymm3, %%ymm8\n"
2408           "vpermilps $245, %%ymm3, %%ymm9\n"
2409           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
2410           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
2411           "vaddsubps %%ymm11, %%ymm8, %%ymm3\n"
2412           "vpermilps $160, %%ymm4, %%ymm8\n"
2413           "vpermilps $245, %%ymm4, %%ymm9\n"
2414           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
2415           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
2416           "vaddsubps %%ymm11, %%ymm8, %%ymm4\n"
2417           "vpermilps $160, %%ymm5, %%ymm8\n"
2418           "vpermilps $245, %%ymm5, %%ymm9\n"
2419           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
2420           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
2421           "vaddsubps %%ymm11, %%ymm8, %%ymm5\n"
2422           "vpermilps $160, %%ymm6, %%ymm8\n"
2423           "vpermilps $245, %%ymm6, %%ymm9\n"
2424           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
2425           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
2426           "vaddsubps %%ymm11, %%ymm8, %%ymm6\n"
2427           "vpermilps $160, %%ymm7, %%ymm8\n"
2428           "vpermilps $245, %%ymm7, %%ymm9\n"
2429           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
2430           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
2431           "vaddsubps %%ymm11, %%ymm8, %%ymm7\n"
2432           "vpermilps $68, %%ymm0, %%ymm8\n"
2433           "vpermilps $238, %%ymm0, %%ymm9\n"
2434           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
2435           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
2436           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
2437           "vaddps %%ymm8, %%ymm12, %%ymm0\n"
2438           "vpermilps $68, %%ymm1, %%ymm8\n"
2439           "vpermilps $238, %%ymm1, %%ymm9\n"
2440           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
2441           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
2442           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
2443           "vaddps %%ymm8, %%ymm12, %%ymm1\n"
2444           "vpermilps $68, %%ymm2, %%ymm8\n"
2445           "vpermilps $238, %%ymm2, %%ymm9\n"
2446           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
2447           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
2448           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
2449           "vaddps %%ymm8, %%ymm12, %%ymm2\n"
2450           "vpermilps $68, %%ymm3, %%ymm8\n"
2451           "vpermilps $238, %%ymm3, %%ymm9\n"
2452           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
2453           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
2454           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
2455           "vaddps %%ymm8, %%ymm12, %%ymm3\n"
2456           "vpermilps $68, %%ymm4, %%ymm8\n"
2457           "vpermilps $238, %%ymm4, %%ymm9\n"
2458           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
2459           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
2460           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
2461           "vaddps %%ymm8, %%ymm12, %%ymm4\n"
2462           "vpermilps $68, %%ymm5, %%ymm8\n"
2463           "vpermilps $238, %%ymm5, %%ymm9\n"
2464           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
2465           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
2466           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
2467           "vaddps %%ymm8, %%ymm12, %%ymm5\n"
2468           "vpermilps $68, %%ymm6, %%ymm8\n"
2469           "vpermilps $238, %%ymm6, %%ymm9\n"
2470           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
2471           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
2472           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
2473           "vaddps %%ymm8, %%ymm12, %%ymm6\n"
2474           "vpermilps $68, %%ymm7, %%ymm8\n"
2475           "vpermilps $238, %%ymm7, %%ymm9\n"
2476           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
2477           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
2478           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
2479           "vaddps %%ymm8, %%ymm12, %%ymm7\n"
2480           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
2481           "vsubps %%ymm0, %%ymm8, %%ymm9\n"
2482           "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n"
2483           "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n"
2484           "vaddps %%ymm10, %%ymm11, %%ymm0\n"
2485           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
2486           "vsubps %%ymm1, %%ymm8, %%ymm9\n"
2487           "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n"
2488           "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n"
2489           "vaddps %%ymm10, %%ymm11, %%ymm1\n"
2490           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
2491           "vsubps %%ymm2, %%ymm8, %%ymm9\n"
2492           "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n"
2493           "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n"
2494           "vaddps %%ymm10, %%ymm11, %%ymm2\n"
2495           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
2496           "vsubps %%ymm3, %%ymm8, %%ymm9\n"
2497           "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n"
2498           "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n"
2499           "vaddps %%ymm10, %%ymm11, %%ymm3\n"
2500           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
2501           "vsubps %%ymm4, %%ymm8, %%ymm9\n"
2502           "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n"
2503           "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n"
2504           "vaddps %%ymm10, %%ymm11, %%ymm4\n"
2505           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
2506           "vsubps %%ymm5, %%ymm8, %%ymm9\n"
2507           "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n"
2508           "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n"
2509           "vaddps %%ymm10, %%ymm11, %%ymm5\n"
2510           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
2511           "vsubps %%ymm6, %%ymm8, %%ymm9\n"
2512           "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n"
2513           "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n"
2514           "vaddps %%ymm10, %%ymm11, %%ymm6\n"
2515           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
2516           "vsubps %%ymm7, %%ymm8, %%ymm9\n"
2517           "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n"
2518           "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n"
2519           "vaddps %%ymm10, %%ymm11, %%ymm7\n"
2520           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
2521           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
2522           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
2523           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
2524           "vaddps %%ymm5, %%ymm4, %%ymm12\n"
2525           "vsubps %%ymm5, %%ymm4, %%ymm13\n"
2526           "vaddps %%ymm7, %%ymm6, %%ymm14\n"
2527           "vsubps %%ymm7, %%ymm6, %%ymm15\n"
2528           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
2529           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
2530           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
2531           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
2532           "vaddps %%ymm14, %%ymm12, %%ymm4\n"
2533           "vsubps %%ymm14, %%ymm12, %%ymm6\n"
2534           "vaddps %%ymm15, %%ymm13, %%ymm5\n"
2535           "vsubps %%ymm15, %%ymm13, %%ymm7\n"
2536           "vaddps %%ymm4, %%ymm0, %%ymm8\n"
2537           "vsubps %%ymm4, %%ymm0, %%ymm12\n"
2538           "vaddps %%ymm5, %%ymm1, %%ymm9\n"
2539           "vsubps %%ymm5, %%ymm1, %%ymm13\n"
2540           "vaddps %%ymm6, %%ymm2, %%ymm10\n"
2541           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
2542           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
2543           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
2544           "vmovups %%ymm8, (%0)\n"
2545           "vmovups %%ymm9, (%1)\n"
2546           "vmovups %%ymm10, (%2)\n"
2547           "vmovups %%ymm11, (%3)\n"
2548           "vmovups %%ymm12, (%4)\n"
2549           "vmovups %%ymm13, (%5)\n"
2550           "vmovups %%ymm14, (%6)\n"
2551           "vmovups %%ymm15, (%7)\n"
2552           :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
2553         );
2554       }
2555     }
2556     for (int j = 0; j < 8192; j += 512) {
2557       for (int k = 0; k < 64; k += 8) {
2558         __asm__ volatile (
2559           "vmovups (%0), %%ymm0\n"
2560           "vmovups (%1), %%ymm1\n"
2561           "vmovups (%2), %%ymm2\n"
2562           "vmovups (%3), %%ymm3\n"
2563           "vmovups (%4), %%ymm4\n"
2564           "vmovups (%5), %%ymm5\n"
2565           "vmovups (%6), %%ymm6\n"
2566           "vmovups (%7), %%ymm7\n"
2567           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
2568           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
2569           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
2570           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
2571           "vaddps %%ymm5, %%ymm4, %%ymm12\n"
2572           "vsubps %%ymm5, %%ymm4, %%ymm13\n"
2573           "vaddps %%ymm7, %%ymm6, %%ymm14\n"
2574           "vsubps %%ymm7, %%ymm6, %%ymm15\n"
2575           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
2576           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
2577           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
2578           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
2579           "vaddps %%ymm14, %%ymm12, %%ymm4\n"
2580           "vsubps %%ymm14, %%ymm12, %%ymm6\n"
2581           "vaddps %%ymm15, %%ymm13, %%ymm5\n"
2582           "vsubps %%ymm15, %%ymm13, %%ymm7\n"
2583           "vaddps %%ymm4, %%ymm0, %%ymm8\n"
2584           "vsubps %%ymm4, %%ymm0, %%ymm12\n"
2585           "vaddps %%ymm5, %%ymm1, %%ymm9\n"
2586           "vsubps %%ymm5, %%ymm1, %%ymm13\n"
2587           "vaddps %%ymm6, %%ymm2, %%ymm10\n"
2588           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
2589           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
2590           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
2591           "vmovups %%ymm8, (%0)\n"
2592           "vmovups %%ymm9, (%1)\n"
2593           "vmovups %%ymm10, (%2)\n"
2594           "vmovups %%ymm11, (%3)\n"
2595           "vmovups %%ymm12, (%4)\n"
2596           "vmovups %%ymm13, (%5)\n"
2597           "vmovups %%ymm14, (%6)\n"
2598           "vmovups %%ymm15, (%7)\n"
2599           :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
2600         );
2601       }
2602     }
2603     for (int j = 0; j < 8192; j += 4096) {
2604       for (int k = 0; k < 512; k += 8) {
2605         __asm__ volatile (
2606           "vmovups (%0), %%ymm0\n"
2607           "vmovups (%1), %%ymm1\n"
2608           "vmovups (%2), %%ymm2\n"
2609           "vmovups (%3), %%ymm3\n"
2610           "vmovups (%4), %%ymm4\n"
2611           "vmovups (%5), %%ymm5\n"
2612           "vmovups (%6), %%ymm6\n"
2613           "vmovups (%7), %%ymm7\n"
2614           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
2615           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
2616           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
2617           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
2618           "vaddps %%ymm5, %%ymm4, %%ymm12\n"
2619           "vsubps %%ymm5, %%ymm4, %%ymm13\n"
2620           "vaddps %%ymm7, %%ymm6, %%ymm14\n"
2621           "vsubps %%ymm7, %%ymm6, %%ymm15\n"
2622           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
2623           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
2624           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
2625           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
2626           "vaddps %%ymm14, %%ymm12, %%ymm4\n"
2627           "vsubps %%ymm14, %%ymm12, %%ymm6\n"
2628           "vaddps %%ymm15, %%ymm13, %%ymm5\n"
2629           "vsubps %%ymm15, %%ymm13, %%ymm7\n"
2630           "vaddps %%ymm4, %%ymm0, %%ymm8\n"
2631           "vsubps %%ymm4, %%ymm0, %%ymm12\n"
2632           "vaddps %%ymm5, %%ymm1, %%ymm9\n"
2633           "vsubps %%ymm5, %%ymm1, %%ymm13\n"
2634           "vaddps %%ymm6, %%ymm2, %%ymm10\n"
2635           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
2636           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
2637           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
2638           "vmovups %%ymm8, (%0)\n"
2639           "vmovups %%ymm9, (%1)\n"
2640           "vmovups %%ymm10, (%2)\n"
2641           "vmovups %%ymm11, (%3)\n"
2642           "vmovups %%ymm12, (%4)\n"
2643           "vmovups %%ymm13, (%5)\n"
2644           "vmovups %%ymm14, (%6)\n"
2645           "vmovups %%ymm15, (%7)\n"
2646           :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
2647         );
2648       }
2649     }
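    /* Final radix-2 pass at stride 4096: only two vectors per iteration are
       needed to combine the two 4096-float halves. */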
2650     for (int j = 0; j < 8192; j += 8192) {
2651       for (int k = 0; k < 4096; k += 8) {
2652         __asm__ volatile (
2653           "vmovups (%0), %%ymm0\n"
2654           "vmovups (%1), %%ymm1\n"
2655           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
2656           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
2657           "vmovups %%ymm8, (%0)\n"
2658           "vmovups %%ymm9, (%1)\n"
2659           :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
2660         );
2661       }
2662     }
2663     return;
2664   }
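  /* depth == 15: four recursive 2^13 transforms, then one radix-4 pass at
     stride 8192 combines the quarters into the 2^15 = 32768-float result. */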
2665   if (depth == 15) {
2666     helper_float_15_recursive(buf + 0, 13);
2667     helper_float_15_recursive(buf + 8192, 13);
2668     helper_float_15_recursive(buf + 16384, 13);
2669     helper_float_15_recursive(buf + 24576, 13);
2670     for (int j = 0; j < 32768; j += 32768) {
2671       for (int k = 0; k < 8192; k += 8) {
2672         __asm__ volatile (
2673           "vmovups (%0), %%ymm0\n"
2674           "vmovups (%1), %%ymm1\n"
2675           "vmovups (%2), %%ymm2\n"
2676           "vmovups (%3), %%ymm3\n"
2677           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
2678           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
2679           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
2680           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
2681           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
2682           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
2683           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
2684           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
2685           "vmovups %%ymm0, (%0)\n"
2686           "vmovups %%ymm1, (%1)\n"
2687           "vmovups %%ymm2, (%2)\n"
2688           "vmovups %%ymm3, (%3)\n"
2689           :: "r"(buf + j + k + 0), "r"(buf + j + k + 8192), "r"(buf + j + k + 16384), "r"(buf + j + k + 24576) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
2690         );
2691       }
2692     }
2693     return;
2694   }
2695 }
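/* Entry point for the 2^15-point transform; as with helper_float_14 above,
   the wrapper just seeds the recursion at full depth. */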
2696 void helper_float_15(float *buf);
2697 void helper_float_15(float *buf) {
2698   helper_float_15_recursive(buf, 15);
2699 }
2700 void helper_float_16_recursive(float *buf, int depth);
2701 void helper_float_16_recursive(float *buf, int depth) {
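  /* Same 2^13 base case as helper_float_15_recursive above; the generator
     appears to emit a private copy per transform size rather than sharing
     code. */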
2702   if (depth == 13) {
2703     for (int j = 0; j < 8192; j += 64) {
2704       for (int k = 0; k < 8; k += 8) {
2705         __asm__ volatile (
2706           "vmovups (%0), %%ymm0\n"
2707           "vmovups (%1), %%ymm1\n"
2708           "vmovups (%2), %%ymm2\n"
2709           "vmovups (%3), %%ymm3\n"
2710           "vmovups (%4), %%ymm4\n"
2711           "vmovups (%5), %%ymm5\n"
2712           "vmovups (%6), %%ymm6\n"
2713           "vmovups (%7), %%ymm7\n"
2714           "vpermilps $160, %%ymm0, %%ymm8\n"
2715           "vpermilps $245, %%ymm0, %%ymm9\n"
2716           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
2717           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
2718           "vaddsubps %%ymm11, %%ymm8, %%ymm0\n"
2719           "vpermilps $160, %%ymm1, %%ymm8\n"
2720           "vpermilps $245, %%ymm1, %%ymm9\n"
2721           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
2722           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
2723           "vaddsubps %%ymm11, %%ymm8, %%ymm1\n"
2724           "vpermilps $160, %%ymm2, %%ymm8\n"
2725           "vpermilps $245, %%ymm2, %%ymm9\n"
2726           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
2727           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
2728           "vaddsubps %%ymm11, %%ymm8, %%ymm2\n"
2729           "vpermilps $160, %%ymm3, %%ymm8\n"
2730           "vpermilps $245, %%ymm3, %%ymm9\n"
2731           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
2732           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
2733           "vaddsubps %%ymm11, %%ymm8, %%ymm3\n"
2734           "vpermilps $160, %%ymm4, %%ymm8\n"
2735           "vpermilps $245, %%ymm4, %%ymm9\n"
2736           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
2737           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
2738           "vaddsubps %%ymm11, %%ymm8, %%ymm4\n"
2739           "vpermilps $160, %%ymm5, %%ymm8\n"
2740           "vpermilps $245, %%ymm5, %%ymm9\n"
2741           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
2742           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
2743           "vaddsubps %%ymm11, %%ymm8, %%ymm5\n"
2744           "vpermilps $160, %%ymm6, %%ymm8\n"
2745           "vpermilps $245, %%ymm6, %%ymm9\n"
2746           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
2747           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
2748           "vaddsubps %%ymm11, %%ymm8, %%ymm6\n"
2749           "vpermilps $160, %%ymm7, %%ymm8\n"
2750           "vpermilps $245, %%ymm7, %%ymm9\n"
2751           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
2752           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
2753           "vaddsubps %%ymm11, %%ymm8, %%ymm7\n"
2754           "vpermilps $68, %%ymm0, %%ymm8\n"
2755           "vpermilps $238, %%ymm0, %%ymm9\n"
2756           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
2757           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
2758           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
2759           "vaddps %%ymm8, %%ymm12, %%ymm0\n"
2760           "vpermilps $68, %%ymm1, %%ymm8\n"
2761           "vpermilps $238, %%ymm1, %%ymm9\n"
2762           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
2763           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
2764           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
2765           "vaddps %%ymm8, %%ymm12, %%ymm1\n"
2766           "vpermilps $68, %%ymm2, %%ymm8\n"
2767           "vpermilps $238, %%ymm2, %%ymm9\n"
2768           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
2769           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
2770           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
2771           "vaddps %%ymm8, %%ymm12, %%ymm2\n"
2772           "vpermilps $68, %%ymm3, %%ymm8\n"
2773           "vpermilps $238, %%ymm3, %%ymm9\n"
2774           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
2775           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
2776           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
2777           "vaddps %%ymm8, %%ymm12, %%ymm3\n"
2778           "vpermilps $68, %%ymm4, %%ymm8\n"
2779           "vpermilps $238, %%ymm4, %%ymm9\n"
2780           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
2781           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
2782           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
2783           "vaddps %%ymm8, %%ymm12, %%ymm4\n"
2784           "vpermilps $68, %%ymm5, %%ymm8\n"
2785           "vpermilps $238, %%ymm5, %%ymm9\n"
2786           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
2787           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
2788           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
2789           "vaddps %%ymm8, %%ymm12, %%ymm5\n"
2790           "vpermilps $68, %%ymm6, %%ymm8\n"
2791           "vpermilps $238, %%ymm6, %%ymm9\n"
2792           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
2793           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
2794           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
2795           "vaddps %%ymm8, %%ymm12, %%ymm6\n"
2796           "vpermilps $68, %%ymm7, %%ymm8\n"
2797           "vpermilps $238, %%ymm7, %%ymm9\n"
2798           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
2799           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
2800           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
2801           "vaddps %%ymm8, %%ymm12, %%ymm7\n"
2802           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
2803           "vsubps %%ymm0, %%ymm8, %%ymm9\n"
2804           "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n"
2805           "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n"
2806           "vaddps %%ymm10, %%ymm11, %%ymm0\n"
2807           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
2808           "vsubps %%ymm1, %%ymm8, %%ymm9\n"
2809           "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n"
2810           "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n"
2811           "vaddps %%ymm10, %%ymm11, %%ymm1\n"
2812           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
2813           "vsubps %%ymm2, %%ymm8, %%ymm9\n"
2814           "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n"
2815           "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n"
2816           "vaddps %%ymm10, %%ymm11, %%ymm2\n"
2817           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
2818           "vsubps %%ymm3, %%ymm8, %%ymm9\n"
2819           "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n"
2820           "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n"
2821           "vaddps %%ymm10, %%ymm11, %%ymm3\n"
2822           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
2823           "vsubps %%ymm4, %%ymm8, %%ymm9\n"
2824           "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n"
2825           "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n"
2826           "vaddps %%ymm10, %%ymm11, %%ymm4\n"
2827           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
2828           "vsubps %%ymm5, %%ymm8, %%ymm9\n"
2829           "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n"
2830           "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n"
2831           "vaddps %%ymm10, %%ymm11, %%ymm5\n"
2832           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
2833           "vsubps %%ymm6, %%ymm8, %%ymm9\n"
2834           "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n"
2835           "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n"
2836           "vaddps %%ymm10, %%ymm11, %%ymm6\n"
2837           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
2838           "vsubps %%ymm7, %%ymm8, %%ymm9\n"
2839           "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n"
2840           "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n"
2841           "vaddps %%ymm10, %%ymm11, %%ymm7\n"
2842           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
2843           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
2844           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
2845           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
2846           "vaddps %%ymm5, %%ymm4, %%ymm12\n"
2847           "vsubps %%ymm5, %%ymm4, %%ymm13\n"
2848           "vaddps %%ymm7, %%ymm6, %%ymm14\n"
2849           "vsubps %%ymm7, %%ymm6, %%ymm15\n"
2850           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
2851           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
2852           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
2853           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
2854           "vaddps %%ymm14, %%ymm12, %%ymm4\n"
2855           "vsubps %%ymm14, %%ymm12, %%ymm6\n"
2856           "vaddps %%ymm15, %%ymm13, %%ymm5\n"
2857           "vsubps %%ymm15, %%ymm13, %%ymm7\n"
2858           "vaddps %%ymm4, %%ymm0, %%ymm8\n"
2859           "vsubps %%ymm4, %%ymm0, %%ymm12\n"
2860           "vaddps %%ymm5, %%ymm1, %%ymm9\n"
2861           "vsubps %%ymm5, %%ymm1, %%ymm13\n"
2862           "vaddps %%ymm6, %%ymm2, %%ymm10\n"
2863           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
2864           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
2865           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
2866           "vmovups %%ymm8, (%0)\n"
2867           "vmovups %%ymm9, (%1)\n"
2868           "vmovups %%ymm10, (%2)\n"
2869           "vmovups %%ymm11, (%3)\n"
2870           "vmovups %%ymm12, (%4)\n"
2871           "vmovups %%ymm13, (%5)\n"
2872           "vmovups %%ymm14, (%6)\n"
2873           "vmovups %%ymm15, (%7)\n"
2874           :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
2875         );
2876       }
2877     }
2878     for (int j = 0; j < 8192; j += 512) {
2879       for (int k = 0; k < 64; k += 8) {
2880         __asm__ volatile (
2881           "vmovups (%0), %%ymm0\n"
2882           "vmovups (%1), %%ymm1\n"
2883           "vmovups (%2), %%ymm2\n"
2884           "vmovups (%3), %%ymm3\n"
2885           "vmovups (%4), %%ymm4\n"
2886           "vmovups (%5), %%ymm5\n"
2887           "vmovups (%6), %%ymm6\n"
2888           "vmovups (%7), %%ymm7\n"
2889           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
2890           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
2891           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
2892           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
2893           "vaddps %%ymm5, %%ymm4, %%ymm12\n"
2894           "vsubps %%ymm5, %%ymm4, %%ymm13\n"
2895           "vaddps %%ymm7, %%ymm6, %%ymm14\n"
2896           "vsubps %%ymm7, %%ymm6, %%ymm15\n"
2897           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
2898           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
2899           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
2900           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
2901           "vaddps %%ymm14, %%ymm12, %%ymm4\n"
2902           "vsubps %%ymm14, %%ymm12, %%ymm6\n"
2903           "vaddps %%ymm15, %%ymm13, %%ymm5\n"
2904           "vsubps %%ymm15, %%ymm13, %%ymm7\n"
2905           "vaddps %%ymm4, %%ymm0, %%ymm8\n"
2906           "vsubps %%ymm4, %%ymm0, %%ymm12\n"
2907           "vaddps %%ymm5, %%ymm1, %%ymm9\n"
2908           "vsubps %%ymm5, %%ymm1, %%ymm13\n"
2909           "vaddps %%ymm6, %%ymm2, %%ymm10\n"
2910           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
2911           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
2912           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
2913           "vmovups %%ymm8, (%0)\n"
2914           "vmovups %%ymm9, (%1)\n"
2915           "vmovups %%ymm10, (%2)\n"
2916           "vmovups %%ymm11, (%3)\n"
2917           "vmovups %%ymm12, (%4)\n"
2918           "vmovups %%ymm13, (%5)\n"
2919           "vmovups %%ymm14, (%6)\n"
2920           "vmovups %%ymm15, (%7)\n"
2921           :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
2922         );
2923       }
2924     }
2925     for (int j = 0; j < 8192; j += 4096) {
2926       for (int k = 0; k < 512; k += 8) {
2927         __asm__ volatile (
2928           "vmovups (%0), %%ymm0\n"
2929           "vmovups (%1), %%ymm1\n"
2930           "vmovups (%2), %%ymm2\n"
2931           "vmovups (%3), %%ymm3\n"
2932           "vmovups (%4), %%ymm4\n"
2933           "vmovups (%5), %%ymm5\n"
2934           "vmovups (%6), %%ymm6\n"
2935           "vmovups (%7), %%ymm7\n"
2936           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
2937           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
2938           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
2939           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
2940           "vaddps %%ymm5, %%ymm4, %%ymm12\n"
2941           "vsubps %%ymm5, %%ymm4, %%ymm13\n"
2942           "vaddps %%ymm7, %%ymm6, %%ymm14\n"
2943           "vsubps %%ymm7, %%ymm6, %%ymm15\n"
2944           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
2945           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
2946           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
2947           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
2948           "vaddps %%ymm14, %%ymm12, %%ymm4\n"
2949           "vsubps %%ymm14, %%ymm12, %%ymm6\n"
2950           "vaddps %%ymm15, %%ymm13, %%ymm5\n"
2951           "vsubps %%ymm15, %%ymm13, %%ymm7\n"
2952           "vaddps %%ymm4, %%ymm0, %%ymm8\n"
2953           "vsubps %%ymm4, %%ymm0, %%ymm12\n"
2954           "vaddps %%ymm5, %%ymm1, %%ymm9\n"
2955           "vsubps %%ymm5, %%ymm1, %%ymm13\n"
2956           "vaddps %%ymm6, %%ymm2, %%ymm10\n"
2957           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
2958           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
2959           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
2960           "vmovups %%ymm8, (%0)\n"
2961           "vmovups %%ymm9, (%1)\n"
2962           "vmovups %%ymm10, (%2)\n"
2963           "vmovups %%ymm11, (%3)\n"
2964           "vmovups %%ymm12, (%4)\n"
2965           "vmovups %%ymm13, (%5)\n"
2966           "vmovups %%ymm14, (%6)\n"
2967           "vmovups %%ymm15, (%7)\n"
2968           :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
2969         );
2970       }
2971     }
2972     for (int j = 0; j < 8192; j += 8192) {
2973       for (int k = 0; k < 4096; k += 8) {
2974         __asm__ volatile (
2975           "vmovups (%0), %%ymm0\n"
2976           "vmovups (%1), %%ymm1\n"
2977           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
2978           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
2979           "vmovups %%ymm8, (%0)\n"
2980           "vmovups %%ymm9, (%1)\n"
2981           :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
2982         );
2983       }
2984     }
2985     return;
2986   }
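  /* depth == 16: eight recursive 2^13 transforms, then a single radix-8 pass
     at stride 8192 merges them into the 2^16 = 65536-float result. */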
2987   if (depth == 16) {
2988     helper_float_16_recursive(buf + 0, 13);
2989     helper_float_16_recursive(buf + 8192, 13);
2990     helper_float_16_recursive(buf + 16384, 13);
2991     helper_float_16_recursive(buf + 24576, 13);
2992     helper_float_16_recursive(buf + 32768, 13);
2993     helper_float_16_recursive(buf + 40960, 13);
2994     helper_float_16_recursive(buf + 49152, 13);
2995     helper_float_16_recursive(buf + 57344, 13);
2996     for (int j = 0; j < 65536; j += 65536) {
2997       for (int k = 0; k < 8192; k += 8) {
2998         __asm__ volatile (
2999           "vmovups (%0), %%ymm0\n"
3000           "vmovups (%1), %%ymm1\n"
3001           "vmovups (%2), %%ymm2\n"
3002           "vmovups (%3), %%ymm3\n"
3003           "vmovups (%4), %%ymm4\n"
3004           "vmovups (%5), %%ymm5\n"
3005           "vmovups (%6), %%ymm6\n"
3006           "vmovups (%7), %%ymm7\n"
3007           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
3008           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
3009           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
3010           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
3011           "vaddps %%ymm5, %%ymm4, %%ymm12\n"
3012           "vsubps %%ymm5, %%ymm4, %%ymm13\n"
3013           "vaddps %%ymm7, %%ymm6, %%ymm14\n"
3014           "vsubps %%ymm7, %%ymm6, %%ymm15\n"
3015           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
3016           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
3017           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
3018           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
3019           "vaddps %%ymm14, %%ymm12, %%ymm4\n"
3020           "vsubps %%ymm14, %%ymm12, %%ymm6\n"
3021           "vaddps %%ymm15, %%ymm13, %%ymm5\n"
3022           "vsubps %%ymm15, %%ymm13, %%ymm7\n"
3023           "vaddps %%ymm4, %%ymm0, %%ymm8\n"
3024           "vsubps %%ymm4, %%ymm0, %%ymm12\n"
3025           "vaddps %%ymm5, %%ymm1, %%ymm9\n"
3026           "vsubps %%ymm5, %%ymm1, %%ymm13\n"
3027           "vaddps %%ymm6, %%ymm2, %%ymm10\n"
3028           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
3029           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
3030           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
3031           "vmovups %%ymm8, (%0)\n"
3032           "vmovups %%ymm9, (%1)\n"
3033           "vmovups %%ymm10, (%2)\n"
3034           "vmovups %%ymm11, (%3)\n"
3035           "vmovups %%ymm12, (%4)\n"
3036           "vmovups %%ymm13, (%5)\n"
3037           "vmovups %%ymm14, (%6)\n"
3038           "vmovups %%ymm15, (%7)\n"
3039           :: "r"(buf + j + k + 0), "r"(buf + j + k + 8192), "r"(buf + j + k + 16384), "r"(buf + j + k + 24576), "r"(buf + j + k + 32768), "r"(buf + j + k + 40960), "r"(buf + j + k + 49152), "r"(buf + j + k + 57344) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
3040         );
3041       }
3042     }
3043     return;
3044   }
3045 }
3046 void helper_float_16(float *buf);
3047 void helper_float_16(float *buf) {
3048   helper_float_16_recursive(buf, 16);
3049 }
3050 void helper_float_17_recursive(float *buf, int depth);
3051 void helper_float_17_recursive(float *buf, int depth) {
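  /* For 2^17 the recursion bottoms out at a 2^12 base case (three passes:
     in-register size-64, then radix-8 passes at strides 64 and 512). */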
3052   if (depth == 12) {
3053     for (int j = 0; j < 4096; j += 64) {
3054       for (int k = 0; k < 8; k += 8) {
3055         __asm__ volatile (
3056           "vmovups (%0), %%ymm0\n"
3057           "vmovups (%1), %%ymm1\n"
3058           "vmovups (%2), %%ymm2\n"
3059           "vmovups (%3), %%ymm3\n"
3060           "vmovups (%4), %%ymm4\n"
3061           "vmovups (%5), %%ymm5\n"
3062           "vmovups (%6), %%ymm6\n"
3063           "vmovups (%7), %%ymm7\n"
3064           "vpermilps $160, %%ymm0, %%ymm8\n"
3065           "vpermilps $245, %%ymm0, %%ymm9\n"
3066           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3067           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3068           "vaddsubps %%ymm11, %%ymm8, %%ymm0\n"
3069           "vpermilps $160, %%ymm1, %%ymm8\n"
3070           "vpermilps $245, %%ymm1, %%ymm9\n"
3071           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3072           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3073           "vaddsubps %%ymm11, %%ymm8, %%ymm1\n"
3074           "vpermilps $160, %%ymm2, %%ymm8\n"
3075           "vpermilps $245, %%ymm2, %%ymm9\n"
3076           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3077           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3078           "vaddsubps %%ymm11, %%ymm8, %%ymm2\n"
3079           "vpermilps $160, %%ymm3, %%ymm8\n"
3080           "vpermilps $245, %%ymm3, %%ymm9\n"
3081           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3082           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3083           "vaddsubps %%ymm11, %%ymm8, %%ymm3\n"
3084           "vpermilps $160, %%ymm4, %%ymm8\n"
3085           "vpermilps $245, %%ymm4, %%ymm9\n"
3086           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3087           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3088           "vaddsubps %%ymm11, %%ymm8, %%ymm4\n"
3089           "vpermilps $160, %%ymm5, %%ymm8\n"
3090           "vpermilps $245, %%ymm5, %%ymm9\n"
3091           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3092           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3093           "vaddsubps %%ymm11, %%ymm8, %%ymm5\n"
3094           "vpermilps $160, %%ymm6, %%ymm8\n"
3095           "vpermilps $245, %%ymm6, %%ymm9\n"
3096           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3097           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3098           "vaddsubps %%ymm11, %%ymm8, %%ymm6\n"
3099           "vpermilps $160, %%ymm7, %%ymm8\n"
3100           "vpermilps $245, %%ymm7, %%ymm9\n"
3101           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3102           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3103           "vaddsubps %%ymm11, %%ymm8, %%ymm7\n"
3104           "vpermilps $68, %%ymm0, %%ymm8\n"
3105           "vpermilps $238, %%ymm0, %%ymm9\n"
3106           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3107           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3108           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
3109           "vaddps %%ymm8, %%ymm12, %%ymm0\n"
3110           "vpermilps $68, %%ymm1, %%ymm8\n"
3111           "vpermilps $238, %%ymm1, %%ymm9\n"
3112           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3113           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3114           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
3115           "vaddps %%ymm8, %%ymm12, %%ymm1\n"
3116           "vpermilps $68, %%ymm2, %%ymm8\n"
3117           "vpermilps $238, %%ymm2, %%ymm9\n"
3118           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3119           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3120           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
3121           "vaddps %%ymm8, %%ymm12, %%ymm2\n"
3122           "vpermilps $68, %%ymm3, %%ymm8\n"
3123           "vpermilps $238, %%ymm3, %%ymm9\n"
3124           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3125           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3126           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
3127           "vaddps %%ymm8, %%ymm12, %%ymm3\n"
3128           "vpermilps $68, %%ymm4, %%ymm8\n"
3129           "vpermilps $238, %%ymm4, %%ymm9\n"
3130           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3131           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3132           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
3133           "vaddps %%ymm8, %%ymm12, %%ymm4\n"
3134           "vpermilps $68, %%ymm5, %%ymm8\n"
3135           "vpermilps $238, %%ymm5, %%ymm9\n"
3136           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3137           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3138           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
3139           "vaddps %%ymm8, %%ymm12, %%ymm5\n"
3140           "vpermilps $68, %%ymm6, %%ymm8\n"
3141           "vpermilps $238, %%ymm6, %%ymm9\n"
3142           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3143           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3144           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
3145           "vaddps %%ymm8, %%ymm12, %%ymm6\n"
3146           "vpermilps $68, %%ymm7, %%ymm8\n"
3147           "vpermilps $238, %%ymm7, %%ymm9\n"
3148           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3149           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3150           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
3151           "vaddps %%ymm8, %%ymm12, %%ymm7\n"
3152           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
3153           "vsubps %%ymm0, %%ymm8, %%ymm9\n"
3154           "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n"
3155           "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n"
3156           "vaddps %%ymm10, %%ymm11, %%ymm0\n"
3157           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
3158           "vsubps %%ymm1, %%ymm8, %%ymm9\n"
3159           "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n"
3160           "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n"
3161           "vaddps %%ymm10, %%ymm11, %%ymm1\n"
3162           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
3163           "vsubps %%ymm2, %%ymm8, %%ymm9\n"
3164           "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n"
3165           "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n"
3166           "vaddps %%ymm10, %%ymm11, %%ymm2\n"
3167           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
3168           "vsubps %%ymm3, %%ymm8, %%ymm9\n"
3169           "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n"
3170           "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n"
3171           "vaddps %%ymm10, %%ymm11, %%ymm3\n"
3172           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
3173           "vsubps %%ymm4, %%ymm8, %%ymm9\n"
3174           "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n"
3175           "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n"
3176           "vaddps %%ymm10, %%ymm11, %%ymm4\n"
3177           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
3178           "vsubps %%ymm5, %%ymm8, %%ymm9\n"
3179           "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n"
3180           "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n"
3181           "vaddps %%ymm10, %%ymm11, %%ymm5\n"
3182           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
3183           "vsubps %%ymm6, %%ymm8, %%ymm9\n"
3184           "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n"
3185           "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n"
3186           "vaddps %%ymm10, %%ymm11, %%ymm6\n"
3187           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
3188           "vsubps %%ymm7, %%ymm8, %%ymm9\n"
3189           "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n"
3190           "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n"
3191           "vaddps %%ymm10, %%ymm11, %%ymm7\n"
3192           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
3193           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
3194           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
3195           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
3196           "vaddps %%ymm5, %%ymm4, %%ymm12\n"
3197           "vsubps %%ymm5, %%ymm4, %%ymm13\n"
3198           "vaddps %%ymm7, %%ymm6, %%ymm14\n"
3199           "vsubps %%ymm7, %%ymm6, %%ymm15\n"
3200           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
3201           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
3202           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
3203           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
3204           "vaddps %%ymm14, %%ymm12, %%ymm4\n"
3205           "vsubps %%ymm14, %%ymm12, %%ymm6\n"
3206           "vaddps %%ymm15, %%ymm13, %%ymm5\n"
3207           "vsubps %%ymm15, %%ymm13, %%ymm7\n"
3208           "vaddps %%ymm4, %%ymm0, %%ymm8\n"
3209           "vsubps %%ymm4, %%ymm0, %%ymm12\n"
3210           "vaddps %%ymm5, %%ymm1, %%ymm9\n"
3211           "vsubps %%ymm5, %%ymm1, %%ymm13\n"
3212           "vaddps %%ymm6, %%ymm2, %%ymm10\n"
3213           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
3214           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
3215           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
3216           "vmovups %%ymm8, (%0)\n"
3217           "vmovups %%ymm9, (%1)\n"
3218           "vmovups %%ymm10, (%2)\n"
3219           "vmovups %%ymm11, (%3)\n"
3220           "vmovups %%ymm12, (%4)\n"
3221           "vmovups %%ymm13, (%5)\n"
3222           "vmovups %%ymm14, (%6)\n"
3223           "vmovups %%ymm15, (%7)\n"
3224           :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
3225         );
3226       }
3227     }
3228     for (int j = 0; j < 4096; j += 512) {
3229       for (int k = 0; k < 64; k += 8) {
3230         __asm__ volatile (
3231           "vmovups (%0), %%ymm0\n"
3232           "vmovups (%1), %%ymm1\n"
3233           "vmovups (%2), %%ymm2\n"
3234           "vmovups (%3), %%ymm3\n"
3235           "vmovups (%4), %%ymm4\n"
3236           "vmovups (%5), %%ymm5\n"
3237           "vmovups (%6), %%ymm6\n"
3238           "vmovups (%7), %%ymm7\n"
3239           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
3240           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
3241           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
3242           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
3243           "vaddps %%ymm5, %%ymm4, %%ymm12\n"
3244           "vsubps %%ymm5, %%ymm4, %%ymm13\n"
3245           "vaddps %%ymm7, %%ymm6, %%ymm14\n"
3246           "vsubps %%ymm7, %%ymm6, %%ymm15\n"
3247           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
3248           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
3249           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
3250           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
3251           "vaddps %%ymm14, %%ymm12, %%ymm4\n"
3252           "vsubps %%ymm14, %%ymm12, %%ymm6\n"
3253           "vaddps %%ymm15, %%ymm13, %%ymm5\n"
3254           "vsubps %%ymm15, %%ymm13, %%ymm7\n"
3255           "vaddps %%ymm4, %%ymm0, %%ymm8\n"
3256           "vsubps %%ymm4, %%ymm0, %%ymm12\n"
3257           "vaddps %%ymm5, %%ymm1, %%ymm9\n"
3258           "vsubps %%ymm5, %%ymm1, %%ymm13\n"
3259           "vaddps %%ymm6, %%ymm2, %%ymm10\n"
3260           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
3261           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
3262           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
3263           "vmovups %%ymm8, (%0)\n"
3264           "vmovups %%ymm9, (%1)\n"
3265           "vmovups %%ymm10, (%2)\n"
3266           "vmovups %%ymm11, (%3)\n"
3267           "vmovups %%ymm12, (%4)\n"
3268           "vmovups %%ymm13, (%5)\n"
3269           "vmovups %%ymm14, (%6)\n"
3270           "vmovups %%ymm15, (%7)\n"
3271           :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
3272         );
3273       }
3274     }
3275     for (int j = 0; j < 4096; j += 4096) {
3276       for (int k = 0; k < 512; k += 8) {
3277         __asm__ volatile (
3278           "vmovups (%0), %%ymm0\n"
3279           "vmovups (%1), %%ymm1\n"
3280           "vmovups (%2), %%ymm2\n"
3281           "vmovups (%3), %%ymm3\n"
3282           "vmovups (%4), %%ymm4\n"
3283           "vmovups (%5), %%ymm5\n"
3284           "vmovups (%6), %%ymm6\n"
3285           "vmovups (%7), %%ymm7\n"
3286           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
3287           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
3288           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
3289           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
3290           "vaddps %%ymm5, %%ymm4, %%ymm12\n"
3291           "vsubps %%ymm5, %%ymm4, %%ymm13\n"
3292           "vaddps %%ymm7, %%ymm6, %%ymm14\n"
3293           "vsubps %%ymm7, %%ymm6, %%ymm15\n"
3294           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
3295           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
3296           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
3297           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
3298           "vaddps %%ymm14, %%ymm12, %%ymm4\n"
3299           "vsubps %%ymm14, %%ymm12, %%ymm6\n"
3300           "vaddps %%ymm15, %%ymm13, %%ymm5\n"
3301           "vsubps %%ymm15, %%ymm13, %%ymm7\n"
3302           "vaddps %%ymm4, %%ymm0, %%ymm8\n"
3303           "vsubps %%ymm4, %%ymm0, %%ymm12\n"
3304           "vaddps %%ymm5, %%ymm1, %%ymm9\n"
3305           "vsubps %%ymm5, %%ymm1, %%ymm13\n"
3306           "vaddps %%ymm6, %%ymm2, %%ymm10\n"
3307           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
3308           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
3309           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
3310           "vmovups %%ymm8, (%0)\n"
3311           "vmovups %%ymm9, (%1)\n"
3312           "vmovups %%ymm10, (%2)\n"
3313           "vmovups %%ymm11, (%3)\n"
3314           "vmovups %%ymm12, (%4)\n"
3315           "vmovups %%ymm13, (%5)\n"
3316           "vmovups %%ymm14, (%6)\n"
3317           "vmovups %%ymm15, (%7)\n"
3318           :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
3319         );
3320       }
3321     }
3322     return;
3323   }
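  /* depth == 15 (intermediate level): eight 2^12 sub-transforms plus a
     radix-8 pass at stride 4096. */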
3324   if (depth == 15) {
3325     helper_float_17_recursive(buf + 0, 12);
3326     helper_float_17_recursive(buf + 4096, 12);
3327     helper_float_17_recursive(buf + 8192, 12);
3328     helper_float_17_recursive(buf + 12288, 12);
3329     helper_float_17_recursive(buf + 16384, 12);
3330     helper_float_17_recursive(buf + 20480, 12);
3331     helper_float_17_recursive(buf + 24576, 12);
3332     helper_float_17_recursive(buf + 28672, 12);
3333     for (int j = 0; j < 32768; j += 32768) {
3334       for (int k = 0; k < 4096; k += 8) {
3335         __asm__ volatile (
3336           "vmovups (%0), %%ymm0\n"
3337           "vmovups (%1), %%ymm1\n"
3338           "vmovups (%2), %%ymm2\n"
3339           "vmovups (%3), %%ymm3\n"
3340           "vmovups (%4), %%ymm4\n"
3341           "vmovups (%5), %%ymm5\n"
3342           "vmovups (%6), %%ymm6\n"
3343           "vmovups (%7), %%ymm7\n"
3344           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
3345           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
3346           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
3347           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
3348           "vaddps %%ymm5, %%ymm4, %%ymm12\n"
3349           "vsubps %%ymm5, %%ymm4, %%ymm13\n"
3350           "vaddps %%ymm7, %%ymm6, %%ymm14\n"
3351           "vsubps %%ymm7, %%ymm6, %%ymm15\n"
3352           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
3353           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
3354           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
3355           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
3356           "vaddps %%ymm14, %%ymm12, %%ymm4\n"
3357           "vsubps %%ymm14, %%ymm12, %%ymm6\n"
3358           "vaddps %%ymm15, %%ymm13, %%ymm5\n"
3359           "vsubps %%ymm15, %%ymm13, %%ymm7\n"
3360           "vaddps %%ymm4, %%ymm0, %%ymm8\n"
3361           "vsubps %%ymm4, %%ymm0, %%ymm12\n"
3362           "vaddps %%ymm5, %%ymm1, %%ymm9\n"
3363           "vsubps %%ymm5, %%ymm1, %%ymm13\n"
3364           "vaddps %%ymm6, %%ymm2, %%ymm10\n"
3365           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
3366           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
3367           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
3368           "vmovups %%ymm8, (%0)\n"
3369           "vmovups %%ymm9, (%1)\n"
3370           "vmovups %%ymm10, (%2)\n"
3371           "vmovups %%ymm11, (%3)\n"
3372           "vmovups %%ymm12, (%4)\n"
3373           "vmovups %%ymm13, (%5)\n"
3374           "vmovups %%ymm14, (%6)\n"
3375           "vmovups %%ymm15, (%7)\n"
3376           :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
3377         );
3378       }
3379     }
3380     return;
3381   }
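  /* depth == 17 (top level): four 2^15 sub-transforms plus a radix-4 pass at
     stride 32768, covering 2^17 = 131072 floats in total. */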
3382   if (depth == 17) {
3383     helper_float_17_recursive(buf + 0, 15);
3384     helper_float_17_recursive(buf + 32768, 15);
3385     helper_float_17_recursive(buf + 65536, 15);
3386     helper_float_17_recursive(buf + 98304, 15);
3387     for (int j = 0; j < 131072; j += 131072) {
3388       for (int k = 0; k < 32768; k += 8) {
3389         __asm__ volatile (
3390           "vmovups (%0), %%ymm0\n"
3391           "vmovups (%1), %%ymm1\n"
3392           "vmovups (%2), %%ymm2\n"
3393           "vmovups (%3), %%ymm3\n"
3394           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
3395           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
3396           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
3397           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
3398           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
3399           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
3400           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
3401           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
3402           "vmovups %%ymm0, (%0)\n"
3403           "vmovups %%ymm1, (%1)\n"
3404           "vmovups %%ymm2, (%2)\n"
3405           "vmovups %%ymm3, (%3)\n"
3406           :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
3407         );
3408       }
3409     }
3410     return;
3411   }
3412 }
3413 void helper_float_17(float *buf);
3414 void helper_float_17(float *buf) {
3415   helper_float_17_recursive(buf, 17);
3416 }
3417 void helper_float_18_recursive(float *buf, int depth);
3418 void helper_float_18_recursive(float *buf, int depth) {
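  /* 2^18 variant, built on the same 2^12 in-register base case as
     helper_float_17_recursive. */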
3419   if (depth == 12) {
3420     for (int j = 0; j < 4096; j += 64) {
3421       for (int k = 0; k < 8; k += 8) {
3422         __asm__ volatile (
3423           "vmovups (%0), %%ymm0\n"
3424           "vmovups (%1), %%ymm1\n"
3425           "vmovups (%2), %%ymm2\n"
3426           "vmovups (%3), %%ymm3\n"
3427           "vmovups (%4), %%ymm4\n"
3428           "vmovups (%5), %%ymm5\n"
3429           "vmovups (%6), %%ymm6\n"
3430           "vmovups (%7), %%ymm7\n"
3431           "vpermilps $160, %%ymm0, %%ymm8\n"
3432           "vpermilps $245, %%ymm0, %%ymm9\n"
3433           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3434           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3435           "vaddsubps %%ymm11, %%ymm8, %%ymm0\n"
3436           "vpermilps $160, %%ymm1, %%ymm8\n"
3437           "vpermilps $245, %%ymm1, %%ymm9\n"
3438           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3439           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3440           "vaddsubps %%ymm11, %%ymm8, %%ymm1\n"
3441           "vpermilps $160, %%ymm2, %%ymm8\n"
3442           "vpermilps $245, %%ymm2, %%ymm9\n"
3443           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3444           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3445           "vaddsubps %%ymm11, %%ymm8, %%ymm2\n"
3446           "vpermilps $160, %%ymm3, %%ymm8\n"
3447           "vpermilps $245, %%ymm3, %%ymm9\n"
3448           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3449           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3450           "vaddsubps %%ymm11, %%ymm8, %%ymm3\n"
3451           "vpermilps $160, %%ymm4, %%ymm8\n"
3452           "vpermilps $245, %%ymm4, %%ymm9\n"
3453           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3454           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3455           "vaddsubps %%ymm11, %%ymm8, %%ymm4\n"
3456           "vpermilps $160, %%ymm5, %%ymm8\n"
3457           "vpermilps $245, %%ymm5, %%ymm9\n"
3458           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3459           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3460           "vaddsubps %%ymm11, %%ymm8, %%ymm5\n"
3461           "vpermilps $160, %%ymm6, %%ymm8\n"
3462           "vpermilps $245, %%ymm6, %%ymm9\n"
3463           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3464           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3465           "vaddsubps %%ymm11, %%ymm8, %%ymm6\n"
3466           "vpermilps $160, %%ymm7, %%ymm8\n"
3467           "vpermilps $245, %%ymm7, %%ymm9\n"
3468           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3469           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3470           "vaddsubps %%ymm11, %%ymm8, %%ymm7\n"
3471           "vpermilps $68, %%ymm0, %%ymm8\n"
3472           "vpermilps $238, %%ymm0, %%ymm9\n"
3473           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3474           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3475           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
3476           "vaddps %%ymm8, %%ymm12, %%ymm0\n"
3477           "vpermilps $68, %%ymm1, %%ymm8\n"
3478           "vpermilps $238, %%ymm1, %%ymm9\n"
3479           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3480           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3481           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
3482           "vaddps %%ymm8, %%ymm12, %%ymm1\n"
3483           "vpermilps $68, %%ymm2, %%ymm8\n"
3484           "vpermilps $238, %%ymm2, %%ymm9\n"
3485           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3486           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3487           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
3488           "vaddps %%ymm8, %%ymm12, %%ymm2\n"
3489           "vpermilps $68, %%ymm3, %%ymm8\n"
3490           "vpermilps $238, %%ymm3, %%ymm9\n"
3491           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3492           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3493           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
3494           "vaddps %%ymm8, %%ymm12, %%ymm3\n"
3495           "vpermilps $68, %%ymm4, %%ymm8\n"
3496           "vpermilps $238, %%ymm4, %%ymm9\n"
3497           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3498           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3499           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
3500           "vaddps %%ymm8, %%ymm12, %%ymm4\n"
3501           "vpermilps $68, %%ymm5, %%ymm8\n"
3502           "vpermilps $238, %%ymm5, %%ymm9\n"
3503           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3504           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3505           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
3506           "vaddps %%ymm8, %%ymm12, %%ymm5\n"
3507           "vpermilps $68, %%ymm6, %%ymm8\n"
3508           "vpermilps $238, %%ymm6, %%ymm9\n"
3509           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3510           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3511           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
3512           "vaddps %%ymm8, %%ymm12, %%ymm6\n"
3513           "vpermilps $68, %%ymm7, %%ymm8\n"
3514           "vpermilps $238, %%ymm7, %%ymm9\n"
3515           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3516           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3517           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
3518           "vaddps %%ymm8, %%ymm12, %%ymm7\n"
3519           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
3520           "vsubps %%ymm0, %%ymm8, %%ymm9\n"
3521           "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n"
3522           "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n"
3523           "vaddps %%ymm10, %%ymm11, %%ymm0\n"
3524           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
3525           "vsubps %%ymm1, %%ymm8, %%ymm9\n"
3526           "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n"
3527           "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n"
3528           "vaddps %%ymm10, %%ymm11, %%ymm1\n"
3529           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
3530           "vsubps %%ymm2, %%ymm8, %%ymm9\n"
3531           "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n"
3532           "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n"
3533           "vaddps %%ymm10, %%ymm11, %%ymm2\n"
3534           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
3535           "vsubps %%ymm3, %%ymm8, %%ymm9\n"
3536           "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n"
3537           "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n"
3538           "vaddps %%ymm10, %%ymm11, %%ymm3\n"
3539           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
3540           "vsubps %%ymm4, %%ymm8, %%ymm9\n"
3541           "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n"
3542           "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n"
3543           "vaddps %%ymm10, %%ymm11, %%ymm4\n"
3544           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
3545           "vsubps %%ymm5, %%ymm8, %%ymm9\n"
3546           "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n"
3547           "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n"
3548           "vaddps %%ymm10, %%ymm11, %%ymm5\n"
3549           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
3550           "vsubps %%ymm6, %%ymm8, %%ymm9\n"
3551           "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n"
3552           "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n"
3553           "vaddps %%ymm10, %%ymm11, %%ymm6\n"
3554           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
3555           "vsubps %%ymm7, %%ymm8, %%ymm9\n"
3556           "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n"
3557           "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n"
3558           "vaddps %%ymm10, %%ymm11, %%ymm7\n"
3559           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
3560           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
3561           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
3562           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
3563           "vaddps %%ymm5, %%ymm4, %%ymm12\n"
3564           "vsubps %%ymm5, %%ymm4, %%ymm13\n"
3565           "vaddps %%ymm7, %%ymm6, %%ymm14\n"
3566           "vsubps %%ymm7, %%ymm6, %%ymm15\n"
3567           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
3568           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
3569           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
3570           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
3571           "vaddps %%ymm14, %%ymm12, %%ymm4\n"
3572           "vsubps %%ymm14, %%ymm12, %%ymm6\n"
3573           "vaddps %%ymm15, %%ymm13, %%ymm5\n"
3574           "vsubps %%ymm15, %%ymm13, %%ymm7\n"
3575           "vaddps %%ymm4, %%ymm0, %%ymm8\n"
3576           "vsubps %%ymm4, %%ymm0, %%ymm12\n"
3577           "vaddps %%ymm5, %%ymm1, %%ymm9\n"
3578           "vsubps %%ymm5, %%ymm1, %%ymm13\n"
3579           "vaddps %%ymm6, %%ymm2, %%ymm10\n"
3580           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
3581           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
3582           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
3583           "vmovups %%ymm8, (%0)\n"
3584           "vmovups %%ymm9, (%1)\n"
3585           "vmovups %%ymm10, (%2)\n"
3586           "vmovups %%ymm11, (%3)\n"
3587           "vmovups %%ymm12, (%4)\n"
3588           "vmovups %%ymm13, (%5)\n"
3589           "vmovups %%ymm14, (%6)\n"
3590           "vmovups %%ymm15, (%7)\n"
3591           :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
3592         );
3593       }
3594     }
3595     for (int j = 0; j < 4096; j += 512) {
3596       for (int k = 0; k < 64; k += 8) {
3597         __asm__ volatile (
3598           "vmovups (%0), %%ymm0\n"
3599           "vmovups (%1), %%ymm1\n"
3600           "vmovups (%2), %%ymm2\n"
3601           "vmovups (%3), %%ymm3\n"
3602           "vmovups (%4), %%ymm4\n"
3603           "vmovups (%5), %%ymm5\n"
3604           "vmovups (%6), %%ymm6\n"
3605           "vmovups (%7), %%ymm7\n"
3606           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
3607           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
3608           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
3609           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
3610           "vaddps %%ymm5, %%ymm4, %%ymm12\n"
3611           "vsubps %%ymm5, %%ymm4, %%ymm13\n"
3612           "vaddps %%ymm7, %%ymm6, %%ymm14\n"
3613           "vsubps %%ymm7, %%ymm6, %%ymm15\n"
3614           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
3615           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
3616           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
3617           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
3618           "vaddps %%ymm14, %%ymm12, %%ymm4\n"
3619           "vsubps %%ymm14, %%ymm12, %%ymm6\n"
3620           "vaddps %%ymm15, %%ymm13, %%ymm5\n"
3621           "vsubps %%ymm15, %%ymm13, %%ymm7\n"
3622           "vaddps %%ymm4, %%ymm0, %%ymm8\n"
3623           "vsubps %%ymm4, %%ymm0, %%ymm12\n"
3624           "vaddps %%ymm5, %%ymm1, %%ymm9\n"
3625           "vsubps %%ymm5, %%ymm1, %%ymm13\n"
3626           "vaddps %%ymm6, %%ymm2, %%ymm10\n"
3627           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
3628           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
3629           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
3630           "vmovups %%ymm8, (%0)\n"
3631           "vmovups %%ymm9, (%1)\n"
3632           "vmovups %%ymm10, (%2)\n"
3633           "vmovups %%ymm11, (%3)\n"
3634           "vmovups %%ymm12, (%4)\n"
3635           "vmovups %%ymm13, (%5)\n"
3636           "vmovups %%ymm14, (%6)\n"
3637           "vmovups %%ymm15, (%7)\n"
3638           :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
3639         );
3640       }
3641     }
3642     for (int j = 0; j < 4096; j += 4096) {
3643       for (int k = 0; k < 512; k += 8) {
3644         __asm__ volatile (
3645           "vmovups (%0), %%ymm0\n"
3646           "vmovups (%1), %%ymm1\n"
3647           "vmovups (%2), %%ymm2\n"
3648           "vmovups (%3), %%ymm3\n"
3649           "vmovups (%4), %%ymm4\n"
3650           "vmovups (%5), %%ymm5\n"
3651           "vmovups (%6), %%ymm6\n"
3652           "vmovups (%7), %%ymm7\n"
3653           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
3654           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
3655           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
3656           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
3657           "vaddps %%ymm5, %%ymm4, %%ymm12\n"
3658           "vsubps %%ymm5, %%ymm4, %%ymm13\n"
3659           "vaddps %%ymm7, %%ymm6, %%ymm14\n"
3660           "vsubps %%ymm7, %%ymm6, %%ymm15\n"
3661           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
3662           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
3663           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
3664           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
3665           "vaddps %%ymm14, %%ymm12, %%ymm4\n"
3666           "vsubps %%ymm14, %%ymm12, %%ymm6\n"
3667           "vaddps %%ymm15, %%ymm13, %%ymm5\n"
3668           "vsubps %%ymm15, %%ymm13, %%ymm7\n"
3669           "vaddps %%ymm4, %%ymm0, %%ymm8\n"
3670           "vsubps %%ymm4, %%ymm0, %%ymm12\n"
3671           "vaddps %%ymm5, %%ymm1, %%ymm9\n"
3672           "vsubps %%ymm5, %%ymm1, %%ymm13\n"
3673           "vaddps %%ymm6, %%ymm2, %%ymm10\n"
3674           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
3675           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
3676           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
3677           "vmovups %%ymm8, (%0)\n"
3678           "vmovups %%ymm9, (%1)\n"
3679           "vmovups %%ymm10, (%2)\n"
3680           "vmovups %%ymm11, (%3)\n"
3681           "vmovups %%ymm12, (%4)\n"
3682           "vmovups %%ymm13, (%5)\n"
3683           "vmovups %%ymm14, (%6)\n"
3684           "vmovups %%ymm15, (%7)\n"
3685           :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
3686         );
3687       }
3688     }
3689     return;
3690   }
3691   if (depth == 15) {
3692     helper_float_18_recursive(buf + 0, 12);
3693     helper_float_18_recursive(buf + 4096, 12);
3694     helper_float_18_recursive(buf + 8192, 12);
3695     helper_float_18_recursive(buf + 12288, 12);
3696     helper_float_18_recursive(buf + 16384, 12);
3697     helper_float_18_recursive(buf + 20480, 12);
3698     helper_float_18_recursive(buf + 24576, 12);
3699     helper_float_18_recursive(buf + 28672, 12);
3700     for (int j = 0; j < 32768; j += 32768) {
3701       for (int k = 0; k < 4096; k += 8) {
3702         __asm__ volatile (
3703           "vmovups (%0), %%ymm0\n"
3704           "vmovups (%1), %%ymm1\n"
3705           "vmovups (%2), %%ymm2\n"
3706           "vmovups (%3), %%ymm3\n"
3707           "vmovups (%4), %%ymm4\n"
3708           "vmovups (%5), %%ymm5\n"
3709           "vmovups (%6), %%ymm6\n"
3710           "vmovups (%7), %%ymm7\n"
3711           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
3712           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
3713           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
3714           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
3715           "vaddps %%ymm5, %%ymm4, %%ymm12\n"
3716           "vsubps %%ymm5, %%ymm4, %%ymm13\n"
3717           "vaddps %%ymm7, %%ymm6, %%ymm14\n"
3718           "vsubps %%ymm7, %%ymm6, %%ymm15\n"
3719           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
3720           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
3721           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
3722           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
3723           "vaddps %%ymm14, %%ymm12, %%ymm4\n"
3724           "vsubps %%ymm14, %%ymm12, %%ymm6\n"
3725           "vaddps %%ymm15, %%ymm13, %%ymm5\n"
3726           "vsubps %%ymm15, %%ymm13, %%ymm7\n"
3727           "vaddps %%ymm4, %%ymm0, %%ymm8\n"
3728           "vsubps %%ymm4, %%ymm0, %%ymm12\n"
3729           "vaddps %%ymm5, %%ymm1, %%ymm9\n"
3730           "vsubps %%ymm5, %%ymm1, %%ymm13\n"
3731           "vaddps %%ymm6, %%ymm2, %%ymm10\n"
3732           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
3733           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
3734           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
3735           "vmovups %%ymm8, (%0)\n"
3736           "vmovups %%ymm9, (%1)\n"
3737           "vmovups %%ymm10, (%2)\n"
3738           "vmovups %%ymm11, (%3)\n"
3739           "vmovups %%ymm12, (%4)\n"
3740           "vmovups %%ymm13, (%5)\n"
3741           "vmovups %%ymm14, (%6)\n"
3742           "vmovups %%ymm15, (%7)\n"
3743           :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
3744         );
3745       }
3746     }
3747     return;
3748   }
3749   if (depth == 18) {
3750     helper_float_18_recursive(buf + 0, 15);
3751     helper_float_18_recursive(buf + 32768, 15);
3752     helper_float_18_recursive(buf + 65536, 15);
3753     helper_float_18_recursive(buf + 98304, 15);
3754     helper_float_18_recursive(buf + 131072, 15);
3755     helper_float_18_recursive(buf + 163840, 15);
3756     helper_float_18_recursive(buf + 196608, 15);
3757     helper_float_18_recursive(buf + 229376, 15);
3758     for (int j = 0; j < 262144; j += 262144) {
3759       for (int k = 0; k < 32768; k += 8) {
3760         __asm__ volatile (
3761           "vmovups (%0), %%ymm0\n"
3762           "vmovups (%1), %%ymm1\n"
3763           "vmovups (%2), %%ymm2\n"
3764           "vmovups (%3), %%ymm3\n"
3765           "vmovups (%4), %%ymm4\n"
3766           "vmovups (%5), %%ymm5\n"
3767           "vmovups (%6), %%ymm6\n"
3768           "vmovups (%7), %%ymm7\n"
3769           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
3770           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
3771           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
3772           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
3773           "vaddps %%ymm5, %%ymm4, %%ymm12\n"
3774           "vsubps %%ymm5, %%ymm4, %%ymm13\n"
3775           "vaddps %%ymm7, %%ymm6, %%ymm14\n"
3776           "vsubps %%ymm7, %%ymm6, %%ymm15\n"
3777           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
3778           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
3779           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
3780           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
3781           "vaddps %%ymm14, %%ymm12, %%ymm4\n"
3782           "vsubps %%ymm14, %%ymm12, %%ymm6\n"
3783           "vaddps %%ymm15, %%ymm13, %%ymm5\n"
3784           "vsubps %%ymm15, %%ymm13, %%ymm7\n"
3785           "vaddps %%ymm4, %%ymm0, %%ymm8\n"
3786           "vsubps %%ymm4, %%ymm0, %%ymm12\n"
3787           "vaddps %%ymm5, %%ymm1, %%ymm9\n"
3788           "vsubps %%ymm5, %%ymm1, %%ymm13\n"
3789           "vaddps %%ymm6, %%ymm2, %%ymm10\n"
3790           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
3791           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
3792           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
3793           "vmovups %%ymm8, (%0)\n"
3794           "vmovups %%ymm9, (%1)\n"
3795           "vmovups %%ymm10, (%2)\n"
3796           "vmovups %%ymm11, (%3)\n"
3797           "vmovups %%ymm12, (%4)\n"
3798           "vmovups %%ymm13, (%5)\n"
3799           "vmovups %%ymm14, (%6)\n"
3800           "vmovups %%ymm15, (%7)\n"
3801           :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
3802         );
3803       }
3804     }
3805     return;
3806   }
3807 }
3808 void helper_float_18(float *buf);
3809 void helper_float_18(float *buf) {
3810   helper_float_18_recursive(buf, 18);
3811 }
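/* Illustrative only, not from the original file: the
   vpermilps/vaddsubps/vperm2f128 prologue that opens each leaf tile
   above computes an in-register Hadamard transform of eight floats.
   The same step with AVX intrinsics (hypothetical helper name; assumes
   <immintrin.h> and -mavx), using the identical immediates: */
#include <immintrin.h>
static inline __m256 fht8_intrin_sketch(__m256 x) {
  /* stride 1: x <- (x0+x1, x0-x1, x2+x3, x2-x3, ...) */
  __m256 even = _mm256_permute_ps(x, 0xA0);   /* lanes [0,0,2,2] */
  __m256 odd  = _mm256_permute_ps(x, 0xF5);   /* lanes [1,1,3,3] */
  __m256 nodd = _mm256_sub_ps(_mm256_setzero_ps(), odd);
  /* addsub: even slots = even - nodd = even + odd,
             odd slots  = even + nodd = even - odd */
  x = _mm256_addsub_ps(even, nodd);
  /* stride 2, within each 128-bit half */
  __m256 lo  = _mm256_permute_ps(x, 0x44);    /* lanes [0,1,0,1] */
  __m256 hi  = _mm256_permute_ps(x, 0xEE);    /* lanes [2,3,2,3] */
  __m256 nhi = _mm256_sub_ps(_mm256_setzero_ps(), hi);
  x = _mm256_add_ps(lo, _mm256_blend_ps(hi, nhi, 0xCC));
  /* stride 4, across the two 128-bit halves */
  __m256 nx = _mm256_sub_ps(_mm256_setzero_ps(), x);
  __m256 a  = _mm256_permute2f128_ps(x, x, 0x00);   /* [lo | lo]  */
  __m256 b  = _mm256_permute2f128_ps(x, nx, 0x31);  /* [hi | -hi] */
  return _mm256_add_ps(a, b);
}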
3812 void helper_float_19_recursive(float *buf, int depth);
3813 void helper_float_19_recursive(float *buf, int depth) {
3814   if (depth == 13) {
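    /* Leaf kernel for n = 2^13: the same unrolled passes as the 2^12
       leaf, plus a final radix-2 pass pairing the two 4096-element
       halves. */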
3815     for (int j = 0; j < 8192; j += 64) {
3816       for (int k = 0; k < 8; k += 8) {
3817         __asm__ volatile (
3818           "vmovups (%0), %%ymm0\n"
3819           "vmovups (%1), %%ymm1\n"
3820           "vmovups (%2), %%ymm2\n"
3821           "vmovups (%3), %%ymm3\n"
3822           "vmovups (%4), %%ymm4\n"
3823           "vmovups (%5), %%ymm5\n"
3824           "vmovups (%6), %%ymm6\n"
3825           "vmovups (%7), %%ymm7\n"
3826           "vpermilps $160, %%ymm0, %%ymm8\n"
3827           "vpermilps $245, %%ymm0, %%ymm9\n"
3828           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3829           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3830           "vaddsubps %%ymm11, %%ymm8, %%ymm0\n"
3831           "vpermilps $160, %%ymm1, %%ymm8\n"
3832           "vpermilps $245, %%ymm1, %%ymm9\n"
3833           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3834           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3835           "vaddsubps %%ymm11, %%ymm8, %%ymm1\n"
3836           "vpermilps $160, %%ymm2, %%ymm8\n"
3837           "vpermilps $245, %%ymm2, %%ymm9\n"
3838           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3839           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3840           "vaddsubps %%ymm11, %%ymm8, %%ymm2\n"
3841           "vpermilps $160, %%ymm3, %%ymm8\n"
3842           "vpermilps $245, %%ymm3, %%ymm9\n"
3843           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3844           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3845           "vaddsubps %%ymm11, %%ymm8, %%ymm3\n"
3846           "vpermilps $160, %%ymm4, %%ymm8\n"
3847           "vpermilps $245, %%ymm4, %%ymm9\n"
3848           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3849           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3850           "vaddsubps %%ymm11, %%ymm8, %%ymm4\n"
3851           "vpermilps $160, %%ymm5, %%ymm8\n"
3852           "vpermilps $245, %%ymm5, %%ymm9\n"
3853           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3854           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3855           "vaddsubps %%ymm11, %%ymm8, %%ymm5\n"
3856           "vpermilps $160, %%ymm6, %%ymm8\n"
3857           "vpermilps $245, %%ymm6, %%ymm9\n"
3858           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3859           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3860           "vaddsubps %%ymm11, %%ymm8, %%ymm6\n"
3861           "vpermilps $160, %%ymm7, %%ymm8\n"
3862           "vpermilps $245, %%ymm7, %%ymm9\n"
3863           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3864           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3865           "vaddsubps %%ymm11, %%ymm8, %%ymm7\n"
3866           "vpermilps $68, %%ymm0, %%ymm8\n"
3867           "vpermilps $238, %%ymm0, %%ymm9\n"
3868           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3869           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3870           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
3871           "vaddps %%ymm8, %%ymm12, %%ymm0\n"
3872           "vpermilps $68, %%ymm1, %%ymm8\n"
3873           "vpermilps $238, %%ymm1, %%ymm9\n"
3874           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3875           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3876           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
3877           "vaddps %%ymm8, %%ymm12, %%ymm1\n"
3878           "vpermilps $68, %%ymm2, %%ymm8\n"
3879           "vpermilps $238, %%ymm2, %%ymm9\n"
3880           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3881           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3882           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
3883           "vaddps %%ymm8, %%ymm12, %%ymm2\n"
3884           "vpermilps $68, %%ymm3, %%ymm8\n"
3885           "vpermilps $238, %%ymm3, %%ymm9\n"
3886           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3887           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3888           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
3889           "vaddps %%ymm8, %%ymm12, %%ymm3\n"
3890           "vpermilps $68, %%ymm4, %%ymm8\n"
3891           "vpermilps $238, %%ymm4, %%ymm9\n"
3892           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3893           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3894           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
3895           "vaddps %%ymm8, %%ymm12, %%ymm4\n"
3896           "vpermilps $68, %%ymm5, %%ymm8\n"
3897           "vpermilps $238, %%ymm5, %%ymm9\n"
3898           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3899           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3900           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
3901           "vaddps %%ymm8, %%ymm12, %%ymm5\n"
3902           "vpermilps $68, %%ymm6, %%ymm8\n"
3903           "vpermilps $238, %%ymm6, %%ymm9\n"
3904           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3905           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3906           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
3907           "vaddps %%ymm8, %%ymm12, %%ymm6\n"
3908           "vpermilps $68, %%ymm7, %%ymm8\n"
3909           "vpermilps $238, %%ymm7, %%ymm9\n"
3910           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3911           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3912           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
3913           "vaddps %%ymm8, %%ymm12, %%ymm7\n"
3914           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
3915           "vsubps %%ymm0, %%ymm8, %%ymm9\n"
3916           "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n"
3917           "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n"
3918           "vaddps %%ymm10, %%ymm11, %%ymm0\n"
3919           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
3920           "vsubps %%ymm1, %%ymm8, %%ymm9\n"
3921           "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n"
3922           "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n"
3923           "vaddps %%ymm10, %%ymm11, %%ymm1\n"
3924           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
3925           "vsubps %%ymm2, %%ymm8, %%ymm9\n"
3926           "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n"
3927           "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n"
3928           "vaddps %%ymm10, %%ymm11, %%ymm2\n"
3929           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
3930           "vsubps %%ymm3, %%ymm8, %%ymm9\n"
3931           "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n"
3932           "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n"
3933           "vaddps %%ymm10, %%ymm11, %%ymm3\n"
3934           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
3935           "vsubps %%ymm4, %%ymm8, %%ymm9\n"
3936           "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n"
3937           "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n"
3938           "vaddps %%ymm10, %%ymm11, %%ymm4\n"
3939           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
3940           "vsubps %%ymm5, %%ymm8, %%ymm9\n"
3941           "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n"
3942           "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n"
3943           "vaddps %%ymm10, %%ymm11, %%ymm5\n"
3944           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
3945           "vsubps %%ymm6, %%ymm8, %%ymm9\n"
3946           "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n"
3947           "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n"
3948           "vaddps %%ymm10, %%ymm11, %%ymm6\n"
3949           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
3950           "vsubps %%ymm7, %%ymm8, %%ymm9\n"
3951           "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n"
3952           "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n"
3953           "vaddps %%ymm10, %%ymm11, %%ymm7\n"
3954           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
3955           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
3956           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
3957           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
3958           "vaddps %%ymm5, %%ymm4, %%ymm12\n"
3959           "vsubps %%ymm5, %%ymm4, %%ymm13\n"
3960           "vaddps %%ymm7, %%ymm6, %%ymm14\n"
3961           "vsubps %%ymm7, %%ymm6, %%ymm15\n"
3962           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
3963           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
3964           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
3965           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
3966           "vaddps %%ymm14, %%ymm12, %%ymm4\n"
3967           "vsubps %%ymm14, %%ymm12, %%ymm6\n"
3968           "vaddps %%ymm15, %%ymm13, %%ymm5\n"
3969           "vsubps %%ymm15, %%ymm13, %%ymm7\n"
3970           "vaddps %%ymm4, %%ymm0, %%ymm8\n"
3971           "vsubps %%ymm4, %%ymm0, %%ymm12\n"
3972           "vaddps %%ymm5, %%ymm1, %%ymm9\n"
3973           "vsubps %%ymm5, %%ymm1, %%ymm13\n"
3974           "vaddps %%ymm6, %%ymm2, %%ymm10\n"
3975           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
3976           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
3977           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
3978           "vmovups %%ymm8, (%0)\n"
3979           "vmovups %%ymm9, (%1)\n"
3980           "vmovups %%ymm10, (%2)\n"
3981           "vmovups %%ymm11, (%3)\n"
3982           "vmovups %%ymm12, (%4)\n"
3983           "vmovups %%ymm13, (%5)\n"
3984           "vmovups %%ymm14, (%6)\n"
3985           "vmovups %%ymm15, (%7)\n"
3986           :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
3987         );
3988       }
3989     }
3990     for (int j = 0; j < 8192; j += 512) {
3991       for (int k = 0; k < 64; k += 8) {
3992         __asm__ volatile (
3993           "vmovups (%0), %%ymm0\n"
3994           "vmovups (%1), %%ymm1\n"
3995           "vmovups (%2), %%ymm2\n"
3996           "vmovups (%3), %%ymm3\n"
3997           "vmovups (%4), %%ymm4\n"
3998           "vmovups (%5), %%ymm5\n"
3999           "vmovups (%6), %%ymm6\n"
4000           "vmovups (%7), %%ymm7\n"
4001           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
4002           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
4003           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
4004           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
4005           "vaddps %%ymm5, %%ymm4, %%ymm12\n"
4006           "vsubps %%ymm5, %%ymm4, %%ymm13\n"
4007           "vaddps %%ymm7, %%ymm6, %%ymm14\n"
4008           "vsubps %%ymm7, %%ymm6, %%ymm15\n"
4009           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
4010           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
4011           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
4012           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
4013           "vaddps %%ymm14, %%ymm12, %%ymm4\n"
4014           "vsubps %%ymm14, %%ymm12, %%ymm6\n"
4015           "vaddps %%ymm15, %%ymm13, %%ymm5\n"
4016           "vsubps %%ymm15, %%ymm13, %%ymm7\n"
4017           "vaddps %%ymm4, %%ymm0, %%ymm8\n"
4018           "vsubps %%ymm4, %%ymm0, %%ymm12\n"
4019           "vaddps %%ymm5, %%ymm1, %%ymm9\n"
4020           "vsubps %%ymm5, %%ymm1, %%ymm13\n"
4021           "vaddps %%ymm6, %%ymm2, %%ymm10\n"
4022           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
4023           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
4024           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
4025           "vmovups %%ymm8, (%0)\n"
4026           "vmovups %%ymm9, (%1)\n"
4027           "vmovups %%ymm10, (%2)\n"
4028           "vmovups %%ymm11, (%3)\n"
4029           "vmovups %%ymm12, (%4)\n"
4030           "vmovups %%ymm13, (%5)\n"
4031           "vmovups %%ymm14, (%6)\n"
4032           "vmovups %%ymm15, (%7)\n"
4033           :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
4034         );
4035       }
4036     }
4037     for (int j = 0; j < 8192; j += 4096) {
4038       for (int k = 0; k < 512; k += 8) {
4039         __asm__ volatile (
4040           "vmovups (%0), %%ymm0\n"
4041           "vmovups (%1), %%ymm1\n"
4042           "vmovups (%2), %%ymm2\n"
4043           "vmovups (%3), %%ymm3\n"
4044           "vmovups (%4), %%ymm4\n"
4045           "vmovups (%5), %%ymm5\n"
4046           "vmovups (%6), %%ymm6\n"
4047           "vmovups (%7), %%ymm7\n"
4048           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
4049           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
4050           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
4051           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
4052           "vaddps %%ymm5, %%ymm4, %%ymm12\n"
4053           "vsubps %%ymm5, %%ymm4, %%ymm13\n"
4054           "vaddps %%ymm7, %%ymm6, %%ymm14\n"
4055           "vsubps %%ymm7, %%ymm6, %%ymm15\n"
4056           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
4057           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
4058           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
4059           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
4060           "vaddps %%ymm14, %%ymm12, %%ymm4\n"
4061           "vsubps %%ymm14, %%ymm12, %%ymm6\n"
4062           "vaddps %%ymm15, %%ymm13, %%ymm5\n"
4063           "vsubps %%ymm15, %%ymm13, %%ymm7\n"
4064           "vaddps %%ymm4, %%ymm0, %%ymm8\n"
4065           "vsubps %%ymm4, %%ymm0, %%ymm12\n"
4066           "vaddps %%ymm5, %%ymm1, %%ymm9\n"
4067           "vsubps %%ymm5, %%ymm1, %%ymm13\n"
4068           "vaddps %%ymm6, %%ymm2, %%ymm10\n"
4069           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
4070           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
4071           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
4072           "vmovups %%ymm8, (%0)\n"
4073           "vmovups %%ymm9, (%1)\n"
4074           "vmovups %%ymm10, (%2)\n"
4075           "vmovups %%ymm11, (%3)\n"
4076           "vmovups %%ymm12, (%4)\n"
4077           "vmovups %%ymm13, (%5)\n"
4078           "vmovups %%ymm14, (%6)\n"
4079           "vmovups %%ymm15, (%7)\n"
4080           :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
4081         );
4082       }
4083     }
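    /* last level of the 2^13 leaf: radix-2 butterflies at stride 4096 */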
4084     for (int j = 0; j < 8192; j += 8192) {
4085       for (int k = 0; k < 4096; k += 8) {
4086         __asm__ volatile (
4087           "vmovups (%0), %%ymm0\n"
4088           "vmovups (%1), %%ymm1\n"
4089           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
4090           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
4091           "vmovups %%ymm8, (%0)\n"
4092           "vmovups %%ymm9, (%1)\n"
4093           :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
4094         );
4095       }
4096     }
4097     return;
4098   }
4099   if (depth == 16) {
4100     helper_float_19_recursive(buf + 0, 13);
4101     helper_float_19_recursive(buf + 8192, 13);
4102     helper_float_19_recursive(buf + 16384, 13);
4103     helper_float_19_recursive(buf + 24576, 13);
4104     helper_float_19_recursive(buf + 32768, 13);
4105     helper_float_19_recursive(buf + 40960, 13);
4106     helper_float_19_recursive(buf + 49152, 13);
4107     helper_float_19_recursive(buf + 57344, 13);
4108     for (int j = 0; j < 65536; j += 65536) {
4109       for (int k = 0; k < 8192; k += 8) {
4110         __asm__ volatile (
4111           "vmovups (%0), %%ymm0\n"
4112           "vmovups (%1), %%ymm1\n"
4113           "vmovups (%2), %%ymm2\n"
4114           "vmovups (%3), %%ymm3\n"
4115           "vmovups (%4), %%ymm4\n"
4116           "vmovups (%5), %%ymm5\n"
4117           "vmovups (%6), %%ymm6\n"
4118           "vmovups (%7), %%ymm7\n"
4119           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
4120           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
4121           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
4122           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
4123           "vaddps %%ymm5, %%ymm4, %%ymm12\n"
4124           "vsubps %%ymm5, %%ymm4, %%ymm13\n"
4125           "vaddps %%ymm7, %%ymm6, %%ymm14\n"
4126           "vsubps %%ymm7, %%ymm6, %%ymm15\n"
4127           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
4128           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
4129           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
4130           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
4131           "vaddps %%ymm14, %%ymm12, %%ymm4\n"
4132           "vsubps %%ymm14, %%ymm12, %%ymm6\n"
4133           "vaddps %%ymm15, %%ymm13, %%ymm5\n"
4134           "vsubps %%ymm15, %%ymm13, %%ymm7\n"
4135           "vaddps %%ymm4, %%ymm0, %%ymm8\n"
4136           "vsubps %%ymm4, %%ymm0, %%ymm12\n"
4137           "vaddps %%ymm5, %%ymm1, %%ymm9\n"
4138           "vsubps %%ymm5, %%ymm1, %%ymm13\n"
4139           "vaddps %%ymm6, %%ymm2, %%ymm10\n"
4140           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
4141           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
4142           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
4143           "vmovups %%ymm8, (%0)\n"
4144           "vmovups %%ymm9, (%1)\n"
4145           "vmovups %%ymm10, (%2)\n"
4146           "vmovups %%ymm11, (%3)\n"
4147           "vmovups %%ymm12, (%4)\n"
4148           "vmovups %%ymm13, (%5)\n"
4149           "vmovups %%ymm14, (%6)\n"
4150           "vmovups %%ymm15, (%7)\n"
4151           :: "r"(buf + j + k + 0), "r"(buf + j + k + 8192), "r"(buf + j + k + 16384), "r"(buf + j + k + 24576), "r"(buf + j + k + 32768), "r"(buf + j + k + 40960), "r"(buf + j + k + 49152), "r"(buf + j + k + 57344) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
4152         );
4153       }
4154     }
4155     return;
4156   }
4157   if (depth == 19) {
4158     helper_float_19_recursive(buf + 0, 16);
4159     helper_float_19_recursive(buf + 65536, 16);
4160     helper_float_19_recursive(buf + 131072, 16);
4161     helper_float_19_recursive(buf + 196608, 16);
4162     helper_float_19_recursive(buf + 262144, 16);
4163     helper_float_19_recursive(buf + 327680, 16);
4164     helper_float_19_recursive(buf + 393216, 16);
4165     helper_float_19_recursive(buf + 458752, 16);
4166     for (int j = 0; j < 524288; j += 524288) {
4167       for (int k = 0; k < 65536; k += 8) {
4168         __asm__ volatile (
4169           "vmovups (%0), %%ymm0\n"
4170           "vmovups (%1), %%ymm1\n"
4171           "vmovups (%2), %%ymm2\n"
4172           "vmovups (%3), %%ymm3\n"
4173           "vmovups (%4), %%ymm4\n"
4174           "vmovups (%5), %%ymm5\n"
4175           "vmovups (%6), %%ymm6\n"
4176           "vmovups (%7), %%ymm7\n"
4177           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
4178           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
4179           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
4180           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
4181           "vaddps %%ymm5, %%ymm4, %%ymm12\n"
4182           "vsubps %%ymm5, %%ymm4, %%ymm13\n"
4183           "vaddps %%ymm7, %%ymm6, %%ymm14\n"
4184           "vsubps %%ymm7, %%ymm6, %%ymm15\n"
4185           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
4186           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
4187           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
4188           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
4189           "vaddps %%ymm14, %%ymm12, %%ymm4\n"
4190           "vsubps %%ymm14, %%ymm12, %%ymm6\n"
4191           "vaddps %%ymm15, %%ymm13, %%ymm5\n"
4192           "vsubps %%ymm15, %%ymm13, %%ymm7\n"
4193           "vaddps %%ymm4, %%ymm0, %%ymm8\n"
4194           "vsubps %%ymm4, %%ymm0, %%ymm12\n"
4195           "vaddps %%ymm5, %%ymm1, %%ymm9\n"
4196           "vsubps %%ymm5, %%ymm1, %%ymm13\n"
4197           "vaddps %%ymm6, %%ymm2, %%ymm10\n"
4198           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
4199           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
4200           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
4201           "vmovups %%ymm8, (%0)\n"
4202           "vmovups %%ymm9, (%1)\n"
4203           "vmovups %%ymm10, (%2)\n"
4204           "vmovups %%ymm11, (%3)\n"
4205           "vmovups %%ymm12, (%4)\n"
4206           "vmovups %%ymm13, (%5)\n"
4207           "vmovups %%ymm14, (%6)\n"
4208           "vmovups %%ymm15, (%7)\n"
4209           :: "r"(buf + j + k + 0), "r"(buf + j + k + 65536), "r"(buf + j + k + 131072), "r"(buf + j + k + 196608), "r"(buf + j + k + 262144), "r"(buf + j + k + 327680), "r"(buf + j + k + 393216), "r"(buf + j + k + 458752) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
4210         );
4211       }
4212     }
4213     return;
4214   }
4215 }
4216 void helper_float_19(float *buf);
4217 void helper_float_19(float *buf) {
4218   helper_float_19_recursive(buf, 19);
4219 }
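/* Illustrative only: the generated helpers above all follow one
   recursive scheme -- split a 2^depth block into sub-blocks small
   enough for the unrolled leaf, transform each, then add the missing
   cross-block butterfly levels.  A generic, hypothetical scalar driver
   for the same idea (the generated code prefers radix-8 splits but
   also uses radix 4 or 2 at some levels): */
static void fht_recursive_sketch(float *buf, int depth, int leaf_depth) {
  long n = 1L << depth;
  if (depth <= leaf_depth) {          /* role of the depth==12/13 leaves */
    for (long len = 1; len < n; len <<= 1)
      for (long j = 0; j < n; j += 2 * len)
        for (long k = 0; k < len; ++k) {
          float u = buf[j + k], v = buf[j + k + len];
          buf[j + k] = u + v;
          buf[j + k + len] = u - v;
        }
    return;
  }
  int split = depth - leaf_depth >= 3 ? 3 : depth - leaf_depth;
  long sub = n >> split;
  for (int i = 0; i < (1 << split); ++i)   /* independent sub-transforms */
    fht_recursive_sketch(buf + i * sub, depth - split, leaf_depth);
  for (long k = 0; k < sub; ++k)           /* cross-block combine levels */
    for (int st = 1; st < (1 << split); st *= 2)
      for (int i = 0; i < (1 << split); i += 2 * st)
        for (int m = 0; m < st; ++m) {
          float u = buf[(i + m) * sub + k];
          float v = buf[(i + m + st) * sub + k];
          buf[(i + m) * sub + k] = u + v;
          buf[(i + m + st) * sub + k] = u - v;
        }
}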
4220 void helper_float_20_recursive(float *buf, int depth);
4221 void helper_float_20_recursive(float *buf, int depth) {
4222   if (depth == 12) {
4223     for (int j = 0; j < 4096; j += 64) {
4224       for (int k = 0; k < 8; k += 8) {
4225         __asm__ volatile (
4226           "vmovups (%0), %%ymm0\n"
4227           "vmovups (%1), %%ymm1\n"
4228           "vmovups (%2), %%ymm2\n"
4229           "vmovups (%3), %%ymm3\n"
4230           "vmovups (%4), %%ymm4\n"
4231           "vmovups (%5), %%ymm5\n"
4232           "vmovups (%6), %%ymm6\n"
4233           "vmovups (%7), %%ymm7\n"
4234           "vpermilps $160, %%ymm0, %%ymm8\n"
4235           "vpermilps $245, %%ymm0, %%ymm9\n"
4236           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
4237           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
4238           "vaddsubps %%ymm11, %%ymm8, %%ymm0\n"
4239           "vpermilps $160, %%ymm1, %%ymm8\n"
4240           "vpermilps $245, %%ymm1, %%ymm9\n"
4241           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
4242           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
4243           "vaddsubps %%ymm11, %%ymm8, %%ymm1\n"
4244           "vpermilps $160, %%ymm2, %%ymm8\n"
4245           "vpermilps $245, %%ymm2, %%ymm9\n"
4246           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
4247           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
4248           "vaddsubps %%ymm11, %%ymm8, %%ymm2\n"
4249           "vpermilps $160, %%ymm3, %%ymm8\n"
4250           "vpermilps $245, %%ymm3, %%ymm9\n"
4251           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
4252           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
4253           "vaddsubps %%ymm11, %%ymm8, %%ymm3\n"
4254           "vpermilps $160, %%ymm4, %%ymm8\n"
4255           "vpermilps $245, %%ymm4, %%ymm9\n"
4256           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
4257           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
4258           "vaddsubps %%ymm11, %%ymm8, %%ymm4\n"
4259           "vpermilps $160, %%ymm5, %%ymm8\n"
4260           "vpermilps $245, %%ymm5, %%ymm9\n"
4261           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
4262           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
4263           "vaddsubps %%ymm11, %%ymm8, %%ymm5\n"
4264           "vpermilps $160, %%ymm6, %%ymm8\n"
4265           "vpermilps $245, %%ymm6, %%ymm9\n"
4266           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
4267           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
4268           "vaddsubps %%ymm11, %%ymm8, %%ymm6\n"
4269           "vpermilps $160, %%ymm7, %%ymm8\n"
4270           "vpermilps $245, %%ymm7, %%ymm9\n"
4271           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
4272           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
4273           "vaddsubps %%ymm11, %%ymm8, %%ymm7\n"
4274           "vpermilps $68, %%ymm0, %%ymm8\n"
4275           "vpermilps $238, %%ymm0, %%ymm9\n"
4276           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
4277           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
4278           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
4279           "vaddps %%ymm8, %%ymm12, %%ymm0\n"
4280           "vpermilps $68, %%ymm1, %%ymm8\n"
4281           "vpermilps $238, %%ymm1, %%ymm9\n"
4282           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
4283           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
4284           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
4285           "vaddps %%ymm8, %%ymm12, %%ymm1\n"
4286           "vpermilps $68, %%ymm2, %%ymm8\n"
4287           "vpermilps $238, %%ymm2, %%ymm9\n"
4288           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
4289           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
4290           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
4291           "vaddps %%ymm8, %%ymm12, %%ymm2\n"
4292           "vpermilps $68, %%ymm3, %%ymm8\n"
4293           "vpermilps $238, %%ymm3, %%ymm9\n"
4294           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
4295           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
4296           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
4297           "vaddps %%ymm8, %%ymm12, %%ymm3\n"
4298           "vpermilps $68, %%ymm4, %%ymm8\n"
4299           "vpermilps $238, %%ymm4, %%ymm9\n"
4300           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
4301           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
4302           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
4303           "vaddps %%ymm8, %%ymm12, %%ymm4\n"
4304           "vpermilps $68, %%ymm5, %%ymm8\n"
4305           "vpermilps $238, %%ymm5, %%ymm9\n"
4306           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
4307           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
4308           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
4309           "vaddps %%ymm8, %%ymm12, %%ymm5\n"
4310           "vpermilps $68, %%ymm6, %%ymm8\n"
4311           "vpermilps $238, %%ymm6, %%ymm9\n"
4312           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
4313           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
4314           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
4315           "vaddps %%ymm8, %%ymm12, %%ymm6\n"
4316           "vpermilps $68, %%ymm7, %%ymm8\n"
4317           "vpermilps $238, %%ymm7, %%ymm9\n"
4318           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
4319           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
4320           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
4321           "vaddps %%ymm8, %%ymm12, %%ymm7\n"
4322           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
4323           "vsubps %%ymm0, %%ymm8, %%ymm9\n"
4324           "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n"
4325           "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n"
4326           "vaddps %%ymm10, %%ymm11, %%ymm0\n"
4327           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
4328           "vsubps %%ymm1, %%ymm8, %%ymm9\n"
4329           "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n"
4330           "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n"
4331           "vaddps %%ymm10, %%ymm11, %%ymm1\n"
4332           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
4333           "vsubps %%ymm2, %%ymm8, %%ymm9\n"
4334           "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n"
4335           "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n"
4336           "vaddps %%ymm10, %%ymm11, %%ymm2\n"
4337           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
4338           "vsubps %%ymm3, %%ymm8, %%ymm9\n"
4339           "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n"
4340           "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n"
4341           "vaddps %%ymm10, %%ymm11, %%ymm3\n"
4342           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
4343           "vsubps %%ymm4, %%ymm8, %%ymm9\n"
4344           "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n"
4345           "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n"
4346           "vaddps %%ymm10, %%ymm11, %%ymm4\n"
4347           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
4348           "vsubps %%ymm5, %%ymm8, %%ymm9\n"
4349           "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n"
4350           "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n"
4351           "vaddps %%ymm10, %%ymm11, %%ymm5\n"
4352           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
4353           "vsubps %%ymm6, %%ymm8, %%ymm9\n"
4354           "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n"
4355           "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n"
4356           "vaddps %%ymm10, %%ymm11, %%ymm6\n"
4357           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
4358           "vsubps %%ymm7, %%ymm8, %%ymm9\n"
4359           "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n"
4360           "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n"
4361           "vaddps %%ymm10, %%ymm11, %%ymm7\n"
4362           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
4363           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
4364           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
4365           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
4366           "vaddps %%ymm5, %%ymm4, %%ymm12\n"
4367           "vsubps %%ymm5, %%ymm4, %%ymm13\n"
4368           "vaddps %%ymm7, %%ymm6, %%ymm14\n"
4369           "vsubps %%ymm7, %%ymm6, %%ymm15\n"
4370           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
4371           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
4372           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
4373           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
4374           "vaddps %%ymm14, %%ymm12, %%ymm4\n"
4375           "vsubps %%ymm14, %%ymm12, %%ymm6\n"
4376           "vaddps %%ymm15, %%ymm13, %%ymm5\n"
4377           "vsubps %%ymm15, %%ymm13, %%ymm7\n"
4378           "vaddps %%ymm4, %%ymm0, %%ymm8\n"
4379           "vsubps %%ymm4, %%ymm0, %%ymm12\n"
4380           "vaddps %%ymm5, %%ymm1, %%ymm9\n"
4381           "vsubps %%ymm5, %%ymm1, %%ymm13\n"
4382           "vaddps %%ymm6, %%ymm2, %%ymm10\n"
4383           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
4384           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
4385           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
4386           "vmovups %%ymm8, (%0)\n"
4387           "vmovups %%ymm9, (%1)\n"
4388           "vmovups %%ymm10, (%2)\n"
4389           "vmovups %%ymm11, (%3)\n"
4390           "vmovups %%ymm12, (%4)\n"
4391           "vmovups %%ymm13, (%5)\n"
4392           "vmovups %%ymm14, (%6)\n"
4393           "vmovups %%ymm15, (%7)\n"
4394           :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
4395         );
4396       }
4397     }
4398     for (int j = 0; j < 4096; j += 512) {
4399       for (int k = 0; k < 64; k += 8) {
4400         __asm__ volatile (
4401           "vmovups (%0), %%ymm0\n"
4402           "vmovups (%1), %%ymm1\n"
4403           "vmovups (%2), %%ymm2\n"
4404           "vmovups (%3), %%ymm3\n"
4405           "vmovups (%4), %%ymm4\n"
4406           "vmovups (%5), %%ymm5\n"
4407           "vmovups (%6), %%ymm6\n"
4408           "vmovups (%7), %%ymm7\n"
4409           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
4410           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
4411           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
4412           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
4413           "vaddps %%ymm5, %%ymm4, %%ymm12\n"
4414           "vsubps %%ymm5, %%ymm4, %%ymm13\n"
4415           "vaddps %%ymm7, %%ymm6, %%ymm14\n"
4416           "vsubps %%ymm7, %%ymm6, %%ymm15\n"
4417           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
4418           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
4419           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
4420           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
4421           "vaddps %%ymm14, %%ymm12, %%ymm4\n"
4422           "vsubps %%ymm14, %%ymm12, %%ymm6\n"
4423           "vaddps %%ymm15, %%ymm13, %%ymm5\n"
4424           "vsubps %%ymm15, %%ymm13, %%ymm7\n"
4425           "vaddps %%ymm4, %%ymm0, %%ymm8\n"
4426           "vsubps %%ymm4, %%ymm0, %%ymm12\n"
4427           "vaddps %%ymm5, %%ymm1, %%ymm9\n"
4428           "vsubps %%ymm5, %%ymm1, %%ymm13\n"
4429           "vaddps %%ymm6, %%ymm2, %%ymm10\n"
4430           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
4431           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
4432           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
4433           "vmovups %%ymm8, (%0)\n"
4434           "vmovups %%ymm9, (%1)\n"
4435           "vmovups %%ymm10, (%2)\n"
4436           "vmovups %%ymm11, (%3)\n"
4437           "vmovups %%ymm12, (%4)\n"
4438           "vmovups %%ymm13, (%5)\n"
4439           "vmovups %%ymm14, (%6)\n"
4440           "vmovups %%ymm15, (%7)\n"
4441           :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
4442         );
4443       }
4444     }
4445     for (int j = 0; j < 4096; j += 4096) {
4446       for (int k = 0; k < 512; k += 8) {
4447         __asm__ volatile (
4448           "vmovups (%0), %%ymm0\n"
4449           "vmovups (%1), %%ymm1\n"
4450           "vmovups (%2), %%ymm2\n"
4451           "vmovups (%3), %%ymm3\n"
4452           "vmovups (%4), %%ymm4\n"
4453           "vmovups (%5), %%ymm5\n"
4454           "vmovups (%6), %%ymm6\n"
4455           "vmovups (%7), %%ymm7\n"
4456           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
4457           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
4458           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
4459           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
4460           "vaddps %%ymm5, %%ymm4, %%ymm12\n"
4461           "vsubps %%ymm5, %%ymm4, %%ymm13\n"
4462           "vaddps %%ymm7, %%ymm6, %%ymm14\n"
4463           "vsubps %%ymm7, %%ymm6, %%ymm15\n"
4464           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
4465           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
4466           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
4467           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
4468           "vaddps %%ymm14, %%ymm12, %%ymm4\n"
4469           "vsubps %%ymm14, %%ymm12, %%ymm6\n"
4470           "vaddps %%ymm15, %%ymm13, %%ymm5\n"
4471           "vsubps %%ymm15, %%ymm13, %%ymm7\n"
4472           "vaddps %%ymm4, %%ymm0, %%ymm8\n"
4473           "vsubps %%ymm4, %%ymm0, %%ymm12\n"
4474           "vaddps %%ymm5, %%ymm1, %%ymm9\n"
4475           "vsubps %%ymm5, %%ymm1, %%ymm13\n"
4476           "vaddps %%ymm6, %%ymm2, %%ymm10\n"
4477           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
4478           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
4479           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
4480           "vmovups %%ymm8, (%0)\n"
4481           "vmovups %%ymm9, (%1)\n"
4482           "vmovups %%ymm10, (%2)\n"
4483           "vmovups %%ymm11, (%3)\n"
4484           "vmovups %%ymm12, (%4)\n"
4485           "vmovups %%ymm13, (%5)\n"
4486           "vmovups %%ymm14, (%6)\n"
4487           "vmovups %%ymm15, (%7)\n"
4488           :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
4489         );
4490       }
4491     }
4492     return;
4493   }
4494   if (depth == 15) {
4495     helper_float_20_recursive(buf + 0, 12);
4496     helper_float_20_recursive(buf + 4096, 12);
4497     helper_float_20_recursive(buf + 8192, 12);
4498     helper_float_20_recursive(buf + 12288, 12);
4499     helper_float_20_recursive(buf + 16384, 12);
4500     helper_float_20_recursive(buf + 20480, 12);
4501     helper_float_20_recursive(buf + 24576, 12);
4502     helper_float_20_recursive(buf + 28672, 12);
4503     for (int j = 0; j < 32768; j += 32768) {
4504       for (int k = 0; k < 4096; k += 8) {
4505         __asm__ volatile (
4506           "vmovups (%0), %%ymm0\n"
4507           "vmovups (%1), %%ymm1\n"
4508           "vmovups (%2), %%ymm2\n"
4509           "vmovups (%3), %%ymm3\n"
4510           "vmovups (%4), %%ymm4\n"
4511           "vmovups (%5), %%ymm5\n"
4512           "vmovups (%6), %%ymm6\n"
4513           "vmovups (%7), %%ymm7\n"
4514           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
4515           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
4516           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
4517           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
4518           "vaddps %%ymm5, %%ymm4, %%ymm12\n"
4519           "vsubps %%ymm5, %%ymm4, %%ymm13\n"
4520           "vaddps %%ymm7, %%ymm6, %%ymm14\n"
4521           "vsubps %%ymm7, %%ymm6, %%ymm15\n"
4522           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
4523           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
4524           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
4525           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
4526           "vaddps %%ymm14, %%ymm12, %%ymm4\n"
4527           "vsubps %%ymm14, %%ymm12, %%ymm6\n"
4528           "vaddps %%ymm15, %%ymm13, %%ymm5\n"
4529           "vsubps %%ymm15, %%ymm13, %%ymm7\n"
4530           "vaddps %%ymm4, %%ymm0, %%ymm8\n"
4531           "vsubps %%ymm4, %%ymm0, %%ymm12\n"
4532           "vaddps %%ymm5, %%ymm1, %%ymm9\n"
4533           "vsubps %%ymm5, %%ymm1, %%ymm13\n"
4534           "vaddps %%ymm6, %%ymm2, %%ymm10\n"
4535           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
4536           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
4537           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
4538           "vmovups %%ymm8, (%0)\n"
4539           "vmovups %%ymm9, (%1)\n"
4540           "vmovups %%ymm10, (%2)\n"
4541           "vmovups %%ymm11, (%3)\n"
4542           "vmovups %%ymm12, (%4)\n"
4543           "vmovups %%ymm13, (%5)\n"
4544           "vmovups %%ymm14, (%6)\n"
4545           "vmovups %%ymm15, (%7)\n"
4546           :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
4547         );
4548       }
4549     }
4550     return;
4551   }
4552   if (depth == 18) {
4553     helper_float_20_recursive(buf + 0, 15);
4554     helper_float_20_recursive(buf + 32768, 15);
4555     helper_float_20_recursive(buf + 65536, 15);
4556     helper_float_20_recursive(buf + 98304, 15);
4557     helper_float_20_recursive(buf + 131072, 15);
4558     helper_float_20_recursive(buf + 163840, 15);
4559     helper_float_20_recursive(buf + 196608, 15);
4560     helper_float_20_recursive(buf + 229376, 15);
4561     for (int j = 0; j < 262144; j += 262144) {
4562       for (int k = 0; k < 32768; k += 8) {
4563         __asm__ volatile (
4564           "vmovups (%0), %%ymm0\n"
4565           "vmovups (%1), %%ymm1\n"
4566           "vmovups (%2), %%ymm2\n"
4567           "vmovups (%3), %%ymm3\n"
4568           "vmovups (%4), %%ymm4\n"
4569           "vmovups (%5), %%ymm5\n"
4570           "vmovups (%6), %%ymm6\n"
4571           "vmovups (%7), %%ymm7\n"
4572           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
4573           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
4574           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
4575           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
4576           "vaddps %%ymm5, %%ymm4, %%ymm12\n"
4577           "vsubps %%ymm5, %%ymm4, %%ymm13\n"
4578           "vaddps %%ymm7, %%ymm6, %%ymm14\n"
4579           "vsubps %%ymm7, %%ymm6, %%ymm15\n"
4580           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
4581           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
4582           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
4583           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
4584           "vaddps %%ymm14, %%ymm12, %%ymm4\n"
4585           "vsubps %%ymm14, %%ymm12, %%ymm6\n"
4586           "vaddps %%ymm15, %%ymm13, %%ymm5\n"
4587           "vsubps %%ymm15, %%ymm13, %%ymm7\n"
4588           "vaddps %%ymm4, %%ymm0, %%ymm8\n"
4589           "vsubps %%ymm4, %%ymm0, %%ymm12\n"
4590           "vaddps %%ymm5, %%ymm1, %%ymm9\n"
4591           "vsubps %%ymm5, %%ymm1, %%ymm13\n"
4592           "vaddps %%ymm6, %%ymm2, %%ymm10\n"
4593           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
4594           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
4595           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
4596           "vmovups %%ymm8, (%0)\n"
4597           "vmovups %%ymm9, (%1)\n"
4598           "vmovups %%ymm10, (%2)\n"
4599           "vmovups %%ymm11, (%3)\n"
4600           "vmovups %%ymm12, (%4)\n"
4601           "vmovups %%ymm13, (%5)\n"
4602           "vmovups %%ymm14, (%6)\n"
4603           "vmovups %%ymm15, (%7)\n"
4604           :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
4605         );
4606       }
4607     }
4608     return;
4609   }
4610   if (depth == 20) {
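    /* depth == 20: 20 = 18 + 2, so only four depth-18 sub-blocks remain
       and the final combine is radix-4 (strides 262144 and 524288). */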
4611     helper_float_20_recursive(buf + 0, 18);
4612     helper_float_20_recursive(buf + 262144, 18);
4613     helper_float_20_recursive(buf + 524288, 18);
4614     helper_float_20_recursive(buf + 786432, 18);
4615     for (int j = 0; j < 1048576; j += 1048576) {
4616       for (int k = 0; k < 262144; k += 8) {
4617         __asm__ volatile (
4618           "vmovups (%0), %%ymm0\n"
4619           "vmovups (%1), %%ymm1\n"
4620           "vmovups (%2), %%ymm2\n"
4621           "vmovups (%3), %%ymm3\n"
4622           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
4623           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
4624           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
4625           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
4626           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
4627           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
4628           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
4629           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
4630           "vmovups %%ymm0, (%0)\n"
4631           "vmovups %%ymm1, (%1)\n"
4632           "vmovups %%ymm2, (%2)\n"
4633           "vmovups %%ymm3, (%3)\n"
4634           :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
4635         );
4636       }
4637     }
4638     return;
4639   }
4640 }
4641 void helper_float_20(float *buf);
4642 void helper_float_20(float *buf) {
4643   helper_float_20_recursive(buf, 20);
4644 }
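/* For reference: helper_float_20 applies an unnormalized in-place
   Walsh-Hadamard transform to 2^20 floats. The block below is a minimal
   scalar sketch of that computation, compiled out so it cannot affect the
   build; the name helper_float_20_scalar is illustrative only and not
   part of the generated kernel set. */
#if 0
static void helper_float_20_scalar(float *buf) {
  /* Classic iterative butterfly: the stride doubles at each level. */
  for (int step = 1; step < (1 << 20); step <<= 1) {
    for (int j = 0; j < (1 << 20); j += step << 1) {
      for (int k = j; k < j + step; ++k) {
        float u = buf[k];
        float v = buf[k + step];
        buf[k] = u + v;
        buf[k + step] = u - v;
      }
    }
  }
}
#endif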
4645 void helper_float_21_recursive(float *buf, int depth);
4646 void helper_float_21_recursive(float *buf, int depth) {
4647   if (depth == 9) {
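    /* Base case: each 64-float chunk is transformed entirely inside ymm
       registers (strides 1 through 32), after which a second loop applies
       a radix-8 combine at strides 64, 128 and 256 to complete the
       512-point block. */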
4648     for (int j = 0; j < 512; j += 64) {
4649       for (int k = 0; k < 8; k += 8) {
4650         __asm__ volatile (
4651           "vmovups (%0), %%ymm0\n"
4652           "vmovups (%1), %%ymm1\n"
4653           "vmovups (%2), %%ymm2\n"
4654           "vmovups (%3), %%ymm3\n"
4655           "vmovups (%4), %%ymm4\n"
4656           "vmovups (%5), %%ymm5\n"
4657           "vmovups (%6), %%ymm6\n"
4658           "vmovups (%7), %%ymm7\n"
4659           "vpermilps $160, %%ymm0, %%ymm8\n"
4660           "vpermilps $245, %%ymm0, %%ymm9\n"
4661           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
4662           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
4663           "vaddsubps %%ymm11, %%ymm8, %%ymm0\n"
4664           "vpermilps $160, %%ymm1, %%ymm8\n"
4665           "vpermilps $245, %%ymm1, %%ymm9\n"
4666           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
4667           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
4668           "vaddsubps %%ymm11, %%ymm8, %%ymm1\n"
4669           "vpermilps $160, %%ymm2, %%ymm8\n"
4670           "vpermilps $245, %%ymm2, %%ymm9\n"
4671           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
4672           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
4673           "vaddsubps %%ymm11, %%ymm8, %%ymm2\n"
4674           "vpermilps $160, %%ymm3, %%ymm8\n"
4675           "vpermilps $245, %%ymm3, %%ymm9\n"
4676           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
4677           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
4678           "vaddsubps %%ymm11, %%ymm8, %%ymm3\n"
4679           "vpermilps $160, %%ymm4, %%ymm8\n"
4680           "vpermilps $245, %%ymm4, %%ymm9\n"
4681           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
4682           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
4683           "vaddsubps %%ymm11, %%ymm8, %%ymm4\n"
4684           "vpermilps $160, %%ymm5, %%ymm8\n"
4685           "vpermilps $245, %%ymm5, %%ymm9\n"
4686           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
4687           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
4688           "vaddsubps %%ymm11, %%ymm8, %%ymm5\n"
4689           "vpermilps $160, %%ymm6, %%ymm8\n"
4690           "vpermilps $245, %%ymm6, %%ymm9\n"
4691           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
4692           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
4693           "vaddsubps %%ymm11, %%ymm8, %%ymm6\n"
4694           "vpermilps $160, %%ymm7, %%ymm8\n"
4695           "vpermilps $245, %%ymm7, %%ymm9\n"
4696           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
4697           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
4698           "vaddsubps %%ymm11, %%ymm8, %%ymm7\n"
4699           "vpermilps $68, %%ymm0, %%ymm8\n"
4700           "vpermilps $238, %%ymm0, %%ymm9\n"
4701           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
4702           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
4703           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
4704           "vaddps %%ymm8, %%ymm12, %%ymm0\n"
4705           "vpermilps $68, %%ymm1, %%ymm8\n"
4706           "vpermilps $238, %%ymm1, %%ymm9\n"
4707           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
4708           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
4709           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
4710           "vaddps %%ymm8, %%ymm12, %%ymm1\n"
4711           "vpermilps $68, %%ymm2, %%ymm8\n"
4712           "vpermilps $238, %%ymm2, %%ymm9\n"
4713           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
4714           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
4715           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
4716           "vaddps %%ymm8, %%ymm12, %%ymm2\n"
4717           "vpermilps $68, %%ymm3, %%ymm8\n"
4718           "vpermilps $238, %%ymm3, %%ymm9\n"
4719           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
4720           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
4721           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
4722           "vaddps %%ymm8, %%ymm12, %%ymm3\n"
4723           "vpermilps $68, %%ymm4, %%ymm8\n"
4724           "vpermilps $238, %%ymm4, %%ymm9\n"
4725           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
4726           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
4727           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
4728           "vaddps %%ymm8, %%ymm12, %%ymm4\n"
4729           "vpermilps $68, %%ymm5, %%ymm8\n"
4730           "vpermilps $238, %%ymm5, %%ymm9\n"
4731           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
4732           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
4733           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
4734           "vaddps %%ymm8, %%ymm12, %%ymm5\n"
4735           "vpermilps $68, %%ymm6, %%ymm8\n"
4736           "vpermilps $238, %%ymm6, %%ymm9\n"
4737           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
4738           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
4739           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
4740           "vaddps %%ymm8, %%ymm12, %%ymm6\n"
4741           "vpermilps $68, %%ymm7, %%ymm8\n"
4742           "vpermilps $238, %%ymm7, %%ymm9\n"
4743           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
4744           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
4745           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
4746           "vaddps %%ymm8, %%ymm12, %%ymm7\n"
4747           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
4748           "vsubps %%ymm0, %%ymm8, %%ymm9\n"
4749           "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n"
4750           "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n"
4751           "vaddps %%ymm10, %%ymm11, %%ymm0\n"
4752           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
4753           "vsubps %%ymm1, %%ymm8, %%ymm9\n"
4754           "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n"
4755           "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n"
4756           "vaddps %%ymm10, %%ymm11, %%ymm1\n"
4757           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
4758           "vsubps %%ymm2, %%ymm8, %%ymm9\n"
4759           "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n"
4760           "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n"
4761           "vaddps %%ymm10, %%ymm11, %%ymm2\n"
4762           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
4763           "vsubps %%ymm3, %%ymm8, %%ymm9\n"
4764           "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n"
4765           "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n"
4766           "vaddps %%ymm10, %%ymm11, %%ymm3\n"
4767           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
4768           "vsubps %%ymm4, %%ymm8, %%ymm9\n"
4769           "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n"
4770           "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n"
4771           "vaddps %%ymm10, %%ymm11, %%ymm4\n"
4772           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
4773           "vsubps %%ymm5, %%ymm8, %%ymm9\n"
4774           "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n"
4775           "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n"
4776           "vaddps %%ymm10, %%ymm11, %%ymm5\n"
4777           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
4778           "vsubps %%ymm6, %%ymm8, %%ymm9\n"
4779           "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n"
4780           "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n"
4781           "vaddps %%ymm10, %%ymm11, %%ymm6\n"
4782           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
4783           "vsubps %%ymm7, %%ymm8, %%ymm9\n"
4784           "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n"
4785           "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n"
4786           "vaddps %%ymm10, %%ymm11, %%ymm7\n"
4787           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
4788           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
4789           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
4790           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
4791           "vaddps %%ymm5, %%ymm4, %%ymm12\n"
4792           "vsubps %%ymm5, %%ymm4, %%ymm13\n"
4793           "vaddps %%ymm7, %%ymm6, %%ymm14\n"
4794           "vsubps %%ymm7, %%ymm6, %%ymm15\n"
4795           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
4796           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
4797           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
4798           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
4799           "vaddps %%ymm14, %%ymm12, %%ymm4\n"
4800           "vsubps %%ymm14, %%ymm12, %%ymm6\n"
4801           "vaddps %%ymm15, %%ymm13, %%ymm5\n"
4802           "vsubps %%ymm15, %%ymm13, %%ymm7\n"
4803           "vaddps %%ymm4, %%ymm0, %%ymm8\n"
4804           "vsubps %%ymm4, %%ymm0, %%ymm12\n"
4805           "vaddps %%ymm5, %%ymm1, %%ymm9\n"
4806           "vsubps %%ymm5, %%ymm1, %%ymm13\n"
4807           "vaddps %%ymm6, %%ymm2, %%ymm10\n"
4808           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
4809           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
4810           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
4811           "vmovups %%ymm8, (%0)\n"
4812           "vmovups %%ymm9, (%1)\n"
4813           "vmovups %%ymm10, (%2)\n"
4814           "vmovups %%ymm11, (%3)\n"
4815           "vmovups %%ymm12, (%4)\n"
4816           "vmovups %%ymm13, (%5)\n"
4817           "vmovups %%ymm14, (%6)\n"
4818           "vmovups %%ymm15, (%7)\n"
4819           :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
4820         );
4821       }
4822     }
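    /* Second pass: radix-8 combine at strides 64, 128 and 256 finishes
       the 512-point transform. */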
4823     for (int j = 0; j < 512; j += 512) {
4824       for (int k = 0; k < 64; k += 8) {
4825         __asm__ volatile (
4826           "vmovups (%0), %%ymm0\n"
4827           "vmovups (%1), %%ymm1\n"
4828           "vmovups (%2), %%ymm2\n"
4829           "vmovups (%3), %%ymm3\n"
4830           "vmovups (%4), %%ymm4\n"
4831           "vmovups (%5), %%ymm5\n"
4832           "vmovups (%6), %%ymm6\n"
4833           "vmovups (%7), %%ymm7\n"
4834           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
4835           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
4836           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
4837           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
4838           "vaddps %%ymm5, %%ymm4, %%ymm12\n"
4839           "vsubps %%ymm5, %%ymm4, %%ymm13\n"
4840           "vaddps %%ymm7, %%ymm6, %%ymm14\n"
4841           "vsubps %%ymm7, %%ymm6, %%ymm15\n"
4842           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
4843           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
4844           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
4845           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
4846           "vaddps %%ymm14, %%ymm12, %%ymm4\n"
4847           "vsubps %%ymm14, %%ymm12, %%ymm6\n"
4848           "vaddps %%ymm15, %%ymm13, %%ymm5\n"
4849           "vsubps %%ymm15, %%ymm13, %%ymm7\n"
4850           "vaddps %%ymm4, %%ymm0, %%ymm8\n"
4851           "vsubps %%ymm4, %%ymm0, %%ymm12\n"
4852           "vaddps %%ymm5, %%ymm1, %%ymm9\n"
4853           "vsubps %%ymm5, %%ymm1, %%ymm13\n"
4854           "vaddps %%ymm6, %%ymm2, %%ymm10\n"
4855           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
4856           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
4857           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
4858           "vmovups %%ymm8, (%0)\n"
4859           "vmovups %%ymm9, (%1)\n"
4860           "vmovups %%ymm10, (%2)\n"
4861           "vmovups %%ymm11, (%3)\n"
4862           "vmovups %%ymm12, (%4)\n"
4863           "vmovups %%ymm13, (%5)\n"
4864           "vmovups %%ymm14, (%6)\n"
4865           "vmovups %%ymm15, (%7)\n"
4866           :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
4867         );
4868       }
4869     }
4870     return;
4871   }
4872   if (depth == 12) {
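    /* depth == 12: transform eight 512-float sub-blocks at depth 9, then
       fuse strides 512, 1024 and 2048 into one radix-8 pass. */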
4873     helper_float_21_recursive(buf + 0, 9);
4874     helper_float_21_recursive(buf + 512, 9);
4875     helper_float_21_recursive(buf + 1024, 9);
4876     helper_float_21_recursive(buf + 1536, 9);
4877     helper_float_21_recursive(buf + 2048, 9);
4878     helper_float_21_recursive(buf + 2560, 9);
4879     helper_float_21_recursive(buf + 3072, 9);
4880     helper_float_21_recursive(buf + 3584, 9);
4881     for (int j = 0; j < 4096; j += 4096) {
4882       for (int k = 0; k < 512; k += 8) {
4883         __asm__ volatile (
4884           "vmovups (%0), %%ymm0\n"
4885           "vmovups (%1), %%ymm1\n"
4886           "vmovups (%2), %%ymm2\n"
4887           "vmovups (%3), %%ymm3\n"
4888           "vmovups (%4), %%ymm4\n"
4889           "vmovups (%5), %%ymm5\n"
4890           "vmovups (%6), %%ymm6\n"
4891           "vmovups (%7), %%ymm7\n"
4892           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
4893           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
4894           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
4895           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
4896           "vaddps %%ymm5, %%ymm4, %%ymm12\n"
4897           "vsubps %%ymm5, %%ymm4, %%ymm13\n"
4898           "vaddps %%ymm7, %%ymm6, %%ymm14\n"
4899           "vsubps %%ymm7, %%ymm6, %%ymm15\n"
4900           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
4901           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
4902           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
4903           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
4904           "vaddps %%ymm14, %%ymm12, %%ymm4\n"
4905           "vsubps %%ymm14, %%ymm12, %%ymm6\n"
4906           "vaddps %%ymm15, %%ymm13, %%ymm5\n"
4907           "vsubps %%ymm15, %%ymm13, %%ymm7\n"
4908           "vaddps %%ymm4, %%ymm0, %%ymm8\n"
4909           "vsubps %%ymm4, %%ymm0, %%ymm12\n"
4910           "vaddps %%ymm5, %%ymm1, %%ymm9\n"
4911           "vsubps %%ymm5, %%ymm1, %%ymm13\n"
4912           "vaddps %%ymm6, %%ymm2, %%ymm10\n"
4913           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
4914           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
4915           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
4916           "vmovups %%ymm8, (%0)\n"
4917           "vmovups %%ymm9, (%1)\n"
4918           "vmovups %%ymm10, (%2)\n"
4919           "vmovups %%ymm11, (%3)\n"
4920           "vmovups %%ymm12, (%4)\n"
4921           "vmovups %%ymm13, (%5)\n"
4922           "vmovups %%ymm14, (%6)\n"
4923           "vmovups %%ymm15, (%7)\n"
4924           :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
4925         );
4926       }
4927     }
4928     return;
4929   }
4930   if (depth == 15) {
4931     helper_float_21_recursive(buf + 0, 12);
4932     helper_float_21_recursive(buf + 4096, 12);
4933     helper_float_21_recursive(buf + 8192, 12);
4934     helper_float_21_recursive(buf + 12288, 12);
4935     helper_float_21_recursive(buf + 16384, 12);
4936     helper_float_21_recursive(buf + 20480, 12);
4937     helper_float_21_recursive(buf + 24576, 12);
4938     helper_float_21_recursive(buf + 28672, 12);
4939     for (int j = 0; j < 32768; j += 32768) {
4940       for (int k = 0; k < 4096; k += 8) {
4941         __asm__ volatile (
4942           "vmovups (%0), %%ymm0\n"
4943           "vmovups (%1), %%ymm1\n"
4944           "vmovups (%2), %%ymm2\n"
4945           "vmovups (%3), %%ymm3\n"
4946           "vmovups (%4), %%ymm4\n"
4947           "vmovups (%5), %%ymm5\n"
4948           "vmovups (%6), %%ymm6\n"
4949           "vmovups (%7), %%ymm7\n"
4950           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
4951           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
4952           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
4953           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
4954           "vaddps %%ymm5, %%ymm4, %%ymm12\n"
4955           "vsubps %%ymm5, %%ymm4, %%ymm13\n"
4956           "vaddps %%ymm7, %%ymm6, %%ymm14\n"
4957           "vsubps %%ymm7, %%ymm6, %%ymm15\n"
4958           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
4959           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
4960           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
4961           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
4962           "vaddps %%ymm14, %%ymm12, %%ymm4\n"
4963           "vsubps %%ymm14, %%ymm12, %%ymm6\n"
4964           "vaddps %%ymm15, %%ymm13, %%ymm5\n"
4965           "vsubps %%ymm15, %%ymm13, %%ymm7\n"
4966           "vaddps %%ymm4, %%ymm0, %%ymm8\n"
4967           "vsubps %%ymm4, %%ymm0, %%ymm12\n"
4968           "vaddps %%ymm5, %%ymm1, %%ymm9\n"
4969           "vsubps %%ymm5, %%ymm1, %%ymm13\n"
4970           "vaddps %%ymm6, %%ymm2, %%ymm10\n"
4971           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
4972           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
4973           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
4974           "vmovups %%ymm8, (%0)\n"
4975           "vmovups %%ymm9, (%1)\n"
4976           "vmovups %%ymm10, (%2)\n"
4977           "vmovups %%ymm11, (%3)\n"
4978           "vmovups %%ymm12, (%4)\n"
4979           "vmovups %%ymm13, (%5)\n"
4980           "vmovups %%ymm14, (%6)\n"
4981           "vmovups %%ymm15, (%7)\n"
4982           :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
4983         );
4984       }
4985     }
4986     return;
4987   }
4988   if (depth == 18) {
4989     helper_float_21_recursive(buf + 0, 15);
4990     helper_float_21_recursive(buf + 32768, 15);
4991     helper_float_21_recursive(buf + 65536, 15);
4992     helper_float_21_recursive(buf + 98304, 15);
4993     helper_float_21_recursive(buf + 131072, 15);
4994     helper_float_21_recursive(buf + 163840, 15);
4995     helper_float_21_recursive(buf + 196608, 15);
4996     helper_float_21_recursive(buf + 229376, 15);
4997     for (int j = 0; j < 262144; j += 262144) {
4998       for (int k = 0; k < 32768; k += 8) {
4999         __asm__ volatile (
5000           "vmovups (%0), %%ymm0\n"
5001           "vmovups (%1), %%ymm1\n"
5002           "vmovups (%2), %%ymm2\n"
5003           "vmovups (%3), %%ymm3\n"
5004           "vmovups (%4), %%ymm4\n"
5005           "vmovups (%5), %%ymm5\n"
5006           "vmovups (%6), %%ymm6\n"
5007           "vmovups (%7), %%ymm7\n"
5008           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
5009           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
5010           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
5011           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
5012           "vaddps %%ymm5, %%ymm4, %%ymm12\n"
5013           "vsubps %%ymm5, %%ymm4, %%ymm13\n"
5014           "vaddps %%ymm7, %%ymm6, %%ymm14\n"
5015           "vsubps %%ymm7, %%ymm6, %%ymm15\n"
5016           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
5017           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
5018           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
5019           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
5020           "vaddps %%ymm14, %%ymm12, %%ymm4\n"
5021           "vsubps %%ymm14, %%ymm12, %%ymm6\n"
5022           "vaddps %%ymm15, %%ymm13, %%ymm5\n"
5023           "vsubps %%ymm15, %%ymm13, %%ymm7\n"
5024           "vaddps %%ymm4, %%ymm0, %%ymm8\n"
5025           "vsubps %%ymm4, %%ymm0, %%ymm12\n"
5026           "vaddps %%ymm5, %%ymm1, %%ymm9\n"
5027           "vsubps %%ymm5, %%ymm1, %%ymm13\n"
5028           "vaddps %%ymm6, %%ymm2, %%ymm10\n"
5029           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
5030           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
5031           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
5032           "vmovups %%ymm8, (%0)\n"
5033           "vmovups %%ymm9, (%1)\n"
5034           "vmovups %%ymm10, (%2)\n"
5035           "vmovups %%ymm11, (%3)\n"
5036           "vmovups %%ymm12, (%4)\n"
5037           "vmovups %%ymm13, (%5)\n"
5038           "vmovups %%ymm14, (%6)\n"
5039           "vmovups %%ymm15, (%7)\n"
5040           :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
5041         );
5042       }
5043     }
5044     return;
5045   }
5046   if (depth == 21) {
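    /* depth == 21: top-level combine; a single radix-8 pass over the
       eight 262144-float sub-blocks covers the last three levels
       (strides 262144, 524288 and 1048576). */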
5047     helper_float_21_recursive(buf + 0, 18);
5048     helper_float_21_recursive(buf + 262144, 18);
5049     helper_float_21_recursive(buf + 524288, 18);
5050     helper_float_21_recursive(buf + 786432, 18);
5051     helper_float_21_recursive(buf + 1048576, 18);
5052     helper_float_21_recursive(buf + 1310720, 18);
5053     helper_float_21_recursive(buf + 1572864, 18);
5054     helper_float_21_recursive(buf + 1835008, 18);
5055     for (int j = 0; j < 2097152; j += 2097152) {
5056       for (int k = 0; k < 262144; k += 8) {
5057         __asm__ volatile (
5058           "vmovups (%0), %%ymm0\n"
5059           "vmovups (%1), %%ymm1\n"
5060           "vmovups (%2), %%ymm2\n"
5061           "vmovups (%3), %%ymm3\n"
5062           "vmovups (%4), %%ymm4\n"
5063           "vmovups (%5), %%ymm5\n"
5064           "vmovups (%6), %%ymm6\n"
5065           "vmovups (%7), %%ymm7\n"
5066           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
5067           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
5068           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
5069           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
5070           "vaddps %%ymm5, %%ymm4, %%ymm12\n"
5071           "vsubps %%ymm5, %%ymm4, %%ymm13\n"
5072           "vaddps %%ymm7, %%ymm6, %%ymm14\n"
5073           "vsubps %%ymm7, %%ymm6, %%ymm15\n"
5074           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
5075           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
5076           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
5077           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
5078           "vaddps %%ymm14, %%ymm12, %%ymm4\n"
5079           "vsubps %%ymm14, %%ymm12, %%ymm6\n"
5080           "vaddps %%ymm15, %%ymm13, %%ymm5\n"
5081           "vsubps %%ymm15, %%ymm13, %%ymm7\n"
5082           "vaddps %%ymm4, %%ymm0, %%ymm8\n"
5083           "vsubps %%ymm4, %%ymm0, %%ymm12\n"
5084           "vaddps %%ymm5, %%ymm1, %%ymm9\n"
5085           "vsubps %%ymm5, %%ymm1, %%ymm13\n"
5086           "vaddps %%ymm6, %%ymm2, %%ymm10\n"
5087           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
5088           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
5089           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
5090           "vmovups %%ymm8, (%0)\n"
5091           "vmovups %%ymm9, (%1)\n"
5092           "vmovups %%ymm10, (%2)\n"
5093           "vmovups %%ymm11, (%3)\n"
5094           "vmovups %%ymm12, (%4)\n"
5095           "vmovups %%ymm13, (%5)\n"
5096           "vmovups %%ymm14, (%6)\n"
5097           "vmovups %%ymm15, (%7)\n"
5098           :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1310720), "r"(buf + j + k + 1572864), "r"(buf + j + k + 1835008) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
5099         );
5100       }
5101     }
5102     return;
5103   }
5104 }
5105 void helper_float_21(float *buf);
5106 void helper_float_21(float *buf) {
5107   helper_float_21_recursive(buf, 21);
5108 }
5109 void helper_float_22_recursive(float *buf, int depth);
5110 void helper_float_22_recursive(float *buf, int depth) {
5111   if (depth == 11) {
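    /* Base case for 2048 floats: strides 1 through 32 in registers, a
       radix-8 pass at strides 64 through 256, then a radix-4 pass at
       strides 512 and 1024 (11 = 6 + 3 + 2 levels). */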
5112     for (int j = 0; j < 2048; j += 64) {
5113       for (int k = 0; k < 8; k += 8) {
5114         __asm__ volatile (
5115           "vmovups (%0), %%ymm0\n"
5116           "vmovups (%1), %%ymm1\n"
5117           "vmovups (%2), %%ymm2\n"
5118           "vmovups (%3), %%ymm3\n"
5119           "vmovups (%4), %%ymm4\n"
5120           "vmovups (%5), %%ymm5\n"
5121           "vmovups (%6), %%ymm6\n"
5122           "vmovups (%7), %%ymm7\n"
5123           "vpermilps $160, %%ymm0, %%ymm8\n"
5124           "vpermilps $245, %%ymm0, %%ymm9\n"
5125           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
5126           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
5127           "vaddsubps %%ymm11, %%ymm8, %%ymm0\n"
5128           "vpermilps $160, %%ymm1, %%ymm8\n"
5129           "vpermilps $245, %%ymm1, %%ymm9\n"
5130           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
5131           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
5132           "vaddsubps %%ymm11, %%ymm8, %%ymm1\n"
5133           "vpermilps $160, %%ymm2, %%ymm8\n"
5134           "vpermilps $245, %%ymm2, %%ymm9\n"
5135           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
5136           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
5137           "vaddsubps %%ymm11, %%ymm8, %%ymm2\n"
5138           "vpermilps $160, %%ymm3, %%ymm8\n"
5139           "vpermilps $245, %%ymm3, %%ymm9\n"
5140           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
5141           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
5142           "vaddsubps %%ymm11, %%ymm8, %%ymm3\n"
5143           "vpermilps $160, %%ymm4, %%ymm8\n"
5144           "vpermilps $245, %%ymm4, %%ymm9\n"
5145           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
5146           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
5147           "vaddsubps %%ymm11, %%ymm8, %%ymm4\n"
5148           "vpermilps $160, %%ymm5, %%ymm8\n"
5149           "vpermilps $245, %%ymm5, %%ymm9\n"
5150           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
5151           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
5152           "vaddsubps %%ymm11, %%ymm8, %%ymm5\n"
5153           "vpermilps $160, %%ymm6, %%ymm8\n"
5154           "vpermilps $245, %%ymm6, %%ymm9\n"
5155           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
5156           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
5157           "vaddsubps %%ymm11, %%ymm8, %%ymm6\n"
5158           "vpermilps $160, %%ymm7, %%ymm8\n"
5159           "vpermilps $245, %%ymm7, %%ymm9\n"
5160           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
5161           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
5162           "vaddsubps %%ymm11, %%ymm8, %%ymm7\n"
5163           "vpermilps $68, %%ymm0, %%ymm8\n"
5164           "vpermilps $238, %%ymm0, %%ymm9\n"
5165           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
5166           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
5167           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
5168           "vaddps %%ymm8, %%ymm12, %%ymm0\n"
5169           "vpermilps $68, %%ymm1, %%ymm8\n"
5170           "vpermilps $238, %%ymm1, %%ymm9\n"
5171           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
5172           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
5173           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
5174           "vaddps %%ymm8, %%ymm12, %%ymm1\n"
5175           "vpermilps $68, %%ymm2, %%ymm8\n"
5176           "vpermilps $238, %%ymm2, %%ymm9\n"
5177           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
5178           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
5179           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
5180           "vaddps %%ymm8, %%ymm12, %%ymm2\n"
5181           "vpermilps $68, %%ymm3, %%ymm8\n"
5182           "vpermilps $238, %%ymm3, %%ymm9\n"
5183           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
5184           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
5185           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
5186           "vaddps %%ymm8, %%ymm12, %%ymm3\n"
5187           "vpermilps $68, %%ymm4, %%ymm8\n"
5188           "vpermilps $238, %%ymm4, %%ymm9\n"
5189           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
5190           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
5191           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
5192           "vaddps %%ymm8, %%ymm12, %%ymm4\n"
5193           "vpermilps $68, %%ymm5, %%ymm8\n"
5194           "vpermilps $238, %%ymm5, %%ymm9\n"
5195           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
5196           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
5197           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
5198           "vaddps %%ymm8, %%ymm12, %%ymm5\n"
5199           "vpermilps $68, %%ymm6, %%ymm8\n"
5200           "vpermilps $238, %%ymm6, %%ymm9\n"
5201           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
5202           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
5203           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
5204           "vaddps %%ymm8, %%ymm12, %%ymm6\n"
5205           "vpermilps $68, %%ymm7, %%ymm8\n"
5206           "vpermilps $238, %%ymm7, %%ymm9\n"
5207           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
5208           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
5209           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
5210           "vaddps %%ymm8, %%ymm12, %%ymm7\n"
5211           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
5212           "vsubps %%ymm0, %%ymm8, %%ymm9\n"
5213           "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n"
5214           "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n"
5215           "vaddps %%ymm10, %%ymm11, %%ymm0\n"
5216           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
5217           "vsubps %%ymm1, %%ymm8, %%ymm9\n"
5218           "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n"
5219           "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n"
5220           "vaddps %%ymm10, %%ymm11, %%ymm1\n"
5221           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
5222           "vsubps %%ymm2, %%ymm8, %%ymm9\n"
5223           "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n"
5224           "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n"
5225           "vaddps %%ymm10, %%ymm11, %%ymm2\n"
5226           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
5227           "vsubps %%ymm3, %%ymm8, %%ymm9\n"
5228           "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n"
5229           "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n"
5230           "vaddps %%ymm10, %%ymm11, %%ymm3\n"
5231           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
5232           "vsubps %%ymm4, %%ymm8, %%ymm9\n"
5233           "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n"
5234           "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n"
5235           "vaddps %%ymm10, %%ymm11, %%ymm4\n"
5236           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
5237           "vsubps %%ymm5, %%ymm8, %%ymm9\n"
5238           "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n"
5239           "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n"
5240           "vaddps %%ymm10, %%ymm11, %%ymm5\n"
5241           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
5242           "vsubps %%ymm6, %%ymm8, %%ymm9\n"
5243           "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n"
5244           "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n"
5245           "vaddps %%ymm10, %%ymm11, %%ymm6\n"
5246           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
5247           "vsubps %%ymm7, %%ymm8, %%ymm9\n"
5248           "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n"
5249           "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n"
5250           "vaddps %%ymm10, %%ymm11, %%ymm7\n"
5251           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
5252           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
5253           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
5254           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
5255           "vaddps %%ymm5, %%ymm4, %%ymm12\n"
5256           "vsubps %%ymm5, %%ymm4, %%ymm13\n"
5257           "vaddps %%ymm7, %%ymm6, %%ymm14\n"
5258           "vsubps %%ymm7, %%ymm6, %%ymm15\n"
5259           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
5260           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
5261           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
5262           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
5263           "vaddps %%ymm14, %%ymm12, %%ymm4\n"
5264           "vsubps %%ymm14, %%ymm12, %%ymm6\n"
5265           "vaddps %%ymm15, %%ymm13, %%ymm5\n"
5266           "vsubps %%ymm15, %%ymm13, %%ymm7\n"
5267           "vaddps %%ymm4, %%ymm0, %%ymm8\n"
5268           "vsubps %%ymm4, %%ymm0, %%ymm12\n"
5269           "vaddps %%ymm5, %%ymm1, %%ymm9\n"
5270           "vsubps %%ymm5, %%ymm1, %%ymm13\n"
5271           "vaddps %%ymm6, %%ymm2, %%ymm10\n"
5272           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
5273           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
5274           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
5275           "vmovups %%ymm8, (%0)\n"
5276           "vmovups %%ymm9, (%1)\n"
5277           "vmovups %%ymm10, (%2)\n"
5278           "vmovups %%ymm11, (%3)\n"
5279           "vmovups %%ymm12, (%4)\n"
5280           "vmovups %%ymm13, (%5)\n"
5281           "vmovups %%ymm14, (%6)\n"
5282           "vmovups %%ymm15, (%7)\n"
5283           :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
5284         );
5285       }
5286     }
5287     for (int j = 0; j < 2048; j += 512) {
5288       for (int k = 0; k < 64; k += 8) {
5289         __asm__ volatile (
5290           "vmovups (%0), %%ymm0\n"
5291           "vmovups (%1), %%ymm1\n"
5292           "vmovups (%2), %%ymm2\n"
5293           "vmovups (%3), %%ymm3\n"
5294           "vmovups (%4), %%ymm4\n"
5295           "vmovups (%5), %%ymm5\n"
5296           "vmovups (%6), %%ymm6\n"
5297           "vmovups (%7), %%ymm7\n"
5298           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
5299           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
5300           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
5301           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
5302           "vaddps %%ymm5, %%ymm4, %%ymm12\n"
5303           "vsubps %%ymm5, %%ymm4, %%ymm13\n"
5304           "vaddps %%ymm7, %%ymm6, %%ymm14\n"
5305           "vsubps %%ymm7, %%ymm6, %%ymm15\n"
5306           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
5307           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
5308           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
5309           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
5310           "vaddps %%ymm14, %%ymm12, %%ymm4\n"
5311           "vsubps %%ymm14, %%ymm12, %%ymm6\n"
5312           "vaddps %%ymm15, %%ymm13, %%ymm5\n"
5313           "vsubps %%ymm15, %%ymm13, %%ymm7\n"
5314           "vaddps %%ymm4, %%ymm0, %%ymm8\n"
5315           "vsubps %%ymm4, %%ymm0, %%ymm12\n"
5316           "vaddps %%ymm5, %%ymm1, %%ymm9\n"
5317           "vsubps %%ymm5, %%ymm1, %%ymm13\n"
5318           "vaddps %%ymm6, %%ymm2, %%ymm10\n"
5319           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
5320           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
5321           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
5322           "vmovups %%ymm8, (%0)\n"
5323           "vmovups %%ymm9, (%1)\n"
5324           "vmovups %%ymm10, (%2)\n"
5325           "vmovups %%ymm11, (%3)\n"
5326           "vmovups %%ymm12, (%4)\n"
5327           "vmovups %%ymm13, (%5)\n"
5328           "vmovups %%ymm14, (%6)\n"
5329           "vmovups %%ymm15, (%7)\n"
5330           :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
5331         );
5332       }
5333     }
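    /* Final radix-4 pass of the depth-11 base case: strides 512 and
       1024. */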
5334     for (int j = 0; j < 2048; j += 2048) {
5335       for (int k = 0; k < 512; k += 8) {
5336         __asm__ volatile (
5337           "vmovups (%0), %%ymm0\n"
5338           "vmovups (%1), %%ymm1\n"
5339           "vmovups (%2), %%ymm2\n"
5340           "vmovups (%3), %%ymm3\n"
5341           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
5342           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
5343           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
5344           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
5345           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
5346           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
5347           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
5348           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
5349           "vmovups %%ymm0, (%0)\n"
5350           "vmovups %%ymm1, (%1)\n"
5351           "vmovups %%ymm2, (%2)\n"
5352           "vmovups %%ymm3, (%3)\n"
5353           :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
5354         );
5355       }
5356     }
5357     return;
5358   }
5359   if (depth == 14) {
5360     helper_float_22_recursive(buf + 0, 11);
5361     helper_float_22_recursive(buf + 2048, 11);
5362     helper_float_22_recursive(buf + 4096, 11);
5363     helper_float_22_recursive(buf + 6144, 11);
5364     helper_float_22_recursive(buf + 8192, 11);
5365     helper_float_22_recursive(buf + 10240, 11);
5366     helper_float_22_recursive(buf + 12288, 11);
5367     helper_float_22_recursive(buf + 14336, 11);
5368     for (int j = 0; j < 16384; j += 16384) {
5369       for (int k = 0; k < 2048; k += 8) {
5370         __asm__ volatile (
5371           "vmovups (%0), %%ymm0\n"
5372           "vmovups (%1), %%ymm1\n"
5373           "vmovups (%2), %%ymm2\n"
5374           "vmovups (%3), %%ymm3\n"
5375           "vmovups (%4), %%ymm4\n"
5376           "vmovups (%5), %%ymm5\n"
5377           "vmovups (%6), %%ymm6\n"
5378           "vmovups (%7), %%ymm7\n"
5379           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
5380           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
5381           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
5382           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
5383           "vaddps %%ymm5, %%ymm4, %%ymm12\n"
5384           "vsubps %%ymm5, %%ymm4, %%ymm13\n"
5385           "vaddps %%ymm7, %%ymm6, %%ymm14\n"
5386           "vsubps %%ymm7, %%ymm6, %%ymm15\n"
5387           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
5388           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
5389           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
5390           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
5391           "vaddps %%ymm14, %%ymm12, %%ymm4\n"
5392           "vsubps %%ymm14, %%ymm12, %%ymm6\n"
5393           "vaddps %%ymm15, %%ymm13, %%ymm5\n"
5394           "vsubps %%ymm15, %%ymm13, %%ymm7\n"
5395           "vaddps %%ymm4, %%ymm0, %%ymm8\n"
5396           "vsubps %%ymm4, %%ymm0, %%ymm12\n"
5397           "vaddps %%ymm5, %%ymm1, %%ymm9\n"
5398           "vsubps %%ymm5, %%ymm1, %%ymm13\n"
5399           "vaddps %%ymm6, %%ymm2, %%ymm10\n"
5400           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
5401           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
5402           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
5403           "vmovups %%ymm8, (%0)\n"
5404           "vmovups %%ymm9, (%1)\n"
5405           "vmovups %%ymm10, (%2)\n"
5406           "vmovups %%ymm11, (%3)\n"
5407           "vmovups %%ymm12, (%4)\n"
5408           "vmovups %%ymm13, (%5)\n"
5409           "vmovups %%ymm14, (%6)\n"
5410           "vmovups %%ymm15, (%7)\n"
5411           :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
5412         );
5413       }
5414     }
5415     return;
5416   }
5417   if (depth == 17) {
5418     helper_float_22_recursive(buf + 0, 14);
5419     helper_float_22_recursive(buf + 16384, 14);
5420     helper_float_22_recursive(buf + 32768, 14);
5421     helper_float_22_recursive(buf + 49152, 14);
5422     helper_float_22_recursive(buf + 65536, 14);
5423     helper_float_22_recursive(buf + 81920, 14);
5424     helper_float_22_recursive(buf + 98304, 14);
5425     helper_float_22_recursive(buf + 114688, 14);
5426     for (int j = 0; j < 131072; j += 131072) {
5427       for (int k = 0; k < 16384; k += 8) {
5428         __asm__ volatile (
5429           "vmovups (%0), %%ymm0\n"
5430           "vmovups (%1), %%ymm1\n"
5431           "vmovups (%2), %%ymm2\n"
5432           "vmovups (%3), %%ymm3\n"
5433           "vmovups (%4), %%ymm4\n"
5434           "vmovups (%5), %%ymm5\n"
5435           "vmovups (%6), %%ymm6\n"
5436           "vmovups (%7), %%ymm7\n"
5437           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
5438           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
5439           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
5440           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
5441           "vaddps %%ymm5, %%ymm4, %%ymm12\n"
5442           "vsubps %%ymm5, %%ymm4, %%ymm13\n"
5443           "vaddps %%ymm7, %%ymm6, %%ymm14\n"
5444           "vsubps %%ymm7, %%ymm6, %%ymm15\n"
5445           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
5446           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
5447           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
5448           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
5449           "vaddps %%ymm14, %%ymm12, %%ymm4\n"
5450           "vsubps %%ymm14, %%ymm12, %%ymm6\n"
5451           "vaddps %%ymm15, %%ymm13, %%ymm5\n"
5452           "vsubps %%ymm15, %%ymm13, %%ymm7\n"
5453           "vaddps %%ymm4, %%ymm0, %%ymm8\n"
5454           "vsubps %%ymm4, %%ymm0, %%ymm12\n"
5455           "vaddps %%ymm5, %%ymm1, %%ymm9\n"
5456           "vsubps %%ymm5, %%ymm1, %%ymm13\n"
5457           "vaddps %%ymm6, %%ymm2, %%ymm10\n"
5458           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
5459           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
5460           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
5461           "vmovups %%ymm8, (%0)\n"
5462           "vmovups %%ymm9, (%1)\n"
5463           "vmovups %%ymm10, (%2)\n"
5464           "vmovups %%ymm11, (%3)\n"
5465           "vmovups %%ymm12, (%4)\n"
5466           "vmovups %%ymm13, (%5)\n"
5467           "vmovups %%ymm14, (%6)\n"
5468           "vmovups %%ymm15, (%7)\n"
5469           :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152), "r"(buf + j + k + 65536), "r"(buf + j + k + 81920), "r"(buf + j + k + 98304), "r"(buf + j + k + 114688) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
5470         );
5471       }
5472     }
5473     return;
5474   }
5475   if (depth == 20) {
5476     helper_float_22_recursive(buf + 0, 17);
5477     helper_float_22_recursive(buf + 131072, 17);
5478     helper_float_22_recursive(buf + 262144, 17);
5479     helper_float_22_recursive(buf + 393216, 17);
5480     helper_float_22_recursive(buf + 524288, 17);
5481     helper_float_22_recursive(buf + 655360, 17);
5482     helper_float_22_recursive(buf + 786432, 17);
5483     helper_float_22_recursive(buf + 917504, 17);
5484     for (int j = 0; j < 1048576; j += 1048576) {
5485       for (int k = 0; k < 131072; k += 8) {
5486         __asm__ volatile (
5487           "vmovups (%0), %%ymm0\n"
5488           "vmovups (%1), %%ymm1\n"
5489           "vmovups (%2), %%ymm2\n"
5490           "vmovups (%3), %%ymm3\n"
5491           "vmovups (%4), %%ymm4\n"
5492           "vmovups (%5), %%ymm5\n"
5493           "vmovups (%6), %%ymm6\n"
5494           "vmovups (%7), %%ymm7\n"
5495           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
5496           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
5497           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
5498           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
5499           "vaddps %%ymm5, %%ymm4, %%ymm12\n"
5500           "vsubps %%ymm5, %%ymm4, %%ymm13\n"
5501           "vaddps %%ymm7, %%ymm6, %%ymm14\n"
5502           "vsubps %%ymm7, %%ymm6, %%ymm15\n"
5503           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
5504           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
5505           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
5506           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
5507           "vaddps %%ymm14, %%ymm12, %%ymm4\n"
5508           "vsubps %%ymm14, %%ymm12, %%ymm6\n"
5509           "vaddps %%ymm15, %%ymm13, %%ymm5\n"
5510           "vsubps %%ymm15, %%ymm13, %%ymm7\n"
5511           "vaddps %%ymm4, %%ymm0, %%ymm8\n"
5512           "vsubps %%ymm4, %%ymm0, %%ymm12\n"
5513           "vaddps %%ymm5, %%ymm1, %%ymm9\n"
5514           "vsubps %%ymm5, %%ymm1, %%ymm13\n"
5515           "vaddps %%ymm6, %%ymm2, %%ymm10\n"
5516           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
5517           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
5518           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
5519           "vmovups %%ymm8, (%0)\n"
5520           "vmovups %%ymm9, (%1)\n"
5521           "vmovups %%ymm10, (%2)\n"
5522           "vmovups %%ymm11, (%3)\n"
5523           "vmovups %%ymm12, (%4)\n"
5524           "vmovups %%ymm13, (%5)\n"
5525           "vmovups %%ymm14, (%6)\n"
5526           "vmovups %%ymm15, (%7)\n"
5527           :: "r"(buf + j + k + 0), "r"(buf + j + k + 131072), "r"(buf + j + k + 262144), "r"(buf + j + k + 393216), "r"(buf + j + k + 524288), "r"(buf + j + k + 655360), "r"(buf + j + k + 786432), "r"(buf + j + k + 917504) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
5528         );
5529       }
5530     }
5531     return;
5532   }
5533   if (depth == 22) {
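    /* depth == 22: 22 = 20 + 2, so the top-level combine is radix-4 over
       four 1048576-float sub-blocks (strides 1048576 and 2097152). */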
5534     helper_float_22_recursive(buf + 0, 20);
5535     helper_float_22_recursive(buf + 1048576, 20);
5536     helper_float_22_recursive(buf + 2097152, 20);
5537     helper_float_22_recursive(buf + 3145728, 20);
5538     for (int j = 0; j < 4194304; j += 4194304) {
5539       for (int k = 0; k < 1048576; k += 8) {
5540         __asm__ volatile (
5541           "vmovups (%0), %%ymm0\n"
5542           "vmovups (%1), %%ymm1\n"
5543           "vmovups (%2), %%ymm2\n"
5544           "vmovups (%3), %%ymm3\n"
5545           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
5546           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
5547           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
5548           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
5549           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
5550           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
5551           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
5552           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
5553           "vmovups %%ymm0, (%0)\n"
5554           "vmovups %%ymm1, (%1)\n"
5555           "vmovups %%ymm2, (%2)\n"
5556           "vmovups %%ymm3, (%3)\n"
5557           :: "r"(buf + j + k + 0), "r"(buf + j + k + 1048576), "r"(buf + j + k + 2097152), "r"(buf + j + k + 3145728) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
5558         );
5559       }
5560     }
5561     return;
5562   }
5563 }
5564 void helper_float_22(float *buf);
5565 void helper_float_22(float *buf) {
5566   helper_float_22_recursive(buf, 22);
5567 }
5568 void helper_float_23_recursive(float *buf, int depth);
5569 void helper_float_23_recursive(float *buf, int depth) {
5570   if (depth == 9) {
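    /* Identical 512-float base case to the one in
       helper_float_21_recursive: strides 1 through 32 in registers, then
       a radix-8 pass at strides 64, 128 and 256. */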
5571     for (int j = 0; j < 512; j += 64) {
5572       for (int k = 0; k < 8; k += 8) {
5573         __asm__ volatile (
5574           "vmovups (%0), %%ymm0\n"
5575           "vmovups (%1), %%ymm1\n"
5576           "vmovups (%2), %%ymm2\n"
5577           "vmovups (%3), %%ymm3\n"
5578           "vmovups (%4), %%ymm4\n"
5579           "vmovups (%5), %%ymm5\n"
5580           "vmovups (%6), %%ymm6\n"
5581           "vmovups (%7), %%ymm7\n"
5582           "vpermilps $160, %%ymm0, %%ymm8\n"
5583           "vpermilps $245, %%ymm0, %%ymm9\n"
5584           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
5585           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
5586           "vaddsubps %%ymm11, %%ymm8, %%ymm0\n"
5587           "vpermilps $160, %%ymm1, %%ymm8\n"
5588           "vpermilps $245, %%ymm1, %%ymm9\n"
5589           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
5590           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
5591           "vaddsubps %%ymm11, %%ymm8, %%ymm1\n"
5592           "vpermilps $160, %%ymm2, %%ymm8\n"
5593           "vpermilps $245, %%ymm2, %%ymm9\n"
5594           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
5595           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
5596           "vaddsubps %%ymm11, %%ymm8, %%ymm2\n"
5597           "vpermilps $160, %%ymm3, %%ymm8\n"
5598           "vpermilps $245, %%ymm3, %%ymm9\n"
5599           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
5600           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
5601           "vaddsubps %%ymm11, %%ymm8, %%ymm3\n"
5602           "vpermilps $160, %%ymm4, %%ymm8\n"
5603           "vpermilps $245, %%ymm4, %%ymm9\n"
5604           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
5605           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
5606           "vaddsubps %%ymm11, %%ymm8, %%ymm4\n"
5607           "vpermilps $160, %%ymm5, %%ymm8\n"
5608           "vpermilps $245, %%ymm5, %%ymm9\n"
5609           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
5610           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
5611           "vaddsubps %%ymm11, %%ymm8, %%ymm5\n"
5612           "vpermilps $160, %%ymm6, %%ymm8\n"
5613           "vpermilps $245, %%ymm6, %%ymm9\n"
5614           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
5615           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
5616           "vaddsubps %%ymm11, %%ymm8, %%ymm6\n"
5617           "vpermilps $160, %%ymm7, %%ymm8\n"
5618           "vpermilps $245, %%ymm7, %%ymm9\n"
5619           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
5620           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
5621           "vaddsubps %%ymm11, %%ymm8, %%ymm7\n"
5622           "vpermilps $68, %%ymm0, %%ymm8\n"
5623           "vpermilps $238, %%ymm0, %%ymm9\n"
5624           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm0\n"
          "vpermilps $68, %%ymm1, %%ymm8\n"
          "vpermilps $238, %%ymm1, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm1\n"
          "vpermilps $68, %%ymm2, %%ymm8\n"
          "vpermilps $238, %%ymm2, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm2\n"
          "vpermilps $68, %%ymm3, %%ymm8\n"
          "vpermilps $238, %%ymm3, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm3\n"
          "vpermilps $68, %%ymm4, %%ymm8\n"
          "vpermilps $238, %%ymm4, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm4\n"
          "vpermilps $68, %%ymm5, %%ymm8\n"
          "vpermilps $238, %%ymm5, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm5\n"
          "vpermilps $68, %%ymm6, %%ymm8\n"
          "vpermilps $238, %%ymm6, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm6\n"
          "vpermilps $68, %%ymm7, %%ymm8\n"
          "vpermilps $238, %%ymm7, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm7\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm0, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm0\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm1, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm1\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm2, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm2\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm3, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm3\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm4, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm4\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm5, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm5\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm6, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm6\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm7, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm7\n"
          "vaddps %%ymm1, %%ymm0, %%ymm8\n"
          "vsubps %%ymm1, %%ymm0, %%ymm9\n"
          "vaddps %%ymm3, %%ymm2, %%ymm10\n"
          "vsubps %%ymm3, %%ymm2, %%ymm11\n"
          "vaddps %%ymm5, %%ymm4, %%ymm12\n"
          "vsubps %%ymm5, %%ymm4, %%ymm13\n"
          "vaddps %%ymm7, %%ymm6, %%ymm14\n"
          "vsubps %%ymm7, %%ymm6, %%ymm15\n"
          "vaddps %%ymm10, %%ymm8, %%ymm0\n"
          "vsubps %%ymm10, %%ymm8, %%ymm2\n"
          "vaddps %%ymm11, %%ymm9, %%ymm1\n"
          "vsubps %%ymm11, %%ymm9, %%ymm3\n"
          "vaddps %%ymm14, %%ymm12, %%ymm4\n"
          "vsubps %%ymm14, %%ymm12, %%ymm6\n"
          "vaddps %%ymm15, %%ymm13, %%ymm5\n"
          "vsubps %%ymm15, %%ymm13, %%ymm7\n"
          "vaddps %%ymm4, %%ymm0, %%ymm8\n"
          "vsubps %%ymm4, %%ymm0, %%ymm12\n"
          "vaddps %%ymm5, %%ymm1, %%ymm9\n"
          "vsubps %%ymm5, %%ymm1, %%ymm13\n"
          "vaddps %%ymm6, %%ymm2, %%ymm10\n"
          "vsubps %%ymm6, %%ymm2, %%ymm14\n"
          "vaddps %%ymm7, %%ymm3, %%ymm11\n"
          "vsubps %%ymm7, %%ymm3, %%ymm15\n"
          "vmovups %%ymm8, (%0)\n"
          "vmovups %%ymm9, (%1)\n"
          "vmovups %%ymm10, (%2)\n"
          "vmovups %%ymm11, (%3)\n"
          "vmovups %%ymm12, (%4)\n"
          "vmovups %%ymm13, (%5)\n"
          "vmovups %%ymm14, (%6)\n"
          "vmovups %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    for (int j = 0; j < 512; j += 512) {
      for (int k = 0; k < 64; k += 8) {
        __asm__ volatile (
          "vmovups (%0), %%ymm0\n"
          "vmovups (%1), %%ymm1\n"
          "vmovups (%2), %%ymm2\n"
          "vmovups (%3), %%ymm3\n"
          "vmovups (%4), %%ymm4\n"
          "vmovups (%5), %%ymm5\n"
          "vmovups (%6), %%ymm6\n"
          "vmovups (%7), %%ymm7\n"
          "vaddps %%ymm1, %%ymm0, %%ymm8\n"
          "vsubps %%ymm1, %%ymm0, %%ymm9\n"
          "vaddps %%ymm3, %%ymm2, %%ymm10\n"
          "vsubps %%ymm3, %%ymm2, %%ymm11\n"
          "vaddps %%ymm5, %%ymm4, %%ymm12\n"
          "vsubps %%ymm5, %%ymm4, %%ymm13\n"
          "vaddps %%ymm7, %%ymm6, %%ymm14\n"
          "vsubps %%ymm7, %%ymm6, %%ymm15\n"
          "vaddps %%ymm10, %%ymm8, %%ymm0\n"
          "vsubps %%ymm10, %%ymm8, %%ymm2\n"
          "vaddps %%ymm11, %%ymm9, %%ymm1\n"
          "vsubps %%ymm11, %%ymm9, %%ymm3\n"
          "vaddps %%ymm14, %%ymm12, %%ymm4\n"
          "vsubps %%ymm14, %%ymm12, %%ymm6\n"
          "vaddps %%ymm15, %%ymm13, %%ymm5\n"
          "vsubps %%ymm15, %%ymm13, %%ymm7\n"
          "vaddps %%ymm4, %%ymm0, %%ymm8\n"
          "vsubps %%ymm4, %%ymm0, %%ymm12\n"
          "vaddps %%ymm5, %%ymm1, %%ymm9\n"
          "vsubps %%ymm5, %%ymm1, %%ymm13\n"
          "vaddps %%ymm6, %%ymm2, %%ymm10\n"
          "vsubps %%ymm6, %%ymm2, %%ymm14\n"
          "vaddps %%ymm7, %%ymm3, %%ymm11\n"
          "vsubps %%ymm7, %%ymm3, %%ymm15\n"
          "vmovups %%ymm8, (%0)\n"
          "vmovups %%ymm9, (%1)\n"
          "vmovups %%ymm10, (%2)\n"
          "vmovups %%ymm11, (%3)\n"
          "vmovups %%ymm12, (%4)\n"
          "vmovups %%ymm13, (%5)\n"
          "vmovups %%ymm14, (%6)\n"
          "vmovups %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
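  /* The remaining depths recurse: eight sub-transforms of size 2^(depth-3)
     (four sub-transforms and a radix-4 pass for the final depth-23 case),
     followed by one pass of add/sub butterflies whose stride equals the
     sub-transform length. */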
  if (depth == 12) {
    helper_float_23_recursive(buf + 0, 9);
    helper_float_23_recursive(buf + 512, 9);
    helper_float_23_recursive(buf + 1024, 9);
    helper_float_23_recursive(buf + 1536, 9);
    helper_float_23_recursive(buf + 2048, 9);
    helper_float_23_recursive(buf + 2560, 9);
    helper_float_23_recursive(buf + 3072, 9);
    helper_float_23_recursive(buf + 3584, 9);
    for (int j = 0; j < 4096; j += 4096) {
      for (int k = 0; k < 512; k += 8) {
        __asm__ volatile (
          "vmovups (%0), %%ymm0\n"
          "vmovups (%1), %%ymm1\n"
          "vmovups (%2), %%ymm2\n"
          "vmovups (%3), %%ymm3\n"
          "vmovups (%4), %%ymm4\n"
          "vmovups (%5), %%ymm5\n"
          "vmovups (%6), %%ymm6\n"
          "vmovups (%7), %%ymm7\n"
          "vaddps %%ymm1, %%ymm0, %%ymm8\n"
          "vsubps %%ymm1, %%ymm0, %%ymm9\n"
          "vaddps %%ymm3, %%ymm2, %%ymm10\n"
          "vsubps %%ymm3, %%ymm2, %%ymm11\n"
          "vaddps %%ymm5, %%ymm4, %%ymm12\n"
          "vsubps %%ymm5, %%ymm4, %%ymm13\n"
          "vaddps %%ymm7, %%ymm6, %%ymm14\n"
          "vsubps %%ymm7, %%ymm6, %%ymm15\n"
          "vaddps %%ymm10, %%ymm8, %%ymm0\n"
          "vsubps %%ymm10, %%ymm8, %%ymm2\n"
          "vaddps %%ymm11, %%ymm9, %%ymm1\n"
          "vsubps %%ymm11, %%ymm9, %%ymm3\n"
          "vaddps %%ymm14, %%ymm12, %%ymm4\n"
          "vsubps %%ymm14, %%ymm12, %%ymm6\n"
          "vaddps %%ymm15, %%ymm13, %%ymm5\n"
          "vsubps %%ymm15, %%ymm13, %%ymm7\n"
          "vaddps %%ymm4, %%ymm0, %%ymm8\n"
          "vsubps %%ymm4, %%ymm0, %%ymm12\n"
          "vaddps %%ymm5, %%ymm1, %%ymm9\n"
          "vsubps %%ymm5, %%ymm1, %%ymm13\n"
          "vaddps %%ymm6, %%ymm2, %%ymm10\n"
          "vsubps %%ymm6, %%ymm2, %%ymm14\n"
          "vaddps %%ymm7, %%ymm3, %%ymm11\n"
          "vsubps %%ymm7, %%ymm3, %%ymm15\n"
          "vmovups %%ymm8, (%0)\n"
          "vmovups %%ymm9, (%1)\n"
          "vmovups %%ymm10, (%2)\n"
          "vmovups %%ymm11, (%3)\n"
          "vmovups %%ymm12, (%4)\n"
          "vmovups %%ymm13, (%5)\n"
          "vmovups %%ymm14, (%6)\n"
          "vmovups %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 15) {
    helper_float_23_recursive(buf + 0, 12);
    helper_float_23_recursive(buf + 4096, 12);
    helper_float_23_recursive(buf + 8192, 12);
    helper_float_23_recursive(buf + 12288, 12);
    helper_float_23_recursive(buf + 16384, 12);
    helper_float_23_recursive(buf + 20480, 12);
    helper_float_23_recursive(buf + 24576, 12);
    helper_float_23_recursive(buf + 28672, 12);
    for (int j = 0; j < 32768; j += 32768) {
      for (int k = 0; k < 4096; k += 8) {
        __asm__ volatile (
          "vmovups (%0), %%ymm0\n"
          "vmovups (%1), %%ymm1\n"
          "vmovups (%2), %%ymm2\n"
          "vmovups (%3), %%ymm3\n"
          "vmovups (%4), %%ymm4\n"
          "vmovups (%5), %%ymm5\n"
          "vmovups (%6), %%ymm6\n"
          "vmovups (%7), %%ymm7\n"
          "vaddps %%ymm1, %%ymm0, %%ymm8\n"
          "vsubps %%ymm1, %%ymm0, %%ymm9\n"
          "vaddps %%ymm3, %%ymm2, %%ymm10\n"
          "vsubps %%ymm3, %%ymm2, %%ymm11\n"
          "vaddps %%ymm5, %%ymm4, %%ymm12\n"
          "vsubps %%ymm5, %%ymm4, %%ymm13\n"
          "vaddps %%ymm7, %%ymm6, %%ymm14\n"
          "vsubps %%ymm7, %%ymm6, %%ymm15\n"
          "vaddps %%ymm10, %%ymm8, %%ymm0\n"
          "vsubps %%ymm10, %%ymm8, %%ymm2\n"
          "vaddps %%ymm11, %%ymm9, %%ymm1\n"
          "vsubps %%ymm11, %%ymm9, %%ymm3\n"
          "vaddps %%ymm14, %%ymm12, %%ymm4\n"
          "vsubps %%ymm14, %%ymm12, %%ymm6\n"
          "vaddps %%ymm15, %%ymm13, %%ymm5\n"
          "vsubps %%ymm15, %%ymm13, %%ymm7\n"
          "vaddps %%ymm4, %%ymm0, %%ymm8\n"
          "vsubps %%ymm4, %%ymm0, %%ymm12\n"
          "vaddps %%ymm5, %%ymm1, %%ymm9\n"
          "vsubps %%ymm5, %%ymm1, %%ymm13\n"
          "vaddps %%ymm6, %%ymm2, %%ymm10\n"
          "vsubps %%ymm6, %%ymm2, %%ymm14\n"
          "vaddps %%ymm7, %%ymm3, %%ymm11\n"
          "vsubps %%ymm7, %%ymm3, %%ymm15\n"
          "vmovups %%ymm8, (%0)\n"
          "vmovups %%ymm9, (%1)\n"
          "vmovups %%ymm10, (%2)\n"
          "vmovups %%ymm11, (%3)\n"
          "vmovups %%ymm12, (%4)\n"
          "vmovups %%ymm13, (%5)\n"
          "vmovups %%ymm14, (%6)\n"
          "vmovups %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 18) {
    helper_float_23_recursive(buf + 0, 15);
    helper_float_23_recursive(buf + 32768, 15);
    helper_float_23_recursive(buf + 65536, 15);
    helper_float_23_recursive(buf + 98304, 15);
    helper_float_23_recursive(buf + 131072, 15);
    helper_float_23_recursive(buf + 163840, 15);
    helper_float_23_recursive(buf + 196608, 15);
    helper_float_23_recursive(buf + 229376, 15);
    for (int j = 0; j < 262144; j += 262144) {
      for (int k = 0; k < 32768; k += 8) {
        __asm__ volatile (
          "vmovups (%0), %%ymm0\n"
          "vmovups (%1), %%ymm1\n"
          "vmovups (%2), %%ymm2\n"
          "vmovups (%3), %%ymm3\n"
          "vmovups (%4), %%ymm4\n"
          "vmovups (%5), %%ymm5\n"
          "vmovups (%6), %%ymm6\n"
          "vmovups (%7), %%ymm7\n"
          "vaddps %%ymm1, %%ymm0, %%ymm8\n"
          "vsubps %%ymm1, %%ymm0, %%ymm9\n"
          "vaddps %%ymm3, %%ymm2, %%ymm10\n"
          "vsubps %%ymm3, %%ymm2, %%ymm11\n"
          "vaddps %%ymm5, %%ymm4, %%ymm12\n"
          "vsubps %%ymm5, %%ymm4, %%ymm13\n"
          "vaddps %%ymm7, %%ymm6, %%ymm14\n"
          "vsubps %%ymm7, %%ymm6, %%ymm15\n"
          "vaddps %%ymm10, %%ymm8, %%ymm0\n"
          "vsubps %%ymm10, %%ymm8, %%ymm2\n"
          "vaddps %%ymm11, %%ymm9, %%ymm1\n"
          "vsubps %%ymm11, %%ymm9, %%ymm3\n"
          "vaddps %%ymm14, %%ymm12, %%ymm4\n"
          "vsubps %%ymm14, %%ymm12, %%ymm6\n"
          "vaddps %%ymm15, %%ymm13, %%ymm5\n"
          "vsubps %%ymm15, %%ymm13, %%ymm7\n"
          "vaddps %%ymm4, %%ymm0, %%ymm8\n"
          "vsubps %%ymm4, %%ymm0, %%ymm12\n"
          "vaddps %%ymm5, %%ymm1, %%ymm9\n"
          "vsubps %%ymm5, %%ymm1, %%ymm13\n"
          "vaddps %%ymm6, %%ymm2, %%ymm10\n"
          "vsubps %%ymm6, %%ymm2, %%ymm14\n"
          "vaddps %%ymm7, %%ymm3, %%ymm11\n"
          "vsubps %%ymm7, %%ymm3, %%ymm15\n"
          "vmovups %%ymm8, (%0)\n"
          "vmovups %%ymm9, (%1)\n"
          "vmovups %%ymm10, (%2)\n"
          "vmovups %%ymm11, (%3)\n"
          "vmovups %%ymm12, (%4)\n"
          "vmovups %%ymm13, (%5)\n"
          "vmovups %%ymm14, (%6)\n"
          "vmovups %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 21) {
    helper_float_23_recursive(buf + 0, 18);
    helper_float_23_recursive(buf + 262144, 18);
    helper_float_23_recursive(buf + 524288, 18);
    helper_float_23_recursive(buf + 786432, 18);
    helper_float_23_recursive(buf + 1048576, 18);
    helper_float_23_recursive(buf + 1310720, 18);
    helper_float_23_recursive(buf + 1572864, 18);
    helper_float_23_recursive(buf + 1835008, 18);
    for (int j = 0; j < 2097152; j += 2097152) {
      for (int k = 0; k < 262144; k += 8) {
        __asm__ volatile (
          "vmovups (%0), %%ymm0\n"
          "vmovups (%1), %%ymm1\n"
          "vmovups (%2), %%ymm2\n"
          "vmovups (%3), %%ymm3\n"
          "vmovups (%4), %%ymm4\n"
          "vmovups (%5), %%ymm5\n"
          "vmovups (%6), %%ymm6\n"
          "vmovups (%7), %%ymm7\n"
          "vaddps %%ymm1, %%ymm0, %%ymm8\n"
          "vsubps %%ymm1, %%ymm0, %%ymm9\n"
          "vaddps %%ymm3, %%ymm2, %%ymm10\n"
          "vsubps %%ymm3, %%ymm2, %%ymm11\n"
          "vaddps %%ymm5, %%ymm4, %%ymm12\n"
          "vsubps %%ymm5, %%ymm4, %%ymm13\n"
          "vaddps %%ymm7, %%ymm6, %%ymm14\n"
          "vsubps %%ymm7, %%ymm6, %%ymm15\n"
          "vaddps %%ymm10, %%ymm8, %%ymm0\n"
          "vsubps %%ymm10, %%ymm8, %%ymm2\n"
          "vaddps %%ymm11, %%ymm9, %%ymm1\n"
          "vsubps %%ymm11, %%ymm9, %%ymm3\n"
          "vaddps %%ymm14, %%ymm12, %%ymm4\n"
          "vsubps %%ymm14, %%ymm12, %%ymm6\n"
          "vaddps %%ymm15, %%ymm13, %%ymm5\n"
          "vsubps %%ymm15, %%ymm13, %%ymm7\n"
          "vaddps %%ymm4, %%ymm0, %%ymm8\n"
          "vsubps %%ymm4, %%ymm0, %%ymm12\n"
          "vaddps %%ymm5, %%ymm1, %%ymm9\n"
          "vsubps %%ymm5, %%ymm1, %%ymm13\n"
          "vaddps %%ymm6, %%ymm2, %%ymm10\n"
          "vsubps %%ymm6, %%ymm2, %%ymm14\n"
          "vaddps %%ymm7, %%ymm3, %%ymm11\n"
          "vsubps %%ymm7, %%ymm3, %%ymm15\n"
          "vmovups %%ymm8, (%0)\n"
          "vmovups %%ymm9, (%1)\n"
          "vmovups %%ymm10, (%2)\n"
          "vmovups %%ymm11, (%3)\n"
          "vmovups %%ymm12, (%4)\n"
          "vmovups %%ymm13, (%5)\n"
          "vmovups %%ymm14, (%6)\n"
          "vmovups %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1310720), "r"(buf + j + k + 1572864), "r"(buf + j + k + 1835008) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 23) {
    helper_float_23_recursive(buf + 0, 21);
    helper_float_23_recursive(buf + 2097152, 21);
    helper_float_23_recursive(buf + 4194304, 21);
    helper_float_23_recursive(buf + 6291456, 21);
    for (int j = 0; j < 8388608; j += 8388608) {
      for (int k = 0; k < 2097152; k += 8) {
        __asm__ volatile (
          "vmovups (%0), %%ymm0\n"
          "vmovups (%1), %%ymm1\n"
          "vmovups (%2), %%ymm2\n"
          "vmovups (%3), %%ymm3\n"
          "vaddps %%ymm1, %%ymm0, %%ymm8\n"
          "vsubps %%ymm1, %%ymm0, %%ymm9\n"
          "vaddps %%ymm3, %%ymm2, %%ymm10\n"
          "vsubps %%ymm3, %%ymm2, %%ymm11\n"
          "vaddps %%ymm10, %%ymm8, %%ymm0\n"
          "vsubps %%ymm10, %%ymm8, %%ymm2\n"
          "vaddps %%ymm11, %%ymm9, %%ymm1\n"
          "vsubps %%ymm11, %%ymm9, %%ymm3\n"
          "vmovups %%ymm0, (%0)\n"
          "vmovups %%ymm1, (%1)\n"
          "vmovups %%ymm2, (%2)\n"
          "vmovups %%ymm3, (%3)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 2097152), "r"(buf + j + k + 4194304), "r"(buf + j + k + 6291456) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
}
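/* Illustrative scalar equivalent of the kernels above (a sketch added for
   reference, not part of the generated fast path; the name
   fht_float_reference is not used elsewhere in this file): an in-place,
   unnormalized Walsh-Hadamard transform of 2^log_n floats. The AVX helpers
   compute the same butterflies eight lanes at a time, with the stride-1,
   stride-2, and stride-4 stages done inside a ymm register and the
   recursion fully unrolled. */
static inline void fht_float_reference(float *buf, int log_n) {
  for (int s = 1; s < (1 << log_n); s <<= 1) { /* butterfly stride: 1, 2, 4, ... */
    for (int j = 0; j < (1 << log_n); j += 2 * s) {
      for (int k = j; k < j + s; ++k) {
        float u = buf[k];     /* top input of the butterfly */
        float v = buf[k + s]; /* bottom input of the butterfly */
        buf[k] = u + v;
        buf[k + s] = u - v;
      }
    }
  }
}
/* Entry point: Hadamard transform of 2^23 floats, in place. */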
void helper_float_23(float *buf);
void helper_float_23(float *buf) {
  helper_float_23_recursive(buf, 23);
}
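/* Same scheme for 2^24 floats. The base case is depth 12: three passes over
   each 4096-float block (in-register 64-point kernels, then a stride-64
   radix-8 pass, then a stride-512 radix-8 pass). */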
void helper_float_24_recursive(float *buf, int depth);
void helper_float_24_recursive(float *buf, int depth) {
  if (depth == 12) {
    for (int j = 0; j < 4096; j += 64) {
      for (int k = 0; k < 8; k += 8) {
        __asm__ volatile (
          "vmovups (%0), %%ymm0\n"
          "vmovups (%1), %%ymm1\n"
          "vmovups (%2), %%ymm2\n"
          "vmovups (%3), %%ymm3\n"
          "vmovups (%4), %%ymm4\n"
          "vmovups (%5), %%ymm5\n"
          "vmovups (%6), %%ymm6\n"
          "vmovups (%7), %%ymm7\n"
          "vpermilps $160, %%ymm0, %%ymm8\n"
          "vpermilps $245, %%ymm0, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm0\n"
          "vpermilps $160, %%ymm1, %%ymm8\n"
          "vpermilps $245, %%ymm1, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm1\n"
          "vpermilps $160, %%ymm2, %%ymm8\n"
          "vpermilps $245, %%ymm2, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm2\n"
          "vpermilps $160, %%ymm3, %%ymm8\n"
          "vpermilps $245, %%ymm3, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm3\n"
          "vpermilps $160, %%ymm4, %%ymm8\n"
          "vpermilps $245, %%ymm4, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm4\n"
          "vpermilps $160, %%ymm5, %%ymm8\n"
          "vpermilps $245, %%ymm5, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm5\n"
          "vpermilps $160, %%ymm6, %%ymm8\n"
          "vpermilps $245, %%ymm6, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm6\n"
          "vpermilps $160, %%ymm7, %%ymm8\n"
          "vpermilps $245, %%ymm7, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm7\n"
          "vpermilps $68, %%ymm0, %%ymm8\n"
          "vpermilps $238, %%ymm0, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm0\n"
          "vpermilps $68, %%ymm1, %%ymm8\n"
          "vpermilps $238, %%ymm1, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm1\n"
          "vpermilps $68, %%ymm2, %%ymm8\n"
          "vpermilps $238, %%ymm2, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm2\n"
          "vpermilps $68, %%ymm3, %%ymm8\n"
          "vpermilps $238, %%ymm3, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm3\n"
          "vpermilps $68, %%ymm4, %%ymm8\n"
          "vpermilps $238, %%ymm4, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm4\n"
          "vpermilps $68, %%ymm5, %%ymm8\n"
          "vpermilps $238, %%ymm5, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm5\n"
          "vpermilps $68, %%ymm6, %%ymm8\n"
          "vpermilps $238, %%ymm6, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm6\n"
          "vpermilps $68, %%ymm7, %%ymm8\n"
          "vpermilps $238, %%ymm7, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm7\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm0, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm0\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm1, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm1\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm2, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm2\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm3, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm3\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm4, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm4\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm5, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm5\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm6, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm6\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm7, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm7\n"
          "vaddps %%ymm1, %%ymm0, %%ymm8\n"
          "vsubps %%ymm1, %%ymm0, %%ymm9\n"
          "vaddps %%ymm3, %%ymm2, %%ymm10\n"
          "vsubps %%ymm3, %%ymm2, %%ymm11\n"
          "vaddps %%ymm5, %%ymm4, %%ymm12\n"
          "vsubps %%ymm5, %%ymm4, %%ymm13\n"
          "vaddps %%ymm7, %%ymm6, %%ymm14\n"
          "vsubps %%ymm7, %%ymm6, %%ymm15\n"
          "vaddps %%ymm10, %%ymm8, %%ymm0\n"
          "vsubps %%ymm10, %%ymm8, %%ymm2\n"
          "vaddps %%ymm11, %%ymm9, %%ymm1\n"
          "vsubps %%ymm11, %%ymm9, %%ymm3\n"
          "vaddps %%ymm14, %%ymm12, %%ymm4\n"
          "vsubps %%ymm14, %%ymm12, %%ymm6\n"
          "vaddps %%ymm15, %%ymm13, %%ymm5\n"
          "vsubps %%ymm15, %%ymm13, %%ymm7\n"
          "vaddps %%ymm4, %%ymm0, %%ymm8\n"
          "vsubps %%ymm4, %%ymm0, %%ymm12\n"
          "vaddps %%ymm5, %%ymm1, %%ymm9\n"
          "vsubps %%ymm5, %%ymm1, %%ymm13\n"
          "vaddps %%ymm6, %%ymm2, %%ymm10\n"
          "vsubps %%ymm6, %%ymm2, %%ymm14\n"
          "vaddps %%ymm7, %%ymm3, %%ymm11\n"
          "vsubps %%ymm7, %%ymm3, %%ymm15\n"
          "vmovups %%ymm8, (%0)\n"
          "vmovups %%ymm9, (%1)\n"
          "vmovups %%ymm10, (%2)\n"
          "vmovups %%ymm11, (%3)\n"
          "vmovups %%ymm12, (%4)\n"
          "vmovups %%ymm13, (%5)\n"
          "vmovups %%ymm14, (%6)\n"
          "vmovups %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    for (int j = 0; j < 4096; j += 512) {
      for (int k = 0; k < 64; k += 8) {
        __asm__ volatile (
          "vmovups (%0), %%ymm0\n"
          "vmovups (%1), %%ymm1\n"
          "vmovups (%2), %%ymm2\n"
          "vmovups (%3), %%ymm3\n"
          "vmovups (%4), %%ymm4\n"
          "vmovups (%5), %%ymm5\n"
          "vmovups (%6), %%ymm6\n"
          "vmovups (%7), %%ymm7\n"
          "vaddps %%ymm1, %%ymm0, %%ymm8\n"
          "vsubps %%ymm1, %%ymm0, %%ymm9\n"
          "vaddps %%ymm3, %%ymm2, %%ymm10\n"
          "vsubps %%ymm3, %%ymm2, %%ymm11\n"
          "vaddps %%ymm5, %%ymm4, %%ymm12\n"
          "vsubps %%ymm5, %%ymm4, %%ymm13\n"
          "vaddps %%ymm7, %%ymm6, %%ymm14\n"
          "vsubps %%ymm7, %%ymm6, %%ymm15\n"
          "vaddps %%ymm10, %%ymm8, %%ymm0\n"
          "vsubps %%ymm10, %%ymm8, %%ymm2\n"
          "vaddps %%ymm11, %%ymm9, %%ymm1\n"
          "vsubps %%ymm11, %%ymm9, %%ymm3\n"
          "vaddps %%ymm14, %%ymm12, %%ymm4\n"
          "vsubps %%ymm14, %%ymm12, %%ymm6\n"
          "vaddps %%ymm15, %%ymm13, %%ymm5\n"
          "vsubps %%ymm15, %%ymm13, %%ymm7\n"
          "vaddps %%ymm4, %%ymm0, %%ymm8\n"
          "vsubps %%ymm4, %%ymm0, %%ymm12\n"
          "vaddps %%ymm5, %%ymm1, %%ymm9\n"
          "vsubps %%ymm5, %%ymm1, %%ymm13\n"
          "vaddps %%ymm6, %%ymm2, %%ymm10\n"
          "vsubps %%ymm6, %%ymm2, %%ymm14\n"
          "vaddps %%ymm7, %%ymm3, %%ymm11\n"
          "vsubps %%ymm7, %%ymm3, %%ymm15\n"
          "vmovups %%ymm8, (%0)\n"
          "vmovups %%ymm9, (%1)\n"
          "vmovups %%ymm10, (%2)\n"
          "vmovups %%ymm11, (%3)\n"
          "vmovups %%ymm12, (%4)\n"
          "vmovups %%ymm13, (%5)\n"
          "vmovups %%ymm14, (%6)\n"
          "vmovups %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    for (int j = 0; j < 4096; j += 4096) {
      for (int k = 0; k < 512; k += 8) {
        __asm__ volatile (
          "vmovups (%0), %%ymm0\n"
          "vmovups (%1), %%ymm1\n"
          "vmovups (%2), %%ymm2\n"
          "vmovups (%3), %%ymm3\n"
          "vmovups (%4), %%ymm4\n"
          "vmovups (%5), %%ymm5\n"
          "vmovups (%6), %%ymm6\n"
          "vmovups (%7), %%ymm7\n"
          "vaddps %%ymm1, %%ymm0, %%ymm8\n"
          "vsubps %%ymm1, %%ymm0, %%ymm9\n"
          "vaddps %%ymm3, %%ymm2, %%ymm10\n"
          "vsubps %%ymm3, %%ymm2, %%ymm11\n"
          "vaddps %%ymm5, %%ymm4, %%ymm12\n"
          "vsubps %%ymm5, %%ymm4, %%ymm13\n"
          "vaddps %%ymm7, %%ymm6, %%ymm14\n"
          "vsubps %%ymm7, %%ymm6, %%ymm15\n"
          "vaddps %%ymm10, %%ymm8, %%ymm0\n"
          "vsubps %%ymm10, %%ymm8, %%ymm2\n"
          "vaddps %%ymm11, %%ymm9, %%ymm1\n"
          "vsubps %%ymm11, %%ymm9, %%ymm3\n"
          "vaddps %%ymm14, %%ymm12, %%ymm4\n"
          "vsubps %%ymm14, %%ymm12, %%ymm6\n"
          "vaddps %%ymm15, %%ymm13, %%ymm5\n"
          "vsubps %%ymm15, %%ymm13, %%ymm7\n"
          "vaddps %%ymm4, %%ymm0, %%ymm8\n"
          "vsubps %%ymm4, %%ymm0, %%ymm12\n"
          "vaddps %%ymm5, %%ymm1, %%ymm9\n"
          "vsubps %%ymm5, %%ymm1, %%ymm13\n"
          "vaddps %%ymm6, %%ymm2, %%ymm10\n"
          "vsubps %%ymm6, %%ymm2, %%ymm14\n"
          "vaddps %%ymm7, %%ymm3, %%ymm11\n"
          "vsubps %%ymm7, %%ymm3, %%ymm15\n"
          "vmovups %%ymm8, (%0)\n"
          "vmovups %%ymm9, (%1)\n"
          "vmovups %%ymm10, (%2)\n"
          "vmovups %%ymm11, (%3)\n"
          "vmovups %%ymm12, (%4)\n"
          "vmovups %%ymm13, (%5)\n"
          "vmovups %%ymm14, (%6)\n"
          "vmovups %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 15) {
    helper_float_24_recursive(buf + 0, 12);
    helper_float_24_recursive(buf + 4096, 12);
    helper_float_24_recursive(buf + 8192, 12);
    helper_float_24_recursive(buf + 12288, 12);
    helper_float_24_recursive(buf + 16384, 12);
    helper_float_24_recursive(buf + 20480, 12);
    helper_float_24_recursive(buf + 24576, 12);
    helper_float_24_recursive(buf + 28672, 12);
    for (int j = 0; j < 32768; j += 32768) {
      for (int k = 0; k < 4096; k += 8) {
        __asm__ volatile (
          "vmovups (%0), %%ymm0\n"
          "vmovups (%1), %%ymm1\n"
          "vmovups (%2), %%ymm2\n"
          "vmovups (%3), %%ymm3\n"
          "vmovups (%4), %%ymm4\n"
          "vmovups (%5), %%ymm5\n"
          "vmovups (%6), %%ymm6\n"
          "vmovups (%7), %%ymm7\n"
          "vaddps %%ymm1, %%ymm0, %%ymm8\n"
          "vsubps %%ymm1, %%ymm0, %%ymm9\n"
          "vaddps %%ymm3, %%ymm2, %%ymm10\n"
          "vsubps %%ymm3, %%ymm2, %%ymm11\n"
          "vaddps %%ymm5, %%ymm4, %%ymm12\n"
          "vsubps %%ymm5, %%ymm4, %%ymm13\n"
          "vaddps %%ymm7, %%ymm6, %%ymm14\n"
          "vsubps %%ymm7, %%ymm6, %%ymm15\n"
          "vaddps %%ymm10, %%ymm8, %%ymm0\n"
          "vsubps %%ymm10, %%ymm8, %%ymm2\n"
          "vaddps %%ymm11, %%ymm9, %%ymm1\n"
          "vsubps %%ymm11, %%ymm9, %%ymm3\n"
          "vaddps %%ymm14, %%ymm12, %%ymm4\n"
          "vsubps %%ymm14, %%ymm12, %%ymm6\n"
          "vaddps %%ymm15, %%ymm13, %%ymm5\n"
          "vsubps %%ymm15, %%ymm13, %%ymm7\n"
          "vaddps %%ymm4, %%ymm0, %%ymm8\n"
          "vsubps %%ymm4, %%ymm0, %%ymm12\n"
          "vaddps %%ymm5, %%ymm1, %%ymm9\n"
          "vsubps %%ymm5, %%ymm1, %%ymm13\n"
          "vaddps %%ymm6, %%ymm2, %%ymm10\n"
          "vsubps %%ymm6, %%ymm2, %%ymm14\n"
          "vaddps %%ymm7, %%ymm3, %%ymm11\n"
          "vsubps %%ymm7, %%ymm3, %%ymm15\n"
          "vmovups %%ymm8, (%0)\n"
          "vmovups %%ymm9, (%1)\n"
          "vmovups %%ymm10, (%2)\n"
          "vmovups %%ymm11, (%3)\n"
          "vmovups %%ymm12, (%4)\n"
          "vmovups %%ymm13, (%5)\n"
          "vmovups %%ymm14, (%6)\n"
          "vmovups %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 18) {
    helper_float_24_recursive(buf + 0, 15);
    helper_float_24_recursive(buf + 32768, 15);
    helper_float_24_recursive(buf + 65536, 15);
    helper_float_24_recursive(buf + 98304, 15);
    helper_float_24_recursive(buf + 131072, 15);
    helper_float_24_recursive(buf + 163840, 15);
    helper_float_24_recursive(buf + 196608, 15);
    helper_float_24_recursive(buf + 229376, 15);
    for (int j = 0; j < 262144; j += 262144) {
      for (int k = 0; k < 32768; k += 8) {
        __asm__ volatile (
          "vmovups (%0), %%ymm0\n"
          "vmovups (%1), %%ymm1\n"
          "vmovups (%2), %%ymm2\n"
          "vmovups (%3), %%ymm3\n"
          "vmovups (%4), %%ymm4\n"
          "vmovups (%5), %%ymm5\n"
          "vmovups (%6), %%ymm6\n"
          "vmovups (%7), %%ymm7\n"
          "vaddps %%ymm1, %%ymm0, %%ymm8\n"
          "vsubps %%ymm1, %%ymm0, %%ymm9\n"
          "vaddps %%ymm3, %%ymm2, %%ymm10\n"
          "vsubps %%ymm3, %%ymm2, %%ymm11\n"
          "vaddps %%ymm5, %%ymm4, %%ymm12\n"
          "vsubps %%ymm5, %%ymm4, %%ymm13\n"
          "vaddps %%ymm7, %%ymm6, %%ymm14\n"
          "vsubps %%ymm7, %%ymm6, %%ymm15\n"
          "vaddps %%ymm10, %%ymm8, %%ymm0\n"
          "vsubps %%ymm10, %%ymm8, %%ymm2\n"
          "vaddps %%ymm11, %%ymm9, %%ymm1\n"
          "vsubps %%ymm11, %%ymm9, %%ymm3\n"
          "vaddps %%ymm14, %%ymm12, %%ymm4\n"
          "vsubps %%ymm14, %%ymm12, %%ymm6\n"
          "vaddps %%ymm15, %%ymm13, %%ymm5\n"
          "vsubps %%ymm15, %%ymm13, %%ymm7\n"
          "vaddps %%ymm4, %%ymm0, %%ymm8\n"
          "vsubps %%ymm4, %%ymm0, %%ymm12\n"
          "vaddps %%ymm5, %%ymm1, %%ymm9\n"
          "vsubps %%ymm5, %%ymm1, %%ymm13\n"
          "vaddps %%ymm6, %%ymm2, %%ymm10\n"
          "vsubps %%ymm6, %%ymm2, %%ymm14\n"
          "vaddps %%ymm7, %%ymm3, %%ymm11\n"
          "vsubps %%ymm7, %%ymm3, %%ymm15\n"
          "vmovups %%ymm8, (%0)\n"
          "vmovups %%ymm9, (%1)\n"
          "vmovups %%ymm10, (%2)\n"
          "vmovups %%ymm11, (%3)\n"
          "vmovups %%ymm12, (%4)\n"
          "vmovups %%ymm13, (%5)\n"
          "vmovups %%ymm14, (%6)\n"
          "vmovups %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 21) {
    helper_float_24_recursive(buf + 0, 18);
    helper_float_24_recursive(buf + 262144, 18);
    helper_float_24_recursive(buf + 524288, 18);
    helper_float_24_recursive(buf + 786432, 18);
    helper_float_24_recursive(buf + 1048576, 18);
    helper_float_24_recursive(buf + 1310720, 18);
    helper_float_24_recursive(buf + 1572864, 18);
    helper_float_24_recursive(buf + 1835008, 18);
    for (int j = 0; j < 2097152; j += 2097152) {
      for (int k = 0; k < 262144; k += 8) {
        __asm__ volatile (
          "vmovups (%0), %%ymm0\n"
          "vmovups (%1), %%ymm1\n"
          "vmovups (%2), %%ymm2\n"
          "vmovups (%3), %%ymm3\n"
          "vmovups (%4), %%ymm4\n"
          "vmovups (%5), %%ymm5\n"
          "vmovups (%6), %%ymm6\n"
          "vmovups (%7), %%ymm7\n"
          "vaddps %%ymm1, %%ymm0, %%ymm8\n"
          "vsubps %%ymm1, %%ymm0, %%ymm9\n"
          "vaddps %%ymm3, %%ymm2, %%ymm10\n"
          "vsubps %%ymm3, %%ymm2, %%ymm11\n"
          "vaddps %%ymm5, %%ymm4, %%ymm12\n"
          "vsubps %%ymm5, %%ymm4, %%ymm13\n"
          "vaddps %%ymm7, %%ymm6, %%ymm14\n"
          "vsubps %%ymm7, %%ymm6, %%ymm15\n"
          "vaddps %%ymm10, %%ymm8, %%ymm0\n"
          "vsubps %%ymm10, %%ymm8, %%ymm2\n"
          "vaddps %%ymm11, %%ymm9, %%ymm1\n"
          "vsubps %%ymm11, %%ymm9, %%ymm3\n"
          "vaddps %%ymm14, %%ymm12, %%ymm4\n"
          "vsubps %%ymm14, %%ymm12, %%ymm6\n"
          "vaddps %%ymm15, %%ymm13, %%ymm5\n"
          "vsubps %%ymm15, %%ymm13, %%ymm7\n"
          "vaddps %%ymm4, %%ymm0, %%ymm8\n"
          "vsubps %%ymm4, %%ymm0, %%ymm12\n"
          "vaddps %%ymm5, %%ymm1, %%ymm9\n"
          "vsubps %%ymm5, %%ymm1, %%ymm13\n"
          "vaddps %%ymm6, %%ymm2, %%ymm10\n"
          "vsubps %%ymm6, %%ymm2, %%ymm14\n"
          "vaddps %%ymm7, %%ymm3, %%ymm11\n"
          "vsubps %%ymm7, %%ymm3, %%ymm15\n"
          "vmovups %%ymm8, (%0)\n"
          "vmovups %%ymm9, (%1)\n"
          "vmovups %%ymm10, (%2)\n"
          "vmovups %%ymm11, (%3)\n"
          "vmovups %%ymm12, (%4)\n"
          "vmovups %%ymm13, (%5)\n"
          "vmovups %%ymm14, (%6)\n"
          "vmovups %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1310720), "r"(buf + j + k + 1572864), "r"(buf + j + k + 1835008) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 24) {
    helper_float_24_recursive(buf + 0, 21);
    helper_float_24_recursive(buf + 2097152, 21);
    helper_float_24_recursive(buf + 4194304, 21);
    helper_float_24_recursive(buf + 6291456, 21);
    helper_float_24_recursive(buf + 8388608, 21);
    helper_float_24_recursive(buf + 10485760, 21);
    helper_float_24_recursive(buf + 12582912, 21);
    helper_float_24_recursive(buf + 14680064, 21);
    for (int j = 0; j < 16777216; j += 16777216) {
      for (int k = 0; k < 2097152; k += 8) {
        __asm__ volatile (
          "vmovups (%0), %%ymm0\n"
          "vmovups (%1), %%ymm1\n"
          "vmovups (%2), %%ymm2\n"
          "vmovups (%3), %%ymm3\n"
          "vmovups (%4), %%ymm4\n"
          "vmovups (%5), %%ymm5\n"
          "vmovups (%6), %%ymm6\n"
          "vmovups (%7), %%ymm7\n"
          "vaddps %%ymm1, %%ymm0, %%ymm8\n"
          "vsubps %%ymm1, %%ymm0, %%ymm9\n"
          "vaddps %%ymm3, %%ymm2, %%ymm10\n"
          "vsubps %%ymm3, %%ymm2, %%ymm11\n"
          "vaddps %%ymm5, %%ymm4, %%ymm12\n"
          "vsubps %%ymm5, %%ymm4, %%ymm13\n"
          "vaddps %%ymm7, %%ymm6, %%ymm14\n"
          "vsubps %%ymm7, %%ymm6, %%ymm15\n"
          "vaddps %%ymm10, %%ymm8, %%ymm0\n"
          "vsubps %%ymm10, %%ymm8, %%ymm2\n"
          "vaddps %%ymm11, %%ymm9, %%ymm1\n"
          "vsubps %%ymm11, %%ymm9, %%ymm3\n"
          "vaddps %%ymm14, %%ymm12, %%ymm4\n"
          "vsubps %%ymm14, %%ymm12, %%ymm6\n"
          "vaddps %%ymm15, %%ymm13, %%ymm5\n"
          "vsubps %%ymm15, %%ymm13, %%ymm7\n"
          "vaddps %%ymm4, %%ymm0, %%ymm8\n"
          "vsubps %%ymm4, %%ymm0, %%ymm12\n"
          "vaddps %%ymm5, %%ymm1, %%ymm9\n"
          "vsubps %%ymm5, %%ymm1, %%ymm13\n"
          "vaddps %%ymm6, %%ymm2, %%ymm10\n"
          "vsubps %%ymm6, %%ymm2, %%ymm14\n"
          "vaddps %%ymm7, %%ymm3, %%ymm11\n"
          "vsubps %%ymm7, %%ymm3, %%ymm15\n"
          "vmovups %%ymm8, (%0)\n"
          "vmovups %%ymm9, (%1)\n"
          "vmovups %%ymm10, (%2)\n"
          "vmovups %%ymm11, (%3)\n"
          "vmovups %%ymm12, (%4)\n"
          "vmovups %%ymm13, (%5)\n"
          "vmovups %%ymm14, (%6)\n"
          "vmovups %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 2097152), "r"(buf + j + k + 4194304), "r"(buf + j + k + 6291456), "r"(buf + j + k + 8388608), "r"(buf + j + k + 10485760), "r"(buf + j + k + 12582912), "r"(buf + j + k + 14680064) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
}
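/* Entry point: Hadamard transform of 2^24 floats, in place. */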
void helper_float_24(float *buf);
void helper_float_24(float *buf) {
  helper_float_24_recursive(buf, 24);
}
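/* For 2^25 floats the recursion bottoms out at depth 7: a 64-point
   in-register kernel per half-block, then one stride-64 radix-2 pass over
   each 128-float block. */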
void helper_float_25_recursive(float *buf, int depth);
void helper_float_25_recursive(float *buf, int depth) {
  if (depth == 7) {
    for (int j = 0; j < 128; j += 64) {
      for (int k = 0; k < 8; k += 8) {
        __asm__ volatile (
          "vmovups (%0), %%ymm0\n"
          "vmovups (%1), %%ymm1\n"
          "vmovups (%2), %%ymm2\n"
          "vmovups (%3), %%ymm3\n"
          "vmovups (%4), %%ymm4\n"
          "vmovups (%5), %%ymm5\n"
          "vmovups (%6), %%ymm6\n"
          "vmovups (%7), %%ymm7\n"
          "vpermilps $160, %%ymm0, %%ymm8\n"
          "vpermilps $245, %%ymm0, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm0\n"
          "vpermilps $160, %%ymm1, %%ymm8\n"
          "vpermilps $245, %%ymm1, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm1\n"
          "vpermilps $160, %%ymm2, %%ymm8\n"
          "vpermilps $245, %%ymm2, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm2\n"
          "vpermilps $160, %%ymm3, %%ymm8\n"
          "vpermilps $245, %%ymm3, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm3\n"
          "vpermilps $160, %%ymm4, %%ymm8\n"
          "vpermilps $245, %%ymm4, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm4\n"
          "vpermilps $160, %%ymm5, %%ymm8\n"
          "vpermilps $245, %%ymm5, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm5\n"
          "vpermilps $160, %%ymm6, %%ymm8\n"
          "vpermilps $245, %%ymm6, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm6\n"
          "vpermilps $160, %%ymm7, %%ymm8\n"
          "vpermilps $245, %%ymm7, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm7\n"
          "vpermilps $68, %%ymm0, %%ymm8\n"
          "vpermilps $238, %%ymm0, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm0\n"
          "vpermilps $68, %%ymm1, %%ymm8\n"
          "vpermilps $238, %%ymm1, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm1\n"
          "vpermilps $68, %%ymm2, %%ymm8\n"
          "vpermilps $238, %%ymm2, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm2\n"
          "vpermilps $68, %%ymm3, %%ymm8\n"
          "vpermilps $238, %%ymm3, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm3\n"
          "vpermilps $68, %%ymm4, %%ymm8\n"
          "vpermilps $238, %%ymm4, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm4\n"
          "vpermilps $68, %%ymm5, %%ymm8\n"
          "vpermilps $238, %%ymm5, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm5\n"
          "vpermilps $68, %%ymm6, %%ymm8\n"
          "vpermilps $238, %%ymm6, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm6\n"
          "vpermilps $68, %%ymm7, %%ymm8\n"
          "vpermilps $238, %%ymm7, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm7\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm0, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm0\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm1, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm1\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm2, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm2\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm3, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm3\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm4, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm4\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm5, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm5\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm6, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm6\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm7, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm7\n"
          "vaddps %%ymm1, %%ymm0, %%ymm8\n"
          "vsubps %%ymm1, %%ymm0, %%ymm9\n"
          "vaddps %%ymm3, %%ymm2, %%ymm10\n"
          "vsubps %%ymm3, %%ymm2, %%ymm11\n"
          "vaddps %%ymm5, %%ymm4, %%ymm12\n"
          "vsubps %%ymm5, %%ymm4, %%ymm13\n"
          "vaddps %%ymm7, %%ymm6, %%ymm14\n"
          "vsubps %%ymm7, %%ymm6, %%ymm15\n"
          "vaddps %%ymm10, %%ymm8, %%ymm0\n"
          "vsubps %%ymm10, %%ymm8, %%ymm2\n"
          "vaddps %%ymm11, %%ymm9, %%ymm1\n"
          "vsubps %%ymm11, %%ymm9, %%ymm3\n"
          "vaddps %%ymm14, %%ymm12, %%ymm4\n"
          "vsubps %%ymm14, %%ymm12, %%ymm6\n"
          "vaddps %%ymm15, %%ymm13, %%ymm5\n"
          "vsubps %%ymm15, %%ymm13, %%ymm7\n"
          "vaddps %%ymm4, %%ymm0, %%ymm8\n"
          "vsubps %%ymm4, %%ymm0, %%ymm12\n"
          "vaddps %%ymm5, %%ymm1, %%ymm9\n"
          "vsubps %%ymm5, %%ymm1, %%ymm13\n"
          "vaddps %%ymm6, %%ymm2, %%ymm10\n"
          "vsubps %%ymm6, %%ymm2, %%ymm14\n"
          "vaddps %%ymm7, %%ymm3, %%ymm11\n"
          "vsubps %%ymm7, %%ymm3, %%ymm15\n"
          "vmovups %%ymm8, (%0)\n"
          "vmovups %%ymm9, (%1)\n"
          "vmovups %%ymm10, (%2)\n"
          "vmovups %%ymm11, (%3)\n"
          "vmovups %%ymm12, (%4)\n"
          "vmovups %%ymm13, (%5)\n"
          "vmovups %%ymm14, (%6)\n"
          "vmovups %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
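    /* Final stage of the depth-7 base case: stride-64 radix-2 butterflies
       combine the two 64-float halves of each 128-float block. */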
6751     for (int j = 0; j < 128; j += 128) {
6752       for (int k = 0; k < 64; k += 8) {
6753         __asm__ volatile (
6754           "vmovups (%0), %%ymm0\n"
6755           "vmovups (%1), %%ymm1\n"
6756           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
6757           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
6758           "vmovups %%ymm8, (%0)\n"
6759           "vmovups %%ymm9, (%1)\n"
6760           :: "r"(buf + j + k + 0), "r"(buf + j + k + 64) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
6761         );
6762       }
6763     }
6764     return;
6765   }
  if (depth == 10) {
    helper_float_25_recursive(buf + 0, 7);
    helper_float_25_recursive(buf + 128, 7);
    helper_float_25_recursive(buf + 256, 7);
    helper_float_25_recursive(buf + 384, 7);
    helper_float_25_recursive(buf + 512, 7);
    helper_float_25_recursive(buf + 640, 7);
    helper_float_25_recursive(buf + 768, 7);
    helper_float_25_recursive(buf + 896, 7);
    for (int j = 0; j < 1024; j += 1024) {
      for (int k = 0; k < 128; k += 8) {
        __asm__ volatile (
          "vmovups (%0), %%ymm0\n"
          "vmovups (%1), %%ymm1\n"
          "vmovups (%2), %%ymm2\n"
          "vmovups (%3), %%ymm3\n"
          "vmovups (%4), %%ymm4\n"
          "vmovups (%5), %%ymm5\n"
          "vmovups (%6), %%ymm6\n"
          "vmovups (%7), %%ymm7\n"
          "vaddps %%ymm1, %%ymm0, %%ymm8\n"
          "vsubps %%ymm1, %%ymm0, %%ymm9\n"
          "vaddps %%ymm3, %%ymm2, %%ymm10\n"
          "vsubps %%ymm3, %%ymm2, %%ymm11\n"
          "vaddps %%ymm5, %%ymm4, %%ymm12\n"
          "vsubps %%ymm5, %%ymm4, %%ymm13\n"
          "vaddps %%ymm7, %%ymm6, %%ymm14\n"
          "vsubps %%ymm7, %%ymm6, %%ymm15\n"
          "vaddps %%ymm10, %%ymm8, %%ymm0\n"
          "vsubps %%ymm10, %%ymm8, %%ymm2\n"
          "vaddps %%ymm11, %%ymm9, %%ymm1\n"
          "vsubps %%ymm11, %%ymm9, %%ymm3\n"
          "vaddps %%ymm14, %%ymm12, %%ymm4\n"
          "vsubps %%ymm14, %%ymm12, %%ymm6\n"
          "vaddps %%ymm15, %%ymm13, %%ymm5\n"
          "vsubps %%ymm15, %%ymm13, %%ymm7\n"
          "vaddps %%ymm4, %%ymm0, %%ymm8\n"
          "vsubps %%ymm4, %%ymm0, %%ymm12\n"
          "vaddps %%ymm5, %%ymm1, %%ymm9\n"
          "vsubps %%ymm5, %%ymm1, %%ymm13\n"
          "vaddps %%ymm6, %%ymm2, %%ymm10\n"
          "vsubps %%ymm6, %%ymm2, %%ymm14\n"
          "vaddps %%ymm7, %%ymm3, %%ymm11\n"
          "vsubps %%ymm7, %%ymm3, %%ymm15\n"
          "vmovups %%ymm8, (%0)\n"
          "vmovups %%ymm9, (%1)\n"
          "vmovups %%ymm10, (%2)\n"
          "vmovups %%ymm11, (%3)\n"
          "vmovups %%ymm12, (%4)\n"
          "vmovups %%ymm13, (%5)\n"
          "vmovups %%ymm14, (%6)\n"
          "vmovups %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 128), "r"(buf + j + k + 256), "r"(buf + j + k + 384), "r"(buf + j + k + 512), "r"(buf + j + k + 640), "r"(buf + j + k + 768), "r"(buf + j + k + 896) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 13) {
    helper_float_25_recursive(buf + 0, 10);
    helper_float_25_recursive(buf + 1024, 10);
    helper_float_25_recursive(buf + 2048, 10);
    helper_float_25_recursive(buf + 3072, 10);
    helper_float_25_recursive(buf + 4096, 10);
    helper_float_25_recursive(buf + 5120, 10);
    helper_float_25_recursive(buf + 6144, 10);
    helper_float_25_recursive(buf + 7168, 10);
    for (int j = 0; j < 8192; j += 8192) {
      for (int k = 0; k < 1024; k += 8) {
        __asm__ volatile (
          "vmovups (%0), %%ymm0\n"
          "vmovups (%1), %%ymm1\n"
          "vmovups (%2), %%ymm2\n"
          "vmovups (%3), %%ymm3\n"
          "vmovups (%4), %%ymm4\n"
          "vmovups (%5), %%ymm5\n"
          "vmovups (%6), %%ymm6\n"
          "vmovups (%7), %%ymm7\n"
          "vaddps %%ymm1, %%ymm0, %%ymm8\n"
          "vsubps %%ymm1, %%ymm0, %%ymm9\n"
          "vaddps %%ymm3, %%ymm2, %%ymm10\n"
          "vsubps %%ymm3, %%ymm2, %%ymm11\n"
          "vaddps %%ymm5, %%ymm4, %%ymm12\n"
          "vsubps %%ymm5, %%ymm4, %%ymm13\n"
          "vaddps %%ymm7, %%ymm6, %%ymm14\n"
          "vsubps %%ymm7, %%ymm6, %%ymm15\n"
          "vaddps %%ymm10, %%ymm8, %%ymm0\n"
          "vsubps %%ymm10, %%ymm8, %%ymm2\n"
          "vaddps %%ymm11, %%ymm9, %%ymm1\n"
          "vsubps %%ymm11, %%ymm9, %%ymm3\n"
          "vaddps %%ymm14, %%ymm12, %%ymm4\n"
          "vsubps %%ymm14, %%ymm12, %%ymm6\n"
          "vaddps %%ymm15, %%ymm13, %%ymm5\n"
          "vsubps %%ymm15, %%ymm13, %%ymm7\n"
          "vaddps %%ymm4, %%ymm0, %%ymm8\n"
          "vsubps %%ymm4, %%ymm0, %%ymm12\n"
          "vaddps %%ymm5, %%ymm1, %%ymm9\n"
          "vsubps %%ymm5, %%ymm1, %%ymm13\n"
          "vaddps %%ymm6, %%ymm2, %%ymm10\n"
          "vsubps %%ymm6, %%ymm2, %%ymm14\n"
          "vaddps %%ymm7, %%ymm3, %%ymm11\n"
          "vsubps %%ymm7, %%ymm3, %%ymm15\n"
          "vmovups %%ymm8, (%0)\n"
          "vmovups %%ymm9, (%1)\n"
          "vmovups %%ymm10, (%2)\n"
          "vmovups %%ymm11, (%3)\n"
          "vmovups %%ymm12, (%4)\n"
          "vmovups %%ymm13, (%5)\n"
          "vmovups %%ymm14, (%6)\n"
          "vmovups %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 1024), "r"(buf + j + k + 2048), "r"(buf + j + k + 3072), "r"(buf + j + k + 4096), "r"(buf + j + k + 5120), "r"(buf + j + k + 6144), "r"(buf + j + k + 7168) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 16) {
    helper_float_25_recursive(buf + 0, 13);
    helper_float_25_recursive(buf + 8192, 13);
    helper_float_25_recursive(buf + 16384, 13);
    helper_float_25_recursive(buf + 24576, 13);
    helper_float_25_recursive(buf + 32768, 13);
    helper_float_25_recursive(buf + 40960, 13);
    helper_float_25_recursive(buf + 49152, 13);
    helper_float_25_recursive(buf + 57344, 13);
    for (int j = 0; j < 65536; j += 65536) {
      for (int k = 0; k < 8192; k += 8) {
        __asm__ volatile (
          "vmovups (%0), %%ymm0\n"
          "vmovups (%1), %%ymm1\n"
          "vmovups (%2), %%ymm2\n"
          "vmovups (%3), %%ymm3\n"
          "vmovups (%4), %%ymm4\n"
          "vmovups (%5), %%ymm5\n"
          "vmovups (%6), %%ymm6\n"
          "vmovups (%7), %%ymm7\n"
          "vaddps %%ymm1, %%ymm0, %%ymm8\n"
          "vsubps %%ymm1, %%ymm0, %%ymm9\n"
          "vaddps %%ymm3, %%ymm2, %%ymm10\n"
          "vsubps %%ymm3, %%ymm2, %%ymm11\n"
          "vaddps %%ymm5, %%ymm4, %%ymm12\n"
          "vsubps %%ymm5, %%ymm4, %%ymm13\n"
          "vaddps %%ymm7, %%ymm6, %%ymm14\n"
          "vsubps %%ymm7, %%ymm6, %%ymm15\n"
          "vaddps %%ymm10, %%ymm8, %%ymm0\n"
          "vsubps %%ymm10, %%ymm8, %%ymm2\n"
          "vaddps %%ymm11, %%ymm9, %%ymm1\n"
          "vsubps %%ymm11, %%ymm9, %%ymm3\n"
          "vaddps %%ymm14, %%ymm12, %%ymm4\n"
          "vsubps %%ymm14, %%ymm12, %%ymm6\n"
          "vaddps %%ymm15, %%ymm13, %%ymm5\n"
          "vsubps %%ymm15, %%ymm13, %%ymm7\n"
          "vaddps %%ymm4, %%ymm0, %%ymm8\n"
          "vsubps %%ymm4, %%ymm0, %%ymm12\n"
          "vaddps %%ymm5, %%ymm1, %%ymm9\n"
          "vsubps %%ymm5, %%ymm1, %%ymm13\n"
          "vaddps %%ymm6, %%ymm2, %%ymm10\n"
          "vsubps %%ymm6, %%ymm2, %%ymm14\n"
          "vaddps %%ymm7, %%ymm3, %%ymm11\n"
          "vsubps %%ymm7, %%ymm3, %%ymm15\n"
          "vmovups %%ymm8, (%0)\n"
          "vmovups %%ymm9, (%1)\n"
          "vmovups %%ymm10, (%2)\n"
          "vmovups %%ymm11, (%3)\n"
          "vmovups %%ymm12, (%4)\n"
          "vmovups %%ymm13, (%5)\n"
          "vmovups %%ymm14, (%6)\n"
          "vmovups %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 8192), "r"(buf + j + k + 16384), "r"(buf + j + k + 24576), "r"(buf + j + k + 32768), "r"(buf + j + k + 40960), "r"(buf + j + k + 49152), "r"(buf + j + k + 57344) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 19) {
    helper_float_25_recursive(buf + 0, 16);
    helper_float_25_recursive(buf + 65536, 16);
    helper_float_25_recursive(buf + 131072, 16);
    helper_float_25_recursive(buf + 196608, 16);
    helper_float_25_recursive(buf + 262144, 16);
    helper_float_25_recursive(buf + 327680, 16);
    helper_float_25_recursive(buf + 393216, 16);
    helper_float_25_recursive(buf + 458752, 16);
    for (int j = 0; j < 524288; j += 524288) {
      for (int k = 0; k < 65536; k += 8) {
        __asm__ volatile (
          "vmovups (%0), %%ymm0\n"
          "vmovups (%1), %%ymm1\n"
          "vmovups (%2), %%ymm2\n"
          "vmovups (%3), %%ymm3\n"
          "vmovups (%4), %%ymm4\n"
          "vmovups (%5), %%ymm5\n"
          "vmovups (%6), %%ymm6\n"
          "vmovups (%7), %%ymm7\n"
          "vaddps %%ymm1, %%ymm0, %%ymm8\n"
          "vsubps %%ymm1, %%ymm0, %%ymm9\n"
          "vaddps %%ymm3, %%ymm2, %%ymm10\n"
          "vsubps %%ymm3, %%ymm2, %%ymm11\n"
          "vaddps %%ymm5, %%ymm4, %%ymm12\n"
          "vsubps %%ymm5, %%ymm4, %%ymm13\n"
          "vaddps %%ymm7, %%ymm6, %%ymm14\n"
          "vsubps %%ymm7, %%ymm6, %%ymm15\n"
          "vaddps %%ymm10, %%ymm8, %%ymm0\n"
          "vsubps %%ymm10, %%ymm8, %%ymm2\n"
          "vaddps %%ymm11, %%ymm9, %%ymm1\n"
          "vsubps %%ymm11, %%ymm9, %%ymm3\n"
          "vaddps %%ymm14, %%ymm12, %%ymm4\n"
          "vsubps %%ymm14, %%ymm12, %%ymm6\n"
          "vaddps %%ymm15, %%ymm13, %%ymm5\n"
          "vsubps %%ymm15, %%ymm13, %%ymm7\n"
          "vaddps %%ymm4, %%ymm0, %%ymm8\n"
          "vsubps %%ymm4, %%ymm0, %%ymm12\n"
          "vaddps %%ymm5, %%ymm1, %%ymm9\n"
          "vsubps %%ymm5, %%ymm1, %%ymm13\n"
          "vaddps %%ymm6, %%ymm2, %%ymm10\n"
          "vsubps %%ymm6, %%ymm2, %%ymm14\n"
          "vaddps %%ymm7, %%ymm3, %%ymm11\n"
          "vsubps %%ymm7, %%ymm3, %%ymm15\n"
          "vmovups %%ymm8, (%0)\n"
          "vmovups %%ymm9, (%1)\n"
          "vmovups %%ymm10, (%2)\n"
          "vmovups %%ymm11, (%3)\n"
          "vmovups %%ymm12, (%4)\n"
          "vmovups %%ymm13, (%5)\n"
          "vmovups %%ymm14, (%6)\n"
          "vmovups %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 65536), "r"(buf + j + k + 131072), "r"(buf + j + k + 196608), "r"(buf + j + k + 262144), "r"(buf + j + k + 327680), "r"(buf + j + k + 393216), "r"(buf + j + k + 458752) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 22) {
    helper_float_25_recursive(buf + 0, 19);
    helper_float_25_recursive(buf + 524288, 19);
    helper_float_25_recursive(buf + 1048576, 19);
    helper_float_25_recursive(buf + 1572864, 19);
    helper_float_25_recursive(buf + 2097152, 19);
    helper_float_25_recursive(buf + 2621440, 19);
    helper_float_25_recursive(buf + 3145728, 19);
    helper_float_25_recursive(buf + 3670016, 19);
    for (int j = 0; j < 4194304; j += 4194304) {
      for (int k = 0; k < 524288; k += 8) {
        __asm__ volatile (
          "vmovups (%0), %%ymm0\n"
          "vmovups (%1), %%ymm1\n"
          "vmovups (%2), %%ymm2\n"
          "vmovups (%3), %%ymm3\n"
          "vmovups (%4), %%ymm4\n"
          "vmovups (%5), %%ymm5\n"
          "vmovups (%6), %%ymm6\n"
          "vmovups (%7), %%ymm7\n"
          "vaddps %%ymm1, %%ymm0, %%ymm8\n"
          "vsubps %%ymm1, %%ymm0, %%ymm9\n"
          "vaddps %%ymm3, %%ymm2, %%ymm10\n"
          "vsubps %%ymm3, %%ymm2, %%ymm11\n"
          "vaddps %%ymm5, %%ymm4, %%ymm12\n"
          "vsubps %%ymm5, %%ymm4, %%ymm13\n"
          "vaddps %%ymm7, %%ymm6, %%ymm14\n"
          "vsubps %%ymm7, %%ymm6, %%ymm15\n"
          "vaddps %%ymm10, %%ymm8, %%ymm0\n"
          "vsubps %%ymm10, %%ymm8, %%ymm2\n"
          "vaddps %%ymm11, %%ymm9, %%ymm1\n"
          "vsubps %%ymm11, %%ymm9, %%ymm3\n"
          "vaddps %%ymm14, %%ymm12, %%ymm4\n"
          "vsubps %%ymm14, %%ymm12, %%ymm6\n"
          "vaddps %%ymm15, %%ymm13, %%ymm5\n"
          "vsubps %%ymm15, %%ymm13, %%ymm7\n"
          "vaddps %%ymm4, %%ymm0, %%ymm8\n"
          "vsubps %%ymm4, %%ymm0, %%ymm12\n"
          "vaddps %%ymm5, %%ymm1, %%ymm9\n"
          "vsubps %%ymm5, %%ymm1, %%ymm13\n"
          "vaddps %%ymm6, %%ymm2, %%ymm10\n"
          "vsubps %%ymm6, %%ymm2, %%ymm14\n"
          "vaddps %%ymm7, %%ymm3, %%ymm11\n"
          "vsubps %%ymm7, %%ymm3, %%ymm15\n"
          "vmovups %%ymm8, (%0)\n"
          "vmovups %%ymm9, (%1)\n"
          "vmovups %%ymm10, (%2)\n"
          "vmovups %%ymm11, (%3)\n"
          "vmovups %%ymm12, (%4)\n"
          "vmovups %%ymm13, (%5)\n"
          "vmovups %%ymm14, (%6)\n"
          "vmovups %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 524288), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1572864), "r"(buf + j + k + 2097152), "r"(buf + j + k + 2621440), "r"(buf + j + k + 3145728), "r"(buf + j + k + 3670016) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 25) {
    helper_float_25_recursive(buf + 0, 22);
    helper_float_25_recursive(buf + 4194304, 22);
    helper_float_25_recursive(buf + 8388608, 22);
    helper_float_25_recursive(buf + 12582912, 22);
    helper_float_25_recursive(buf + 16777216, 22);
    helper_float_25_recursive(buf + 20971520, 22);
    helper_float_25_recursive(buf + 25165824, 22);
    helper_float_25_recursive(buf + 29360128, 22);
    for (int j = 0; j < 33554432; j += 33554432) {
      for (int k = 0; k < 4194304; k += 8) {
        __asm__ volatile (
          "vmovups (%0), %%ymm0\n"
          "vmovups (%1), %%ymm1\n"
          "vmovups (%2), %%ymm2\n"
          "vmovups (%3), %%ymm3\n"
          "vmovups (%4), %%ymm4\n"
          "vmovups (%5), %%ymm5\n"
          "vmovups (%6), %%ymm6\n"
          "vmovups (%7), %%ymm7\n"
          "vaddps %%ymm1, %%ymm0, %%ymm8\n"
          "vsubps %%ymm1, %%ymm0, %%ymm9\n"
          "vaddps %%ymm3, %%ymm2, %%ymm10\n"
          "vsubps %%ymm3, %%ymm2, %%ymm11\n"
          "vaddps %%ymm5, %%ymm4, %%ymm12\n"
          "vsubps %%ymm5, %%ymm4, %%ymm13\n"
          "vaddps %%ymm7, %%ymm6, %%ymm14\n"
          "vsubps %%ymm7, %%ymm6, %%ymm15\n"
          "vaddps %%ymm10, %%ymm8, %%ymm0\n"
          "vsubps %%ymm10, %%ymm8, %%ymm2\n"
          "vaddps %%ymm11, %%ymm9, %%ymm1\n"
          "vsubps %%ymm11, %%ymm9, %%ymm3\n"
          "vaddps %%ymm14, %%ymm12, %%ymm4\n"
          "vsubps %%ymm14, %%ymm12, %%ymm6\n"
          "vaddps %%ymm15, %%ymm13, %%ymm5\n"
          "vsubps %%ymm15, %%ymm13, %%ymm7\n"
          "vaddps %%ymm4, %%ymm0, %%ymm8\n"
          "vsubps %%ymm4, %%ymm0, %%ymm12\n"
          "vaddps %%ymm5, %%ymm1, %%ymm9\n"
          "vsubps %%ymm5, %%ymm1, %%ymm13\n"
          "vaddps %%ymm6, %%ymm2, %%ymm10\n"
          "vsubps %%ymm6, %%ymm2, %%ymm14\n"
          "vaddps %%ymm7, %%ymm3, %%ymm11\n"
          "vsubps %%ymm7, %%ymm3, %%ymm15\n"
          "vmovups %%ymm8, (%0)\n"
          "vmovups %%ymm9, (%1)\n"
          "vmovups %%ymm10, (%2)\n"
          "vmovups %%ymm11, (%3)\n"
          "vmovups %%ymm12, (%4)\n"
          "vmovups %%ymm13, (%5)\n"
          "vmovups %%ymm14, (%6)\n"
          "vmovups %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 4194304), "r"(buf + j + k + 8388608), "r"(buf + j + k + 12582912), "r"(buf + j + k + 16777216), "r"(buf + j + k + 20971520), "r"(buf + j + k + 25165824), "r"(buf + j + k + 29360128) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
}
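/*
 * Entry point: in-place, unnormalized Walsh-Hadamard transform of 2^25
 * floats, started at full recursion depth.
 */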
void helper_float_25(float *buf);
void helper_float_25(float *buf) {
  helper_float_25_recursive(buf, 25);
}
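/*
 * Same scheme for 2^26 floats. The depth-12 base case below finishes a full
 * 4096-point transform in three cache-friendly passes: pass 1 does levels
 * 1-6 of each 64-float chunk entirely in registers (vpermilps/vaddsubps for
 * stride 1, vpermilps/vblendps for stride 2, vperm2f128 for stride 4, then a
 * radix-8 butterfly across eight ymm registers for strides 8/16/32); passes
 * 2 and 3 finish levels 7-12.
 */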
void helper_float_26_recursive(float *buf, int depth);
void helper_float_26_recursive(float *buf, int depth) {
  if (depth == 12) {
    for (int j = 0; j < 4096; j += 64) {
      for (int k = 0; k < 8; k += 8) {
        __asm__ volatile (
          "vmovups (%0), %%ymm0\n"
          "vmovups (%1), %%ymm1\n"
          "vmovups (%2), %%ymm2\n"
          "vmovups (%3), %%ymm3\n"
          "vmovups (%4), %%ymm4\n"
          "vmovups (%5), %%ymm5\n"
          "vmovups (%6), %%ymm6\n"
          "vmovups (%7), %%ymm7\n"
          "vpermilps $160, %%ymm0, %%ymm8\n"
          "vpermilps $245, %%ymm0, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm0\n"
          "vpermilps $160, %%ymm1, %%ymm8\n"
          "vpermilps $245, %%ymm1, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm1\n"
          "vpermilps $160, %%ymm2, %%ymm8\n"
          "vpermilps $245, %%ymm2, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm2\n"
          "vpermilps $160, %%ymm3, %%ymm8\n"
          "vpermilps $245, %%ymm3, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm3\n"
          "vpermilps $160, %%ymm4, %%ymm8\n"
          "vpermilps $245, %%ymm4, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm4\n"
          "vpermilps $160, %%ymm5, %%ymm8\n"
          "vpermilps $245, %%ymm5, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm5\n"
          "vpermilps $160, %%ymm6, %%ymm8\n"
          "vpermilps $245, %%ymm6, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm6\n"
          "vpermilps $160, %%ymm7, %%ymm8\n"
          "vpermilps $245, %%ymm7, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm7\n"
          "vpermilps $68, %%ymm0, %%ymm8\n"
          "vpermilps $238, %%ymm0, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm0\n"
          "vpermilps $68, %%ymm1, %%ymm8\n"
          "vpermilps $238, %%ymm1, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm1\n"
          "vpermilps $68, %%ymm2, %%ymm8\n"
          "vpermilps $238, %%ymm2, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm2\n"
          "vpermilps $68, %%ymm3, %%ymm8\n"
          "vpermilps $238, %%ymm3, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm3\n"
          "vpermilps $68, %%ymm4, %%ymm8\n"
          "vpermilps $238, %%ymm4, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm4\n"
          "vpermilps $68, %%ymm5, %%ymm8\n"
          "vpermilps $238, %%ymm5, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm5\n"
          "vpermilps $68, %%ymm6, %%ymm8\n"
          "vpermilps $238, %%ymm6, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm6\n"
          "vpermilps $68, %%ymm7, %%ymm8\n"
          "vpermilps $238, %%ymm7, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm7\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm0, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm0\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm1, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm1\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm2, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm2\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm3, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm3\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm4, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm4\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm5, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm5\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm6, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm6\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm7, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm7\n"
          "vaddps %%ymm1, %%ymm0, %%ymm8\n"
          "vsubps %%ymm1, %%ymm0, %%ymm9\n"
          "vaddps %%ymm3, %%ymm2, %%ymm10\n"
          "vsubps %%ymm3, %%ymm2, %%ymm11\n"
          "vaddps %%ymm5, %%ymm4, %%ymm12\n"
          "vsubps %%ymm5, %%ymm4, %%ymm13\n"
          "vaddps %%ymm7, %%ymm6, %%ymm14\n"
          "vsubps %%ymm7, %%ymm6, %%ymm15\n"
          "vaddps %%ymm10, %%ymm8, %%ymm0\n"
          "vsubps %%ymm10, %%ymm8, %%ymm2\n"
          "vaddps %%ymm11, %%ymm9, %%ymm1\n"
          "vsubps %%ymm11, %%ymm9, %%ymm3\n"
          "vaddps %%ymm14, %%ymm12, %%ymm4\n"
          "vsubps %%ymm14, %%ymm12, %%ymm6\n"
          "vaddps %%ymm15, %%ymm13, %%ymm5\n"
          "vsubps %%ymm15, %%ymm13, %%ymm7\n"
          "vaddps %%ymm4, %%ymm0, %%ymm8\n"
          "vsubps %%ymm4, %%ymm0, %%ymm12\n"
          "vaddps %%ymm5, %%ymm1, %%ymm9\n"
          "vsubps %%ymm5, %%ymm1, %%ymm13\n"
          "vaddps %%ymm6, %%ymm2, %%ymm10\n"
          "vsubps %%ymm6, %%ymm2, %%ymm14\n"
          "vaddps %%ymm7, %%ymm3, %%ymm11\n"
          "vsubps %%ymm7, %%ymm3, %%ymm15\n"
          "vmovups %%ymm8, (%0)\n"
          "vmovups %%ymm9, (%1)\n"
          "vmovups %%ymm10, (%2)\n"
          "vmovups %%ymm11, (%3)\n"
          "vmovups %%ymm12, (%4)\n"
          "vmovups %%ymm13, (%5)\n"
          "vmovups %%ymm14, (%6)\n"
          "vmovups %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
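    /* Pass 2: levels 7-9 (strides 64/128/256), one radix-8 butterfly across
       the eight 64-float sub-blocks of each 512-float block. */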
    for (int j = 0; j < 4096; j += 512) {
      for (int k = 0; k < 64; k += 8) {
        __asm__ volatile (
          "vmovups (%0), %%ymm0\n"
          "vmovups (%1), %%ymm1\n"
          "vmovups (%2), %%ymm2\n"
          "vmovups (%3), %%ymm3\n"
          "vmovups (%4), %%ymm4\n"
          "vmovups (%5), %%ymm5\n"
          "vmovups (%6), %%ymm6\n"
          "vmovups (%7), %%ymm7\n"
          "vaddps %%ymm1, %%ymm0, %%ymm8\n"
          "vsubps %%ymm1, %%ymm0, %%ymm9\n"
          "vaddps %%ymm3, %%ymm2, %%ymm10\n"
          "vsubps %%ymm3, %%ymm2, %%ymm11\n"
          "vaddps %%ymm5, %%ymm4, %%ymm12\n"
          "vsubps %%ymm5, %%ymm4, %%ymm13\n"
          "vaddps %%ymm7, %%ymm6, %%ymm14\n"
          "vsubps %%ymm7, %%ymm6, %%ymm15\n"
          "vaddps %%ymm10, %%ymm8, %%ymm0\n"
          "vsubps %%ymm10, %%ymm8, %%ymm2\n"
          "vaddps %%ymm11, %%ymm9, %%ymm1\n"
          "vsubps %%ymm11, %%ymm9, %%ymm3\n"
          "vaddps %%ymm14, %%ymm12, %%ymm4\n"
          "vsubps %%ymm14, %%ymm12, %%ymm6\n"
          "vaddps %%ymm15, %%ymm13, %%ymm5\n"
          "vsubps %%ymm15, %%ymm13, %%ymm7\n"
          "vaddps %%ymm4, %%ymm0, %%ymm8\n"
          "vsubps %%ymm4, %%ymm0, %%ymm12\n"
          "vaddps %%ymm5, %%ymm1, %%ymm9\n"
          "vsubps %%ymm5, %%ymm1, %%ymm13\n"
          "vaddps %%ymm6, %%ymm2, %%ymm10\n"
          "vsubps %%ymm6, %%ymm2, %%ymm14\n"
          "vaddps %%ymm7, %%ymm3, %%ymm11\n"
          "vsubps %%ymm7, %%ymm3, %%ymm15\n"
          "vmovups %%ymm8, (%0)\n"
          "vmovups %%ymm9, (%1)\n"
          "vmovups %%ymm10, (%2)\n"
          "vmovups %%ymm11, (%3)\n"
          "vmovups %%ymm12, (%4)\n"
          "vmovups %%ymm13, (%5)\n"
          "vmovups %%ymm14, (%6)\n"
          "vmovups %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
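    /* Pass 3: levels 10-12 (strides 512/1024/2048) complete the 4096-point
       base case. */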
    for (int j = 0; j < 4096; j += 4096) {
      for (int k = 0; k < 512; k += 8) {
        __asm__ volatile (
          "vmovups (%0), %%ymm0\n"
          "vmovups (%1), %%ymm1\n"
          "vmovups (%2), %%ymm2\n"
          "vmovups (%3), %%ymm3\n"
          "vmovups (%4), %%ymm4\n"
          "vmovups (%5), %%ymm5\n"
          "vmovups (%6), %%ymm6\n"
          "vmovups (%7), %%ymm7\n"
          "vaddps %%ymm1, %%ymm0, %%ymm8\n"
          "vsubps %%ymm1, %%ymm0, %%ymm9\n"
          "vaddps %%ymm3, %%ymm2, %%ymm10\n"
          "vsubps %%ymm3, %%ymm2, %%ymm11\n"
          "vaddps %%ymm5, %%ymm4, %%ymm12\n"
          "vsubps %%ymm5, %%ymm4, %%ymm13\n"
          "vaddps %%ymm7, %%ymm6, %%ymm14\n"
          "vsubps %%ymm7, %%ymm6, %%ymm15\n"
          "vaddps %%ymm10, %%ymm8, %%ymm0\n"
          "vsubps %%ymm10, %%ymm8, %%ymm2\n"
          "vaddps %%ymm11, %%ymm9, %%ymm1\n"
          "vsubps %%ymm11, %%ymm9, %%ymm3\n"
          "vaddps %%ymm14, %%ymm12, %%ymm4\n"
          "vsubps %%ymm14, %%ymm12, %%ymm6\n"
          "vaddps %%ymm15, %%ymm13, %%ymm5\n"
          "vsubps %%ymm15, %%ymm13, %%ymm7\n"
          "vaddps %%ymm4, %%ymm0, %%ymm8\n"
          "vsubps %%ymm4, %%ymm0, %%ymm12\n"
          "vaddps %%ymm5, %%ymm1, %%ymm9\n"
          "vsubps %%ymm5, %%ymm1, %%ymm13\n"
          "vaddps %%ymm6, %%ymm2, %%ymm10\n"
          "vsubps %%ymm6, %%ymm2, %%ymm14\n"
          "vaddps %%ymm7, %%ymm3, %%ymm11\n"
          "vsubps %%ymm7, %%ymm3, %%ymm15\n"
          "vmovups %%ymm8, (%0)\n"
          "vmovups %%ymm9, (%1)\n"
          "vmovups %%ymm10, (%2)\n"
          "vmovups %%ymm11, (%3)\n"
          "vmovups %%ymm12, (%4)\n"
          "vmovups %%ymm13, (%5)\n"
          "vmovups %%ymm14, (%6)\n"
          "vmovups %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
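  /*
   * The remaining cases share one shape: split the block into eight
   * sub-blocks three levels down, transform each recursively, then merge
   * them with a single strided radix-8 pass; only the offsets change.
   */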
  if (depth == 15) {
    helper_float_26_recursive(buf + 0, 12);
    helper_float_26_recursive(buf + 4096, 12);
    helper_float_26_recursive(buf + 8192, 12);
    helper_float_26_recursive(buf + 12288, 12);
    helper_float_26_recursive(buf + 16384, 12);
    helper_float_26_recursive(buf + 20480, 12);
    helper_float_26_recursive(buf + 24576, 12);
    helper_float_26_recursive(buf + 28672, 12);
    for (int j = 0; j < 32768; j += 32768) {
      for (int k = 0; k < 4096; k += 8) {
        __asm__ volatile (
          "vmovups (%0), %%ymm0\n"
          "vmovups (%1), %%ymm1\n"
          "vmovups (%2), %%ymm2\n"
          "vmovups (%3), %%ymm3\n"
          "vmovups (%4), %%ymm4\n"
          "vmovups (%5), %%ymm5\n"
          "vmovups (%6), %%ymm6\n"
          "vmovups (%7), %%ymm7\n"
          "vaddps %%ymm1, %%ymm0, %%ymm8\n"
          "vsubps %%ymm1, %%ymm0, %%ymm9\n"
          "vaddps %%ymm3, %%ymm2, %%ymm10\n"
          "vsubps %%ymm3, %%ymm2, %%ymm11\n"
          "vaddps %%ymm5, %%ymm4, %%ymm12\n"
          "vsubps %%ymm5, %%ymm4, %%ymm13\n"
          "vaddps %%ymm7, %%ymm6, %%ymm14\n"
          "vsubps %%ymm7, %%ymm6, %%ymm15\n"
          "vaddps %%ymm10, %%ymm8, %%ymm0\n"
          "vsubps %%ymm10, %%ymm8, %%ymm2\n"
          "vaddps %%ymm11, %%ymm9, %%ymm1\n"
          "vsubps %%ymm11, %%ymm9, %%ymm3\n"
          "vaddps %%ymm14, %%ymm12, %%ymm4\n"
          "vsubps %%ymm14, %%ymm12, %%ymm6\n"
          "vaddps %%ymm15, %%ymm13, %%ymm5\n"
          "vsubps %%ymm15, %%ymm13, %%ymm7\n"
          "vaddps %%ymm4, %%ymm0, %%ymm8\n"
          "vsubps %%ymm4, %%ymm0, %%ymm12\n"
          "vaddps %%ymm5, %%ymm1, %%ymm9\n"
          "vsubps %%ymm5, %%ymm1, %%ymm13\n"
          "vaddps %%ymm6, %%ymm2, %%ymm10\n"
          "vsubps %%ymm6, %%ymm2, %%ymm14\n"
          "vaddps %%ymm7, %%ymm3, %%ymm11\n"
          "vsubps %%ymm7, %%ymm3, %%ymm15\n"
          "vmovups %%ymm8, (%0)\n"
          "vmovups %%ymm9, (%1)\n"
          "vmovups %%ymm10, (%2)\n"
          "vmovups %%ymm11, (%3)\n"
          "vmovups %%ymm12, (%4)\n"
          "vmovups %%ymm13, (%5)\n"
          "vmovups %%ymm14, (%6)\n"
          "vmovups %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 18) {
    helper_float_26_recursive(buf + 0, 15);
    helper_float_26_recursive(buf + 32768, 15);
    helper_float_26_recursive(buf + 65536, 15);
    helper_float_26_recursive(buf + 98304, 15);
    helper_float_26_recursive(buf + 131072, 15);
    helper_float_26_recursive(buf + 163840, 15);
    helper_float_26_recursive(buf + 196608, 15);
    helper_float_26_recursive(buf + 229376, 15);
    for (int j = 0; j < 262144; j += 262144) {
      for (int k = 0; k < 32768; k += 8) {
        __asm__ volatile (
          "vmovups (%0), %%ymm0\n"
          "vmovups (%1), %%ymm1\n"
          "vmovups (%2), %%ymm2\n"
          "vmovups (%3), %%ymm3\n"
          "vmovups (%4), %%ymm4\n"
          "vmovups (%5), %%ymm5\n"
          "vmovups (%6), %%ymm6\n"
          "vmovups (%7), %%ymm7\n"
          "vaddps %%ymm1, %%ymm0, %%ymm8\n"
          "vsubps %%ymm1, %%ymm0, %%ymm9\n"
          "vaddps %%ymm3, %%ymm2, %%ymm10\n"
          "vsubps %%ymm3, %%ymm2, %%ymm11\n"
          "vaddps %%ymm5, %%ymm4, %%ymm12\n"
          "vsubps %%ymm5, %%ymm4, %%ymm13\n"
          "vaddps %%ymm7, %%ymm6, %%ymm14\n"
          "vsubps %%ymm7, %%ymm6, %%ymm15\n"
          "vaddps %%ymm10, %%ymm8, %%ymm0\n"
          "vsubps %%ymm10, %%ymm8, %%ymm2\n"
          "vaddps %%ymm11, %%ymm9, %%ymm1\n"
          "vsubps %%ymm11, %%ymm9, %%ymm3\n"
          "vaddps %%ymm14, %%ymm12, %%ymm4\n"
          "vsubps %%ymm14, %%ymm12, %%ymm6\n"
          "vaddps %%ymm15, %%ymm13, %%ymm5\n"
          "vsubps %%ymm15, %%ymm13, %%ymm7\n"
          "vaddps %%ymm4, %%ymm0, %%ymm8\n"
          "vsubps %%ymm4, %%ymm0, %%ymm12\n"
          "vaddps %%ymm5, %%ymm1, %%ymm9\n"
          "vsubps %%ymm5, %%ymm1, %%ymm13\n"
          "vaddps %%ymm6, %%ymm2, %%ymm10\n"
          "vsubps %%ymm6, %%ymm2, %%ymm14\n"
          "vaddps %%ymm7, %%ymm3, %%ymm11\n"
          "vsubps %%ymm7, %%ymm3, %%ymm15\n"
          "vmovups %%ymm8, (%0)\n"
          "vmovups %%ymm9, (%1)\n"
          "vmovups %%ymm10, (%2)\n"
          "vmovups %%ymm11, (%3)\n"
          "vmovups %%ymm12, (%4)\n"
          "vmovups %%ymm13, (%5)\n"
          "vmovups %%ymm14, (%6)\n"
          "vmovups %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 21) {
    helper_float_26_recursive(buf + 0, 18);
    helper_float_26_recursive(buf + 262144, 18);
    helper_float_26_recursive(buf + 524288, 18);
    helper_float_26_recursive(buf + 786432, 18);
    helper_float_26_recursive(buf + 1048576, 18);
    helper_float_26_recursive(buf + 1310720, 18);
    helper_float_26_recursive(buf + 1572864, 18);
    helper_float_26_recursive(buf + 1835008, 18);
    for (int j = 0; j < 2097152; j += 2097152) {
      for (int k = 0; k < 262144; k += 8) {
        __asm__ volatile (
          "vmovups (%0), %%ymm0\n"
          "vmovups (%1), %%ymm1\n"
          "vmovups (%2), %%ymm2\n"
          "vmovups (%3), %%ymm3\n"
          "vmovups (%4), %%ymm4\n"
          "vmovups (%5), %%ymm5\n"
          "vmovups (%6), %%ymm6\n"
          "vmovups (%7), %%ymm7\n"
          "vaddps %%ymm1, %%ymm0, %%ymm8\n"
          "vsubps %%ymm1, %%ymm0, %%ymm9\n"
          "vaddps %%ymm3, %%ymm2, %%ymm10\n"
          "vsubps %%ymm3, %%ymm2, %%ymm11\n"
          "vaddps %%ymm5, %%ymm4, %%ymm12\n"
          "vsubps %%ymm5, %%ymm4, %%ymm13\n"
          "vaddps %%ymm7, %%ymm6, %%ymm14\n"
          "vsubps %%ymm7, %%ymm6, %%ymm15\n"
          "vaddps %%ymm10, %%ymm8, %%ymm0\n"
          "vsubps %%ymm10, %%ymm8, %%ymm2\n"
          "vaddps %%ymm11, %%ymm9, %%ymm1\n"
          "vsubps %%ymm11, %%ymm9, %%ymm3\n"
          "vaddps %%ymm14, %%ymm12, %%ymm4\n"
          "vsubps %%ymm14, %%ymm12, %%ymm6\n"
          "vaddps %%ymm15, %%ymm13, %%ymm5\n"
          "vsubps %%ymm15, %%ymm13, %%ymm7\n"
          "vaddps %%ymm4, %%ymm0, %%ymm8\n"
          "vsubps %%ymm4, %%ymm0, %%ymm12\n"
          "vaddps %%ymm5, %%ymm1, %%ymm9\n"
          "vsubps %%ymm5, %%ymm1, %%ymm13\n"
          "vaddps %%ymm6, %%ymm2, %%ymm10\n"
          "vsubps %%ymm6, %%ymm2, %%ymm14\n"
          "vaddps %%ymm7, %%ymm3, %%ymm11\n"
          "vsubps %%ymm7, %%ymm3, %%ymm15\n"
          "vmovups %%ymm8, (%0)\n"
          "vmovups %%ymm9, (%1)\n"
          "vmovups %%ymm10, (%2)\n"
          "vmovups %%ymm11, (%3)\n"
          "vmovups %%ymm12, (%4)\n"
          "vmovups %%ymm13, (%5)\n"
          "vmovups %%ymm14, (%6)\n"
          "vmovups %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1310720), "r"(buf + j + k + 1572864), "r"(buf + j + k + 1835008) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 24) {
    helper_float_26_recursive(buf + 0, 21);
    helper_float_26_recursive(buf + 2097152, 21);
    helper_float_26_recursive(buf + 4194304, 21);
    helper_float_26_recursive(buf + 6291456, 21);
    helper_float_26_recursive(buf + 8388608, 21);
    helper_float_26_recursive(buf + 10485760, 21);
    helper_float_26_recursive(buf + 12582912, 21);
    helper_float_26_recursive(buf + 14680064, 21);
    for (int j = 0; j < 16777216; j += 16777216) {
      for (int k = 0; k < 2097152; k += 8) {
        __asm__ volatile (
          "vmovups (%0), %%ymm0\n"
          "vmovups (%1), %%ymm1\n"
          "vmovups (%2), %%ymm2\n"
          "vmovups (%3), %%ymm3\n"
          "vmovups (%4), %%ymm4\n"
          "vmovups (%5), %%ymm5\n"
          "vmovups (%6), %%ymm6\n"
          "vmovups (%7), %%ymm7\n"
          "vaddps %%ymm1, %%ymm0, %%ymm8\n"
          "vsubps %%ymm1, %%ymm0, %%ymm9\n"
          "vaddps %%ymm3, %%ymm2, %%ymm10\n"
          "vsubps %%ymm3, %%ymm2, %%ymm11\n"
          "vaddps %%ymm5, %%ymm4, %%ymm12\n"
          "vsubps %%ymm5, %%ymm4, %%ymm13\n"
          "vaddps %%ymm7, %%ymm6, %%ymm14\n"
          "vsubps %%ymm7, %%ymm6, %%ymm15\n"
          "vaddps %%ymm10, %%ymm8, %%ymm0\n"
          "vsubps %%ymm10, %%ymm8, %%ymm2\n"
          "vaddps %%ymm11, %%ymm9, %%ymm1\n"
          "vsubps %%ymm11, %%ymm9, %%ymm3\n"
          "vaddps %%ymm14, %%ymm12, %%ymm4\n"
          "vsubps %%ymm14, %%ymm12, %%ymm6\n"
          "vaddps %%ymm15, %%ymm13, %%ymm5\n"
          "vsubps %%ymm15, %%ymm13, %%ymm7\n"
          "vaddps %%ymm4, %%ymm0, %%ymm8\n"
          "vsubps %%ymm4, %%ymm0, %%ymm12\n"
          "vaddps %%ymm5, %%ymm1, %%ymm9\n"
          "vsubps %%ymm5, %%ymm1, %%ymm13\n"
          "vaddps %%ymm6, %%ymm2, %%ymm10\n"
          "vsubps %%ymm6, %%ymm2, %%ymm14\n"
          "vaddps %%ymm7, %%ymm3, %%ymm11\n"
          "vsubps %%ymm7, %%ymm3, %%ymm15\n"
          "vmovups %%ymm8, (%0)\n"
          "vmovups %%ymm9, (%1)\n"
          "vmovups %%ymm10, (%2)\n"
          "vmovups %%ymm11, (%3)\n"
          "vmovups %%ymm12, (%4)\n"
          "vmovups %%ymm13, (%5)\n"
          "vmovups %%ymm14, (%6)\n"
          "vmovups %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 2097152), "r"(buf + j + k + 4194304), "r"(buf + j + k + 6291456), "r"(buf + j + k + 8388608), "r"(buf + j + k + 10485760), "r"(buf + j + k + 12582912), "r"(buf + j + k + 14680064) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
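  /* 26 is two, not three, levels above the last radix-8 case, so the final
     merge is a radix-4 pass (two butterfly levels) over four 2^24 quarters. */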
  if (depth == 26) {
    helper_float_26_recursive(buf + 0, 24);
    helper_float_26_recursive(buf + 16777216, 24);
    helper_float_26_recursive(buf + 33554432, 24);
    helper_float_26_recursive(buf + 50331648, 24);
    for (int j = 0; j < 67108864; j += 67108864) {
      for (int k = 0; k < 16777216; k += 8) {
        __asm__ volatile (
          "vmovups (%0), %%ymm0\n"
          "vmovups (%1), %%ymm1\n"
          "vmovups (%2), %%ymm2\n"
          "vmovups (%3), %%ymm3\n"
          "vaddps %%ymm1, %%ymm0, %%ymm8\n"
          "vsubps %%ymm1, %%ymm0, %%ymm9\n"
          "vaddps %%ymm3, %%ymm2, %%ymm10\n"
          "vsubps %%ymm3, %%ymm2, %%ymm11\n"
          "vaddps %%ymm10, %%ymm8, %%ymm0\n"
          "vsubps %%ymm10, %%ymm8, %%ymm2\n"
          "vaddps %%ymm11, %%ymm9, %%ymm1\n"
          "vsubps %%ymm11, %%ymm9, %%ymm3\n"
          "vmovups %%ymm0, (%0)\n"
          "vmovups %%ymm1, (%1)\n"
          "vmovups %%ymm2, (%2)\n"
          "vmovups %%ymm3, (%3)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 16777216), "r"(buf + j + k + 33554432), "r"(buf + j + k + 50331648) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
}
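/* Entry point for the 2^26-point in-place, unnormalized transform. */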
void helper_float_26(float *buf);
void helper_float_26(float *buf) {
  helper_float_26_recursive(buf, 26);
}
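/*
 * 2^27 variant: it repeats the depth-12 base case verbatim and again climbs
 * in radix-8 steps (depths 12, 15, 18, ...).
 */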
void helper_float_27_recursive(float *buf, int depth);
void helper_float_27_recursive(float *buf, int depth) {
  if (depth == 12) {
    for (int j = 0; j < 4096; j += 64) {
      for (int k = 0; k < 8; k += 8) {
        __asm__ volatile (
          "vmovups (%0), %%ymm0\n"
          "vmovups (%1), %%ymm1\n"
          "vmovups (%2), %%ymm2\n"
          "vmovups (%3), %%ymm3\n"
          "vmovups (%4), %%ymm4\n"
          "vmovups (%5), %%ymm5\n"
          "vmovups (%6), %%ymm6\n"
          "vmovups (%7), %%ymm7\n"
          "vpermilps $160, %%ymm0, %%ymm8\n"
          "vpermilps $245, %%ymm0, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm0\n"
          "vpermilps $160, %%ymm1, %%ymm8\n"
          "vpermilps $245, %%ymm1, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm1\n"
          "vpermilps $160, %%ymm2, %%ymm8\n"
          "vpermilps $245, %%ymm2, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm2\n"
          "vpermilps $160, %%ymm3, %%ymm8\n"
          "vpermilps $245, %%ymm3, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm3\n"
          "vpermilps $160, %%ymm4, %%ymm8\n"
          "vpermilps $245, %%ymm4, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm4\n"
          "vpermilps $160, %%ymm5, %%ymm8\n"
          "vpermilps $245, %%ymm5, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm5\n"
          "vpermilps $160, %%ymm6, %%ymm8\n"
          "vpermilps $245, %%ymm6, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm6\n"
          "vpermilps $160, %%ymm7, %%ymm8\n"
          "vpermilps $245, %%ymm7, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm7\n"
          "vpermilps $68, %%ymm0, %%ymm8\n"
          "vpermilps $238, %%ymm0, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm0\n"
          "vpermilps $68, %%ymm1, %%ymm8\n"
          "vpermilps $238, %%ymm1, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm1\n"
          "vpermilps $68, %%ymm2, %%ymm8\n"
          "vpermilps $238, %%ymm2, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm2\n"
          "vpermilps $68, %%ymm3, %%ymm8\n"
          "vpermilps $238, %%ymm3, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm3\n"
          "vpermilps $68, %%ymm4, %%ymm8\n"
          "vpermilps $238, %%ymm4, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm4\n"
          "vpermilps $68, %%ymm5, %%ymm8\n"
          "vpermilps $238, %%ymm5, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm5\n"
          "vpermilps $68, %%ymm6, %%ymm8\n"
          "vpermilps $238, %%ymm6, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm6\n"
          "vpermilps $68, %%ymm7, %%ymm8\n"
          "vpermilps $238, %%ymm7, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm7\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm0, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm0\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm1, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm1\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm2, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm2\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm3, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm3\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm4, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm4\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm5, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm5\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm6, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm6\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm7, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm7\n"
          "vaddps %%ymm1, %%ymm0, %%ymm8\n"
          "vsubps %%ymm1, %%ymm0, %%ymm9\n"
          "vaddps %%ymm3, %%ymm2, %%ymm10\n"
          "vsubps %%ymm3, %%ymm2, %%ymm11\n"
          "vaddps %%ymm5, %%ymm4, %%ymm12\n"
          "vsubps %%ymm5, %%ymm4, %%ymm13\n"
          "vaddps %%ymm7, %%ymm6, %%ymm14\n"
          "vsubps %%ymm7, %%ymm6, %%ymm15\n"
          "vaddps %%ymm10, %%ymm8, %%ymm0\n"
          "vsubps %%ymm10, %%ymm8, %%ymm2\n"
          "vaddps %%ymm11, %%ymm9, %%ymm1\n"
          "vsubps %%ymm11, %%ymm9, %%ymm3\n"
          "vaddps %%ymm14, %%ymm12, %%ymm4\n"
          "vsubps %%ymm14, %%ymm12, %%ymm6\n"
          "vaddps %%ymm15, %%ymm13, %%ymm5\n"
          "vsubps %%ymm15, %%ymm13, %%ymm7\n"
          "vaddps %%ymm4, %%ymm0, %%ymm8\n"
          "vsubps %%ymm4, %%ymm0, %%ymm12\n"
          "vaddps %%ymm5, %%ymm1, %%ymm9\n"
          "vsubps %%ymm5, %%ymm1, %%ymm13\n"
          "vaddps %%ymm6, %%ymm2, %%ymm10\n"
          "vsubps %%ymm6, %%ymm2, %%ymm14\n"
          "vaddps %%ymm7, %%ymm3, %%ymm11\n"
          "vsubps %%ymm7, %%ymm3, %%ymm15\n"
          "vmovups %%ymm8, (%0)\n"
          "vmovups %%ymm9, (%1)\n"
          "vmovups %%ymm10, (%2)\n"
          "vmovups %%ymm11, (%3)\n"
          "vmovups %%ymm12, (%4)\n"
          "vmovups %%ymm13, (%5)\n"
          "vmovups %%ymm14, (%6)\n"
          "vmovups %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
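    /* Levels 7-9, exactly as in helper_float_26_recursive's base case. */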
    for (int j = 0; j < 4096; j += 512) {
      for (int k = 0; k < 64; k += 8) {
        __asm__ volatile (
          "vmovups (%0), %%ymm0\n"
          "vmovups (%1), %%ymm1\n"
          "vmovups (%2), %%ymm2\n"
          "vmovups (%3), %%ymm3\n"
          "vmovups (%4), %%ymm4\n"
          "vmovups (%5), %%ymm5\n"
          "vmovups (%6), %%ymm6\n"
          "vmovups (%7), %%ymm7\n"
          "vaddps %%ymm1, %%ymm0, %%ymm8\n"
          "vsubps %%ymm1, %%ymm0, %%ymm9\n"
          "vaddps %%ymm3, %%ymm2, %%ymm10\n"
          "vsubps %%ymm3, %%ymm2, %%ymm11\n"
          "vaddps %%ymm5, %%ymm4, %%ymm12\n"
          "vsubps %%ymm5, %%ymm4, %%ymm13\n"
          "vaddps %%ymm7, %%ymm6, %%ymm14\n"
          "vsubps %%ymm7, %%ymm6, %%ymm15\n"
          "vaddps %%ymm10, %%ymm8, %%ymm0\n"
          "vsubps %%ymm10, %%ymm8, %%ymm2\n"
          "vaddps %%ymm11, %%ymm9, %%ymm1\n"
          "vsubps %%ymm11, %%ymm9, %%ymm3\n"
          "vaddps %%ymm14, %%ymm12, %%ymm4\n"
          "vsubps %%ymm14, %%ymm12, %%ymm6\n"
          "vaddps %%ymm15, %%ymm13, %%ymm5\n"
          "vsubps %%ymm15, %%ymm13, %%ymm7\n"
          "vaddps %%ymm4, %%ymm0, %%ymm8\n"
          "vsubps %%ymm4, %%ymm0, %%ymm12\n"
          "vaddps %%ymm5, %%ymm1, %%ymm9\n"
          "vsubps %%ymm5, %%ymm1, %%ymm13\n"
          "vaddps %%ymm6, %%ymm2, %%ymm10\n"
          "vsubps %%ymm6, %%ymm2, %%ymm14\n"
          "vaddps %%ymm7, %%ymm3, %%ymm11\n"
          "vsubps %%ymm7, %%ymm3, %%ymm15\n"
          "vmovups %%ymm8, (%0)\n"
          "vmovups %%ymm9, (%1)\n"
          "vmovups %%ymm10, (%2)\n"
          "vmovups %%ymm11, (%3)\n"
          "vmovups %%ymm12, (%4)\n"
          "vmovups %%ymm13, (%5)\n"
          "vmovups %%ymm14, (%6)\n"
          "vmovups %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
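    /* Levels 10-12 complete the 4096-point base case. */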
    for (int j = 0; j < 4096; j += 4096) {
      for (int k = 0; k < 512; k += 8) {
        __asm__ volatile (
          "vmovups (%0), %%ymm0\n"
          "vmovups (%1), %%ymm1\n"
          "vmovups (%2), %%ymm2\n"
          "vmovups (%3), %%ymm3\n"
          "vmovups (%4), %%ymm4\n"
          "vmovups (%5), %%ymm5\n"
          "vmovups (%6), %%ymm6\n"
          "vmovups (%7), %%ymm7\n"
          "vaddps %%ymm1, %%ymm0, %%ymm8\n"
          "vsubps %%ymm1, %%ymm0, %%ymm9\n"
          "vaddps %%ymm3, %%ymm2, %%ymm10\n"
          "vsubps %%ymm3, %%ymm2, %%ymm11\n"
          "vaddps %%ymm5, %%ymm4, %%ymm12\n"
          "vsubps %%ymm5, %%ymm4, %%ymm13\n"
          "vaddps %%ymm7, %%ymm6, %%ymm14\n"
          "vsubps %%ymm7, %%ymm6, %%ymm15\n"
          "vaddps %%ymm10, %%ymm8, %%ymm0\n"
          "vsubps %%ymm10, %%ymm8, %%ymm2\n"
          "vaddps %%ymm11, %%ymm9, %%ymm1\n"
          "vsubps %%ymm11, %%ymm9, %%ymm3\n"
          "vaddps %%ymm14, %%ymm12, %%ymm4\n"
          "vsubps %%ymm14, %%ymm12, %%ymm6\n"
          "vaddps %%ymm15, %%ymm13, %%ymm5\n"
          "vsubps %%ymm15, %%ymm13, %%ymm7\n"
          "vaddps %%ymm4, %%ymm0, %%ymm8\n"
          "vsubps %%ymm4, %%ymm0, %%ymm12\n"
          "vaddps %%ymm5, %%ymm1, %%ymm9\n"
          "vsubps %%ymm5, %%ymm1, %%ymm13\n"
          "vaddps %%ymm6, %%ymm2, %%ymm10\n"
          "vsubps %%ymm6, %%ymm2, %%ymm14\n"
          "vaddps %%ymm7, %%ymm3, %%ymm11\n"
          "vsubps %%ymm7, %%ymm3, %%ymm15\n"
          "vmovups %%ymm8, (%0)\n"
          "vmovups %%ymm9, (%1)\n"
          "vmovups %%ymm10, (%2)\n"
          "vmovups %%ymm11, (%3)\n"
          "vmovups %%ymm12, (%4)\n"
          "vmovups %%ymm13, (%5)\n"
          "vmovups %%ymm14, (%6)\n"
          "vmovups %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 15) {
    helper_float_27_recursive(buf + 0, 12);
    helper_float_27_recursive(buf + 4096, 12);
    helper_float_27_recursive(buf + 8192, 12);
    helper_float_27_recursive(buf + 12288, 12);
    helper_float_27_recursive(buf + 16384, 12);
    helper_float_27_recursive(buf + 20480, 12);
    helper_float_27_recursive(buf + 24576, 12);
    helper_float_27_recursive(buf + 28672, 12);
    for (int j = 0; j < 32768; j += 32768) {
      for (int k = 0; k < 4096; k += 8) {
        __asm__ volatile (
          "vmovups (%0), %%ymm0\n"
          "vmovups (%1), %%ymm1\n"
          "vmovups (%2), %%ymm2\n"
          "vmovups (%3), %%ymm3\n"
          "vmovups (%4), %%ymm4\n"
          "vmovups (%5), %%ymm5\n"
          "vmovups (%6), %%ymm6\n"
          "vmovups (%7), %%ymm7\n"
          "vaddps %%ymm1, %%ymm0, %%ymm8\n"
          "vsubps %%ymm1, %%ymm0, %%ymm9\n"
          "vaddps %%ymm3, %%ymm2, %%ymm10\n"
          "vsubps %%ymm3, %%ymm2, %%ymm11\n"
          "vaddps %%ymm5, %%ymm4, %%ymm12\n"
          "vsubps %%ymm5, %%ymm4, %%ymm13\n"
          "vaddps %%ymm7, %%ymm6, %%ymm14\n"
          "vsubps %%ymm7, %%ymm6, %%ymm15\n"
          "vaddps %%ymm10, %%ymm8, %%ymm0\n"
          "vsubps %%ymm10, %%ymm8, %%ymm2\n"
          "vaddps %%ymm11, %%ymm9, %%ymm1\n"
          "vsubps %%ymm11, %%ymm9, %%ymm3\n"
          "vaddps %%ymm14, %%ymm12, %%ymm4\n"
          "vsubps %%ymm14, %%ymm12, %%ymm6\n"
          "vaddps %%ymm15, %%ymm13, %%ymm5\n"
          "vsubps %%ymm15, %%ymm13, %%ymm7\n"
          "vaddps %%ymm4, %%ymm0, %%ymm8\n"
          "vsubps %%ymm4, %%ymm0, %%ymm12\n"
          "vaddps %%ymm5, %%ymm1, %%ymm9\n"
          "vsubps %%ymm5, %%ymm1, %%ymm13\n"
          "vaddps %%ymm6, %%ymm2, %%ymm10\n"
          "vsubps %%ymm6, %%ymm2, %%ymm14\n"
          "vaddps %%ymm7, %%ymm3, %%ymm11\n"
          "vsubps %%ymm7, %%ymm3, %%ymm15\n"
          "vmovups %%ymm8, (%0)\n"
          "vmovups %%ymm9, (%1)\n"
          "vmovups %%ymm10, (%2)\n"
          "vmovups %%ymm11, (%3)\n"
          "vmovups %%ymm12, (%4)\n"
          "vmovups %%ymm13, (%5)\n"
          "vmovups %%ymm14, (%6)\n"
          "vmovups %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 18) {
    helper_float_27_recursive(buf + 0, 15);
    helper_float_27_recursive(buf + 32768, 15);
    helper_float_27_recursive(buf + 65536, 15);
    helper_float_27_recursive(buf + 98304, 15);
    helper_float_27_recursive(buf + 131072, 15);
    helper_float_27_recursive(buf + 163840, 15);
    helper_float_27_recursive(buf + 196608, 15);
    helper_float_27_recursive(buf + 229376, 15);
    for (int j = 0; j < 262144; j += 262144) {
      for (int k = 0; k < 32768; k += 8) {
        __asm__ volatile (
          "vmovups (%0), %%ymm0\n"
          "vmovups (%1), %%ymm1\n"
          "vmovups (%2), %%ymm2\n"
          "vmovups (%3), %%ymm3\n"
          "vmovups (%4), %%ymm4\n"
          "vmovups (%5), %%ymm5\n"
          "vmovups (%6), %%ymm6\n"
          "vmovups (%7), %%ymm7\n"
          "vaddps %%ymm1, %%ymm0, %%ymm8\n"
          "vsubps %%ymm1, %%ymm0, %%ymm9\n"
          "vaddps %%ymm3, %%ymm2, %%ymm10\n"
          "vsubps %%ymm3, %%ymm2, %%ymm11\n"
          "vaddps %%ymm5, %%ymm4, %%ymm12\n"
          "vsubps %%ymm5, %%ymm4, %%ymm13\n"
          "vaddps %%ymm7, %%ymm6, %%ymm14\n"
          "vsubps %%ymm7, %%ymm6, %%ymm15\n"
          "vaddps %%ymm10, %%ymm8, %%ymm0\n"
          "vsubps %%ymm10, %%ymm8, %%ymm2\n"
          "vaddps %%ymm11, %%ymm9, %%ymm1\n"
          "vsubps %%ymm11, %%ymm9, %%ymm3\n"
          "vaddps %%ymm14, %%ymm12, %%ymm4\n"
          "vsubps %%ymm14, %%ymm12, %%ymm6\n"
          "vaddps %%ymm15, %%ymm13, %%ymm5\n"
          "vsubps %%ymm15, %%ymm13, %%ymm7\n"
          "vaddps %%ymm4, %%ymm0, %%ymm8\n"
          "vsubps %%ymm4, %%ymm0, %%ymm12\n"
          "vaddps %%ymm5, %%ymm1, %%ymm9\n"
          "vsubps %%ymm5, %%ymm1, %%ymm13\n"
          "vaddps %%ymm6, %%ymm2, %%ymm10\n"
8033           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
8034           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
8035           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
8036           "vmovups %%ymm8, (%0)\n"
8037           "vmovups %%ymm9, (%1)\n"
8038           "vmovups %%ymm10, (%2)\n"
8039           "vmovups %%ymm11, (%3)\n"
8040           "vmovups %%ymm12, (%4)\n"
8041           "vmovups %%ymm13, (%5)\n"
8042           "vmovups %%ymm14, (%6)\n"
8043           "vmovups %%ymm15, (%7)\n"
8044           :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
8045         );
8046       }
8047     }
8048     return;
8049   }
8050   if (depth == 21) {
8051     helper_float_27_recursive(buf + 0, 18);
8052     helper_float_27_recursive(buf + 262144, 18);
8053     helper_float_27_recursive(buf + 524288, 18);
8054     helper_float_27_recursive(buf + 786432, 18);
8055     helper_float_27_recursive(buf + 1048576, 18);
8056     helper_float_27_recursive(buf + 1310720, 18);
8057     helper_float_27_recursive(buf + 1572864, 18);
8058     helper_float_27_recursive(buf + 1835008, 18);
8059     for (int j = 0; j < 2097152; j += 2097152) {
8060       for (int k = 0; k < 262144; k += 8) {
8061         __asm__ volatile (
8062           "vmovups (%0), %%ymm0\n"
8063           "vmovups (%1), %%ymm1\n"
8064           "vmovups (%2), %%ymm2\n"
8065           "vmovups (%3), %%ymm3\n"
8066           "vmovups (%4), %%ymm4\n"
8067           "vmovups (%5), %%ymm5\n"
8068           "vmovups (%6), %%ymm6\n"
8069           "vmovups (%7), %%ymm7\n"
8070           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
8071           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
8072           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
8073           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
8074           "vaddps %%ymm5, %%ymm4, %%ymm12\n"
8075           "vsubps %%ymm5, %%ymm4, %%ymm13\n"
8076           "vaddps %%ymm7, %%ymm6, %%ymm14\n"
8077           "vsubps %%ymm7, %%ymm6, %%ymm15\n"
8078           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
8079           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
8080           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
8081           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
8082           "vaddps %%ymm14, %%ymm12, %%ymm4\n"
8083           "vsubps %%ymm14, %%ymm12, %%ymm6\n"
8084           "vaddps %%ymm15, %%ymm13, %%ymm5\n"
8085           "vsubps %%ymm15, %%ymm13, %%ymm7\n"
8086           "vaddps %%ymm4, %%ymm0, %%ymm8\n"
8087           "vsubps %%ymm4, %%ymm0, %%ymm12\n"
8088           "vaddps %%ymm5, %%ymm1, %%ymm9\n"
8089           "vsubps %%ymm5, %%ymm1, %%ymm13\n"
8090           "vaddps %%ymm6, %%ymm2, %%ymm10\n"
8091           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
8092           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
8093           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
8094           "vmovups %%ymm8, (%0)\n"
8095           "vmovups %%ymm9, (%1)\n"
8096           "vmovups %%ymm10, (%2)\n"
8097           "vmovups %%ymm11, (%3)\n"
8098           "vmovups %%ymm12, (%4)\n"
8099           "vmovups %%ymm13, (%5)\n"
8100           "vmovups %%ymm14, (%6)\n"
8101           "vmovups %%ymm15, (%7)\n"
8102           :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1310720), "r"(buf + j + k + 1572864), "r"(buf + j + k + 1835008) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
8103         );
8104       }
8105     }
8106     return;
8107   }
8108   if (depth == 24) {
8109     helper_float_27_recursive(buf + 0, 21);
8110     helper_float_27_recursive(buf + 2097152, 21);
8111     helper_float_27_recursive(buf + 4194304, 21);
8112     helper_float_27_recursive(buf + 6291456, 21);
8113     helper_float_27_recursive(buf + 8388608, 21);
8114     helper_float_27_recursive(buf + 10485760, 21);
8115     helper_float_27_recursive(buf + 12582912, 21);
8116     helper_float_27_recursive(buf + 14680064, 21);
8117     for (int j = 0; j < 16777216; j += 16777216) {
8118       for (int k = 0; k < 2097152; k += 8) {
8119         __asm__ volatile (
8120           "vmovups (%0), %%ymm0\n"
8121           "vmovups (%1), %%ymm1\n"
8122           "vmovups (%2), %%ymm2\n"
8123           "vmovups (%3), %%ymm3\n"
8124           "vmovups (%4), %%ymm4\n"
8125           "vmovups (%5), %%ymm5\n"
8126           "vmovups (%6), %%ymm6\n"
8127           "vmovups (%7), %%ymm7\n"
8128           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
8129           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
8130           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
8131           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
8132           "vaddps %%ymm5, %%ymm4, %%ymm12\n"
8133           "vsubps %%ymm5, %%ymm4, %%ymm13\n"
8134           "vaddps %%ymm7, %%ymm6, %%ymm14\n"
8135           "vsubps %%ymm7, %%ymm6, %%ymm15\n"
8136           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
8137           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
8138           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
8139           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
8140           "vaddps %%ymm14, %%ymm12, %%ymm4\n"
8141           "vsubps %%ymm14, %%ymm12, %%ymm6\n"
8142           "vaddps %%ymm15, %%ymm13, %%ymm5\n"
8143           "vsubps %%ymm15, %%ymm13, %%ymm7\n"
8144           "vaddps %%ymm4, %%ymm0, %%ymm8\n"
8145           "vsubps %%ymm4, %%ymm0, %%ymm12\n"
8146           "vaddps %%ymm5, %%ymm1, %%ymm9\n"
8147           "vsubps %%ymm5, %%ymm1, %%ymm13\n"
8148           "vaddps %%ymm6, %%ymm2, %%ymm10\n"
8149           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
8150           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
8151           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
8152           "vmovups %%ymm8, (%0)\n"
8153           "vmovups %%ymm9, (%1)\n"
8154           "vmovups %%ymm10, (%2)\n"
8155           "vmovups %%ymm11, (%3)\n"
8156           "vmovups %%ymm12, (%4)\n"
8157           "vmovups %%ymm13, (%5)\n"
8158           "vmovups %%ymm14, (%6)\n"
8159           "vmovups %%ymm15, (%7)\n"
8160           :: "r"(buf + j + k + 0), "r"(buf + j + k + 2097152), "r"(buf + j + k + 4194304), "r"(buf + j + k + 6291456), "r"(buf + j + k + 8388608), "r"(buf + j + k + 10485760), "r"(buf + j + k + 12582912), "r"(buf + j + k + 14680064) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
8161         );
8162       }
8163     }
8164     return;
8165   }
8166   if (depth == 27) {
8167     helper_float_27_recursive(buf + 0, 24);
8168     helper_float_27_recursive(buf + 16777216, 24);
8169     helper_float_27_recursive(buf + 33554432, 24);
8170     helper_float_27_recursive(buf + 50331648, 24);
8171     helper_float_27_recursive(buf + 67108864, 24);
8172     helper_float_27_recursive(buf + 83886080, 24);
8173     helper_float_27_recursive(buf + 100663296, 24);
8174     helper_float_27_recursive(buf + 117440512, 24);
8175     for (int j = 0; j < 134217728; j += 134217728) {
8176       for (int k = 0; k < 16777216; k += 8) {
8177         __asm__ volatile (
8178           "vmovups (%0), %%ymm0\n"
8179           "vmovups (%1), %%ymm1\n"
8180           "vmovups (%2), %%ymm2\n"
8181           "vmovups (%3), %%ymm3\n"
8182           "vmovups (%4), %%ymm4\n"
8183           "vmovups (%5), %%ymm5\n"
8184           "vmovups (%6), %%ymm6\n"
8185           "vmovups (%7), %%ymm7\n"
8186           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
8187           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
8188           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
8189           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
8190           "vaddps %%ymm5, %%ymm4, %%ymm12\n"
8191           "vsubps %%ymm5, %%ymm4, %%ymm13\n"
8192           "vaddps %%ymm7, %%ymm6, %%ymm14\n"
8193           "vsubps %%ymm7, %%ymm6, %%ymm15\n"
8194           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
8195           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
8196           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
8197           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
8198           "vaddps %%ymm14, %%ymm12, %%ymm4\n"
8199           "vsubps %%ymm14, %%ymm12, %%ymm6\n"
8200           "vaddps %%ymm15, %%ymm13, %%ymm5\n"
8201           "vsubps %%ymm15, %%ymm13, %%ymm7\n"
8202           "vaddps %%ymm4, %%ymm0, %%ymm8\n"
8203           "vsubps %%ymm4, %%ymm0, %%ymm12\n"
8204           "vaddps %%ymm5, %%ymm1, %%ymm9\n"
8205           "vsubps %%ymm5, %%ymm1, %%ymm13\n"
8206           "vaddps %%ymm6, %%ymm2, %%ymm10\n"
8207           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
8208           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
8209           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
8210           "vmovups %%ymm8, (%0)\n"
8211           "vmovups %%ymm9, (%1)\n"
8212           "vmovups %%ymm10, (%2)\n"
8213           "vmovups %%ymm11, (%3)\n"
8214           "vmovups %%ymm12, (%4)\n"
8215           "vmovups %%ymm13, (%5)\n"
8216           "vmovups %%ymm14, (%6)\n"
8217           "vmovups %%ymm15, (%7)\n"
8218           :: "r"(buf + j + k + 0), "r"(buf + j + k + 16777216), "r"(buf + j + k + 33554432), "r"(buf + j + k + 50331648), "r"(buf + j + k + 67108864), "r"(buf + j + k + 83886080), "r"(buf + j + k + 100663296), "r"(buf + j + k + 117440512) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
8219         );
8220       }
8221     }
8222     return;
8223   }
8224 }
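/* Public entry point: unnormalized fast Hadamard transform of 2^27 floats. */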
8225 void helper_float_27(float *buf);
8226 void helper_float_27(float *buf) {
8227   helper_float_27_recursive(buf, 27);
8228 }
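/* A scalar sketch (ours, illustrative only; guarded out so it is neither
   compiled nor called) of the combine pass that the vaddps/vsubps asm blocks
   above implement. It assumes a buffer of 8 * n8 floats laid out as eight
   contiguous sub-blocks of n8 floats each; the name radix8_combine_sketch is
   hypothetical. */
#if 0
static void radix8_combine_sketch(float *buf, long n8) {
  for (long i = 0; i < n8; ++i) {
    float t[8];
    /* Gather one float from each of the eight sub-blocks. */
    for (int r = 0; r < 8; ++r) t[r] = buf[r * n8 + i];
    /* Three add/sub levels = one unnormalized 8-point Hadamard butterfly. */
    for (int stride = 1; stride < 8; stride *= 2) {
      for (int base = 0; base < 8; base += 2 * stride) {
        for (int q = 0; q < stride; ++q) {
          float u = t[base + q];
          float v = t[base + q + stride];
          t[base + q] = u + v;
          t[base + q + stride] = u - v;
        }
      }
    }
    /* Scatter the results back. */
    for (int r = 0; r < 8; ++r) buf[r * n8 + i] = t[r];
  }
}
#endif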
8229 void helper_float_28_recursive(float *buf, int depth);
8230 void helper_float_28_recursive(float *buf, int depth) {
8231   if (depth == 7) {
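    /* Base case: a full 128-point (2^7) transform. The block below performs
       stages 1-6 in registers on each 64-float half; the loop after it adds
       the final stride-64 stage. */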
8232     for (int j = 0; j < 128; j += 64) {
8233       for (int k = 0; k < 8; k += 8) {
8234         __asm__ volatile (
8235           "vmovups (%0), %%ymm0\n"
8236           "vmovups (%1), %%ymm1\n"
8237           "vmovups (%2), %%ymm2\n"
8238           "vmovups (%3), %%ymm3\n"
8239           "vmovups (%4), %%ymm4\n"
8240           "vmovups (%5), %%ymm5\n"
8241           "vmovups (%6), %%ymm6\n"
8242           "vmovups (%7), %%ymm7\n"
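          /* Stage 1 (stride 1): duplicate even/odd elements, negate the odd
             copy, and vaddsubps to get (a0+a1, a0-a1, ...) per register. */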
8243           "vpermilps $160, %%ymm0, %%ymm8\n"
8244           "vpermilps $245, %%ymm0, %%ymm9\n"
8245           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
8246           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
8247           "vaddsubps %%ymm11, %%ymm8, %%ymm0\n"
8248           "vpermilps $160, %%ymm1, %%ymm8\n"
8249           "vpermilps $245, %%ymm1, %%ymm9\n"
8250           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
8251           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
8252           "vaddsubps %%ymm11, %%ymm8, %%ymm1\n"
8253           "vpermilps $160, %%ymm2, %%ymm8\n"
8254           "vpermilps $245, %%ymm2, %%ymm9\n"
8255           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
8256           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
8257           "vaddsubps %%ymm11, %%ymm8, %%ymm2\n"
8258           "vpermilps $160, %%ymm3, %%ymm8\n"
8259           "vpermilps $245, %%ymm3, %%ymm9\n"
8260           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
8261           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
8262           "vaddsubps %%ymm11, %%ymm8, %%ymm3\n"
8263           "vpermilps $160, %%ymm4, %%ymm8\n"
8264           "vpermilps $245, %%ymm4, %%ymm9\n"
8265           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
8266           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
8267           "vaddsubps %%ymm11, %%ymm8, %%ymm4\n"
8268           "vpermilps $160, %%ymm5, %%ymm8\n"
8269           "vpermilps $245, %%ymm5, %%ymm9\n"
8270           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
8271           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
8272           "vaddsubps %%ymm11, %%ymm8, %%ymm5\n"
8273           "vpermilps $160, %%ymm6, %%ymm8\n"
8274           "vpermilps $245, %%ymm6, %%ymm9\n"
8275           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
8276           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
8277           "vaddsubps %%ymm11, %%ymm8, %%ymm6\n"
8278           "vpermilps $160, %%ymm7, %%ymm8\n"
8279           "vpermilps $245, %%ymm7, %%ymm9\n"
8280           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
8281           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
8282           "vaddsubps %%ymm11, %%ymm8, %%ymm7\n"
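          /* Stage 2 (stride 2): permute low/high pairs of each 128-bit lane,
             sign-flip the upper pair via blend, and add to butterfly the
             element pairs (0,2) and (1,3). */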
8283           "vpermilps $68, %%ymm0, %%ymm8\n"
8284           "vpermilps $238, %%ymm0, %%ymm9\n"
8285           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
8286           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
8287           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
8288           "vaddps %%ymm8, %%ymm12, %%ymm0\n"
8289           "vpermilps $68, %%ymm1, %%ymm8\n"
8290           "vpermilps $238, %%ymm1, %%ymm9\n"
8291           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
8292           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
8293           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
8294           "vaddps %%ymm8, %%ymm12, %%ymm1\n"
8295           "vpermilps $68, %%ymm2, %%ymm8\n"
8296           "vpermilps $238, %%ymm2, %%ymm9\n"
8297           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
8298           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
8299           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
8300           "vaddps %%ymm8, %%ymm12, %%ymm2\n"
8301           "vpermilps $68, %%ymm3, %%ymm8\n"
8302           "vpermilps $238, %%ymm3, %%ymm9\n"
8303           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
8304           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
8305           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
8306           "vaddps %%ymm8, %%ymm12, %%ymm3\n"
8307           "vpermilps $68, %%ymm4, %%ymm8\n"
8308           "vpermilps $238, %%ymm4, %%ymm9\n"
8309           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
8310           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
8311           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
8312           "vaddps %%ymm8, %%ymm12, %%ymm4\n"
8313           "vpermilps $68, %%ymm5, %%ymm8\n"
8314           "vpermilps $238, %%ymm5, %%ymm9\n"
8315           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
8316           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
8317           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
8318           "vaddps %%ymm8, %%ymm12, %%ymm5\n"
8319           "vpermilps $68, %%ymm6, %%ymm8\n"
8320           "vpermilps $238, %%ymm6, %%ymm9\n"
8321           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
8322           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
8323           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
8324           "vaddps %%ymm8, %%ymm12, %%ymm6\n"
8325           "vpermilps $68, %%ymm7, %%ymm8\n"
8326           "vpermilps $238, %%ymm7, %%ymm9\n"
8327           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
8328           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
8329           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
8330           "vaddps %%ymm8, %%ymm12, %%ymm7\n"
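          /* Stage 3 (stride 4): vperm2f128 builds (lo|lo) and (hi|-hi),
             whose sum butterflies the two 128-bit lanes. */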
8331           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
8332           "vsubps %%ymm0, %%ymm8, %%ymm9\n"
8333           "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n"
8334           "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n"
8335           "vaddps %%ymm10, %%ymm11, %%ymm0\n"
8336           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
8337           "vsubps %%ymm1, %%ymm8, %%ymm9\n"
8338           "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n"
8339           "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n"
8340           "vaddps %%ymm10, %%ymm11, %%ymm1\n"
8341           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
8342           "vsubps %%ymm2, %%ymm8, %%ymm9\n"
8343           "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n"
8344           "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n"
8345           "vaddps %%ymm10, %%ymm11, %%ymm2\n"
8346           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
8347           "vsubps %%ymm3, %%ymm8, %%ymm9\n"
8348           "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n"
8349           "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n"
8350           "vaddps %%ymm10, %%ymm11, %%ymm3\n"
8351           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
8352           "vsubps %%ymm4, %%ymm8, %%ymm9\n"
8353           "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n"
8354           "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n"
8355           "vaddps %%ymm10, %%ymm11, %%ymm4\n"
8356           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
8357           "vsubps %%ymm5, %%ymm8, %%ymm9\n"
8358           "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n"
8359           "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n"
8360           "vaddps %%ymm10, %%ymm11, %%ymm5\n"
8361           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
8362           "vsubps %%ymm6, %%ymm8, %%ymm9\n"
8363           "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n"
8364           "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n"
8365           "vaddps %%ymm10, %%ymm11, %%ymm6\n"
8366           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
8367           "vsubps %%ymm7, %%ymm8, %%ymm9\n"
8368           "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n"
8369           "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n"
8370           "vaddps %%ymm10, %%ymm11, %%ymm7\n"
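          /* Stages 4-6 (strides 8, 16, 32): radix-8 butterfly across the
             eight registers. */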
8371           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
8372           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
8373           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
8374           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
8375           "vaddps %%ymm5, %%ymm4, %%ymm12\n"
8376           "vsubps %%ymm5, %%ymm4, %%ymm13\n"
8377           "vaddps %%ymm7, %%ymm6, %%ymm14\n"
8378           "vsubps %%ymm7, %%ymm6, %%ymm15\n"
8379           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
8380           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
8381           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
8382           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
8383           "vaddps %%ymm14, %%ymm12, %%ymm4\n"
8384           "vsubps %%ymm14, %%ymm12, %%ymm6\n"
8385           "vaddps %%ymm15, %%ymm13, %%ymm5\n"
8386           "vsubps %%ymm15, %%ymm13, %%ymm7\n"
8387           "vaddps %%ymm4, %%ymm0, %%ymm8\n"
8388           "vsubps %%ymm4, %%ymm0, %%ymm12\n"
8389           "vaddps %%ymm5, %%ymm1, %%ymm9\n"
8390           "vsubps %%ymm5, %%ymm1, %%ymm13\n"
8391           "vaddps %%ymm6, %%ymm2, %%ymm10\n"
8392           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
8393           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
8394           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
8395           "vmovups %%ymm8, (%0)\n"
8396           "vmovups %%ymm9, (%1)\n"
8397           "vmovups %%ymm10, (%2)\n"
8398           "vmovups %%ymm11, (%3)\n"
8399           "vmovups %%ymm12, (%4)\n"
8400           "vmovups %%ymm13, (%5)\n"
8401           "vmovups %%ymm14, (%6)\n"
8402           "vmovups %%ymm15, (%7)\n"
8403           :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
8404         );
8405       }
8406     }
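    /* Stage 7: pairwise butterflies at stride 64 complete the 128-point
       transform. */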
8407     for (int j = 0; j < 128; j += 128) {
8408       for (int k = 0; k < 64; k += 8) {
8409         __asm__ volatile (
8410           "vmovups (%0), %%ymm0\n"
8411           "vmovups (%1), %%ymm1\n"
8412           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
8413           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
8414           "vmovups %%ymm8, (%0)\n"
8415           "vmovups %%ymm9, (%1)\n"
8416           :: "r"(buf + j + k + 0), "r"(buf + j + k + 64) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
8417         );
8418       }
8419     }
8420     return;
8421   }
8422   if (depth == 10) {
8423     helper_float_28_recursive(buf + 0, 7);
8424     helper_float_28_recursive(buf + 128, 7);
8425     helper_float_28_recursive(buf + 256, 7);
8426     helper_float_28_recursive(buf + 384, 7);
8427     helper_float_28_recursive(buf + 512, 7);
8428     helper_float_28_recursive(buf + 640, 7);
8429     helper_float_28_recursive(buf + 768, 7);
8430     helper_float_28_recursive(buf + 896, 7);
8431     for (int j = 0; j < 1024; j += 1024) {
8432       for (int k = 0; k < 128; k += 8) {
8433         __asm__ volatile (
8434           "vmovups (%0), %%ymm0\n"
8435           "vmovups (%1), %%ymm1\n"
8436           "vmovups (%2), %%ymm2\n"
8437           "vmovups (%3), %%ymm3\n"
8438           "vmovups (%4), %%ymm4\n"
8439           "vmovups (%5), %%ymm5\n"
8440           "vmovups (%6), %%ymm6\n"
8441           "vmovups (%7), %%ymm7\n"
8442           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
8443           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
8444           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
8445           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
8446           "vaddps %%ymm5, %%ymm4, %%ymm12\n"
8447           "vsubps %%ymm5, %%ymm4, %%ymm13\n"
8448           "vaddps %%ymm7, %%ymm6, %%ymm14\n"
8449           "vsubps %%ymm7, %%ymm6, %%ymm15\n"
8450           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
8451           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
8452           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
8453           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
8454           "vaddps %%ymm14, %%ymm12, %%ymm4\n"
8455           "vsubps %%ymm14, %%ymm12, %%ymm6\n"
8456           "vaddps %%ymm15, %%ymm13, %%ymm5\n"
8457           "vsubps %%ymm15, %%ymm13, %%ymm7\n"
8458           "vaddps %%ymm4, %%ymm0, %%ymm8\n"
8459           "vsubps %%ymm4, %%ymm0, %%ymm12\n"
8460           "vaddps %%ymm5, %%ymm1, %%ymm9\n"
8461           "vsubps %%ymm5, %%ymm1, %%ymm13\n"
8462           "vaddps %%ymm6, %%ymm2, %%ymm10\n"
8463           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
8464           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
8465           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
8466           "vmovups %%ymm8, (%0)\n"
8467           "vmovups %%ymm9, (%1)\n"
8468           "vmovups %%ymm10, (%2)\n"
8469           "vmovups %%ymm11, (%3)\n"
8470           "vmovups %%ymm12, (%4)\n"
8471           "vmovups %%ymm13, (%5)\n"
8472           "vmovups %%ymm14, (%6)\n"
8473           "vmovups %%ymm15, (%7)\n"
8474           :: "r"(buf + j + k + 0), "r"(buf + j + k + 128), "r"(buf + j + k + 256), "r"(buf + j + k + 384), "r"(buf + j + k + 512), "r"(buf + j + k + 640), "r"(buf + j + k + 768), "r"(buf + j + k + 896) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
8475         );
8476       }
8477     }
8478     return;
8479   }
8480   if (depth == 13) {
8481     helper_float_28_recursive(buf + 0, 10);
8482     helper_float_28_recursive(buf + 1024, 10);
8483     helper_float_28_recursive(buf + 2048, 10);
8484     helper_float_28_recursive(buf + 3072, 10);
8485     helper_float_28_recursive(buf + 4096, 10);
8486     helper_float_28_recursive(buf + 5120, 10);
8487     helper_float_28_recursive(buf + 6144, 10);
8488     helper_float_28_recursive(buf + 7168, 10);
8489     for (int j = 0; j < 8192; j += 8192) {
8490       for (int k = 0; k < 1024; k += 8) {
8491         __asm__ volatile (
8492           "vmovups (%0), %%ymm0\n"
8493           "vmovups (%1), %%ymm1\n"
8494           "vmovups (%2), %%ymm2\n"
8495           "vmovups (%3), %%ymm3\n"
8496           "vmovups (%4), %%ymm4\n"
8497           "vmovups (%5), %%ymm5\n"
8498           "vmovups (%6), %%ymm6\n"
8499           "vmovups (%7), %%ymm7\n"
8500           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
8501           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
8502           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
8503           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
8504           "vaddps %%ymm5, %%ymm4, %%ymm12\n"
8505           "vsubps %%ymm5, %%ymm4, %%ymm13\n"
8506           "vaddps %%ymm7, %%ymm6, %%ymm14\n"
8507           "vsubps %%ymm7, %%ymm6, %%ymm15\n"
8508           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
8509           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
8510           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
8511           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
8512           "vaddps %%ymm14, %%ymm12, %%ymm4\n"
8513           "vsubps %%ymm14, %%ymm12, %%ymm6\n"
8514           "vaddps %%ymm15, %%ymm13, %%ymm5\n"
8515           "vsubps %%ymm15, %%ymm13, %%ymm7\n"
8516           "vaddps %%ymm4, %%ymm0, %%ymm8\n"
8517           "vsubps %%ymm4, %%ymm0, %%ymm12\n"
8518           "vaddps %%ymm5, %%ymm1, %%ymm9\n"
8519           "vsubps %%ymm5, %%ymm1, %%ymm13\n"
8520           "vaddps %%ymm6, %%ymm2, %%ymm10\n"
8521           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
8522           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
8523           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
8524           "vmovups %%ymm8, (%0)\n"
8525           "vmovups %%ymm9, (%1)\n"
8526           "vmovups %%ymm10, (%2)\n"
8527           "vmovups %%ymm11, (%3)\n"
8528           "vmovups %%ymm12, (%4)\n"
8529           "vmovups %%ymm13, (%5)\n"
8530           "vmovups %%ymm14, (%6)\n"
8531           "vmovups %%ymm15, (%7)\n"
8532           :: "r"(buf + j + k + 0), "r"(buf + j + k + 1024), "r"(buf + j + k + 2048), "r"(buf + j + k + 3072), "r"(buf + j + k + 4096), "r"(buf + j + k + 5120), "r"(buf + j + k + 6144), "r"(buf + j + k + 7168) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
8533         );
8534       }
8535     }
8536     return;
8537   }
8538   if (depth == 16) {
8539     helper_float_28_recursive(buf + 0, 13);
8540     helper_float_28_recursive(buf + 8192, 13);
8541     helper_float_28_recursive(buf + 16384, 13);
8542     helper_float_28_recursive(buf + 24576, 13);
8543     helper_float_28_recursive(buf + 32768, 13);
8544     helper_float_28_recursive(buf + 40960, 13);
8545     helper_float_28_recursive(buf + 49152, 13);
8546     helper_float_28_recursive(buf + 57344, 13);
8547     for (int j = 0; j < 65536; j += 65536) {
8548       for (int k = 0; k < 8192; k += 8) {
8549         __asm__ volatile (
8550           "vmovups (%0), %%ymm0\n"
8551           "vmovups (%1), %%ymm1\n"
8552           "vmovups (%2), %%ymm2\n"
8553           "vmovups (%3), %%ymm3\n"
8554           "vmovups (%4), %%ymm4\n"
8555           "vmovups (%5), %%ymm5\n"
8556           "vmovups (%6), %%ymm6\n"
8557           "vmovups (%7), %%ymm7\n"
8558           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
8559           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
8560           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
8561           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
8562           "vaddps %%ymm5, %%ymm4, %%ymm12\n"
8563           "vsubps %%ymm5, %%ymm4, %%ymm13\n"
8564           "vaddps %%ymm7, %%ymm6, %%ymm14\n"
8565           "vsubps %%ymm7, %%ymm6, %%ymm15\n"
8566           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
8567           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
8568           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
8569           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
8570           "vaddps %%ymm14, %%ymm12, %%ymm4\n"
8571           "vsubps %%ymm14, %%ymm12, %%ymm6\n"
8572           "vaddps %%ymm15, %%ymm13, %%ymm5\n"
8573           "vsubps %%ymm15, %%ymm13, %%ymm7\n"
8574           "vaddps %%ymm4, %%ymm0, %%ymm8\n"
8575           "vsubps %%ymm4, %%ymm0, %%ymm12\n"
8576           "vaddps %%ymm5, %%ymm1, %%ymm9\n"
8577           "vsubps %%ymm5, %%ymm1, %%ymm13\n"
8578           "vaddps %%ymm6, %%ymm2, %%ymm10\n"
8579           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
8580           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
8581           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
8582           "vmovups %%ymm8, (%0)\n"
8583           "vmovups %%ymm9, (%1)\n"
8584           "vmovups %%ymm10, (%2)\n"
8585           "vmovups %%ymm11, (%3)\n"
8586           "vmovups %%ymm12, (%4)\n"
8587           "vmovups %%ymm13, (%5)\n"
8588           "vmovups %%ymm14, (%6)\n"
8589           "vmovups %%ymm15, (%7)\n"
8590           :: "r"(buf + j + k + 0), "r"(buf + j + k + 8192), "r"(buf + j + k + 16384), "r"(buf + j + k + 24576), "r"(buf + j + k + 32768), "r"(buf + j + k + 40960), "r"(buf + j + k + 49152), "r"(buf + j + k + 57344) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
8591         );
8592       }
8593     }
8594     return;
8595   }
8596   if (depth == 19) {
8597     helper_float_28_recursive(buf + 0, 16);
8598     helper_float_28_recursive(buf + 65536, 16);
8599     helper_float_28_recursive(buf + 131072, 16);
8600     helper_float_28_recursive(buf + 196608, 16);
8601     helper_float_28_recursive(buf + 262144, 16);
8602     helper_float_28_recursive(buf + 327680, 16);
8603     helper_float_28_recursive(buf + 393216, 16);
8604     helper_float_28_recursive(buf + 458752, 16);
8605     for (int j = 0; j < 524288; j += 524288) {
8606       for (int k = 0; k < 65536; k += 8) {
8607         __asm__ volatile (
8608           "vmovups (%0), %%ymm0\n"
8609           "vmovups (%1), %%ymm1\n"
8610           "vmovups (%2), %%ymm2\n"
8611           "vmovups (%3), %%ymm3\n"
8612           "vmovups (%4), %%ymm4\n"
8613           "vmovups (%5), %%ymm5\n"
8614           "vmovups (%6), %%ymm6\n"
8615           "vmovups (%7), %%ymm7\n"
8616           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
8617           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
8618           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
8619           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
8620           "vaddps %%ymm5, %%ymm4, %%ymm12\n"
8621           "vsubps %%ymm5, %%ymm4, %%ymm13\n"
8622           "vaddps %%ymm7, %%ymm6, %%ymm14\n"
8623           "vsubps %%ymm7, %%ymm6, %%ymm15\n"
8624           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
8625           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
8626           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
8627           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
8628           "vaddps %%ymm14, %%ymm12, %%ymm4\n"
8629           "vsubps %%ymm14, %%ymm12, %%ymm6\n"
8630           "vaddps %%ymm15, %%ymm13, %%ymm5\n"
8631           "vsubps %%ymm15, %%ymm13, %%ymm7\n"
8632           "vaddps %%ymm4, %%ymm0, %%ymm8\n"
8633           "vsubps %%ymm4, %%ymm0, %%ymm12\n"
8634           "vaddps %%ymm5, %%ymm1, %%ymm9\n"
8635           "vsubps %%ymm5, %%ymm1, %%ymm13\n"
8636           "vaddps %%ymm6, %%ymm2, %%ymm10\n"
8637           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
8638           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
8639           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
8640           "vmovups %%ymm8, (%0)\n"
8641           "vmovups %%ymm9, (%1)\n"
8642           "vmovups %%ymm10, (%2)\n"
8643           "vmovups %%ymm11, (%3)\n"
8644           "vmovups %%ymm12, (%4)\n"
8645           "vmovups %%ymm13, (%5)\n"
8646           "vmovups %%ymm14, (%6)\n"
8647           "vmovups %%ymm15, (%7)\n"
8648           :: "r"(buf + j + k + 0), "r"(buf + j + k + 65536), "r"(buf + j + k + 131072), "r"(buf + j + k + 196608), "r"(buf + j + k + 262144), "r"(buf + j + k + 327680), "r"(buf + j + k + 393216), "r"(buf + j + k + 458752) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
8649         );
8650       }
8651     }
8652     return;
8653   }
8654   if (depth == 22) {
8655     helper_float_28_recursive(buf + 0, 19);
8656     helper_float_28_recursive(buf + 524288, 19);
8657     helper_float_28_recursive(buf + 1048576, 19);
8658     helper_float_28_recursive(buf + 1572864, 19);
8659     helper_float_28_recursive(buf + 2097152, 19);
8660     helper_float_28_recursive(buf + 2621440, 19);
8661     helper_float_28_recursive(buf + 3145728, 19);
8662     helper_float_28_recursive(buf + 3670016, 19);
8663     for (int j = 0; j < 4194304; j += 4194304) {
8664       for (int k = 0; k < 524288; k += 8) {
8665         __asm__ volatile (
8666           "vmovups (%0), %%ymm0\n"
8667           "vmovups (%1), %%ymm1\n"
8668           "vmovups (%2), %%ymm2\n"
8669           "vmovups (%3), %%ymm3\n"
8670           "vmovups (%4), %%ymm4\n"
8671           "vmovups (%5), %%ymm5\n"
8672           "vmovups (%6), %%ymm6\n"
8673           "vmovups (%7), %%ymm7\n"
8674           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
8675           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
8676           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
8677           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
8678           "vaddps %%ymm5, %%ymm4, %%ymm12\n"
8679           "vsubps %%ymm5, %%ymm4, %%ymm13\n"
8680           "vaddps %%ymm7, %%ymm6, %%ymm14\n"
8681           "vsubps %%ymm7, %%ymm6, %%ymm15\n"
8682           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
8683           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
8684           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
8685           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
8686           "vaddps %%ymm14, %%ymm12, %%ymm4\n"
8687           "vsubps %%ymm14, %%ymm12, %%ymm6\n"
8688           "vaddps %%ymm15, %%ymm13, %%ymm5\n"
8689           "vsubps %%ymm15, %%ymm13, %%ymm7\n"
8690           "vaddps %%ymm4, %%ymm0, %%ymm8\n"
8691           "vsubps %%ymm4, %%ymm0, %%ymm12\n"
8692           "vaddps %%ymm5, %%ymm1, %%ymm9\n"
8693           "vsubps %%ymm5, %%ymm1, %%ymm13\n"
8694           "vaddps %%ymm6, %%ymm2, %%ymm10\n"
8695           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
8696           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
8697           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
8698           "vmovups %%ymm8, (%0)\n"
8699           "vmovups %%ymm9, (%1)\n"
8700           "vmovups %%ymm10, (%2)\n"
8701           "vmovups %%ymm11, (%3)\n"
8702           "vmovups %%ymm12, (%4)\n"
8703           "vmovups %%ymm13, (%5)\n"
8704           "vmovups %%ymm14, (%6)\n"
8705           "vmovups %%ymm15, (%7)\n"
8706           :: "r"(buf + j + k + 0), "r"(buf + j + k + 524288), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1572864), "r"(buf + j + k + 2097152), "r"(buf + j + k + 2621440), "r"(buf + j + k + 3145728), "r"(buf + j + k + 3670016) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
8707         );
8708       }
8709     }
8710     return;
8711   }
8712   if (depth == 25) {
8713     helper_float_28_recursive(buf + 0, 22);
8714     helper_float_28_recursive(buf + 4194304, 22);
8715     helper_float_28_recursive(buf + 8388608, 22);
8716     helper_float_28_recursive(buf + 12582912, 22);
8717     helper_float_28_recursive(buf + 16777216, 22);
8718     helper_float_28_recursive(buf + 20971520, 22);
8719     helper_float_28_recursive(buf + 25165824, 22);
8720     helper_float_28_recursive(buf + 29360128, 22);
8721     for (int j = 0; j < 33554432; j += 33554432) {
8722       for (int k = 0; k < 4194304; k += 8) {
8723         __asm__ volatile (
8724           "vmovups (%0), %%ymm0\n"
8725           "vmovups (%1), %%ymm1\n"
8726           "vmovups (%2), %%ymm2\n"
8727           "vmovups (%3), %%ymm3\n"
8728           "vmovups (%4), %%ymm4\n"
8729           "vmovups (%5), %%ymm5\n"
8730           "vmovups (%6), %%ymm6\n"
8731           "vmovups (%7), %%ymm7\n"
8732           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
8733           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
8734           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
8735           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
8736           "vaddps %%ymm5, %%ymm4, %%ymm12\n"
8737           "vsubps %%ymm5, %%ymm4, %%ymm13\n"
8738           "vaddps %%ymm7, %%ymm6, %%ymm14\n"
8739           "vsubps %%ymm7, %%ymm6, %%ymm15\n"
8740           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
8741           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
8742           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
8743           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
8744           "vaddps %%ymm14, %%ymm12, %%ymm4\n"
8745           "vsubps %%ymm14, %%ymm12, %%ymm6\n"
8746           "vaddps %%ymm15, %%ymm13, %%ymm5\n"
8747           "vsubps %%ymm15, %%ymm13, %%ymm7\n"
8748           "vaddps %%ymm4, %%ymm0, %%ymm8\n"
8749           "vsubps %%ymm4, %%ymm0, %%ymm12\n"
8750           "vaddps %%ymm5, %%ymm1, %%ymm9\n"
8751           "vsubps %%ymm5, %%ymm1, %%ymm13\n"
8752           "vaddps %%ymm6, %%ymm2, %%ymm10\n"
8753           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
8754           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
8755           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
8756           "vmovups %%ymm8, (%0)\n"
8757           "vmovups %%ymm9, (%1)\n"
8758           "vmovups %%ymm10, (%2)\n"
8759           "vmovups %%ymm11, (%3)\n"
8760           "vmovups %%ymm12, (%4)\n"
8761           "vmovups %%ymm13, (%5)\n"
8762           "vmovups %%ymm14, (%6)\n"
8763           "vmovups %%ymm15, (%7)\n"
8764           :: "r"(buf + j + k + 0), "r"(buf + j + k + 4194304), "r"(buf + j + k + 8388608), "r"(buf + j + k + 12582912), "r"(buf + j + k + 16777216), "r"(buf + j + k + 20971520), "r"(buf + j + k + 25165824), "r"(buf + j + k + 29360128) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
8765         );
8766       }
8767     }
8768     return;
8769   }
8770   if (depth == 28) {
8771     helper_float_28_recursive(buf + 0, 25);
8772     helper_float_28_recursive(buf + 33554432, 25);
8773     helper_float_28_recursive(buf + 67108864, 25);
8774     helper_float_28_recursive(buf + 100663296, 25);
8775     helper_float_28_recursive(buf + 134217728, 25);
8776     helper_float_28_recursive(buf + 167772160, 25);
8777     helper_float_28_recursive(buf + 201326592, 25);
8778     helper_float_28_recursive(buf + 234881024, 25);
8779     for (int j = 0; j < 268435456; j += 268435456) {
8780       for (int k = 0; k < 33554432; k += 8) {
8781         __asm__ volatile (
8782           "vmovups (%0), %%ymm0\n"
8783           "vmovups (%1), %%ymm1\n"
8784           "vmovups (%2), %%ymm2\n"
8785           "vmovups (%3), %%ymm3\n"
8786           "vmovups (%4), %%ymm4\n"
8787           "vmovups (%5), %%ymm5\n"
8788           "vmovups (%6), %%ymm6\n"
8789           "vmovups (%7), %%ymm7\n"
8790           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
8791           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
8792           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
8793           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
8794           "vaddps %%ymm5, %%ymm4, %%ymm12\n"
8795           "vsubps %%ymm5, %%ymm4, %%ymm13\n"
8796           "vaddps %%ymm7, %%ymm6, %%ymm14\n"
8797           "vsubps %%ymm7, %%ymm6, %%ymm15\n"
8798           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
8799           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
8800           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
8801           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
8802           "vaddps %%ymm14, %%ymm12, %%ymm4\n"
8803           "vsubps %%ymm14, %%ymm12, %%ymm6\n"
8804           "vaddps %%ymm15, %%ymm13, %%ymm5\n"
8805           "vsubps %%ymm15, %%ymm13, %%ymm7\n"
8806           "vaddps %%ymm4, %%ymm0, %%ymm8\n"
8807           "vsubps %%ymm4, %%ymm0, %%ymm12\n"
8808           "vaddps %%ymm5, %%ymm1, %%ymm9\n"
8809           "vsubps %%ymm5, %%ymm1, %%ymm13\n"
8810           "vaddps %%ymm6, %%ymm2, %%ymm10\n"
8811           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
8812           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
8813           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
8814           "vmovups %%ymm8, (%0)\n"
8815           "vmovups %%ymm9, (%1)\n"
8816           "vmovups %%ymm10, (%2)\n"
8817           "vmovups %%ymm11, (%3)\n"
8818           "vmovups %%ymm12, (%4)\n"
8819           "vmovups %%ymm13, (%5)\n"
8820           "vmovups %%ymm14, (%6)\n"
8821           "vmovups %%ymm15, (%7)\n"
8822           :: "r"(buf + j + k + 0), "r"(buf + j + k + 33554432), "r"(buf + j + k + 67108864), "r"(buf + j + k + 100663296), "r"(buf + j + k + 134217728), "r"(buf + j + k + 167772160), "r"(buf + j + k + 201326592), "r"(buf + j + k + 234881024) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
8823         );
8824       }
8825     }
8826     return;
8827   }
8828 }
8829 void helper_float_28(float *buf);
8830 void helper_float_28(float *buf) {
8831   helper_float_28_recursive(buf, 28);
8832 }
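/* Transform of 2^29 floats. The depth-12 base case runs three passes (six
   in-register stages, then radix-8 combines across 64-float and 512-float
   sub-blocks); deeper branches recurse on eighth-size sub-blocks before
   their own combine pass. */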
8833 void helper_float_29_recursive(float *buf, int depth);
8834 void helper_float_29_recursive(float *buf, int depth) {
8835   if (depth == 12) {
8836     for (int j = 0; j < 4096; j += 64) {
8837       for (int k = 0; k < 8; k += 8) {
8838         __asm__ volatile (
8839           "vmovups (%0), %%ymm0\n"
8840           "vmovups (%1), %%ymm1\n"
8841           "vmovups (%2), %%ymm2\n"
8842           "vmovups (%3), %%ymm3\n"
8843           "vmovups (%4), %%ymm4\n"
8844           "vmovups (%5), %%ymm5\n"
8845           "vmovups (%6), %%ymm6\n"
8846           "vmovups (%7), %%ymm7\n"
8847           "vpermilps $160, %%ymm0, %%ymm8\n"
8848           "vpermilps $245, %%ymm0, %%ymm9\n"
8849           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
8850           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
8851           "vaddsubps %%ymm11, %%ymm8, %%ymm0\n"
8852           "vpermilps $160, %%ymm1, %%ymm8\n"
8853           "vpermilps $245, %%ymm1, %%ymm9\n"
8854           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
8855           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
8856           "vaddsubps %%ymm11, %%ymm8, %%ymm1\n"
8857           "vpermilps $160, %%ymm2, %%ymm8\n"
8858           "vpermilps $245, %%ymm2, %%ymm9\n"
8859           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
8860           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
8861           "vaddsubps %%ymm11, %%ymm8, %%ymm2\n"
8862           "vpermilps $160, %%ymm3, %%ymm8\n"
8863           "vpermilps $245, %%ymm3, %%ymm9\n"
8864           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
8865           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
8866           "vaddsubps %%ymm11, %%ymm8, %%ymm3\n"
8867           "vpermilps $160, %%ymm4, %%ymm8\n"
8868           "vpermilps $245, %%ymm4, %%ymm9\n"
8869           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
8870           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
8871           "vaddsubps %%ymm11, %%ymm8, %%ymm4\n"
8872           "vpermilps $160, %%ymm5, %%ymm8\n"
8873           "vpermilps $245, %%ymm5, %%ymm9\n"
8874           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
8875           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
8876           "vaddsubps %%ymm11, %%ymm8, %%ymm5\n"
8877           "vpermilps $160, %%ymm6, %%ymm8\n"
8878           "vpermilps $245, %%ymm6, %%ymm9\n"
8879           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
8880           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
8881           "vaddsubps %%ymm11, %%ymm8, %%ymm6\n"
8882           "vpermilps $160, %%ymm7, %%ymm8\n"
8883           "vpermilps $245, %%ymm7, %%ymm9\n"
8884           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
8885           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
8886           "vaddsubps %%ymm11, %%ymm8, %%ymm7\n"
8887           "vpermilps $68, %%ymm0, %%ymm8\n"
8888           "vpermilps $238, %%ymm0, %%ymm9\n"
8889           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
8890           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
8891           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
8892           "vaddps %%ymm8, %%ymm12, %%ymm0\n"
8893           "vpermilps $68, %%ymm1, %%ymm8\n"
8894           "vpermilps $238, %%ymm1, %%ymm9\n"
8895           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
8896           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
8897           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
8898           "vaddps %%ymm8, %%ymm12, %%ymm1\n"
8899           "vpermilps $68, %%ymm2, %%ymm8\n"
8900           "vpermilps $238, %%ymm2, %%ymm9\n"
8901           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
8902           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
8903           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
8904           "vaddps %%ymm8, %%ymm12, %%ymm2\n"
8905           "vpermilps $68, %%ymm3, %%ymm8\n"
8906           "vpermilps $238, %%ymm3, %%ymm9\n"
8907           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
8908           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
8909           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
8910           "vaddps %%ymm8, %%ymm12, %%ymm3\n"
8911           "vpermilps $68, %%ymm4, %%ymm8\n"
8912           "vpermilps $238, %%ymm4, %%ymm9\n"
8913           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
8914           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
8915           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
8916           "vaddps %%ymm8, %%ymm12, %%ymm4\n"
8917           "vpermilps $68, %%ymm5, %%ymm8\n"
8918           "vpermilps $238, %%ymm5, %%ymm9\n"
8919           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
8920           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
8921           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
8922           "vaddps %%ymm8, %%ymm12, %%ymm5\n"
8923           "vpermilps $68, %%ymm6, %%ymm8\n"
8924           "vpermilps $238, %%ymm6, %%ymm9\n"
8925           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
8926           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
8927           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
8928           "vaddps %%ymm8, %%ymm12, %%ymm6\n"
8929           "vpermilps $68, %%ymm7, %%ymm8\n"
8930           "vpermilps $238, %%ymm7, %%ymm9\n"
8931           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
8932           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
8933           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
8934           "vaddps %%ymm8, %%ymm12, %%ymm7\n"
8935           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
8936           "vsubps %%ymm0, %%ymm8, %%ymm9\n"
8937           "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n"
8938           "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n"
8939           "vaddps %%ymm10, %%ymm11, %%ymm0\n"
8940           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
8941           "vsubps %%ymm1, %%ymm8, %%ymm9\n"
8942           "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n"
8943           "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n"
8944           "vaddps %%ymm10, %%ymm11, %%ymm1\n"
8945           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
8946           "vsubps %%ymm2, %%ymm8, %%ymm9\n"
8947           "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n"
8948           "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n"
8949           "vaddps %%ymm10, %%ymm11, %%ymm2\n"
8950           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
8951           "vsubps %%ymm3, %%ymm8, %%ymm9\n"
8952           "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n"
8953           "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n"
8954           "vaddps %%ymm10, %%ymm11, %%ymm3\n"
8955           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
8956           "vsubps %%ymm4, %%ymm8, %%ymm9\n"
8957           "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n"
8958           "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n"
8959           "vaddps %%ymm10, %%ymm11, %%ymm4\n"
8960           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
8961           "vsubps %%ymm5, %%ymm8, %%ymm9\n"
8962           "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n"
8963           "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n"
8964           "vaddps %%ymm10, %%ymm11, %%ymm5\n"
8965           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
8966           "vsubps %%ymm6, %%ymm8, %%ymm9\n"
8967           "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n"
8968           "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n"
8969           "vaddps %%ymm10, %%ymm11, %%ymm6\n"
8970           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
8971           "vsubps %%ymm7, %%ymm8, %%ymm9\n"
8972           "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n"
8973           "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n"
8974           "vaddps %%ymm10, %%ymm11, %%ymm7\n"
8975           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
8976           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
8977           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
8978           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
8979           "vaddps %%ymm5, %%ymm4, %%ymm12\n"
8980           "vsubps %%ymm5, %%ymm4, %%ymm13\n"
8981           "vaddps %%ymm7, %%ymm6, %%ymm14\n"
8982           "vsubps %%ymm7, %%ymm6, %%ymm15\n"
8983           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
8984           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
8985           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
8986           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
8987           "vaddps %%ymm14, %%ymm12, %%ymm4\n"
8988           "vsubps %%ymm14, %%ymm12, %%ymm6\n"
8989           "vaddps %%ymm15, %%ymm13, %%ymm5\n"
8990           "vsubps %%ymm15, %%ymm13, %%ymm7\n"
8991           "vaddps %%ymm4, %%ymm0, %%ymm8\n"
8992           "vsubps %%ymm4, %%ymm0, %%ymm12\n"
8993           "vaddps %%ymm5, %%ymm1, %%ymm9\n"
8994           "vsubps %%ymm5, %%ymm1, %%ymm13\n"
8995           "vaddps %%ymm6, %%ymm2, %%ymm10\n"
8996           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
8997           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
8998           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
8999           "vmovups %%ymm8, (%0)\n"
9000           "vmovups %%ymm9, (%1)\n"
9001           "vmovups %%ymm10, (%2)\n"
9002           "vmovups %%ymm11, (%3)\n"
9003           "vmovups %%ymm12, (%4)\n"
9004           "vmovups %%ymm13, (%5)\n"
9005           "vmovups %%ymm14, (%6)\n"
9006           "vmovups %%ymm15, (%7)\n"
9007           :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
9008         );
9009       }
9010     }
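    /* Stages 7-9: radix-8 butterfly across the eight 64-float sub-blocks of
       each 512-float group. */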
9011     for (int j = 0; j < 4096; j += 512) {
9012       for (int k = 0; k < 64; k += 8) {
9013         __asm__ volatile (
9014           "vmovups (%0), %%ymm0\n"
9015           "vmovups (%1), %%ymm1\n"
9016           "vmovups (%2), %%ymm2\n"
9017           "vmovups (%3), %%ymm3\n"
9018           "vmovups (%4), %%ymm4\n"
9019           "vmovups (%5), %%ymm5\n"
9020           "vmovups (%6), %%ymm6\n"
9021           "vmovups (%7), %%ymm7\n"
9022           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
9023           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
9024           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
9025           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
9026           "vaddps %%ymm5, %%ymm4, %%ymm12\n"
9027           "vsubps %%ymm5, %%ymm4, %%ymm13\n"
9028           "vaddps %%ymm7, %%ymm6, %%ymm14\n"
9029           "vsubps %%ymm7, %%ymm6, %%ymm15\n"
9030           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
9031           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
9032           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
9033           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
9034           "vaddps %%ymm14, %%ymm12, %%ymm4\n"
9035           "vsubps %%ymm14, %%ymm12, %%ymm6\n"
9036           "vaddps %%ymm15, %%ymm13, %%ymm5\n"
9037           "vsubps %%ymm15, %%ymm13, %%ymm7\n"
9038           "vaddps %%ymm4, %%ymm0, %%ymm8\n"
9039           "vsubps %%ymm4, %%ymm0, %%ymm12\n"
9040           "vaddps %%ymm5, %%ymm1, %%ymm9\n"
9041           "vsubps %%ymm5, %%ymm1, %%ymm13\n"
9042           "vaddps %%ymm6, %%ymm2, %%ymm10\n"
9043           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
9044           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
9045           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
9046           "vmovups %%ymm8, (%0)\n"
9047           "vmovups %%ymm9, (%1)\n"
9048           "vmovups %%ymm10, (%2)\n"
9049           "vmovups %%ymm11, (%3)\n"
9050           "vmovups %%ymm12, (%4)\n"
9051           "vmovups %%ymm13, (%5)\n"
9052           "vmovups %%ymm14, (%6)\n"
9053           "vmovups %%ymm15, (%7)\n"
9054           :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
9055         );
9056       }
9057     }
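    /* Stages 10-12: radix-8 butterfly across the eight 512-float sub-blocks,
       completing the 4096-point (2^12) base case. */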
9058     for (int j = 0; j < 4096; j += 4096) {
9059       for (int k = 0; k < 512; k += 8) {
9060         __asm__ volatile (
9061           "vmovups (%0), %%ymm0\n"
9062           "vmovups (%1), %%ymm1\n"
9063           "vmovups (%2), %%ymm2\n"
9064           "vmovups (%3), %%ymm3\n"
9065           "vmovups (%4), %%ymm4\n"
9066           "vmovups (%5), %%ymm5\n"
9067           "vmovups (%6), %%ymm6\n"
9068           "vmovups (%7), %%ymm7\n"
9069           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
9070           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
9071           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
9072           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
9073           "vaddps %%ymm5, %%ymm4, %%ymm12\n"
9074           "vsubps %%ymm5, %%ymm4, %%ymm13\n"
9075           "vaddps %%ymm7, %%ymm6, %%ymm14\n"
9076           "vsubps %%ymm7, %%ymm6, %%ymm15\n"
9077           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
9078           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
9079           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
9080           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
9081           "vaddps %%ymm14, %%ymm12, %%ymm4\n"
9082           "vsubps %%ymm14, %%ymm12, %%ymm6\n"
9083           "vaddps %%ymm15, %%ymm13, %%ymm5\n"
9084           "vsubps %%ymm15, %%ymm13, %%ymm7\n"
9085           "vaddps %%ymm4, %%ymm0, %%ymm8\n"
9086           "vsubps %%ymm4, %%ymm0, %%ymm12\n"
9087           "vaddps %%ymm5, %%ymm1, %%ymm9\n"
9088           "vsubps %%ymm5, %%ymm1, %%ymm13\n"
9089           "vaddps %%ymm6, %%ymm2, %%ymm10\n"
9090           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
9091           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
9092           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
9093           "vmovups %%ymm8, (%0)\n"
9094           "vmovups %%ymm9, (%1)\n"
9095           "vmovups %%ymm10, (%2)\n"
9096           "vmovups %%ymm11, (%3)\n"
9097           "vmovups %%ymm12, (%4)\n"
9098           "vmovups %%ymm13, (%5)\n"
9099           "vmovups %%ymm14, (%6)\n"
9100           "vmovups %%ymm15, (%7)\n"
9101           :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
9102         );
9103       }
9104     }
9105     return;
9106   }
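  /*
   * The branches for depths 15 through 27 all follow the same pattern:
   * eight recursive sub-transforms of depth d - 3 over contiguous
   * 2^(d-3)-element blocks, followed by one radix-8 combining pass
   * that butterflies eight ymm vectors at stride 2^(d-3) floats
   * (4096 floats for depth 15).
   */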
9107   if (depth == 15) {
9108     helper_float_29_recursive(buf + 0, 12);
9109     helper_float_29_recursive(buf + 4096, 12);
9110     helper_float_29_recursive(buf + 8192, 12);
9111     helper_float_29_recursive(buf + 12288, 12);
9112     helper_float_29_recursive(buf + 16384, 12);
9113     helper_float_29_recursive(buf + 20480, 12);
9114     helper_float_29_recursive(buf + 24576, 12);
9115     helper_float_29_recursive(buf + 28672, 12);
9116     for (int j = 0; j < 32768; j += 32768) {
9117       for (int k = 0; k < 4096; k += 8) {
9118         __asm__ volatile (
9119           "vmovups (%0), %%ymm0\n"
9120           "vmovups (%1), %%ymm1\n"
9121           "vmovups (%2), %%ymm2\n"
9122           "vmovups (%3), %%ymm3\n"
9123           "vmovups (%4), %%ymm4\n"
9124           "vmovups (%5), %%ymm5\n"
9125           "vmovups (%6), %%ymm6\n"
9126           "vmovups (%7), %%ymm7\n"
9127           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
9128           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
9129           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
9130           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
9131           "vaddps %%ymm5, %%ymm4, %%ymm12\n"
9132           "vsubps %%ymm5, %%ymm4, %%ymm13\n"
9133           "vaddps %%ymm7, %%ymm6, %%ymm14\n"
9134           "vsubps %%ymm7, %%ymm6, %%ymm15\n"
9135           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
9136           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
9137           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
9138           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
9139           "vaddps %%ymm14, %%ymm12, %%ymm4\n"
9140           "vsubps %%ymm14, %%ymm12, %%ymm6\n"
9141           "vaddps %%ymm15, %%ymm13, %%ymm5\n"
9142           "vsubps %%ymm15, %%ymm13, %%ymm7\n"
9143           "vaddps %%ymm4, %%ymm0, %%ymm8\n"
9144           "vsubps %%ymm4, %%ymm0, %%ymm12\n"
9145           "vaddps %%ymm5, %%ymm1, %%ymm9\n"
9146           "vsubps %%ymm5, %%ymm1, %%ymm13\n"
9147           "vaddps %%ymm6, %%ymm2, %%ymm10\n"
9148           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
9149           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
9150           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
9151           "vmovups %%ymm8, (%0)\n"
9152           "vmovups %%ymm9, (%1)\n"
9153           "vmovups %%ymm10, (%2)\n"
9154           "vmovups %%ymm11, (%3)\n"
9155           "vmovups %%ymm12, (%4)\n"
9156           "vmovups %%ymm13, (%5)\n"
9157           "vmovups %%ymm14, (%6)\n"
9158           "vmovups %%ymm15, (%7)\n"
9159           :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
9160         );
9161       }
9162     }
9163     return;
9164   }
9165   if (depth == 18) {
9166     helper_float_29_recursive(buf + 0, 15);
9167     helper_float_29_recursive(buf + 32768, 15);
9168     helper_float_29_recursive(buf + 65536, 15);
9169     helper_float_29_recursive(buf + 98304, 15);
9170     helper_float_29_recursive(buf + 131072, 15);
9171     helper_float_29_recursive(buf + 163840, 15);
9172     helper_float_29_recursive(buf + 196608, 15);
9173     helper_float_29_recursive(buf + 229376, 15);
9174     for (int j = 0; j < 262144; j += 262144) {
9175       for (int k = 0; k < 32768; k += 8) {
9176         __asm__ volatile (
9177           "vmovups (%0), %%ymm0\n"
9178           "vmovups (%1), %%ymm1\n"
9179           "vmovups (%2), %%ymm2\n"
9180           "vmovups (%3), %%ymm3\n"
9181           "vmovups (%4), %%ymm4\n"
9182           "vmovups (%5), %%ymm5\n"
9183           "vmovups (%6), %%ymm6\n"
9184           "vmovups (%7), %%ymm7\n"
9185           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
9186           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
9187           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
9188           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
9189           "vaddps %%ymm5, %%ymm4, %%ymm12\n"
9190           "vsubps %%ymm5, %%ymm4, %%ymm13\n"
9191           "vaddps %%ymm7, %%ymm6, %%ymm14\n"
9192           "vsubps %%ymm7, %%ymm6, %%ymm15\n"
9193           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
9194           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
9195           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
9196           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
9197           "vaddps %%ymm14, %%ymm12, %%ymm4\n"
9198           "vsubps %%ymm14, %%ymm12, %%ymm6\n"
9199           "vaddps %%ymm15, %%ymm13, %%ymm5\n"
9200           "vsubps %%ymm15, %%ymm13, %%ymm7\n"
9201           "vaddps %%ymm4, %%ymm0, %%ymm8\n"
9202           "vsubps %%ymm4, %%ymm0, %%ymm12\n"
9203           "vaddps %%ymm5, %%ymm1, %%ymm9\n"
9204           "vsubps %%ymm5, %%ymm1, %%ymm13\n"
9205           "vaddps %%ymm6, %%ymm2, %%ymm10\n"
9206           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
9207           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
9208           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
9209           "vmovups %%ymm8, (%0)\n"
9210           "vmovups %%ymm9, (%1)\n"
9211           "vmovups %%ymm10, (%2)\n"
9212           "vmovups %%ymm11, (%3)\n"
9213           "vmovups %%ymm12, (%4)\n"
9214           "vmovups %%ymm13, (%5)\n"
9215           "vmovups %%ymm14, (%6)\n"
9216           "vmovups %%ymm15, (%7)\n"
9217           :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
9218         );
9219       }
9220     }
9221     return;
9222   }
9223   if (depth == 21) {
9224     helper_float_29_recursive(buf + 0, 18);
9225     helper_float_29_recursive(buf + 262144, 18);
9226     helper_float_29_recursive(buf + 524288, 18);
9227     helper_float_29_recursive(buf + 786432, 18);
9228     helper_float_29_recursive(buf + 1048576, 18);
9229     helper_float_29_recursive(buf + 1310720, 18);
9230     helper_float_29_recursive(buf + 1572864, 18);
9231     helper_float_29_recursive(buf + 1835008, 18);
9232     for (int j = 0; j < 2097152; j += 2097152) {
9233       for (int k = 0; k < 262144; k += 8) {
9234         __asm__ volatile (
9235           "vmovups (%0), %%ymm0\n"
9236           "vmovups (%1), %%ymm1\n"
9237           "vmovups (%2), %%ymm2\n"
9238           "vmovups (%3), %%ymm3\n"
9239           "vmovups (%4), %%ymm4\n"
9240           "vmovups (%5), %%ymm5\n"
9241           "vmovups (%6), %%ymm6\n"
9242           "vmovups (%7), %%ymm7\n"
9243           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
9244           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
9245           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
9246           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
9247           "vaddps %%ymm5, %%ymm4, %%ymm12\n"
9248           "vsubps %%ymm5, %%ymm4, %%ymm13\n"
9249           "vaddps %%ymm7, %%ymm6, %%ymm14\n"
9250           "vsubps %%ymm7, %%ymm6, %%ymm15\n"
9251           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
9252           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
9253           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
9254           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
9255           "vaddps %%ymm14, %%ymm12, %%ymm4\n"
9256           "vsubps %%ymm14, %%ymm12, %%ymm6\n"
9257           "vaddps %%ymm15, %%ymm13, %%ymm5\n"
9258           "vsubps %%ymm15, %%ymm13, %%ymm7\n"
9259           "vaddps %%ymm4, %%ymm0, %%ymm8\n"
9260           "vsubps %%ymm4, %%ymm0, %%ymm12\n"
9261           "vaddps %%ymm5, %%ymm1, %%ymm9\n"
9262           "vsubps %%ymm5, %%ymm1, %%ymm13\n"
9263           "vaddps %%ymm6, %%ymm2, %%ymm10\n"
9264           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
9265           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
9266           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
9267           "vmovups %%ymm8, (%0)\n"
9268           "vmovups %%ymm9, (%1)\n"
9269           "vmovups %%ymm10, (%2)\n"
9270           "vmovups %%ymm11, (%3)\n"
9271           "vmovups %%ymm12, (%4)\n"
9272           "vmovups %%ymm13, (%5)\n"
9273           "vmovups %%ymm14, (%6)\n"
9274           "vmovups %%ymm15, (%7)\n"
9275           :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1310720), "r"(buf + j + k + 1572864), "r"(buf + j + k + 1835008) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
9276         );
9277       }
9278     }
9279     return;
9280   }
9281   if (depth == 24) {
9282     helper_float_29_recursive(buf + 0, 21);
9283     helper_float_29_recursive(buf + 2097152, 21);
9284     helper_float_29_recursive(buf + 4194304, 21);
9285     helper_float_29_recursive(buf + 6291456, 21);
9286     helper_float_29_recursive(buf + 8388608, 21);
9287     helper_float_29_recursive(buf + 10485760, 21);
9288     helper_float_29_recursive(buf + 12582912, 21);
9289     helper_float_29_recursive(buf + 14680064, 21);
9290     for (int j = 0; j < 16777216; j += 16777216) {
9291       for (int k = 0; k < 2097152; k += 8) {
9292         __asm__ volatile (
9293           "vmovups (%0), %%ymm0\n"
9294           "vmovups (%1), %%ymm1\n"
9295           "vmovups (%2), %%ymm2\n"
9296           "vmovups (%3), %%ymm3\n"
9297           "vmovups (%4), %%ymm4\n"
9298           "vmovups (%5), %%ymm5\n"
9299           "vmovups (%6), %%ymm6\n"
9300           "vmovups (%7), %%ymm7\n"
9301           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
9302           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
9303           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
9304           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
9305           "vaddps %%ymm5, %%ymm4, %%ymm12\n"
9306           "vsubps %%ymm5, %%ymm4, %%ymm13\n"
9307           "vaddps %%ymm7, %%ymm6, %%ymm14\n"
9308           "vsubps %%ymm7, %%ymm6, %%ymm15\n"
9309           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
9310           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
9311           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
9312           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
9313           "vaddps %%ymm14, %%ymm12, %%ymm4\n"
9314           "vsubps %%ymm14, %%ymm12, %%ymm6\n"
9315           "vaddps %%ymm15, %%ymm13, %%ymm5\n"
9316           "vsubps %%ymm15, %%ymm13, %%ymm7\n"
9317           "vaddps %%ymm4, %%ymm0, %%ymm8\n"
9318           "vsubps %%ymm4, %%ymm0, %%ymm12\n"
9319           "vaddps %%ymm5, %%ymm1, %%ymm9\n"
9320           "vsubps %%ymm5, %%ymm1, %%ymm13\n"
9321           "vaddps %%ymm6, %%ymm2, %%ymm10\n"
9322           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
9323           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
9324           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
9325           "vmovups %%ymm8, (%0)\n"
9326           "vmovups %%ymm9, (%1)\n"
9327           "vmovups %%ymm10, (%2)\n"
9328           "vmovups %%ymm11, (%3)\n"
9329           "vmovups %%ymm12, (%4)\n"
9330           "vmovups %%ymm13, (%5)\n"
9331           "vmovups %%ymm14, (%6)\n"
9332           "vmovups %%ymm15, (%7)\n"
9333           :: "r"(buf + j + k + 0), "r"(buf + j + k + 2097152), "r"(buf + j + k + 4194304), "r"(buf + j + k + 6291456), "r"(buf + j + k + 8388608), "r"(buf + j + k + 10485760), "r"(buf + j + k + 12582912), "r"(buf + j + k + 14680064) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
9334         );
9335       }
9336     }
9337     return;
9338   }
9339   if (depth == 27) {
9340     helper_float_29_recursive(buf + 0, 24);
9341     helper_float_29_recursive(buf + 16777216, 24);
9342     helper_float_29_recursive(buf + 33554432, 24);
9343     helper_float_29_recursive(buf + 50331648, 24);
9344     helper_float_29_recursive(buf + 67108864, 24);
9345     helper_float_29_recursive(buf + 83886080, 24);
9346     helper_float_29_recursive(buf + 100663296, 24);
9347     helper_float_29_recursive(buf + 117440512, 24);
9348     for (int j = 0; j < 134217728; j += 134217728) {
9349       for (int k = 0; k < 16777216; k += 8) {
9350         __asm__ volatile (
9351           "vmovups (%0), %%ymm0\n"
9352           "vmovups (%1), %%ymm1\n"
9353           "vmovups (%2), %%ymm2\n"
9354           "vmovups (%3), %%ymm3\n"
9355           "vmovups (%4), %%ymm4\n"
9356           "vmovups (%5), %%ymm5\n"
9357           "vmovups (%6), %%ymm6\n"
9358           "vmovups (%7), %%ymm7\n"
9359           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
9360           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
9361           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
9362           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
9363           "vaddps %%ymm5, %%ymm4, %%ymm12\n"
9364           "vsubps %%ymm5, %%ymm4, %%ymm13\n"
9365           "vaddps %%ymm7, %%ymm6, %%ymm14\n"
9366           "vsubps %%ymm7, %%ymm6, %%ymm15\n"
9367           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
9368           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
9369           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
9370           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
9371           "vaddps %%ymm14, %%ymm12, %%ymm4\n"
9372           "vsubps %%ymm14, %%ymm12, %%ymm6\n"
9373           "vaddps %%ymm15, %%ymm13, %%ymm5\n"
9374           "vsubps %%ymm15, %%ymm13, %%ymm7\n"
9375           "vaddps %%ymm4, %%ymm0, %%ymm8\n"
9376           "vsubps %%ymm4, %%ymm0, %%ymm12\n"
9377           "vaddps %%ymm5, %%ymm1, %%ymm9\n"
9378           "vsubps %%ymm5, %%ymm1, %%ymm13\n"
9379           "vaddps %%ymm6, %%ymm2, %%ymm10\n"
9380           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
9381           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
9382           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
9383           "vmovups %%ymm8, (%0)\n"
9384           "vmovups %%ymm9, (%1)\n"
9385           "vmovups %%ymm10, (%2)\n"
9386           "vmovups %%ymm11, (%3)\n"
9387           "vmovups %%ymm12, (%4)\n"
9388           "vmovups %%ymm13, (%5)\n"
9389           "vmovups %%ymm14, (%6)\n"
9390           "vmovups %%ymm15, (%7)\n"
9391           :: "r"(buf + j + k + 0), "r"(buf + j + k + 16777216), "r"(buf + j + k + 33554432), "r"(buf + j + k + 50331648), "r"(buf + j + k + 67108864), "r"(buf + j + k + 83886080), "r"(buf + j + k + 100663296), "r"(buf + j + k + 117440512) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
9392         );
9393       }
9394     }
9395     return;
9396   }
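  /*
   * Final stage: 29 = 27 + 2, so the top-level combine is a radix-4
   * pass over four 2^27-element blocks (ymm0..ymm3) rather than the
   * radix-8 pass used by the lower branches.
   */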
9397   if (depth == 29) {
9398     helper_float_29_recursive(buf + 0, 27);
9399     helper_float_29_recursive(buf + 134217728, 27);
9400     helper_float_29_recursive(buf + 268435456, 27);
9401     helper_float_29_recursive(buf + 402653184, 27);
9402     for (int j = 0; j < 536870912; j += 536870912) {
9403       for (int k = 0; k < 134217728; k += 8) {
9404         __asm__ volatile (
9405           "vmovups (%0), %%ymm0\n"
9406           "vmovups (%1), %%ymm1\n"
9407           "vmovups (%2), %%ymm2\n"
9408           "vmovups (%3), %%ymm3\n"
9409           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
9410           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
9411           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
9412           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
9413           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
9414           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
9415           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
9416           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
9417           "vmovups %%ymm0, (%0)\n"
9418           "vmovups %%ymm1, (%1)\n"
9419           "vmovups %%ymm2, (%2)\n"
9420           "vmovups %%ymm3, (%3)\n"
9421           :: "r"(buf + j + k + 0), "r"(buf + j + k + 134217728), "r"(buf + j + k + 268435456), "r"(buf + j + k + 402653184) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
9422         );
9423       }
9424     }
9425     return;
9426   }
9427 }
9428 void helper_float_29(float *buf);
9429 void helper_float_29(float *buf) {
9430   helper_float_29_recursive(buf, 29);
9431 }
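/*
 * Size-2^30 variant. The depth-6 base case transforms 64 floats
 * entirely in registers. For each of eight ymm vectors it runs three
 * in-register butterfly stages:
 *   stride 1: vpermilps $160/$245 duplicate the even/odd float of
 *     each pair, and vaddsubps against the negated odd copy yields
 *     (u + v, u - v);
 *   stride 2: vpermilps $68/$238 with vblendps $204 pair elements two
 *     apart within each 128-bit lane;
 *   stride 4: the vperm2f128 pair forms (lo + hi, lo - hi) across the
 *     two 128-bit lanes.
 * Three cross-register butterfly stages then combine the eight
 * vectors, a radix-8 pass over strides 8, 16 and 32. Larger depths
 * recurse in steps of three; because 30 is divisible by 3, every
 * combining pass here is radix-8, with no radix-4 tail like the one
 * needed for depth 29.
 */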
9432 void helper_float_30_recursive(float *buf, int depth);
9433 void helper_float_30_recursive(float *buf, int depth) {
9434   if (depth == 6) {
9435     for (int j = 0; j < 64; j += 64) {
9436       for (int k = 0; k < 8; k += 8) {
9437         __asm__ volatile (
9438           "vmovups (%0), %%ymm0\n"
9439           "vmovups (%1), %%ymm1\n"
9440           "vmovups (%2), %%ymm2\n"
9441           "vmovups (%3), %%ymm3\n"
9442           "vmovups (%4), %%ymm4\n"
9443           "vmovups (%5), %%ymm5\n"
9444           "vmovups (%6), %%ymm6\n"
9445           "vmovups (%7), %%ymm7\n"
9446           "vpermilps $160, %%ymm0, %%ymm8\n"
9447           "vpermilps $245, %%ymm0, %%ymm9\n"
9448           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
9449           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
9450           "vaddsubps %%ymm11, %%ymm8, %%ymm0\n"
9451           "vpermilps $160, %%ymm1, %%ymm8\n"
9452           "vpermilps $245, %%ymm1, %%ymm9\n"
9453           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
9454           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
9455           "vaddsubps %%ymm11, %%ymm8, %%ymm1\n"
9456           "vpermilps $160, %%ymm2, %%ymm8\n"
9457           "vpermilps $245, %%ymm2, %%ymm9\n"
9458           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
9459           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
9460           "vaddsubps %%ymm11, %%ymm8, %%ymm2\n"
9461           "vpermilps $160, %%ymm3, %%ymm8\n"
9462           "vpermilps $245, %%ymm3, %%ymm9\n"
9463           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
9464           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
9465           "vaddsubps %%ymm11, %%ymm8, %%ymm3\n"
9466           "vpermilps $160, %%ymm4, %%ymm8\n"
9467           "vpermilps $245, %%ymm4, %%ymm9\n"
9468           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
9469           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
9470           "vaddsubps %%ymm11, %%ymm8, %%ymm4\n"
9471           "vpermilps $160, %%ymm5, %%ymm8\n"
9472           "vpermilps $245, %%ymm5, %%ymm9\n"
9473           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
9474           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
9475           "vaddsubps %%ymm11, %%ymm8, %%ymm5\n"
9476           "vpermilps $160, %%ymm6, %%ymm8\n"
9477           "vpermilps $245, %%ymm6, %%ymm9\n"
9478           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
9479           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
9480           "vaddsubps %%ymm11, %%ymm8, %%ymm6\n"
9481           "vpermilps $160, %%ymm7, %%ymm8\n"
9482           "vpermilps $245, %%ymm7, %%ymm9\n"
9483           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
9484           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
9485           "vaddsubps %%ymm11, %%ymm8, %%ymm7\n"
9486           "vpermilps $68, %%ymm0, %%ymm8\n"
9487           "vpermilps $238, %%ymm0, %%ymm9\n"
9488           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
9489           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
9490           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
9491           "vaddps %%ymm8, %%ymm12, %%ymm0\n"
9492           "vpermilps $68, %%ymm1, %%ymm8\n"
9493           "vpermilps $238, %%ymm1, %%ymm9\n"
9494           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
9495           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
9496           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
9497           "vaddps %%ymm8, %%ymm12, %%ymm1\n"
9498           "vpermilps $68, %%ymm2, %%ymm8\n"
9499           "vpermilps $238, %%ymm2, %%ymm9\n"
9500           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
9501           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
9502           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
9503           "vaddps %%ymm8, %%ymm12, %%ymm2\n"
9504           "vpermilps $68, %%ymm3, %%ymm8\n"
9505           "vpermilps $238, %%ymm3, %%ymm9\n"
9506           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
9507           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
9508           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
9509           "vaddps %%ymm8, %%ymm12, %%ymm3\n"
9510           "vpermilps $68, %%ymm4, %%ymm8\n"
9511           "vpermilps $238, %%ymm4, %%ymm9\n"
9512           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
9513           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
9514           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
9515           "vaddps %%ymm8, %%ymm12, %%ymm4\n"
9516           "vpermilps $68, %%ymm5, %%ymm8\n"
9517           "vpermilps $238, %%ymm5, %%ymm9\n"
9518           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
9519           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
9520           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
9521           "vaddps %%ymm8, %%ymm12, %%ymm5\n"
9522           "vpermilps $68, %%ymm6, %%ymm8\n"
9523           "vpermilps $238, %%ymm6, %%ymm9\n"
9524           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
9525           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
9526           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
9527           "vaddps %%ymm8, %%ymm12, %%ymm6\n"
9528           "vpermilps $68, %%ymm7, %%ymm8\n"
9529           "vpermilps $238, %%ymm7, %%ymm9\n"
9530           "vxorps %%ymm10, %%ymm10, %%ymm10\n"
9531           "vsubps %%ymm9, %%ymm10, %%ymm11\n"
9532           "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
9533           "vaddps %%ymm8, %%ymm12, %%ymm7\n"
9534           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
9535           "vsubps %%ymm0, %%ymm8, %%ymm9\n"
9536           "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n"
9537           "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n"
9538           "vaddps %%ymm10, %%ymm11, %%ymm0\n"
9539           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
9540           "vsubps %%ymm1, %%ymm8, %%ymm9\n"
9541           "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n"
9542           "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n"
9543           "vaddps %%ymm10, %%ymm11, %%ymm1\n"
9544           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
9545           "vsubps %%ymm2, %%ymm8, %%ymm9\n"
9546           "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n"
9547           "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n"
9548           "vaddps %%ymm10, %%ymm11, %%ymm2\n"
9549           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
9550           "vsubps %%ymm3, %%ymm8, %%ymm9\n"
9551           "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n"
9552           "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n"
9553           "vaddps %%ymm10, %%ymm11, %%ymm3\n"
9554           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
9555           "vsubps %%ymm4, %%ymm8, %%ymm9\n"
9556           "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n"
9557           "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n"
9558           "vaddps %%ymm10, %%ymm11, %%ymm4\n"
9559           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
9560           "vsubps %%ymm5, %%ymm8, %%ymm9\n"
9561           "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n"
9562           "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n"
9563           "vaddps %%ymm10, %%ymm11, %%ymm5\n"
9564           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
9565           "vsubps %%ymm6, %%ymm8, %%ymm9\n"
9566           "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n"
9567           "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n"
9568           "vaddps %%ymm10, %%ymm11, %%ymm6\n"
9569           "vxorps %%ymm8, %%ymm8, %%ymm8\n"
9570           "vsubps %%ymm7, %%ymm8, %%ymm9\n"
9571           "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n"
9572           "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n"
9573           "vaddps %%ymm10, %%ymm11, %%ymm7\n"
9574           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
9575           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
9576           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
9577           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
9578           "vaddps %%ymm5, %%ymm4, %%ymm12\n"
9579           "vsubps %%ymm5, %%ymm4, %%ymm13\n"
9580           "vaddps %%ymm7, %%ymm6, %%ymm14\n"
9581           "vsubps %%ymm7, %%ymm6, %%ymm15\n"
9582           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
9583           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
9584           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
9585           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
9586           "vaddps %%ymm14, %%ymm12, %%ymm4\n"
9587           "vsubps %%ymm14, %%ymm12, %%ymm6\n"
9588           "vaddps %%ymm15, %%ymm13, %%ymm5\n"
9589           "vsubps %%ymm15, %%ymm13, %%ymm7\n"
9590           "vaddps %%ymm4, %%ymm0, %%ymm8\n"
9591           "vsubps %%ymm4, %%ymm0, %%ymm12\n"
9592           "vaddps %%ymm5, %%ymm1, %%ymm9\n"
9593           "vsubps %%ymm5, %%ymm1, %%ymm13\n"
9594           "vaddps %%ymm6, %%ymm2, %%ymm10\n"
9595           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
9596           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
9597           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
9598           "vmovups %%ymm8, (%0)\n"
9599           "vmovups %%ymm9, (%1)\n"
9600           "vmovups %%ymm10, (%2)\n"
9601           "vmovups %%ymm11, (%3)\n"
9602           "vmovups %%ymm12, (%4)\n"
9603           "vmovups %%ymm13, (%5)\n"
9604           "vmovups %%ymm14, (%6)\n"
9605           "vmovups %%ymm15, (%7)\n"
9606           :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
9607         );
9608       }
9609     }
9610     return;
9611   }
9612   if (depth == 9) {
9613     helper_float_30_recursive(buf + 0, 6);
9614     helper_float_30_recursive(buf + 64, 6);
9615     helper_float_30_recursive(buf + 128, 6);
9616     helper_float_30_recursive(buf + 192, 6);
9617     helper_float_30_recursive(buf + 256, 6);
9618     helper_float_30_recursive(buf + 320, 6);
9619     helper_float_30_recursive(buf + 384, 6);
9620     helper_float_30_recursive(buf + 448, 6);
9621     for (int j = 0; j < 512; j += 512) {
9622       for (int k = 0; k < 64; k += 8) {
9623         __asm__ volatile (
9624           "vmovups (%0), %%ymm0\n"
9625           "vmovups (%1), %%ymm1\n"
9626           "vmovups (%2), %%ymm2\n"
9627           "vmovups (%3), %%ymm3\n"
9628           "vmovups (%4), %%ymm4\n"
9629           "vmovups (%5), %%ymm5\n"
9630           "vmovups (%6), %%ymm6\n"
9631           "vmovups (%7), %%ymm7\n"
9632           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
9633           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
9634           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
9635           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
9636           "vaddps %%ymm5, %%ymm4, %%ymm12\n"
9637           "vsubps %%ymm5, %%ymm4, %%ymm13\n"
9638           "vaddps %%ymm7, %%ymm6, %%ymm14\n"
9639           "vsubps %%ymm7, %%ymm6, %%ymm15\n"
9640           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
9641           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
9642           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
9643           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
9644           "vaddps %%ymm14, %%ymm12, %%ymm4\n"
9645           "vsubps %%ymm14, %%ymm12, %%ymm6\n"
9646           "vaddps %%ymm15, %%ymm13, %%ymm5\n"
9647           "vsubps %%ymm15, %%ymm13, %%ymm7\n"
9648           "vaddps %%ymm4, %%ymm0, %%ymm8\n"
9649           "vsubps %%ymm4, %%ymm0, %%ymm12\n"
9650           "vaddps %%ymm5, %%ymm1, %%ymm9\n"
9651           "vsubps %%ymm5, %%ymm1, %%ymm13\n"
9652           "vaddps %%ymm6, %%ymm2, %%ymm10\n"
9653           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
9654           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
9655           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
9656           "vmovups %%ymm8, (%0)\n"
9657           "vmovups %%ymm9, (%1)\n"
9658           "vmovups %%ymm10, (%2)\n"
9659           "vmovups %%ymm11, (%3)\n"
9660           "vmovups %%ymm12, (%4)\n"
9661           "vmovups %%ymm13, (%5)\n"
9662           "vmovups %%ymm14, (%6)\n"
9663           "vmovups %%ymm15, (%7)\n"
9664           :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
9665         );
9666       }
9667     }
9668     return;
9669   }
9670   if (depth == 12) {
9671     helper_float_30_recursive(buf + 0, 9);
9672     helper_float_30_recursive(buf + 512, 9);
9673     helper_float_30_recursive(buf + 1024, 9);
9674     helper_float_30_recursive(buf + 1536, 9);
9675     helper_float_30_recursive(buf + 2048, 9);
9676     helper_float_30_recursive(buf + 2560, 9);
9677     helper_float_30_recursive(buf + 3072, 9);
9678     helper_float_30_recursive(buf + 3584, 9);
9679     for (int j = 0; j < 4096; j += 4096) {
9680       for (int k = 0; k < 512; k += 8) {
9681         __asm__ volatile (
9682           "vmovups (%0), %%ymm0\n"
9683           "vmovups (%1), %%ymm1\n"
9684           "vmovups (%2), %%ymm2\n"
9685           "vmovups (%3), %%ymm3\n"
9686           "vmovups (%4), %%ymm4\n"
9687           "vmovups (%5), %%ymm5\n"
9688           "vmovups (%6), %%ymm6\n"
9689           "vmovups (%7), %%ymm7\n"
9690           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
9691           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
9692           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
9693           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
9694           "vaddps %%ymm5, %%ymm4, %%ymm12\n"
9695           "vsubps %%ymm5, %%ymm4, %%ymm13\n"
9696           "vaddps %%ymm7, %%ymm6, %%ymm14\n"
9697           "vsubps %%ymm7, %%ymm6, %%ymm15\n"
9698           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
9699           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
9700           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
9701           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
9702           "vaddps %%ymm14, %%ymm12, %%ymm4\n"
9703           "vsubps %%ymm14, %%ymm12, %%ymm6\n"
9704           "vaddps %%ymm15, %%ymm13, %%ymm5\n"
9705           "vsubps %%ymm15, %%ymm13, %%ymm7\n"
9706           "vaddps %%ymm4, %%ymm0, %%ymm8\n"
9707           "vsubps %%ymm4, %%ymm0, %%ymm12\n"
9708           "vaddps %%ymm5, %%ymm1, %%ymm9\n"
9709           "vsubps %%ymm5, %%ymm1, %%ymm13\n"
9710           "vaddps %%ymm6, %%ymm2, %%ymm10\n"
9711           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
9712           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
9713           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
9714           "vmovups %%ymm8, (%0)\n"
9715           "vmovups %%ymm9, (%1)\n"
9716           "vmovups %%ymm10, (%2)\n"
9717           "vmovups %%ymm11, (%3)\n"
9718           "vmovups %%ymm12, (%4)\n"
9719           "vmovups %%ymm13, (%5)\n"
9720           "vmovups %%ymm14, (%6)\n"
9721           "vmovups %%ymm15, (%7)\n"
9722           :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
9723         );
9724       }
9725     }
9726     return;
9727   }
9728   if (depth == 15) {
9729     helper_float_30_recursive(buf + 0, 12);
9730     helper_float_30_recursive(buf + 4096, 12);
9731     helper_float_30_recursive(buf + 8192, 12);
9732     helper_float_30_recursive(buf + 12288, 12);
9733     helper_float_30_recursive(buf + 16384, 12);
9734     helper_float_30_recursive(buf + 20480, 12);
9735     helper_float_30_recursive(buf + 24576, 12);
9736     helper_float_30_recursive(buf + 28672, 12);
9737     for (int j = 0; j < 32768; j += 32768) {
9738       for (int k = 0; k < 4096; k += 8) {
9739         __asm__ volatile (
9740           "vmovups (%0), %%ymm0\n"
9741           "vmovups (%1), %%ymm1\n"
9742           "vmovups (%2), %%ymm2\n"
9743           "vmovups (%3), %%ymm3\n"
9744           "vmovups (%4), %%ymm4\n"
9745           "vmovups (%5), %%ymm5\n"
9746           "vmovups (%6), %%ymm6\n"
9747           "vmovups (%7), %%ymm7\n"
9748           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
9749           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
9750           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
9751           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
9752           "vaddps %%ymm5, %%ymm4, %%ymm12\n"
9753           "vsubps %%ymm5, %%ymm4, %%ymm13\n"
9754           "vaddps %%ymm7, %%ymm6, %%ymm14\n"
9755           "vsubps %%ymm7, %%ymm6, %%ymm15\n"
9756           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
9757           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
9758           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
9759           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
9760           "vaddps %%ymm14, %%ymm12, %%ymm4\n"
9761           "vsubps %%ymm14, %%ymm12, %%ymm6\n"
9762           "vaddps %%ymm15, %%ymm13, %%ymm5\n"
9763           "vsubps %%ymm15, %%ymm13, %%ymm7\n"
9764           "vaddps %%ymm4, %%ymm0, %%ymm8\n"
9765           "vsubps %%ymm4, %%ymm0, %%ymm12\n"
9766           "vaddps %%ymm5, %%ymm1, %%ymm9\n"
9767           "vsubps %%ymm5, %%ymm1, %%ymm13\n"
9768           "vaddps %%ymm6, %%ymm2, %%ymm10\n"
9769           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
9770           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
9771           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
9772           "vmovups %%ymm8, (%0)\n"
9773           "vmovups %%ymm9, (%1)\n"
9774           "vmovups %%ymm10, (%2)\n"
9775           "vmovups %%ymm11, (%3)\n"
9776           "vmovups %%ymm12, (%4)\n"
9777           "vmovups %%ymm13, (%5)\n"
9778           "vmovups %%ymm14, (%6)\n"
9779           "vmovups %%ymm15, (%7)\n"
9780           :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
9781         );
9782       }
9783     }
9784     return;
9785   }
9786   if (depth == 18) {
9787     helper_float_30_recursive(buf + 0, 15);
9788     helper_float_30_recursive(buf + 32768, 15);
9789     helper_float_30_recursive(buf + 65536, 15);
9790     helper_float_30_recursive(buf + 98304, 15);
9791     helper_float_30_recursive(buf + 131072, 15);
9792     helper_float_30_recursive(buf + 163840, 15);
9793     helper_float_30_recursive(buf + 196608, 15);
9794     helper_float_30_recursive(buf + 229376, 15);
9795     for (int j = 0; j < 262144; j += 262144) {
9796       for (int k = 0; k < 32768; k += 8) {
9797         __asm__ volatile (
9798           "vmovups (%0), %%ymm0\n"
9799           "vmovups (%1), %%ymm1\n"
9800           "vmovups (%2), %%ymm2\n"
9801           "vmovups (%3), %%ymm3\n"
9802           "vmovups (%4), %%ymm4\n"
9803           "vmovups (%5), %%ymm5\n"
9804           "vmovups (%6), %%ymm6\n"
9805           "vmovups (%7), %%ymm7\n"
9806           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
9807           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
9808           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
9809           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
9810           "vaddps %%ymm5, %%ymm4, %%ymm12\n"
9811           "vsubps %%ymm5, %%ymm4, %%ymm13\n"
9812           "vaddps %%ymm7, %%ymm6, %%ymm14\n"
9813           "vsubps %%ymm7, %%ymm6, %%ymm15\n"
9814           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
9815           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
9816           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
9817           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
9818           "vaddps %%ymm14, %%ymm12, %%ymm4\n"
9819           "vsubps %%ymm14, %%ymm12, %%ymm6\n"
9820           "vaddps %%ymm15, %%ymm13, %%ymm5\n"
9821           "vsubps %%ymm15, %%ymm13, %%ymm7\n"
9822           "vaddps %%ymm4, %%ymm0, %%ymm8\n"
9823           "vsubps %%ymm4, %%ymm0, %%ymm12\n"
9824           "vaddps %%ymm5, %%ymm1, %%ymm9\n"
9825           "vsubps %%ymm5, %%ymm1, %%ymm13\n"
9826           "vaddps %%ymm6, %%ymm2, %%ymm10\n"
9827           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
9828           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
9829           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
9830           "vmovups %%ymm8, (%0)\n"
9831           "vmovups %%ymm9, (%1)\n"
9832           "vmovups %%ymm10, (%2)\n"
9833           "vmovups %%ymm11, (%3)\n"
9834           "vmovups %%ymm12, (%4)\n"
9835           "vmovups %%ymm13, (%5)\n"
9836           "vmovups %%ymm14, (%6)\n"
9837           "vmovups %%ymm15, (%7)\n"
9838           :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
9839         );
9840       }
9841     }
9842     return;
9843   }
9844   if (depth == 21) {
9845     helper_float_30_recursive(buf + 0, 18);
9846     helper_float_30_recursive(buf + 262144, 18);
9847     helper_float_30_recursive(buf + 524288, 18);
9848     helper_float_30_recursive(buf + 786432, 18);
9849     helper_float_30_recursive(buf + 1048576, 18);
9850     helper_float_30_recursive(buf + 1310720, 18);
9851     helper_float_30_recursive(buf + 1572864, 18);
9852     helper_float_30_recursive(buf + 1835008, 18);
9853     for (int j = 0; j < 2097152; j += 2097152) {
9854       for (int k = 0; k < 262144; k += 8) {
9855         __asm__ volatile (
9856           "vmovups (%0), %%ymm0\n"
9857           "vmovups (%1), %%ymm1\n"
9858           "vmovups (%2), %%ymm2\n"
9859           "vmovups (%3), %%ymm3\n"
9860           "vmovups (%4), %%ymm4\n"
9861           "vmovups (%5), %%ymm5\n"
9862           "vmovups (%6), %%ymm6\n"
9863           "vmovups (%7), %%ymm7\n"
9864           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
9865           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
9866           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
9867           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
9868           "vaddps %%ymm5, %%ymm4, %%ymm12\n"
9869           "vsubps %%ymm5, %%ymm4, %%ymm13\n"
9870           "vaddps %%ymm7, %%ymm6, %%ymm14\n"
9871           "vsubps %%ymm7, %%ymm6, %%ymm15\n"
9872           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
9873           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
9874           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
9875           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
9876           "vaddps %%ymm14, %%ymm12, %%ymm4\n"
9877           "vsubps %%ymm14, %%ymm12, %%ymm6\n"
9878           "vaddps %%ymm15, %%ymm13, %%ymm5\n"
9879           "vsubps %%ymm15, %%ymm13, %%ymm7\n"
9880           "vaddps %%ymm4, %%ymm0, %%ymm8\n"
9881           "vsubps %%ymm4, %%ymm0, %%ymm12\n"
9882           "vaddps %%ymm5, %%ymm1, %%ymm9\n"
9883           "vsubps %%ymm5, %%ymm1, %%ymm13\n"
9884           "vaddps %%ymm6, %%ymm2, %%ymm10\n"
9885           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
9886           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
9887           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
9888           "vmovups %%ymm8, (%0)\n"
9889           "vmovups %%ymm9, (%1)\n"
9890           "vmovups %%ymm10, (%2)\n"
9891           "vmovups %%ymm11, (%3)\n"
9892           "vmovups %%ymm12, (%4)\n"
9893           "vmovups %%ymm13, (%5)\n"
9894           "vmovups %%ymm14, (%6)\n"
9895           "vmovups %%ymm15, (%7)\n"
9896           :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1310720), "r"(buf + j + k + 1572864), "r"(buf + j + k + 1835008) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
9897         );
9898       }
9899     }
9900     return;
9901   }
9902   if (depth == 24) {
9903     helper_float_30_recursive(buf + 0, 21);
9904     helper_float_30_recursive(buf + 2097152, 21);
9905     helper_float_30_recursive(buf + 4194304, 21);
9906     helper_float_30_recursive(buf + 6291456, 21);
9907     helper_float_30_recursive(buf + 8388608, 21);
9908     helper_float_30_recursive(buf + 10485760, 21);
9909     helper_float_30_recursive(buf + 12582912, 21);
9910     helper_float_30_recursive(buf + 14680064, 21);
9911     for (int j = 0; j < 16777216; j += 16777216) {
9912       for (int k = 0; k < 2097152; k += 8) {
9913         __asm__ volatile (
9914           "vmovups (%0), %%ymm0\n"
9915           "vmovups (%1), %%ymm1\n"
9916           "vmovups (%2), %%ymm2\n"
9917           "vmovups (%3), %%ymm3\n"
9918           "vmovups (%4), %%ymm4\n"
9919           "vmovups (%5), %%ymm5\n"
9920           "vmovups (%6), %%ymm6\n"
9921           "vmovups (%7), %%ymm7\n"
9922           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
9923           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
9924           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
9925           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
9926           "vaddps %%ymm5, %%ymm4, %%ymm12\n"
9927           "vsubps %%ymm5, %%ymm4, %%ymm13\n"
9928           "vaddps %%ymm7, %%ymm6, %%ymm14\n"
9929           "vsubps %%ymm7, %%ymm6, %%ymm15\n"
9930           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
9931           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
9932           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
9933           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
9934           "vaddps %%ymm14, %%ymm12, %%ymm4\n"
9935           "vsubps %%ymm14, %%ymm12, %%ymm6\n"
9936           "vaddps %%ymm15, %%ymm13, %%ymm5\n"
9937           "vsubps %%ymm15, %%ymm13, %%ymm7\n"
9938           "vaddps %%ymm4, %%ymm0, %%ymm8\n"
9939           "vsubps %%ymm4, %%ymm0, %%ymm12\n"
9940           "vaddps %%ymm5, %%ymm1, %%ymm9\n"
9941           "vsubps %%ymm5, %%ymm1, %%ymm13\n"
9942           "vaddps %%ymm6, %%ymm2, %%ymm10\n"
9943           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
9944           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
9945           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
9946           "vmovups %%ymm8, (%0)\n"
9947           "vmovups %%ymm9, (%1)\n"
9948           "vmovups %%ymm10, (%2)\n"
9949           "vmovups %%ymm11, (%3)\n"
9950           "vmovups %%ymm12, (%4)\n"
9951           "vmovups %%ymm13, (%5)\n"
9952           "vmovups %%ymm14, (%6)\n"
9953           "vmovups %%ymm15, (%7)\n"
9954           :: "r"(buf + j + k + 0), "r"(buf + j + k + 2097152), "r"(buf + j + k + 4194304), "r"(buf + j + k + 6291456), "r"(buf + j + k + 8388608), "r"(buf + j + k + 10485760), "r"(buf + j + k + 12582912), "r"(buf + j + k + 14680064) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
9955         );
9956       }
9957     }
9958     return;
9959   }
9960   if (depth == 27) {
9961     helper_float_30_recursive(buf + 0, 24);
9962     helper_float_30_recursive(buf + 16777216, 24);
9963     helper_float_30_recursive(buf + 33554432, 24);
9964     helper_float_30_recursive(buf + 50331648, 24);
9965     helper_float_30_recursive(buf + 67108864, 24);
9966     helper_float_30_recursive(buf + 83886080, 24);
9967     helper_float_30_recursive(buf + 100663296, 24);
9968     helper_float_30_recursive(buf + 117440512, 24);
9969     for (int j = 0; j < 134217728; j += 134217728) {
9970       for (int k = 0; k < 16777216; k += 8) {
9971         __asm__ volatile (
9972           "vmovups (%0), %%ymm0\n"
9973           "vmovups (%1), %%ymm1\n"
9974           "vmovups (%2), %%ymm2\n"
9975           "vmovups (%3), %%ymm3\n"
9976           "vmovups (%4), %%ymm4\n"
9977           "vmovups (%5), %%ymm5\n"
9978           "vmovups (%6), %%ymm6\n"
9979           "vmovups (%7), %%ymm7\n"
9980           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
9981           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
9982           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
9983           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
9984           "vaddps %%ymm5, %%ymm4, %%ymm12\n"
9985           "vsubps %%ymm5, %%ymm4, %%ymm13\n"
9986           "vaddps %%ymm7, %%ymm6, %%ymm14\n"
9987           "vsubps %%ymm7, %%ymm6, %%ymm15\n"
9988           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
9989           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
9990           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
9991           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
9992           "vaddps %%ymm14, %%ymm12, %%ymm4\n"
9993           "vsubps %%ymm14, %%ymm12, %%ymm6\n"
9994           "vaddps %%ymm15, %%ymm13, %%ymm5\n"
9995           "vsubps %%ymm15, %%ymm13, %%ymm7\n"
9996           "vaddps %%ymm4, %%ymm0, %%ymm8\n"
9997           "vsubps %%ymm4, %%ymm0, %%ymm12\n"
9998           "vaddps %%ymm5, %%ymm1, %%ymm9\n"
9999           "vsubps %%ymm5, %%ymm1, %%ymm13\n"
10000           "vaddps %%ymm6, %%ymm2, %%ymm10\n"
10001           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
10002           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
10003           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
10004           "vmovups %%ymm8, (%0)\n"
10005           "vmovups %%ymm9, (%1)\n"
10006           "vmovups %%ymm10, (%2)\n"
10007           "vmovups %%ymm11, (%3)\n"
10008           "vmovups %%ymm12, (%4)\n"
10009           "vmovups %%ymm13, (%5)\n"
10010           "vmovups %%ymm14, (%6)\n"
10011           "vmovups %%ymm15, (%7)\n"
10012           :: "r"(buf + j + k + 0), "r"(buf + j + k + 16777216), "r"(buf + j + k + 33554432), "r"(buf + j + k + 50331648), "r"(buf + j + k + 67108864), "r"(buf + j + k + 83886080), "r"(buf + j + k + 100663296), "r"(buf + j + k + 117440512) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
10013         );
10014       }
10015     }
10016     return;
10017   }
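  /*
   * Top level: eight sub-transforms of 2^27 floats each, then a
   * radix-8 combine spanning the full 2^30-float (4 GiB) buffer.
   */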
10018   if (depth == 30) {
10019     helper_float_30_recursive(buf + 0, 27);
10020     helper_float_30_recursive(buf + 134217728, 27);
10021     helper_float_30_recursive(buf + 268435456, 27);
10022     helper_float_30_recursive(buf + 402653184, 27);
10023     helper_float_30_recursive(buf + 536870912, 27);
10024     helper_float_30_recursive(buf + 671088640, 27);
10025     helper_float_30_recursive(buf + 805306368, 27);
10026     helper_float_30_recursive(buf + 939524096, 27);
10027     for (int j = 0; j < 1073741824; j += 1073741824) {
10028       for (int k = 0; k < 134217728; k += 8) {
10029         __asm__ volatile (
10030           "vmovups (%0), %%ymm0\n"
10031           "vmovups (%1), %%ymm1\n"
10032           "vmovups (%2), %%ymm2\n"
10033           "vmovups (%3), %%ymm3\n"
10034           "vmovups (%4), %%ymm4\n"
10035           "vmovups (%5), %%ymm5\n"
10036           "vmovups (%6), %%ymm6\n"
10037           "vmovups (%7), %%ymm7\n"
10038           "vaddps %%ymm1, %%ymm0, %%ymm8\n"
10039           "vsubps %%ymm1, %%ymm0, %%ymm9\n"
10040           "vaddps %%ymm3, %%ymm2, %%ymm10\n"
10041           "vsubps %%ymm3, %%ymm2, %%ymm11\n"
10042           "vaddps %%ymm5, %%ymm4, %%ymm12\n"
10043           "vsubps %%ymm5, %%ymm4, %%ymm13\n"
10044           "vaddps %%ymm7, %%ymm6, %%ymm14\n"
10045           "vsubps %%ymm7, %%ymm6, %%ymm15\n"
10046           "vaddps %%ymm10, %%ymm8, %%ymm0\n"
10047           "vsubps %%ymm10, %%ymm8, %%ymm2\n"
10048           "vaddps %%ymm11, %%ymm9, %%ymm1\n"
10049           "vsubps %%ymm11, %%ymm9, %%ymm3\n"
10050           "vaddps %%ymm14, %%ymm12, %%ymm4\n"
10051           "vsubps %%ymm14, %%ymm12, %%ymm6\n"
10052           "vaddps %%ymm15, %%ymm13, %%ymm5\n"
10053           "vsubps %%ymm15, %%ymm13, %%ymm7\n"
10054           "vaddps %%ymm4, %%ymm0, %%ymm8\n"
10055           "vsubps %%ymm4, %%ymm0, %%ymm12\n"
10056           "vaddps %%ymm5, %%ymm1, %%ymm9\n"
10057           "vsubps %%ymm5, %%ymm1, %%ymm13\n"
10058           "vaddps %%ymm6, %%ymm2, %%ymm10\n"
10059           "vsubps %%ymm6, %%ymm2, %%ymm14\n"
10060           "vaddps %%ymm7, %%ymm3, %%ymm11\n"
10061           "vsubps %%ymm7, %%ymm3, %%ymm15\n"
10062           "vmovups %%ymm8, (%0)\n"
10063           "vmovups %%ymm9, (%1)\n"
10064           "vmovups %%ymm10, (%2)\n"
10065           "vmovups %%ymm11, (%3)\n"
10066           "vmovups %%ymm12, (%4)\n"
10067           "vmovups %%ymm13, (%5)\n"
10068           "vmovups %%ymm14, (%6)\n"
10069           "vmovups %%ymm15, (%7)\n"
10070           :: "r"(buf + j + k + 0), "r"(buf + j + k + 134217728), "r"(buf + j + k + 268435456), "r"(buf + j + k + 402653184), "r"(buf + j + k + 536870912), "r"(buf + j + k + 671088640), "r"(buf + j + k + 805306368), "r"(buf + j + k + 939524096) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
10071         );
10072       }
10073     }
10074     return;
10075   }
10076 }
10077 void helper_float_30(float *buf);
10078 void helper_float_30(float *buf) {
10079   helper_float_30_recursive(buf, 30);
10080 }
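/*
 * Public entry point. fht_float performs an in-place, unnormalized
 * Walsh-Hadamard transform on buf, which must hold 2^log_n floats.
 * It returns 0 on success and 1 when log_n is outside the supported
 * range 0..30. All vector loads and stores above use vmovups, so buf
 * does not need 32-byte alignment, although aligned buffers are
 * usually faster.
 *
 * Minimal usage sketch (a hypothetical caller, not part of this
 * file). Transforming a unit impulse of length 2^3 = 8 yields the
 * all-ones vector:
 *
 *   #include <stdio.h>
 *   #include <stdlib.h>
 *   #include "fht.h"
 *
 *   int main(void) {
 *     float *v = malloc(8 * sizeof(float));
 *     for (int i = 0; i < 8; ++i) v[i] = (i == 0) ? 1.0f : 0.0f;
 *     if (fht_float(v, 3) != 0) return 1;
 *     for (int i = 0; i < 8; ++i) printf("%g\n", v[i]);  // prints 1 eight times
 *     free(v);
 *     return 0;
 *   }
 */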
10081 int fht_float(float *buf, int log_n) {
10082   if (log_n == 0) {
10083     return 0;
10084   }
10085   if (log_n == 1) {
10086     helper_float_1(buf);
10087     return 0;
10088   }
10089   if (log_n == 2) {
10090     helper_float_2(buf);
10091     return 0;
10092   }
10093   if (log_n == 3) {
10094     helper_float_3(buf);
10095     return 0;
10096   }
10097   if (log_n == 4) {
10098     helper_float_4(buf);
10099     return 0;
10100   }
10101   if (log_n == 5) {
10102     helper_float_5(buf);
10103     return 0;
10104   }
10105   if (log_n == 6) {
10106     helper_float_6(buf);
10107     return 0;
10108   }
10109   if (log_n == 7) {
10110     helper_float_7(buf);
10111     return 0;
10112   }
10113   if (log_n == 8) {
10114     helper_float_8(buf);
10115     return 0;
10116   }
10117   if (log_n == 9) {
10118     helper_float_9(buf);
10119     return 0;
10120   }
10121   if (log_n == 10) {
10122     helper_float_10(buf);
10123     return 0;
10124   }
10125   if (log_n == 11) {
10126     helper_float_11(buf);
10127     return 0;
10128   }
10129   if (log_n == 12) {
10130     helper_float_12(buf);
10131     return 0;
10132   }
10133   if (log_n == 13) {
10134     helper_float_13(buf);
10135     return 0;
10136   }
10137   if (log_n == 14) {
10138     helper_float_14(buf);
10139     return 0;
10140   }
10141   if (log_n == 15) {
10142     helper_float_15(buf);
10143     return 0;
10144   }
10145   if (log_n == 16) {
10146     helper_float_16(buf);
10147     return 0;
10148   }
10149   if (log_n == 17) {
10150     helper_float_17(buf);
10151     return 0;
10152   }
10153   if (log_n == 18) {
10154     helper_float_18(buf);
10155     return 0;
10156   }
10157   if (log_n == 19) {
10158     helper_float_19(buf);
10159     return 0;
10160   }
10161   if (log_n == 20) {
10162     helper_float_20(buf);
10163     return 0;
10164   }
10165   if (log_n == 21) {
10166     helper_float_21(buf);
10167     return 0;
10168   }
10169   if (log_n == 22) {
10170     helper_float_22(buf);
10171     return 0;
10172   }
10173   if (log_n == 23) {
10174     helper_float_23(buf);
10175     return 0;
10176   }
10177   if (log_n == 24) {
10178     helper_float_24(buf);
10179     return 0;
10180   }
10181   if (log_n == 25) {
10182     helper_float_25(buf);
10183     return 0;
10184   }
10185   if (log_n == 26) {
10186     helper_float_26(buf);
10187     return 0;
10188   }
10189   if (log_n == 27) {
10190     helper_float_27(buf);
10191     return 0;
10192   }
10193   if (log_n == 28) {
10194     helper_float_28(buf);
10195     return 0;
10196   }
10197   if (log_n == 29) {
10198     helper_float_29(buf);
10199     return 0;
10200   }
10201   if (log_n == 30) {
10202     helper_float_30(buf);
10203     return 0;
10204   }
10205   return 1;
10206 }
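/*
 * The double-precision helpers below mirror the float code. A ymm
 * register now holds four doubles instead of eight floats, so each
 * vector supplies two in-register butterfly stages (vpermilpd at
 * stride 1, vperm2f128 at stride 2) and all pointer strides are
 * counted in doubles.
 */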
10207 static inline void helper_double_1(double *buf);
10208 static inline void helper_double_1(double *buf) {
10209   for (int j = 0; j < 2; j += 2) {
10210     for (int k = 0; k < 1; ++k) {
10211       double u = buf[j + k];
10212       double v = buf[j + k + 1];
10213       buf[j + k] = u + v;
10214       buf[j + k + 1] = u - v;
10215     }
10216   }
10217 }
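/*
 * helper_double_1 is the scalar 2-point butterfly: it multiplies
 * (buf[0], buf[1]) by
 *
 *   H_2 = [ 1  1 ]
 *         [ 1 -1 ]
 *
 * Every larger transform in this file is the Kronecker power
 * H_{2^n} = H_2 (x) H_{2^(n-1)}, which is why each helper decomposes
 * into layers of this same add/subtract pair at growing strides.
 */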
10218 static inline void helper_double_2(double *buf);
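/*
 * In-register butterflies for doubles: vpermilpd $0/$15 duplicate the
 * even/odd double of each pair, vaddsubpd against the negated odd
 * copy yields (u + v, u - v), and the vperm2f128 sequence then forms
 * (lo + hi, lo - hi) across the two 128-bit lanes.
 */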
10219 static inline void helper_double_2(double *buf) {
10220   for (int j = 0; j < 4; j += 4) {
10221     __asm__ volatile (
10222       "vmovupd (%0), %%ymm0\n"
10223       "vpermilpd $0, %%ymm0, %%ymm8\n"
10224       "vpermilpd $15, %%ymm0, %%ymm9\n"
10225       "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
10226       "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
10227       "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n"
10228       "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n"
10229       "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
10230       "vsubpd %%ymm0, %%ymm9, %%ymm10\n"
10231       "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n"
10232       "vaddpd %%ymm11, %%ymm8, %%ymm0\n"
10233       "vmovupd %%ymm0, (%0)\n"
10234       :: "r"(buf + j) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
10235     );
10236   }
10237 }
10238 static inline void helper_double_3(double *buf);
10239 static inline void helper_double_3(double *buf) {
10240   for (int j = 0; j < 8; j += 8) {
10241     for (int k = 0; k < 4; k += 4) {
10242       __asm__ volatile (
10243         "vmovupd (%0), %%ymm0\n"
10244         "vmovupd (%1), %%ymm1\n"
10245         "vpermilpd $0, %%ymm0, %%ymm8\n"
10246         "vpermilpd $15, %%ymm0, %%ymm9\n"
10247         "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
10248         "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
10249         "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n"
10250         "vpermilpd $0, %%ymm1, %%ymm8\n"
10251         "vpermilpd $15, %%ymm1, %%ymm9\n"
10252         "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
10253         "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
10254         "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n"
10255         "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n"
10256         "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
10257         "vsubpd %%ymm0, %%ymm9, %%ymm10\n"
10258         "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n"
10259         "vaddpd %%ymm11, %%ymm8, %%ymm0\n"
10260         "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n"
10261         "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
10262         "vsubpd %%ymm1, %%ymm9, %%ymm10\n"
10263         "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n"
10264         "vaddpd %%ymm11, %%ymm8, %%ymm1\n"
10265         "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
10266         "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
10267         "vmovupd %%ymm8, (%0)\n"
10268         "vmovupd %%ymm9, (%1)\n"
10269         :: "r"(buf + j + k + 0), "r"(buf + j + k + 4) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
10270       );
10271     }
10272   }
10273 }
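/*
 * 16-point double transform. The depth-4 base case loads four ymm
 * vectors (16 doubles), runs both in-register stages on each, then
 * does a radix-4 cross-register combine. The recursive wrapper only
 * mirrors the shape of the generated float helpers; it is always
 * called with depth == 4.
 */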
10274 void helper_double_4_recursive(double *buf, int depth);
10275 void helper_double_4_recursive(double *buf, int depth) {
10276   if (depth == 4) {
10277     for (int j = 0; j < 16; j += 16) {
10278       for (int k = 0; k < 4; k += 4) {
10279         __asm__ volatile (
10280           "vmovupd (%0), %%ymm0\n"
10281           "vmovupd (%1), %%ymm1\n"
10282           "vmovupd (%2), %%ymm2\n"
10283           "vmovupd (%3), %%ymm3\n"
10284           "vpermilpd $0, %%ymm0, %%ymm8\n"
10285           "vpermilpd $15, %%ymm0, %%ymm9\n"
10286           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
10287           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
10288           "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n"
10289           "vpermilpd $0, %%ymm1, %%ymm8\n"
10290           "vpermilpd $15, %%ymm1, %%ymm9\n"
10291           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
10292           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
10293           "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n"
10294           "vpermilpd $0, %%ymm2, %%ymm8\n"
10295           "vpermilpd $15, %%ymm2, %%ymm9\n"
10296           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
10297           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
10298           "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n"
10299           "vpermilpd $0, %%ymm3, %%ymm8\n"
10300           "vpermilpd $15, %%ymm3, %%ymm9\n"
10301           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
10302           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
10303           "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n"
10304           "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n"
10305           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
10306           "vsubpd %%ymm0, %%ymm9, %%ymm10\n"
10307           "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n"
10308           "vaddpd %%ymm11, %%ymm8, %%ymm0\n"
10309           "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n"
10310           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
10311           "vsubpd %%ymm1, %%ymm9, %%ymm10\n"
10312           "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n"
10313           "vaddpd %%ymm11, %%ymm8, %%ymm1\n"
10314           "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n"
10315           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
10316           "vsubpd %%ymm2, %%ymm9, %%ymm10\n"
10317           "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n"
10318           "vaddpd %%ymm11, %%ymm8, %%ymm2\n"
10319           "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n"
10320           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
10321           "vsubpd %%ymm3, %%ymm9, %%ymm10\n"
10322           "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n"
10323           "vaddpd %%ymm11, %%ymm8, %%ymm3\n"
10324           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
10325           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
10326           "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
10327           "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
10328           "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
10329           "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
10330           "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
10331           "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
10332           "vmovupd %%ymm0, (%0)\n"
10333           "vmovupd %%ymm1, (%1)\n"
10334           "vmovupd %%ymm2, (%2)\n"
10335           "vmovupd %%ymm3, (%3)\n"
10336           :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
10337         );
10338       }
10339     }
10340     return;
10341   }
10342 }
10343 void helper_double_4(double *buf);
10344 void helper_double_4(double *buf) {
10345   helper_double_4_recursive(buf, 4);
10346 }
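/* Note: helper_double_4_recursive is only ever invoked with depth == 4
 * (see helper_double_4 above), so the single if-branch is the whole
 * function: strides 1-2 in-register, then a radix-4 vaddpd/vsubpd
 * network across ymm0..ymm3 for strides 4 and 8. */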
10347 static inline void helper_double_5(double *buf);
10348 static inline void helper_double_5(double *buf) {
10349   for (int j = 0; j < 32; j += 32) {
10350     for (int k = 0; k < 4; k += 4) {
10351       __asm__ volatile (
10352         "vmovupd (%0), %%ymm0\n"
10353         "vmovupd (%1), %%ymm1\n"
10354         "vmovupd (%2), %%ymm2\n"
10355         "vmovupd (%3), %%ymm3\n"
10356         "vmovupd (%4), %%ymm4\n"
10357         "vmovupd (%5), %%ymm5\n"
10358         "vmovupd (%6), %%ymm6\n"
10359         "vmovupd (%7), %%ymm7\n"
10360         "vpermilpd $0, %%ymm0, %%ymm8\n"
10361         "vpermilpd $15, %%ymm0, %%ymm9\n"
10362         "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
10363         "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
10364         "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n"
10365         "vpermilpd $0, %%ymm1, %%ymm8\n"
10366         "vpermilpd $15, %%ymm1, %%ymm9\n"
10367         "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
10368         "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
10369         "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n"
10370         "vpermilpd $0, %%ymm2, %%ymm8\n"
10371         "vpermilpd $15, %%ymm2, %%ymm9\n"
10372         "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
10373         "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
10374         "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n"
10375         "vpermilpd $0, %%ymm3, %%ymm8\n"
10376         "vpermilpd $15, %%ymm3, %%ymm9\n"
10377         "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
10378         "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
10379         "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n"
10380         "vpermilpd $0, %%ymm4, %%ymm8\n"
10381         "vpermilpd $15, %%ymm4, %%ymm9\n"
10382         "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
10383         "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
10384         "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n"
10385         "vpermilpd $0, %%ymm5, %%ymm8\n"
10386         "vpermilpd $15, %%ymm5, %%ymm9\n"
10387         "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
10388         "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
10389         "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n"
10390         "vpermilpd $0, %%ymm6, %%ymm8\n"
10391         "vpermilpd $15, %%ymm6, %%ymm9\n"
10392         "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
10393         "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
10394         "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n"
10395         "vpermilpd $0, %%ymm7, %%ymm8\n"
10396         "vpermilpd $15, %%ymm7, %%ymm9\n"
10397         "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
10398         "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
10399         "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n"
10400         "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n"
10401         "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
10402         "vsubpd %%ymm0, %%ymm9, %%ymm10\n"
10403         "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n"
10404         "vaddpd %%ymm11, %%ymm8, %%ymm0\n"
10405         "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n"
10406         "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
10407         "vsubpd %%ymm1, %%ymm9, %%ymm10\n"
10408         "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n"
10409         "vaddpd %%ymm11, %%ymm8, %%ymm1\n"
10410         "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n"
10411         "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
10412         "vsubpd %%ymm2, %%ymm9, %%ymm10\n"
10413         "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n"
10414         "vaddpd %%ymm11, %%ymm8, %%ymm2\n"
10415         "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n"
10416         "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
10417         "vsubpd %%ymm3, %%ymm9, %%ymm10\n"
10418         "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n"
10419         "vaddpd %%ymm11, %%ymm8, %%ymm3\n"
10420         "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n"
10421         "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
10422         "vsubpd %%ymm4, %%ymm9, %%ymm10\n"
10423         "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n"
10424         "vaddpd %%ymm11, %%ymm8, %%ymm4\n"
10425         "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n"
10426         "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
10427         "vsubpd %%ymm5, %%ymm9, %%ymm10\n"
10428         "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n"
10429         "vaddpd %%ymm11, %%ymm8, %%ymm5\n"
10430         "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n"
10431         "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
10432         "vsubpd %%ymm6, %%ymm9, %%ymm10\n"
10433         "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n"
10434         "vaddpd %%ymm11, %%ymm8, %%ymm6\n"
10435         "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n"
10436         "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
10437         "vsubpd %%ymm7, %%ymm9, %%ymm10\n"
10438         "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n"
10439         "vaddpd %%ymm11, %%ymm8, %%ymm7\n"
10440         "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
10441         "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
10442         "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
10443         "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
10444         "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
10445         "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
10446         "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
10447         "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
10448         "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
10449         "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
10450         "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
10451         "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
10452         "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
10453         "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
10454         "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
10455         "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
10456         "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
10457         "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
10458         "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
10459         "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
10460         "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
10461         "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
10462         "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
10463         "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
10464         "vmovupd %%ymm8, (%0)\n"
10465         "vmovupd %%ymm9, (%1)\n"
10466         "vmovupd %%ymm10, (%2)\n"
10467         "vmovupd %%ymm11, (%3)\n"
10468         "vmovupd %%ymm12, (%4)\n"
10469         "vmovupd %%ymm13, (%5)\n"
10470         "vmovupd %%ymm14, (%6)\n"
10471         "vmovupd %%ymm15, (%7)\n"
10472         :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
10473       );
10474     }
10475   }
10476 }
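/* helper_double_5 is the workhorse 32-double kernel: strides 1-2
 * in-register on eight ymm registers, then a radix-8 vaddpd/vsubpd
 * network for strides 4, 8 and 16. Every larger helper below reuses
 * this block as its first pass over each 32-double chunk. */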
10477 static inline void helper_double_6(double *buf);
10478 static inline void helper_double_6(double *buf) {
10479   for (int j = 0; j < 64; j += 32) {
10480     for (int k = 0; k < 4; k += 4) {
10481       __asm__ volatile (
10482         "vmovupd (%0), %%ymm0\n"
10483         "vmovupd (%1), %%ymm1\n"
10484         "vmovupd (%2), %%ymm2\n"
10485         "vmovupd (%3), %%ymm3\n"
10486         "vmovupd (%4), %%ymm4\n"
10487         "vmovupd (%5), %%ymm5\n"
10488         "vmovupd (%6), %%ymm6\n"
10489         "vmovupd (%7), %%ymm7\n"
10490         "vpermilpd $0, %%ymm0, %%ymm8\n"
10491         "vpermilpd $15, %%ymm0, %%ymm9\n"
10492         "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
10493         "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
10494         "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n"
10495         "vpermilpd $0, %%ymm1, %%ymm8\n"
10496         "vpermilpd $15, %%ymm1, %%ymm9\n"
10497         "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
10498         "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
10499         "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n"
10500         "vpermilpd $0, %%ymm2, %%ymm8\n"
10501         "vpermilpd $15, %%ymm2, %%ymm9\n"
10502         "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
10503         "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
10504         "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n"
10505         "vpermilpd $0, %%ymm3, %%ymm8\n"
10506         "vpermilpd $15, %%ymm3, %%ymm9\n"
10507         "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
10508         "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
10509         "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n"
10510         "vpermilpd $0, %%ymm4, %%ymm8\n"
10511         "vpermilpd $15, %%ymm4, %%ymm9\n"
10512         "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
10513         "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
10514         "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n"
10515         "vpermilpd $0, %%ymm5, %%ymm8\n"
10516         "vpermilpd $15, %%ymm5, %%ymm9\n"
10517         "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
10518         "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
10519         "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n"
10520         "vpermilpd $0, %%ymm6, %%ymm8\n"
10521         "vpermilpd $15, %%ymm6, %%ymm9\n"
10522         "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
10523         "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
10524         "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n"
10525         "vpermilpd $0, %%ymm7, %%ymm8\n"
10526         "vpermilpd $15, %%ymm7, %%ymm9\n"
10527         "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
10528         "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
10529         "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n"
10530         "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n"
10531         "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
10532         "vsubpd %%ymm0, %%ymm9, %%ymm10\n"
10533         "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n"
10534         "vaddpd %%ymm11, %%ymm8, %%ymm0\n"
10535         "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n"
10536         "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
10537         "vsubpd %%ymm1, %%ymm9, %%ymm10\n"
10538         "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n"
10539         "vaddpd %%ymm11, %%ymm8, %%ymm1\n"
10540         "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n"
10541         "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
10542         "vsubpd %%ymm2, %%ymm9, %%ymm10\n"
10543         "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n"
10544         "vaddpd %%ymm11, %%ymm8, %%ymm2\n"
10545         "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n"
10546         "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
10547         "vsubpd %%ymm3, %%ymm9, %%ymm10\n"
10548         "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n"
10549         "vaddpd %%ymm11, %%ymm8, %%ymm3\n"
10550         "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n"
10551         "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
10552         "vsubpd %%ymm4, %%ymm9, %%ymm10\n"
10553         "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n"
10554         "vaddpd %%ymm11, %%ymm8, %%ymm4\n"
10555         "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n"
10556         "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
10557         "vsubpd %%ymm5, %%ymm9, %%ymm10\n"
10558         "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n"
10559         "vaddpd %%ymm11, %%ymm8, %%ymm5\n"
10560         "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n"
10561         "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
10562         "vsubpd %%ymm6, %%ymm9, %%ymm10\n"
10563         "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n"
10564         "vaddpd %%ymm11, %%ymm8, %%ymm6\n"
10565         "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n"
10566         "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
10567         "vsubpd %%ymm7, %%ymm9, %%ymm10\n"
10568         "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n"
10569         "vaddpd %%ymm11, %%ymm8, %%ymm7\n"
10570         "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
10571         "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
10572         "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
10573         "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
10574         "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
10575         "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
10576         "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
10577         "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
10578         "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
10579         "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
10580         "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
10581         "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
10582         "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
10583         "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
10584         "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
10585         "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
10586         "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
10587         "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
10588         "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
10589         "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
10590         "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
10591         "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
10592         "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
10593         "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
10594         "vmovupd %%ymm8, (%0)\n"
10595         "vmovupd %%ymm9, (%1)\n"
10596         "vmovupd %%ymm10, (%2)\n"
10597         "vmovupd %%ymm11, (%3)\n"
10598         "vmovupd %%ymm12, (%4)\n"
10599         "vmovupd %%ymm13, (%5)\n"
10600         "vmovupd %%ymm14, (%6)\n"
10601         "vmovupd %%ymm15, (%7)\n"
10602         :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
10603       );
10604     }
10605   }
10606   for (int j = 0; j < 64; j += 64) {
10607     for (int k = 0; k < 32; k += 4) {
10608       __asm__ volatile (
10609         "vmovupd (%0), %%ymm0\n"
10610         "vmovupd (%1), %%ymm1\n"
10611         "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
10612         "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
10613         "vmovupd %%ymm8, (%0)\n"
10614         "vmovupd %%ymm9, (%1)\n"
10615         :: "r"(buf + j + k + 0), "r"(buf + j + k + 32) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
10616       );
10617     }
10618   }
10619 }
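/* helper_double_6: the first loop runs the 32-double kernel on both
 * halves of the buffer; the second loop is a radix-2 pass at stride 32
 * (buf[k] +/- buf[k + 32]) that completes the final level. */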
10620 static inline void helper_double_7(double *buf);
10621 static inline void helper_double_7(double *buf) {
10622   for (int j = 0; j < 128; j += 32) {
10623     for (int k = 0; k < 4; k += 4) {
10624       __asm__ volatile (
10625         "vmovupd (%0), %%ymm0\n"
10626         "vmovupd (%1), %%ymm1\n"
10627         "vmovupd (%2), %%ymm2\n"
10628         "vmovupd (%3), %%ymm3\n"
10629         "vmovupd (%4), %%ymm4\n"
10630         "vmovupd (%5), %%ymm5\n"
10631         "vmovupd (%6), %%ymm6\n"
10632         "vmovupd (%7), %%ymm7\n"
10633         "vpermilpd $0, %%ymm0, %%ymm8\n"
10634         "vpermilpd $15, %%ymm0, %%ymm9\n"
10635         "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
10636         "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
10637         "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n"
10638         "vpermilpd $0, %%ymm1, %%ymm8\n"
10639         "vpermilpd $15, %%ymm1, %%ymm9\n"
10640         "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
10641         "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
10642         "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n"
10643         "vpermilpd $0, %%ymm2, %%ymm8\n"
10644         "vpermilpd $15, %%ymm2, %%ymm9\n"
10645         "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
10646         "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
10647         "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n"
10648         "vpermilpd $0, %%ymm3, %%ymm8\n"
10649         "vpermilpd $15, %%ymm3, %%ymm9\n"
10650         "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
10651         "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
10652         "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n"
10653         "vpermilpd $0, %%ymm4, %%ymm8\n"
10654         "vpermilpd $15, %%ymm4, %%ymm9\n"
10655         "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
10656         "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
10657         "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n"
10658         "vpermilpd $0, %%ymm5, %%ymm8\n"
10659         "vpermilpd $15, %%ymm5, %%ymm9\n"
10660         "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
10661         "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
10662         "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n"
10663         "vpermilpd $0, %%ymm6, %%ymm8\n"
10664         "vpermilpd $15, %%ymm6, %%ymm9\n"
10665         "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
10666         "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
10667         "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n"
10668         "vpermilpd $0, %%ymm7, %%ymm8\n"
10669         "vpermilpd $15, %%ymm7, %%ymm9\n"
10670         "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
10671         "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
10672         "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n"
10673         "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n"
10674         "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
10675         "vsubpd %%ymm0, %%ymm9, %%ymm10\n"
10676         "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n"
10677         "vaddpd %%ymm11, %%ymm8, %%ymm0\n"
10678         "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n"
10679         "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
10680         "vsubpd %%ymm1, %%ymm9, %%ymm10\n"
10681         "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n"
10682         "vaddpd %%ymm11, %%ymm8, %%ymm1\n"
10683         "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n"
10684         "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
10685         "vsubpd %%ymm2, %%ymm9, %%ymm10\n"
10686         "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n"
10687         "vaddpd %%ymm11, %%ymm8, %%ymm2\n"
10688         "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n"
10689         "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
10690         "vsubpd %%ymm3, %%ymm9, %%ymm10\n"
10691         "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n"
10692         "vaddpd %%ymm11, %%ymm8, %%ymm3\n"
10693         "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n"
10694         "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
10695         "vsubpd %%ymm4, %%ymm9, %%ymm10\n"
10696         "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n"
10697         "vaddpd %%ymm11, %%ymm8, %%ymm4\n"
10698         "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n"
10699         "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
10700         "vsubpd %%ymm5, %%ymm9, %%ymm10\n"
10701         "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n"
10702         "vaddpd %%ymm11, %%ymm8, %%ymm5\n"
10703         "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n"
10704         "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
10705         "vsubpd %%ymm6, %%ymm9, %%ymm10\n"
10706         "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n"
10707         "vaddpd %%ymm11, %%ymm8, %%ymm6\n"
10708         "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n"
10709         "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
10710         "vsubpd %%ymm7, %%ymm9, %%ymm10\n"
10711         "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n"
10712         "vaddpd %%ymm11, %%ymm8, %%ymm7\n"
10713         "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
10714         "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
10715         "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
10716         "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
10717         "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
10718         "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
10719         "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
10720         "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
10721         "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
10722         "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
10723         "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
10724         "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
10725         "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
10726         "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
10727         "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
10728         "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
10729         "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
10730         "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
10731         "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
10732         "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
10733         "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
10734         "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
10735         "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
10736         "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
10737         "vmovupd %%ymm8, (%0)\n"
10738         "vmovupd %%ymm9, (%1)\n"
10739         "vmovupd %%ymm10, (%2)\n"
10740         "vmovupd %%ymm11, (%3)\n"
10741         "vmovupd %%ymm12, (%4)\n"
10742         "vmovupd %%ymm13, (%5)\n"
10743         "vmovupd %%ymm14, (%6)\n"
10744         "vmovupd %%ymm15, (%7)\n"
10745         :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
10746       );
10747     }
10748   }
10749   for (int j = 0; j < 128; j += 128) {
10750     for (int k = 0; k < 32; k += 4) {
10751       __asm__ volatile (
10752         "vmovupd (%0), %%ymm0\n"
10753         "vmovupd (%1), %%ymm1\n"
10754         "vmovupd (%2), %%ymm2\n"
10755         "vmovupd (%3), %%ymm3\n"
10756         "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
10757         "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
10758         "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
10759         "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
10760         "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
10761         "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
10762         "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
10763         "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
10764         "vmovupd %%ymm0, (%0)\n"
10765         "vmovupd %%ymm1, (%1)\n"
10766         "vmovupd %%ymm2, (%2)\n"
10767         "vmovupd %%ymm3, (%3)\n"
10768         :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
10769       );
10770     }
10771   }
10772 }
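/* helper_double_7: 32-double kernel on each of four chunks, then a
 * radix-4 pass at stride 32 covering the last two levels (strides 32
 * and 64) in a single read-modify-write of four ymm registers. */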
10773 static inline void helper_double_8(double *buf);
10774 static inline void helper_double_8(double *buf) {
10775   for (int j = 0; j < 256; j += 32) {
10776     for (int k = 0; k < 4; k += 4) {
10777       __asm__ volatile (
10778         "vmovupd (%0), %%ymm0\n"
10779         "vmovupd (%1), %%ymm1\n"
10780         "vmovupd (%2), %%ymm2\n"
10781         "vmovupd (%3), %%ymm3\n"
10782         "vmovupd (%4), %%ymm4\n"
10783         "vmovupd (%5), %%ymm5\n"
10784         "vmovupd (%6), %%ymm6\n"
10785         "vmovupd (%7), %%ymm7\n"
10786         "vpermilpd $0, %%ymm0, %%ymm8\n"
10787         "vpermilpd $15, %%ymm0, %%ymm9\n"
10788         "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
10789         "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
10790         "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n"
10791         "vpermilpd $0, %%ymm1, %%ymm8\n"
10792         "vpermilpd $15, %%ymm1, %%ymm9\n"
10793         "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
10794         "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
10795         "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n"
10796         "vpermilpd $0, %%ymm2, %%ymm8\n"
10797         "vpermilpd $15, %%ymm2, %%ymm9\n"
10798         "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
10799         "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
10800         "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n"
10801         "vpermilpd $0, %%ymm3, %%ymm8\n"
10802         "vpermilpd $15, %%ymm3, %%ymm9\n"
10803         "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
10804         "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
10805         "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n"
10806         "vpermilpd $0, %%ymm4, %%ymm8\n"
10807         "vpermilpd $15, %%ymm4, %%ymm9\n"
10808         "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
10809         "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
10810         "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n"
10811         "vpermilpd $0, %%ymm5, %%ymm8\n"
10812         "vpermilpd $15, %%ymm5, %%ymm9\n"
10813         "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
10814         "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
10815         "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n"
10816         "vpermilpd $0, %%ymm6, %%ymm8\n"
10817         "vpermilpd $15, %%ymm6, %%ymm9\n"
10818         "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
10819         "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
10820         "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n"
10821         "vpermilpd $0, %%ymm7, %%ymm8\n"
10822         "vpermilpd $15, %%ymm7, %%ymm9\n"
10823         "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
10824         "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
10825         "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n"
10826         "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n"
10827         "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
10828         "vsubpd %%ymm0, %%ymm9, %%ymm10\n"
10829         "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n"
10830         "vaddpd %%ymm11, %%ymm8, %%ymm0\n"
10831         "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n"
10832         "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
10833         "vsubpd %%ymm1, %%ymm9, %%ymm10\n"
10834         "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n"
10835         "vaddpd %%ymm11, %%ymm8, %%ymm1\n"
10836         "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n"
10837         "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
10838         "vsubpd %%ymm2, %%ymm9, %%ymm10\n"
10839         "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n"
10840         "vaddpd %%ymm11, %%ymm8, %%ymm2\n"
10841         "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n"
10842         "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
10843         "vsubpd %%ymm3, %%ymm9, %%ymm10\n"
10844         "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n"
10845         "vaddpd %%ymm11, %%ymm8, %%ymm3\n"
10846         "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n"
10847         "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
10848         "vsubpd %%ymm4, %%ymm9, %%ymm10\n"
10849         "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n"
10850         "vaddpd %%ymm11, %%ymm8, %%ymm4\n"
10851         "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n"
10852         "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
10853         "vsubpd %%ymm5, %%ymm9, %%ymm10\n"
10854         "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n"
10855         "vaddpd %%ymm11, %%ymm8, %%ymm5\n"
10856         "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n"
10857         "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
10858         "vsubpd %%ymm6, %%ymm9, %%ymm10\n"
10859         "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n"
10860         "vaddpd %%ymm11, %%ymm8, %%ymm6\n"
10861         "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n"
10862         "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
10863         "vsubpd %%ymm7, %%ymm9, %%ymm10\n"
10864         "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n"
10865         "vaddpd %%ymm11, %%ymm8, %%ymm7\n"
10866         "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
10867         "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
10868         "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
10869         "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
10870         "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
10871         "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
10872         "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
10873         "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
10874         "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
10875         "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
10876         "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
10877         "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
10878         "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
10879         "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
10880         "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
10881         "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
10882         "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
10883         "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
10884         "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
10885         "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
10886         "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
10887         "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
10888         "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
10889         "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
10890         "vmovupd %%ymm8, (%0)\n"
10891         "vmovupd %%ymm9, (%1)\n"
10892         "vmovupd %%ymm10, (%2)\n"
10893         "vmovupd %%ymm11, (%3)\n"
10894         "vmovupd %%ymm12, (%4)\n"
10895         "vmovupd %%ymm13, (%5)\n"
10896         "vmovupd %%ymm14, (%6)\n"
10897         "vmovupd %%ymm15, (%7)\n"
10898         :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
10899       );
10900     }
10901   }
10902   for (int j = 0; j < 256; j += 256) {
10903     for (int k = 0; k < 32; k += 4) {
10904       __asm__ volatile (
10905         "vmovupd (%0), %%ymm0\n"
10906         "vmovupd (%1), %%ymm1\n"
10907         "vmovupd (%2), %%ymm2\n"
10908         "vmovupd (%3), %%ymm3\n"
10909         "vmovupd (%4), %%ymm4\n"
10910         "vmovupd (%5), %%ymm5\n"
10911         "vmovupd (%6), %%ymm6\n"
10912         "vmovupd (%7), %%ymm7\n"
10913         "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
10914         "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
10915         "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
10916         "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
10917         "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
10918         "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
10919         "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
10920         "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
10921         "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
10922         "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
10923         "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
10924         "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
10925         "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
10926         "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
10927         "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
10928         "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
10929         "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
10930         "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
10931         "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
10932         "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
10933         "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
10934         "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
10935         "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
10936         "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
10937         "vmovupd %%ymm8, (%0)\n"
10938         "vmovupd %%ymm9, (%1)\n"
10939         "vmovupd %%ymm10, (%2)\n"
10940         "vmovupd %%ymm11, (%3)\n"
10941         "vmovupd %%ymm12, (%4)\n"
10942         "vmovupd %%ymm13, (%5)\n"
10943         "vmovupd %%ymm14, (%6)\n"
10944         "vmovupd %%ymm15, (%7)\n"
10945         :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
10946       );
10947     }
10948   }
10949 }
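/* helper_double_8: 32-double kernel on each of eight chunks, then a
 * radix-8 pass at stride 32 (offsets +0..+224) covering strides 32, 64
 * and 128 at once: eight loads, the add/sub network, eight stores. */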
10950 static inline void helper_double_9(double *buf);
10951 static inline void helper_double_9(double *buf) {
10952   for (int j = 0; j < 512; j += 32) {
10953     for (int k = 0; k < 4; k += 4) {
10954       __asm__ volatile (
10955         "vmovupd (%0), %%ymm0\n"
10956         "vmovupd (%1), %%ymm1\n"
10957         "vmovupd (%2), %%ymm2\n"
10958         "vmovupd (%3), %%ymm3\n"
10959         "vmovupd (%4), %%ymm4\n"
10960         "vmovupd (%5), %%ymm5\n"
10961         "vmovupd (%6), %%ymm6\n"
10962         "vmovupd (%7), %%ymm7\n"
10963         "vpermilpd $0, %%ymm0, %%ymm8\n"
10964         "vpermilpd $15, %%ymm0, %%ymm9\n"
10965         "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
10966         "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
10967         "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n"
10968         "vpermilpd $0, %%ymm1, %%ymm8\n"
10969         "vpermilpd $15, %%ymm1, %%ymm9\n"
10970         "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
10971         "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
10972         "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n"
10973         "vpermilpd $0, %%ymm2, %%ymm8\n"
10974         "vpermilpd $15, %%ymm2, %%ymm9\n"
10975         "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
10976         "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
10977         "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n"
10978         "vpermilpd $0, %%ymm3, %%ymm8\n"
10979         "vpermilpd $15, %%ymm3, %%ymm9\n"
10980         "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
10981         "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
10982         "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n"
10983         "vpermilpd $0, %%ymm4, %%ymm8\n"
10984         "vpermilpd $15, %%ymm4, %%ymm9\n"
10985         "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
10986         "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
10987         "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n"
10988         "vpermilpd $0, %%ymm5, %%ymm8\n"
10989         "vpermilpd $15, %%ymm5, %%ymm9\n"
10990         "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
10991         "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
10992         "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n"
10993         "vpermilpd $0, %%ymm6, %%ymm8\n"
10994         "vpermilpd $15, %%ymm6, %%ymm9\n"
10995         "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
10996         "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
10997         "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n"
10998         "vpermilpd $0, %%ymm7, %%ymm8\n"
10999         "vpermilpd $15, %%ymm7, %%ymm9\n"
11000         "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
11001         "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
11002         "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n"
11003         "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n"
11004         "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
11005         "vsubpd %%ymm0, %%ymm9, %%ymm10\n"
11006         "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n"
11007         "vaddpd %%ymm11, %%ymm8, %%ymm0\n"
11008         "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n"
11009         "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
11010         "vsubpd %%ymm1, %%ymm9, %%ymm10\n"
11011         "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n"
11012         "vaddpd %%ymm11, %%ymm8, %%ymm1\n"
11013         "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n"
11014         "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
11015         "vsubpd %%ymm2, %%ymm9, %%ymm10\n"
11016         "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n"
11017         "vaddpd %%ymm11, %%ymm8, %%ymm2\n"
11018         "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n"
11019         "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
11020         "vsubpd %%ymm3, %%ymm9, %%ymm10\n"
11021         "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n"
11022         "vaddpd %%ymm11, %%ymm8, %%ymm3\n"
11023         "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n"
11024         "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
11025         "vsubpd %%ymm4, %%ymm9, %%ymm10\n"
11026         "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n"
11027         "vaddpd %%ymm11, %%ymm8, %%ymm4\n"
11028         "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n"
11029         "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
11030         "vsubpd %%ymm5, %%ymm9, %%ymm10\n"
11031         "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n"
11032         "vaddpd %%ymm11, %%ymm8, %%ymm5\n"
11033         "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n"
11034         "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
11035         "vsubpd %%ymm6, %%ymm9, %%ymm10\n"
11036         "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n"
11037         "vaddpd %%ymm11, %%ymm8, %%ymm6\n"
11038         "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n"
11039         "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
11040         "vsubpd %%ymm7, %%ymm9, %%ymm10\n"
11041         "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n"
11042         "vaddpd %%ymm11, %%ymm8, %%ymm7\n"
11043         "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
11044         "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
11045         "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
11046         "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
11047         "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
11048         "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
11049         "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
11050         "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
11051         "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
11052         "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
11053         "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
11054         "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
11055         "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
11056         "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
11057         "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
11058         "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
11059         "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
11060         "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
11061         "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
11062         "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
11063         "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
11064         "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
11065         "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
11066         "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
11067         "vmovupd %%ymm8, (%0)\n"
11068         "vmovupd %%ymm9, (%1)\n"
11069         "vmovupd %%ymm10, (%2)\n"
11070         "vmovupd %%ymm11, (%3)\n"
11071         "vmovupd %%ymm12, (%4)\n"
11072         "vmovupd %%ymm13, (%5)\n"
11073         "vmovupd %%ymm14, (%6)\n"
11074         "vmovupd %%ymm15, (%7)\n"
11075         :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
11076       );
11077     }
11078   }
11079   for (int j = 0; j < 512; j += 256) {
11080     for (int k = 0; k < 32; k += 4) {
11081       __asm__ volatile (
11082         "vmovupd (%0), %%ymm0\n"
11083         "vmovupd (%1), %%ymm1\n"
11084         "vmovupd (%2), %%ymm2\n"
11085         "vmovupd (%3), %%ymm3\n"
11086         "vmovupd (%4), %%ymm4\n"
11087         "vmovupd (%5), %%ymm5\n"
11088         "vmovupd (%6), %%ymm6\n"
11089         "vmovupd (%7), %%ymm7\n"
11090         "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
11091         "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
11092         "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
11093         "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
11094         "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
11095         "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
11096         "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
11097         "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
11098         "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
11099         "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
11100         "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
11101         "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
11102         "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
11103         "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
11104         "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
11105         "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
11106         "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
11107         "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
11108         "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
11109         "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
11110         "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
11111         "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
11112         "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
11113         "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
11114         "vmovupd %%ymm8, (%0)\n"
11115         "vmovupd %%ymm9, (%1)\n"
11116         "vmovupd %%ymm10, (%2)\n"
11117         "vmovupd %%ymm11, (%3)\n"
11118         "vmovupd %%ymm12, (%4)\n"
11119         "vmovupd %%ymm13, (%5)\n"
11120         "vmovupd %%ymm14, (%6)\n"
11121         "vmovupd %%ymm15, (%7)\n"
11122         :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
11123       );
11124     }
11125   }
11126   for (int j = 0; j < 512; j += 512) {
11127     for (int k = 0; k < 256; k += 4) {
11128       __asm__ volatile (
11129         "vmovupd (%0), %%ymm0\n"
11130         "vmovupd (%1), %%ymm1\n"
11131         "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
11132         "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
11133         "vmovupd %%ymm8, (%0)\n"
11134         "vmovupd %%ymm9, (%1)\n"
11135         :: "r"(buf + j + k + 0), "r"(buf + j + k + 256) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
11136       );
11137     }
11138   }
11139 }
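/* helper_double_9: 32-double kernel per chunk, a radix-8 pass at stride
 * 32 within each 256-double block (strides 32/64/128), then a final
 * radix-2 pass at stride 256. */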
11140 static inline void helper_double_10(double *buf);
11141 static inline void helper_double_10(double *buf) {
11142   for (int j = 0; j < 1024; j += 32) {
11143     for (int k = 0; k < 4; k += 4) {
11144       __asm__ volatile (
11145         "vmovupd (%0), %%ymm0\n"
11146         "vmovupd (%1), %%ymm1\n"
11147         "vmovupd (%2), %%ymm2\n"
11148         "vmovupd (%3), %%ymm3\n"
11149         "vmovupd (%4), %%ymm4\n"
11150         "vmovupd (%5), %%ymm5\n"
11151         "vmovupd (%6), %%ymm6\n"
11152         "vmovupd (%7), %%ymm7\n"
11153         "vpermilpd $0, %%ymm0, %%ymm8\n"
11154         "vpermilpd $15, %%ymm0, %%ymm9\n"
11155         "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
11156         "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
11157         "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n"
11158         "vpermilpd $0, %%ymm1, %%ymm8\n"
11159         "vpermilpd $15, %%ymm1, %%ymm9\n"
11160         "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
11161         "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
11162         "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n"
11163         "vpermilpd $0, %%ymm2, %%ymm8\n"
11164         "vpermilpd $15, %%ymm2, %%ymm9\n"
11165         "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
11166         "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
11167         "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n"
11168         "vpermilpd $0, %%ymm3, %%ymm8\n"
11169         "vpermilpd $15, %%ymm3, %%ymm9\n"
11170         "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
11171         "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
11172         "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n"
11173         "vpermilpd $0, %%ymm4, %%ymm8\n"
11174         "vpermilpd $15, %%ymm4, %%ymm9\n"
11175         "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
11176         "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
11177         "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n"
11178         "vpermilpd $0, %%ymm5, %%ymm8\n"
11179         "vpermilpd $15, %%ymm5, %%ymm9\n"
11180         "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
11181         "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
11182         "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n"
11183         "vpermilpd $0, %%ymm6, %%ymm8\n"
11184         "vpermilpd $15, %%ymm6, %%ymm9\n"
11185         "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
11186         "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
11187         "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n"
11188         "vpermilpd $0, %%ymm7, %%ymm8\n"
11189         "vpermilpd $15, %%ymm7, %%ymm9\n"
11190         "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
11191         "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
11192         "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n"
11193         "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n"
11194         "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
11195         "vsubpd %%ymm0, %%ymm9, %%ymm10\n"
11196         "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n"
11197         "vaddpd %%ymm11, %%ymm8, %%ymm0\n"
11198         "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n"
11199         "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
11200         "vsubpd %%ymm1, %%ymm9, %%ymm10\n"
11201         "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n"
11202         "vaddpd %%ymm11, %%ymm8, %%ymm1\n"
11203         "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n"
11204         "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
11205         "vsubpd %%ymm2, %%ymm9, %%ymm10\n"
11206         "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n"
11207         "vaddpd %%ymm11, %%ymm8, %%ymm2\n"
11208         "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n"
11209         "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
11210         "vsubpd %%ymm3, %%ymm9, %%ymm10\n"
11211         "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n"
11212         "vaddpd %%ymm11, %%ymm8, %%ymm3\n"
11213         "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n"
11214         "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
11215         "vsubpd %%ymm4, %%ymm9, %%ymm10\n"
11216         "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n"
11217         "vaddpd %%ymm11, %%ymm8, %%ymm4\n"
11218         "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n"
11219         "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
11220         "vsubpd %%ymm5, %%ymm9, %%ymm10\n"
11221         "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n"
11222         "vaddpd %%ymm11, %%ymm8, %%ymm5\n"
11223         "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n"
11224         "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
11225         "vsubpd %%ymm6, %%ymm9, %%ymm10\n"
11226         "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n"
11227         "vaddpd %%ymm11, %%ymm8, %%ymm6\n"
11228         "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n"
11229         "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
11230         "vsubpd %%ymm7, %%ymm9, %%ymm10\n"
11231         "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n"
11232         "vaddpd %%ymm11, %%ymm8, %%ymm7\n"
11233         "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
11234         "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
11235         "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
11236         "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
11237         "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
11238         "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
11239         "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
11240         "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
11241         "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
11242         "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
11243         "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
11244         "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
11245         "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
11246         "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
11247         "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
11248         "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
11249         "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
11250         "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
11251         "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
11252         "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
11253         "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
11254         "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
11255         "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
11256         "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
11257         "vmovupd %%ymm8, (%0)\n"
11258         "vmovupd %%ymm9, (%1)\n"
11259         "vmovupd %%ymm10, (%2)\n"
11260         "vmovupd %%ymm11, (%3)\n"
11261         "vmovupd %%ymm12, (%4)\n"
11262         "vmovupd %%ymm13, (%5)\n"
11263         "vmovupd %%ymm14, (%6)\n"
11264         "vmovupd %%ymm15, (%7)\n"
11265         :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
11266       );
11267     }
11268   }
11269   for (int j = 0; j < 1024; j += 256) {
11270     for (int k = 0; k < 32; k += 4) {
11271       __asm__ volatile (
11272         "vmovupd (%0), %%ymm0\n"
11273         "vmovupd (%1), %%ymm1\n"
11274         "vmovupd (%2), %%ymm2\n"
11275         "vmovupd (%3), %%ymm3\n"
11276         "vmovupd (%4), %%ymm4\n"
11277         "vmovupd (%5), %%ymm5\n"
11278         "vmovupd (%6), %%ymm6\n"
11279         "vmovupd (%7), %%ymm7\n"
11280         "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
11281         "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
11282         "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
11283         "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
11284         "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
11285         "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
11286         "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
11287         "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
11288         "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
11289         "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
11290         "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
11291         "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
11292         "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
11293         "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
11294         "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
11295         "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
11296         "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
11297         "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
11298         "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
11299         "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
11300         "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
11301         "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
11302         "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
11303         "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
11304         "vmovupd %%ymm8, (%0)\n"
11305         "vmovupd %%ymm9, (%1)\n"
11306         "vmovupd %%ymm10, (%2)\n"
11307         "vmovupd %%ymm11, (%3)\n"
11308         "vmovupd %%ymm12, (%4)\n"
11309         "vmovupd %%ymm13, (%5)\n"
11310         "vmovupd %%ymm14, (%6)\n"
11311         "vmovupd %%ymm15, (%7)\n"
11312         :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
11313       );
11314     }
11315   }
11316   for (int j = 0; j < 1024; j += 1024) {
11317     for (int k = 0; k < 256; k += 4) {
11318       __asm__ volatile (
11319         "vmovupd (%0), %%ymm0\n"
11320         "vmovupd (%1), %%ymm1\n"
11321         "vmovupd (%2), %%ymm2\n"
11322         "vmovupd (%3), %%ymm3\n"
11323         "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
11324         "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
11325         "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
11326         "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
11327         "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
11328         "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
11329         "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
11330         "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
11331         "vmovupd %%ymm0, (%0)\n"
11332         "vmovupd %%ymm1, (%1)\n"
11333         "vmovupd %%ymm2, (%2)\n"
11334         "vmovupd %%ymm3, (%3)\n"
11335         :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
11336       );
11337     }
11338   }
11339 }
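/* helper_double_10: same staging as helper_double_9, but the tail is a
 * radix-4 pass at stride 256 covering strides 256 and 512. */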
11340 static inline void helper_double_11(double *buf);
11341 static inline void helper_double_11(double *buf) {
11342   for (int j = 0; j < 2048; j += 32) {
11343     for (int k = 0; k < 4; k += 4) {
11344       __asm__ volatile (
11345         "vmovupd (%0), %%ymm0\n"
11346         "vmovupd (%1), %%ymm1\n"
11347         "vmovupd (%2), %%ymm2\n"
11348         "vmovupd (%3), %%ymm3\n"
11349         "vmovupd (%4), %%ymm4\n"
11350         "vmovupd (%5), %%ymm5\n"
11351         "vmovupd (%6), %%ymm6\n"
11352         "vmovupd (%7), %%ymm7\n"
11353         "vpermilpd $0, %%ymm0, %%ymm8\n"
11354         "vpermilpd $15, %%ymm0, %%ymm9\n"
11355         "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
11356         "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
11357         "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n"
11358         "vpermilpd $0, %%ymm1, %%ymm8\n"
11359         "vpermilpd $15, %%ymm1, %%ymm9\n"
11360         "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
11361         "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
11362         "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n"
11363         "vpermilpd $0, %%ymm2, %%ymm8\n"
11364         "vpermilpd $15, %%ymm2, %%ymm9\n"
11365         "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
11366         "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
11367         "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n"
11368         "vpermilpd $0, %%ymm3, %%ymm8\n"
11369         "vpermilpd $15, %%ymm3, %%ymm9\n"
11370         "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
11371         "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
11372         "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n"
11373         "vpermilpd $0, %%ymm4, %%ymm8\n"
11374         "vpermilpd $15, %%ymm4, %%ymm9\n"
11375         "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
11376         "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
11377         "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n"
11378         "vpermilpd $0, %%ymm5, %%ymm8\n"
11379         "vpermilpd $15, %%ymm5, %%ymm9\n"
11380         "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
11381         "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
11382         "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n"
11383         "vpermilpd $0, %%ymm6, %%ymm8\n"
11384         "vpermilpd $15, %%ymm6, %%ymm9\n"
11385         "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
11386         "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
11387         "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n"
11388         "vpermilpd $0, %%ymm7, %%ymm8\n"
11389         "vpermilpd $15, %%ymm7, %%ymm9\n"
11390         "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
11391         "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
11392         "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n"
11393         "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n"
11394         "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
11395         "vsubpd %%ymm0, %%ymm9, %%ymm10\n"
11396         "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n"
11397         "vaddpd %%ymm11, %%ymm8, %%ymm0\n"
11398         "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n"
11399         "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
11400         "vsubpd %%ymm1, %%ymm9, %%ymm10\n"
11401         "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n"
11402         "vaddpd %%ymm11, %%ymm8, %%ymm1\n"
11403         "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n"
11404         "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
11405         "vsubpd %%ymm2, %%ymm9, %%ymm10\n"
11406         "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n"
11407         "vaddpd %%ymm11, %%ymm8, %%ymm2\n"
11408         "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n"
11409         "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
11410         "vsubpd %%ymm3, %%ymm9, %%ymm10\n"
11411         "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n"
11412         "vaddpd %%ymm11, %%ymm8, %%ymm3\n"
11413         "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n"
11414         "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
11415         "vsubpd %%ymm4, %%ymm9, %%ymm10\n"
11416         "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n"
11417         "vaddpd %%ymm11, %%ymm8, %%ymm4\n"
11418         "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n"
11419         "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
11420         "vsubpd %%ymm5, %%ymm9, %%ymm10\n"
11421         "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n"
11422         "vaddpd %%ymm11, %%ymm8, %%ymm5\n"
11423         "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n"
11424         "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
11425         "vsubpd %%ymm6, %%ymm9, %%ymm10\n"
11426         "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n"
11427         "vaddpd %%ymm11, %%ymm8, %%ymm6\n"
11428         "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n"
11429         "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
11430         "vsubpd %%ymm7, %%ymm9, %%ymm10\n"
11431         "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n"
11432         "vaddpd %%ymm11, %%ymm8, %%ymm7\n"
        "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
        "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
        "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
        "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
        "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
        "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
        "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
        "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
        "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
        "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
        "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
        "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
        "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
        "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
        "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
        "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
        "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
        "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
        "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
        "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
        "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
        "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
        "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
        "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
        "vmovupd %%ymm8, (%0)\n"
        "vmovupd %%ymm9, (%1)\n"
        "vmovupd %%ymm10, (%2)\n"
        "vmovupd %%ymm11, (%3)\n"
        "vmovupd %%ymm12, (%4)\n"
        "vmovupd %%ymm13, (%5)\n"
        "vmovupd %%ymm14, (%6)\n"
        "vmovupd %%ymm15, (%7)\n"
        :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
      );
    }
  }
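  /* Levels 6-8: butterflies at strides 32, 64 and 128 within each
     256-double block. */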
  for (int j = 0; j < 2048; j += 256) {
    for (int k = 0; k < 32; k += 4) {
      __asm__ volatile (
        "vmovupd (%0), %%ymm0\n"
        "vmovupd (%1), %%ymm1\n"
        "vmovupd (%2), %%ymm2\n"
        "vmovupd (%3), %%ymm3\n"
        "vmovupd (%4), %%ymm4\n"
        "vmovupd (%5), %%ymm5\n"
        "vmovupd (%6), %%ymm6\n"
        "vmovupd (%7), %%ymm7\n"
        "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
        "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
        "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
        "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
        "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
        "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
        "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
        "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
        "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
        "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
        "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
        "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
        "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
        "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
        "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
        "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
        "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
        "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
        "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
        "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
        "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
        "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
        "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
        "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
        "vmovupd %%ymm8, (%0)\n"
        "vmovupd %%ymm9, (%1)\n"
        "vmovupd %%ymm10, (%2)\n"
        "vmovupd %%ymm11, (%3)\n"
        "vmovupd %%ymm12, (%4)\n"
        "vmovupd %%ymm13, (%5)\n"
        "vmovupd %%ymm14, (%6)\n"
        "vmovupd %%ymm15, (%7)\n"
        :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
      );
    }
  }
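  /* Levels 9-11: butterflies at strides 256, 512 and 1024 across the
     whole 2048-double buffer. */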
  for (int j = 0; j < 2048; j += 2048) {
    for (int k = 0; k < 256; k += 4) {
      __asm__ volatile (
        "vmovupd (%0), %%ymm0\n"
        "vmovupd (%1), %%ymm1\n"
        "vmovupd (%2), %%ymm2\n"
        "vmovupd (%3), %%ymm3\n"
        "vmovupd (%4), %%ymm4\n"
        "vmovupd (%5), %%ymm5\n"
        "vmovupd (%6), %%ymm6\n"
        "vmovupd (%7), %%ymm7\n"
        "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
        "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
        "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
        "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
        "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
        "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
        "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
        "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
        "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
        "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
        "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
        "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
        "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
        "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
        "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
        "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
        "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
        "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
        "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
        "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
        "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
        "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
        "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
        "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
        "vmovupd %%ymm8, (%0)\n"
        "vmovupd %%ymm9, (%1)\n"
        "vmovupd %%ymm10, (%2)\n"
        "vmovupd %%ymm11, (%3)\n"
        "vmovupd %%ymm12, (%4)\n"
        "vmovupd %%ymm13, (%5)\n"
        "vmovupd %%ymm14, (%6)\n"
        "vmovupd %%ymm15, (%7)\n"
        :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
      );
    }
  }
}
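/*
 * helper_double_12_recursive: Hadamard transform of 2^12 doubles.
 * depth == 11 is the base case: 11 butterfly levels over a 2048-double
 * block, vectorized exactly like the helper above.  depth == 12 runs
 * the base case on each half and then applies one radix-2 combine
 * level at stride 2048.  A scalar sketch of that combine level
 * (illustrative only, not part of the generated code):
 *
 *   for (int k = 0; k < 2048; ++k) {
 *     double u = buf[k], v = buf[k + 2048];
 *     buf[k] = u + v;
 *     buf[k + 2048] = u - v;
 *   }
 */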
void helper_double_12_recursive(double *buf, int depth);
void helper_double_12_recursive(double *buf, int depth) {
  if (depth == 11) {
    for (int j = 0; j < 2048; j += 32) {
      for (int k = 0; k < 4; k += 4) {
        __asm__ volatile (
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vmovupd (%2), %%ymm2\n"
          "vmovupd (%3), %%ymm3\n"
          "vmovupd (%4), %%ymm4\n"
          "vmovupd (%5), %%ymm5\n"
          "vmovupd (%6), %%ymm6\n"
          "vmovupd (%7), %%ymm7\n"
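          /* First butterfly level: duplicate even/odd elements with
             vpermilpd, negate the odd copy, and vaddsubpd yields
             (x0+x1, x0-x1, x2+x3, x2-x3) in each register. */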
          "vpermilpd $0, %%ymm0, %%ymm8\n"
          "vpermilpd $15, %%ymm0, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n"
          "vpermilpd $0, %%ymm1, %%ymm8\n"
          "vpermilpd $15, %%ymm1, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n"
          "vpermilpd $0, %%ymm2, %%ymm8\n"
          "vpermilpd $15, %%ymm2, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n"
          "vpermilpd $0, %%ymm3, %%ymm8\n"
          "vpermilpd $15, %%ymm3, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n"
          "vpermilpd $0, %%ymm4, %%ymm8\n"
          "vpermilpd $15, %%ymm4, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n"
          "vpermilpd $0, %%ymm5, %%ymm8\n"
          "vpermilpd $15, %%ymm5, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n"
          "vpermilpd $0, %%ymm6, %%ymm8\n"
          "vpermilpd $15, %%ymm6, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n"
          "vpermilpd $0, %%ymm7, %%ymm8\n"
          "vpermilpd $15, %%ymm7, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n"
          "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm0, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm0\n"
          "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm1, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm1\n"
          "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm2, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm2\n"
          "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm3, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm3\n"
          "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm4, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm4\n"
          "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm5, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm5\n"
          "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm6, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm6\n"
          "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm7, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm7\n"
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
          "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
          "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
          "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
          "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
          "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
          "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
          "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
          "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
          "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
          "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
          "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
          "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
          "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
          "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
          "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
          "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
          "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
          "vmovupd %%ymm8, (%0)\n"
          "vmovupd %%ymm9, (%1)\n"
          "vmovupd %%ymm10, (%2)\n"
          "vmovupd %%ymm11, (%3)\n"
          "vmovupd %%ymm12, (%4)\n"
          "vmovupd %%ymm13, (%5)\n"
          "vmovupd %%ymm14, (%6)\n"
          "vmovupd %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    for (int j = 0; j < 2048; j += 256) {
      for (int k = 0; k < 32; k += 4) {
        __asm__ volatile (
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vmovupd (%2), %%ymm2\n"
          "vmovupd (%3), %%ymm3\n"
          "vmovupd (%4), %%ymm4\n"
          "vmovupd (%5), %%ymm5\n"
          "vmovupd (%6), %%ymm6\n"
          "vmovupd (%7), %%ymm7\n"
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
          "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
          "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
          "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
          "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
          "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
          "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
          "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
          "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
          "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
          "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
          "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
          "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
          "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
          "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
          "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
          "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
          "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
          "vmovupd %%ymm8, (%0)\n"
          "vmovupd %%ymm9, (%1)\n"
          "vmovupd %%ymm10, (%2)\n"
          "vmovupd %%ymm11, (%3)\n"
          "vmovupd %%ymm12, (%4)\n"
          "vmovupd %%ymm13, (%5)\n"
          "vmovupd %%ymm14, (%6)\n"
          "vmovupd %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    for (int j = 0; j < 2048; j += 2048) {
      for (int k = 0; k < 256; k += 4) {
        __asm__ volatile (
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vmovupd (%2), %%ymm2\n"
          "vmovupd (%3), %%ymm3\n"
          "vmovupd (%4), %%ymm4\n"
          "vmovupd (%5), %%ymm5\n"
          "vmovupd (%6), %%ymm6\n"
          "vmovupd (%7), %%ymm7\n"
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
          "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
          "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
          "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
          "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
          "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
          "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
          "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
          "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
          "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
          "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
          "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
          "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
          "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
          "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
          "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
          "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
          "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
          "vmovupd %%ymm8, (%0)\n"
          "vmovupd %%ymm9, (%1)\n"
          "vmovupd %%ymm10, (%2)\n"
          "vmovupd %%ymm11, (%3)\n"
          "vmovupd %%ymm12, (%4)\n"
          "vmovupd %%ymm13, (%5)\n"
          "vmovupd %%ymm14, (%6)\n"
          "vmovupd %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
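  /* depth == 12: transform each 2048-double half, then one radix-2
     combine level at stride 2048, four doubles per iteration. */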
  if (depth == 12) {
    helper_double_12_recursive(buf + 0, 11);
    helper_double_12_recursive(buf + 2048, 11);
    for (int j = 0; j < 4096; j += 4096) {
      for (int k = 0; k < 2048; k += 4) {
        __asm__ volatile (
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vmovupd %%ymm8, (%0)\n"
          "vmovupd %%ymm9, (%1)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
}
void helper_double_12(double *buf);
void helper_double_12(double *buf) {
  helper_double_12_recursive(buf, 12);
}
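/*
 * helper_double_13_recursive: Hadamard transform of 2^13 doubles.
 * The base case (depth == 11) is identical to the one above; the top
 * level transforms the four 2048-double quarters and combines them
 * with a radix-4 butterfly (strides 2048 and 4096).
 */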
void helper_double_13_recursive(double *buf, int depth);
void helper_double_13_recursive(double *buf, int depth) {
  if (depth == 11) {
    for (int j = 0; j < 2048; j += 32) {
      for (int k = 0; k < 4; k += 4) {
        __asm__ volatile (
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vmovupd (%2), %%ymm2\n"
          "vmovupd (%3), %%ymm3\n"
          "vmovupd (%4), %%ymm4\n"
          "vmovupd (%5), %%ymm5\n"
          "vmovupd (%6), %%ymm6\n"
          "vmovupd (%7), %%ymm7\n"
          "vpermilpd $0, %%ymm0, %%ymm8\n"
          "vpermilpd $15, %%ymm0, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n"
          "vpermilpd $0, %%ymm1, %%ymm8\n"
          "vpermilpd $15, %%ymm1, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n"
          "vpermilpd $0, %%ymm2, %%ymm8\n"
          "vpermilpd $15, %%ymm2, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n"
          "vpermilpd $0, %%ymm3, %%ymm8\n"
          "vpermilpd $15, %%ymm3, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n"
          "vpermilpd $0, %%ymm4, %%ymm8\n"
          "vpermilpd $15, %%ymm4, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n"
          "vpermilpd $0, %%ymm5, %%ymm8\n"
          "vpermilpd $15, %%ymm5, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n"
          "vpermilpd $0, %%ymm6, %%ymm8\n"
          "vpermilpd $15, %%ymm6, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n"
          "vpermilpd $0, %%ymm7, %%ymm8\n"
          "vpermilpd $15, %%ymm7, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n"
          "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm0, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm0\n"
          "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm1, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm1\n"
          "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm2, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm2\n"
          "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm3, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm3\n"
          "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm4, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm4\n"
          "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm5, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm5\n"
          "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm6, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm6\n"
          "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm7, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm7\n"
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
          "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
          "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
          "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
          "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
          "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
          "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
          "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
          "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
          "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
          "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
          "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
          "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
          "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
          "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
          "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
          "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
          "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
          "vmovupd %%ymm8, (%0)\n"
          "vmovupd %%ymm9, (%1)\n"
          "vmovupd %%ymm10, (%2)\n"
          "vmovupd %%ymm11, (%3)\n"
          "vmovupd %%ymm12, (%4)\n"
          "vmovupd %%ymm13, (%5)\n"
          "vmovupd %%ymm14, (%6)\n"
          "vmovupd %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    for (int j = 0; j < 2048; j += 256) {
      for (int k = 0; k < 32; k += 4) {
        __asm__ volatile (
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vmovupd (%2), %%ymm2\n"
          "vmovupd (%3), %%ymm3\n"
          "vmovupd (%4), %%ymm4\n"
          "vmovupd (%5), %%ymm5\n"
          "vmovupd (%6), %%ymm6\n"
          "vmovupd (%7), %%ymm7\n"
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
          "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
          "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
          "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
          "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
          "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
          "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
          "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
          "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
          "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
          "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
          "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
          "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
          "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
          "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
          "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
          "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
          "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
          "vmovupd %%ymm8, (%0)\n"
          "vmovupd %%ymm9, (%1)\n"
          "vmovupd %%ymm10, (%2)\n"
          "vmovupd %%ymm11, (%3)\n"
          "vmovupd %%ymm12, (%4)\n"
          "vmovupd %%ymm13, (%5)\n"
          "vmovupd %%ymm14, (%6)\n"
          "vmovupd %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    for (int j = 0; j < 2048; j += 2048) {
      for (int k = 0; k < 256; k += 4) {
        __asm__ volatile (
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vmovupd (%2), %%ymm2\n"
          "vmovupd (%3), %%ymm3\n"
          "vmovupd (%4), %%ymm4\n"
          "vmovupd (%5), %%ymm5\n"
          "vmovupd (%6), %%ymm6\n"
          "vmovupd (%7), %%ymm7\n"
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
          "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
          "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
          "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
          "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
          "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
          "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
          "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
          "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
          "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
          "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
          "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
          "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
          "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
          "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
          "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
          "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
          "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
          "vmovupd %%ymm8, (%0)\n"
          "vmovupd %%ymm9, (%1)\n"
          "vmovupd %%ymm10, (%2)\n"
          "vmovupd %%ymm11, (%3)\n"
          "vmovupd %%ymm12, (%4)\n"
          "vmovupd %%ymm13, (%5)\n"
          "vmovupd %%ymm14, (%6)\n"
          "vmovupd %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
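  /* depth == 13: four quarter transforms followed by a radix-4 combine
     (two butterfly levels at strides 2048 and 4096). */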
  if (depth == 13) {
    helper_double_13_recursive(buf + 0, 11);
    helper_double_13_recursive(buf + 2048, 11);
    helper_double_13_recursive(buf + 4096, 11);
    helper_double_13_recursive(buf + 6144, 11);
    for (int j = 0; j < 8192; j += 8192) {
      for (int k = 0; k < 2048; k += 4) {
        __asm__ volatile (
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vmovupd (%2), %%ymm2\n"
          "vmovupd (%3), %%ymm3\n"
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
          "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
          "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
          "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
          "vmovupd %%ymm0, (%0)\n"
          "vmovupd %%ymm1, (%1)\n"
          "vmovupd %%ymm2, (%2)\n"
          "vmovupd %%ymm3, (%3)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
}
void helper_double_13(double *buf);
void helper_double_13(double *buf) {
  helper_double_13_recursive(buf, 13);
}
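/*
 * helper_double_14_recursive: Hadamard transform of 2^14 doubles.
 * The base case (depth == 12) covers 4096 doubles in place: the usual
 * 5 + 3 + 3 in-block butterfly levels plus one radix-2 level at
 * stride 2048.
 */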
void helper_double_14_recursive(double *buf, int depth);
void helper_double_14_recursive(double *buf, int depth) {
  if (depth == 12) {
    for (int j = 0; j < 4096; j += 32) {
      for (int k = 0; k < 4; k += 4) {
        __asm__ volatile (
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vmovupd (%2), %%ymm2\n"
          "vmovupd (%3), %%ymm3\n"
          "vmovupd (%4), %%ymm4\n"
          "vmovupd (%5), %%ymm5\n"
          "vmovupd (%6), %%ymm6\n"
          "vmovupd (%7), %%ymm7\n"
          "vpermilpd $0, %%ymm0, %%ymm8\n"
          "vpermilpd $15, %%ymm0, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n"
          "vpermilpd $0, %%ymm1, %%ymm8\n"
          "vpermilpd $15, %%ymm1, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n"
          "vpermilpd $0, %%ymm2, %%ymm8\n"
          "vpermilpd $15, %%ymm2, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n"
          "vpermilpd $0, %%ymm3, %%ymm8\n"
          "vpermilpd $15, %%ymm3, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n"
          "vpermilpd $0, %%ymm4, %%ymm8\n"
          "vpermilpd $15, %%ymm4, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n"
          "vpermilpd $0, %%ymm5, %%ymm8\n"
          "vpermilpd $15, %%ymm5, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n"
          "vpermilpd $0, %%ymm6, %%ymm8\n"
          "vpermilpd $15, %%ymm6, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n"
          "vpermilpd $0, %%ymm7, %%ymm8\n"
          "vpermilpd $15, %%ymm7, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n"
          "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm0, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm0\n"
          "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm1, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm1\n"
          "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm2, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm2\n"
          "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm3, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm3\n"
          "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm4, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm4\n"
          "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm5, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm5\n"
          "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm6, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm6\n"
          "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm7, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm7\n"
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
          "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
          "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
          "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
          "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
          "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
          "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
          "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
          "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
          "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
          "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
          "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
          "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
          "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
          "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
          "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
          "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
          "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
          "vmovupd %%ymm8, (%0)\n"
          "vmovupd %%ymm9, (%1)\n"
          "vmovupd %%ymm10, (%2)\n"
          "vmovupd %%ymm11, (%3)\n"
          "vmovupd %%ymm12, (%4)\n"
          "vmovupd %%ymm13, (%5)\n"
          "vmovupd %%ymm14, (%6)\n"
          "vmovupd %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    for (int j = 0; j < 4096; j += 256) {
      for (int k = 0; k < 32; k += 4) {
        __asm__ volatile (
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vmovupd (%2), %%ymm2\n"
          "vmovupd (%3), %%ymm3\n"
          "vmovupd (%4), %%ymm4\n"
          "vmovupd (%5), %%ymm5\n"
          "vmovupd (%6), %%ymm6\n"
          "vmovupd (%7), %%ymm7\n"
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
          "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
          "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
          "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
          "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
          "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
          "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
          "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
          "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
          "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
          "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
          "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
          "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
          "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
          "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
          "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
          "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
          "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
          "vmovupd %%ymm8, (%0)\n"
          "vmovupd %%ymm9, (%1)\n"
          "vmovupd %%ymm10, (%2)\n"
          "vmovupd %%ymm11, (%3)\n"
          "vmovupd %%ymm12, (%4)\n"
          "vmovupd %%ymm13, (%5)\n"
          "vmovupd %%ymm14, (%6)\n"
          "vmovupd %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    for (int j = 0; j < 4096; j += 2048) {
      for (int k = 0; k < 256; k += 4) {
        __asm__ volatile (
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vmovupd (%2), %%ymm2\n"
          "vmovupd (%3), %%ymm3\n"
          "vmovupd (%4), %%ymm4\n"
          "vmovupd (%5), %%ymm5\n"
          "vmovupd (%6), %%ymm6\n"
          "vmovupd (%7), %%ymm7\n"
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
          "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
          "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
          "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
          "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
          "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
          "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
          "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
          "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
          "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
          "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
          "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
          "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
          "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
          "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
          "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
          "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
          "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
          "vmovupd %%ymm8, (%0)\n"
          "vmovupd %%ymm9, (%1)\n"
          "vmovupd %%ymm10, (%2)\n"
          "vmovupd %%ymm11, (%3)\n"
          "vmovupd %%ymm12, (%4)\n"
          "vmovupd %%ymm13, (%5)\n"
          "vmovupd %%ymm14, (%6)\n"
          "vmovupd %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    for (int j = 0; j < 4096; j += 4096) {
      for (int k = 0; k < 2048; k += 4) {
        __asm__ volatile (
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vmovupd %%ymm8, (%0)\n"
          "vmovupd %%ymm9, (%1)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
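  /* depth == 14: four 4096-double transforms followed by a radix-4
     combine (strides 4096 and 8192). */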
  if (depth == 14) {
    helper_double_14_recursive(buf + 0, 12);
    helper_double_14_recursive(buf + 4096, 12);
    helper_double_14_recursive(buf + 8192, 12);
    helper_double_14_recursive(buf + 12288, 12);
    for (int j = 0; j < 16384; j += 16384) {
      for (int k = 0; k < 4096; k += 4) {
        __asm__ volatile (
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vmovupd (%2), %%ymm2\n"
          "vmovupd (%3), %%ymm3\n"
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
          "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
          "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
          "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
          "vmovupd %%ymm0, (%0)\n"
          "vmovupd %%ymm1, (%1)\n"
          "vmovupd %%ymm2, (%2)\n"
          "vmovupd %%ymm3, (%3)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
}
void helper_double_14(double *buf);
void helper_double_14(double *buf) {
  helper_double_14_recursive(buf, 14);
}
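/*
 * helper_double_15_recursive: Hadamard transform of 2^15 doubles.
 * The base case (depth == 12) is the same 4096-double transform as
 * above; the top level runs it on the eight sub-blocks and finishes
 * with a radix-8 combine (strides 4096, 8192 and 16384).
 */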
void helper_double_15_recursive(double *buf, int depth);
void helper_double_15_recursive(double *buf, int depth) {
12350   if (depth == 12) {
12351     for (int j = 0; j < 4096; j += 32) {
12352       for (int k = 0; k < 4; k += 4) {
12353         __asm__ volatile (
12354           "vmovupd (%0), %%ymm0\n"
12355           "vmovupd (%1), %%ymm1\n"
12356           "vmovupd (%2), %%ymm2\n"
12357           "vmovupd (%3), %%ymm3\n"
12358           "vmovupd (%4), %%ymm4\n"
12359           "vmovupd (%5), %%ymm5\n"
12360           "vmovupd (%6), %%ymm6\n"
12361           "vmovupd (%7), %%ymm7\n"
12362           "vpermilpd $0, %%ymm0, %%ymm8\n"
12363           "vpermilpd $15, %%ymm0, %%ymm9\n"
12364           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
12365           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
12366           "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n"
12367           "vpermilpd $0, %%ymm1, %%ymm8\n"
12368           "vpermilpd $15, %%ymm1, %%ymm9\n"
12369           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
12370           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
12371           "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n"
12372           "vpermilpd $0, %%ymm2, %%ymm8\n"
12373           "vpermilpd $15, %%ymm2, %%ymm9\n"
12374           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
12375           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
12376           "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n"
12377           "vpermilpd $0, %%ymm3, %%ymm8\n"
12378           "vpermilpd $15, %%ymm3, %%ymm9\n"
12379           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
12380           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
12381           "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n"
12382           "vpermilpd $0, %%ymm4, %%ymm8\n"
12383           "vpermilpd $15, %%ymm4, %%ymm9\n"
12384           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
12385           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
12386           "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n"
12387           "vpermilpd $0, %%ymm5, %%ymm8\n"
12388           "vpermilpd $15, %%ymm5, %%ymm9\n"
12389           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
12390           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
12391           "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n"
12392           "vpermilpd $0, %%ymm6, %%ymm8\n"
12393           "vpermilpd $15, %%ymm6, %%ymm9\n"
12394           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
12395           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
12396           "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n"
12397           "vpermilpd $0, %%ymm7, %%ymm8\n"
12398           "vpermilpd $15, %%ymm7, %%ymm9\n"
12399           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
12400           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
12401           "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n"
12402           "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n"
12403           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
12404           "vsubpd %%ymm0, %%ymm9, %%ymm10\n"
12405           "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n"
12406           "vaddpd %%ymm11, %%ymm8, %%ymm0\n"
12407           "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n"
12408           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
12409           "vsubpd %%ymm1, %%ymm9, %%ymm10\n"
12410           "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n"
12411           "vaddpd %%ymm11, %%ymm8, %%ymm1\n"
12412           "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n"
12413           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
12414           "vsubpd %%ymm2, %%ymm9, %%ymm10\n"
12415           "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n"
12416           "vaddpd %%ymm11, %%ymm8, %%ymm2\n"
12417           "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n"
12418           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
12419           "vsubpd %%ymm3, %%ymm9, %%ymm10\n"
12420           "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n"
12421           "vaddpd %%ymm11, %%ymm8, %%ymm3\n"
12422           "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n"
12423           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
12424           "vsubpd %%ymm4, %%ymm9, %%ymm10\n"
12425           "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n"
12426           "vaddpd %%ymm11, %%ymm8, %%ymm4\n"
12427           "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n"
12428           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
12429           "vsubpd %%ymm5, %%ymm9, %%ymm10\n"
12430           "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n"
12431           "vaddpd %%ymm11, %%ymm8, %%ymm5\n"
12432           "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n"
12433           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
12434           "vsubpd %%ymm6, %%ymm9, %%ymm10\n"
12435           "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n"
12436           "vaddpd %%ymm11, %%ymm8, %%ymm6\n"
12437           "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n"
12438           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
12439           "vsubpd %%ymm7, %%ymm9, %%ymm10\n"
12440           "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n"
12441           "vaddpd %%ymm11, %%ymm8, %%ymm7\n"
12442           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
12443           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
12444           "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
12445           "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
12446           "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
12447           "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
12448           "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
12449           "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
12450           "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
12451           "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
12452           "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
12453           "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
12454           "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
12455           "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
12456           "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
12457           "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
12458           "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
12459           "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
12460           "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
12461           "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
12462           "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
12463           "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
12464           "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
12465           "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
12466           "vmovupd %%ymm8, (%0)\n"
12467           "vmovupd %%ymm9, (%1)\n"
12468           "vmovupd %%ymm10, (%2)\n"
12469           "vmovupd %%ymm11, (%3)\n"
12470           "vmovupd %%ymm12, (%4)\n"
12471           "vmovupd %%ymm13, (%5)\n"
12472           "vmovupd %%ymm14, (%6)\n"
12473           "vmovupd %%ymm15, (%7)\n"
12474           :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
12475         );
12476       }
12477     }
12478     for (int j = 0; j < 4096; j += 256) {
12479       for (int k = 0; k < 32; k += 4) {
12480         __asm__ volatile (
12481           "vmovupd (%0), %%ymm0\n"
12482           "vmovupd (%1), %%ymm1\n"
12483           "vmovupd (%2), %%ymm2\n"
12484           "vmovupd (%3), %%ymm3\n"
12485           "vmovupd (%4), %%ymm4\n"
12486           "vmovupd (%5), %%ymm5\n"
12487           "vmovupd (%6), %%ymm6\n"
12488           "vmovupd (%7), %%ymm7\n"
12489           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
12490           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
12491           "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
12492           "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
12493           "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
12494           "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
12495           "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
12496           "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
12497           "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
12498           "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
12499           "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
12500           "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
12501           "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
12502           "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
12503           "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
12504           "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
12505           "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
12506           "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
12507           "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
12508           "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
12509           "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
12510           "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
12511           "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
12512           "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
12513           "vmovupd %%ymm8, (%0)\n"
12514           "vmovupd %%ymm9, (%1)\n"
12515           "vmovupd %%ymm10, (%2)\n"
12516           "vmovupd %%ymm11, (%3)\n"
12517           "vmovupd %%ymm12, (%4)\n"
12518           "vmovupd %%ymm13, (%5)\n"
12519           "vmovupd %%ymm14, (%6)\n"
12520           "vmovupd %%ymm15, (%7)\n"
12521           :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
12522         );
12523       }
12524     }
12525     for (int j = 0; j < 4096; j += 2048) {
12526       for (int k = 0; k < 256; k += 4) {
12527         __asm__ volatile (
12528           "vmovupd (%0), %%ymm0\n"
12529           "vmovupd (%1), %%ymm1\n"
12530           "vmovupd (%2), %%ymm2\n"
12531           "vmovupd (%3), %%ymm3\n"
12532           "vmovupd (%4), %%ymm4\n"
12533           "vmovupd (%5), %%ymm5\n"
12534           "vmovupd (%6), %%ymm6\n"
12535           "vmovupd (%7), %%ymm7\n"
12536           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
12537           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
12538           "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
12539           "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
12540           "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
12541           "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
12542           "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
12543           "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
12544           "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
12545           "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
12546           "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
12547           "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
12548           "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
12549           "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
12550           "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
12551           "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
12552           "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
12553           "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
12554           "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
12555           "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
12556           "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
12557           "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
12558           "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
12559           "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
12560           "vmovupd %%ymm8, (%0)\n"
12561           "vmovupd %%ymm9, (%1)\n"
12562           "vmovupd %%ymm10, (%2)\n"
12563           "vmovupd %%ymm11, (%3)\n"
12564           "vmovupd %%ymm12, (%4)\n"
12565           "vmovupd %%ymm13, (%5)\n"
12566           "vmovupd %%ymm14, (%6)\n"
12567           "vmovupd %%ymm15, (%7)\n"
12568           :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
12569         );
12570       }
12571     }
12572     for (int j = 0; j < 4096; j += 4096) {
12573       for (int k = 0; k < 2048; k += 4) {
12574         __asm__ volatile (
12575           "vmovupd (%0), %%ymm0\n"
12576           "vmovupd (%1), %%ymm1\n"
12577           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
12578           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
12579           "vmovupd %%ymm8, (%0)\n"
12580           "vmovupd %%ymm9, (%1)\n"
12581           :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
12582         );
12583       }
12584     }
12585     return;
12586   }
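  /*
   * depth == 15: split the 2^15-double buffer into eight 2^12 blocks,
   * transform each recursively, then merge them with one radix-8
   * butterfly pass at stride 4096.  In Kronecker form this is
   * H(2^15) = (H(8) kron I(4096)) * (I(8) kron H(4096)), with the
   * cross-block factor applied last.
   */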
12587   if (depth == 15) {
12588     helper_double_15_recursive(buf + 0, 12);
12589     helper_double_15_recursive(buf + 4096, 12);
12590     helper_double_15_recursive(buf + 8192, 12);
12591     helper_double_15_recursive(buf + 12288, 12);
12592     helper_double_15_recursive(buf + 16384, 12);
12593     helper_double_15_recursive(buf + 20480, 12);
12594     helper_double_15_recursive(buf + 24576, 12);
12595     helper_double_15_recursive(buf + 28672, 12);
12596     for (int j = 0; j < 32768; j += 32768) {
12597       for (int k = 0; k < 4096; k += 4) {
12598         __asm__ volatile (
12599           "vmovupd (%0), %%ymm0\n"
12600           "vmovupd (%1), %%ymm1\n"
12601           "vmovupd (%2), %%ymm2\n"
12602           "vmovupd (%3), %%ymm3\n"
12603           "vmovupd (%4), %%ymm4\n"
12604           "vmovupd (%5), %%ymm5\n"
12605           "vmovupd (%6), %%ymm6\n"
12606           "vmovupd (%7), %%ymm7\n"
12607           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
12608           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
12609           "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
12610           "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
12611           "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
12612           "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
12613           "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
12614           "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
12615           "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
12616           "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
12617           "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
12618           "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
12619           "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
12620           "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
12621           "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
12622           "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
12623           "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
12624           "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
12625           "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
12626           "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
12627           "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
12628           "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
12629           "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
12630           "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
12631           "vmovupd %%ymm8, (%0)\n"
12632           "vmovupd %%ymm9, (%1)\n"
12633           "vmovupd %%ymm10, (%2)\n"
12634           "vmovupd %%ymm11, (%3)\n"
12635           "vmovupd %%ymm12, (%4)\n"
12636           "vmovupd %%ymm13, (%5)\n"
12637           "vmovupd %%ymm14, (%6)\n"
12638           "vmovupd %%ymm15, (%7)\n"
12639           :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
12640         );
12641       }
12642     }
12643     return;
12644   }
12645 }
12646 void helper_double_15(double *buf);
12647 void helper_double_15(double *buf) {
12648   helper_double_15_recursive(buf, 15);
12649 }
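/*
 * For orientation: helper_double_N computes an unnormalized
 * Walsh-Hadamard transform of 2^N doubles in place.  The scalar sketch
 * below expresses the same computation without AVX.  The name
 * fwht_double_reference is ours, added for illustration; it is not
 * part of the generated FFHT kernels and nothing in this file calls it.
 */
static void fwht_double_reference(double *buf, int log_n) __attribute__((unused));
static void fwht_double_reference(double *buf, int log_n) {
  long n = 1L << log_n;
  /* One pass per level; the butterfly stride doubles each time. */
  for (long h = 1; h < n; h <<= 1) {
    for (long j = 0; j < n; j += h << 1) {
      for (long k = j; k < j + h; ++k) {
        double u = buf[k];
        double v = buf[k + h];
        buf[k] = u + v;     /* butterfly: (u, v) -> (u + v, u - v) */
        buf[k + h] = u - v;
      }
    }
  }
}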
12650 void helper_double_16_recursive(double *buf, int depth);
12651 void helper_double_16_recursive(double *buf, int depth) {
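  /*
   * 2^16-point transform dispatched on depth: depths 16 and 14 split
   * the buffer and recurse, while depth 11 is the AVX base case.  The
   * base case handles strides 1..16 inside registers (vpermilpd and
   * vaddsubpd for stride 1, vperm2f128 for stride 2, cross-register
   * butterflies for strides 4..16) and the remaining strides with
   * radix-8 passes over memory.
   */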
12652   if (depth == 11) {
12653     for (int j = 0; j < 2048; j += 32) {
12654       for (int k = 0; k < 4; k += 4) {
12655         __asm__ volatile (
12656           "vmovupd (%0), %%ymm0\n"
12657           "vmovupd (%1), %%ymm1\n"
12658           "vmovupd (%2), %%ymm2\n"
12659           "vmovupd (%3), %%ymm3\n"
12660           "vmovupd (%4), %%ymm4\n"
12661           "vmovupd (%5), %%ymm5\n"
12662           "vmovupd (%6), %%ymm6\n"
12663           "vmovupd (%7), %%ymm7\n"
12664           "vpermilpd $0, %%ymm0, %%ymm8\n"
12665           "vpermilpd $15, %%ymm0, %%ymm9\n"
12666           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
12667           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
12668           "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n"
12669           "vpermilpd $0, %%ymm1, %%ymm8\n"
12670           "vpermilpd $15, %%ymm1, %%ymm9\n"
12671           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
12672           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
12673           "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n"
12674           "vpermilpd $0, %%ymm2, %%ymm8\n"
12675           "vpermilpd $15, %%ymm2, %%ymm9\n"
12676           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
12677           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
12678           "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n"
12679           "vpermilpd $0, %%ymm3, %%ymm8\n"
12680           "vpermilpd $15, %%ymm3, %%ymm9\n"
12681           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
12682           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
12683           "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n"
12684           "vpermilpd $0, %%ymm4, %%ymm8\n"
12685           "vpermilpd $15, %%ymm4, %%ymm9\n"
12686           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
12687           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
12688           "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n"
12689           "vpermilpd $0, %%ymm5, %%ymm8\n"
12690           "vpermilpd $15, %%ymm5, %%ymm9\n"
12691           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
12692           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
12693           "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n"
12694           "vpermilpd $0, %%ymm6, %%ymm8\n"
12695           "vpermilpd $15, %%ymm6, %%ymm9\n"
12696           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
12697           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
12698           "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n"
12699           "vpermilpd $0, %%ymm7, %%ymm8\n"
12700           "vpermilpd $15, %%ymm7, %%ymm9\n"
12701           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
12702           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
12703           "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n"
12704           "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n"
12705           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
12706           "vsubpd %%ymm0, %%ymm9, %%ymm10\n"
12707           "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n"
12708           "vaddpd %%ymm11, %%ymm8, %%ymm0\n"
12709           "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n"
12710           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
12711           "vsubpd %%ymm1, %%ymm9, %%ymm10\n"
12712           "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n"
12713           "vaddpd %%ymm11, %%ymm8, %%ymm1\n"
12714           "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n"
12715           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
12716           "vsubpd %%ymm2, %%ymm9, %%ymm10\n"
12717           "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n"
12718           "vaddpd %%ymm11, %%ymm8, %%ymm2\n"
12719           "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n"
12720           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
12721           "vsubpd %%ymm3, %%ymm9, %%ymm10\n"
12722           "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n"
12723           "vaddpd %%ymm11, %%ymm8, %%ymm3\n"
12724           "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n"
12725           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
12726           "vsubpd %%ymm4, %%ymm9, %%ymm10\n"
12727           "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n"
12728           "vaddpd %%ymm11, %%ymm8, %%ymm4\n"
12729           "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n"
12730           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
12731           "vsubpd %%ymm5, %%ymm9, %%ymm10\n"
12732           "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n"
12733           "vaddpd %%ymm11, %%ymm8, %%ymm5\n"
12734           "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n"
12735           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
12736           "vsubpd %%ymm6, %%ymm9, %%ymm10\n"
12737           "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n"
12738           "vaddpd %%ymm11, %%ymm8, %%ymm6\n"
12739           "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n"
12740           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
12741           "vsubpd %%ymm7, %%ymm9, %%ymm10\n"
12742           "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n"
12743           "vaddpd %%ymm11, %%ymm8, %%ymm7\n"
12744           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
12745           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
12746           "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
12747           "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
12748           "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
12749           "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
12750           "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
12751           "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
12752           "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
12753           "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
12754           "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
12755           "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
12756           "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
12757           "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
12758           "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
12759           "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
12760           "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
12761           "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
12762           "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
12763           "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
12764           "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
12765           "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
12766           "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
12767           "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
12768           "vmovupd %%ymm8, (%0)\n"
12769           "vmovupd %%ymm9, (%1)\n"
12770           "vmovupd %%ymm10, (%2)\n"
12771           "vmovupd %%ymm11, (%3)\n"
12772           "vmovupd %%ymm12, (%4)\n"
12773           "vmovupd %%ymm13, (%5)\n"
12774           "vmovupd %%ymm14, (%6)\n"
12775           "vmovupd %%ymm15, (%7)\n"
12776           :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
12777         );
12778       }
12779     }
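    /* Strides 32, 64 and 128: radix-8 butterflies between vectors
       spaced 32 doubles apart, merging 32-point blocks into 256-point
       blocks. */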
12780     for (int j = 0; j < 2048; j += 256) {
12781       for (int k = 0; k < 32; k += 4) {
12782         __asm__ volatile (
12783           "vmovupd (%0), %%ymm0\n"
12784           "vmovupd (%1), %%ymm1\n"
12785           "vmovupd (%2), %%ymm2\n"
12786           "vmovupd (%3), %%ymm3\n"
12787           "vmovupd (%4), %%ymm4\n"
12788           "vmovupd (%5), %%ymm5\n"
12789           "vmovupd (%6), %%ymm6\n"
12790           "vmovupd (%7), %%ymm7\n"
12791           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
12792           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
12793           "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
12794           "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
12795           "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
12796           "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
12797           "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
12798           "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
12799           "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
12800           "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
12801           "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
12802           "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
12803           "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
12804           "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
12805           "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
12806           "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
12807           "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
12808           "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
12809           "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
12810           "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
12811           "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
12812           "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
12813           "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
12814           "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
12815           "vmovupd %%ymm8, (%0)\n"
12816           "vmovupd %%ymm9, (%1)\n"
12817           "vmovupd %%ymm10, (%2)\n"
12818           "vmovupd %%ymm11, (%3)\n"
12819           "vmovupd %%ymm12, (%4)\n"
12820           "vmovupd %%ymm13, (%5)\n"
12821           "vmovupd %%ymm14, (%6)\n"
12822           "vmovupd %%ymm15, (%7)\n"
12823           :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
12824         );
12825       }
12826     }
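    /* Strides 256, 512 and 1024: the last radix-8 pass of the base
       case, merging 256-point blocks into one 2048-point block. */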
12827     for (int j = 0; j < 2048; j += 2048) {
12828       for (int k = 0; k < 256; k += 4) {
12829         __asm__ volatile (
12830           "vmovupd (%0), %%ymm0\n"
12831           "vmovupd (%1), %%ymm1\n"
12832           "vmovupd (%2), %%ymm2\n"
12833           "vmovupd (%3), %%ymm3\n"
12834           "vmovupd (%4), %%ymm4\n"
12835           "vmovupd (%5), %%ymm5\n"
12836           "vmovupd (%6), %%ymm6\n"
12837           "vmovupd (%7), %%ymm7\n"
12838           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
12839           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
12840           "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
12841           "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
12842           "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
12843           "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
12844           "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
12845           "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
12846           "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
12847           "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
12848           "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
12849           "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
12850           "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
12851           "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
12852           "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
12853           "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
12854           "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
12855           "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
12856           "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
12857           "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
12858           "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
12859           "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
12860           "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
12861           "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
12862           "vmovupd %%ymm8, (%0)\n"
12863           "vmovupd %%ymm9, (%1)\n"
12864           "vmovupd %%ymm10, (%2)\n"
12865           "vmovupd %%ymm11, (%3)\n"
12866           "vmovupd %%ymm12, (%4)\n"
12867           "vmovupd %%ymm13, (%5)\n"
12868           "vmovupd %%ymm14, (%6)\n"
12869           "vmovupd %%ymm15, (%7)\n"
12870           :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
12871         );
12872       }
12873     }
12874     return;
12875   }
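  /*
   * depth == 14: eight recursive 2^11 transforms, then one radix-8
   * merge pass at stride 2048.
   */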
12876   if (depth == 14) {
12877     helper_double_16_recursive(buf + 0, 11);
12878     helper_double_16_recursive(buf + 2048, 11);
12879     helper_double_16_recursive(buf + 4096, 11);
12880     helper_double_16_recursive(buf + 6144, 11);
12881     helper_double_16_recursive(buf + 8192, 11);
12882     helper_double_16_recursive(buf + 10240, 11);
12883     helper_double_16_recursive(buf + 12288, 11);
12884     helper_double_16_recursive(buf + 14336, 11);
12885     for (int j = 0; j < 16384; j += 16384) {
12886       for (int k = 0; k < 2048; k += 4) {
12887         __asm__ volatile (
12888           "vmovupd (%0), %%ymm0\n"
12889           "vmovupd (%1), %%ymm1\n"
12890           "vmovupd (%2), %%ymm2\n"
12891           "vmovupd (%3), %%ymm3\n"
12892           "vmovupd (%4), %%ymm4\n"
12893           "vmovupd (%5), %%ymm5\n"
12894           "vmovupd (%6), %%ymm6\n"
12895           "vmovupd (%7), %%ymm7\n"
12896           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
12897           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
12898           "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
12899           "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
12900           "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
12901           "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
12902           "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
12903           "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
12904           "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
12905           "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
12906           "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
12907           "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
12908           "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
12909           "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
12910           "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
12911           "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
12912           "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
12913           "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
12914           "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
12915           "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
12916           "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
12917           "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
12918           "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
12919           "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
12920           "vmovupd %%ymm8, (%0)\n"
12921           "vmovupd %%ymm9, (%1)\n"
12922           "vmovupd %%ymm10, (%2)\n"
12923           "vmovupd %%ymm11, (%3)\n"
12924           "vmovupd %%ymm12, (%4)\n"
12925           "vmovupd %%ymm13, (%5)\n"
12926           "vmovupd %%ymm14, (%6)\n"
12927           "vmovupd %%ymm15, (%7)\n"
12928           :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
12929         );
12930       }
12931     }
12932     return;
12933   }
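  /*
   * depth == 16: only two levels remain above depth 14, so the merge
   * is radix-4 rather than radix-8; four recursive 2^14 transforms
   * are combined with two butterfly layers using ymm0..ymm3 only.
   */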
12934   if (depth == 16) {
12935     helper_double_16_recursive(buf + 0, 14);
12936     helper_double_16_recursive(buf + 16384, 14);
12937     helper_double_16_recursive(buf + 32768, 14);
12938     helper_double_16_recursive(buf + 49152, 14);
12939     for (int j = 0; j < 65536; j += 65536) {
12940       for (int k = 0; k < 16384; k += 4) {
12941         __asm__ volatile (
12942           "vmovupd (%0), %%ymm0\n"
12943           "vmovupd (%1), %%ymm1\n"
12944           "vmovupd (%2), %%ymm2\n"
12945           "vmovupd (%3), %%ymm3\n"
12946           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
12947           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
12948           "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
12949           "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
12950           "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
12951           "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
12952           "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
12953           "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
12954           "vmovupd %%ymm0, (%0)\n"
12955           "vmovupd %%ymm1, (%1)\n"
12956           "vmovupd %%ymm2, (%2)\n"
12957           "vmovupd %%ymm3, (%3)\n"
12958           :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
12959         );
12960       }
12961     }
12962     return;
12963   }
12964 }
12965 void helper_double_16(double *buf);
12966 void helper_double_16(double *buf) {
12967   helper_double_16_recursive(buf, 16);
12968 }
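/*
 * helper_double_17 follows the same scheme with the depth chain
 * 17 -> 14 -> 11: each split removes three levels, so every merge
 * pass is radix-8.
 */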
12969 void helper_double_17_recursive(double *buf, int depth);
12970 void helper_double_17_recursive(double *buf, int depth) {
12971   if (depth == 11) {
12972     for (int j = 0; j < 2048; j += 32) {
12973       for (int k = 0; k < 4; k += 4) {
12974         __asm__ volatile (
12975           "vmovupd (%0), %%ymm0\n"
12976           "vmovupd (%1), %%ymm1\n"
12977           "vmovupd (%2), %%ymm2\n"
12978           "vmovupd (%3), %%ymm3\n"
12979           "vmovupd (%4), %%ymm4\n"
12980           "vmovupd (%5), %%ymm5\n"
12981           "vmovupd (%6), %%ymm6\n"
12982           "vmovupd (%7), %%ymm7\n"
12983           "vpermilpd $0, %%ymm0, %%ymm8\n"
12984           "vpermilpd $15, %%ymm0, %%ymm9\n"
12985           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
12986           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
12987           "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n"
12988           "vpermilpd $0, %%ymm1, %%ymm8\n"
12989           "vpermilpd $15, %%ymm1, %%ymm9\n"
12990           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
12991           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
12992           "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n"
12993           "vpermilpd $0, %%ymm2, %%ymm8\n"
12994           "vpermilpd $15, %%ymm2, %%ymm9\n"
12995           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
12996           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
12997           "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n"
12998           "vpermilpd $0, %%ymm3, %%ymm8\n"
12999           "vpermilpd $15, %%ymm3, %%ymm9\n"
13000           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
13001           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
13002           "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n"
13003           "vpermilpd $0, %%ymm4, %%ymm8\n"
13004           "vpermilpd $15, %%ymm4, %%ymm9\n"
13005           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
13006           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
13007           "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n"
13008           "vpermilpd $0, %%ymm5, %%ymm8\n"
13009           "vpermilpd $15, %%ymm5, %%ymm9\n"
13010           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
13011           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
13012           "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n"
13013           "vpermilpd $0, %%ymm6, %%ymm8\n"
13014           "vpermilpd $15, %%ymm6, %%ymm9\n"
13015           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
13016           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
13017           "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n"
13018           "vpermilpd $0, %%ymm7, %%ymm8\n"
13019           "vpermilpd $15, %%ymm7, %%ymm9\n"
13020           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
13021           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
13022           "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n"
13023           "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n"
13024           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
13025           "vsubpd %%ymm0, %%ymm9, %%ymm10\n"
13026           "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n"
13027           "vaddpd %%ymm11, %%ymm8, %%ymm0\n"
13028           "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n"
13029           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
13030           "vsubpd %%ymm1, %%ymm9, %%ymm10\n"
13031           "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n"
13032           "vaddpd %%ymm11, %%ymm8, %%ymm1\n"
13033           "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n"
13034           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
13035           "vsubpd %%ymm2, %%ymm9, %%ymm10\n"
13036           "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n"
13037           "vaddpd %%ymm11, %%ymm8, %%ymm2\n"
13038           "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n"
13039           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
13040           "vsubpd %%ymm3, %%ymm9, %%ymm10\n"
13041           "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n"
13042           "vaddpd %%ymm11, %%ymm8, %%ymm3\n"
13043           "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n"
13044           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
13045           "vsubpd %%ymm4, %%ymm9, %%ymm10\n"
13046           "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n"
13047           "vaddpd %%ymm11, %%ymm8, %%ymm4\n"
13048           "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n"
13049           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
13050           "vsubpd %%ymm5, %%ymm9, %%ymm10\n"
13051           "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n"
13052           "vaddpd %%ymm11, %%ymm8, %%ymm5\n"
13053           "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n"
13054           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
13055           "vsubpd %%ymm6, %%ymm9, %%ymm10\n"
13056           "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n"
13057           "vaddpd %%ymm11, %%ymm8, %%ymm6\n"
13058           "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n"
13059           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
13060           "vsubpd %%ymm7, %%ymm9, %%ymm10\n"
13061           "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n"
13062           "vaddpd %%ymm11, %%ymm8, %%ymm7\n"
13063           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
13064           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
13065           "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
13066           "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
13067           "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
13068           "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
13069           "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
13070           "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
13071           "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
13072           "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
13073           "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
13074           "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
13075           "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
13076           "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
13077           "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
13078           "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
13079           "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
13080           "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
13081           "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
13082           "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
13083           "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
13084           "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
13085           "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
13086           "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
13087           "vmovupd %%ymm8, (%0)\n"
13088           "vmovupd %%ymm9, (%1)\n"
13089           "vmovupd %%ymm10, (%2)\n"
13090           "vmovupd %%ymm11, (%3)\n"
13091           "vmovupd %%ymm12, (%4)\n"
13092           "vmovupd %%ymm13, (%5)\n"
13093           "vmovupd %%ymm14, (%6)\n"
13094           "vmovupd %%ymm15, (%7)\n"
13095           :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
13096         );
13097       }
13098     }
13099     for (int j = 0; j < 2048; j += 256) {
13100       for (int k = 0; k < 32; k += 4) {
13101         __asm__ volatile (
13102           "vmovupd (%0), %%ymm0\n"
13103           "vmovupd (%1), %%ymm1\n"
13104           "vmovupd (%2), %%ymm2\n"
13105           "vmovupd (%3), %%ymm3\n"
13106           "vmovupd (%4), %%ymm4\n"
13107           "vmovupd (%5), %%ymm5\n"
13108           "vmovupd (%6), %%ymm6\n"
13109           "vmovupd (%7), %%ymm7\n"
13110           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
13111           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
13112           "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
13113           "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
13114           "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
13115           "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
13116           "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
13117           "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
13118           "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
13119           "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
13120           "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
13121           "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
13122           "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
13123           "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
13124           "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
13125           "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
13126           "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
13127           "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
13128           "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
13129           "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
13130           "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
13131           "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
13132           "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
13133           "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
13134           "vmovupd %%ymm8, (%0)\n"
13135           "vmovupd %%ymm9, (%1)\n"
13136           "vmovupd %%ymm10, (%2)\n"
13137           "vmovupd %%ymm11, (%3)\n"
13138           "vmovupd %%ymm12, (%4)\n"
13139           "vmovupd %%ymm13, (%5)\n"
13140           "vmovupd %%ymm14, (%6)\n"
13141           "vmovupd %%ymm15, (%7)\n"
13142           :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
13143         );
13144       }
13145     }
13146     for (int j = 0; j < 2048; j += 2048) {
13147       for (int k = 0; k < 256; k += 4) {
13148         __asm__ volatile (
13149           "vmovupd (%0), %%ymm0\n"
13150           "vmovupd (%1), %%ymm1\n"
13151           "vmovupd (%2), %%ymm2\n"
13152           "vmovupd (%3), %%ymm3\n"
13153           "vmovupd (%4), %%ymm4\n"
13154           "vmovupd (%5), %%ymm5\n"
13155           "vmovupd (%6), %%ymm6\n"
13156           "vmovupd (%7), %%ymm7\n"
13157           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
13158           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
13159           "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
13160           "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
13161           "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
13162           "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
13163           "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
13164           "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
13165           "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
13166           "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
13167           "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
13168           "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
13169           "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
13170           "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
13171           "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
13172           "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
13173           "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
13174           "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
13175           "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
13176           "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
13177           "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
13178           "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
13179           "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
13180           "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
13181           "vmovupd %%ymm8, (%0)\n"
13182           "vmovupd %%ymm9, (%1)\n"
13183           "vmovupd %%ymm10, (%2)\n"
13184           "vmovupd %%ymm11, (%3)\n"
13185           "vmovupd %%ymm12, (%4)\n"
13186           "vmovupd %%ymm13, (%5)\n"
13187           "vmovupd %%ymm14, (%6)\n"
13188           "vmovupd %%ymm15, (%7)\n"
13189           :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
13190         );
13191       }
13192     }
13193     return;
13194   }
13195   if (depth == 14) {
13196     helper_double_17_recursive(buf + 0, 11);
13197     helper_double_17_recursive(buf + 2048, 11);
13198     helper_double_17_recursive(buf + 4096, 11);
13199     helper_double_17_recursive(buf + 6144, 11);
13200     helper_double_17_recursive(buf + 8192, 11);
13201     helper_double_17_recursive(buf + 10240, 11);
13202     helper_double_17_recursive(buf + 12288, 11);
13203     helper_double_17_recursive(buf + 14336, 11);
13204     for (int j = 0; j < 16384; j += 16384) {
13205       for (int k = 0; k < 2048; k += 4) {
13206         __asm__ volatile (
13207           "vmovupd (%0), %%ymm0\n"
13208           "vmovupd (%1), %%ymm1\n"
13209           "vmovupd (%2), %%ymm2\n"
13210           "vmovupd (%3), %%ymm3\n"
13211           "vmovupd (%4), %%ymm4\n"
13212           "vmovupd (%5), %%ymm5\n"
13213           "vmovupd (%6), %%ymm6\n"
13214           "vmovupd (%7), %%ymm7\n"
13215           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
13216           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
13217           "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
13218           "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
13219           "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
13220           "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
13221           "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
13222           "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
13223           "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
13224           "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
13225           "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
13226           "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
13227           "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
13228           "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
13229           "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
13230           "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
13231           "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
13232           "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
13233           "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
13234           "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
13235           "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
13236           "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
13237           "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
13238           "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
13239           "vmovupd %%ymm8, (%0)\n"
13240           "vmovupd %%ymm9, (%1)\n"
13241           "vmovupd %%ymm10, (%2)\n"
13242           "vmovupd %%ymm11, (%3)\n"
13243           "vmovupd %%ymm12, (%4)\n"
13244           "vmovupd %%ymm13, (%5)\n"
13245           "vmovupd %%ymm14, (%6)\n"
13246           "vmovupd %%ymm15, (%7)\n"
13247           :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
13248         );
13249       }
13250     }
13251     return;
13252   }
13253   if (depth == 17) {
13254     helper_double_17_recursive(buf + 0, 14);
13255     helper_double_17_recursive(buf + 16384, 14);
13256     helper_double_17_recursive(buf + 32768, 14);
13257     helper_double_17_recursive(buf + 49152, 14);
13258     helper_double_17_recursive(buf + 65536, 14);
13259     helper_double_17_recursive(buf + 81920, 14);
13260     helper_double_17_recursive(buf + 98304, 14);
13261     helper_double_17_recursive(buf + 114688, 14);
13262     for (int j = 0; j < 131072; j += 131072) {
13263       for (int k = 0; k < 16384; k += 4) {
13264         __asm__ volatile (
13265           "vmovupd (%0), %%ymm0\n"
13266           "vmovupd (%1), %%ymm1\n"
13267           "vmovupd (%2), %%ymm2\n"
13268           "vmovupd (%3), %%ymm3\n"
13269           "vmovupd (%4), %%ymm4\n"
13270           "vmovupd (%5), %%ymm5\n"
13271           "vmovupd (%6), %%ymm6\n"
13272           "vmovupd (%7), %%ymm7\n"
13273           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
13274           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
13275           "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
13276           "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
13277           "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
13278           "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
13279           "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
13280           "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
13281           "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
13282           "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
13283           "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
13284           "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
13285           "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
13286           "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
13287           "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
13288           "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
13289           "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
13290           "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
13291           "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
13292           "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
13293           "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
13294           "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
13295           "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
13296           "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
13297           "vmovupd %%ymm8, (%0)\n"
13298           "vmovupd %%ymm9, (%1)\n"
13299           "vmovupd %%ymm10, (%2)\n"
13300           "vmovupd %%ymm11, (%3)\n"
13301           "vmovupd %%ymm12, (%4)\n"
13302           "vmovupd %%ymm13, (%5)\n"
13303           "vmovupd %%ymm14, (%6)\n"
13304           "vmovupd %%ymm15, (%7)\n"
13305           :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152), "r"(buf + j + k + 65536), "r"(buf + j + k + 81920), "r"(buf + j + k + 98304), "r"(buf + j + k + 114688) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
13306         );
13307       }
13308     }
13309     return;
13310   }
13311 }
13312 void helper_double_17(double *buf);
13313 void helper_double_17(double *buf) {
13314   helper_double_17_recursive(buf, 17);
13315 }
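/*
 * helper_double_18 uses the depth chain 18 -> 15 -> 12.  Its base case
 * (depth == 12) sits one level deeper than the 2^11 base case above,
 * so after the radix-8 memory passes it appends a radix-2 pass at
 * stride 2048.
 */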
13316 void helper_double_18_recursive(double *buf, int depth);
13317 void helper_double_18_recursive(double *buf, int depth) {
13318   if (depth == 12) {
13319     for (int j = 0; j < 4096; j += 32) {
13320       for (int k = 0; k < 4; k += 4) {
13321         __asm__ volatile (
13322           "vmovupd (%0), %%ymm0\n"
13323           "vmovupd (%1), %%ymm1\n"
13324           "vmovupd (%2), %%ymm2\n"
13325           "vmovupd (%3), %%ymm3\n"
13326           "vmovupd (%4), %%ymm4\n"
13327           "vmovupd (%5), %%ymm5\n"
13328           "vmovupd (%6), %%ymm6\n"
13329           "vmovupd (%7), %%ymm7\n"
13330           "vpermilpd $0, %%ymm0, %%ymm8\n"
13331           "vpermilpd $15, %%ymm0, %%ymm9\n"
13332           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
13333           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
13334           "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n"
13335           "vpermilpd $0, %%ymm1, %%ymm8\n"
13336           "vpermilpd $15, %%ymm1, %%ymm9\n"
13337           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
13338           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
13339           "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n"
13340           "vpermilpd $0, %%ymm2, %%ymm8\n"
13341           "vpermilpd $15, %%ymm2, %%ymm9\n"
13342           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
13343           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
13344           "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n"
13345           "vpermilpd $0, %%ymm3, %%ymm8\n"
13346           "vpermilpd $15, %%ymm3, %%ymm9\n"
13347           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
13348           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
13349           "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n"
13350           "vpermilpd $0, %%ymm4, %%ymm8\n"
13351           "vpermilpd $15, %%ymm4, %%ymm9\n"
13352           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
13353           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
13354           "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n"
13355           "vpermilpd $0, %%ymm5, %%ymm8\n"
13356           "vpermilpd $15, %%ymm5, %%ymm9\n"
13357           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
13358           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
13359           "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n"
13360           "vpermilpd $0, %%ymm6, %%ymm8\n"
13361           "vpermilpd $15, %%ymm6, %%ymm9\n"
13362           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
13363           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
13364           "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n"
13365           "vpermilpd $0, %%ymm7, %%ymm8\n"
13366           "vpermilpd $15, %%ymm7, %%ymm9\n"
13367           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
13368           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
13369           "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n"
13370           "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n"
13371           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
13372           "vsubpd %%ymm0, %%ymm9, %%ymm10\n"
13373           "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n"
13374           "vaddpd %%ymm11, %%ymm8, %%ymm0\n"
13375           "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n"
13376           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
13377           "vsubpd %%ymm1, %%ymm9, %%ymm10\n"
13378           "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n"
13379           "vaddpd %%ymm11, %%ymm8, %%ymm1\n"
13380           "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n"
13381           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
13382           "vsubpd %%ymm2, %%ymm9, %%ymm10\n"
13383           "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n"
13384           "vaddpd %%ymm11, %%ymm8, %%ymm2\n"
13385           "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n"
13386           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
13387           "vsubpd %%ymm3, %%ymm9, %%ymm10\n"
13388           "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n"
13389           "vaddpd %%ymm11, %%ymm8, %%ymm3\n"
13390           "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n"
13391           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
13392           "vsubpd %%ymm4, %%ymm9, %%ymm10\n"
13393           "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n"
13394           "vaddpd %%ymm11, %%ymm8, %%ymm4\n"
13395           "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n"
13396           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
13397           "vsubpd %%ymm5, %%ymm9, %%ymm10\n"
13398           "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n"
13399           "vaddpd %%ymm11, %%ymm8, %%ymm5\n"
13400           "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n"
13401           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
13402           "vsubpd %%ymm6, %%ymm9, %%ymm10\n"
13403           "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n"
13404           "vaddpd %%ymm11, %%ymm8, %%ymm6\n"
13405           "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n"
13406           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
13407           "vsubpd %%ymm7, %%ymm9, %%ymm10\n"
13408           "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n"
13409           "vaddpd %%ymm11, %%ymm8, %%ymm7\n"
13410           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
13411           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
13412           "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
13413           "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
13414           "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
13415           "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
13416           "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
13417           "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
13418           "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
13419           "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
13420           "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
13421           "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
13422           "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
13423           "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
13424           "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
13425           "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
13426           "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
13427           "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
13428           "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
13429           "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
13430           "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
13431           "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
13432           "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
13433           "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
13434           "vmovupd %%ymm8, (%0)\n"
13435           "vmovupd %%ymm9, (%1)\n"
13436           "vmovupd %%ymm10, (%2)\n"
13437           "vmovupd %%ymm11, (%3)\n"
13438           "vmovupd %%ymm12, (%4)\n"
13439           "vmovupd %%ymm13, (%5)\n"
13440           "vmovupd %%ymm14, (%6)\n"
13441           "vmovupd %%ymm15, (%7)\n"
13442           :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
13443         );
13444       }
13445     }
13446     for (int j = 0; j < 4096; j += 256) {
13447       for (int k = 0; k < 32; k += 4) {
13448         __asm__ volatile (
13449           "vmovupd (%0), %%ymm0\n"
13450           "vmovupd (%1), %%ymm1\n"
13451           "vmovupd (%2), %%ymm2\n"
13452           "vmovupd (%3), %%ymm3\n"
13453           "vmovupd (%4), %%ymm4\n"
13454           "vmovupd (%5), %%ymm5\n"
13455           "vmovupd (%6), %%ymm6\n"
13456           "vmovupd (%7), %%ymm7\n"
13457           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
13458           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
13459           "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
13460           "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
13461           "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
13462           "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
13463           "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
13464           "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
13465           "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
13466           "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
13467           "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
13468           "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
13469           "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
13470           "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
13471           "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
13472           "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
13473           "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
13474           "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
13475           "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
13476           "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
13477           "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
13478           "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
13479           "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
13480           "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
13481           "vmovupd %%ymm8, (%0)\n"
13482           "vmovupd %%ymm9, (%1)\n"
13483           "vmovupd %%ymm10, (%2)\n"
13484           "vmovupd %%ymm11, (%3)\n"
13485           "vmovupd %%ymm12, (%4)\n"
13486           "vmovupd %%ymm13, (%5)\n"
13487           "vmovupd %%ymm14, (%6)\n"
13488           "vmovupd %%ymm15, (%7)\n"
13489           :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
13490         );
13491       }
13492     }
13493     for (int j = 0; j < 4096; j += 2048) {
13494       for (int k = 0; k < 256; k += 4) {
13495         __asm__ volatile (
13496           "vmovupd (%0), %%ymm0\n"
13497           "vmovupd (%1), %%ymm1\n"
13498           "vmovupd (%2), %%ymm2\n"
13499           "vmovupd (%3), %%ymm3\n"
13500           "vmovupd (%4), %%ymm4\n"
13501           "vmovupd (%5), %%ymm5\n"
13502           "vmovupd (%6), %%ymm6\n"
13503           "vmovupd (%7), %%ymm7\n"
13504           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
13505           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
13506           "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
13507           "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
13508           "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
13509           "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
13510           "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
13511           "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
13512           "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
13513           "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
13514           "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
13515           "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
13516           "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
13517           "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
13518           "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
13519           "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
13520           "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
13521           "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
13522           "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
13523           "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
13524           "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
13525           "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
13526           "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
13527           "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
13528           "vmovupd %%ymm8, (%0)\n"
13529           "vmovupd %%ymm9, (%1)\n"
13530           "vmovupd %%ymm10, (%2)\n"
13531           "vmovupd %%ymm11, (%3)\n"
13532           "vmovupd %%ymm12, (%4)\n"
13533           "vmovupd %%ymm13, (%5)\n"
13534           "vmovupd %%ymm14, (%6)\n"
13535           "vmovupd %%ymm15, (%7)\n"
13536           :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
13537         );
13538       }
13539     }
13540     for (int j = 0; j < 4096; j += 4096) {
13541       for (int k = 0; k < 2048; k += 4) {
13542         __asm__ volatile (
13543           "vmovupd (%0), %%ymm0\n"
13544           "vmovupd (%1), %%ymm1\n"
13545           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
13546           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
13547           "vmovupd %%ymm8, (%0)\n"
13548           "vmovupd %%ymm9, (%1)\n"
13549           :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
13550         );
13551       }
13552     }
13553     return;
13554   }
13555   if (depth == 15) {
13556     helper_double_18_recursive(buf + 0, 12);
13557     helper_double_18_recursive(buf + 4096, 12);
13558     helper_double_18_recursive(buf + 8192, 12);
13559     helper_double_18_recursive(buf + 12288, 12);
13560     helper_double_18_recursive(buf + 16384, 12);
13561     helper_double_18_recursive(buf + 20480, 12);
13562     helper_double_18_recursive(buf + 24576, 12);
13563     helper_double_18_recursive(buf + 28672, 12);
13564     for (int j = 0; j < 32768; j += 32768) {
13565       for (int k = 0; k < 4096; k += 4) {
13566         __asm__ volatile (
13567           "vmovupd (%0), %%ymm0\n"
13568           "vmovupd (%1), %%ymm1\n"
13569           "vmovupd (%2), %%ymm2\n"
13570           "vmovupd (%3), %%ymm3\n"
13571           "vmovupd (%4), %%ymm4\n"
13572           "vmovupd (%5), %%ymm5\n"
13573           "vmovupd (%6), %%ymm6\n"
13574           "vmovupd (%7), %%ymm7\n"
13575           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
13576           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
13577           "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
13578           "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
13579           "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
13580           "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
13581           "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
13582           "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
13583           "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
13584           "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
13585           "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
13586           "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
13587           "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
13588           "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
13589           "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
13590           "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
13591           "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
13592           "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
13593           "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
13594           "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
13595           "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
13596           "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
13597           "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
13598           "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
13599           "vmovupd %%ymm8, (%0)\n"
13600           "vmovupd %%ymm9, (%1)\n"
13601           "vmovupd %%ymm10, (%2)\n"
13602           "vmovupd %%ymm11, (%3)\n"
13603           "vmovupd %%ymm12, (%4)\n"
13604           "vmovupd %%ymm13, (%5)\n"
13605           "vmovupd %%ymm14, (%6)\n"
13606           "vmovupd %%ymm15, (%7)\n"
13607           :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
13608         );
13609       }
13610     }
13611     return;
13612   }
13613   if (depth == 18) {
13614     helper_double_18_recursive(buf + 0, 15);
13615     helper_double_18_recursive(buf + 32768, 15);
13616     helper_double_18_recursive(buf + 65536, 15);
13617     helper_double_18_recursive(buf + 98304, 15);
13618     helper_double_18_recursive(buf + 131072, 15);
13619     helper_double_18_recursive(buf + 163840, 15);
13620     helper_double_18_recursive(buf + 196608, 15);
13621     helper_double_18_recursive(buf + 229376, 15);
13622     for (int j = 0; j < 262144; j += 262144) {
13623       for (int k = 0; k < 32768; k += 4) {
13624         __asm__ volatile (
13625           "vmovupd (%0), %%ymm0\n"
13626           "vmovupd (%1), %%ymm1\n"
13627           "vmovupd (%2), %%ymm2\n"
13628           "vmovupd (%3), %%ymm3\n"
13629           "vmovupd (%4), %%ymm4\n"
13630           "vmovupd (%5), %%ymm5\n"
13631           "vmovupd (%6), %%ymm6\n"
13632           "vmovupd (%7), %%ymm7\n"
13633           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
13634           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
13635           "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
13636           "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
13637           "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
13638           "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
13639           "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
13640           "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
13641           "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
13642           "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
13643           "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
13644           "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
13645           "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
13646           "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
13647           "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
13648           "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
13649           "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
13650           "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
13651           "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
13652           "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
13653           "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
13654           "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
13655           "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
13656           "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
13657           "vmovupd %%ymm8, (%0)\n"
13658           "vmovupd %%ymm9, (%1)\n"
13659           "vmovupd %%ymm10, (%2)\n"
13660           "vmovupd %%ymm11, (%3)\n"
13661           "vmovupd %%ymm12, (%4)\n"
13662           "vmovupd %%ymm13, (%5)\n"
13663           "vmovupd %%ymm14, (%6)\n"
13664           "vmovupd %%ymm15, (%7)\n"
13665           :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
13666         );
13667       }
13668     }
13669     return;
13670   }
13671 }
13672 void helper_double_18(double *buf);
13673 void helper_double_18(double *buf) {
13674   helper_double_18_recursive(buf, 18);
13675 }
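/*
 * Editorial note: every helper in this file is a machine-generated AVX
 * specialization of the same in-place, unnormalized Walsh-Hadamard
 * butterfly.  As a minimal reference sketch (the function name below is
 * illustrative and not part of the generated code), helper_double_N
 * computes the same result on n = 1 << N doubles as:
 */
static inline void fht_double_reference_sketch(double *buf, int log_n) {
  /* One level per power of two: combine lanes that are "step" apart. */
  for (int step = 1; step < (1 << log_n); step <<= 1) {
    for (int j = 0; j < (1 << log_n); j += step << 1) {
      for (int k = j; k < j + step; ++k) {
        double u = buf[k];        /* upper butterfly input */
        double v = buf[k + step]; /* lower input, "step" lanes away */
        buf[k] = u + v;           /* sum lane */
        buf[k + step] = u - v;    /* difference lane */
      }
    }
  }
}
/*
 * The generated kernels fuse several of these levels per pass: levels at
 * distances 1 and 2 are done inside a single ymm register (the
 * vpermilpd/vaddsubpd and vperm2f128 sequences), while higher levels are
 * done three at a time across eight registers (the radix-8 vaddpd/vsubpd
 * triads).
 */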
13676 void helper_double_19_recursive(double *buf, int depth);
13677 void helper_double_19_recursive(double *buf, int depth) {
13678   if (depth == 11) {
13679     for (int j = 0; j < 2048; j += 32) {
13680       for (int k = 0; k < 4; k += 4) {
13681         __asm__ volatile (
13682           "vmovupd (%0), %%ymm0\n"
13683           "vmovupd (%1), %%ymm1\n"
13684           "vmovupd (%2), %%ymm2\n"
13685           "vmovupd (%3), %%ymm3\n"
13686           "vmovupd (%4), %%ymm4\n"
13687           "vmovupd (%5), %%ymm5\n"
13688           "vmovupd (%6), %%ymm6\n"
13689           "vmovupd (%7), %%ymm7\n"
13690           "vpermilpd $0, %%ymm0, %%ymm8\n"
13691           "vpermilpd $15, %%ymm0, %%ymm9\n"
13692           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
13693           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
13694           "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n"
13695           "vpermilpd $0, %%ymm1, %%ymm8\n"
13696           "vpermilpd $15, %%ymm1, %%ymm9\n"
13697           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
13698           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
13699           "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n"
13700           "vpermilpd $0, %%ymm2, %%ymm8\n"
13701           "vpermilpd $15, %%ymm2, %%ymm9\n"
13702           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
13703           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
13704           "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n"
13705           "vpermilpd $0, %%ymm3, %%ymm8\n"
13706           "vpermilpd $15, %%ymm3, %%ymm9\n"
13707           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
13708           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
13709           "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n"
13710           "vpermilpd $0, %%ymm4, %%ymm8\n"
13711           "vpermilpd $15, %%ymm4, %%ymm9\n"
13712           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
13713           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
13714           "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n"
13715           "vpermilpd $0, %%ymm5, %%ymm8\n"
13716           "vpermilpd $15, %%ymm5, %%ymm9\n"
13717           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
13718           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
13719           "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n"
13720           "vpermilpd $0, %%ymm6, %%ymm8\n"
13721           "vpermilpd $15, %%ymm6, %%ymm9\n"
13722           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
13723           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
13724           "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n"
13725           "vpermilpd $0, %%ymm7, %%ymm8\n"
13726           "vpermilpd $15, %%ymm7, %%ymm9\n"
13727           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
13728           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
13729           "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n"
13730           "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n"
13731           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
13732           "vsubpd %%ymm0, %%ymm9, %%ymm10\n"
13733           "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n"
13734           "vaddpd %%ymm11, %%ymm8, %%ymm0\n"
13735           "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n"
13736           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
13737           "vsubpd %%ymm1, %%ymm9, %%ymm10\n"
13738           "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n"
13739           "vaddpd %%ymm11, %%ymm8, %%ymm1\n"
13740           "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n"
13741           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
13742           "vsubpd %%ymm2, %%ymm9, %%ymm10\n"
13743           "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n"
13744           "vaddpd %%ymm11, %%ymm8, %%ymm2\n"
13745           "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n"
13746           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
13747           "vsubpd %%ymm3, %%ymm9, %%ymm10\n"
13748           "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n"
13749           "vaddpd %%ymm11, %%ymm8, %%ymm3\n"
13750           "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n"
13751           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
13752           "vsubpd %%ymm4, %%ymm9, %%ymm10\n"
13753           "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n"
13754           "vaddpd %%ymm11, %%ymm8, %%ymm4\n"
13755           "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n"
13756           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
13757           "vsubpd %%ymm5, %%ymm9, %%ymm10\n"
13758           "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n"
13759           "vaddpd %%ymm11, %%ymm8, %%ymm5\n"
13760           "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n"
13761           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
13762           "vsubpd %%ymm6, %%ymm9, %%ymm10\n"
13763           "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n"
13764           "vaddpd %%ymm11, %%ymm8, %%ymm6\n"
13765           "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n"
13766           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
13767           "vsubpd %%ymm7, %%ymm9, %%ymm10\n"
13768           "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n"
13769           "vaddpd %%ymm11, %%ymm8, %%ymm7\n"
13770           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
13771           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
13772           "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
13773           "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
13774           "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
13775           "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
13776           "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
13777           "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
13778           "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
13779           "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
13780           "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
13781           "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
13782           "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
13783           "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
13784           "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
13785           "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
13786           "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
13787           "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
13788           "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
13789           "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
13790           "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
13791           "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
13792           "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
13793           "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
13794           "vmovupd %%ymm8, (%0)\n"
13795           "vmovupd %%ymm9, (%1)\n"
13796           "vmovupd %%ymm10, (%2)\n"
13797           "vmovupd %%ymm11, (%3)\n"
13798           "vmovupd %%ymm12, (%4)\n"
13799           "vmovupd %%ymm13, (%5)\n"
13800           "vmovupd %%ymm14, (%6)\n"
13801           "vmovupd %%ymm15, (%7)\n"
13802           :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
13803         );
13804       }
13805     }
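    /*
     * Editorial note: the loop above fuses five butterfly levels per
     * 32-element tile -- vpermilpd/vaddsubpd handles distance 1 inside each
     * ymm register, vperm2f128 handles distance 2, and the closing
     * vaddpd/vsubpd triad handles distances 4, 8 and 16 across the eight
     * registers.  The two loops below finish distances 32..128 and
     * 256..1024, completing the 2^11-element base case.
     */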
13806     for (int j = 0; j < 2048; j += 256) {
13807       for (int k = 0; k < 32; k += 4) {
13808         __asm__ volatile (
13809           "vmovupd (%0), %%ymm0\n"
13810           "vmovupd (%1), %%ymm1\n"
13811           "vmovupd (%2), %%ymm2\n"
13812           "vmovupd (%3), %%ymm3\n"
13813           "vmovupd (%4), %%ymm4\n"
13814           "vmovupd (%5), %%ymm5\n"
13815           "vmovupd (%6), %%ymm6\n"
13816           "vmovupd (%7), %%ymm7\n"
13817           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
13818           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
13819           "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
13820           "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
13821           "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
13822           "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
13823           "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
13824           "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
13825           "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
13826           "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
13827           "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
13828           "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
13829           "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
13830           "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
13831           "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
13832           "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
13833           "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
13834           "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
13835           "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
13836           "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
13837           "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
13838           "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
13839           "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
13840           "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
13841           "vmovupd %%ymm8, (%0)\n"
13842           "vmovupd %%ymm9, (%1)\n"
13843           "vmovupd %%ymm10, (%2)\n"
13844           "vmovupd %%ymm11, (%3)\n"
13845           "vmovupd %%ymm12, (%4)\n"
13846           "vmovupd %%ymm13, (%5)\n"
13847           "vmovupd %%ymm14, (%6)\n"
13848           "vmovupd %%ymm15, (%7)\n"
13849           :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
13850         );
13851       }
13852     }
13853     for (int j = 0; j < 2048; j += 2048) {
13854       for (int k = 0; k < 256; k += 4) {
13855         __asm__ volatile (
13856           "vmovupd (%0), %%ymm0\n"
13857           "vmovupd (%1), %%ymm1\n"
13858           "vmovupd (%2), %%ymm2\n"
13859           "vmovupd (%3), %%ymm3\n"
13860           "vmovupd (%4), %%ymm4\n"
13861           "vmovupd (%5), %%ymm5\n"
13862           "vmovupd (%6), %%ymm6\n"
13863           "vmovupd (%7), %%ymm7\n"
13864           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
13865           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
13866           "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
13867           "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
13868           "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
13869           "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
13870           "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
13871           "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
13872           "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
13873           "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
13874           "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
13875           "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
13876           "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
13877           "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
13878           "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
13879           "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
13880           "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
13881           "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
13882           "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
13883           "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
13884           "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
13885           "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
13886           "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
13887           "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
13888           "vmovupd %%ymm8, (%0)\n"
13889           "vmovupd %%ymm9, (%1)\n"
13890           "vmovupd %%ymm10, (%2)\n"
13891           "vmovupd %%ymm11, (%3)\n"
13892           "vmovupd %%ymm12, (%4)\n"
13893           "vmovupd %%ymm13, (%5)\n"
13894           "vmovupd %%ymm14, (%6)\n"
13895           "vmovupd %%ymm15, (%7)\n"
13896           :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
13897         );
13898       }
13899     }
13900     return;
13901   }
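  /*
   * Editorial note: depths 14 and 17 repeat the same template -- recurse
   * into eight blocks of one eighth the size, then run one radix-8 merge
   * pass.  The top-level depth == 19 case recurses into four 2^17 blocks
   * and merges with a radix-4 pass, since 19 = 11 + 3 + 3 + 2.
   */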
13902   if (depth == 14) {
13903     helper_double_19_recursive(buf + 0, 11);
13904     helper_double_19_recursive(buf + 2048, 11);
13905     helper_double_19_recursive(buf + 4096, 11);
13906     helper_double_19_recursive(buf + 6144, 11);
13907     helper_double_19_recursive(buf + 8192, 11);
13908     helper_double_19_recursive(buf + 10240, 11);
13909     helper_double_19_recursive(buf + 12288, 11);
13910     helper_double_19_recursive(buf + 14336, 11);
13911     for (int j = 0; j < 16384; j += 16384) {
13912       for (int k = 0; k < 2048; k += 4) {
13913         __asm__ volatile (
13914           "vmovupd (%0), %%ymm0\n"
13915           "vmovupd (%1), %%ymm1\n"
13916           "vmovupd (%2), %%ymm2\n"
13917           "vmovupd (%3), %%ymm3\n"
13918           "vmovupd (%4), %%ymm4\n"
13919           "vmovupd (%5), %%ymm5\n"
13920           "vmovupd (%6), %%ymm6\n"
13921           "vmovupd (%7), %%ymm7\n"
13922           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
13923           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
13924           "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
13925           "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
13926           "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
13927           "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
13928           "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
13929           "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
13930           "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
13931           "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
13932           "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
13933           "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
13934           "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
13935           "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
13936           "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
13937           "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
13938           "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
13939           "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
13940           "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
13941           "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
13942           "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
13943           "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
13944           "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
13945           "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
13946           "vmovupd %%ymm8, (%0)\n"
13947           "vmovupd %%ymm9, (%1)\n"
13948           "vmovupd %%ymm10, (%2)\n"
13949           "vmovupd %%ymm11, (%3)\n"
13950           "vmovupd %%ymm12, (%4)\n"
13951           "vmovupd %%ymm13, (%5)\n"
13952           "vmovupd %%ymm14, (%6)\n"
13953           "vmovupd %%ymm15, (%7)\n"
13954           :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
13955         );
13956       }
13957     }
13958     return;
13959   }
13960   if (depth == 17) {
13961     helper_double_19_recursive(buf + 0, 14);
13962     helper_double_19_recursive(buf + 16384, 14);
13963     helper_double_19_recursive(buf + 32768, 14);
13964     helper_double_19_recursive(buf + 49152, 14);
13965     helper_double_19_recursive(buf + 65536, 14);
13966     helper_double_19_recursive(buf + 81920, 14);
13967     helper_double_19_recursive(buf + 98304, 14);
13968     helper_double_19_recursive(buf + 114688, 14);
13969     for (int j = 0; j < 131072; j += 131072) {
13970       for (int k = 0; k < 16384; k += 4) {
13971         __asm__ volatile (
13972           "vmovupd (%0), %%ymm0\n"
13973           "vmovupd (%1), %%ymm1\n"
13974           "vmovupd (%2), %%ymm2\n"
13975           "vmovupd (%3), %%ymm3\n"
13976           "vmovupd (%4), %%ymm4\n"
13977           "vmovupd (%5), %%ymm5\n"
13978           "vmovupd (%6), %%ymm6\n"
13979           "vmovupd (%7), %%ymm7\n"
13980           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
13981           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
13982           "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
13983           "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
13984           "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
13985           "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
13986           "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
13987           "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
13988           "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
13989           "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
13990           "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
13991           "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
13992           "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
13993           "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
13994           "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
13995           "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
13996           "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
13997           "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
13998           "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
13999           "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
14000           "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
14001           "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
14002           "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
14003           "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
14004           "vmovupd %%ymm8, (%0)\n"
14005           "vmovupd %%ymm9, (%1)\n"
14006           "vmovupd %%ymm10, (%2)\n"
14007           "vmovupd %%ymm11, (%3)\n"
14008           "vmovupd %%ymm12, (%4)\n"
14009           "vmovupd %%ymm13, (%5)\n"
14010           "vmovupd %%ymm14, (%6)\n"
14011           "vmovupd %%ymm15, (%7)\n"
14012           :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152), "r"(buf + j + k + 65536), "r"(buf + j + k + 81920), "r"(buf + j + k + 98304), "r"(buf + j + k + 114688) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
14013         );
14014       }
14015     }
14016     return;
14017   }
14018   if (depth == 19) {
14019     helper_double_19_recursive(buf + 0, 17);
14020     helper_double_19_recursive(buf + 131072, 17);
14021     helper_double_19_recursive(buf + 262144, 17);
14022     helper_double_19_recursive(buf + 393216, 17);
14023     for (int j = 0; j < 524288; j += 524288) {
14024       for (int k = 0; k < 131072; k += 4) {
14025         __asm__ volatile (
14026           "vmovupd (%0), %%ymm0\n"
14027           "vmovupd (%1), %%ymm1\n"
14028           "vmovupd (%2), %%ymm2\n"
14029           "vmovupd (%3), %%ymm3\n"
14030           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
14031           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
14032           "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
14033           "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
14034           "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
14035           "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
14036           "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
14037           "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
14038           "vmovupd %%ymm0, (%0)\n"
14039           "vmovupd %%ymm1, (%1)\n"
14040           "vmovupd %%ymm2, (%2)\n"
14041           "vmovupd %%ymm3, (%3)\n"
14042           :: "r"(buf + j + k + 0), "r"(buf + j + k + 131072), "r"(buf + j + k + 262144), "r"(buf + j + k + 393216) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
14043         );
14044       }
14045     }
14046     return;
14047   }
14048 }
14049 void helper_double_19(double *buf);
14050 void helper_double_19(double *buf) {
14051   helper_double_19_recursive(buf, 19);
14052 }
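/*
 * Editorial note: helper_double_19 runs the full transform on a
 * 2^19-element buffer in place.  A hedged usage sketch (aligned_alloc is
 * C11; the fht.h dispatcher, presumably something like fht_double(buf, 19),
 * is assumed rather than verified against this revision):
 *
 *   double *buf = aligned_alloc(32, sizeof(double) << 19);
 *   // ... fill buf with 2^19 values ...
 *   helper_double_19(buf);  // in-place unnormalized WHT of 2^19 doubles
 */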
14053 void helper_double_20_recursive(double *buf, int depth);
14054 void helper_double_20_recursive(double *buf, int depth) {
14055   if (depth == 9) {
14056     for (int j = 0; j < 512; j += 32) {
14057       for (int k = 0; k < 4; k += 4) {
14058         __asm__ volatile (
14059           "vmovupd (%0), %%ymm0\n"
14060           "vmovupd (%1), %%ymm1\n"
14061           "vmovupd (%2), %%ymm2\n"
14062           "vmovupd (%3), %%ymm3\n"
14063           "vmovupd (%4), %%ymm4\n"
14064           "vmovupd (%5), %%ymm5\n"
14065           "vmovupd (%6), %%ymm6\n"
14066           "vmovupd (%7), %%ymm7\n"
14067           "vpermilpd $0, %%ymm0, %%ymm8\n"
14068           "vpermilpd $15, %%ymm0, %%ymm9\n"
14069           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
14070           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
14071           "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n"
14072           "vpermilpd $0, %%ymm1, %%ymm8\n"
14073           "vpermilpd $15, %%ymm1, %%ymm9\n"
14074           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
14075           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
14076           "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n"
14077           "vpermilpd $0, %%ymm2, %%ymm8\n"
14078           "vpermilpd $15, %%ymm2, %%ymm9\n"
14079           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
14080           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
14081           "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n"
14082           "vpermilpd $0, %%ymm3, %%ymm8\n"
14083           "vpermilpd $15, %%ymm3, %%ymm9\n"
14084           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
14085           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
14086           "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n"
14087           "vpermilpd $0, %%ymm4, %%ymm8\n"
14088           "vpermilpd $15, %%ymm4, %%ymm9\n"
14089           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
14090           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
14091           "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n"
14092           "vpermilpd $0, %%ymm5, %%ymm8\n"
14093           "vpermilpd $15, %%ymm5, %%ymm9\n"
14094           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
14095           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
14096           "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n"
14097           "vpermilpd $0, %%ymm6, %%ymm8\n"
14098           "vpermilpd $15, %%ymm6, %%ymm9\n"
14099           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
14100           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
14101           "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n"
14102           "vpermilpd $0, %%ymm7, %%ymm8\n"
14103           "vpermilpd $15, %%ymm7, %%ymm9\n"
14104           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
14105           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
14106           "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n"
14107           "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n"
14108           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
14109           "vsubpd %%ymm0, %%ymm9, %%ymm10\n"
14110           "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n"
14111           "vaddpd %%ymm11, %%ymm8, %%ymm0\n"
14112           "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n"
14113           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
14114           "vsubpd %%ymm1, %%ymm9, %%ymm10\n"
14115           "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n"
14116           "vaddpd %%ymm11, %%ymm8, %%ymm1\n"
14117           "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n"
14118           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
14119           "vsubpd %%ymm2, %%ymm9, %%ymm10\n"
14120           "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n"
14121           "vaddpd %%ymm11, %%ymm8, %%ymm2\n"
14122           "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n"
14123           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
14124           "vsubpd %%ymm3, %%ymm9, %%ymm10\n"
14125           "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n"
14126           "vaddpd %%ymm11, %%ymm8, %%ymm3\n"
14127           "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n"
14128           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
14129           "vsubpd %%ymm4, %%ymm9, %%ymm10\n"
14130           "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n"
14131           "vaddpd %%ymm11, %%ymm8, %%ymm4\n"
14132           "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n"
14133           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
14134           "vsubpd %%ymm5, %%ymm9, %%ymm10\n"
14135           "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n"
14136           "vaddpd %%ymm11, %%ymm8, %%ymm5\n"
14137           "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n"
14138           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
14139           "vsubpd %%ymm6, %%ymm9, %%ymm10\n"
14140           "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n"
14141           "vaddpd %%ymm11, %%ymm8, %%ymm6\n"
14142           "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n"
14143           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
14144           "vsubpd %%ymm7, %%ymm9, %%ymm10\n"
14145           "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n"
14146           "vaddpd %%ymm11, %%ymm8, %%ymm7\n"
14147           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
14148           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
14149           "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
14150           "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
14151           "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
14152           "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
14153           "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
14154           "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
14155           "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
14156           "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
14157           "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
14158           "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
14159           "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
14160           "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
14161           "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
14162           "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
14163           "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
14164           "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
14165           "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
14166           "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
14167           "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
14168           "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
14169           "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
14170           "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
14171           "vmovupd %%ymm8, (%0)\n"
14172           "vmovupd %%ymm9, (%1)\n"
14173           "vmovupd %%ymm10, (%2)\n"
14174           "vmovupd %%ymm11, (%3)\n"
14175           "vmovupd %%ymm12, (%4)\n"
14176           "vmovupd %%ymm13, (%5)\n"
14177           "vmovupd %%ymm14, (%6)\n"
14178           "vmovupd %%ymm15, (%7)\n"
14179           :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
14180         );
14181       }
14182     }
14183     for (int j = 0; j < 512; j += 256) {
14184       for (int k = 0; k < 32; k += 4) {
14185         __asm__ volatile (
14186           "vmovupd (%0), %%ymm0\n"
14187           "vmovupd (%1), %%ymm1\n"
14188           "vmovupd (%2), %%ymm2\n"
14189           "vmovupd (%3), %%ymm3\n"
14190           "vmovupd (%4), %%ymm4\n"
14191           "vmovupd (%5), %%ymm5\n"
14192           "vmovupd (%6), %%ymm6\n"
14193           "vmovupd (%7), %%ymm7\n"
14194           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
14195           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
14196           "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
14197           "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
14198           "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
14199           "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
14200           "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
14201           "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
14202           "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
14203           "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
14204           "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
14205           "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
14206           "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
14207           "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
14208           "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
14209           "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
14210           "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
14211           "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
14212           "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
14213           "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
14214           "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
14215           "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
14216           "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
14217           "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
14218           "vmovupd %%ymm8, (%0)\n"
14219           "vmovupd %%ymm9, (%1)\n"
14220           "vmovupd %%ymm10, (%2)\n"
14221           "vmovupd %%ymm11, (%3)\n"
14222           "vmovupd %%ymm12, (%4)\n"
14223           "vmovupd %%ymm13, (%5)\n"
14224           "vmovupd %%ymm14, (%6)\n"
14225           "vmovupd %%ymm15, (%7)\n"
14226           :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
14227         );
14228       }
14229     }
14230     for (int j = 0; j < 512; j += 512) {
14231       for (int k = 0; k < 256; k += 4) {
14232         __asm__ volatile (
14233           "vmovupd (%0), %%ymm0\n"
14234           "vmovupd (%1), %%ymm1\n"
14235           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
14236           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
14237           "vmovupd %%ymm8, (%0)\n"
14238           "vmovupd %%ymm9, (%1)\n"
14239           :: "r"(buf + j + k + 0), "r"(buf + j + k + 256) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
14240         );
14241       }
14242     }
14243     return;
14244   }
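  /*
   * Editorial note: the depth == 9 base case uses the same fused
   * 32-element kernel, then a radix-8 pass for distances 32..128 and a
   * final radix-2 pass at distance 256 (9 = 5 + 3 + 1).
   */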
14245   if (depth == 12) {
14246     helper_double_20_recursive(buf + 0, 9);
14247     helper_double_20_recursive(buf + 512, 9);
14248     helper_double_20_recursive(buf + 1024, 9);
14249     helper_double_20_recursive(buf + 1536, 9);
14250     helper_double_20_recursive(buf + 2048, 9);
14251     helper_double_20_recursive(buf + 2560, 9);
14252     helper_double_20_recursive(buf + 3072, 9);
14253     helper_double_20_recursive(buf + 3584, 9);
14254     for (int j = 0; j < 4096; j += 4096) {
14255       for (int k = 0; k < 512; k += 4) {
14256         __asm__ volatile (
14257           "vmovupd (%0), %%ymm0\n"
14258           "vmovupd (%1), %%ymm1\n"
14259           "vmovupd (%2), %%ymm2\n"
14260           "vmovupd (%3), %%ymm3\n"
14261           "vmovupd (%4), %%ymm4\n"
14262           "vmovupd (%5), %%ymm5\n"
14263           "vmovupd (%6), %%ymm6\n"
14264           "vmovupd (%7), %%ymm7\n"
14265           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
14266           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
14267           "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
14268           "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
14269           "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
14270           "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
14271           "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
14272           "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
14273           "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
14274           "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
14275           "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
14276           "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
14277           "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
14278           "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
14279           "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
14280           "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
14281           "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
14282           "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
14283           "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
14284           "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
14285           "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
14286           "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
14287           "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
14288           "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
14289           "vmovupd %%ymm8, (%0)\n"
14290           "vmovupd %%ymm9, (%1)\n"
14291           "vmovupd %%ymm10, (%2)\n"
14292           "vmovupd %%ymm11, (%3)\n"
14293           "vmovupd %%ymm12, (%4)\n"
14294           "vmovupd %%ymm13, (%5)\n"
14295           "vmovupd %%ymm14, (%6)\n"
14296           "vmovupd %%ymm15, (%7)\n"
14297           :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
14298         );
14299       }
14300     }
14301     return;
14302   }
14303   if (depth == 15) {
14304     helper_double_20_recursive(buf + 0, 12);
14305     helper_double_20_recursive(buf + 4096, 12);
14306     helper_double_20_recursive(buf + 8192, 12);
14307     helper_double_20_recursive(buf + 12288, 12);
14308     helper_double_20_recursive(buf + 16384, 12);
14309     helper_double_20_recursive(buf + 20480, 12);
14310     helper_double_20_recursive(buf + 24576, 12);
14311     helper_double_20_recursive(buf + 28672, 12);
14312     for (int j = 0; j < 32768; j += 32768) {
14313       for (int k = 0; k < 4096; k += 4) {
14314         __asm__ volatile (
14315           "vmovupd (%0), %%ymm0\n"
14316           "vmovupd (%1), %%ymm1\n"
14317           "vmovupd (%2), %%ymm2\n"
14318           "vmovupd (%3), %%ymm3\n"
14319           "vmovupd (%4), %%ymm4\n"
14320           "vmovupd (%5), %%ymm5\n"
14321           "vmovupd (%6), %%ymm6\n"
14322           "vmovupd (%7), %%ymm7\n"
14323           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
14324           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
14325           "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
14326           "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
14327           "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
14328           "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
14329           "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
14330           "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
14331           "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
14332           "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
14333           "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
14334           "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
14335           "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
14336           "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
14337           "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
14338           "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
14339           "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
14340           "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
14341           "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
14342           "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
14343           "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
14344           "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
14345           "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
14346           "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
14347           "vmovupd %%ymm8, (%0)\n"
14348           "vmovupd %%ymm9, (%1)\n"
14349           "vmovupd %%ymm10, (%2)\n"
14350           "vmovupd %%ymm11, (%3)\n"
14351           "vmovupd %%ymm12, (%4)\n"
14352           "vmovupd %%ymm13, (%5)\n"
14353           "vmovupd %%ymm14, (%6)\n"
14354           "vmovupd %%ymm15, (%7)\n"
14355           :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
14356         );
14357       }
14358     }
14359     return;
14360   }
14361   if (depth == 18) {
14362     helper_double_20_recursive(buf + 0, 15);
14363     helper_double_20_recursive(buf + 32768, 15);
14364     helper_double_20_recursive(buf + 65536, 15);
14365     helper_double_20_recursive(buf + 98304, 15);
14366     helper_double_20_recursive(buf + 131072, 15);
14367     helper_double_20_recursive(buf + 163840, 15);
14368     helper_double_20_recursive(buf + 196608, 15);
14369     helper_double_20_recursive(buf + 229376, 15);
14370     for (int j = 0; j < 262144; j += 262144) {
14371       for (int k = 0; k < 32768; k += 4) {
14372         __asm__ volatile (
14373           "vmovupd (%0), %%ymm0\n"
14374           "vmovupd (%1), %%ymm1\n"
14375           "vmovupd (%2), %%ymm2\n"
14376           "vmovupd (%3), %%ymm3\n"
14377           "vmovupd (%4), %%ymm4\n"
14378           "vmovupd (%5), %%ymm5\n"
14379           "vmovupd (%6), %%ymm6\n"
14380           "vmovupd (%7), %%ymm7\n"
14381           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
14382           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
14383           "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
14384           "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
14385           "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
14386           "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
14387           "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
14388           "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
14389           "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
14390           "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
14391           "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
14392           "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
14393           "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
14394           "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
14395           "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
14396           "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
14397           "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
14398           "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
14399           "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
14400           "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
14401           "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
14402           "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
14403           "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
14404           "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
14405           "vmovupd %%ymm8, (%0)\n"
14406           "vmovupd %%ymm9, (%1)\n"
14407           "vmovupd %%ymm10, (%2)\n"
14408           "vmovupd %%ymm11, (%3)\n"
14409           "vmovupd %%ymm12, (%4)\n"
14410           "vmovupd %%ymm13, (%5)\n"
14411           "vmovupd %%ymm14, (%6)\n"
14412           "vmovupd %%ymm15, (%7)\n"
14413           :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
14414         );
14415       }
14416     }
14417     return;
14418   }
14419   if (depth == 20) {
14420     helper_double_20_recursive(buf + 0, 18);
14421     helper_double_20_recursive(buf + 262144, 18);
14422     helper_double_20_recursive(buf + 524288, 18);
14423     helper_double_20_recursive(buf + 786432, 18);
14424     for (int j = 0; j < 1048576; j += 1048576) {
14425       for (int k = 0; k < 262144; k += 4) {
14426         __asm__ volatile (
14427           "vmovupd (%0), %%ymm0\n"
14428           "vmovupd (%1), %%ymm1\n"
14429           "vmovupd (%2), %%ymm2\n"
14430           "vmovupd (%3), %%ymm3\n"
14431           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
14432           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
14433           "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
14434           "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
14435           "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
14436           "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
14437           "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
14438           "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
14439           "vmovupd %%ymm0, (%0)\n"
14440           "vmovupd %%ymm1, (%1)\n"
14441           "vmovupd %%ymm2, (%2)\n"
14442           "vmovupd %%ymm3, (%3)\n"
14443           :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
14444         );
14445       }
14446     }
14447     return;
14448   }
14449 }
14450 void helper_double_20(double *buf);
14451 void helper_double_20(double *buf) {
14452   helper_double_20_recursive(buf, 20);
14453 }
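/*
 * Editorial note: helper_double_21_recursive decomposes as
 * 21 = 7 + 3 + 3 + 3 + 3 + 2 -- a fused 2^7-element base case, four
 * radix-8 merge levels (depths 10, 13, 16 and 19) and a radix-4 merge at
 * the top (depth 21).
 */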
14454 void helper_double_21_recursive(double *buf, int depth);
14455 void helper_double_21_recursive(double *buf, int depth) {
14456   if (depth == 7) {
14457     for (int j = 0; j < 128; j += 32) {
14458       for (int k = 0; k < 4; k += 4) {
14459         __asm__ volatile (
14460           "vmovupd (%0), %%ymm0\n"
14461           "vmovupd (%1), %%ymm1\n"
14462           "vmovupd (%2), %%ymm2\n"
14463           "vmovupd (%3), %%ymm3\n"
14464           "vmovupd (%4), %%ymm4\n"
14465           "vmovupd (%5), %%ymm5\n"
14466           "vmovupd (%6), %%ymm6\n"
14467           "vmovupd (%7), %%ymm7\n"
14468           "vpermilpd $0, %%ymm0, %%ymm8\n"
14469           "vpermilpd $15, %%ymm0, %%ymm9\n"
14470           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
14471           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
14472           "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n"
14473           "vpermilpd $0, %%ymm1, %%ymm8\n"
14474           "vpermilpd $15, %%ymm1, %%ymm9\n"
14475           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
14476           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
14477           "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n"
14478           "vpermilpd $0, %%ymm2, %%ymm8\n"
14479           "vpermilpd $15, %%ymm2, %%ymm9\n"
14480           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
14481           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
14482           "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n"
14483           "vpermilpd $0, %%ymm3, %%ymm8\n"
14484           "vpermilpd $15, %%ymm3, %%ymm9\n"
14485           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
14486           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
14487           "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n"
14488           "vpermilpd $0, %%ymm4, %%ymm8\n"
14489           "vpermilpd $15, %%ymm4, %%ymm9\n"
14490           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
14491           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
14492           "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n"
14493           "vpermilpd $0, %%ymm5, %%ymm8\n"
14494           "vpermilpd $15, %%ymm5, %%ymm9\n"
14495           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
14496           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
14497           "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n"
14498           "vpermilpd $0, %%ymm6, %%ymm8\n"
14499           "vpermilpd $15, %%ymm6, %%ymm9\n"
14500           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
14501           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
14502           "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n"
14503           "vpermilpd $0, %%ymm7, %%ymm8\n"
14504           "vpermilpd $15, %%ymm7, %%ymm9\n"
14505           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
14506           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
14507           "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n"
14508           "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n"
14509           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
14510           "vsubpd %%ymm0, %%ymm9, %%ymm10\n"
14511           "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n"
14512           "vaddpd %%ymm11, %%ymm8, %%ymm0\n"
14513           "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n"
14514           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
14515           "vsubpd %%ymm1, %%ymm9, %%ymm10\n"
14516           "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n"
14517           "vaddpd %%ymm11, %%ymm8, %%ymm1\n"
14518           "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n"
14519           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
14520           "vsubpd %%ymm2, %%ymm9, %%ymm10\n"
14521           "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n"
14522           "vaddpd %%ymm11, %%ymm8, %%ymm2\n"
14523           "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n"
14524           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
14525           "vsubpd %%ymm3, %%ymm9, %%ymm10\n"
14526           "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n"
14527           "vaddpd %%ymm11, %%ymm8, %%ymm3\n"
14528           "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n"
14529           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
14530           "vsubpd %%ymm4, %%ymm9, %%ymm10\n"
14531           "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n"
14532           "vaddpd %%ymm11, %%ymm8, %%ymm4\n"
14533           "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n"
14534           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
14535           "vsubpd %%ymm5, %%ymm9, %%ymm10\n"
14536           "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n"
14537           "vaddpd %%ymm11, %%ymm8, %%ymm5\n"
14538           "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n"
14539           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
14540           "vsubpd %%ymm6, %%ymm9, %%ymm10\n"
14541           "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n"
14542           "vaddpd %%ymm11, %%ymm8, %%ymm6\n"
14543           "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n"
14544           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
14545           "vsubpd %%ymm7, %%ymm9, %%ymm10\n"
14546           "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n"
14547           "vaddpd %%ymm11, %%ymm8, %%ymm7\n"
14548           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
14549           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
14550           "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
14551           "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
14552           "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
14553           "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
14554           "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
14555           "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
14556           "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
14557           "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
14558           "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
14559           "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
14560           "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
14561           "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
14562           "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
14563           "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
14564           "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
14565           "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
14566           "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
14567           "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
14568           "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
14569           "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
14570           "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
14571           "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
14572           "vmovupd %%ymm8, (%0)\n"
14573           "vmovupd %%ymm9, (%1)\n"
14574           "vmovupd %%ymm10, (%2)\n"
14575           "vmovupd %%ymm11, (%3)\n"
14576           "vmovupd %%ymm12, (%4)\n"
14577           "vmovupd %%ymm13, (%5)\n"
14578           "vmovupd %%ymm14, (%6)\n"
14579           "vmovupd %%ymm15, (%7)\n"
14580           :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
14581         );
14582       }
14583     }
14584     for (int j = 0; j < 128; j += 128) {
14585       for (int k = 0; k < 32; k += 4) {
14586         __asm__ volatile (
14587           "vmovupd (%0), %%ymm0\n"
14588           "vmovupd (%1), %%ymm1\n"
14589           "vmovupd (%2), %%ymm2\n"
14590           "vmovupd (%3), %%ymm3\n"
14591           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
14592           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
14593           "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
14594           "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
14595           "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
14596           "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
14597           "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
14598           "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
14599           "vmovupd %%ymm0, (%0)\n"
14600           "vmovupd %%ymm1, (%1)\n"
14601           "vmovupd %%ymm2, (%2)\n"
14602           "vmovupd %%ymm3, (%3)\n"
14603           :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
14604         );
14605       }
14606     }
14607     return;
14608   }
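  /*
   * Editorial note: in the depth == 7 base case the fused kernel covers
   * distances 1..16 inside each 32-element tile, and the radix-4 pass that
   * follows it covers distances 32 and 64 (7 = 5 + 2).
   */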
14609   if (depth == 10) {
14610     helper_double_21_recursive(buf + 0, 7);
14611     helper_double_21_recursive(buf + 128, 7);
14612     helper_double_21_recursive(buf + 256, 7);
14613     helper_double_21_recursive(buf + 384, 7);
14614     helper_double_21_recursive(buf + 512, 7);
14615     helper_double_21_recursive(buf + 640, 7);
14616     helper_double_21_recursive(buf + 768, 7);
14617     helper_double_21_recursive(buf + 896, 7);
14618     for (int j = 0; j < 1024; j += 1024) {
14619       for (int k = 0; k < 128; k += 4) {
14620         __asm__ volatile (
14621           "vmovupd (%0), %%ymm0\n"
14622           "vmovupd (%1), %%ymm1\n"
14623           "vmovupd (%2), %%ymm2\n"
14624           "vmovupd (%3), %%ymm3\n"
14625           "vmovupd (%4), %%ymm4\n"
14626           "vmovupd (%5), %%ymm5\n"
14627           "vmovupd (%6), %%ymm6\n"
14628           "vmovupd (%7), %%ymm7\n"
14629           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
14630           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
14631           "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
14632           "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
14633           "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
14634           "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
14635           "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
14636           "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
14637           "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
14638           "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
14639           "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
14640           "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
14641           "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
14642           "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
14643           "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
14644           "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
14645           "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
14646           "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
14647           "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
14648           "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
14649           "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
14650           "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
14651           "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
14652           "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
14653           "vmovupd %%ymm8, (%0)\n"
14654           "vmovupd %%ymm9, (%1)\n"
14655           "vmovupd %%ymm10, (%2)\n"
14656           "vmovupd %%ymm11, (%3)\n"
14657           "vmovupd %%ymm12, (%4)\n"
14658           "vmovupd %%ymm13, (%5)\n"
14659           "vmovupd %%ymm14, (%6)\n"
14660           "vmovupd %%ymm15, (%7)\n"
14661           :: "r"(buf + j + k + 0), "r"(buf + j + k + 128), "r"(buf + j + k + 256), "r"(buf + j + k + 384), "r"(buf + j + k + 512), "r"(buf + j + k + 640), "r"(buf + j + k + 768), "r"(buf + j + k + 896) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
14662         );
14663       }
14664     }
14665     return;
14666   }
14667   if (depth == 13) {
14668     helper_double_21_recursive(buf + 0, 10);
14669     helper_double_21_recursive(buf + 1024, 10);
14670     helper_double_21_recursive(buf + 2048, 10);
14671     helper_double_21_recursive(buf + 3072, 10);
14672     helper_double_21_recursive(buf + 4096, 10);
14673     helper_double_21_recursive(buf + 5120, 10);
14674     helper_double_21_recursive(buf + 6144, 10);
14675     helper_double_21_recursive(buf + 7168, 10);
14676     for (int j = 0; j < 8192; j += 8192) {
14677       for (int k = 0; k < 1024; k += 4) {
14678         __asm__ volatile (
14679           "vmovupd (%0), %%ymm0\n"
14680           "vmovupd (%1), %%ymm1\n"
14681           "vmovupd (%2), %%ymm2\n"
14682           "vmovupd (%3), %%ymm3\n"
14683           "vmovupd (%4), %%ymm4\n"
14684           "vmovupd (%5), %%ymm5\n"
14685           "vmovupd (%6), %%ymm6\n"
14686           "vmovupd (%7), %%ymm7\n"
14687           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
14688           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
14689           "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
14690           "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
14691           "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
14692           "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
14693           "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
14694           "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
14695           "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
14696           "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
14697           "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
14698           "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
14699           "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
14700           "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
14701           "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
14702           "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
14703           "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
14704           "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
14705           "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
14706           "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
14707           "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
14708           "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
14709           "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
14710           "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
14711           "vmovupd %%ymm8, (%0)\n"
14712           "vmovupd %%ymm9, (%1)\n"
14713           "vmovupd %%ymm10, (%2)\n"
14714           "vmovupd %%ymm11, (%3)\n"
14715           "vmovupd %%ymm12, (%4)\n"
14716           "vmovupd %%ymm13, (%5)\n"
14717           "vmovupd %%ymm14, (%6)\n"
14718           "vmovupd %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 1024), "r"(buf + j + k + 2048), "r"(buf + j + k + 3072), "r"(buf + j + k + 4096), "r"(buf + j + k + 5120), "r"(buf + j + k + 6144), "r"(buf + j + k + 7168) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 16) {
    helper_double_21_recursive(buf + 0, 13);
    helper_double_21_recursive(buf + 8192, 13);
    helper_double_21_recursive(buf + 16384, 13);
    helper_double_21_recursive(buf + 24576, 13);
    helper_double_21_recursive(buf + 32768, 13);
    helper_double_21_recursive(buf + 40960, 13);
    helper_double_21_recursive(buf + 49152, 13);
    helper_double_21_recursive(buf + 57344, 13);
    for (int j = 0; j < 65536; j += 65536) {
      for (int k = 0; k < 8192; k += 4) {
        __asm__ volatile (
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vmovupd (%2), %%ymm2\n"
          "vmovupd (%3), %%ymm3\n"
          "vmovupd (%4), %%ymm4\n"
          "vmovupd (%5), %%ymm5\n"
          "vmovupd (%6), %%ymm6\n"
          "vmovupd (%7), %%ymm7\n"
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
          "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
          "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
          "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
          "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
          "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
          "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
          "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
          "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
          "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
          "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
          "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
          "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
          "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
          "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
          "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
          "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
          "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
          "vmovupd %%ymm8, (%0)\n"
          "vmovupd %%ymm9, (%1)\n"
          "vmovupd %%ymm10, (%2)\n"
          "vmovupd %%ymm11, (%3)\n"
          "vmovupd %%ymm12, (%4)\n"
          "vmovupd %%ymm13, (%5)\n"
          "vmovupd %%ymm14, (%6)\n"
          "vmovupd %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 8192), "r"(buf + j + k + 16384), "r"(buf + j + k + 24576), "r"(buf + j + k + 32768), "r"(buf + j + k + 40960), "r"(buf + j + k + 49152), "r"(buf + j + k + 57344) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 19) {
    helper_double_21_recursive(buf + 0, 16);
    helper_double_21_recursive(buf + 65536, 16);
    helper_double_21_recursive(buf + 131072, 16);
    helper_double_21_recursive(buf + 196608, 16);
    helper_double_21_recursive(buf + 262144, 16);
    helper_double_21_recursive(buf + 327680, 16);
    helper_double_21_recursive(buf + 393216, 16);
    helper_double_21_recursive(buf + 458752, 16);
    for (int j = 0; j < 524288; j += 524288) {
      for (int k = 0; k < 65536; k += 4) {
        __asm__ volatile (
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vmovupd (%2), %%ymm2\n"
          "vmovupd (%3), %%ymm3\n"
          "vmovupd (%4), %%ymm4\n"
          "vmovupd (%5), %%ymm5\n"
          "vmovupd (%6), %%ymm6\n"
          "vmovupd (%7), %%ymm7\n"
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
          "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
          "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
          "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
          "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
          "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
          "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
          "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
          "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
          "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
          "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
          "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
          "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
          "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
          "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
          "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
          "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
          "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
          "vmovupd %%ymm8, (%0)\n"
          "vmovupd %%ymm9, (%1)\n"
          "vmovupd %%ymm10, (%2)\n"
          "vmovupd %%ymm11, (%3)\n"
          "vmovupd %%ymm12, (%4)\n"
          "vmovupd %%ymm13, (%5)\n"
          "vmovupd %%ymm14, (%6)\n"
          "vmovupd %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 65536), "r"(buf + j + k + 131072), "r"(buf + j + k + 196608), "r"(buf + j + k + 262144), "r"(buf + j + k + 327680), "r"(buf + j + k + 393216), "r"(buf + j + k + 458752) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 21) {
    helper_double_21_recursive(buf + 0, 19);
    helper_double_21_recursive(buf + 524288, 19);
    helper_double_21_recursive(buf + 1048576, 19);
    helper_double_21_recursive(buf + 1572864, 19);
    for (int j = 0; j < 2097152; j += 2097152) {
      for (int k = 0; k < 524288; k += 4) {
        __asm__ volatile (
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vmovupd (%2), %%ymm2\n"
          "vmovupd (%3), %%ymm3\n"
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
          "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
          "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
          "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
          "vmovupd %%ymm0, (%0)\n"
          "vmovupd %%ymm1, (%1)\n"
          "vmovupd %%ymm2, (%2)\n"
          "vmovupd %%ymm3, (%3)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 524288), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1572864) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
}
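/* Entry point for n = 2^21 doubles: run the in-place Hadamard recursion at full depth. */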
void helper_double_21(double *buf);
void helper_double_21(double *buf) {
  helper_double_21_recursive(buf, 21);
}
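/*
 * helper_double_22_recursive and the recursive helpers that follow all use
 * the same scheme: the base case transforms one small block in place with
 * three sweeps that together cover every stride inside the block, and each
 * larger depth first recurses into equal sub-blocks and then runs a single
 * combining sweep across them.  Within the base case, the vpermilpd /
 * vaddsubpd sequence is the stride-1 butterfly inside each 4-double vector
 * and the vperm2f128 sequence is the stride-2 butterfly across its two
 * 128-bit lanes; every eight-pointer asm block is three rounds of paired
 * vaddpd/vsubpd, i.e. a size-8 Hadamard butterfly applied elementwise to
 * eight stride-separated vectors.  A scalar sketch of that eight-pointer
 * update (illustrative only; butterfly8 is not part of this file):
 *
 *   static void butterfly8(double *p[8]) {
 *     for (int s = 1; s < 8; s <<= 1)   // three rounds: strides 1, 2, 4
 *       for (int i = 0; i < 8; ++i)
 *         if (!(i & s)) {               // visit each pair (i, i + s) once
 *           double u = *p[i], v = *p[i + s];
 *           *p[i] = u + v;              // sum into the lower pointer
 *           *p[i + s] = u - v;          // difference into the higher one
 *         }
 *   }
 *
 * The depth-22 transform is reached as 11 + 3 + 3 + 3 + 2, so its final
 * combining stage merges only four sub-blocks (a radix-4 butterfly).
 */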
void helper_double_22_recursive(double *buf, int depth);
void helper_double_22_recursive(double *buf, int depth) {
  if (depth == 11) {
    for (int j = 0; j < 2048; j += 32) {
      for (int k = 0; k < 4; k += 4) {
        __asm__ volatile (
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vmovupd (%2), %%ymm2\n"
          "vmovupd (%3), %%ymm3\n"
          "vmovupd (%4), %%ymm4\n"
          "vmovupd (%5), %%ymm5\n"
          "vmovupd (%6), %%ymm6\n"
          "vmovupd (%7), %%ymm7\n"
          "vpermilpd $0, %%ymm0, %%ymm8\n"
          "vpermilpd $15, %%ymm0, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n"
          "vpermilpd $0, %%ymm1, %%ymm8\n"
          "vpermilpd $15, %%ymm1, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n"
          "vpermilpd $0, %%ymm2, %%ymm8\n"
          "vpermilpd $15, %%ymm2, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n"
          "vpermilpd $0, %%ymm3, %%ymm8\n"
          "vpermilpd $15, %%ymm3, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n"
          "vpermilpd $0, %%ymm4, %%ymm8\n"
          "vpermilpd $15, %%ymm4, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n"
          "vpermilpd $0, %%ymm5, %%ymm8\n"
          "vpermilpd $15, %%ymm5, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n"
          "vpermilpd $0, %%ymm6, %%ymm8\n"
          "vpermilpd $15, %%ymm6, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n"
          "vpermilpd $0, %%ymm7, %%ymm8\n"
          "vpermilpd $15, %%ymm7, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n"
          "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm0, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm0\n"
          "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm1, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm1\n"
          "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm2, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm2\n"
          "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm3, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm3\n"
          "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm4, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm4\n"
          "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm5, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm5\n"
          "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm6, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm6\n"
          "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm7, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm7\n"
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
          "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
          "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
          "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
          "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
          "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
          "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
          "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
          "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
          "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
          "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
          "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
          "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
          "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
          "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
          "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
          "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
          "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
          "vmovupd %%ymm8, (%0)\n"
          "vmovupd %%ymm9, (%1)\n"
          "vmovupd %%ymm10, (%2)\n"
          "vmovupd %%ymm11, (%3)\n"
          "vmovupd %%ymm12, (%4)\n"
          "vmovupd %%ymm13, (%5)\n"
          "vmovupd %%ymm14, (%6)\n"
          "vmovupd %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    for (int j = 0; j < 2048; j += 256) {
      for (int k = 0; k < 32; k += 4) {
        __asm__ volatile (
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vmovupd (%2), %%ymm2\n"
          "vmovupd (%3), %%ymm3\n"
          "vmovupd (%4), %%ymm4\n"
          "vmovupd (%5), %%ymm5\n"
          "vmovupd (%6), %%ymm6\n"
          "vmovupd (%7), %%ymm7\n"
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
          "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
          "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
          "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
          "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
          "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
          "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
          "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
          "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
          "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
          "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
          "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
          "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
          "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
          "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
          "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
          "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
          "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
          "vmovupd %%ymm8, (%0)\n"
          "vmovupd %%ymm9, (%1)\n"
          "vmovupd %%ymm10, (%2)\n"
          "vmovupd %%ymm11, (%3)\n"
          "vmovupd %%ymm12, (%4)\n"
          "vmovupd %%ymm13, (%5)\n"
          "vmovupd %%ymm14, (%6)\n"
          "vmovupd %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    for (int j = 0; j < 2048; j += 2048) {
      for (int k = 0; k < 256; k += 4) {
        __asm__ volatile (
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vmovupd (%2), %%ymm2\n"
          "vmovupd (%3), %%ymm3\n"
          "vmovupd (%4), %%ymm4\n"
          "vmovupd (%5), %%ymm5\n"
          "vmovupd (%6), %%ymm6\n"
          "vmovupd (%7), %%ymm7\n"
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
          "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
          "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
          "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
          "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
          "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
          "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
          "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
          "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
          "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
          "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
          "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
          "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
          "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
          "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
          "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
          "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
          "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
          "vmovupd %%ymm8, (%0)\n"
          "vmovupd %%ymm9, (%1)\n"
          "vmovupd %%ymm10, (%2)\n"
          "vmovupd %%ymm11, (%3)\n"
          "vmovupd %%ymm12, (%4)\n"
          "vmovupd %%ymm13, (%5)\n"
          "vmovupd %%ymm14, (%6)\n"
          "vmovupd %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
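  /* Combining stages: transform eight sub-blocks recursively, then one
     radix-8 butterfly sweep across them finishes the next three levels. */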
  if (depth == 14) {
    helper_double_22_recursive(buf + 0, 11);
    helper_double_22_recursive(buf + 2048, 11);
    helper_double_22_recursive(buf + 4096, 11);
    helper_double_22_recursive(buf + 6144, 11);
    helper_double_22_recursive(buf + 8192, 11);
    helper_double_22_recursive(buf + 10240, 11);
    helper_double_22_recursive(buf + 12288, 11);
    helper_double_22_recursive(buf + 14336, 11);
    for (int j = 0; j < 16384; j += 16384) {
      for (int k = 0; k < 2048; k += 4) {
        __asm__ volatile (
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vmovupd (%2), %%ymm2\n"
          "vmovupd (%3), %%ymm3\n"
          "vmovupd (%4), %%ymm4\n"
          "vmovupd (%5), %%ymm5\n"
          "vmovupd (%6), %%ymm6\n"
          "vmovupd (%7), %%ymm7\n"
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
          "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
          "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
          "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
          "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
          "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
          "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
          "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
          "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
          "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
          "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
          "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
          "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
          "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
          "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
          "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
          "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
          "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
          "vmovupd %%ymm8, (%0)\n"
          "vmovupd %%ymm9, (%1)\n"
          "vmovupd %%ymm10, (%2)\n"
          "vmovupd %%ymm11, (%3)\n"
          "vmovupd %%ymm12, (%4)\n"
          "vmovupd %%ymm13, (%5)\n"
          "vmovupd %%ymm14, (%6)\n"
          "vmovupd %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 17) {
    helper_double_22_recursive(buf + 0, 14);
    helper_double_22_recursive(buf + 16384, 14);
    helper_double_22_recursive(buf + 32768, 14);
    helper_double_22_recursive(buf + 49152, 14);
    helper_double_22_recursive(buf + 65536, 14);
    helper_double_22_recursive(buf + 81920, 14);
    helper_double_22_recursive(buf + 98304, 14);
    helper_double_22_recursive(buf + 114688, 14);
    for (int j = 0; j < 131072; j += 131072) {
      for (int k = 0; k < 16384; k += 4) {
        __asm__ volatile (
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vmovupd (%2), %%ymm2\n"
          "vmovupd (%3), %%ymm3\n"
          "vmovupd (%4), %%ymm4\n"
          "vmovupd (%5), %%ymm5\n"
          "vmovupd (%6), %%ymm6\n"
          "vmovupd (%7), %%ymm7\n"
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
          "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
          "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
          "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
          "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
          "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
          "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
          "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
          "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
          "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
          "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
          "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
          "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
          "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
          "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
          "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
          "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
          "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
          "vmovupd %%ymm8, (%0)\n"
          "vmovupd %%ymm9, (%1)\n"
          "vmovupd %%ymm10, (%2)\n"
          "vmovupd %%ymm11, (%3)\n"
          "vmovupd %%ymm12, (%4)\n"
          "vmovupd %%ymm13, (%5)\n"
          "vmovupd %%ymm14, (%6)\n"
          "vmovupd %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152), "r"(buf + j + k + 65536), "r"(buf + j + k + 81920), "r"(buf + j + k + 98304), "r"(buf + j + k + 114688) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 20) {
    helper_double_22_recursive(buf + 0, 17);
    helper_double_22_recursive(buf + 131072, 17);
    helper_double_22_recursive(buf + 262144, 17);
    helper_double_22_recursive(buf + 393216, 17);
    helper_double_22_recursive(buf + 524288, 17);
    helper_double_22_recursive(buf + 655360, 17);
    helper_double_22_recursive(buf + 786432, 17);
    helper_double_22_recursive(buf + 917504, 17);
    for (int j = 0; j < 1048576; j += 1048576) {
      for (int k = 0; k < 131072; k += 4) {
        __asm__ volatile (
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vmovupd (%2), %%ymm2\n"
          "vmovupd (%3), %%ymm3\n"
          "vmovupd (%4), %%ymm4\n"
          "vmovupd (%5), %%ymm5\n"
          "vmovupd (%6), %%ymm6\n"
          "vmovupd (%7), %%ymm7\n"
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
          "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
          "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
          "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
          "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
          "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
          "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
          "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
          "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
          "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
          "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
          "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
          "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
          "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
          "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
          "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
          "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
          "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
          "vmovupd %%ymm8, (%0)\n"
          "vmovupd %%ymm9, (%1)\n"
          "vmovupd %%ymm10, (%2)\n"
          "vmovupd %%ymm11, (%3)\n"
          "vmovupd %%ymm12, (%4)\n"
          "vmovupd %%ymm13, (%5)\n"
          "vmovupd %%ymm14, (%6)\n"
          "vmovupd %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 131072), "r"(buf + j + k + 262144), "r"(buf + j + k + 393216), "r"(buf + j + k + 524288), "r"(buf + j + k + 655360), "r"(buf + j + k + 786432), "r"(buf + j + k + 917504) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 22) {
    helper_double_22_recursive(buf + 0, 20);
    helper_double_22_recursive(buf + 1048576, 20);
    helper_double_22_recursive(buf + 2097152, 20);
    helper_double_22_recursive(buf + 3145728, 20);
    for (int j = 0; j < 4194304; j += 4194304) {
      for (int k = 0; k < 1048576; k += 4) {
        __asm__ volatile (
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vmovupd (%2), %%ymm2\n"
          "vmovupd (%3), %%ymm3\n"
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
          "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
          "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
          "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
          "vmovupd %%ymm0, (%0)\n"
          "vmovupd %%ymm1, (%1)\n"
          "vmovupd %%ymm2, (%2)\n"
          "vmovupd %%ymm3, (%3)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 1048576), "r"(buf + j + k + 2097152), "r"(buf + j + k + 3145728) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
}
void helper_double_22(double *buf);
void helper_double_22(double *buf) {
  helper_double_22_recursive(buf, 22);
}
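/* Same scheme for 2^23 doubles: 23 = 11 + 3 * 4, so the base case is
   followed by four radix-8 combining stages. */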
void helper_double_23_recursive(double *buf, int depth);
void helper_double_23_recursive(double *buf, int depth) {
  if (depth == 11) {
    for (int j = 0; j < 2048; j += 32) {
      for (int k = 0; k < 4; k += 4) {
        __asm__ volatile (
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vmovupd (%2), %%ymm2\n"
          "vmovupd (%3), %%ymm3\n"
          "vmovupd (%4), %%ymm4\n"
          "vmovupd (%5), %%ymm5\n"
          "vmovupd (%6), %%ymm6\n"
          "vmovupd (%7), %%ymm7\n"
          "vpermilpd $0, %%ymm0, %%ymm8\n"
          "vpermilpd $15, %%ymm0, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n"
          "vpermilpd $0, %%ymm1, %%ymm8\n"
          "vpermilpd $15, %%ymm1, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n"
          "vpermilpd $0, %%ymm2, %%ymm8\n"
          "vpermilpd $15, %%ymm2, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n"
          "vpermilpd $0, %%ymm3, %%ymm8\n"
          "vpermilpd $15, %%ymm3, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n"
          "vpermilpd $0, %%ymm4, %%ymm8\n"
          "vpermilpd $15, %%ymm4, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n"
          "vpermilpd $0, %%ymm5, %%ymm8\n"
          "vpermilpd $15, %%ymm5, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n"
          "vpermilpd $0, %%ymm6, %%ymm8\n"
          "vpermilpd $15, %%ymm6, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n"
          "vpermilpd $0, %%ymm7, %%ymm8\n"
          "vpermilpd $15, %%ymm7, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n"
          "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm0, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm0\n"
          "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm1, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm1\n"
          "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm2, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm2\n"
          "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm3, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm3\n"
          "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm4, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm4\n"
          "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm5, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm5\n"
          "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm6, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm6\n"
          "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm7, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm7\n"
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
          "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
          "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
          "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
          "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
          "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
          "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
          "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
          "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
          "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
          "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
          "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
          "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
          "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
          "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
          "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
          "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
          "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
          "vmovupd %%ymm8, (%0)\n"
          "vmovupd %%ymm9, (%1)\n"
          "vmovupd %%ymm10, (%2)\n"
          "vmovupd %%ymm11, (%3)\n"
          "vmovupd %%ymm12, (%4)\n"
          "vmovupd %%ymm13, (%5)\n"
          "vmovupd %%ymm14, (%6)\n"
          "vmovupd %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    for (int j = 0; j < 2048; j += 256) {
      for (int k = 0; k < 32; k += 4) {
        __asm__ volatile (
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vmovupd (%2), %%ymm2\n"
          "vmovupd (%3), %%ymm3\n"
          "vmovupd (%4), %%ymm4\n"
          "vmovupd (%5), %%ymm5\n"
          "vmovupd (%6), %%ymm6\n"
          "vmovupd (%7), %%ymm7\n"
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
          "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
          "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
          "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
          "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
          "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
          "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
          "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
          "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
          "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
          "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
          "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
          "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
          "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
          "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
          "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
          "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
          "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
          "vmovupd %%ymm8, (%0)\n"
          "vmovupd %%ymm9, (%1)\n"
          "vmovupd %%ymm10, (%2)\n"
          "vmovupd %%ymm11, (%3)\n"
          "vmovupd %%ymm12, (%4)\n"
          "vmovupd %%ymm13, (%5)\n"
          "vmovupd %%ymm14, (%6)\n"
          "vmovupd %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    for (int j = 0; j < 2048; j += 2048) {
      for (int k = 0; k < 256; k += 4) {
        __asm__ volatile (
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vmovupd (%2), %%ymm2\n"
          "vmovupd (%3), %%ymm3\n"
          "vmovupd (%4), %%ymm4\n"
          "vmovupd (%5), %%ymm5\n"
          "vmovupd (%6), %%ymm6\n"
          "vmovupd (%7), %%ymm7\n"
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
          "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
          "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
          "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
          "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
          "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
          "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
          "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
          "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
          "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
          "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
          "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
          "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
          "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
          "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
          "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
          "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
          "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
          "vmovupd %%ymm8, (%0)\n"
          "vmovupd %%ymm9, (%1)\n"
          "vmovupd %%ymm10, (%2)\n"
          "vmovupd %%ymm11, (%3)\n"
          "vmovupd %%ymm12, (%4)\n"
          "vmovupd %%ymm13, (%5)\n"
          "vmovupd %%ymm14, (%6)\n"
          "vmovupd %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 14) {
    helper_double_23_recursive(buf + 0, 11);
    helper_double_23_recursive(buf + 2048, 11);
    helper_double_23_recursive(buf + 4096, 11);
    helper_double_23_recursive(buf + 6144, 11);
    helper_double_23_recursive(buf + 8192, 11);
    helper_double_23_recursive(buf + 10240, 11);
    helper_double_23_recursive(buf + 12288, 11);
    helper_double_23_recursive(buf + 14336, 11);
    for (int j = 0; j < 16384; j += 16384) {
      for (int k = 0; k < 2048; k += 4) {
        __asm__ volatile (
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vmovupd (%2), %%ymm2\n"
          "vmovupd (%3), %%ymm3\n"
          "vmovupd (%4), %%ymm4\n"
          "vmovupd (%5), %%ymm5\n"
          "vmovupd (%6), %%ymm6\n"
          "vmovupd (%7), %%ymm7\n"
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
          "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
          "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
          "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
          "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
          "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
          "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
          "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
          "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
          "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
          "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
          "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
          "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
          "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
          "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
          "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
          "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
          "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
          "vmovupd %%ymm8, (%0)\n"
          "vmovupd %%ymm9, (%1)\n"
          "vmovupd %%ymm10, (%2)\n"
          "vmovupd %%ymm11, (%3)\n"
          "vmovupd %%ymm12, (%4)\n"
          "vmovupd %%ymm13, (%5)\n"
          "vmovupd %%ymm14, (%6)\n"
          "vmovupd %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 17) {
    helper_double_23_recursive(buf + 0, 14);
    helper_double_23_recursive(buf + 16384, 14);
    helper_double_23_recursive(buf + 32768, 14);
    helper_double_23_recursive(buf + 49152, 14);
    helper_double_23_recursive(buf + 65536, 14);
    helper_double_23_recursive(buf + 81920, 14);
    helper_double_23_recursive(buf + 98304, 14);
    helper_double_23_recursive(buf + 114688, 14);
    for (int j = 0; j < 131072; j += 131072) {
      for (int k = 0; k < 16384; k += 4) {
        __asm__ volatile (
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vmovupd (%2), %%ymm2\n"
          "vmovupd (%3), %%ymm3\n"
          "vmovupd (%4), %%ymm4\n"
          "vmovupd (%5), %%ymm5\n"
          "vmovupd (%6), %%ymm6\n"
          "vmovupd (%7), %%ymm7\n"
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
          "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
          "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
          "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
          "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
          "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
          "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
          "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
          "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
          "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
          "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
          "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
          "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
          "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
          "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
          "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
          "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
          "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
          "vmovupd %%ymm8, (%0)\n"
          "vmovupd %%ymm9, (%1)\n"
          "vmovupd %%ymm10, (%2)\n"
          "vmovupd %%ymm11, (%3)\n"
          "vmovupd %%ymm12, (%4)\n"
          "vmovupd %%ymm13, (%5)\n"
          "vmovupd %%ymm14, (%6)\n"
          "vmovupd %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152), "r"(buf + j + k + 65536), "r"(buf + j + k + 81920), "r"(buf + j + k + 98304), "r"(buf + j + k + 114688) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 20) {
    helper_double_23_recursive(buf + 0, 17);
    helper_double_23_recursive(buf + 131072, 17);
    helper_double_23_recursive(buf + 262144, 17);
    helper_double_23_recursive(buf + 393216, 17);
    helper_double_23_recursive(buf + 524288, 17);
    helper_double_23_recursive(buf + 655360, 17);
    helper_double_23_recursive(buf + 786432, 17);
    helper_double_23_recursive(buf + 917504, 17);
    for (int j = 0; j < 1048576; j += 1048576) {
      for (int k = 0; k < 131072; k += 4) {
        __asm__ volatile (
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vmovupd (%2), %%ymm2\n"
          "vmovupd (%3), %%ymm3\n"
          "vmovupd (%4), %%ymm4\n"
          "vmovupd (%5), %%ymm5\n"
          "vmovupd (%6), %%ymm6\n"
          "vmovupd (%7), %%ymm7\n"
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
          "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
          "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
          "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
          "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
          "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
          "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
          "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
          "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
          "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
          "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
          "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
          "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
          "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
          "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
          "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
          "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
          "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
          "vmovupd %%ymm8, (%0)\n"
          "vmovupd %%ymm9, (%1)\n"
          "vmovupd %%ymm10, (%2)\n"
          "vmovupd %%ymm11, (%3)\n"
          "vmovupd %%ymm12, (%4)\n"
          "vmovupd %%ymm13, (%5)\n"
          "vmovupd %%ymm14, (%6)\n"
          "vmovupd %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 131072), "r"(buf + j + k + 262144), "r"(buf + j + k + 393216), "r"(buf + j + k + 524288), "r"(buf + j + k + 655360), "r"(buf + j + k + 786432), "r"(buf + j + k + 917504) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 23) {
    helper_double_23_recursive(buf + 0, 20);
    helper_double_23_recursive(buf + 1048576, 20);
    helper_double_23_recursive(buf + 2097152, 20);
    helper_double_23_recursive(buf + 3145728, 20);
    helper_double_23_recursive(buf + 4194304, 20);
    helper_double_23_recursive(buf + 5242880, 20);
    helper_double_23_recursive(buf + 6291456, 20);
    helper_double_23_recursive(buf + 7340032, 20);
    for (int j = 0; j < 8388608; j += 8388608) {
      for (int k = 0; k < 1048576; k += 4) {
        __asm__ volatile (
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vmovupd (%2), %%ymm2\n"
          "vmovupd (%3), %%ymm3\n"
          "vmovupd (%4), %%ymm4\n"
          "vmovupd (%5), %%ymm5\n"
          "vmovupd (%6), %%ymm6\n"
          "vmovupd (%7), %%ymm7\n"
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
          "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
          "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
          "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
          "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
          "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
          "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
          "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
          "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
          "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
          "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
          "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
          "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
          "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
          "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
          "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
          "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
          "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
          "vmovupd %%ymm8, (%0)\n"
          "vmovupd %%ymm9, (%1)\n"
          "vmovupd %%ymm10, (%2)\n"
          "vmovupd %%ymm11, (%3)\n"
          "vmovupd %%ymm12, (%4)\n"
          "vmovupd %%ymm13, (%5)\n"
          "vmovupd %%ymm14, (%6)\n"
          "vmovupd %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 1048576), "r"(buf + j + k + 2097152), "r"(buf + j + k + 3145728), "r"(buf + j + k + 4194304), "r"(buf + j + k + 5242880), "r"(buf + j + k + 6291456), "r"(buf + j + k + 7340032) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
}
void helper_double_23(double *buf);
void helper_double_23(double *buf) {
  helper_double_23_recursive(buf, 23);
}
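/* Same scheme again for 2^24 doubles; here the base case spans 1024
   elements (depth 10). */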
15774 void helper_double_24_recursive(double *buf, int depth);
helper_double_24_recursive(double * buf,int depth)15775 void helper_double_24_recursive(double *buf, int depth) {
15776   if (depth == 10) {
15777     for (int j = 0; j < 1024; j += 32) {
15778       for (int k = 0; k < 4; k += 4) {
15779         __asm__ volatile (
15780           "vmovupd (%0), %%ymm0\n"
15781           "vmovupd (%1), %%ymm1\n"
15782           "vmovupd (%2), %%ymm2\n"
15783           "vmovupd (%3), %%ymm3\n"
15784           "vmovupd (%4), %%ymm4\n"
15785           "vmovupd (%5), %%ymm5\n"
15786           "vmovupd (%6), %%ymm6\n"
15787           "vmovupd (%7), %%ymm7\n"
15788           "vpermilpd $0, %%ymm0, %%ymm8\n"
15789           "vpermilpd $15, %%ymm0, %%ymm9\n"
15790           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
15791           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
15792           "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n"
15793           "vpermilpd $0, %%ymm1, %%ymm8\n"
15794           "vpermilpd $15, %%ymm1, %%ymm9\n"
15795           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
15796           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
15797           "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n"
15798           "vpermilpd $0, %%ymm2, %%ymm8\n"
15799           "vpermilpd $15, %%ymm2, %%ymm9\n"
15800           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
15801           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
15802           "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n"
15803           "vpermilpd $0, %%ymm3, %%ymm8\n"
15804           "vpermilpd $15, %%ymm3, %%ymm9\n"
15805           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
15806           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
15807           "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n"
15808           "vpermilpd $0, %%ymm4, %%ymm8\n"
15809           "vpermilpd $15, %%ymm4, %%ymm9\n"
15810           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
15811           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
15812           "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n"
15813           "vpermilpd $0, %%ymm5, %%ymm8\n"
15814           "vpermilpd $15, %%ymm5, %%ymm9\n"
15815           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
15816           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
15817           "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n"
15818           "vpermilpd $0, %%ymm6, %%ymm8\n"
15819           "vpermilpd $15, %%ymm6, %%ymm9\n"
15820           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
15821           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
15822           "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n"
15823           "vpermilpd $0, %%ymm7, %%ymm8\n"
15824           "vpermilpd $15, %%ymm7, %%ymm9\n"
15825           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
15826           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
15827           "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n"
15828           "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n"
15829           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
15830           "vsubpd %%ymm0, %%ymm9, %%ymm10\n"
15831           "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n"
15832           "vaddpd %%ymm11, %%ymm8, %%ymm0\n"
15833           "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n"
15834           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
15835           "vsubpd %%ymm1, %%ymm9, %%ymm10\n"
15836           "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n"
15837           "vaddpd %%ymm11, %%ymm8, %%ymm1\n"
15838           "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n"
15839           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
15840           "vsubpd %%ymm2, %%ymm9, %%ymm10\n"
15841           "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n"
15842           "vaddpd %%ymm11, %%ymm8, %%ymm2\n"
15843           "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n"
15844           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
15845           "vsubpd %%ymm3, %%ymm9, %%ymm10\n"
15846           "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n"
15847           "vaddpd %%ymm11, %%ymm8, %%ymm3\n"
15848           "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n"
15849           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
15850           "vsubpd %%ymm4, %%ymm9, %%ymm10\n"
15851           "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n"
15852           "vaddpd %%ymm11, %%ymm8, %%ymm4\n"
15853           "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n"
15854           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
15855           "vsubpd %%ymm5, %%ymm9, %%ymm10\n"
15856           "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n"
15857           "vaddpd %%ymm11, %%ymm8, %%ymm5\n"
15858           "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n"
15859           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
15860           "vsubpd %%ymm6, %%ymm9, %%ymm10\n"
15861           "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n"
15862           "vaddpd %%ymm11, %%ymm8, %%ymm6\n"
15863           "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n"
15864           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
15865           "vsubpd %%ymm7, %%ymm9, %%ymm10\n"
15866           "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n"
15867           "vaddpd %%ymm11, %%ymm8, %%ymm7\n"
15868           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
15869           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
15870           "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
15871           "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
15872           "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
15873           "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
15874           "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
15875           "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
15876           "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
15877           "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
15878           "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
15879           "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
15880           "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
15881           "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
15882           "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
15883           "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
15884           "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
15885           "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
15886           "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
15887           "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
15888           "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
15889           "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
15890           "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
15891           "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
15892           "vmovupd %%ymm8, (%0)\n"
15893           "vmovupd %%ymm9, (%1)\n"
15894           "vmovupd %%ymm10, (%2)\n"
15895           "vmovupd %%ymm11, (%3)\n"
15896           "vmovupd %%ymm12, (%4)\n"
15897           "vmovupd %%ymm13, (%5)\n"
15898           "vmovupd %%ymm14, (%6)\n"
15899           "vmovupd %%ymm15, (%7)\n"
15900           :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
15901         );
15902       }
15903     }
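    /* Second and third passes of the depth == 10 base case: the loop below
       applies radix-8 butterflies at strides 32, 64 and 128 (three more
       levels), and the final pass applies radix-4 butterflies at strides
       256 and 512, completing all ten levels over the 1024-double block. */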
15904     for (int j = 0; j < 1024; j += 256) {
15905       for (int k = 0; k < 32; k += 4) {
15906         __asm__ volatile (
15907           "vmovupd (%0), %%ymm0\n"
15908           "vmovupd (%1), %%ymm1\n"
15909           "vmovupd (%2), %%ymm2\n"
15910           "vmovupd (%3), %%ymm3\n"
15911           "vmovupd (%4), %%ymm4\n"
15912           "vmovupd (%5), %%ymm5\n"
15913           "vmovupd (%6), %%ymm6\n"
15914           "vmovupd (%7), %%ymm7\n"
15915           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
15916           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
15917           "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
15918           "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
15919           "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
15920           "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
15921           "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
15922           "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
15923           "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
15924           "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
15925           "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
15926           "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
15927           "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
15928           "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
15929           "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
15930           "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
15931           "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
15932           "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
15933           "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
15934           "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
15935           "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
15936           "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
15937           "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
15938           "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
15939           "vmovupd %%ymm8, (%0)\n"
15940           "vmovupd %%ymm9, (%1)\n"
15941           "vmovupd %%ymm10, (%2)\n"
15942           "vmovupd %%ymm11, (%3)\n"
15943           "vmovupd %%ymm12, (%4)\n"
15944           "vmovupd %%ymm13, (%5)\n"
15945           "vmovupd %%ymm14, (%6)\n"
15946           "vmovupd %%ymm15, (%7)\n"
15947           :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
15948         );
15949       }
15950     }
15951     for (int j = 0; j < 1024; j += 1024) {
15952       for (int k = 0; k < 256; k += 4) {
15953         __asm__ volatile (
15954           "vmovupd (%0), %%ymm0\n"
15955           "vmovupd (%1), %%ymm1\n"
15956           "vmovupd (%2), %%ymm2\n"
15957           "vmovupd (%3), %%ymm3\n"
15958           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
15959           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
15960           "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
15961           "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
15962           "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
15963           "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
15964           "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
15965           "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
15966           "vmovupd %%ymm0, (%0)\n"
15967           "vmovupd %%ymm1, (%1)\n"
15968           "vmovupd %%ymm2, (%2)\n"
15969           "vmovupd %%ymm3, (%3)\n"
15970           :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
15971         );
15972       }
15973     }
15974     return;
15975   }
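  /* Each non-base depth d is handled as eight recursive transforms of depth
     d - 3 followed by one combining pass: the asm block loads a ymm vector
     from each of the eight sub-blocks, applies three levels of vaddpd/vsubpd
     butterflies across them (a radix-8 Hadamard stage), and stores the
     results back in place. */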
15976   if (depth == 13) {
15977     helper_double_24_recursive(buf + 0, 10);
15978     helper_double_24_recursive(buf + 1024, 10);
15979     helper_double_24_recursive(buf + 2048, 10);
15980     helper_double_24_recursive(buf + 3072, 10);
15981     helper_double_24_recursive(buf + 4096, 10);
15982     helper_double_24_recursive(buf + 5120, 10);
15983     helper_double_24_recursive(buf + 6144, 10);
15984     helper_double_24_recursive(buf + 7168, 10);
15985     for (int j = 0; j < 8192; j += 8192) {
15986       for (int k = 0; k < 1024; k += 4) {
15987         __asm__ volatile (
15988           "vmovupd (%0), %%ymm0\n"
15989           "vmovupd (%1), %%ymm1\n"
15990           "vmovupd (%2), %%ymm2\n"
15991           "vmovupd (%3), %%ymm3\n"
15992           "vmovupd (%4), %%ymm4\n"
15993           "vmovupd (%5), %%ymm5\n"
15994           "vmovupd (%6), %%ymm6\n"
15995           "vmovupd (%7), %%ymm7\n"
15996           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
15997           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
15998           "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
15999           "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
16000           "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
16001           "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
16002           "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
16003           "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
16004           "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
16005           "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
16006           "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
16007           "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
16008           "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
16009           "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
16010           "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
16011           "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
16012           "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
16013           "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
16014           "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
16015           "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
16016           "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
16017           "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
16018           "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
16019           "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
16020           "vmovupd %%ymm8, (%0)\n"
16021           "vmovupd %%ymm9, (%1)\n"
16022           "vmovupd %%ymm10, (%2)\n"
16023           "vmovupd %%ymm11, (%3)\n"
16024           "vmovupd %%ymm12, (%4)\n"
16025           "vmovupd %%ymm13, (%5)\n"
16026           "vmovupd %%ymm14, (%6)\n"
16027           "vmovupd %%ymm15, (%7)\n"
16028           :: "r"(buf + j + k + 0), "r"(buf + j + k + 1024), "r"(buf + j + k + 2048), "r"(buf + j + k + 3072), "r"(buf + j + k + 4096), "r"(buf + j + k + 5120), "r"(buf + j + k + 6144), "r"(buf + j + k + 7168) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
16029         );
16030       }
16031     }
16032     return;
16033   }
16034   if (depth == 16) {
16035     helper_double_24_recursive(buf + 0, 13);
16036     helper_double_24_recursive(buf + 8192, 13);
16037     helper_double_24_recursive(buf + 16384, 13);
16038     helper_double_24_recursive(buf + 24576, 13);
16039     helper_double_24_recursive(buf + 32768, 13);
16040     helper_double_24_recursive(buf + 40960, 13);
16041     helper_double_24_recursive(buf + 49152, 13);
16042     helper_double_24_recursive(buf + 57344, 13);
16043     for (int j = 0; j < 65536; j += 65536) {
16044       for (int k = 0; k < 8192; k += 4) {
16045         __asm__ volatile (
16046           "vmovupd (%0), %%ymm0\n"
16047           "vmovupd (%1), %%ymm1\n"
16048           "vmovupd (%2), %%ymm2\n"
16049           "vmovupd (%3), %%ymm3\n"
16050           "vmovupd (%4), %%ymm4\n"
16051           "vmovupd (%5), %%ymm5\n"
16052           "vmovupd (%6), %%ymm6\n"
16053           "vmovupd (%7), %%ymm7\n"
16054           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
16055           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
16056           "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
16057           "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
16058           "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
16059           "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
16060           "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
16061           "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
16062           "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
16063           "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
16064           "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
16065           "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
16066           "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
16067           "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
16068           "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
16069           "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
16070           "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
16071           "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
16072           "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
16073           "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
16074           "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
16075           "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
16076           "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
16077           "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
16078           "vmovupd %%ymm8, (%0)\n"
16079           "vmovupd %%ymm9, (%1)\n"
16080           "vmovupd %%ymm10, (%2)\n"
16081           "vmovupd %%ymm11, (%3)\n"
16082           "vmovupd %%ymm12, (%4)\n"
16083           "vmovupd %%ymm13, (%5)\n"
16084           "vmovupd %%ymm14, (%6)\n"
16085           "vmovupd %%ymm15, (%7)\n"
16086           :: "r"(buf + j + k + 0), "r"(buf + j + k + 8192), "r"(buf + j + k + 16384), "r"(buf + j + k + 24576), "r"(buf + j + k + 32768), "r"(buf + j + k + 40960), "r"(buf + j + k + 49152), "r"(buf + j + k + 57344) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
16087         );
16088       }
16089     }
16090     return;
16091   }
16092   if (depth == 19) {
16093     helper_double_24_recursive(buf + 0, 16);
16094     helper_double_24_recursive(buf + 65536, 16);
16095     helper_double_24_recursive(buf + 131072, 16);
16096     helper_double_24_recursive(buf + 196608, 16);
16097     helper_double_24_recursive(buf + 262144, 16);
16098     helper_double_24_recursive(buf + 327680, 16);
16099     helper_double_24_recursive(buf + 393216, 16);
16100     helper_double_24_recursive(buf + 458752, 16);
16101     for (int j = 0; j < 524288; j += 524288) {
16102       for (int k = 0; k < 65536; k += 4) {
16103         __asm__ volatile (
16104           "vmovupd (%0), %%ymm0\n"
16105           "vmovupd (%1), %%ymm1\n"
16106           "vmovupd (%2), %%ymm2\n"
16107           "vmovupd (%3), %%ymm3\n"
16108           "vmovupd (%4), %%ymm4\n"
16109           "vmovupd (%5), %%ymm5\n"
16110           "vmovupd (%6), %%ymm6\n"
16111           "vmovupd (%7), %%ymm7\n"
16112           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
16113           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
16114           "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
16115           "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
16116           "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
16117           "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
16118           "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
16119           "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
16120           "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
16121           "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
16122           "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
16123           "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
16124           "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
16125           "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
16126           "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
16127           "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
16128           "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
16129           "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
16130           "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
16131           "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
16132           "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
16133           "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
16134           "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
16135           "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
16136           "vmovupd %%ymm8, (%0)\n"
16137           "vmovupd %%ymm9, (%1)\n"
16138           "vmovupd %%ymm10, (%2)\n"
16139           "vmovupd %%ymm11, (%3)\n"
16140           "vmovupd %%ymm12, (%4)\n"
16141           "vmovupd %%ymm13, (%5)\n"
16142           "vmovupd %%ymm14, (%6)\n"
16143           "vmovupd %%ymm15, (%7)\n"
16144           :: "r"(buf + j + k + 0), "r"(buf + j + k + 65536), "r"(buf + j + k + 131072), "r"(buf + j + k + 196608), "r"(buf + j + k + 262144), "r"(buf + j + k + 327680), "r"(buf + j + k + 393216), "r"(buf + j + k + 458752) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
16145         );
16146       }
16147     }
16148     return;
16149   }
16150   if (depth == 22) {
16151     helper_double_24_recursive(buf + 0, 19);
16152     helper_double_24_recursive(buf + 524288, 19);
16153     helper_double_24_recursive(buf + 1048576, 19);
16154     helper_double_24_recursive(buf + 1572864, 19);
16155     helper_double_24_recursive(buf + 2097152, 19);
16156     helper_double_24_recursive(buf + 2621440, 19);
16157     helper_double_24_recursive(buf + 3145728, 19);
16158     helper_double_24_recursive(buf + 3670016, 19);
16159     for (int j = 0; j < 4194304; j += 4194304) {
16160       for (int k = 0; k < 524288; k += 4) {
16161         __asm__ volatile (
16162           "vmovupd (%0), %%ymm0\n"
16163           "vmovupd (%1), %%ymm1\n"
16164           "vmovupd (%2), %%ymm2\n"
16165           "vmovupd (%3), %%ymm3\n"
16166           "vmovupd (%4), %%ymm4\n"
16167           "vmovupd (%5), %%ymm5\n"
16168           "vmovupd (%6), %%ymm6\n"
16169           "vmovupd (%7), %%ymm7\n"
16170           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
16171           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
16172           "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
16173           "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
16174           "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
16175           "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
16176           "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
16177           "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
16178           "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
16179           "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
16180           "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
16181           "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
16182           "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
16183           "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
16184           "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
16185           "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
16186           "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
16187           "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
16188           "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
16189           "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
16190           "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
16191           "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
16192           "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
16193           "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
16194           "vmovupd %%ymm8, (%0)\n"
16195           "vmovupd %%ymm9, (%1)\n"
16196           "vmovupd %%ymm10, (%2)\n"
16197           "vmovupd %%ymm11, (%3)\n"
16198           "vmovupd %%ymm12, (%4)\n"
16199           "vmovupd %%ymm13, (%5)\n"
16200           "vmovupd %%ymm14, (%6)\n"
16201           "vmovupd %%ymm15, (%7)\n"
16202           :: "r"(buf + j + k + 0), "r"(buf + j + k + 524288), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1572864), "r"(buf + j + k + 2097152), "r"(buf + j + k + 2621440), "r"(buf + j + k + 3145728), "r"(buf + j + k + 3670016) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
16203         );
16204       }
16205     }
16206     return;
16207   }
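  /* Top of this helper's recursion: 24 = 22 + 2, so the final case splits
     the buffer into four depth-22 quarters and the combining pass is
     radix-4 (two butterfly levels, four pointers) rather than radix-8. */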
16208   if (depth == 24) {
16209     helper_double_24_recursive(buf + 0, 22);
16210     helper_double_24_recursive(buf + 4194304, 22);
16211     helper_double_24_recursive(buf + 8388608, 22);
16212     helper_double_24_recursive(buf + 12582912, 22);
16213     for (int j = 0; j < 16777216; j += 16777216) {
16214       for (int k = 0; k < 4194304; k += 4) {
16215         __asm__ volatile (
16216           "vmovupd (%0), %%ymm0\n"
16217           "vmovupd (%1), %%ymm1\n"
16218           "vmovupd (%2), %%ymm2\n"
16219           "vmovupd (%3), %%ymm3\n"
16220           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
16221           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
16222           "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
16223           "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
16224           "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
16225           "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
16226           "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
16227           "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
16228           "vmovupd %%ymm0, (%0)\n"
16229           "vmovupd %%ymm1, (%1)\n"
16230           "vmovupd %%ymm2, (%2)\n"
16231           "vmovupd %%ymm3, (%3)\n"
16232           :: "r"(buf + j + k + 0), "r"(buf + j + k + 4194304), "r"(buf + j + k + 8388608), "r"(buf + j + k + 12582912) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
16233         );
16234       }
16235     }
16236     return;
16237   }
16238 }
16239 void helper_double_24(double *buf);
16240 void helper_double_24(double *buf) {
16241   helper_double_24_recursive(buf, 24);
16242 }
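/* For reference: a minimal scalar sketch (ours, not part of the generated
   kernels) of the same unnormalized Walsh-Hadamard transform the helpers
   above compute over 1 << log_n doubles. Each level pairs elements at
   distance `step` and replaces them with their sum and difference, which is
   exactly what the vaddpd/vsubpd butterflies do eight doubles at a time. */
static inline void fht_double_reference(double *buf, int log_n) {
  for (int step = 1; step < (1 << log_n); step <<= 1) {
    for (int j = 0; j < (1 << log_n); j += step << 1) {
      for (int k = j; k < j + step; ++k) {
        double u = buf[k];         /* butterfly input pair */
        double v = buf[k + step];
        buf[k] = u + v;            /* sum lane */
        buf[k + step] = u - v;     /* difference lane */
      }
    }
  }
}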
16243 void helper_double_25_recursive(double *buf, int depth);
16244 void helper_double_25_recursive(double *buf, int depth) {
16245   if (depth == 8) {
16246     for (int j = 0; j < 256; j += 32) {
16247       for (int k = 0; k < 4; k += 4) {
16248         __asm__ volatile (
16249           "vmovupd (%0), %%ymm0\n"
16250           "vmovupd (%1), %%ymm1\n"
16251           "vmovupd (%2), %%ymm2\n"
16252           "vmovupd (%3), %%ymm3\n"
16253           "vmovupd (%4), %%ymm4\n"
16254           "vmovupd (%5), %%ymm5\n"
16255           "vmovupd (%6), %%ymm6\n"
16256           "vmovupd (%7), %%ymm7\n"
16257           "vpermilpd $0, %%ymm0, %%ymm8\n"
16258           "vpermilpd $15, %%ymm0, %%ymm9\n"
16259           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
16260           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
16261           "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n"
16262           "vpermilpd $0, %%ymm1, %%ymm8\n"
16263           "vpermilpd $15, %%ymm1, %%ymm9\n"
16264           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
16265           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
16266           "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n"
16267           "vpermilpd $0, %%ymm2, %%ymm8\n"
16268           "vpermilpd $15, %%ymm2, %%ymm9\n"
16269           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
16270           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
16271           "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n"
16272           "vpermilpd $0, %%ymm3, %%ymm8\n"
16273           "vpermilpd $15, %%ymm3, %%ymm9\n"
16274           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
16275           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
16276           "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n"
16277           "vpermilpd $0, %%ymm4, %%ymm8\n"
16278           "vpermilpd $15, %%ymm4, %%ymm9\n"
16279           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
16280           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
16281           "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n"
16282           "vpermilpd $0, %%ymm5, %%ymm8\n"
16283           "vpermilpd $15, %%ymm5, %%ymm9\n"
16284           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
16285           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
16286           "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n"
16287           "vpermilpd $0, %%ymm6, %%ymm8\n"
16288           "vpermilpd $15, %%ymm6, %%ymm9\n"
16289           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
16290           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
16291           "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n"
16292           "vpermilpd $0, %%ymm7, %%ymm8\n"
16293           "vpermilpd $15, %%ymm7, %%ymm9\n"
16294           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
16295           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
16296           "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n"
16297           "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n"
16298           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
16299           "vsubpd %%ymm0, %%ymm9, %%ymm10\n"
16300           "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n"
16301           "vaddpd %%ymm11, %%ymm8, %%ymm0\n"
16302           "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n"
16303           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
16304           "vsubpd %%ymm1, %%ymm9, %%ymm10\n"
16305           "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n"
16306           "vaddpd %%ymm11, %%ymm8, %%ymm1\n"
16307           "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n"
16308           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
16309           "vsubpd %%ymm2, %%ymm9, %%ymm10\n"
16310           "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n"
16311           "vaddpd %%ymm11, %%ymm8, %%ymm2\n"
16312           "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n"
16313           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
16314           "vsubpd %%ymm3, %%ymm9, %%ymm10\n"
16315           "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n"
16316           "vaddpd %%ymm11, %%ymm8, %%ymm3\n"
16317           "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n"
16318           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
16319           "vsubpd %%ymm4, %%ymm9, %%ymm10\n"
16320           "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n"
16321           "vaddpd %%ymm11, %%ymm8, %%ymm4\n"
16322           "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n"
16323           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
16324           "vsubpd %%ymm5, %%ymm9, %%ymm10\n"
16325           "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n"
16326           "vaddpd %%ymm11, %%ymm8, %%ymm5\n"
16327           "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n"
16328           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
16329           "vsubpd %%ymm6, %%ymm9, %%ymm10\n"
16330           "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n"
16331           "vaddpd %%ymm11, %%ymm8, %%ymm6\n"
16332           "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n"
16333           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
16334           "vsubpd %%ymm7, %%ymm9, %%ymm10\n"
16335           "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n"
16336           "vaddpd %%ymm11, %%ymm8, %%ymm7\n"
16337           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
16338           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
16339           "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
16340           "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
16341           "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
16342           "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
16343           "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
16344           "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
16345           "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
16346           "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
16347           "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
16348           "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
16349           "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
16350           "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
16351           "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
16352           "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
16353           "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
16354           "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
16355           "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
16356           "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
16357           "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
16358           "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
16359           "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
16360           "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
16361           "vmovupd %%ymm8, (%0)\n"
16362           "vmovupd %%ymm9, (%1)\n"
16363           "vmovupd %%ymm10, (%2)\n"
16364           "vmovupd %%ymm11, (%3)\n"
16365           "vmovupd %%ymm12, (%4)\n"
16366           "vmovupd %%ymm13, (%5)\n"
16367           "vmovupd %%ymm14, (%6)\n"
16368           "vmovupd %%ymm15, (%7)\n"
16369           :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
16370         );
16371       }
16372     }
16373     for (int j = 0; j < 256; j += 256) {
16374       for (int k = 0; k < 32; k += 4) {
16375         __asm__ volatile (
16376           "vmovupd (%0), %%ymm0\n"
16377           "vmovupd (%1), %%ymm1\n"
16378           "vmovupd (%2), %%ymm2\n"
16379           "vmovupd (%3), %%ymm3\n"
16380           "vmovupd (%4), %%ymm4\n"
16381           "vmovupd (%5), %%ymm5\n"
16382           "vmovupd (%6), %%ymm6\n"
16383           "vmovupd (%7), %%ymm7\n"
16384           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
16385           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
16386           "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
16387           "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
16388           "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
16389           "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
16390           "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
16391           "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
16392           "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
16393           "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
16394           "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
16395           "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
16396           "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
16397           "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
16398           "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
16399           "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
16400           "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
16401           "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
16402           "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
16403           "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
16404           "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
16405           "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
16406           "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
16407           "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
16408           "vmovupd %%ymm8, (%0)\n"
16409           "vmovupd %%ymm9, (%1)\n"
16410           "vmovupd %%ymm10, (%2)\n"
16411           "vmovupd %%ymm11, (%3)\n"
16412           "vmovupd %%ymm12, (%4)\n"
16413           "vmovupd %%ymm13, (%5)\n"
16414           "vmovupd %%ymm14, (%6)\n"
16415           "vmovupd %%ymm15, (%7)\n"
16416           :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
16417         );
16418       }
16419     }
16420     return;
16421   }
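  /* The depth == 8 base case above does the first five levels entirely in
     registers: vpermilpd $0/$15 plus vaddsubpd is the stride-1 butterfly
     within each 128-bit lane, vperm2f128 $0/$49 is the stride-2 butterfly
     across lanes, and the three cross-register add/sub levels cover strides
     4, 8 and 16. The second loop then finishes strides 32, 64 and 128, for
     eight levels over 256 doubles. The cases below follow the same
     eight-way recursive scheme as helper_double_24_recursive. */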
16422   if (depth == 11) {
16423     helper_double_25_recursive(buf + 0, 8);
16424     helper_double_25_recursive(buf + 256, 8);
16425     helper_double_25_recursive(buf + 512, 8);
16426     helper_double_25_recursive(buf + 768, 8);
16427     helper_double_25_recursive(buf + 1024, 8);
16428     helper_double_25_recursive(buf + 1280, 8);
16429     helper_double_25_recursive(buf + 1536, 8);
16430     helper_double_25_recursive(buf + 1792, 8);
16431     for (int j = 0; j < 2048; j += 2048) {
16432       for (int k = 0; k < 256; k += 4) {
16433         __asm__ volatile (
16434           "vmovupd (%0), %%ymm0\n"
16435           "vmovupd (%1), %%ymm1\n"
16436           "vmovupd (%2), %%ymm2\n"
16437           "vmovupd (%3), %%ymm3\n"
16438           "vmovupd (%4), %%ymm4\n"
16439           "vmovupd (%5), %%ymm5\n"
16440           "vmovupd (%6), %%ymm6\n"
16441           "vmovupd (%7), %%ymm7\n"
16442           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
16443           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
16444           "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
16445           "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
16446           "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
16447           "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
16448           "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
16449           "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
16450           "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
16451           "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
16452           "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
16453           "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
16454           "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
16455           "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
16456           "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
16457           "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
16458           "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
16459           "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
16460           "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
16461           "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
16462           "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
16463           "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
16464           "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
16465           "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
16466           "vmovupd %%ymm8, (%0)\n"
16467           "vmovupd %%ymm9, (%1)\n"
16468           "vmovupd %%ymm10, (%2)\n"
16469           "vmovupd %%ymm11, (%3)\n"
16470           "vmovupd %%ymm12, (%4)\n"
16471           "vmovupd %%ymm13, (%5)\n"
16472           "vmovupd %%ymm14, (%6)\n"
16473           "vmovupd %%ymm15, (%7)\n"
16474           :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
16475         );
16476       }
16477     }
16478     return;
16479   }
16480   if (depth == 14) {
16481     helper_double_25_recursive(buf + 0, 11);
16482     helper_double_25_recursive(buf + 2048, 11);
16483     helper_double_25_recursive(buf + 4096, 11);
16484     helper_double_25_recursive(buf + 6144, 11);
16485     helper_double_25_recursive(buf + 8192, 11);
16486     helper_double_25_recursive(buf + 10240, 11);
16487     helper_double_25_recursive(buf + 12288, 11);
16488     helper_double_25_recursive(buf + 14336, 11);
16489     for (int j = 0; j < 16384; j += 16384) {
16490       for (int k = 0; k < 2048; k += 4) {
16491         __asm__ volatile (
16492           "vmovupd (%0), %%ymm0\n"
16493           "vmovupd (%1), %%ymm1\n"
16494           "vmovupd (%2), %%ymm2\n"
16495           "vmovupd (%3), %%ymm3\n"
16496           "vmovupd (%4), %%ymm4\n"
16497           "vmovupd (%5), %%ymm5\n"
16498           "vmovupd (%6), %%ymm6\n"
16499           "vmovupd (%7), %%ymm7\n"
16500           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
16501           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
16502           "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
16503           "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
16504           "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
16505           "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
16506           "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
16507           "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
16508           "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
16509           "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
16510           "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
16511           "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
16512           "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
16513           "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
16514           "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
16515           "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
16516           "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
16517           "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
16518           "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
16519           "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
16520           "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
16521           "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
16522           "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
16523           "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
16524           "vmovupd %%ymm8, (%0)\n"
16525           "vmovupd %%ymm9, (%1)\n"
16526           "vmovupd %%ymm10, (%2)\n"
16527           "vmovupd %%ymm11, (%3)\n"
16528           "vmovupd %%ymm12, (%4)\n"
16529           "vmovupd %%ymm13, (%5)\n"
16530           "vmovupd %%ymm14, (%6)\n"
16531           "vmovupd %%ymm15, (%7)\n"
16532           :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
16533         );
16534       }
16535     }
16536     return;
16537   }
16538   if (depth == 17) {
16539     helper_double_25_recursive(buf + 0, 14);
16540     helper_double_25_recursive(buf + 16384, 14);
16541     helper_double_25_recursive(buf + 32768, 14);
16542     helper_double_25_recursive(buf + 49152, 14);
16543     helper_double_25_recursive(buf + 65536, 14);
16544     helper_double_25_recursive(buf + 81920, 14);
16545     helper_double_25_recursive(buf + 98304, 14);
16546     helper_double_25_recursive(buf + 114688, 14);
16547     for (int j = 0; j < 131072; j += 131072) {
16548       for (int k = 0; k < 16384; k += 4) {
16549         __asm__ volatile (
16550           "vmovupd (%0), %%ymm0\n"
16551           "vmovupd (%1), %%ymm1\n"
16552           "vmovupd (%2), %%ymm2\n"
16553           "vmovupd (%3), %%ymm3\n"
16554           "vmovupd (%4), %%ymm4\n"
16555           "vmovupd (%5), %%ymm5\n"
16556           "vmovupd (%6), %%ymm6\n"
16557           "vmovupd (%7), %%ymm7\n"
16558           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
16559           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
16560           "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
16561           "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
16562           "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
16563           "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
16564           "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
16565           "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
16566           "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
16567           "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
16568           "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
16569           "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
16570           "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
16571           "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
16572           "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
16573           "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
16574           "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
16575           "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
16576           "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
16577           "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
16578           "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
16579           "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
16580           "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
16581           "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
16582           "vmovupd %%ymm8, (%0)\n"
16583           "vmovupd %%ymm9, (%1)\n"
16584           "vmovupd %%ymm10, (%2)\n"
16585           "vmovupd %%ymm11, (%3)\n"
16586           "vmovupd %%ymm12, (%4)\n"
16587           "vmovupd %%ymm13, (%5)\n"
16588           "vmovupd %%ymm14, (%6)\n"
16589           "vmovupd %%ymm15, (%7)\n"
16590           :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152), "r"(buf + j + k + 65536), "r"(buf + j + k + 81920), "r"(buf + j + k + 98304), "r"(buf + j + k + 114688) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
16591         );
16592       }
16593     }
16594     return;
16595   }
16596   if (depth == 20) {
16597     helper_double_25_recursive(buf + 0, 17);
16598     helper_double_25_recursive(buf + 131072, 17);
16599     helper_double_25_recursive(buf + 262144, 17);
16600     helper_double_25_recursive(buf + 393216, 17);
16601     helper_double_25_recursive(buf + 524288, 17);
16602     helper_double_25_recursive(buf + 655360, 17);
16603     helper_double_25_recursive(buf + 786432, 17);
16604     helper_double_25_recursive(buf + 917504, 17);
16605     for (int j = 0; j < 1048576; j += 1048576) {
16606       for (int k = 0; k < 131072; k += 4) {
16607         __asm__ volatile (
16608           "vmovupd (%0), %%ymm0\n"
16609           "vmovupd (%1), %%ymm1\n"
16610           "vmovupd (%2), %%ymm2\n"
16611           "vmovupd (%3), %%ymm3\n"
16612           "vmovupd (%4), %%ymm4\n"
16613           "vmovupd (%5), %%ymm5\n"
16614           "vmovupd (%6), %%ymm6\n"
16615           "vmovupd (%7), %%ymm7\n"
16616           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
16617           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
16618           "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
16619           "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
16620           "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
16621           "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
16622           "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
16623           "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
16624           "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
16625           "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
16626           "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
16627           "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
16628           "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
16629           "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
16630           "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
16631           "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
16632           "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
16633           "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
16634           "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
16635           "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
16636           "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
16637           "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
16638           "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
16639           "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
16640           "vmovupd %%ymm8, (%0)\n"
16641           "vmovupd %%ymm9, (%1)\n"
16642           "vmovupd %%ymm10, (%2)\n"
16643           "vmovupd %%ymm11, (%3)\n"
16644           "vmovupd %%ymm12, (%4)\n"
16645           "vmovupd %%ymm13, (%5)\n"
16646           "vmovupd %%ymm14, (%6)\n"
16647           "vmovupd %%ymm15, (%7)\n"
16648           :: "r"(buf + j + k + 0), "r"(buf + j + k + 131072), "r"(buf + j + k + 262144), "r"(buf + j + k + 393216), "r"(buf + j + k + 524288), "r"(buf + j + k + 655360), "r"(buf + j + k + 786432), "r"(buf + j + k + 917504) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
16649         );
16650       }
16651     }
16652     return;
16653   }
16654   if (depth == 23) {
16655     helper_double_25_recursive(buf + 0, 20);
16656     helper_double_25_recursive(buf + 1048576, 20);
16657     helper_double_25_recursive(buf + 2097152, 20);
16658     helper_double_25_recursive(buf + 3145728, 20);
16659     helper_double_25_recursive(buf + 4194304, 20);
16660     helper_double_25_recursive(buf + 5242880, 20);
16661     helper_double_25_recursive(buf + 6291456, 20);
16662     helper_double_25_recursive(buf + 7340032, 20);
16663     for (int j = 0; j < 8388608; j += 8388608) {
16664       for (int k = 0; k < 1048576; k += 4) {
16665         __asm__ volatile (
16666           "vmovupd (%0), %%ymm0\n"
16667           "vmovupd (%1), %%ymm1\n"
16668           "vmovupd (%2), %%ymm2\n"
16669           "vmovupd (%3), %%ymm3\n"
16670           "vmovupd (%4), %%ymm4\n"
16671           "vmovupd (%5), %%ymm5\n"
16672           "vmovupd (%6), %%ymm6\n"
16673           "vmovupd (%7), %%ymm7\n"
16674           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
16675           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
16676           "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
16677           "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
16678           "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
16679           "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
16680           "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
16681           "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
16682           "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
16683           "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
16684           "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
16685           "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
16686           "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
16687           "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
16688           "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
16689           "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
16690           "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
16691           "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
16692           "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
16693           "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
16694           "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
16695           "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
16696           "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
16697           "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
16698           "vmovupd %%ymm8, (%0)\n"
16699           "vmovupd %%ymm9, (%1)\n"
16700           "vmovupd %%ymm10, (%2)\n"
16701           "vmovupd %%ymm11, (%3)\n"
16702           "vmovupd %%ymm12, (%4)\n"
16703           "vmovupd %%ymm13, (%5)\n"
16704           "vmovupd %%ymm14, (%6)\n"
16705           "vmovupd %%ymm15, (%7)\n"
16706           :: "r"(buf + j + k + 0), "r"(buf + j + k + 1048576), "r"(buf + j + k + 2097152), "r"(buf + j + k + 3145728), "r"(buf + j + k + 4194304), "r"(buf + j + k + 5242880), "r"(buf + j + k + 6291456), "r"(buf + j + k + 7340032) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
16707         );
16708       }
16709     }
16710     return;
16711   }
16712   if (depth == 25) {
16713     helper_double_25_recursive(buf + 0, 23);
16714     helper_double_25_recursive(buf + 8388608, 23);
16715     helper_double_25_recursive(buf + 16777216, 23);
16716     helper_double_25_recursive(buf + 25165824, 23);
16717     for (int j = 0; j < 33554432; j += 33554432) {
16718       for (int k = 0; k < 8388608; k += 4) {
16719         __asm__ volatile (
16720           "vmovupd (%0), %%ymm0\n"
16721           "vmovupd (%1), %%ymm1\n"
16722           "vmovupd (%2), %%ymm2\n"
16723           "vmovupd (%3), %%ymm3\n"
16724           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
16725           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
16726           "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
16727           "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
16728           "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
16729           "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
16730           "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
16731           "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
16732           "vmovupd %%ymm0, (%0)\n"
16733           "vmovupd %%ymm1, (%1)\n"
16734           "vmovupd %%ymm2, (%2)\n"
16735           "vmovupd %%ymm3, (%3)\n"
16736           :: "r"(buf + j + k + 0), "r"(buf + j + k + 8388608), "r"(buf + j + k + 16777216), "r"(buf + j + k + 25165824) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
16737         );
16738       }
16739     }
16740     return;
16741   }
16742 }
16743 void helper_double_25(double *buf);
16744 void helper_double_25(double *buf) {
16745   helper_double_25_recursive(buf, 25);
16746 }
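/* Illustrative call site (ours, not from this file): the helpers transform
   their buffer in place, and no alignment beyond malloc's is required
   because every load and store is an unaligned vmovupd.

     double *buf = malloc(sizeof(double) << 25);
     // ... fill buf ...
     helper_double_25(buf);   // unnormalized 2^25-point transform
*/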
16747 void helper_double_26_recursive(double *buf, int depth);
16748 void helper_double_26_recursive(double *buf, int depth) {
16749   if (depth == 11) {
16750     for (int j = 0; j < 2048; j += 32) {
16751       for (int k = 0; k < 4; k += 4) {
16752         __asm__ volatile (
16753           "vmovupd (%0), %%ymm0\n"
16754           "vmovupd (%1), %%ymm1\n"
16755           "vmovupd (%2), %%ymm2\n"
16756           "vmovupd (%3), %%ymm3\n"
16757           "vmovupd (%4), %%ymm4\n"
16758           "vmovupd (%5), %%ymm5\n"
16759           "vmovupd (%6), %%ymm6\n"
16760           "vmovupd (%7), %%ymm7\n"
16761           "vpermilpd $0, %%ymm0, %%ymm8\n"
16762           "vpermilpd $15, %%ymm0, %%ymm9\n"
16763           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
16764           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
16765           "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n"
16766           "vpermilpd $0, %%ymm1, %%ymm8\n"
16767           "vpermilpd $15, %%ymm1, %%ymm9\n"
16768           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
16769           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
16770           "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n"
16771           "vpermilpd $0, %%ymm2, %%ymm8\n"
16772           "vpermilpd $15, %%ymm2, %%ymm9\n"
16773           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
16774           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
16775           "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n"
16776           "vpermilpd $0, %%ymm3, %%ymm8\n"
16777           "vpermilpd $15, %%ymm3, %%ymm9\n"
16778           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
16779           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
16780           "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n"
16781           "vpermilpd $0, %%ymm4, %%ymm8\n"
16782           "vpermilpd $15, %%ymm4, %%ymm9\n"
16783           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
16784           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
16785           "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n"
16786           "vpermilpd $0, %%ymm5, %%ymm8\n"
16787           "vpermilpd $15, %%ymm5, %%ymm9\n"
16788           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
16789           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
16790           "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n"
16791           "vpermilpd $0, %%ymm6, %%ymm8\n"
16792           "vpermilpd $15, %%ymm6, %%ymm9\n"
16793           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
16794           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
16795           "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n"
16796           "vpermilpd $0, %%ymm7, %%ymm8\n"
16797           "vpermilpd $15, %%ymm7, %%ymm9\n"
16798           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
16799           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
16800           "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n"
16801           "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n"
16802           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
16803           "vsubpd %%ymm0, %%ymm9, %%ymm10\n"
16804           "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n"
16805           "vaddpd %%ymm11, %%ymm8, %%ymm0\n"
16806           "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n"
16807           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
16808           "vsubpd %%ymm1, %%ymm9, %%ymm10\n"
16809           "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n"
16810           "vaddpd %%ymm11, %%ymm8, %%ymm1\n"
16811           "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n"
16812           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
16813           "vsubpd %%ymm2, %%ymm9, %%ymm10\n"
16814           "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n"
16815           "vaddpd %%ymm11, %%ymm8, %%ymm2\n"
16816           "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n"
16817           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
16818           "vsubpd %%ymm3, %%ymm9, %%ymm10\n"
16819           "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n"
16820           "vaddpd %%ymm11, %%ymm8, %%ymm3\n"
16821           "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n"
16822           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
16823           "vsubpd %%ymm4, %%ymm9, %%ymm10\n"
16824           "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n"
16825           "vaddpd %%ymm11, %%ymm8, %%ymm4\n"
16826           "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n"
16827           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
16828           "vsubpd %%ymm5, %%ymm9, %%ymm10\n"
16829           "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n"
16830           "vaddpd %%ymm11, %%ymm8, %%ymm5\n"
16831           "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n"
16832           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
16833           "vsubpd %%ymm6, %%ymm9, %%ymm10\n"
16834           "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n"
16835           "vaddpd %%ymm11, %%ymm8, %%ymm6\n"
16836           "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n"
16837           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
16838           "vsubpd %%ymm7, %%ymm9, %%ymm10\n"
16839           "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n"
16840           "vaddpd %%ymm11, %%ymm8, %%ymm7\n"
16841           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
16842           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
16843           "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
16844           "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
16845           "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
16846           "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
16847           "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
16848           "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
16849           "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
16850           "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
16851           "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
16852           "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
16853           "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
16854           "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
16855           "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
16856           "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
16857           "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
16858           "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
16859           "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
16860           "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
16861           "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
16862           "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
16863           "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
16864           "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
16865           "vmovupd %%ymm8, (%0)\n"
16866           "vmovupd %%ymm9, (%1)\n"
16867           "vmovupd %%ymm10, (%2)\n"
16868           "vmovupd %%ymm11, (%3)\n"
16869           "vmovupd %%ymm12, (%4)\n"
16870           "vmovupd %%ymm13, (%5)\n"
16871           "vmovupd %%ymm14, (%6)\n"
16872           "vmovupd %%ymm15, (%7)\n"
16873           :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
16874         );
16875       }
16876     }
16877     for (int j = 0; j < 2048; j += 256) {
16878       for (int k = 0; k < 32; k += 4) {
16879         __asm__ volatile (
16880           "vmovupd (%0), %%ymm0\n"
16881           "vmovupd (%1), %%ymm1\n"
16882           "vmovupd (%2), %%ymm2\n"
16883           "vmovupd (%3), %%ymm3\n"
16884           "vmovupd (%4), %%ymm4\n"
16885           "vmovupd (%5), %%ymm5\n"
16886           "vmovupd (%6), %%ymm6\n"
16887           "vmovupd (%7), %%ymm7\n"
16888           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
16889           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
16890           "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
16891           "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
16892           "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
16893           "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
16894           "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
16895           "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
16896           "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
16897           "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
16898           "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
16899           "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
16900           "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
16901           "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
16902           "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
16903           "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
16904           "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
16905           "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
16906           "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
16907           "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
16908           "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
16909           "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
16910           "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
16911           "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
16912           "vmovupd %%ymm8, (%0)\n"
16913           "vmovupd %%ymm9, (%1)\n"
16914           "vmovupd %%ymm10, (%2)\n"
16915           "vmovupd %%ymm11, (%3)\n"
16916           "vmovupd %%ymm12, (%4)\n"
16917           "vmovupd %%ymm13, (%5)\n"
16918           "vmovupd %%ymm14, (%6)\n"
16919           "vmovupd %%ymm15, (%7)\n"
16920           :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
16921         );
16922       }
16923     }
16924     for (int j = 0; j < 2048; j += 2048) {
16925       for (int k = 0; k < 256; k += 4) {
16926         __asm__ volatile (
16927           "vmovupd (%0), %%ymm0\n"
16928           "vmovupd (%1), %%ymm1\n"
16929           "vmovupd (%2), %%ymm2\n"
16930           "vmovupd (%3), %%ymm3\n"
16931           "vmovupd (%4), %%ymm4\n"
16932           "vmovupd (%5), %%ymm5\n"
16933           "vmovupd (%6), %%ymm6\n"
16934           "vmovupd (%7), %%ymm7\n"
16935           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
16936           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
16937           "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
16938           "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
16939           "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
16940           "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
16941           "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
16942           "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
16943           "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
16944           "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
16945           "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
16946           "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
16947           "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
16948           "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
16949           "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
16950           "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
16951           "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
16952           "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
16953           "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
16954           "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
16955           "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
16956           "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
16957           "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
16958           "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
16959           "vmovupd %%ymm8, (%0)\n"
16960           "vmovupd %%ymm9, (%1)\n"
16961           "vmovupd %%ymm10, (%2)\n"
16962           "vmovupd %%ymm11, (%3)\n"
16963           "vmovupd %%ymm12, (%4)\n"
16964           "vmovupd %%ymm13, (%5)\n"
16965           "vmovupd %%ymm14, (%6)\n"
16966           "vmovupd %%ymm15, (%7)\n"
16967           :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
16968         );
16969       }
16970     }
16971     return;
16972   }
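  /* helper_double_26 uses a larger base case: depth == 11 covers 2048
     doubles with three passes (five in-register levels per 32-double tile,
     then strides 32-128, then strides 256-1024) before the usual eight-way
     recursion takes over at depths 14, 17, 20, ... */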
16973   if (depth == 14) {
16974     helper_double_26_recursive(buf + 0, 11);
16975     helper_double_26_recursive(buf + 2048, 11);
16976     helper_double_26_recursive(buf + 4096, 11);
16977     helper_double_26_recursive(buf + 6144, 11);
16978     helper_double_26_recursive(buf + 8192, 11);
16979     helper_double_26_recursive(buf + 10240, 11);
16980     helper_double_26_recursive(buf + 12288, 11);
16981     helper_double_26_recursive(buf + 14336, 11);
16982     for (int j = 0; j < 16384; j += 16384) {
16983       for (int k = 0; k < 2048; k += 4) {
16984         __asm__ volatile (
16985           "vmovupd (%0), %%ymm0\n"
16986           "vmovupd (%1), %%ymm1\n"
16987           "vmovupd (%2), %%ymm2\n"
16988           "vmovupd (%3), %%ymm3\n"
16989           "vmovupd (%4), %%ymm4\n"
16990           "vmovupd (%5), %%ymm5\n"
16991           "vmovupd (%6), %%ymm6\n"
16992           "vmovupd (%7), %%ymm7\n"
16993           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
16994           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
16995           "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
16996           "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
16997           "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
16998           "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
16999           "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
17000           "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
17001           "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
17002           "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
17003           "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
17004           "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
17005           "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
17006           "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
17007           "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
17008           "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
17009           "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
17010           "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
17011           "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
17012           "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
17013           "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
17014           "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
17015           "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
17016           "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
17017           "vmovupd %%ymm8, (%0)\n"
17018           "vmovupd %%ymm9, (%1)\n"
17019           "vmovupd %%ymm10, (%2)\n"
17020           "vmovupd %%ymm11, (%3)\n"
17021           "vmovupd %%ymm12, (%4)\n"
17022           "vmovupd %%ymm13, (%5)\n"
17023           "vmovupd %%ymm14, (%6)\n"
17024           "vmovupd %%ymm15, (%7)\n"
17025           :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
17026         );
17027       }
17028     }
17029     return;
17030   }
17031   if (depth == 17) {
17032     helper_double_26_recursive(buf + 0, 14);
17033     helper_double_26_recursive(buf + 16384, 14);
17034     helper_double_26_recursive(buf + 32768, 14);
17035     helper_double_26_recursive(buf + 49152, 14);
17036     helper_double_26_recursive(buf + 65536, 14);
17037     helper_double_26_recursive(buf + 81920, 14);
17038     helper_double_26_recursive(buf + 98304, 14);
17039     helper_double_26_recursive(buf + 114688, 14);
17040     for (int j = 0; j < 131072; j += 131072) {
17041       for (int k = 0; k < 16384; k += 4) {
17042         __asm__ volatile (
17043           "vmovupd (%0), %%ymm0\n"
17044           "vmovupd (%1), %%ymm1\n"
17045           "vmovupd (%2), %%ymm2\n"
17046           "vmovupd (%3), %%ymm3\n"
17047           "vmovupd (%4), %%ymm4\n"
17048           "vmovupd (%5), %%ymm5\n"
17049           "vmovupd (%6), %%ymm6\n"
17050           "vmovupd (%7), %%ymm7\n"
17051           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
17052           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
17053           "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
17054           "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
17055           "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
17056           "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
17057           "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
17058           "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
17059           "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
17060           "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
17061           "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
17062           "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
17063           "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
17064           "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
17065           "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
17066           "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
17067           "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
17068           "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
17069           "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
17070           "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
17071           "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
17072           "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
17073           "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
17074           "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
17075           "vmovupd %%ymm8, (%0)\n"
17076           "vmovupd %%ymm9, (%1)\n"
17077           "vmovupd %%ymm10, (%2)\n"
17078           "vmovupd %%ymm11, (%3)\n"
17079           "vmovupd %%ymm12, (%4)\n"
17080           "vmovupd %%ymm13, (%5)\n"
17081           "vmovupd %%ymm14, (%6)\n"
17082           "vmovupd %%ymm15, (%7)\n"
17083           :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152), "r"(buf + j + k + 65536), "r"(buf + j + k + 81920), "r"(buf + j + k + 98304), "r"(buf + j + k + 114688) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
17084         );
17085       }
17086     }
17087     return;
17088   }
17089   if (depth == 20) {
17090     helper_double_26_recursive(buf + 0, 17);
17091     helper_double_26_recursive(buf + 131072, 17);
17092     helper_double_26_recursive(buf + 262144, 17);
17093     helper_double_26_recursive(buf + 393216, 17);
17094     helper_double_26_recursive(buf + 524288, 17);
17095     helper_double_26_recursive(buf + 655360, 17);
17096     helper_double_26_recursive(buf + 786432, 17);
17097     helper_double_26_recursive(buf + 917504, 17);
17098     for (int j = 0; j < 1048576; j += 1048576) {
17099       for (int k = 0; k < 131072; k += 4) {
17100         __asm__ volatile (
17101           "vmovupd (%0), %%ymm0\n"
17102           "vmovupd (%1), %%ymm1\n"
17103           "vmovupd (%2), %%ymm2\n"
17104           "vmovupd (%3), %%ymm3\n"
17105           "vmovupd (%4), %%ymm4\n"
17106           "vmovupd (%5), %%ymm5\n"
17107           "vmovupd (%6), %%ymm6\n"
17108           "vmovupd (%7), %%ymm7\n"
17109           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
17110           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
17111           "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
17112           "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
17113           "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
17114           "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
17115           "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
17116           "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
17117           "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
17118           "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
17119           "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
17120           "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
17121           "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
17122           "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
17123           "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
17124           "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
17125           "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
17126           "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
17127           "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
17128           "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
17129           "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
17130           "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
17131           "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
17132           "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
17133           "vmovupd %%ymm8, (%0)\n"
17134           "vmovupd %%ymm9, (%1)\n"
17135           "vmovupd %%ymm10, (%2)\n"
17136           "vmovupd %%ymm11, (%3)\n"
17137           "vmovupd %%ymm12, (%4)\n"
17138           "vmovupd %%ymm13, (%5)\n"
17139           "vmovupd %%ymm14, (%6)\n"
17140           "vmovupd %%ymm15, (%7)\n"
17141           :: "r"(buf + j + k + 0), "r"(buf + j + k + 131072), "r"(buf + j + k + 262144), "r"(buf + j + k + 393216), "r"(buf + j + k + 524288), "r"(buf + j + k + 655360), "r"(buf + j + k + 786432), "r"(buf + j + k + 917504) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
17142         );
17143       }
17144     }
17145     return;
17146   }
17147   if (depth == 23) {
17148     helper_double_26_recursive(buf + 0, 20);
17149     helper_double_26_recursive(buf + 1048576, 20);
17150     helper_double_26_recursive(buf + 2097152, 20);
17151     helper_double_26_recursive(buf + 3145728, 20);
17152     helper_double_26_recursive(buf + 4194304, 20);
17153     helper_double_26_recursive(buf + 5242880, 20);
17154     helper_double_26_recursive(buf + 6291456, 20);
17155     helper_double_26_recursive(buf + 7340032, 20);
17156     for (int j = 0; j < 8388608; j += 8388608) {
17157       for (int k = 0; k < 1048576; k += 4) {
17158         __asm__ volatile (
17159           "vmovupd (%0), %%ymm0\n"
17160           "vmovupd (%1), %%ymm1\n"
17161           "vmovupd (%2), %%ymm2\n"
17162           "vmovupd (%3), %%ymm3\n"
17163           "vmovupd (%4), %%ymm4\n"
17164           "vmovupd (%5), %%ymm5\n"
17165           "vmovupd (%6), %%ymm6\n"
17166           "vmovupd (%7), %%ymm7\n"
17167           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
17168           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
17169           "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
17170           "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
17171           "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
17172           "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
17173           "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
17174           "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
17175           "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
17176           "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
17177           "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
17178           "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
17179           "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
17180           "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
17181           "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
17182           "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
17183           "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
17184           "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
17185           "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
17186           "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
17187           "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
17188           "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
17189           "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
17190           "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
17191           "vmovupd %%ymm8, (%0)\n"
17192           "vmovupd %%ymm9, (%1)\n"
17193           "vmovupd %%ymm10, (%2)\n"
17194           "vmovupd %%ymm11, (%3)\n"
17195           "vmovupd %%ymm12, (%4)\n"
17196           "vmovupd %%ymm13, (%5)\n"
17197           "vmovupd %%ymm14, (%6)\n"
17198           "vmovupd %%ymm15, (%7)\n"
17199           :: "r"(buf + j + k + 0), "r"(buf + j + k + 1048576), "r"(buf + j + k + 2097152), "r"(buf + j + k + 3145728), "r"(buf + j + k + 4194304), "r"(buf + j + k + 5242880), "r"(buf + j + k + 6291456), "r"(buf + j + k + 7340032) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
17200         );
17201       }
17202     }
17203     return;
17204   }
17205   if (depth == 26) {
17206     helper_double_26_recursive(buf + 0, 23);
17207     helper_double_26_recursive(buf + 8388608, 23);
17208     helper_double_26_recursive(buf + 16777216, 23);
17209     helper_double_26_recursive(buf + 25165824, 23);
17210     helper_double_26_recursive(buf + 33554432, 23);
17211     helper_double_26_recursive(buf + 41943040, 23);
17212     helper_double_26_recursive(buf + 50331648, 23);
17213     helper_double_26_recursive(buf + 58720256, 23);
17214     for (int j = 0; j < 67108864; j += 67108864) {
17215       for (int k = 0; k < 8388608; k += 4) {
17216         __asm__ volatile (
17217           "vmovupd (%0), %%ymm0\n"
17218           "vmovupd (%1), %%ymm1\n"
17219           "vmovupd (%2), %%ymm2\n"
17220           "vmovupd (%3), %%ymm3\n"
17221           "vmovupd (%4), %%ymm4\n"
17222           "vmovupd (%5), %%ymm5\n"
17223           "vmovupd (%6), %%ymm6\n"
17224           "vmovupd (%7), %%ymm7\n"
17225           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
17226           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
17227           "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
17228           "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
17229           "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
17230           "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
17231           "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
17232           "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
17233           "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
17234           "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
17235           "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
17236           "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
17237           "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
17238           "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
17239           "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
17240           "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
17241           "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
17242           "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
17243           "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
17244           "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
17245           "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
17246           "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
17247           "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
17248           "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
17249           "vmovupd %%ymm8, (%0)\n"
17250           "vmovupd %%ymm9, (%1)\n"
17251           "vmovupd %%ymm10, (%2)\n"
17252           "vmovupd %%ymm11, (%3)\n"
17253           "vmovupd %%ymm12, (%4)\n"
17254           "vmovupd %%ymm13, (%5)\n"
17255           "vmovupd %%ymm14, (%6)\n"
17256           "vmovupd %%ymm15, (%7)\n"
17257           :: "r"(buf + j + k + 0), "r"(buf + j + k + 8388608), "r"(buf + j + k + 16777216), "r"(buf + j + k + 25165824), "r"(buf + j + k + 33554432), "r"(buf + j + k + 41943040), "r"(buf + j + k + 50331648), "r"(buf + j + k + 58720256) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
17258         );
17259       }
17260     }
17261     return;
17262   }
17263 }
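/*
 * helper_double_26: in-place, unnormalized Walsh-Hadamard transform of a
 * buffer of 2^26 doubles.  A minimal (hypothetical) driver, not part of this
 * file, could look like:
 *
 *   double *buf = aligned_alloc(32, (1 << 26) * sizeof(double));
 *   // ... fill buf ...
 *   helper_double_26(buf);
 *
 * The vmovupd loads are unaligned, so 32-byte alignment is optional.
 */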
17264 void helper_double_26(double *buf);
17265 void helper_double_26(double *buf) {
17266   helper_double_26_recursive(buf, 26);
17267 }
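/*
 * Same construction for 2^27 doubles.  The depth-9 base case below handles a
 * 512-double block in three passes (strides 1-16, then 32-128, then 256);
 * deeper depths recurse in steps of three levels.
 */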
17268 void helper_double_27_recursive(double *buf, int depth);
17269 void helper_double_27_recursive(double *buf, int depth) {
17270   if (depth == 9) {
17271     for (int j = 0; j < 512; j += 32) {
17272       for (int k = 0; k < 4; k += 4) {
17273         __asm__ volatile (
17274           "vmovupd (%0), %%ymm0\n"
17275           "vmovupd (%1), %%ymm1\n"
17276           "vmovupd (%2), %%ymm2\n"
17277           "vmovupd (%3), %%ymm3\n"
17278           "vmovupd (%4), %%ymm4\n"
17279           "vmovupd (%5), %%ymm5\n"
17280           "vmovupd (%6), %%ymm6\n"
17281           "vmovupd (%7), %%ymm7\n"
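          /* Stride-1 butterflies within each 128-bit lane: vpermilpd $0
             duplicates the even double of each lane, $15 the odd one;
             negating the odd copy and applying vaddsubpd leaves
             (u + v, u - v) in every lane. */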
17282           "vpermilpd $0, %%ymm0, %%ymm8\n"
17283           "vpermilpd $15, %%ymm0, %%ymm9\n"
17284           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
17285           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
17286           "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n"
17287           "vpermilpd $0, %%ymm1, %%ymm8\n"
17288           "vpermilpd $15, %%ymm1, %%ymm9\n"
17289           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
17290           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
17291           "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n"
17292           "vpermilpd $0, %%ymm2, %%ymm8\n"
17293           "vpermilpd $15, %%ymm2, %%ymm9\n"
17294           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
17295           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
17296           "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n"
17297           "vpermilpd $0, %%ymm3, %%ymm8\n"
17298           "vpermilpd $15, %%ymm3, %%ymm9\n"
17299           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
17300           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
17301           "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n"
17302           "vpermilpd $0, %%ymm4, %%ymm8\n"
17303           "vpermilpd $15, %%ymm4, %%ymm9\n"
17304           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
17305           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
17306           "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n"
17307           "vpermilpd $0, %%ymm5, %%ymm8\n"
17308           "vpermilpd $15, %%ymm5, %%ymm9\n"
17309           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
17310           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
17311           "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n"
17312           "vpermilpd $0, %%ymm6, %%ymm8\n"
17313           "vpermilpd $15, %%ymm6, %%ymm9\n"
17314           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
17315           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
17316           "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n"
17317           "vpermilpd $0, %%ymm7, %%ymm8\n"
17318           "vpermilpd $15, %%ymm7, %%ymm9\n"
17319           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
17320           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
17321           "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n"
17322           "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n"
17323           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
17324           "vsubpd %%ymm0, %%ymm9, %%ymm10\n"
17325           "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n"
17326           "vaddpd %%ymm11, %%ymm8, %%ymm0\n"
17327           "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n"
17328           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
17329           "vsubpd %%ymm1, %%ymm9, %%ymm10\n"
17330           "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n"
17331           "vaddpd %%ymm11, %%ymm8, %%ymm1\n"
17332           "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n"
17333           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
17334           "vsubpd %%ymm2, %%ymm9, %%ymm10\n"
17335           "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n"
17336           "vaddpd %%ymm11, %%ymm8, %%ymm2\n"
17337           "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n"
17338           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
17339           "vsubpd %%ymm3, %%ymm9, %%ymm10\n"
17340           "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n"
17341           "vaddpd %%ymm11, %%ymm8, %%ymm3\n"
17342           "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n"
17343           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
17344           "vsubpd %%ymm4, %%ymm9, %%ymm10\n"
17345           "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n"
17346           "vaddpd %%ymm11, %%ymm8, %%ymm4\n"
17347           "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n"
17348           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
17349           "vsubpd %%ymm5, %%ymm9, %%ymm10\n"
17350           "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n"
17351           "vaddpd %%ymm11, %%ymm8, %%ymm5\n"
17352           "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n"
17353           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
17354           "vsubpd %%ymm6, %%ymm9, %%ymm10\n"
17355           "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n"
17356           "vaddpd %%ymm11, %%ymm8, %%ymm6\n"
17357           "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n"
17358           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
17359           "vsubpd %%ymm7, %%ymm9, %%ymm10\n"
17360           "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n"
17361           "vaddpd %%ymm11, %%ymm8, %%ymm7\n"
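          /* Lane-level (stride-2) butterflies done; the three rounds of
             paired vaddpd/vsubpd below combine ymm0..ymm7 at strides of 4,
             8 and 16 doubles -- a radix-8 butterfly stage. */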
17362           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
17363           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
17364           "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
17365           "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
17366           "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
17367           "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
17368           "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
17369           "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
17370           "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
17371           "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
17372           "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
17373           "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
17374           "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
17375           "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
17376           "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
17377           "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
17378           "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
17379           "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
17380           "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
17381           "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
17382           "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
17383           "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
17384           "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
17385           "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
17386           "vmovupd %%ymm8, (%0)\n"
17387           "vmovupd %%ymm9, (%1)\n"
17388           "vmovupd %%ymm10, (%2)\n"
17389           "vmovupd %%ymm11, (%3)\n"
17390           "vmovupd %%ymm12, (%4)\n"
17391           "vmovupd %%ymm13, (%5)\n"
17392           "vmovupd %%ymm14, (%6)\n"
17393           "vmovupd %%ymm15, (%7)\n"
17394           :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
17395         );
17396       }
17397     }
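    /* Second pass: radix-8 butterflies at strides 32, 64 and 128, merging
       the 32-double blocks from the pass above into 256-double blocks. */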
17398     for (int j = 0; j < 512; j += 256) {
17399       for (int k = 0; k < 32; k += 4) {
17400         __asm__ volatile (
17401           "vmovupd (%0), %%ymm0\n"
17402           "vmovupd (%1), %%ymm1\n"
17403           "vmovupd (%2), %%ymm2\n"
17404           "vmovupd (%3), %%ymm3\n"
17405           "vmovupd (%4), %%ymm4\n"
17406           "vmovupd (%5), %%ymm5\n"
17407           "vmovupd (%6), %%ymm6\n"
17408           "vmovupd (%7), %%ymm7\n"
17409           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
17410           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
17411           "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
17412           "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
17413           "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
17414           "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
17415           "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
17416           "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
17417           "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
17418           "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
17419           "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
17420           "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
17421           "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
17422           "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
17423           "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
17424           "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
17425           "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
17426           "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
17427           "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
17428           "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
17429           "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
17430           "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
17431           "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
17432           "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
17433           "vmovupd %%ymm8, (%0)\n"
17434           "vmovupd %%ymm9, (%1)\n"
17435           "vmovupd %%ymm10, (%2)\n"
17436           "vmovupd %%ymm11, (%3)\n"
17437           "vmovupd %%ymm12, (%4)\n"
17438           "vmovupd %%ymm13, (%5)\n"
17439           "vmovupd %%ymm14, (%6)\n"
17440           "vmovupd %%ymm15, (%7)\n"
17441           :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
17442         );
17443       }
17444     }
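    /* Final pass: one stride-256 add/sub butterfly completes the 512-double
       (2^9) block. */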
17445     for (int j = 0; j < 512; j += 512) {
17446       for (int k = 0; k < 256; k += 4) {
17447         __asm__ volatile (
17448           "vmovupd (%0), %%ymm0\n"
17449           "vmovupd (%1), %%ymm1\n"
17450           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
17451           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
17452           "vmovupd %%ymm8, (%0)\n"
17453           "vmovupd %%ymm9, (%1)\n"
17454           :: "r"(buf + j + k + 0), "r"(buf + j + k + 256) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
17455         );
17456       }
17457     }
17458     return;
17459   }
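  /* Depths 12, 15, 18, 21, 24 and 27 each recurse into eight sub-blocks and
     merge them with a single radix-8 pass, as in helper_double_26 above. */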
17460   if (depth == 12) {
17461     helper_double_27_recursive(buf + 0, 9);
17462     helper_double_27_recursive(buf + 512, 9);
17463     helper_double_27_recursive(buf + 1024, 9);
17464     helper_double_27_recursive(buf + 1536, 9);
17465     helper_double_27_recursive(buf + 2048, 9);
17466     helper_double_27_recursive(buf + 2560, 9);
17467     helper_double_27_recursive(buf + 3072, 9);
17468     helper_double_27_recursive(buf + 3584, 9);
17469     for (int j = 0; j < 4096; j += 4096) {
17470       for (int k = 0; k < 512; k += 4) {
17471         __asm__ volatile (
17472           "vmovupd (%0), %%ymm0\n"
17473           "vmovupd (%1), %%ymm1\n"
17474           "vmovupd (%2), %%ymm2\n"
17475           "vmovupd (%3), %%ymm3\n"
17476           "vmovupd (%4), %%ymm4\n"
17477           "vmovupd (%5), %%ymm5\n"
17478           "vmovupd (%6), %%ymm6\n"
17479           "vmovupd (%7), %%ymm7\n"
17480           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
17481           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
17482           "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
17483           "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
17484           "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
17485           "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
17486           "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
17487           "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
17488           "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
17489           "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
17490           "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
17491           "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
17492           "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
17493           "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
17494           "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
17495           "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
17496           "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
17497           "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
17498           "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
17499           "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
17500           "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
17501           "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
17502           "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
17503           "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
17504           "vmovupd %%ymm8, (%0)\n"
17505           "vmovupd %%ymm9, (%1)\n"
17506           "vmovupd %%ymm10, (%2)\n"
17507           "vmovupd %%ymm11, (%3)\n"
17508           "vmovupd %%ymm12, (%4)\n"
17509           "vmovupd %%ymm13, (%5)\n"
17510           "vmovupd %%ymm14, (%6)\n"
17511           "vmovupd %%ymm15, (%7)\n"
17512           :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
17513         );
17514       }
17515     }
17516     return;
17517   }
17518   if (depth == 15) {
17519     helper_double_27_recursive(buf + 0, 12);
17520     helper_double_27_recursive(buf + 4096, 12);
17521     helper_double_27_recursive(buf + 8192, 12);
17522     helper_double_27_recursive(buf + 12288, 12);
17523     helper_double_27_recursive(buf + 16384, 12);
17524     helper_double_27_recursive(buf + 20480, 12);
17525     helper_double_27_recursive(buf + 24576, 12);
17526     helper_double_27_recursive(buf + 28672, 12);
17527     for (int j = 0; j < 32768; j += 32768) {
17528       for (int k = 0; k < 4096; k += 4) {
17529         __asm__ volatile (
17530           "vmovupd (%0), %%ymm0\n"
17531           "vmovupd (%1), %%ymm1\n"
17532           "vmovupd (%2), %%ymm2\n"
17533           "vmovupd (%3), %%ymm3\n"
17534           "vmovupd (%4), %%ymm4\n"
17535           "vmovupd (%5), %%ymm5\n"
17536           "vmovupd (%6), %%ymm6\n"
17537           "vmovupd (%7), %%ymm7\n"
17538           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
17539           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
17540           "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
17541           "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
17542           "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
17543           "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
17544           "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
17545           "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
17546           "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
17547           "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
17548           "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
17549           "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
17550           "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
17551           "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
17552           "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
17553           "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
17554           "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
17555           "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
17556           "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
17557           "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
17558           "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
17559           "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
17560           "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
17561           "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
17562           "vmovupd %%ymm8, (%0)\n"
17563           "vmovupd %%ymm9, (%1)\n"
17564           "vmovupd %%ymm10, (%2)\n"
17565           "vmovupd %%ymm11, (%3)\n"
17566           "vmovupd %%ymm12, (%4)\n"
17567           "vmovupd %%ymm13, (%5)\n"
17568           "vmovupd %%ymm14, (%6)\n"
17569           "vmovupd %%ymm15, (%7)\n"
17570           :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
17571         );
17572       }
17573     }
17574     return;
17575   }
17576   if (depth == 18) {
17577     helper_double_27_recursive(buf + 0, 15);
17578     helper_double_27_recursive(buf + 32768, 15);
17579     helper_double_27_recursive(buf + 65536, 15);
17580     helper_double_27_recursive(buf + 98304, 15);
17581     helper_double_27_recursive(buf + 131072, 15);
17582     helper_double_27_recursive(buf + 163840, 15);
17583     helper_double_27_recursive(buf + 196608, 15);
17584     helper_double_27_recursive(buf + 229376, 15);
17585     for (int j = 0; j < 262144; j += 262144) {
17586       for (int k = 0; k < 32768; k += 4) {
17587         __asm__ volatile (
17588           "vmovupd (%0), %%ymm0\n"
17589           "vmovupd (%1), %%ymm1\n"
17590           "vmovupd (%2), %%ymm2\n"
17591           "vmovupd (%3), %%ymm3\n"
17592           "vmovupd (%4), %%ymm4\n"
17593           "vmovupd (%5), %%ymm5\n"
17594           "vmovupd (%6), %%ymm6\n"
17595           "vmovupd (%7), %%ymm7\n"
17596           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
17597           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
17598           "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
17599           "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
17600           "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
17601           "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
17602           "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
17603           "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
17604           "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
17605           "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
17606           "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
17607           "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
17608           "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
17609           "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
17610           "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
17611           "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
17612           "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
17613           "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
17614           "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
17615           "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
17616           "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
17617           "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
17618           "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
17619           "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
17620           "vmovupd %%ymm8, (%0)\n"
17621           "vmovupd %%ymm9, (%1)\n"
17622           "vmovupd %%ymm10, (%2)\n"
17623           "vmovupd %%ymm11, (%3)\n"
17624           "vmovupd %%ymm12, (%4)\n"
17625           "vmovupd %%ymm13, (%5)\n"
17626           "vmovupd %%ymm14, (%6)\n"
17627           "vmovupd %%ymm15, (%7)\n"
17628           :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
17629         );
17630       }
17631     }
17632     return;
17633   }
17634   if (depth == 21) {
17635     helper_double_27_recursive(buf + 0, 18);
17636     helper_double_27_recursive(buf + 262144, 18);
17637     helper_double_27_recursive(buf + 524288, 18);
17638     helper_double_27_recursive(buf + 786432, 18);
17639     helper_double_27_recursive(buf + 1048576, 18);
17640     helper_double_27_recursive(buf + 1310720, 18);
17641     helper_double_27_recursive(buf + 1572864, 18);
17642     helper_double_27_recursive(buf + 1835008, 18);
17643     for (int j = 0; j < 2097152; j += 2097152) {
17644       for (int k = 0; k < 262144; k += 4) {
17645         __asm__ volatile (
17646           "vmovupd (%0), %%ymm0\n"
17647           "vmovupd (%1), %%ymm1\n"
17648           "vmovupd (%2), %%ymm2\n"
17649           "vmovupd (%3), %%ymm3\n"
17650           "vmovupd (%4), %%ymm4\n"
17651           "vmovupd (%5), %%ymm5\n"
17652           "vmovupd (%6), %%ymm6\n"
17653           "vmovupd (%7), %%ymm7\n"
17654           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
17655           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
17656           "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
17657           "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
17658           "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
17659           "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
17660           "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
17661           "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
17662           "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
17663           "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
17664           "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
17665           "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
17666           "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
17667           "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
17668           "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
17669           "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
17670           "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
17671           "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
17672           "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
17673           "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
17674           "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
17675           "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
17676           "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
17677           "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
17678           "vmovupd %%ymm8, (%0)\n"
17679           "vmovupd %%ymm9, (%1)\n"
17680           "vmovupd %%ymm10, (%2)\n"
17681           "vmovupd %%ymm11, (%3)\n"
17682           "vmovupd %%ymm12, (%4)\n"
17683           "vmovupd %%ymm13, (%5)\n"
17684           "vmovupd %%ymm14, (%6)\n"
17685           "vmovupd %%ymm15, (%7)\n"
17686           :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1310720), "r"(buf + j + k + 1572864), "r"(buf + j + k + 1835008) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
17687         );
17688       }
17689     }
17690     return;
17691   }
17692   if (depth == 24) {
17693     helper_double_27_recursive(buf + 0, 21);
17694     helper_double_27_recursive(buf + 2097152, 21);
17695     helper_double_27_recursive(buf + 4194304, 21);
17696     helper_double_27_recursive(buf + 6291456, 21);
17697     helper_double_27_recursive(buf + 8388608, 21);
17698     helper_double_27_recursive(buf + 10485760, 21);
17699     helper_double_27_recursive(buf + 12582912, 21);
17700     helper_double_27_recursive(buf + 14680064, 21);
17701     for (int j = 0; j < 16777216; j += 16777216) {
17702       for (int k = 0; k < 2097152; k += 4) {
17703         __asm__ volatile (
17704           "vmovupd (%0), %%ymm0\n"
17705           "vmovupd (%1), %%ymm1\n"
17706           "vmovupd (%2), %%ymm2\n"
17707           "vmovupd (%3), %%ymm3\n"
17708           "vmovupd (%4), %%ymm4\n"
17709           "vmovupd (%5), %%ymm5\n"
17710           "vmovupd (%6), %%ymm6\n"
17711           "vmovupd (%7), %%ymm7\n"
17712           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
17713           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
17714           "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
17715           "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
17716           "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
17717           "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
17718           "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
17719           "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
17720           "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
17721           "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
17722           "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
17723           "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
17724           "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
17725           "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
17726           "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
17727           "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
17728           "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
17729           "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
17730           "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
17731           "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
17732           "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
17733           "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
17734           "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
17735           "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
17736           "vmovupd %%ymm8, (%0)\n"
17737           "vmovupd %%ymm9, (%1)\n"
17738           "vmovupd %%ymm10, (%2)\n"
17739           "vmovupd %%ymm11, (%3)\n"
17740           "vmovupd %%ymm12, (%4)\n"
17741           "vmovupd %%ymm13, (%5)\n"
17742           "vmovupd %%ymm14, (%6)\n"
17743           "vmovupd %%ymm15, (%7)\n"
17744           :: "r"(buf + j + k + 0), "r"(buf + j + k + 2097152), "r"(buf + j + k + 4194304), "r"(buf + j + k + 6291456), "r"(buf + j + k + 8388608), "r"(buf + j + k + 10485760), "r"(buf + j + k + 12582912), "r"(buf + j + k + 14680064) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
17745         );
17746       }
17747     }
17748     return;
17749   }
17750   if (depth == 27) {
17751     helper_double_27_recursive(buf + 0, 24);
17752     helper_double_27_recursive(buf + 16777216, 24);
17753     helper_double_27_recursive(buf + 33554432, 24);
17754     helper_double_27_recursive(buf + 50331648, 24);
17755     helper_double_27_recursive(buf + 67108864, 24);
17756     helper_double_27_recursive(buf + 83886080, 24);
17757     helper_double_27_recursive(buf + 100663296, 24);
17758     helper_double_27_recursive(buf + 117440512, 24);
17759     for (int j = 0; j < 134217728; j += 134217728) {
17760       for (int k = 0; k < 16777216; k += 4) {
17761         __asm__ volatile (
17762           "vmovupd (%0), %%ymm0\n"
17763           "vmovupd (%1), %%ymm1\n"
17764           "vmovupd (%2), %%ymm2\n"
17765           "vmovupd (%3), %%ymm3\n"
17766           "vmovupd (%4), %%ymm4\n"
17767           "vmovupd (%5), %%ymm5\n"
17768           "vmovupd (%6), %%ymm6\n"
17769           "vmovupd (%7), %%ymm7\n"
17770           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
17771           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
17772           "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
17773           "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
17774           "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
17775           "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
17776           "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
17777           "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
17778           "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
17779           "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
17780           "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
17781           "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
17782           "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
17783           "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
17784           "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
17785           "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
17786           "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
17787           "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
17788           "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
17789           "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
17790           "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
17791           "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
17792           "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
17793           "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
17794           "vmovupd %%ymm8, (%0)\n"
17795           "vmovupd %%ymm9, (%1)\n"
17796           "vmovupd %%ymm10, (%2)\n"
17797           "vmovupd %%ymm11, (%3)\n"
17798           "vmovupd %%ymm12, (%4)\n"
17799           "vmovupd %%ymm13, (%5)\n"
17800           "vmovupd %%ymm14, (%6)\n"
17801           "vmovupd %%ymm15, (%7)\n"
17802           :: "r"(buf + j + k + 0), "r"(buf + j + k + 16777216), "r"(buf + j + k + 33554432), "r"(buf + j + k + 50331648), "r"(buf + j + k + 67108864), "r"(buf + j + k + 83886080), "r"(buf + j + k + 100663296), "r"(buf + j + k + 117440512) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
17803         );
17804       }
17805     }
17806     return;
17807   }
17808 }
17809 void helper_double_27(double *buf);
17810 void helper_double_27(double *buf) {
17811   helper_double_27_recursive(buf, 27);
17812 }
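/*
 * Analogous recursion for 2^28 doubles, with the in-register base case at
 * depth 11 (2048-double blocks).
 */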
17813 void helper_double_28_recursive(double *buf, int depth);
17814 void helper_double_28_recursive(double *buf, int depth) {
17815   if (depth == 11) {
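    /* Depth-11 base case: strides 1-16 in the first pass (five levels),
       then 32-128 (three levels), then 256-1024 (three levels). */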
17816     for (int j = 0; j < 2048; j += 32) {
17817       for (int k = 0; k < 4; k += 4) {
17818         __asm__ volatile (
17819           "vmovupd (%0), %%ymm0\n"
17820           "vmovupd (%1), %%ymm1\n"
17821           "vmovupd (%2), %%ymm2\n"
17822           "vmovupd (%3), %%ymm3\n"
17823           "vmovupd (%4), %%ymm4\n"
17824           "vmovupd (%5), %%ymm5\n"
17825           "vmovupd (%6), %%ymm6\n"
17826           "vmovupd (%7), %%ymm7\n"
17827           "vpermilpd $0, %%ymm0, %%ymm8\n"
17828           "vpermilpd $15, %%ymm0, %%ymm9\n"
17829           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
17830           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
17831           "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n"
17832           "vpermilpd $0, %%ymm1, %%ymm8\n"
17833           "vpermilpd $15, %%ymm1, %%ymm9\n"
17834           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
17835           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
17836           "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n"
17837           "vpermilpd $0, %%ymm2, %%ymm8\n"
17838           "vpermilpd $15, %%ymm2, %%ymm9\n"
17839           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
17840           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
17841           "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n"
17842           "vpermilpd $0, %%ymm3, %%ymm8\n"
17843           "vpermilpd $15, %%ymm3, %%ymm9\n"
17844           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
17845           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
17846           "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n"
17847           "vpermilpd $0, %%ymm4, %%ymm8\n"
17848           "vpermilpd $15, %%ymm4, %%ymm9\n"
17849           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
17850           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
17851           "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n"
17852           "vpermilpd $0, %%ymm5, %%ymm8\n"
17853           "vpermilpd $15, %%ymm5, %%ymm9\n"
17854           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
17855           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
17856           "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n"
17857           "vpermilpd $0, %%ymm6, %%ymm8\n"
17858           "vpermilpd $15, %%ymm6, %%ymm9\n"
17859           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
17860           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
17861           "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n"
17862           "vpermilpd $0, %%ymm7, %%ymm8\n"
17863           "vpermilpd $15, %%ymm7, %%ymm9\n"
17864           "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
17865           "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
17866           "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n"
17867           "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n"
17868           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
17869           "vsubpd %%ymm0, %%ymm9, %%ymm10\n"
17870           "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n"
17871           "vaddpd %%ymm11, %%ymm8, %%ymm0\n"
17872           "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n"
17873           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
17874           "vsubpd %%ymm1, %%ymm9, %%ymm10\n"
17875           "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n"
17876           "vaddpd %%ymm11, %%ymm8, %%ymm1\n"
17877           "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n"
17878           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
17879           "vsubpd %%ymm2, %%ymm9, %%ymm10\n"
17880           "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n"
17881           "vaddpd %%ymm11, %%ymm8, %%ymm2\n"
17882           "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n"
17883           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
17884           "vsubpd %%ymm3, %%ymm9, %%ymm10\n"
17885           "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n"
17886           "vaddpd %%ymm11, %%ymm8, %%ymm3\n"
17887           "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n"
17888           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
17889           "vsubpd %%ymm4, %%ymm9, %%ymm10\n"
17890           "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n"
17891           "vaddpd %%ymm11, %%ymm8, %%ymm4\n"
17892           "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n"
17893           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
17894           "vsubpd %%ymm5, %%ymm9, %%ymm10\n"
17895           "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n"
17896           "vaddpd %%ymm11, %%ymm8, %%ymm5\n"
17897           "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n"
17898           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
17899           "vsubpd %%ymm6, %%ymm9, %%ymm10\n"
17900           "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n"
17901           "vaddpd %%ymm11, %%ymm8, %%ymm6\n"
17902           "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n"
17903           "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
17904           "vsubpd %%ymm7, %%ymm9, %%ymm10\n"
17905           "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n"
17906           "vaddpd %%ymm11, %%ymm8, %%ymm7\n"
17907           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
17908           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
17909           "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
17910           "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
17911           "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
17912           "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
17913           "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
17914           "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
17915           "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
17916           "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
17917           "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
17918           "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
17919           "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
17920           "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
17921           "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
17922           "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
17923           "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
17924           "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
17925           "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
17926           "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
17927           "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
17928           "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
17929           "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
17930           "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
17931           "vmovupd %%ymm8, (%0)\n"
17932           "vmovupd %%ymm9, (%1)\n"
17933           "vmovupd %%ymm10, (%2)\n"
17934           "vmovupd %%ymm11, (%3)\n"
17935           "vmovupd %%ymm12, (%4)\n"
17936           "vmovupd %%ymm13, (%5)\n"
17937           "vmovupd %%ymm14, (%6)\n"
17938           "vmovupd %%ymm15, (%7)\n"
17939           :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
17940         );
17941       }
17942     }
17943     for (int j = 0; j < 2048; j += 256) {
17944       for (int k = 0; k < 32; k += 4) {
17945         __asm__ volatile (
17946           "vmovupd (%0), %%ymm0\n"
17947           "vmovupd (%1), %%ymm1\n"
17948           "vmovupd (%2), %%ymm2\n"
17949           "vmovupd (%3), %%ymm3\n"
17950           "vmovupd (%4), %%ymm4\n"
17951           "vmovupd (%5), %%ymm5\n"
17952           "vmovupd (%6), %%ymm6\n"
17953           "vmovupd (%7), %%ymm7\n"
17954           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
17955           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
17956           "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
17957           "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
17958           "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
17959           "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
17960           "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
17961           "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
17962           "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
17963           "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
17964           "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
17965           "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
17966           "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
17967           "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
17968           "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
17969           "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
17970           "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
17971           "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
17972           "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
17973           "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
17974           "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
17975           "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
17976           "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
17977           "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
17978           "vmovupd %%ymm8, (%0)\n"
17979           "vmovupd %%ymm9, (%1)\n"
17980           "vmovupd %%ymm10, (%2)\n"
17981           "vmovupd %%ymm11, (%3)\n"
17982           "vmovupd %%ymm12, (%4)\n"
17983           "vmovupd %%ymm13, (%5)\n"
17984           "vmovupd %%ymm14, (%6)\n"
17985           "vmovupd %%ymm15, (%7)\n"
17986           :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
17987         );
17988       }
17989     }
17990     for (int j = 0; j < 2048; j += 2048) {
17991       for (int k = 0; k < 256; k += 4) {
17992         __asm__ volatile (
17993           "vmovupd (%0), %%ymm0\n"
17994           "vmovupd (%1), %%ymm1\n"
17995           "vmovupd (%2), %%ymm2\n"
17996           "vmovupd (%3), %%ymm3\n"
17997           "vmovupd (%4), %%ymm4\n"
17998           "vmovupd (%5), %%ymm5\n"
17999           "vmovupd (%6), %%ymm6\n"
18000           "vmovupd (%7), %%ymm7\n"
18001           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
18002           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
18003           "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
18004           "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
18005           "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
18006           "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
18007           "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
18008           "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
18009           "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
18010           "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
18011           "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
18012           "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
18013           "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
18014           "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
18015           "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
18016           "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
18017           "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
18018           "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
18019           "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
18020           "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
18021           "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
18022           "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
18023           "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
18024           "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
18025           "vmovupd %%ymm8, (%0)\n"
18026           "vmovupd %%ymm9, (%1)\n"
18027           "vmovupd %%ymm10, (%2)\n"
18028           "vmovupd %%ymm11, (%3)\n"
18029           "vmovupd %%ymm12, (%4)\n"
18030           "vmovupd %%ymm13, (%5)\n"
18031           "vmovupd %%ymm14, (%6)\n"
18032           "vmovupd %%ymm15, (%7)\n"
18033           :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
18034         );
18035       }
18036     }
18037     return;
18038   }
18039   if (depth == 14) {
18040     helper_double_28_recursive(buf + 0, 11);
18041     helper_double_28_recursive(buf + 2048, 11);
18042     helper_double_28_recursive(buf + 4096, 11);
18043     helper_double_28_recursive(buf + 6144, 11);
18044     helper_double_28_recursive(buf + 8192, 11);
18045     helper_double_28_recursive(buf + 10240, 11);
18046     helper_double_28_recursive(buf + 12288, 11);
18047     helper_double_28_recursive(buf + 14336, 11);
18048     for (int j = 0; j < 16384; j += 16384) {
18049       for (int k = 0; k < 2048; k += 4) {
18050         __asm__ volatile (
18051           "vmovupd (%0), %%ymm0\n"
18052           "vmovupd (%1), %%ymm1\n"
18053           "vmovupd (%2), %%ymm2\n"
18054           "vmovupd (%3), %%ymm3\n"
18055           "vmovupd (%4), %%ymm4\n"
18056           "vmovupd (%5), %%ymm5\n"
18057           "vmovupd (%6), %%ymm6\n"
18058           "vmovupd (%7), %%ymm7\n"
18059           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
18060           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
18061           "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
18062           "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
18063           "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
18064           "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
18065           "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
18066           "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
18067           "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
18068           "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
18069           "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
18070           "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
18071           "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
18072           "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
18073           "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
18074           "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
18075           "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
18076           "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
18077           "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
18078           "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
18079           "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
18080           "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
18081           "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
18082           "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
18083           "vmovupd %%ymm8, (%0)\n"
18084           "vmovupd %%ymm9, (%1)\n"
18085           "vmovupd %%ymm10, (%2)\n"
18086           "vmovupd %%ymm11, (%3)\n"
18087           "vmovupd %%ymm12, (%4)\n"
18088           "vmovupd %%ymm13, (%5)\n"
18089           "vmovupd %%ymm14, (%6)\n"
18090           "vmovupd %%ymm15, (%7)\n"
18091           :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
18092         );
18093       }
18094     }
18095     return;
18096   }
  if (depth == 17) {
    helper_double_28_recursive(buf + 0, 14);
    helper_double_28_recursive(buf + 16384, 14);
    helper_double_28_recursive(buf + 32768, 14);
    helper_double_28_recursive(buf + 49152, 14);
    helper_double_28_recursive(buf + 65536, 14);
    helper_double_28_recursive(buf + 81920, 14);
    helper_double_28_recursive(buf + 98304, 14);
    helper_double_28_recursive(buf + 114688, 14);
    for (int j = 0; j < 131072; j += 131072) {
      for (int k = 0; k < 16384; k += 4) {
        __asm__ volatile (
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vmovupd (%2), %%ymm2\n"
          "vmovupd (%3), %%ymm3\n"
          "vmovupd (%4), %%ymm4\n"
          "vmovupd (%5), %%ymm5\n"
          "vmovupd (%6), %%ymm6\n"
          "vmovupd (%7), %%ymm7\n"
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
          "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
          "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
          "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
          "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
          "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
          "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
          "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
          "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
          "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
          "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
          "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
          "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
          "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
          "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
          "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
          "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
          "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
          "vmovupd %%ymm8, (%0)\n"
          "vmovupd %%ymm9, (%1)\n"
          "vmovupd %%ymm10, (%2)\n"
          "vmovupd %%ymm11, (%3)\n"
          "vmovupd %%ymm12, (%4)\n"
          "vmovupd %%ymm13, (%5)\n"
          "vmovupd %%ymm14, (%6)\n"
          "vmovupd %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152), "r"(buf + j + k + 65536), "r"(buf + j + k + 81920), "r"(buf + j + k + 98304), "r"(buf + j + k + 114688) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 20) {
    helper_double_28_recursive(buf + 0, 17);
    helper_double_28_recursive(buf + 131072, 17);
    helper_double_28_recursive(buf + 262144, 17);
    helper_double_28_recursive(buf + 393216, 17);
    helper_double_28_recursive(buf + 524288, 17);
    helper_double_28_recursive(buf + 655360, 17);
    helper_double_28_recursive(buf + 786432, 17);
    helper_double_28_recursive(buf + 917504, 17);
    for (int j = 0; j < 1048576; j += 1048576) {
      for (int k = 0; k < 131072; k += 4) {
        __asm__ volatile (
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vmovupd (%2), %%ymm2\n"
          "vmovupd (%3), %%ymm3\n"
          "vmovupd (%4), %%ymm4\n"
          "vmovupd (%5), %%ymm5\n"
          "vmovupd (%6), %%ymm6\n"
          "vmovupd (%7), %%ymm7\n"
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
          "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
          "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
          "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
          "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
          "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
          "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
          "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
          "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
          "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
          "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
          "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
          "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
          "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
          "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
          "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
          "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
          "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
          "vmovupd %%ymm8, (%0)\n"
          "vmovupd %%ymm9, (%1)\n"
          "vmovupd %%ymm10, (%2)\n"
          "vmovupd %%ymm11, (%3)\n"
          "vmovupd %%ymm12, (%4)\n"
          "vmovupd %%ymm13, (%5)\n"
          "vmovupd %%ymm14, (%6)\n"
          "vmovupd %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 131072), "r"(buf + j + k + 262144), "r"(buf + j + k + 393216), "r"(buf + j + k + 524288), "r"(buf + j + k + 655360), "r"(buf + j + k + 786432), "r"(buf + j + k + 917504) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 23) {
    helper_double_28_recursive(buf + 0, 20);
    helper_double_28_recursive(buf + 1048576, 20);
    helper_double_28_recursive(buf + 2097152, 20);
    helper_double_28_recursive(buf + 3145728, 20);
    helper_double_28_recursive(buf + 4194304, 20);
    helper_double_28_recursive(buf + 5242880, 20);
    helper_double_28_recursive(buf + 6291456, 20);
    helper_double_28_recursive(buf + 7340032, 20);
    for (int j = 0; j < 8388608; j += 8388608) {
      for (int k = 0; k < 1048576; k += 4) {
        __asm__ volatile (
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vmovupd (%2), %%ymm2\n"
          "vmovupd (%3), %%ymm3\n"
          "vmovupd (%4), %%ymm4\n"
          "vmovupd (%5), %%ymm5\n"
          "vmovupd (%6), %%ymm6\n"
          "vmovupd (%7), %%ymm7\n"
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
          "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
          "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
          "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
          "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
          "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
          "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
          "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
          "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
          "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
          "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
          "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
          "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
          "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
          "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
          "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
          "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
          "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
          "vmovupd %%ymm8, (%0)\n"
          "vmovupd %%ymm9, (%1)\n"
          "vmovupd %%ymm10, (%2)\n"
          "vmovupd %%ymm11, (%3)\n"
          "vmovupd %%ymm12, (%4)\n"
          "vmovupd %%ymm13, (%5)\n"
          "vmovupd %%ymm14, (%6)\n"
          "vmovupd %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 1048576), "r"(buf + j + k + 2097152), "r"(buf + j + k + 3145728), "r"(buf + j + k + 4194304), "r"(buf + j + k + 5242880), "r"(buf + j + k + 6291456), "r"(buf + j + k + 7340032) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 26) {
    helper_double_28_recursive(buf + 0, 23);
    helper_double_28_recursive(buf + 8388608, 23);
    helper_double_28_recursive(buf + 16777216, 23);
    helper_double_28_recursive(buf + 25165824, 23);
    helper_double_28_recursive(buf + 33554432, 23);
    helper_double_28_recursive(buf + 41943040, 23);
    helper_double_28_recursive(buf + 50331648, 23);
    helper_double_28_recursive(buf + 58720256, 23);
    for (int j = 0; j < 67108864; j += 67108864) {
      for (int k = 0; k < 8388608; k += 4) {
        __asm__ volatile (
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vmovupd (%2), %%ymm2\n"
          "vmovupd (%3), %%ymm3\n"
          "vmovupd (%4), %%ymm4\n"
          "vmovupd (%5), %%ymm5\n"
          "vmovupd (%6), %%ymm6\n"
          "vmovupd (%7), %%ymm7\n"
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
          "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
          "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
          "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
          "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
          "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
          "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
          "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
          "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
          "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
          "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
          "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
          "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
          "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
          "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
          "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
          "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
          "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
          "vmovupd %%ymm8, (%0)\n"
          "vmovupd %%ymm9, (%1)\n"
          "vmovupd %%ymm10, (%2)\n"
          "vmovupd %%ymm11, (%3)\n"
          "vmovupd %%ymm12, (%4)\n"
          "vmovupd %%ymm13, (%5)\n"
          "vmovupd %%ymm14, (%6)\n"
          "vmovupd %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 8388608), "r"(buf + j + k + 16777216), "r"(buf + j + k + 25165824), "r"(buf + j + k + 33554432), "r"(buf + j + k + 41943040), "r"(buf + j + k + 50331648), "r"(buf + j + k + 58720256) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
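  /*
   * 28 = 26 + 2: the final stage recurses on four quarters rather than
   * eight eighths, and combines them with a radix-4 pass (four pointers,
   * two butterfly layers) instead of the radix-8 pass used above.
   */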
  if (depth == 28) {
    helper_double_28_recursive(buf + 0, 26);
    helper_double_28_recursive(buf + 67108864, 26);
    helper_double_28_recursive(buf + 134217728, 26);
    helper_double_28_recursive(buf + 201326592, 26);
    for (int j = 0; j < 268435456; j += 268435456) {
      for (int k = 0; k < 67108864; k += 4) {
        __asm__ volatile (
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vmovupd (%2), %%ymm2\n"
          "vmovupd (%3), %%ymm3\n"
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
          "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
          "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
          "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
          "vmovupd %%ymm0, (%0)\n"
          "vmovupd %%ymm1, (%1)\n"
          "vmovupd %%ymm2, (%2)\n"
          "vmovupd %%ymm3, (%3)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 67108864), "r"(buf + j + k + 134217728), "r"(buf + j + k + 201326592) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
}
void helper_double_28(double *buf);
void helper_double_28(double *buf) {
  helper_double_28_recursive(buf, 28);
}
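/*
 * For reference: up to operation ordering, each helper_double_N computes the
 * same in-place, unnormalized Walsh-Hadamard transform as the plain scalar
 * recursion sketched below. This is an illustrative sketch only -- the name
 * fht_double_scalar_sketch is hypothetical and nothing in this file calls it.
 */
static inline void fht_double_scalar_sketch(double *buf, int log_n) {
  long n = 1L << log_n;
  /* One pass per level: butterflies between elements `stride` apart. */
  for (long stride = 1; stride < n; stride <<= 1) {
    for (long j = 0; j < n; j += stride << 1) {
      for (long k = j; k < j + stride; ++k) {
        double u = buf[k];
        double v = buf[k + stride];
        buf[k] = u + v;          /* sum lane of the butterfly */
        buf[k + stride] = u - v; /* difference lane of the butterfly */
      }
    }
  }
}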
void helper_double_29_recursive(double *buf, int depth);
void helper_double_29_recursive(double *buf, int depth) {
  if (depth == 11) {
    for (int j = 0; j < 2048; j += 32) {
      for (int k = 0; k < 4; k += 4) {
        __asm__ volatile (
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vmovupd (%2), %%ymm2\n"
          "vmovupd (%3), %%ymm3\n"
          "vmovupd (%4), %%ymm4\n"
          "vmovupd (%5), %%ymm5\n"
          "vmovupd (%6), %%ymm6\n"
          "vmovupd (%7), %%ymm7\n"
          "vpermilpd $0, %%ymm0, %%ymm8\n"
          "vpermilpd $15, %%ymm0, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n"
          "vpermilpd $0, %%ymm1, %%ymm8\n"
          "vpermilpd $15, %%ymm1, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n"
          "vpermilpd $0, %%ymm2, %%ymm8\n"
          "vpermilpd $15, %%ymm2, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n"
          "vpermilpd $0, %%ymm3, %%ymm8\n"
          "vpermilpd $15, %%ymm3, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n"
          "vpermilpd $0, %%ymm4, %%ymm8\n"
          "vpermilpd $15, %%ymm4, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n"
          "vpermilpd $0, %%ymm5, %%ymm8\n"
          "vpermilpd $15, %%ymm5, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n"
          "vpermilpd $0, %%ymm6, %%ymm8\n"
          "vpermilpd $15, %%ymm6, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n"
          "vpermilpd $0, %%ymm7, %%ymm8\n"
          "vpermilpd $15, %%ymm7, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n"
          "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm0, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm0\n"
          "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm1, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm1\n"
          "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm2, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm2\n"
          "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm3, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm3\n"
          "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm4, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm4\n"
          "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm5, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm5\n"
          "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm6, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm6\n"
          "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm7, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm7\n"
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
          "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
          "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
          "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
          "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
          "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
          "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
          "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
          "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
          "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
          "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
          "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
          "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
          "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
          "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
          "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
          "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
          "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
          "vmovupd %%ymm8, (%0)\n"
          "vmovupd %%ymm9, (%1)\n"
          "vmovupd %%ymm10, (%2)\n"
          "vmovupd %%ymm11, (%3)\n"
          "vmovupd %%ymm12, (%4)\n"
          "vmovupd %%ymm13, (%5)\n"
          "vmovupd %%ymm14, (%6)\n"
          "vmovupd %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    for (int j = 0; j < 2048; j += 256) {
      for (int k = 0; k < 32; k += 4) {
        __asm__ volatile (
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vmovupd (%2), %%ymm2\n"
          "vmovupd (%3), %%ymm3\n"
          "vmovupd (%4), %%ymm4\n"
          "vmovupd (%5), %%ymm5\n"
          "vmovupd (%6), %%ymm6\n"
          "vmovupd (%7), %%ymm7\n"
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
          "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
          "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
          "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
          "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
          "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
          "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
          "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
          "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
          "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
          "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
          "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
          "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
          "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
          "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
          "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
          "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
          "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
          "vmovupd %%ymm8, (%0)\n"
          "vmovupd %%ymm9, (%1)\n"
          "vmovupd %%ymm10, (%2)\n"
          "vmovupd %%ymm11, (%3)\n"
          "vmovupd %%ymm12, (%4)\n"
          "vmovupd %%ymm13, (%5)\n"
          "vmovupd %%ymm14, (%6)\n"
          "vmovupd %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    for (int j = 0; j < 2048; j += 2048) {
      for (int k = 0; k < 256; k += 4) {
        __asm__ volatile (
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vmovupd (%2), %%ymm2\n"
          "vmovupd (%3), %%ymm3\n"
          "vmovupd (%4), %%ymm4\n"
          "vmovupd (%5), %%ymm5\n"
          "vmovupd (%6), %%ymm6\n"
          "vmovupd (%7), %%ymm7\n"
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
          "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
          "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
          "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
          "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
          "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
          "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
          "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
          "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
          "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
          "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
          "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
          "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
          "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
          "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
          "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
          "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
          "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
          "vmovupd %%ymm8, (%0)\n"
          "vmovupd %%ymm9, (%1)\n"
          "vmovupd %%ymm10, (%2)\n"
          "vmovupd %%ymm11, (%3)\n"
          "vmovupd %%ymm12, (%4)\n"
          "vmovupd %%ymm13, (%5)\n"
          "vmovupd %%ymm14, (%6)\n"
          "vmovupd %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
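  /*
   * The depth == 11 base case above covers eleven levels in three passes:
   * the first loop does strides 1 and 2 inside each ymm register (the
   * vpermilpd/vaddsubpd pairs butterfly within 128-bit lanes, and the
   * vperm2f128 add/negate ladder butterflies across lanes), followed by a
   * radix-8 pass over strides 4-16; the remaining two loops are radix-8
   * passes over strides 32-128 and 256-1024.
   */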
  if (depth == 14) {
    helper_double_29_recursive(buf + 0, 11);
    helper_double_29_recursive(buf + 2048, 11);
    helper_double_29_recursive(buf + 4096, 11);
    helper_double_29_recursive(buf + 6144, 11);
    helper_double_29_recursive(buf + 8192, 11);
    helper_double_29_recursive(buf + 10240, 11);
    helper_double_29_recursive(buf + 12288, 11);
    helper_double_29_recursive(buf + 14336, 11);
    for (int j = 0; j < 16384; j += 16384) {
      for (int k = 0; k < 2048; k += 4) {
        __asm__ volatile (
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vmovupd (%2), %%ymm2\n"
          "vmovupd (%3), %%ymm3\n"
          "vmovupd (%4), %%ymm4\n"
          "vmovupd (%5), %%ymm5\n"
          "vmovupd (%6), %%ymm6\n"
          "vmovupd (%7), %%ymm7\n"
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
          "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
          "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
          "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
          "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
          "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
          "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
          "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
          "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
          "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
          "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
          "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
          "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
          "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
          "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
          "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
          "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
          "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
          "vmovupd %%ymm8, (%0)\n"
          "vmovupd %%ymm9, (%1)\n"
          "vmovupd %%ymm10, (%2)\n"
          "vmovupd %%ymm11, (%3)\n"
          "vmovupd %%ymm12, (%4)\n"
          "vmovupd %%ymm13, (%5)\n"
          "vmovupd %%ymm14, (%6)\n"
          "vmovupd %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 17) {
    helper_double_29_recursive(buf + 0, 14);
    helper_double_29_recursive(buf + 16384, 14);
    helper_double_29_recursive(buf + 32768, 14);
    helper_double_29_recursive(buf + 49152, 14);
    helper_double_29_recursive(buf + 65536, 14);
    helper_double_29_recursive(buf + 81920, 14);
    helper_double_29_recursive(buf + 98304, 14);
    helper_double_29_recursive(buf + 114688, 14);
    for (int j = 0; j < 131072; j += 131072) {
      for (int k = 0; k < 16384; k += 4) {
        __asm__ volatile (
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vmovupd (%2), %%ymm2\n"
          "vmovupd (%3), %%ymm3\n"
          "vmovupd (%4), %%ymm4\n"
          "vmovupd (%5), %%ymm5\n"
          "vmovupd (%6), %%ymm6\n"
          "vmovupd (%7), %%ymm7\n"
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
          "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
          "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
          "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
          "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
          "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
          "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
          "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
          "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
          "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
          "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
          "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
          "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
          "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
          "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
          "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
          "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
          "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
          "vmovupd %%ymm8, (%0)\n"
          "vmovupd %%ymm9, (%1)\n"
          "vmovupd %%ymm10, (%2)\n"
          "vmovupd %%ymm11, (%3)\n"
          "vmovupd %%ymm12, (%4)\n"
          "vmovupd %%ymm13, (%5)\n"
          "vmovupd %%ymm14, (%6)\n"
          "vmovupd %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152), "r"(buf + j + k + 65536), "r"(buf + j + k + 81920), "r"(buf + j + k + 98304), "r"(buf + j + k + 114688) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 20) {
    helper_double_29_recursive(buf + 0, 17);
    helper_double_29_recursive(buf + 131072, 17);
    helper_double_29_recursive(buf + 262144, 17);
    helper_double_29_recursive(buf + 393216, 17);
    helper_double_29_recursive(buf + 524288, 17);
    helper_double_29_recursive(buf + 655360, 17);
    helper_double_29_recursive(buf + 786432, 17);
    helper_double_29_recursive(buf + 917504, 17);
    for (int j = 0; j < 1048576; j += 1048576) {
      for (int k = 0; k < 131072; k += 4) {
        __asm__ volatile (
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vmovupd (%2), %%ymm2\n"
          "vmovupd (%3), %%ymm3\n"
          "vmovupd (%4), %%ymm4\n"
          "vmovupd (%5), %%ymm5\n"
          "vmovupd (%6), %%ymm6\n"
          "vmovupd (%7), %%ymm7\n"
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
          "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
          "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
          "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
          "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
          "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
          "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
          "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
          "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
          "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
          "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
          "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
          "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
          "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
          "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
          "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
          "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
          "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
          "vmovupd %%ymm8, (%0)\n"
          "vmovupd %%ymm9, (%1)\n"
          "vmovupd %%ymm10, (%2)\n"
          "vmovupd %%ymm11, (%3)\n"
          "vmovupd %%ymm12, (%4)\n"
          "vmovupd %%ymm13, (%5)\n"
          "vmovupd %%ymm14, (%6)\n"
          "vmovupd %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 131072), "r"(buf + j + k + 262144), "r"(buf + j + k + 393216), "r"(buf + j + k + 524288), "r"(buf + j + k + 655360), "r"(buf + j + k + 786432), "r"(buf + j + k + 917504) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 23) {
    helper_double_29_recursive(buf + 0, 20);
    helper_double_29_recursive(buf + 1048576, 20);
    helper_double_29_recursive(buf + 2097152, 20);
    helper_double_29_recursive(buf + 3145728, 20);
    helper_double_29_recursive(buf + 4194304, 20);
    helper_double_29_recursive(buf + 5242880, 20);
    helper_double_29_recursive(buf + 6291456, 20);
    helper_double_29_recursive(buf + 7340032, 20);
    for (int j = 0; j < 8388608; j += 8388608) {
      for (int k = 0; k < 1048576; k += 4) {
        __asm__ volatile (
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vmovupd (%2), %%ymm2\n"
          "vmovupd (%3), %%ymm3\n"
          "vmovupd (%4), %%ymm4\n"
          "vmovupd (%5), %%ymm5\n"
          "vmovupd (%6), %%ymm6\n"
          "vmovupd (%7), %%ymm7\n"
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
          "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
          "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
          "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
          "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
          "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
          "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
          "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
          "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
          "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
          "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
          "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
          "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
          "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
          "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
          "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
          "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
          "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
          "vmovupd %%ymm8, (%0)\n"
          "vmovupd %%ymm9, (%1)\n"
          "vmovupd %%ymm10, (%2)\n"
          "vmovupd %%ymm11, (%3)\n"
          "vmovupd %%ymm12, (%4)\n"
          "vmovupd %%ymm13, (%5)\n"
          "vmovupd %%ymm14, (%6)\n"
          "vmovupd %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 1048576), "r"(buf + j + k + 2097152), "r"(buf + j + k + 3145728), "r"(buf + j + k + 4194304), "r"(buf + j + k + 5242880), "r"(buf + j + k + 6291456), "r"(buf + j + k + 7340032) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 26) {
    helper_double_29_recursive(buf + 0, 23);
    helper_double_29_recursive(buf + 8388608, 23);
    helper_double_29_recursive(buf + 16777216, 23);
    helper_double_29_recursive(buf + 25165824, 23);
    helper_double_29_recursive(buf + 33554432, 23);
    helper_double_29_recursive(buf + 41943040, 23);
    helper_double_29_recursive(buf + 50331648, 23);
    helper_double_29_recursive(buf + 58720256, 23);
    for (int j = 0; j < 67108864; j += 67108864) {
      for (int k = 0; k < 8388608; k += 4) {
        __asm__ volatile (
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vmovupd (%2), %%ymm2\n"
          "vmovupd (%3), %%ymm3\n"
          "vmovupd (%4), %%ymm4\n"
          "vmovupd (%5), %%ymm5\n"
          "vmovupd (%6), %%ymm6\n"
          "vmovupd (%7), %%ymm7\n"
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
          "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
          "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
          "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
          "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
          "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
          "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
          "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
          "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
          "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
          "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
          "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
          "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
          "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
          "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
          "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
          "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
          "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
          "vmovupd %%ymm8, (%0)\n"
          "vmovupd %%ymm9, (%1)\n"
          "vmovupd %%ymm10, (%2)\n"
          "vmovupd %%ymm11, (%3)\n"
          "vmovupd %%ymm12, (%4)\n"
          "vmovupd %%ymm13, (%5)\n"
          "vmovupd %%ymm14, (%6)\n"
          "vmovupd %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 8388608), "r"(buf + j + k + 16777216), "r"(buf + j + k + 25165824), "r"(buf + j + k + 33554432), "r"(buf + j + k + 41943040), "r"(buf + j + k + 50331648), "r"(buf + j + k + 58720256) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 29) {
    helper_double_29_recursive(buf + 0, 26);
    helper_double_29_recursive(buf + 67108864, 26);
    helper_double_29_recursive(buf + 134217728, 26);
    helper_double_29_recursive(buf + 201326592, 26);
    helper_double_29_recursive(buf + 268435456, 26);
    helper_double_29_recursive(buf + 335544320, 26);
    helper_double_29_recursive(buf + 402653184, 26);
    helper_double_29_recursive(buf + 469762048, 26);
    for (int j = 0; j < 536870912; j += 536870912) {
      for (int k = 0; k < 67108864; k += 4) {
        __asm__ volatile (
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vmovupd (%2), %%ymm2\n"
          "vmovupd (%3), %%ymm3\n"
          "vmovupd (%4), %%ymm4\n"
          "vmovupd (%5), %%ymm5\n"
          "vmovupd (%6), %%ymm6\n"
          "vmovupd (%7), %%ymm7\n"
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
          "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
          "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
          "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
          "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
          "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
          "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
          "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
          "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
          "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
          "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
          "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
          "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
          "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
          "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
          "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
          "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
          "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
          "vmovupd %%ymm8, (%0)\n"
          "vmovupd %%ymm9, (%1)\n"
          "vmovupd %%ymm10, (%2)\n"
          "vmovupd %%ymm11, (%3)\n"
          "vmovupd %%ymm12, (%4)\n"
          "vmovupd %%ymm13, (%5)\n"
          "vmovupd %%ymm14, (%6)\n"
          "vmovupd %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 67108864), "r"(buf + j + k + 134217728), "r"(buf + j + k + 201326592), "r"(buf + j + k + 268435456), "r"(buf + j + k + 335544320), "r"(buf + j + k + 402653184), "r"(buf + j + k + 469762048) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
}
void helper_double_29(double *buf);
void helper_double_29(double *buf) {
  helper_double_29_recursive(buf, 29);
}
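/*
 * Minimal (hypothetical) usage sketch: helper_double_29 expects buf to hold
 * exactly 2^29 doubles and transforms it in place. The kernels use unaligned
 * vmovupd loads and stores, so the buffer needs no special alignment:
 *
 *   double *buf = malloc(((size_t)1 << 29) * sizeof(double));
 *   if (buf) {
 *     ... fill buf with 2^29 values ...
 *     helper_double_29(buf);  // in-place, unnormalized transform
 *   }
 */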
void helper_double_30_recursive(double *buf, int depth);
void helper_double_30_recursive(double *buf, int depth) {
  if (depth == 9) {
    for (int j = 0; j < 512; j += 32) {
      for (int k = 0; k < 4; k += 4) {
        __asm__ volatile (
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vmovupd (%2), %%ymm2\n"
          "vmovupd (%3), %%ymm3\n"
          "vmovupd (%4), %%ymm4\n"
          "vmovupd (%5), %%ymm5\n"
          "vmovupd (%6), %%ymm6\n"
          "vmovupd (%7), %%ymm7\n"
          "vpermilpd $0, %%ymm0, %%ymm8\n"
          "vpermilpd $15, %%ymm0, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n"
          "vpermilpd $0, %%ymm1, %%ymm8\n"
          "vpermilpd $15, %%ymm1, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n"
          "vpermilpd $0, %%ymm2, %%ymm8\n"
          "vpermilpd $15, %%ymm2, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n"
          "vpermilpd $0, %%ymm3, %%ymm8\n"
          "vpermilpd $15, %%ymm3, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n"
          "vpermilpd $0, %%ymm4, %%ymm8\n"
          "vpermilpd $15, %%ymm4, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n"
          "vpermilpd $0, %%ymm5, %%ymm8\n"
          "vpermilpd $15, %%ymm5, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n"
          "vpermilpd $0, %%ymm6, %%ymm8\n"
          "vpermilpd $15, %%ymm6, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n"
          "vpermilpd $0, %%ymm7, %%ymm8\n"
          "vpermilpd $15, %%ymm7, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n"
          "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm0, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm0\n"
          "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm1, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm1\n"
          "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm2, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm2\n"
          "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm3, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm3\n"
          "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm4, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm4\n"
          "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm5, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm5\n"
          "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm6, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm6\n"
          "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm7, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm7\n"
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
          "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
          "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
          "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
          "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
          "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
          "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
          "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
          "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
          "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
          "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
          "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
          "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
          "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
          "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
          "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
          "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
          "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
          "vmovupd %%ymm8, (%0)\n"
          "vmovupd %%ymm9, (%1)\n"
          "vmovupd %%ymm10, (%2)\n"
          "vmovupd %%ymm11, (%3)\n"
          "vmovupd %%ymm12, (%4)\n"
          "vmovupd %%ymm13, (%5)\n"
          "vmovupd %%ymm14, (%6)\n"
          "vmovupd %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    for (int j = 0; j < 512; j += 256) {
      for (int k = 0; k < 32; k += 4) {
        __asm__ volatile (
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vmovupd (%2), %%ymm2\n"
          "vmovupd (%3), %%ymm3\n"
          "vmovupd (%4), %%ymm4\n"
          "vmovupd (%5), %%ymm5\n"
          "vmovupd (%6), %%ymm6\n"
          "vmovupd (%7), %%ymm7\n"
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
          "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
          "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
          "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
          "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
          "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
          "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
          "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
          "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
          "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
          "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
          "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
          "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
          "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
          "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
          "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
          "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
          "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
          "vmovupd %%ymm8, (%0)\n"
          "vmovupd %%ymm9, (%1)\n"
          "vmovupd %%ymm10, (%2)\n"
          "vmovupd %%ymm11, (%3)\n"
          "vmovupd %%ymm12, (%4)\n"
          "vmovupd %%ymm13, (%5)\n"
          "vmovupd %%ymm14, (%6)\n"
          "vmovupd %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    for (int j = 0; j < 512; j += 512) {
      for (int k = 0; k < 256; k += 4) {
        __asm__ volatile (
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vmovupd %%ymm8, (%0)\n"
          "vmovupd %%ymm9, (%1)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 256) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
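  /*
   * The depth == 9 base case above mirrors the depth == 11 case of
   * helper_double_29_recursive, but since 9 = 5 + 3 + 1 the final pass is
   * a plain radix-2 butterfly (two pointers, one vaddpd/vsubpd pair) at
   * stride 256 rather than a radix-8 pass.
   */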
19135   if (depth == 12) {
19136     helper_double_30_recursive(buf + 0, 9);
19137     helper_double_30_recursive(buf + 512, 9);
19138     helper_double_30_recursive(buf + 1024, 9);
19139     helper_double_30_recursive(buf + 1536, 9);
19140     helper_double_30_recursive(buf + 2048, 9);
19141     helper_double_30_recursive(buf + 2560, 9);
19142     helper_double_30_recursive(buf + 3072, 9);
19143     helper_double_30_recursive(buf + 3584, 9);
19144     for (int j = 0; j < 4096; j += 4096) {
19145       for (int k = 0; k < 512; k += 4) {
19146         __asm__ volatile (
19147           "vmovupd (%0), %%ymm0\n"
19148           "vmovupd (%1), %%ymm1\n"
19149           "vmovupd (%2), %%ymm2\n"
19150           "vmovupd (%3), %%ymm3\n"
19151           "vmovupd (%4), %%ymm4\n"
19152           "vmovupd (%5), %%ymm5\n"
19153           "vmovupd (%6), %%ymm6\n"
19154           "vmovupd (%7), %%ymm7\n"
19155           "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
19156           "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
19157           "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
19158           "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
19159           "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
19160           "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
19161           "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
19162           "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
19163           "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
19164           "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
19165           "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
19166           "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
19167           "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
19168           "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
19169           "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
19170           "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
19171           "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
19172           "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
19173           "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
19174           "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
19175           "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
19176           "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
19177           "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
19178           "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
19179           "vmovupd %%ymm8, (%0)\n"
19180           "vmovupd %%ymm9, (%1)\n"
19181           "vmovupd %%ymm10, (%2)\n"
19182           "vmovupd %%ymm11, (%3)\n"
19183           "vmovupd %%ymm12, (%4)\n"
19184           "vmovupd %%ymm13, (%5)\n"
19185           "vmovupd %%ymm14, (%6)\n"
19186           "vmovupd %%ymm15, (%7)\n"
19187           :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
19188         );
19189       }
19190     }
19191     return;
19192   }
  if (depth == 15) {
    helper_double_30_recursive(buf + 0, 12);
    helper_double_30_recursive(buf + 4096, 12);
    helper_double_30_recursive(buf + 8192, 12);
    helper_double_30_recursive(buf + 12288, 12);
    helper_double_30_recursive(buf + 16384, 12);
    helper_double_30_recursive(buf + 20480, 12);
    helper_double_30_recursive(buf + 24576, 12);
    helper_double_30_recursive(buf + 28672, 12);
    for (int j = 0; j < 32768; j += 32768) {
      for (int k = 0; k < 4096; k += 4) {
        __asm__ volatile (
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vmovupd (%2), %%ymm2\n"
          "vmovupd (%3), %%ymm3\n"
          "vmovupd (%4), %%ymm4\n"
          "vmovupd (%5), %%ymm5\n"
          "vmovupd (%6), %%ymm6\n"
          "vmovupd (%7), %%ymm7\n"
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
          "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
          "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
          "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
          "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
          "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
          "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
          "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
          "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
          "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
          "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
          "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
          "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
          "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
          "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
          "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
          "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
          "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
          "vmovupd %%ymm8, (%0)\n"
          "vmovupd %%ymm9, (%1)\n"
          "vmovupd %%ymm10, (%2)\n"
          "vmovupd %%ymm11, (%3)\n"
          "vmovupd %%ymm12, (%4)\n"
          "vmovupd %%ymm13, (%5)\n"
          "vmovupd %%ymm14, (%6)\n"
          "vmovupd %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 18) {
    helper_double_30_recursive(buf + 0, 15);
    helper_double_30_recursive(buf + 32768, 15);
    helper_double_30_recursive(buf + 65536, 15);
    helper_double_30_recursive(buf + 98304, 15);
    helper_double_30_recursive(buf + 131072, 15);
    helper_double_30_recursive(buf + 163840, 15);
    helper_double_30_recursive(buf + 196608, 15);
    helper_double_30_recursive(buf + 229376, 15);
    for (int j = 0; j < 262144; j += 262144) {
      for (int k = 0; k < 32768; k += 4) {
        __asm__ volatile (
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vmovupd (%2), %%ymm2\n"
          "vmovupd (%3), %%ymm3\n"
          "vmovupd (%4), %%ymm4\n"
          "vmovupd (%5), %%ymm5\n"
          "vmovupd (%6), %%ymm6\n"
          "vmovupd (%7), %%ymm7\n"
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
          "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
          "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
          "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
          "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
          "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
          "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
          "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
          "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
          "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
          "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
          "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
          "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
          "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
          "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
          "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
          "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
          "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
          "vmovupd %%ymm8, (%0)\n"
          "vmovupd %%ymm9, (%1)\n"
          "vmovupd %%ymm10, (%2)\n"
          "vmovupd %%ymm11, (%3)\n"
          "vmovupd %%ymm12, (%4)\n"
          "vmovupd %%ymm13, (%5)\n"
          "vmovupd %%ymm14, (%6)\n"
          "vmovupd %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 21) {
    helper_double_30_recursive(buf + 0, 18);
    helper_double_30_recursive(buf + 262144, 18);
    helper_double_30_recursive(buf + 524288, 18);
    helper_double_30_recursive(buf + 786432, 18);
    helper_double_30_recursive(buf + 1048576, 18);
    helper_double_30_recursive(buf + 1310720, 18);
    helper_double_30_recursive(buf + 1572864, 18);
    helper_double_30_recursive(buf + 1835008, 18);
    for (int j = 0; j < 2097152; j += 2097152) {
      for (int k = 0; k < 262144; k += 4) {
        __asm__ volatile (
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vmovupd (%2), %%ymm2\n"
          "vmovupd (%3), %%ymm3\n"
          "vmovupd (%4), %%ymm4\n"
          "vmovupd (%5), %%ymm5\n"
          "vmovupd (%6), %%ymm6\n"
          "vmovupd (%7), %%ymm7\n"
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
          "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
          "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
          "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
          "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
          "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
          "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
          "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
          "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
          "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
          "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
          "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
          "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
          "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
          "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
          "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
          "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
          "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
          "vmovupd %%ymm8, (%0)\n"
          "vmovupd %%ymm9, (%1)\n"
          "vmovupd %%ymm10, (%2)\n"
          "vmovupd %%ymm11, (%3)\n"
          "vmovupd %%ymm12, (%4)\n"
          "vmovupd %%ymm13, (%5)\n"
          "vmovupd %%ymm14, (%6)\n"
          "vmovupd %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1310720), "r"(buf + j + k + 1572864), "r"(buf + j + k + 1835008) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 24) {
    helper_double_30_recursive(buf + 0, 21);
    helper_double_30_recursive(buf + 2097152, 21);
    helper_double_30_recursive(buf + 4194304, 21);
    helper_double_30_recursive(buf + 6291456, 21);
    helper_double_30_recursive(buf + 8388608, 21);
    helper_double_30_recursive(buf + 10485760, 21);
    helper_double_30_recursive(buf + 12582912, 21);
    helper_double_30_recursive(buf + 14680064, 21);
    for (int j = 0; j < 16777216; j += 16777216) {
      for (int k = 0; k < 2097152; k += 4) {
        __asm__ volatile (
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vmovupd (%2), %%ymm2\n"
          "vmovupd (%3), %%ymm3\n"
          "vmovupd (%4), %%ymm4\n"
          "vmovupd (%5), %%ymm5\n"
          "vmovupd (%6), %%ymm6\n"
          "vmovupd (%7), %%ymm7\n"
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
          "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
          "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
          "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
          "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
          "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
          "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
          "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
          "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
          "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
          "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
          "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
          "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
          "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
          "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
          "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
          "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
          "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
          "vmovupd %%ymm8, (%0)\n"
          "vmovupd %%ymm9, (%1)\n"
          "vmovupd %%ymm10, (%2)\n"
          "vmovupd %%ymm11, (%3)\n"
          "vmovupd %%ymm12, (%4)\n"
          "vmovupd %%ymm13, (%5)\n"
          "vmovupd %%ymm14, (%6)\n"
          "vmovupd %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 2097152), "r"(buf + j + k + 4194304), "r"(buf + j + k + 6291456), "r"(buf + j + k + 8388608), "r"(buf + j + k + 10485760), "r"(buf + j + k + 12582912), "r"(buf + j + k + 14680064) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 27) {
    helper_double_30_recursive(buf + 0, 24);
    helper_double_30_recursive(buf + 16777216, 24);
    helper_double_30_recursive(buf + 33554432, 24);
    helper_double_30_recursive(buf + 50331648, 24);
    helper_double_30_recursive(buf + 67108864, 24);
    helper_double_30_recursive(buf + 83886080, 24);
    helper_double_30_recursive(buf + 100663296, 24);
    helper_double_30_recursive(buf + 117440512, 24);
    for (int j = 0; j < 134217728; j += 134217728) {
      for (int k = 0; k < 16777216; k += 4) {
        __asm__ volatile (
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vmovupd (%2), %%ymm2\n"
          "vmovupd (%3), %%ymm3\n"
          "vmovupd (%4), %%ymm4\n"
          "vmovupd (%5), %%ymm5\n"
          "vmovupd (%6), %%ymm6\n"
          "vmovupd (%7), %%ymm7\n"
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
          "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
          "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
          "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
          "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
          "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
          "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
          "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
          "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
          "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
          "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
          "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
          "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
          "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
          "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
          "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
          "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
          "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
          "vmovupd %%ymm8, (%0)\n"
          "vmovupd %%ymm9, (%1)\n"
          "vmovupd %%ymm10, (%2)\n"
          "vmovupd %%ymm11, (%3)\n"
          "vmovupd %%ymm12, (%4)\n"
          "vmovupd %%ymm13, (%5)\n"
          "vmovupd %%ymm14, (%6)\n"
          "vmovupd %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 16777216), "r"(buf + j + k + 33554432), "r"(buf + j + k + 50331648), "r"(buf + j + k + 67108864), "r"(buf + j + k + 83886080), "r"(buf + j + k + 100663296), "r"(buf + j + k + 117440512) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 30) {
    helper_double_30_recursive(buf + 0, 27);
    helper_double_30_recursive(buf + 134217728, 27);
    helper_double_30_recursive(buf + 268435456, 27);
    helper_double_30_recursive(buf + 402653184, 27);
    helper_double_30_recursive(buf + 536870912, 27);
    helper_double_30_recursive(buf + 671088640, 27);
    helper_double_30_recursive(buf + 805306368, 27);
    helper_double_30_recursive(buf + 939524096, 27);
    for (int j = 0; j < 1073741824; j += 1073741824) {
      for (int k = 0; k < 134217728; k += 4) {
        __asm__ volatile (
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vmovupd (%2), %%ymm2\n"
          "vmovupd (%3), %%ymm3\n"
          "vmovupd (%4), %%ymm4\n"
          "vmovupd (%5), %%ymm5\n"
          "vmovupd (%6), %%ymm6\n"
          "vmovupd (%7), %%ymm7\n"
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
          "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
          "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
          "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
          "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
          "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
          "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
          "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
          "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
          "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
          "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
          "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
          "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
          "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
          "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
          "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
          "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
          "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
          "vmovupd %%ymm8, (%0)\n"
          "vmovupd %%ymm9, (%1)\n"
          "vmovupd %%ymm10, (%2)\n"
          "vmovupd %%ymm11, (%3)\n"
          "vmovupd %%ymm12, (%4)\n"
          "vmovupd %%ymm13, (%5)\n"
          "vmovupd %%ymm14, (%6)\n"
          "vmovupd %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 134217728), "r"(buf + j + k + 268435456), "r"(buf + j + k + 402653184), "r"(buf + j + k + 536870912), "r"(buf + j + k + 671088640), "r"(buf + j + k + 805306368), "r"(buf + j + k + 939524096) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
}
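/* Editor's sketch (hypothetical, not part of the generated FFHT kernels and
 * unused by them): a scalar reference for the radix-8 combine stage that
 * each branch of helper_double_30_recursive performs after its eight
 * recursive calls. The name helper_double_combine8_ref and the parameter
 * n8 (the sub-block length, 2^(depth-3)) are invented for illustration.
 * The three butterfly stages mirror the vaddpd/vsubpd sequence in the asm
 * above, which applies the same H2 x H2 x H2 combine to four doubles per
 * register. */
__attribute__((unused)) static void helper_double_combine8_ref(double *buf,
                                                               long n8) {
  for (long k = 0; k < n8; ++k) {
    double a0 = buf[k + 0 * n8], a1 = buf[k + 1 * n8];
    double a2 = buf[k + 2 * n8], a3 = buf[k + 3 * n8];
    double a4 = buf[k + 4 * n8], a5 = buf[k + 5 * n8];
    double a6 = buf[k + 6 * n8], a7 = buf[k + 7 * n8];
    /* Stage 1: butterflies between adjacent sub-blocks (ymm8..ymm15). */
    double b0 = a0 + a1, b1 = a0 - a1, b2 = a2 + a3, b3 = a2 - a3;
    double b4 = a4 + a5, b5 = a4 - a5, b6 = a6 + a7, b7 = a6 - a7;
    /* Stage 2: butterflies at stride 2 (ymm0..ymm7). */
    double c0 = b0 + b2, c1 = b1 + b3, c2 = b0 - b2, c3 = b1 - b3;
    double c4 = b4 + b6, c5 = b5 + b7, c6 = b4 - b6, c7 = b5 - b7;
    /* Stage 3: butterflies at stride 4, stored back in place. */
    buf[k + 0 * n8] = c0 + c4;
    buf[k + 1 * n8] = c1 + c5;
    buf[k + 2 * n8] = c2 + c6;
    buf[k + 3 * n8] = c3 + c7;
    buf[k + 4 * n8] = c0 - c4;
    buf[k + 5 * n8] = c1 - c5;
    buf[k + 6 * n8] = c2 - c6;
    buf[k + 7 * n8] = c3 - c7;
  }
}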
void helper_double_30(double *buf);
void helper_double_30(double *buf) {
  helper_double_30_recursive(buf, 30);
}
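/* fht_double applies an in-place, unnormalized Fast Hadamard Transform to
 * buf[0 .. 2^log_n - 1] by dispatching to the size-specific helper; buf
 * must hold at least 2^log_n doubles. It returns 0 on success and 1 when
 * log_n is outside the supported range (negative or greater than 30). */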
int fht_double(double *buf, int log_n) {
  if (log_n == 0) {
    return 0;
  }
  if (log_n == 1) {
    helper_double_1(buf);
    return 0;
  }
  if (log_n == 2) {
    helper_double_2(buf);
    return 0;
  }
  if (log_n == 3) {
    helper_double_3(buf);
    return 0;
  }
  if (log_n == 4) {
    helper_double_4(buf);
    return 0;
  }
  if (log_n == 5) {
    helper_double_5(buf);
    return 0;
  }
  if (log_n == 6) {
    helper_double_6(buf);
    return 0;
  }
  if (log_n == 7) {
    helper_double_7(buf);
    return 0;
  }
  if (log_n == 8) {
    helper_double_8(buf);
    return 0;
  }
  if (log_n == 9) {
    helper_double_9(buf);
    return 0;
  }
  if (log_n == 10) {
    helper_double_10(buf);
    return 0;
  }
  if (log_n == 11) {
    helper_double_11(buf);
    return 0;
  }
  if (log_n == 12) {
    helper_double_12(buf);
    return 0;
  }
  if (log_n == 13) {
    helper_double_13(buf);
    return 0;
  }
  if (log_n == 14) {
    helper_double_14(buf);
    return 0;
  }
  if (log_n == 15) {
    helper_double_15(buf);
    return 0;
  }
  if (log_n == 16) {
    helper_double_16(buf);
    return 0;
  }
  if (log_n == 17) {
    helper_double_17(buf);
    return 0;
  }
  if (log_n == 18) {
    helper_double_18(buf);
    return 0;
  }
  if (log_n == 19) {
    helper_double_19(buf);
    return 0;
  }
  if (log_n == 20) {
    helper_double_20(buf);
    return 0;
  }
  if (log_n == 21) {
    helper_double_21(buf);
    return 0;
  }
  if (log_n == 22) {
    helper_double_22(buf);
    return 0;
  }
  if (log_n == 23) {
    helper_double_23(buf);
    return 0;
  }
  if (log_n == 24) {
    helper_double_24(buf);
    return 0;
  }
  if (log_n == 25) {
    helper_double_25(buf);
    return 0;
  }
  if (log_n == 26) {
    helper_double_26(buf);
    return 0;
  }
  if (log_n == 27) {
    helper_double_27(buf);
    return 0;
  }
  if (log_n == 28) {
    helper_double_28(buf);
    return 0;
  }
  if (log_n == 29) {
    helper_double_29(buf);
    return 0;
  }
  if (log_n == 30) {
    helper_double_30(buf);
    return 0;
  }
  return 1;
}
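/* Example usage (editor's sketch; the fht_double prototype comes from
 * fht.h). Because the transform is unnormalized, applying it twice scales
 * every entry by 2^log_n:
 *
 *   double x[8] = {1.0, -1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
 *   int rc = fht_double(x, 3);   // rc == 0; x now holds the transform
 */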