1 #include "fht.h"
static inline void helper_float_1(float *buf);
/*
 * In-place 2-point Hadamard transform of buf[0..1]:
 * (a, b) -> (a + b, a - b).
 */
static inline void helper_float_1(float *buf) {
  float a = buf[0];
  float b = buf[1];
  buf[0] = a + b;
  buf[1] = a - b;
}
static inline void helper_float_2(float *buf);
/*
 * In-place 4-point Hadamard transform of buf[0..3], performed as two
 * butterfly stages: stride 1 (pairs (0,1) and (2,3)), then stride 2
 * (pairs (0,2) and (1,3)).  Each butterfly maps (u, v) -> (u + v, u - v).
 */
static inline void helper_float_2(float *buf) {
  /* Stage 1: stride-1 butterflies. */
  for (int base = 0; base < 4; base += 2) {
    float a = buf[base];
    float b = buf[base + 1];
    buf[base] = a + b;
    buf[base + 1] = a - b;
  }
  /* Stage 2: stride-2 butterflies. */
  for (int off = 0; off < 2; ++off) {
    float a = buf[off];
    float b = buf[off + 2];
    buf[off] = a + b;
    buf[off + 2] = a - b;
  }
}
static inline void helper_float_3(float *buf);
/*
 * In-place 8-point (2^3) Hadamard transform of buf[0..7] using AVX.
 *
 * The whole 8-float block lives in one ymm register and goes through
 * three butterfly stages, mirroring the scalar (u, v) -> (u + v, u - v)
 * pattern of helper_float_1/helper_float_2:
 *   stage 1: stride-1 butterflies (adjacent elements),
 *   stage 2: stride-2 butterflies within each 128-bit lane,
 *   stage 3: stride-4 butterflies across the two 128-bit lanes.
 * Unaligned loads/stores (vmovups), so buf needs no special alignment,
 * but must hold at least 8 floats.
 */
static inline void helper_float_3(float *buf) {
  for (int j = 0; j < 8; j += 8) {
    __asm__ volatile (
      "vmovups (%0), %%ymm0\n"
      /* stage 1: stride-1 butterflies (vaddsubps of negated odd elements) */
      "vpermilps $160, %%ymm0, %%ymm8\n"
      "vpermilps $245, %%ymm0, %%ymm9\n"
      "vxorps %%ymm10, %%ymm10, %%ymm10\n"
      "vsubps %%ymm9, %%ymm10, %%ymm11\n"
      "vaddsubps %%ymm11, %%ymm8, %%ymm0\n"
      /* stage 2: stride-2 butterflies within each 128-bit lane */
      "vpermilps $68, %%ymm0, %%ymm8\n"
      "vpermilps $238, %%ymm0, %%ymm9\n"
      "vxorps %%ymm10, %%ymm10, %%ymm10\n"
      "vsubps %%ymm9, %%ymm10, %%ymm11\n"
      "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
      "vaddps %%ymm8, %%ymm12, %%ymm0\n"
      /* stage 3: stride-4 butterflies across the lanes (vperm2f128) */
      "vxorps %%ymm8, %%ymm8, %%ymm8\n"
      "vsubps %%ymm0, %%ymm8, %%ymm9\n"
      "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n"
      "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n"
      "vaddps %%ymm10, %%ymm11, %%ymm0\n"
      "vmovups %%ymm0, (%0)\n"
      :: "r"(buf + j) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
    );
  }
}
static inline void helper_float_4(float *buf);
/*
 * In-place 16-point (2^4) Hadamard transform of buf[0..15] using AVX.
 *
 * Two ymm registers each hold 8 consecutive floats.  First the 8-point
 * transform of helper_float_3 is applied inside each register (stride-1,
 * stride-2, stride-4 butterfly stages), then one cross-register butterfly
 * (stride 8) combines the halves: (lo, hi) -> (lo + hi, lo - hi).
 */
static inline void helper_float_4(float *buf) {
  for (int j = 0; j < 16; j += 16) {
    for (int k = 0; k < 8; k += 8) {
      __asm__ volatile (
        "vmovups (%0), %%ymm0\n"
        "vmovups (%1), %%ymm1\n"
        /* stage 1 (stride 1) in each register */
        "vpermilps $160, %%ymm0, %%ymm8\n"
        "vpermilps $245, %%ymm0, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubps %%ymm11, %%ymm8, %%ymm0\n"
        "vpermilps $160, %%ymm1, %%ymm8\n"
        "vpermilps $245, %%ymm1, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubps %%ymm11, %%ymm8, %%ymm1\n"
        /* stage 2 (stride 2, within 128-bit lanes) */
        "vpermilps $68, %%ymm0, %%ymm8\n"
        "vpermilps $238, %%ymm0, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
        "vaddps %%ymm8, %%ymm12, %%ymm0\n"
        "vpermilps $68, %%ymm1, %%ymm8\n"
        "vpermilps $238, %%ymm1, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
        "vaddps %%ymm8, %%ymm12, %%ymm1\n"
        /* stage 3 (stride 4, across 128-bit lanes) */
        "vxorps %%ymm8, %%ymm8, %%ymm8\n"
        "vsubps %%ymm0, %%ymm8, %%ymm9\n"
        "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n"
        "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n"
        "vaddps %%ymm10, %%ymm11, %%ymm0\n"
        "vxorps %%ymm8, %%ymm8, %%ymm8\n"
        "vsubps %%ymm1, %%ymm8, %%ymm9\n"
        "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n"
        "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n"
        "vaddps %%ymm10, %%ymm11, %%ymm1\n"
        /* stage 4 (stride 8, across registers) */
        "vaddps %%ymm1, %%ymm0, %%ymm8\n"
        "vsubps %%ymm1, %%ymm0, %%ymm9\n"
        "vmovups %%ymm8, (%0)\n"
        "vmovups %%ymm9, (%1)\n"
        :: "r"(buf + j + k + 0), "r"(buf + j + k + 8) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
      );
    }
  }
}
static inline void helper_float_5(float *buf);
/*
 * In-place 32-point (2^5) Hadamard transform of buf[0..31] using AVX.
 *
 * Four ymm registers each hold 8 consecutive floats.  The in-register
 * 8-point transform (stride-1/2/4 stages, as in helper_float_3) runs on
 * all four registers, then two cross-register butterfly layers combine
 * them: stride 8 (pairs (0,1), (2,3)) and stride 16 (pairs (0,2), (1,3)).
 */
static inline void helper_float_5(float *buf) {
  for (int j = 0; j < 32; j += 32) {
    for (int k = 0; k < 8; k += 8) {
      __asm__ volatile (
        "vmovups (%0), %%ymm0\n"
        "vmovups (%1), %%ymm1\n"
        "vmovups (%2), %%ymm2\n"
        "vmovups (%3), %%ymm3\n"
        /* stage 1 (stride 1) in each register */
        "vpermilps $160, %%ymm0, %%ymm8\n"
        "vpermilps $245, %%ymm0, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubps %%ymm11, %%ymm8, %%ymm0\n"
        "vpermilps $160, %%ymm1, %%ymm8\n"
        "vpermilps $245, %%ymm1, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubps %%ymm11, %%ymm8, %%ymm1\n"
        "vpermilps $160, %%ymm2, %%ymm8\n"
        "vpermilps $245, %%ymm2, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubps %%ymm11, %%ymm8, %%ymm2\n"
        "vpermilps $160, %%ymm3, %%ymm8\n"
        "vpermilps $245, %%ymm3, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubps %%ymm11, %%ymm8, %%ymm3\n"
        /* stage 2 (stride 2, within 128-bit lanes) */
        "vpermilps $68, %%ymm0, %%ymm8\n"
        "vpermilps $238, %%ymm0, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
        "vaddps %%ymm8, %%ymm12, %%ymm0\n"
        "vpermilps $68, %%ymm1, %%ymm8\n"
        "vpermilps $238, %%ymm1, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
        "vaddps %%ymm8, %%ymm12, %%ymm1\n"
        "vpermilps $68, %%ymm2, %%ymm8\n"
        "vpermilps $238, %%ymm2, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
        "vaddps %%ymm8, %%ymm12, %%ymm2\n"
        "vpermilps $68, %%ymm3, %%ymm8\n"
        "vpermilps $238, %%ymm3, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
        "vaddps %%ymm8, %%ymm12, %%ymm3\n"
        /* stage 3 (stride 4, across 128-bit lanes) */
        "vxorps %%ymm8, %%ymm8, %%ymm8\n"
        "vsubps %%ymm0, %%ymm8, %%ymm9\n"
        "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n"
        "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n"
        "vaddps %%ymm10, %%ymm11, %%ymm0\n"
        "vxorps %%ymm8, %%ymm8, %%ymm8\n"
        "vsubps %%ymm1, %%ymm8, %%ymm9\n"
        "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n"
        "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n"
        "vaddps %%ymm10, %%ymm11, %%ymm1\n"
        "vxorps %%ymm8, %%ymm8, %%ymm8\n"
        "vsubps %%ymm2, %%ymm8, %%ymm9\n"
        "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n"
        "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n"
        "vaddps %%ymm10, %%ymm11, %%ymm2\n"
        "vxorps %%ymm8, %%ymm8, %%ymm8\n"
        "vsubps %%ymm3, %%ymm8, %%ymm9\n"
        "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n"
        "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n"
        "vaddps %%ymm10, %%ymm11, %%ymm3\n"
        /* stages 4-5 (stride 8 then 16, across registers) */
        "vaddps %%ymm1, %%ymm0, %%ymm8\n"
        "vsubps %%ymm1, %%ymm0, %%ymm9\n"
        "vaddps %%ymm3, %%ymm2, %%ymm10\n"
        "vsubps %%ymm3, %%ymm2, %%ymm11\n"
        "vaddps %%ymm10, %%ymm8, %%ymm0\n"
        "vsubps %%ymm10, %%ymm8, %%ymm2\n"
        "vaddps %%ymm11, %%ymm9, %%ymm1\n"
        "vsubps %%ymm11, %%ymm9, %%ymm3\n"
        "vmovups %%ymm0, (%0)\n"
        "vmovups %%ymm1, (%1)\n"
        "vmovups %%ymm2, (%2)\n"
        "vmovups %%ymm3, (%3)\n"
        :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
      );
    }
  }
}
static inline void helper_float_6(float *buf);
/*
 * In-place 64-point (2^6) Hadamard transform of buf[0..63] using AVX.
 *
 * Eight ymm registers each hold 8 consecutive floats.  The in-register
 * 8-point transform (stride-1/2/4 stages, as in helper_float_3) runs on
 * all eight registers, then three cross-register butterfly layers combine
 * them (strides 8, 16 and 32), giving the full 64-point transform in one
 * asm statement.  All ymm registers are clobbered.
 */
static inline void helper_float_6(float *buf) {
  for (int j = 0; j < 64; j += 64) {
    for (int k = 0; k < 8; k += 8) {
      __asm__ volatile (
        "vmovups (%0), %%ymm0\n"
        "vmovups (%1), %%ymm1\n"
        "vmovups (%2), %%ymm2\n"
        "vmovups (%3), %%ymm3\n"
        "vmovups (%4), %%ymm4\n"
        "vmovups (%5), %%ymm5\n"
        "vmovups (%6), %%ymm6\n"
        "vmovups (%7), %%ymm7\n"
        /* stage 1 (stride 1) in each register */
        "vpermilps $160, %%ymm0, %%ymm8\n"
        "vpermilps $245, %%ymm0, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubps %%ymm11, %%ymm8, %%ymm0\n"
        "vpermilps $160, %%ymm1, %%ymm8\n"
        "vpermilps $245, %%ymm1, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubps %%ymm11, %%ymm8, %%ymm1\n"
        "vpermilps $160, %%ymm2, %%ymm8\n"
        "vpermilps $245, %%ymm2, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubps %%ymm11, %%ymm8, %%ymm2\n"
        "vpermilps $160, %%ymm3, %%ymm8\n"
        "vpermilps $245, %%ymm3, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubps %%ymm11, %%ymm8, %%ymm3\n"
        "vpermilps $160, %%ymm4, %%ymm8\n"
        "vpermilps $245, %%ymm4, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubps %%ymm11, %%ymm8, %%ymm4\n"
        "vpermilps $160, %%ymm5, %%ymm8\n"
        "vpermilps $245, %%ymm5, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubps %%ymm11, %%ymm8, %%ymm5\n"
        "vpermilps $160, %%ymm6, %%ymm8\n"
        "vpermilps $245, %%ymm6, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubps %%ymm11, %%ymm8, %%ymm6\n"
        "vpermilps $160, %%ymm7, %%ymm8\n"
        "vpermilps $245, %%ymm7, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubps %%ymm11, %%ymm8, %%ymm7\n"
        /* stage 2 (stride 2, within 128-bit lanes) */
        "vpermilps $68, %%ymm0, %%ymm8\n"
        "vpermilps $238, %%ymm0, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
        "vaddps %%ymm8, %%ymm12, %%ymm0\n"
        "vpermilps $68, %%ymm1, %%ymm8\n"
        "vpermilps $238, %%ymm1, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
        "vaddps %%ymm8, %%ymm12, %%ymm1\n"
        "vpermilps $68, %%ymm2, %%ymm8\n"
        "vpermilps $238, %%ymm2, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
        "vaddps %%ymm8, %%ymm12, %%ymm2\n"
        "vpermilps $68, %%ymm3, %%ymm8\n"
        "vpermilps $238, %%ymm3, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
        "vaddps %%ymm8, %%ymm12, %%ymm3\n"
        "vpermilps $68, %%ymm4, %%ymm8\n"
        "vpermilps $238, %%ymm4, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
        "vaddps %%ymm8, %%ymm12, %%ymm4\n"
        "vpermilps $68, %%ymm5, %%ymm8\n"
        "vpermilps $238, %%ymm5, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
        "vaddps %%ymm8, %%ymm12, %%ymm5\n"
        "vpermilps $68, %%ymm6, %%ymm8\n"
        "vpermilps $238, %%ymm6, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
        "vaddps %%ymm8, %%ymm12, %%ymm6\n"
        "vpermilps $68, %%ymm7, %%ymm8\n"
        "vpermilps $238, %%ymm7, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
        "vaddps %%ymm8, %%ymm12, %%ymm7\n"
        /* stage 3 (stride 4, across 128-bit lanes) */
        "vxorps %%ymm8, %%ymm8, %%ymm8\n"
        "vsubps %%ymm0, %%ymm8, %%ymm9\n"
        "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n"
        "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n"
        "vaddps %%ymm10, %%ymm11, %%ymm0\n"
        "vxorps %%ymm8, %%ymm8, %%ymm8\n"
        "vsubps %%ymm1, %%ymm8, %%ymm9\n"
        "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n"
        "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n"
        "vaddps %%ymm10, %%ymm11, %%ymm1\n"
        "vxorps %%ymm8, %%ymm8, %%ymm8\n"
        "vsubps %%ymm2, %%ymm8, %%ymm9\n"
        "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n"
        "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n"
        "vaddps %%ymm10, %%ymm11, %%ymm2\n"
        "vxorps %%ymm8, %%ymm8, %%ymm8\n"
        "vsubps %%ymm3, %%ymm8, %%ymm9\n"
        "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n"
        "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n"
        "vaddps %%ymm10, %%ymm11, %%ymm3\n"
        "vxorps %%ymm8, %%ymm8, %%ymm8\n"
        "vsubps %%ymm4, %%ymm8, %%ymm9\n"
        "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n"
        "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n"
        "vaddps %%ymm10, %%ymm11, %%ymm4\n"
        "vxorps %%ymm8, %%ymm8, %%ymm8\n"
        "vsubps %%ymm5, %%ymm8, %%ymm9\n"
        "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n"
        "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n"
        "vaddps %%ymm10, %%ymm11, %%ymm5\n"
        "vxorps %%ymm8, %%ymm8, %%ymm8\n"
        "vsubps %%ymm6, %%ymm8, %%ymm9\n"
        "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n"
        "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n"
        "vaddps %%ymm10, %%ymm11, %%ymm6\n"
        "vxorps %%ymm8, %%ymm8, %%ymm8\n"
        "vsubps %%ymm7, %%ymm8, %%ymm9\n"
        "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n"
        "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n"
        "vaddps %%ymm10, %%ymm11, %%ymm7\n"
        /* stages 4-6 (strides 8, 16, 32, across registers) */
        "vaddps %%ymm1, %%ymm0, %%ymm8\n"
        "vsubps %%ymm1, %%ymm0, %%ymm9\n"
        "vaddps %%ymm3, %%ymm2, %%ymm10\n"
        "vsubps %%ymm3, %%ymm2, %%ymm11\n"
        "vaddps %%ymm5, %%ymm4, %%ymm12\n"
        "vsubps %%ymm5, %%ymm4, %%ymm13\n"
        "vaddps %%ymm7, %%ymm6, %%ymm14\n"
        "vsubps %%ymm7, %%ymm6, %%ymm15\n"
        "vaddps %%ymm10, %%ymm8, %%ymm0\n"
        "vsubps %%ymm10, %%ymm8, %%ymm2\n"
        "vaddps %%ymm11, %%ymm9, %%ymm1\n"
        "vsubps %%ymm11, %%ymm9, %%ymm3\n"
        "vaddps %%ymm14, %%ymm12, %%ymm4\n"
        "vsubps %%ymm14, %%ymm12, %%ymm6\n"
        "vaddps %%ymm15, %%ymm13, %%ymm5\n"
        "vsubps %%ymm15, %%ymm13, %%ymm7\n"
        "vaddps %%ymm4, %%ymm0, %%ymm8\n"
        "vsubps %%ymm4, %%ymm0, %%ymm12\n"
        "vaddps %%ymm5, %%ymm1, %%ymm9\n"
        "vsubps %%ymm5, %%ymm1, %%ymm13\n"
        "vaddps %%ymm6, %%ymm2, %%ymm10\n"
        "vsubps %%ymm6, %%ymm2, %%ymm14\n"
        "vaddps %%ymm7, %%ymm3, %%ymm11\n"
        "vsubps %%ymm7, %%ymm3, %%ymm15\n"
        "vmovups %%ymm8, (%0)\n"
        "vmovups %%ymm9, (%1)\n"
        "vmovups %%ymm10, (%2)\n"
        "vmovups %%ymm11, (%3)\n"
        "vmovups %%ymm12, (%4)\n"
        "vmovups %%ymm13, (%5)\n"
        "vmovups %%ymm14, (%6)\n"
        "vmovups %%ymm15, (%7)\n"
        :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
      );
    }
  }
}
void helper_float_7_recursive(float *buf, int depth);
/*
 * In-place 128-point (2^7) Hadamard transform of buf[0..127] using AVX.
 *
 * Despite the name, this is not really recursive: only depth == 7 is
 * handled (the sole caller, helper_float_7, passes 7); any other depth
 * silently does nothing.  The depth == 7 path:
 *   1. Applies the 64-point transform of helper_float_6 to each of the
 *      two 64-float halves (same asm body, outer loop stride 64).
 *   2. Runs a final stride-64 butterfly layer combining the halves,
 *      8 floats at a time: (lo, hi) -> (lo + hi, lo - hi).
 */
void helper_float_7_recursive(float *buf, int depth) {
  if (depth == 7) {
    /* Step 1: 64-point transform on each half. */
    for (int j = 0; j < 128; j += 64) {
      for (int k = 0; k < 8; k += 8) {
        __asm__ volatile (
          "vmovups (%0), %%ymm0\n"
          "vmovups (%1), %%ymm1\n"
          "vmovups (%2), %%ymm2\n"
          "vmovups (%3), %%ymm3\n"
          "vmovups (%4), %%ymm4\n"
          "vmovups (%5), %%ymm5\n"
          "vmovups (%6), %%ymm6\n"
          "vmovups (%7), %%ymm7\n"
          /* stage 1 (stride 1) in each register */
          "vpermilps $160, %%ymm0, %%ymm8\n"
          "vpermilps $245, %%ymm0, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm0\n"
          "vpermilps $160, %%ymm1, %%ymm8\n"
          "vpermilps $245, %%ymm1, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm1\n"
          "vpermilps $160, %%ymm2, %%ymm8\n"
          "vpermilps $245, %%ymm2, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm2\n"
          "vpermilps $160, %%ymm3, %%ymm8\n"
          "vpermilps $245, %%ymm3, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm3\n"
          "vpermilps $160, %%ymm4, %%ymm8\n"
          "vpermilps $245, %%ymm4, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm4\n"
          "vpermilps $160, %%ymm5, %%ymm8\n"
          "vpermilps $245, %%ymm5, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm5\n"
          "vpermilps $160, %%ymm6, %%ymm8\n"
          "vpermilps $245, %%ymm6, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm6\n"
          "vpermilps $160, %%ymm7, %%ymm8\n"
          "vpermilps $245, %%ymm7, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm7\n"
          /* stage 2 (stride 2, within 128-bit lanes) */
          "vpermilps $68, %%ymm0, %%ymm8\n"
          "vpermilps $238, %%ymm0, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm0\n"
          "vpermilps $68, %%ymm1, %%ymm8\n"
          "vpermilps $238, %%ymm1, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm1\n"
          "vpermilps $68, %%ymm2, %%ymm8\n"
          "vpermilps $238, %%ymm2, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm2\n"
          "vpermilps $68, %%ymm3, %%ymm8\n"
          "vpermilps $238, %%ymm3, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm3\n"
          "vpermilps $68, %%ymm4, %%ymm8\n"
          "vpermilps $238, %%ymm4, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm4\n"
          "vpermilps $68, %%ymm5, %%ymm8\n"
          "vpermilps $238, %%ymm5, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm5\n"
          "vpermilps $68, %%ymm6, %%ymm8\n"
          "vpermilps $238, %%ymm6, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm6\n"
          "vpermilps $68, %%ymm7, %%ymm8\n"
          "vpermilps $238, %%ymm7, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm7\n"
          /* stage 3 (stride 4, across 128-bit lanes) */
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm0, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm0\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm1, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm1\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm2, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm2\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm3, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm3\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm4, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm4\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm5, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm5\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm6, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm6\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm7, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm7\n"
          /* stages 4-6 (strides 8, 16, 32, across registers) */
          "vaddps %%ymm1, %%ymm0, %%ymm8\n"
          "vsubps %%ymm1, %%ymm0, %%ymm9\n"
          "vaddps %%ymm3, %%ymm2, %%ymm10\n"
          "vsubps %%ymm3, %%ymm2, %%ymm11\n"
          "vaddps %%ymm5, %%ymm4, %%ymm12\n"
          "vsubps %%ymm5, %%ymm4, %%ymm13\n"
          "vaddps %%ymm7, %%ymm6, %%ymm14\n"
          "vsubps %%ymm7, %%ymm6, %%ymm15\n"
          "vaddps %%ymm10, %%ymm8, %%ymm0\n"
          "vsubps %%ymm10, %%ymm8, %%ymm2\n"
          "vaddps %%ymm11, %%ymm9, %%ymm1\n"
          "vsubps %%ymm11, %%ymm9, %%ymm3\n"
          "vaddps %%ymm14, %%ymm12, %%ymm4\n"
          "vsubps %%ymm14, %%ymm12, %%ymm6\n"
          "vaddps %%ymm15, %%ymm13, %%ymm5\n"
          "vsubps %%ymm15, %%ymm13, %%ymm7\n"
          "vaddps %%ymm4, %%ymm0, %%ymm8\n"
          "vsubps %%ymm4, %%ymm0, %%ymm12\n"
          "vaddps %%ymm5, %%ymm1, %%ymm9\n"
          "vsubps %%ymm5, %%ymm1, %%ymm13\n"
          "vaddps %%ymm6, %%ymm2, %%ymm10\n"
          "vsubps %%ymm6, %%ymm2, %%ymm14\n"
          "vaddps %%ymm7, %%ymm3, %%ymm11\n"
          "vsubps %%ymm7, %%ymm3, %%ymm15\n"
          "vmovups %%ymm8, (%0)\n"
          "vmovups %%ymm9, (%1)\n"
          "vmovups %%ymm10, (%2)\n"
          "vmovups %%ymm11, (%3)\n"
          "vmovups %%ymm12, (%4)\n"
          "vmovups %%ymm13, (%5)\n"
          "vmovups %%ymm14, (%6)\n"
          "vmovups %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    /* Step 2: final stride-64 butterfly combining the two halves. */
    for (int j = 0; j < 128; j += 128) {
      for (int k = 0; k < 64; k += 8) {
        __asm__ volatile (
          "vmovups (%0), %%ymm0\n"
          "vmovups (%1), %%ymm1\n"
          "vaddps %%ymm1, %%ymm0, %%ymm8\n"
          "vsubps %%ymm1, %%ymm0, %%ymm9\n"
          "vmovups %%ymm8, (%0)\n"
          "vmovups %%ymm9, (%1)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 64) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
}
void helper_float_7(float *buf);
/*
 * In-place 128-point (2^7) Hadamard transform of buf[0..127].
 * Thin entry point: invokes the worker at its only supported depth (7).
 */
void helper_float_7(float *buf) {
helper_float_7_recursive(buf, 7);
}
void helper_float_8_recursive(float *buf, int depth);
/*
 * Recursive worker for the 256-point (2^8) in-place Hadamard transform.
 *
 * Exactly two depths are handled; any other depth silently does nothing:
 *   depth == 6: base case — 64-point transform of buf[0..63] in a single
 *               AVX asm block (same structure as helper_float_6: per-register
 *               stride-1/2/4 butterfly stages, then cross-register strides
 *               8/16/32).
 *   depth == 8: transforms each 64-float quarter via the depth-6 base case,
 *               then runs a radix-4 combine over the quarters (stride-64 and
 *               stride-128 butterfly layers fused into one asm block).
 */
void helper_float_8_recursive(float *buf, int depth) {
  if (depth == 6) {
    for (int j = 0; j < 64; j += 64) {
      for (int k = 0; k < 8; k += 8) {
        __asm__ volatile (
          "vmovups (%0), %%ymm0\n"
          "vmovups (%1), %%ymm1\n"
          "vmovups (%2), %%ymm2\n"
          "vmovups (%3), %%ymm3\n"
          "vmovups (%4), %%ymm4\n"
          "vmovups (%5), %%ymm5\n"
          "vmovups (%6), %%ymm6\n"
          "vmovups (%7), %%ymm7\n"
          /* stage 1 (stride 1) in each register */
          "vpermilps $160, %%ymm0, %%ymm8\n"
          "vpermilps $245, %%ymm0, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm0\n"
          "vpermilps $160, %%ymm1, %%ymm8\n"
          "vpermilps $245, %%ymm1, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm1\n"
          "vpermilps $160, %%ymm2, %%ymm8\n"
          "vpermilps $245, %%ymm2, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm2\n"
          "vpermilps $160, %%ymm3, %%ymm8\n"
          "vpermilps $245, %%ymm3, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm3\n"
          "vpermilps $160, %%ymm4, %%ymm8\n"
          "vpermilps $245, %%ymm4, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm4\n"
          "vpermilps $160, %%ymm5, %%ymm8\n"
          "vpermilps $245, %%ymm5, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm5\n"
          "vpermilps $160, %%ymm6, %%ymm8\n"
          "vpermilps $245, %%ymm6, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm6\n"
          "vpermilps $160, %%ymm7, %%ymm8\n"
          "vpermilps $245, %%ymm7, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm7\n"
          /* stage 2 (stride 2, within 128-bit lanes) */
          "vpermilps $68, %%ymm0, %%ymm8\n"
          "vpermilps $238, %%ymm0, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm0\n"
          "vpermilps $68, %%ymm1, %%ymm8\n"
          "vpermilps $238, %%ymm1, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm1\n"
          "vpermilps $68, %%ymm2, %%ymm8\n"
          "vpermilps $238, %%ymm2, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm2\n"
          "vpermilps $68, %%ymm3, %%ymm8\n"
          "vpermilps $238, %%ymm3, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm3\n"
          "vpermilps $68, %%ymm4, %%ymm8\n"
          "vpermilps $238, %%ymm4, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm4\n"
          "vpermilps $68, %%ymm5, %%ymm8\n"
          "vpermilps $238, %%ymm5, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm5\n"
          "vpermilps $68, %%ymm6, %%ymm8\n"
          "vpermilps $238, %%ymm6, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm6\n"
          "vpermilps $68, %%ymm7, %%ymm8\n"
          "vpermilps $238, %%ymm7, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm7\n"
          /* stage 3 (stride 4, across 128-bit lanes) */
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm0, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm0\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm1, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm1\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm2, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm2\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm3, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm3\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm4, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm4\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm5, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm5\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm6, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm6\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm7, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm7\n"
          /* stages 4-6 (strides 8, 16, 32, across registers) */
          "vaddps %%ymm1, %%ymm0, %%ymm8\n"
          "vsubps %%ymm1, %%ymm0, %%ymm9\n"
          "vaddps %%ymm3, %%ymm2, %%ymm10\n"
          "vsubps %%ymm3, %%ymm2, %%ymm11\n"
          "vaddps %%ymm5, %%ymm4, %%ymm12\n"
          "vsubps %%ymm5, %%ymm4, %%ymm13\n"
          "vaddps %%ymm7, %%ymm6, %%ymm14\n"
          "vsubps %%ymm7, %%ymm6, %%ymm15\n"
          "vaddps %%ymm10, %%ymm8, %%ymm0\n"
          "vsubps %%ymm10, %%ymm8, %%ymm2\n"
          "vaddps %%ymm11, %%ymm9, %%ymm1\n"
          "vsubps %%ymm11, %%ymm9, %%ymm3\n"
          "vaddps %%ymm14, %%ymm12, %%ymm4\n"
          "vsubps %%ymm14, %%ymm12, %%ymm6\n"
          "vaddps %%ymm15, %%ymm13, %%ymm5\n"
          "vsubps %%ymm15, %%ymm13, %%ymm7\n"
          "vaddps %%ymm4, %%ymm0, %%ymm8\n"
          "vsubps %%ymm4, %%ymm0, %%ymm12\n"
          "vaddps %%ymm5, %%ymm1, %%ymm9\n"
          "vsubps %%ymm5, %%ymm1, %%ymm13\n"
          "vaddps %%ymm6, %%ymm2, %%ymm10\n"
          "vsubps %%ymm6, %%ymm2, %%ymm14\n"
          "vaddps %%ymm7, %%ymm3, %%ymm11\n"
          "vsubps %%ymm7, %%ymm3, %%ymm15\n"
          "vmovups %%ymm8, (%0)\n"
          "vmovups %%ymm9, (%1)\n"
          "vmovups %%ymm10, (%2)\n"
          "vmovups %%ymm11, (%3)\n"
          "vmovups %%ymm12, (%4)\n"
          "vmovups %%ymm13, (%5)\n"
          "vmovups %%ymm14, (%6)\n"
          "vmovups %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 8) {
    /* Transform each 64-float quarter independently. */
    helper_float_8_recursive(buf + 0, 6);
    helper_float_8_recursive(buf + 64, 6);
    helper_float_8_recursive(buf + 128, 6);
    helper_float_8_recursive(buf + 192, 6);
    /* Radix-4 combine: stride-64 and stride-128 butterfly layers. */
    for (int j = 0; j < 256; j += 256) {
      for (int k = 0; k < 64; k += 8) {
        __asm__ volatile (
          "vmovups (%0), %%ymm0\n"
          "vmovups (%1), %%ymm1\n"
          "vmovups (%2), %%ymm2\n"
          "vmovups (%3), %%ymm3\n"
          "vaddps %%ymm1, %%ymm0, %%ymm8\n"
          "vsubps %%ymm1, %%ymm0, %%ymm9\n"
          "vaddps %%ymm3, %%ymm2, %%ymm10\n"
          "vsubps %%ymm3, %%ymm2, %%ymm11\n"
          "vaddps %%ymm10, %%ymm8, %%ymm0\n"
          "vsubps %%ymm10, %%ymm8, %%ymm2\n"
          "vaddps %%ymm11, %%ymm9, %%ymm1\n"
          "vsubps %%ymm11, %%ymm9, %%ymm3\n"
          "vmovups %%ymm0, (%0)\n"
          "vmovups %%ymm1, (%1)\n"
          "vmovups %%ymm2, (%2)\n"
          "vmovups %%ymm3, (%3)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
}
void helper_float_8(float *buf);
/*
 * In-place 256-point (2^8) Hadamard transform of buf[0..255].
 * Thin entry point: invokes the recursive worker at full depth (8).
 */
void helper_float_8(float *buf) {
helper_float_8_recursive(buf, 8);
}
static inline void helper_float_9(float *buf);
/*
 * In-place unnormalized Fast Walsh-Hadamard Transform of 512 floats (2^9)
 * using AVX inline assembly. Loads/stores use vmovups, so no particular
 * alignment of buf is required.
 *
 * Stage plan (9 butterfly stages total):
 *   - First loop: for each 64-float group, 8 ymm registers are processed.
 *     Stages 1-3 run inside each 8-float register (vpermilps/vaddsubps for
 *     stride-1 pairs, vpermilps/vblendps for stride-2 pairs, vperm2f128 for
 *     the stride-4 cross-half step); stages 4-6 are add/sub butterflies
 *     across the 8 registers (strides 8, 16, 32 in memory).
 *   - Second loop: stages 7-9 as butterflies between the eight 64-float
 *     groups (strides 64, 128, 256).
 */
static inline void helper_float_9(float *buf) {
  /* Stages 1-6: fully transform each contiguous 64-float block. */
  for (int j = 0; j < 512; j += 64) {
    for (int k = 0; k < 8; k += 8) {
      __asm__ volatile (
        "vmovups (%0), %%ymm0\n"
        "vmovups (%1), %%ymm1\n"
        "vmovups (%2), %%ymm2\n"
        "vmovups (%3), %%ymm3\n"
        "vmovups (%4), %%ymm4\n"
        "vmovups (%5), %%ymm5\n"
        "vmovups (%6), %%ymm6\n"
        "vmovups (%7), %%ymm7\n"
        /* Stage 1 (stride-1 pairs within each register):
         * $160 = (0,0,2,2) per lane, $245 = (1,1,3,3); vaddsubps of the
         * duplicated evens with negated odds yields {a+b, a-b} pairs. */
        "vpermilps $160, %%ymm0, %%ymm8\n"
        "vpermilps $245, %%ymm0, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubps %%ymm11, %%ymm8, %%ymm0\n"
        "vpermilps $160, %%ymm1, %%ymm8\n"
        "vpermilps $245, %%ymm1, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubps %%ymm11, %%ymm8, %%ymm1\n"
        "vpermilps $160, %%ymm2, %%ymm8\n"
        "vpermilps $245, %%ymm2, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubps %%ymm11, %%ymm8, %%ymm2\n"
        "vpermilps $160, %%ymm3, %%ymm8\n"
        "vpermilps $245, %%ymm3, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubps %%ymm11, %%ymm8, %%ymm3\n"
        "vpermilps $160, %%ymm4, %%ymm8\n"
        "vpermilps $245, %%ymm4, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubps %%ymm11, %%ymm8, %%ymm4\n"
        "vpermilps $160, %%ymm5, %%ymm8\n"
        "vpermilps $245, %%ymm5, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubps %%ymm11, %%ymm8, %%ymm5\n"
        "vpermilps $160, %%ymm6, %%ymm8\n"
        "vpermilps $245, %%ymm6, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubps %%ymm11, %%ymm8, %%ymm6\n"
        "vpermilps $160, %%ymm7, %%ymm8\n"
        "vpermilps $245, %%ymm7, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubps %%ymm11, %%ymm8, %%ymm7\n"
        /* Stage 2 (stride-2 pairs): $68 = (0,1,0,1), $238 = (2,3,2,3);
         * vblendps $204 merges +hi/-hi halves before the add. */
        "vpermilps $68, %%ymm0, %%ymm8\n"
        "vpermilps $238, %%ymm0, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
        "vaddps %%ymm8, %%ymm12, %%ymm0\n"
        "vpermilps $68, %%ymm1, %%ymm8\n"
        "vpermilps $238, %%ymm1, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
        "vaddps %%ymm8, %%ymm12, %%ymm1\n"
        "vpermilps $68, %%ymm2, %%ymm8\n"
        "vpermilps $238, %%ymm2, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
        "vaddps %%ymm8, %%ymm12, %%ymm2\n"
        "vpermilps $68, %%ymm3, %%ymm8\n"
        "vpermilps $238, %%ymm3, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
        "vaddps %%ymm8, %%ymm12, %%ymm3\n"
        "vpermilps $68, %%ymm4, %%ymm8\n"
        "vpermilps $238, %%ymm4, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
        "vaddps %%ymm8, %%ymm12, %%ymm4\n"
        "vpermilps $68, %%ymm5, %%ymm8\n"
        "vpermilps $238, %%ymm5, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
        "vaddps %%ymm8, %%ymm12, %%ymm5\n"
        "vpermilps $68, %%ymm6, %%ymm8\n"
        "vpermilps $238, %%ymm6, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
        "vaddps %%ymm8, %%ymm12, %%ymm6\n"
        "vpermilps $68, %%ymm7, %%ymm8\n"
        "vpermilps $238, %%ymm7, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
        "vaddps %%ymm8, %%ymm12, %%ymm7\n"
        /* Stage 3 (stride-4, across the two 128-bit halves of each ymm):
         * vperm2f128 $0 broadcasts the low half, $49 pairs high half with
         * the negated register, then add. */
        "vxorps %%ymm8, %%ymm8, %%ymm8\n"
        "vsubps %%ymm0, %%ymm8, %%ymm9\n"
        "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n"
        "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n"
        "vaddps %%ymm10, %%ymm11, %%ymm0\n"
        "vxorps %%ymm8, %%ymm8, %%ymm8\n"
        "vsubps %%ymm1, %%ymm8, %%ymm9\n"
        "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n"
        "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n"
        "vaddps %%ymm10, %%ymm11, %%ymm1\n"
        "vxorps %%ymm8, %%ymm8, %%ymm8\n"
        "vsubps %%ymm2, %%ymm8, %%ymm9\n"
        "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n"
        "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n"
        "vaddps %%ymm10, %%ymm11, %%ymm2\n"
        "vxorps %%ymm8, %%ymm8, %%ymm8\n"
        "vsubps %%ymm3, %%ymm8, %%ymm9\n"
        "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n"
        "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n"
        "vaddps %%ymm10, %%ymm11, %%ymm3\n"
        "vxorps %%ymm8, %%ymm8, %%ymm8\n"
        "vsubps %%ymm4, %%ymm8, %%ymm9\n"
        "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n"
        "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n"
        "vaddps %%ymm10, %%ymm11, %%ymm4\n"
        "vxorps %%ymm8, %%ymm8, %%ymm8\n"
        "vsubps %%ymm5, %%ymm8, %%ymm9\n"
        "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n"
        "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n"
        "vaddps %%ymm10, %%ymm11, %%ymm5\n"
        "vxorps %%ymm8, %%ymm8, %%ymm8\n"
        "vsubps %%ymm6, %%ymm8, %%ymm9\n"
        "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n"
        "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n"
        "vaddps %%ymm10, %%ymm11, %%ymm6\n"
        "vxorps %%ymm8, %%ymm8, %%ymm8\n"
        "vsubps %%ymm7, %%ymm8, %%ymm9\n"
        "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n"
        "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n"
        "vaddps %%ymm10, %%ymm11, %%ymm7\n"
        /* Stages 4-6: radix-2 butterflies across the eight registers
         * (memory strides 8, 16, 32). */
        "vaddps %%ymm1, %%ymm0, %%ymm8\n"
        "vsubps %%ymm1, %%ymm0, %%ymm9\n"
        "vaddps %%ymm3, %%ymm2, %%ymm10\n"
        "vsubps %%ymm3, %%ymm2, %%ymm11\n"
        "vaddps %%ymm5, %%ymm4, %%ymm12\n"
        "vsubps %%ymm5, %%ymm4, %%ymm13\n"
        "vaddps %%ymm7, %%ymm6, %%ymm14\n"
        "vsubps %%ymm7, %%ymm6, %%ymm15\n"
        "vaddps %%ymm10, %%ymm8, %%ymm0\n"
        "vsubps %%ymm10, %%ymm8, %%ymm2\n"
        "vaddps %%ymm11, %%ymm9, %%ymm1\n"
        "vsubps %%ymm11, %%ymm9, %%ymm3\n"
        "vaddps %%ymm14, %%ymm12, %%ymm4\n"
        "vsubps %%ymm14, %%ymm12, %%ymm6\n"
        "vaddps %%ymm15, %%ymm13, %%ymm5\n"
        "vsubps %%ymm15, %%ymm13, %%ymm7\n"
        "vaddps %%ymm4, %%ymm0, %%ymm8\n"
        "vsubps %%ymm4, %%ymm0, %%ymm12\n"
        "vaddps %%ymm5, %%ymm1, %%ymm9\n"
        "vsubps %%ymm5, %%ymm1, %%ymm13\n"
        "vaddps %%ymm6, %%ymm2, %%ymm10\n"
        "vsubps %%ymm6, %%ymm2, %%ymm14\n"
        "vaddps %%ymm7, %%ymm3, %%ymm11\n"
        "vsubps %%ymm7, %%ymm3, %%ymm15\n"
        "vmovups %%ymm8, (%0)\n"
        "vmovups %%ymm9, (%1)\n"
        "vmovups %%ymm10, (%2)\n"
        "vmovups %%ymm11, (%3)\n"
        "vmovups %%ymm12, (%4)\n"
        "vmovups %%ymm13, (%5)\n"
        "vmovups %%ymm14, (%6)\n"
        "vmovups %%ymm15, (%7)\n"
        :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
      );
    }
  }
  /* Stages 7-9: butterflies between the eight 64-float groups
   * (strides 64, 128, 256), combining the per-group transforms. */
  for (int j = 0; j < 512; j += 512) {
    for (int k = 0; k < 64; k += 8) {
      __asm__ volatile (
        "vmovups (%0), %%ymm0\n"
        "vmovups (%1), %%ymm1\n"
        "vmovups (%2), %%ymm2\n"
        "vmovups (%3), %%ymm3\n"
        "vmovups (%4), %%ymm4\n"
        "vmovups (%5), %%ymm5\n"
        "vmovups (%6), %%ymm6\n"
        "vmovups (%7), %%ymm7\n"
        "vaddps %%ymm1, %%ymm0, %%ymm8\n"
        "vsubps %%ymm1, %%ymm0, %%ymm9\n"
        "vaddps %%ymm3, %%ymm2, %%ymm10\n"
        "vsubps %%ymm3, %%ymm2, %%ymm11\n"
        "vaddps %%ymm5, %%ymm4, %%ymm12\n"
        "vsubps %%ymm5, %%ymm4, %%ymm13\n"
        "vaddps %%ymm7, %%ymm6, %%ymm14\n"
        "vsubps %%ymm7, %%ymm6, %%ymm15\n"
        "vaddps %%ymm10, %%ymm8, %%ymm0\n"
        "vsubps %%ymm10, %%ymm8, %%ymm2\n"
        "vaddps %%ymm11, %%ymm9, %%ymm1\n"
        "vsubps %%ymm11, %%ymm9, %%ymm3\n"
        "vaddps %%ymm14, %%ymm12, %%ymm4\n"
        "vsubps %%ymm14, %%ymm12, %%ymm6\n"
        "vaddps %%ymm15, %%ymm13, %%ymm5\n"
        "vsubps %%ymm15, %%ymm13, %%ymm7\n"
        "vaddps %%ymm4, %%ymm0, %%ymm8\n"
        "vsubps %%ymm4, %%ymm0, %%ymm12\n"
        "vaddps %%ymm5, %%ymm1, %%ymm9\n"
        "vsubps %%ymm5, %%ymm1, %%ymm13\n"
        "vaddps %%ymm6, %%ymm2, %%ymm10\n"
        "vsubps %%ymm6, %%ymm2, %%ymm14\n"
        "vaddps %%ymm7, %%ymm3, %%ymm11\n"
        "vsubps %%ymm7, %%ymm3, %%ymm15\n"
        "vmovups %%ymm8, (%0)\n"
        "vmovups %%ymm9, (%1)\n"
        "vmovups %%ymm10, (%2)\n"
        "vmovups %%ymm11, (%3)\n"
        "vmovups %%ymm12, (%4)\n"
        "vmovups %%ymm13, (%5)\n"
        "vmovups %%ymm14, (%6)\n"
        "vmovups %%ymm15, (%7)\n"
        :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
      );
    }
  }
}
void helper_float_10_recursive(float *buf, int depth);
/*
 * In-place unnormalized Fast Walsh-Hadamard Transform of 1024 floats (2^10).
 *
 * NOTE(review): only the depth == 10 case is implemented here; for any other
 * depth the function silently does nothing. The only visible caller
 * (helper_float_10) always passes 10 — confirm before relying on other depths.
 *
 * Stage plan (10 stages): stages 1-6 inside each 64-float block (three
 * in-register steps via vpermilps/vaddsubps/vblendps/vperm2f128, then three
 * cross-register butterflies), stages 7-9 across 64-float groups within each
 * 512-float half, and stage 10 combining the two 512-float halves.
 */
void helper_float_10_recursive(float *buf, int depth) {
  if (depth == 10) {
    /* Stages 1-6: fully transform each contiguous 64-float block. */
    for (int j = 0; j < 1024; j += 64) {
      for (int k = 0; k < 8; k += 8) {
        __asm__ volatile (
          "vmovups (%0), %%ymm0\n"
          "vmovups (%1), %%ymm1\n"
          "vmovups (%2), %%ymm2\n"
          "vmovups (%3), %%ymm3\n"
          "vmovups (%4), %%ymm4\n"
          "vmovups (%5), %%ymm5\n"
          "vmovups (%6), %%ymm6\n"
          "vmovups (%7), %%ymm7\n"
          /* Stage 1: stride-1 pairs within each register. */
          "vpermilps $160, %%ymm0, %%ymm8\n"
          "vpermilps $245, %%ymm0, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm0\n"
          "vpermilps $160, %%ymm1, %%ymm8\n"
          "vpermilps $245, %%ymm1, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm1\n"
          "vpermilps $160, %%ymm2, %%ymm8\n"
          "vpermilps $245, %%ymm2, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm2\n"
          "vpermilps $160, %%ymm3, %%ymm8\n"
          "vpermilps $245, %%ymm3, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm3\n"
          "vpermilps $160, %%ymm4, %%ymm8\n"
          "vpermilps $245, %%ymm4, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm4\n"
          "vpermilps $160, %%ymm5, %%ymm8\n"
          "vpermilps $245, %%ymm5, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm5\n"
          "vpermilps $160, %%ymm6, %%ymm8\n"
          "vpermilps $245, %%ymm6, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm6\n"
          "vpermilps $160, %%ymm7, %%ymm8\n"
          "vpermilps $245, %%ymm7, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm7\n"
          /* Stage 2: stride-2 pairs within each register. */
          "vpermilps $68, %%ymm0, %%ymm8\n"
          "vpermilps $238, %%ymm0, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm0\n"
          "vpermilps $68, %%ymm1, %%ymm8\n"
          "vpermilps $238, %%ymm1, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm1\n"
          "vpermilps $68, %%ymm2, %%ymm8\n"
          "vpermilps $238, %%ymm2, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm2\n"
          "vpermilps $68, %%ymm3, %%ymm8\n"
          "vpermilps $238, %%ymm3, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm3\n"
          "vpermilps $68, %%ymm4, %%ymm8\n"
          "vpermilps $238, %%ymm4, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm4\n"
          "vpermilps $68, %%ymm5, %%ymm8\n"
          "vpermilps $238, %%ymm5, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm5\n"
          "vpermilps $68, %%ymm6, %%ymm8\n"
          "vpermilps $238, %%ymm6, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm6\n"
          "vpermilps $68, %%ymm7, %%ymm8\n"
          "vpermilps $238, %%ymm7, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm7\n"
          /* Stage 3: stride-4, across the two 128-bit halves of each ymm. */
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm0, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm0\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm1, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm1\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm2, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm2\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm3, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm3\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm4, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm4\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm5, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm5\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm6, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm6\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm7, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm7\n"
          /* Stages 4-6: butterflies across the eight registers. */
          "vaddps %%ymm1, %%ymm0, %%ymm8\n"
          "vsubps %%ymm1, %%ymm0, %%ymm9\n"
          "vaddps %%ymm3, %%ymm2, %%ymm10\n"
          "vsubps %%ymm3, %%ymm2, %%ymm11\n"
          "vaddps %%ymm5, %%ymm4, %%ymm12\n"
          "vsubps %%ymm5, %%ymm4, %%ymm13\n"
          "vaddps %%ymm7, %%ymm6, %%ymm14\n"
          "vsubps %%ymm7, %%ymm6, %%ymm15\n"
          "vaddps %%ymm10, %%ymm8, %%ymm0\n"
          "vsubps %%ymm10, %%ymm8, %%ymm2\n"
          "vaddps %%ymm11, %%ymm9, %%ymm1\n"
          "vsubps %%ymm11, %%ymm9, %%ymm3\n"
          "vaddps %%ymm14, %%ymm12, %%ymm4\n"
          "vsubps %%ymm14, %%ymm12, %%ymm6\n"
          "vaddps %%ymm15, %%ymm13, %%ymm5\n"
          "vsubps %%ymm15, %%ymm13, %%ymm7\n"
          "vaddps %%ymm4, %%ymm0, %%ymm8\n"
          "vsubps %%ymm4, %%ymm0, %%ymm12\n"
          "vaddps %%ymm5, %%ymm1, %%ymm9\n"
          "vsubps %%ymm5, %%ymm1, %%ymm13\n"
          "vaddps %%ymm6, %%ymm2, %%ymm10\n"
          "vsubps %%ymm6, %%ymm2, %%ymm14\n"
          "vaddps %%ymm7, %%ymm3, %%ymm11\n"
          "vsubps %%ymm7, %%ymm3, %%ymm15\n"
          "vmovups %%ymm8, (%0)\n"
          "vmovups %%ymm9, (%1)\n"
          "vmovups %%ymm10, (%2)\n"
          "vmovups %%ymm11, (%3)\n"
          "vmovups %%ymm12, (%4)\n"
          "vmovups %%ymm13, (%5)\n"
          "vmovups %%ymm14, (%6)\n"
          "vmovups %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    /* Stages 7-9: butterflies between 64-float groups inside each
     * 512-float half (strides 64, 128, 256). */
    for (int j = 0; j < 1024; j += 512) {
      for (int k = 0; k < 64; k += 8) {
        __asm__ volatile (
          "vmovups (%0), %%ymm0\n"
          "vmovups (%1), %%ymm1\n"
          "vmovups (%2), %%ymm2\n"
          "vmovups (%3), %%ymm3\n"
          "vmovups (%4), %%ymm4\n"
          "vmovups (%5), %%ymm5\n"
          "vmovups (%6), %%ymm6\n"
          "vmovups (%7), %%ymm7\n"
          "vaddps %%ymm1, %%ymm0, %%ymm8\n"
          "vsubps %%ymm1, %%ymm0, %%ymm9\n"
          "vaddps %%ymm3, %%ymm2, %%ymm10\n"
          "vsubps %%ymm3, %%ymm2, %%ymm11\n"
          "vaddps %%ymm5, %%ymm4, %%ymm12\n"
          "vsubps %%ymm5, %%ymm4, %%ymm13\n"
          "vaddps %%ymm7, %%ymm6, %%ymm14\n"
          "vsubps %%ymm7, %%ymm6, %%ymm15\n"
          "vaddps %%ymm10, %%ymm8, %%ymm0\n"
          "vsubps %%ymm10, %%ymm8, %%ymm2\n"
          "vaddps %%ymm11, %%ymm9, %%ymm1\n"
          "vsubps %%ymm11, %%ymm9, %%ymm3\n"
          "vaddps %%ymm14, %%ymm12, %%ymm4\n"
          "vsubps %%ymm14, %%ymm12, %%ymm6\n"
          "vaddps %%ymm15, %%ymm13, %%ymm5\n"
          "vsubps %%ymm15, %%ymm13, %%ymm7\n"
          "vaddps %%ymm4, %%ymm0, %%ymm8\n"
          "vsubps %%ymm4, %%ymm0, %%ymm12\n"
          "vaddps %%ymm5, %%ymm1, %%ymm9\n"
          "vsubps %%ymm5, %%ymm1, %%ymm13\n"
          "vaddps %%ymm6, %%ymm2, %%ymm10\n"
          "vsubps %%ymm6, %%ymm2, %%ymm14\n"
          "vaddps %%ymm7, %%ymm3, %%ymm11\n"
          "vsubps %%ymm7, %%ymm3, %%ymm15\n"
          "vmovups %%ymm8, (%0)\n"
          "vmovups %%ymm9, (%1)\n"
          "vmovups %%ymm10, (%2)\n"
          "vmovups %%ymm11, (%3)\n"
          "vmovups %%ymm12, (%4)\n"
          "vmovups %%ymm13, (%5)\n"
          "vmovups %%ymm14, (%6)\n"
          "vmovups %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    /* Stage 10: final butterfly combining the two 512-float halves. */
    for (int j = 0; j < 1024; j += 1024) {
      for (int k = 0; k < 512; k += 8) {
        __asm__ volatile (
          "vmovups (%0), %%ymm0\n"
          "vmovups (%1), %%ymm1\n"
          "vaddps %%ymm1, %%ymm0, %%ymm8\n"
          "vsubps %%ymm1, %%ymm0, %%ymm9\n"
          "vmovups %%ymm8, (%0)\n"
          "vmovups %%ymm9, (%1)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 512) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
}
void helper_float_10(float *buf);
/* Public entry point for the 1024-point (2^10) in-place transform.
 * Simply dispatches to the recursive worker at full depth. */
void helper_float_10(float *buf) {
  const int full_depth = 10; /* log2 of the 1024-float transform size */
  helper_float_10_recursive(buf, full_depth);
}
void helper_float_11_recursive(float *buf, int depth);
/*
 * In-place unnormalized Fast Walsh-Hadamard Transform of 2048 floats (2^11).
 *
 * NOTE(review): only the depth == 11 case is implemented; for any other depth
 * the function silently does nothing. The only visible caller
 * (helper_float_11) always passes 11 — confirm before relying on other depths.
 *
 * Stage plan (11 stages): stages 1-6 inside each 64-float block, stages 7-9
 * across 64-float groups within each 512-float block, and stages 10-11 as a
 * radix-4 combine of the four 512-float blocks.
 */
void helper_float_11_recursive(float *buf, int depth) {
  if (depth == 11) {
    /* Stages 1-6: fully transform each contiguous 64-float block. */
    for (int j = 0; j < 2048; j += 64) {
      for (int k = 0; k < 8; k += 8) {
        __asm__ volatile (
          "vmovups (%0), %%ymm0\n"
          "vmovups (%1), %%ymm1\n"
          "vmovups (%2), %%ymm2\n"
          "vmovups (%3), %%ymm3\n"
          "vmovups (%4), %%ymm4\n"
          "vmovups (%5), %%ymm5\n"
          "vmovups (%6), %%ymm6\n"
          "vmovups (%7), %%ymm7\n"
          /* Stage 1: stride-1 pairs within each register. */
          "vpermilps $160, %%ymm0, %%ymm8\n"
          "vpermilps $245, %%ymm0, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm0\n"
          "vpermilps $160, %%ymm1, %%ymm8\n"
          "vpermilps $245, %%ymm1, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm1\n"
          "vpermilps $160, %%ymm2, %%ymm8\n"
          "vpermilps $245, %%ymm2, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm2\n"
          "vpermilps $160, %%ymm3, %%ymm8\n"
          "vpermilps $245, %%ymm3, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm3\n"
          "vpermilps $160, %%ymm4, %%ymm8\n"
          "vpermilps $245, %%ymm4, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm4\n"
          "vpermilps $160, %%ymm5, %%ymm8\n"
          "vpermilps $245, %%ymm5, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm5\n"
          "vpermilps $160, %%ymm6, %%ymm8\n"
          "vpermilps $245, %%ymm6, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm6\n"
          "vpermilps $160, %%ymm7, %%ymm8\n"
          "vpermilps $245, %%ymm7, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm7\n"
          /* Stage 2: stride-2 pairs within each register. */
          "vpermilps $68, %%ymm0, %%ymm8\n"
          "vpermilps $238, %%ymm0, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm0\n"
          "vpermilps $68, %%ymm1, %%ymm8\n"
          "vpermilps $238, %%ymm1, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm1\n"
          "vpermilps $68, %%ymm2, %%ymm8\n"
          "vpermilps $238, %%ymm2, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm2\n"
          "vpermilps $68, %%ymm3, %%ymm8\n"
          "vpermilps $238, %%ymm3, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm3\n"
          "vpermilps $68, %%ymm4, %%ymm8\n"
          "vpermilps $238, %%ymm4, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm4\n"
          "vpermilps $68, %%ymm5, %%ymm8\n"
          "vpermilps $238, %%ymm5, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm5\n"
          "vpermilps $68, %%ymm6, %%ymm8\n"
          "vpermilps $238, %%ymm6, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm6\n"
          "vpermilps $68, %%ymm7, %%ymm8\n"
          "vpermilps $238, %%ymm7, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm7\n"
          /* Stage 3: stride-4, across the two 128-bit halves of each ymm. */
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm0, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm0\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm1, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm1\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm2, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm2\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm3, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm3\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm4, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm4\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm5, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm5\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm6, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm6\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm7, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm7\n"
          /* Stages 4-6: butterflies across the eight registers. */
          "vaddps %%ymm1, %%ymm0, %%ymm8\n"
          "vsubps %%ymm1, %%ymm0, %%ymm9\n"
          "vaddps %%ymm3, %%ymm2, %%ymm10\n"
          "vsubps %%ymm3, %%ymm2, %%ymm11\n"
          "vaddps %%ymm5, %%ymm4, %%ymm12\n"
          "vsubps %%ymm5, %%ymm4, %%ymm13\n"
          "vaddps %%ymm7, %%ymm6, %%ymm14\n"
          "vsubps %%ymm7, %%ymm6, %%ymm15\n"
          "vaddps %%ymm10, %%ymm8, %%ymm0\n"
          "vsubps %%ymm10, %%ymm8, %%ymm2\n"
          "vaddps %%ymm11, %%ymm9, %%ymm1\n"
          "vsubps %%ymm11, %%ymm9, %%ymm3\n"
          "vaddps %%ymm14, %%ymm12, %%ymm4\n"
          "vsubps %%ymm14, %%ymm12, %%ymm6\n"
          "vaddps %%ymm15, %%ymm13, %%ymm5\n"
          "vsubps %%ymm15, %%ymm13, %%ymm7\n"
          "vaddps %%ymm4, %%ymm0, %%ymm8\n"
          "vsubps %%ymm4, %%ymm0, %%ymm12\n"
          "vaddps %%ymm5, %%ymm1, %%ymm9\n"
          "vsubps %%ymm5, %%ymm1, %%ymm13\n"
          "vaddps %%ymm6, %%ymm2, %%ymm10\n"
          "vsubps %%ymm6, %%ymm2, %%ymm14\n"
          "vaddps %%ymm7, %%ymm3, %%ymm11\n"
          "vsubps %%ymm7, %%ymm3, %%ymm15\n"
          "vmovups %%ymm8, (%0)\n"
          "vmovups %%ymm9, (%1)\n"
          "vmovups %%ymm10, (%2)\n"
          "vmovups %%ymm11, (%3)\n"
          "vmovups %%ymm12, (%4)\n"
          "vmovups %%ymm13, (%5)\n"
          "vmovups %%ymm14, (%6)\n"
          "vmovups %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    /* Stages 7-9: butterflies between 64-float groups inside each
     * 512-float block (strides 64, 128, 256). */
    for (int j = 0; j < 2048; j += 512) {
      for (int k = 0; k < 64; k += 8) {
        __asm__ volatile (
          "vmovups (%0), %%ymm0\n"
          "vmovups (%1), %%ymm1\n"
          "vmovups (%2), %%ymm2\n"
          "vmovups (%3), %%ymm3\n"
          "vmovups (%4), %%ymm4\n"
          "vmovups (%5), %%ymm5\n"
          "vmovups (%6), %%ymm6\n"
          "vmovups (%7), %%ymm7\n"
          "vaddps %%ymm1, %%ymm0, %%ymm8\n"
          "vsubps %%ymm1, %%ymm0, %%ymm9\n"
          "vaddps %%ymm3, %%ymm2, %%ymm10\n"
          "vsubps %%ymm3, %%ymm2, %%ymm11\n"
          "vaddps %%ymm5, %%ymm4, %%ymm12\n"
          "vsubps %%ymm5, %%ymm4, %%ymm13\n"
          "vaddps %%ymm7, %%ymm6, %%ymm14\n"
          "vsubps %%ymm7, %%ymm6, %%ymm15\n"
          "vaddps %%ymm10, %%ymm8, %%ymm0\n"
          "vsubps %%ymm10, %%ymm8, %%ymm2\n"
          "vaddps %%ymm11, %%ymm9, %%ymm1\n"
          "vsubps %%ymm11, %%ymm9, %%ymm3\n"
          "vaddps %%ymm14, %%ymm12, %%ymm4\n"
          "vsubps %%ymm14, %%ymm12, %%ymm6\n"
          "vaddps %%ymm15, %%ymm13, %%ymm5\n"
          "vsubps %%ymm15, %%ymm13, %%ymm7\n"
          "vaddps %%ymm4, %%ymm0, %%ymm8\n"
          "vsubps %%ymm4, %%ymm0, %%ymm12\n"
          "vaddps %%ymm5, %%ymm1, %%ymm9\n"
          "vsubps %%ymm5, %%ymm1, %%ymm13\n"
          "vaddps %%ymm6, %%ymm2, %%ymm10\n"
          "vsubps %%ymm6, %%ymm2, %%ymm14\n"
          "vaddps %%ymm7, %%ymm3, %%ymm11\n"
          "vsubps %%ymm7, %%ymm3, %%ymm15\n"
          "vmovups %%ymm8, (%0)\n"
          "vmovups %%ymm9, (%1)\n"
          "vmovups %%ymm10, (%2)\n"
          "vmovups %%ymm11, (%3)\n"
          "vmovups %%ymm12, (%4)\n"
          "vmovups %%ymm13, (%5)\n"
          "vmovups %%ymm14, (%6)\n"
          "vmovups %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    /* Stages 10-11: radix-4 combine of the four 512-float blocks
     * (strides 512 and 1024). */
    for (int j = 0; j < 2048; j += 2048) {
      for (int k = 0; k < 512; k += 8) {
        __asm__ volatile (
          "vmovups (%0), %%ymm0\n"
          "vmovups (%1), %%ymm1\n"
          "vmovups (%2), %%ymm2\n"
          "vmovups (%3), %%ymm3\n"
          "vaddps %%ymm1, %%ymm0, %%ymm8\n"
          "vsubps %%ymm1, %%ymm0, %%ymm9\n"
          "vaddps %%ymm3, %%ymm2, %%ymm10\n"
          "vsubps %%ymm3, %%ymm2, %%ymm11\n"
          "vaddps %%ymm10, %%ymm8, %%ymm0\n"
          "vsubps %%ymm10, %%ymm8, %%ymm2\n"
          "vaddps %%ymm11, %%ymm9, %%ymm1\n"
          "vsubps %%ymm11, %%ymm9, %%ymm3\n"
          "vmovups %%ymm0, (%0)\n"
          "vmovups %%ymm1, (%1)\n"
          "vmovups %%ymm2, (%2)\n"
          "vmovups %%ymm3, (%3)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
}
void helper_float_11(float *buf);
/* Public entry point for the 2048-point (2^11) in-place transform.
 * Simply dispatches to the recursive worker at full depth. */
void helper_float_11(float *buf) {
  const int full_depth = 11; /* log2 of the 2048-float transform size */
  helper_float_11_recursive(buf, full_depth);
}
1512 static inline void helper_float_12(float *buf);
helper_float_12(float * buf)1513 static inline void helper_float_12(float *buf) {
1514 for (int j = 0; j < 4096; j += 64) {
1515 for (int k = 0; k < 8; k += 8) {
1516 __asm__ volatile (
1517 "vmovups (%0), %%ymm0\n"
1518 "vmovups (%1), %%ymm1\n"
1519 "vmovups (%2), %%ymm2\n"
1520 "vmovups (%3), %%ymm3\n"
1521 "vmovups (%4), %%ymm4\n"
1522 "vmovups (%5), %%ymm5\n"
1523 "vmovups (%6), %%ymm6\n"
1524 "vmovups (%7), %%ymm7\n"
1525 "vpermilps $160, %%ymm0, %%ymm8\n"
1526 "vpermilps $245, %%ymm0, %%ymm9\n"
1527 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1528 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1529 "vaddsubps %%ymm11, %%ymm8, %%ymm0\n"
1530 "vpermilps $160, %%ymm1, %%ymm8\n"
1531 "vpermilps $245, %%ymm1, %%ymm9\n"
1532 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1533 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1534 "vaddsubps %%ymm11, %%ymm8, %%ymm1\n"
1535 "vpermilps $160, %%ymm2, %%ymm8\n"
1536 "vpermilps $245, %%ymm2, %%ymm9\n"
1537 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1538 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1539 "vaddsubps %%ymm11, %%ymm8, %%ymm2\n"
1540 "vpermilps $160, %%ymm3, %%ymm8\n"
1541 "vpermilps $245, %%ymm3, %%ymm9\n"
1542 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1543 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1544 "vaddsubps %%ymm11, %%ymm8, %%ymm3\n"
1545 "vpermilps $160, %%ymm4, %%ymm8\n"
1546 "vpermilps $245, %%ymm4, %%ymm9\n"
1547 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1548 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1549 "vaddsubps %%ymm11, %%ymm8, %%ymm4\n"
1550 "vpermilps $160, %%ymm5, %%ymm8\n"
1551 "vpermilps $245, %%ymm5, %%ymm9\n"
1552 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1553 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1554 "vaddsubps %%ymm11, %%ymm8, %%ymm5\n"
1555 "vpermilps $160, %%ymm6, %%ymm8\n"
1556 "vpermilps $245, %%ymm6, %%ymm9\n"
1557 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1558 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1559 "vaddsubps %%ymm11, %%ymm8, %%ymm6\n"
1560 "vpermilps $160, %%ymm7, %%ymm8\n"
1561 "vpermilps $245, %%ymm7, %%ymm9\n"
1562 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1563 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1564 "vaddsubps %%ymm11, %%ymm8, %%ymm7\n"
1565 "vpermilps $68, %%ymm0, %%ymm8\n"
1566 "vpermilps $238, %%ymm0, %%ymm9\n"
1567 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1568 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1569 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
1570 "vaddps %%ymm8, %%ymm12, %%ymm0\n"
1571 "vpermilps $68, %%ymm1, %%ymm8\n"
1572 "vpermilps $238, %%ymm1, %%ymm9\n"
1573 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1574 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1575 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
1576 "vaddps %%ymm8, %%ymm12, %%ymm1\n"
1577 "vpermilps $68, %%ymm2, %%ymm8\n"
1578 "vpermilps $238, %%ymm2, %%ymm9\n"
1579 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1580 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1581 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
1582 "vaddps %%ymm8, %%ymm12, %%ymm2\n"
1583 "vpermilps $68, %%ymm3, %%ymm8\n"
1584 "vpermilps $238, %%ymm3, %%ymm9\n"
1585 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1586 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1587 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
1588 "vaddps %%ymm8, %%ymm12, %%ymm3\n"
1589 "vpermilps $68, %%ymm4, %%ymm8\n"
1590 "vpermilps $238, %%ymm4, %%ymm9\n"
1591 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1592 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1593 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
1594 "vaddps %%ymm8, %%ymm12, %%ymm4\n"
1595 "vpermilps $68, %%ymm5, %%ymm8\n"
1596 "vpermilps $238, %%ymm5, %%ymm9\n"
1597 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1598 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1599 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
1600 "vaddps %%ymm8, %%ymm12, %%ymm5\n"
1601 "vpermilps $68, %%ymm6, %%ymm8\n"
1602 "vpermilps $238, %%ymm6, %%ymm9\n"
1603 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1604 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1605 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
1606 "vaddps %%ymm8, %%ymm12, %%ymm6\n"
1607 "vpermilps $68, %%ymm7, %%ymm8\n"
1608 "vpermilps $238, %%ymm7, %%ymm9\n"
1609 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
1610 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
1611 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
1612 "vaddps %%ymm8, %%ymm12, %%ymm7\n"
1613 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
1614 "vsubps %%ymm0, %%ymm8, %%ymm9\n"
1615 "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n"
1616 "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n"
1617 "vaddps %%ymm10, %%ymm11, %%ymm0\n"
1618 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
1619 "vsubps %%ymm1, %%ymm8, %%ymm9\n"
1620 "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n"
1621 "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n"
1622 "vaddps %%ymm10, %%ymm11, %%ymm1\n"
1623 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
1624 "vsubps %%ymm2, %%ymm8, %%ymm9\n"
1625 "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n"
1626 "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n"
1627 "vaddps %%ymm10, %%ymm11, %%ymm2\n"
1628 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
1629 "vsubps %%ymm3, %%ymm8, %%ymm9\n"
1630 "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n"
1631 "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n"
1632 "vaddps %%ymm10, %%ymm11, %%ymm3\n"
1633 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
1634 "vsubps %%ymm4, %%ymm8, %%ymm9\n"
1635 "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n"
1636 "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n"
1637 "vaddps %%ymm10, %%ymm11, %%ymm4\n"
1638 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
1639 "vsubps %%ymm5, %%ymm8, %%ymm9\n"
1640 "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n"
1641 "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n"
1642 "vaddps %%ymm10, %%ymm11, %%ymm5\n"
1643 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
1644 "vsubps %%ymm6, %%ymm8, %%ymm9\n"
1645 "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n"
1646 "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n"
1647 "vaddps %%ymm10, %%ymm11, %%ymm6\n"
1648 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
1649 "vsubps %%ymm7, %%ymm8, %%ymm9\n"
1650 "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n"
1651 "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n"
1652 "vaddps %%ymm10, %%ymm11, %%ymm7\n"
1653 "vaddps %%ymm1, %%ymm0, %%ymm8\n"
1654 "vsubps %%ymm1, %%ymm0, %%ymm9\n"
1655 "vaddps %%ymm3, %%ymm2, %%ymm10\n"
1656 "vsubps %%ymm3, %%ymm2, %%ymm11\n"
1657 "vaddps %%ymm5, %%ymm4, %%ymm12\n"
1658 "vsubps %%ymm5, %%ymm4, %%ymm13\n"
1659 "vaddps %%ymm7, %%ymm6, %%ymm14\n"
1660 "vsubps %%ymm7, %%ymm6, %%ymm15\n"
1661 "vaddps %%ymm10, %%ymm8, %%ymm0\n"
1662 "vsubps %%ymm10, %%ymm8, %%ymm2\n"
1663 "vaddps %%ymm11, %%ymm9, %%ymm1\n"
1664 "vsubps %%ymm11, %%ymm9, %%ymm3\n"
1665 "vaddps %%ymm14, %%ymm12, %%ymm4\n"
1666 "vsubps %%ymm14, %%ymm12, %%ymm6\n"
1667 "vaddps %%ymm15, %%ymm13, %%ymm5\n"
1668 "vsubps %%ymm15, %%ymm13, %%ymm7\n"
1669 "vaddps %%ymm4, %%ymm0, %%ymm8\n"
1670 "vsubps %%ymm4, %%ymm0, %%ymm12\n"
1671 "vaddps %%ymm5, %%ymm1, %%ymm9\n"
1672 "vsubps %%ymm5, %%ymm1, %%ymm13\n"
1673 "vaddps %%ymm6, %%ymm2, %%ymm10\n"
1674 "vsubps %%ymm6, %%ymm2, %%ymm14\n"
1675 "vaddps %%ymm7, %%ymm3, %%ymm11\n"
1676 "vsubps %%ymm7, %%ymm3, %%ymm15\n"
1677 "vmovups %%ymm8, (%0)\n"
1678 "vmovups %%ymm9, (%1)\n"
1679 "vmovups %%ymm10, (%2)\n"
1680 "vmovups %%ymm11, (%3)\n"
1681 "vmovups %%ymm12, (%4)\n"
1682 "vmovups %%ymm13, (%5)\n"
1683 "vmovups %%ymm14, (%6)\n"
1684 "vmovups %%ymm15, (%7)\n"
1685 :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
1686 );
1687 }
1688 }
1689 for (int j = 0; j < 4096; j += 512) {
1690 for (int k = 0; k < 64; k += 8) {
1691 __asm__ volatile (
1692 "vmovups (%0), %%ymm0\n"
1693 "vmovups (%1), %%ymm1\n"
1694 "vmovups (%2), %%ymm2\n"
1695 "vmovups (%3), %%ymm3\n"
1696 "vmovups (%4), %%ymm4\n"
1697 "vmovups (%5), %%ymm5\n"
1698 "vmovups (%6), %%ymm6\n"
1699 "vmovups (%7), %%ymm7\n"
1700 "vaddps %%ymm1, %%ymm0, %%ymm8\n"
1701 "vsubps %%ymm1, %%ymm0, %%ymm9\n"
1702 "vaddps %%ymm3, %%ymm2, %%ymm10\n"
1703 "vsubps %%ymm3, %%ymm2, %%ymm11\n"
1704 "vaddps %%ymm5, %%ymm4, %%ymm12\n"
1705 "vsubps %%ymm5, %%ymm4, %%ymm13\n"
1706 "vaddps %%ymm7, %%ymm6, %%ymm14\n"
1707 "vsubps %%ymm7, %%ymm6, %%ymm15\n"
1708 "vaddps %%ymm10, %%ymm8, %%ymm0\n"
1709 "vsubps %%ymm10, %%ymm8, %%ymm2\n"
1710 "vaddps %%ymm11, %%ymm9, %%ymm1\n"
1711 "vsubps %%ymm11, %%ymm9, %%ymm3\n"
1712 "vaddps %%ymm14, %%ymm12, %%ymm4\n"
1713 "vsubps %%ymm14, %%ymm12, %%ymm6\n"
1714 "vaddps %%ymm15, %%ymm13, %%ymm5\n"
1715 "vsubps %%ymm15, %%ymm13, %%ymm7\n"
1716 "vaddps %%ymm4, %%ymm0, %%ymm8\n"
1717 "vsubps %%ymm4, %%ymm0, %%ymm12\n"
1718 "vaddps %%ymm5, %%ymm1, %%ymm9\n"
1719 "vsubps %%ymm5, %%ymm1, %%ymm13\n"
1720 "vaddps %%ymm6, %%ymm2, %%ymm10\n"
1721 "vsubps %%ymm6, %%ymm2, %%ymm14\n"
1722 "vaddps %%ymm7, %%ymm3, %%ymm11\n"
1723 "vsubps %%ymm7, %%ymm3, %%ymm15\n"
1724 "vmovups %%ymm8, (%0)\n"
1725 "vmovups %%ymm9, (%1)\n"
1726 "vmovups %%ymm10, (%2)\n"
1727 "vmovups %%ymm11, (%3)\n"
1728 "vmovups %%ymm12, (%4)\n"
1729 "vmovups %%ymm13, (%5)\n"
1730 "vmovups %%ymm14, (%6)\n"
1731 "vmovups %%ymm15, (%7)\n"
1732 :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
1733 );
1734 }
1735 }
1736 for (int j = 0; j < 4096; j += 4096) {
1737 for (int k = 0; k < 512; k += 8) {
1738 __asm__ volatile (
1739 "vmovups (%0), %%ymm0\n"
1740 "vmovups (%1), %%ymm1\n"
1741 "vmovups (%2), %%ymm2\n"
1742 "vmovups (%3), %%ymm3\n"
1743 "vmovups (%4), %%ymm4\n"
1744 "vmovups (%5), %%ymm5\n"
1745 "vmovups (%6), %%ymm6\n"
1746 "vmovups (%7), %%ymm7\n"
1747 "vaddps %%ymm1, %%ymm0, %%ymm8\n"
1748 "vsubps %%ymm1, %%ymm0, %%ymm9\n"
1749 "vaddps %%ymm3, %%ymm2, %%ymm10\n"
1750 "vsubps %%ymm3, %%ymm2, %%ymm11\n"
1751 "vaddps %%ymm5, %%ymm4, %%ymm12\n"
1752 "vsubps %%ymm5, %%ymm4, %%ymm13\n"
1753 "vaddps %%ymm7, %%ymm6, %%ymm14\n"
1754 "vsubps %%ymm7, %%ymm6, %%ymm15\n"
1755 "vaddps %%ymm10, %%ymm8, %%ymm0\n"
1756 "vsubps %%ymm10, %%ymm8, %%ymm2\n"
1757 "vaddps %%ymm11, %%ymm9, %%ymm1\n"
1758 "vsubps %%ymm11, %%ymm9, %%ymm3\n"
1759 "vaddps %%ymm14, %%ymm12, %%ymm4\n"
1760 "vsubps %%ymm14, %%ymm12, %%ymm6\n"
1761 "vaddps %%ymm15, %%ymm13, %%ymm5\n"
1762 "vsubps %%ymm15, %%ymm13, %%ymm7\n"
1763 "vaddps %%ymm4, %%ymm0, %%ymm8\n"
1764 "vsubps %%ymm4, %%ymm0, %%ymm12\n"
1765 "vaddps %%ymm5, %%ymm1, %%ymm9\n"
1766 "vsubps %%ymm5, %%ymm1, %%ymm13\n"
1767 "vaddps %%ymm6, %%ymm2, %%ymm10\n"
1768 "vsubps %%ymm6, %%ymm2, %%ymm14\n"
1769 "vaddps %%ymm7, %%ymm3, %%ymm11\n"
1770 "vsubps %%ymm7, %%ymm3, %%ymm15\n"
1771 "vmovups %%ymm8, (%0)\n"
1772 "vmovups %%ymm9, (%1)\n"
1773 "vmovups %%ymm10, (%2)\n"
1774 "vmovups %%ymm11, (%3)\n"
1775 "vmovups %%ymm12, (%4)\n"
1776 "vmovups %%ymm13, (%5)\n"
1777 "vmovups %%ymm14, (%6)\n"
1778 "vmovups %%ymm15, (%7)\n"
1779 :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
1780 );
1781 }
1782 }
1783 }
1784 void helper_float_13_recursive(float *buf, int depth);
/*
 * In-place, unnormalized Fast Walsh-Hadamard transform of 2^depth floats.
 *
 * Auto-generated AVX kernel. Only the depths actually reached from
 * helper_float_13 are implemented (13, and the recursion leaf 11); any
 * other depth falls through and leaves buf untouched.
 *
 * Butterfly notation below: a "stride-s butterfly" replaces every pair
 * (buf[i], buf[i+s]) with (buf[i]+buf[i+s], buf[i]-buf[i+s]).
 *
 * depth == 11 transforms 2048 floats in three passes:
 *   1) a full size-64 transform inside each 64-float tile (strides 1..32),
 *      held in ymm0..ymm7;
 *   2) a radix-8 pass combining 8 blocks of 64 (strides 64/128/256);
 *   3) a radix-4 pass combining 4 blocks of 512 (strides 512/1024).
 * depth == 13 runs four recursive size-2048 transforms on the quarters,
 * then one radix-4 combining pass at stride 2048.
 *
 * buf must hold 2^depth floats; vmovups imposes no alignment requirement.
 * NOTE(review): requires AVX at runtime -- no feature check is done here.
 */
void helper_float_13_recursive(float *buf, int depth) {
  if (depth == 11) {
    /* Pass 1: size-64 transform per 64-float tile. */
    for (int j = 0; j < 2048; j += 64) {
      for (int k = 0; k < 8; k += 8) {
        __asm__ volatile (
          /* Load one 64-float tile into ymm0..ymm7. */
          "vmovups (%0), %%ymm0\n"
          "vmovups (%1), %%ymm1\n"
          "vmovups (%2), %%ymm2\n"
          "vmovups (%3), %%ymm3\n"
          "vmovups (%4), %%ymm4\n"
          "vmovups (%5), %%ymm5\n"
          "vmovups (%6), %%ymm6\n"
          "vmovups (%7), %%ymm7\n"
          /* Stride-1 butterflies inside each register: ymm8 = even lanes
           * duplicated ($160 -> [0,0,2,2] per 128-bit half), ymm9 = odd
           * lanes duplicated ($245 -> [1,1,3,3]), ymm11 = -ymm9; vaddsubps
           * (even: sub, odd: add) then leaves (a+b, a-b) in each pair. */
          "vpermilps $160, %%ymm0, %%ymm8\n"
          "vpermilps $245, %%ymm0, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm0\n"
          "vpermilps $160, %%ymm1, %%ymm8\n"
          "vpermilps $245, %%ymm1, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm1\n"
          "vpermilps $160, %%ymm2, %%ymm8\n"
          "vpermilps $245, %%ymm2, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm2\n"
          "vpermilps $160, %%ymm3, %%ymm8\n"
          "vpermilps $245, %%ymm3, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm3\n"
          "vpermilps $160, %%ymm4, %%ymm8\n"
          "vpermilps $245, %%ymm4, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm4\n"
          "vpermilps $160, %%ymm5, %%ymm8\n"
          "vpermilps $245, %%ymm5, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm5\n"
          "vpermilps $160, %%ymm6, %%ymm8\n"
          "vpermilps $245, %%ymm6, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm6\n"
          "vpermilps $160, %%ymm7, %%ymm8\n"
          "vpermilps $245, %%ymm7, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm7\n"
          /* Stride-2 butterflies inside each register: ymm8 = low pair
           * duplicated ($68 -> [0,1,0,1]), ymm9 = high pair duplicated
           * ($238 -> [2,3,2,3]); blend $204 picks -ymm9 into lanes 2,3,6,7
           * so the final vaddps yields (lo+hi, lo-hi) per 4-lane group. */
          "vpermilps $68, %%ymm0, %%ymm8\n"
          "vpermilps $238, %%ymm0, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm0\n"
          "vpermilps $68, %%ymm1, %%ymm8\n"
          "vpermilps $238, %%ymm1, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm1\n"
          "vpermilps $68, %%ymm2, %%ymm8\n"
          "vpermilps $238, %%ymm2, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm2\n"
          "vpermilps $68, %%ymm3, %%ymm8\n"
          "vpermilps $238, %%ymm3, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm3\n"
          "vpermilps $68, %%ymm4, %%ymm8\n"
          "vpermilps $238, %%ymm4, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm4\n"
          "vpermilps $68, %%ymm5, %%ymm8\n"
          "vpermilps $238, %%ymm5, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm5\n"
          "vpermilps $68, %%ymm6, %%ymm8\n"
          "vpermilps $238, %%ymm6, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm6\n"
          "vpermilps $68, %%ymm7, %%ymm8\n"
          "vpermilps $238, %%ymm7, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm7\n"
          /* Stride-4 butterflies across the two 128-bit halves of each
           * register: ymm10 = (low, low), ymm11 = (high, -high), so the sum
           * is (low+high, low-high). */
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm0, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm0\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm1, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm1\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm2, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm2\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm3, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm3\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm4, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm4\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm5, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm5\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm6, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm6\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm7, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm7\n"
          /* Strides 8/16/32: radix-8 butterfly network across the tile's
           * eight registers (three layers of add/sub). */
          "vaddps %%ymm1, %%ymm0, %%ymm8\n"
          "vsubps %%ymm1, %%ymm0, %%ymm9\n"
          "vaddps %%ymm3, %%ymm2, %%ymm10\n"
          "vsubps %%ymm3, %%ymm2, %%ymm11\n"
          "vaddps %%ymm5, %%ymm4, %%ymm12\n"
          "vsubps %%ymm5, %%ymm4, %%ymm13\n"
          "vaddps %%ymm7, %%ymm6, %%ymm14\n"
          "vsubps %%ymm7, %%ymm6, %%ymm15\n"
          "vaddps %%ymm10, %%ymm8, %%ymm0\n"
          "vsubps %%ymm10, %%ymm8, %%ymm2\n"
          "vaddps %%ymm11, %%ymm9, %%ymm1\n"
          "vsubps %%ymm11, %%ymm9, %%ymm3\n"
          "vaddps %%ymm14, %%ymm12, %%ymm4\n"
          "vsubps %%ymm14, %%ymm12, %%ymm6\n"
          "vaddps %%ymm15, %%ymm13, %%ymm5\n"
          "vsubps %%ymm15, %%ymm13, %%ymm7\n"
          "vaddps %%ymm4, %%ymm0, %%ymm8\n"
          "vsubps %%ymm4, %%ymm0, %%ymm12\n"
          "vaddps %%ymm5, %%ymm1, %%ymm9\n"
          "vsubps %%ymm5, %%ymm1, %%ymm13\n"
          "vaddps %%ymm6, %%ymm2, %%ymm10\n"
          "vsubps %%ymm6, %%ymm2, %%ymm14\n"
          "vaddps %%ymm7, %%ymm3, %%ymm11\n"
          "vsubps %%ymm7, %%ymm3, %%ymm15\n"
          /* Store the transformed tile back in place. */
          "vmovups %%ymm8, (%0)\n"
          "vmovups %%ymm9, (%1)\n"
          "vmovups %%ymm10, (%2)\n"
          "vmovups %%ymm11, (%3)\n"
          "vmovups %%ymm12, (%4)\n"
          "vmovups %%ymm13, (%5)\n"
          "vmovups %%ymm14, (%6)\n"
          "vmovups %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    /* Pass 2: radix-8 combine of eight 64-float blocks per 512-float
     * group (strides 64, 128, 256). */
    for (int j = 0; j < 2048; j += 512) {
      for (int k = 0; k < 64; k += 8) {
        __asm__ volatile (
          "vmovups (%0), %%ymm0\n"
          "vmovups (%1), %%ymm1\n"
          "vmovups (%2), %%ymm2\n"
          "vmovups (%3), %%ymm3\n"
          "vmovups (%4), %%ymm4\n"
          "vmovups (%5), %%ymm5\n"
          "vmovups (%6), %%ymm6\n"
          "vmovups (%7), %%ymm7\n"
          "vaddps %%ymm1, %%ymm0, %%ymm8\n"
          "vsubps %%ymm1, %%ymm0, %%ymm9\n"
          "vaddps %%ymm3, %%ymm2, %%ymm10\n"
          "vsubps %%ymm3, %%ymm2, %%ymm11\n"
          "vaddps %%ymm5, %%ymm4, %%ymm12\n"
          "vsubps %%ymm5, %%ymm4, %%ymm13\n"
          "vaddps %%ymm7, %%ymm6, %%ymm14\n"
          "vsubps %%ymm7, %%ymm6, %%ymm15\n"
          "vaddps %%ymm10, %%ymm8, %%ymm0\n"
          "vsubps %%ymm10, %%ymm8, %%ymm2\n"
          "vaddps %%ymm11, %%ymm9, %%ymm1\n"
          "vsubps %%ymm11, %%ymm9, %%ymm3\n"
          "vaddps %%ymm14, %%ymm12, %%ymm4\n"
          "vsubps %%ymm14, %%ymm12, %%ymm6\n"
          "vaddps %%ymm15, %%ymm13, %%ymm5\n"
          "vsubps %%ymm15, %%ymm13, %%ymm7\n"
          "vaddps %%ymm4, %%ymm0, %%ymm8\n"
          "vsubps %%ymm4, %%ymm0, %%ymm12\n"
          "vaddps %%ymm5, %%ymm1, %%ymm9\n"
          "vsubps %%ymm5, %%ymm1, %%ymm13\n"
          "vaddps %%ymm6, %%ymm2, %%ymm10\n"
          "vsubps %%ymm6, %%ymm2, %%ymm14\n"
          "vaddps %%ymm7, %%ymm3, %%ymm11\n"
          "vsubps %%ymm7, %%ymm3, %%ymm15\n"
          "vmovups %%ymm8, (%0)\n"
          "vmovups %%ymm9, (%1)\n"
          "vmovups %%ymm10, (%2)\n"
          "vmovups %%ymm11, (%3)\n"
          "vmovups %%ymm12, (%4)\n"
          "vmovups %%ymm13, (%5)\n"
          "vmovups %%ymm14, (%6)\n"
          "vmovups %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    /* Pass 3: radix-4 combine of four 512-float blocks (strides 512, 1024)
     * -- completes the size-2048 transform. */
    for (int j = 0; j < 2048; j += 2048) {
      for (int k = 0; k < 512; k += 8) {
        __asm__ volatile (
          "vmovups (%0), %%ymm0\n"
          "vmovups (%1), %%ymm1\n"
          "vmovups (%2), %%ymm2\n"
          "vmovups (%3), %%ymm3\n"
          "vaddps %%ymm1, %%ymm0, %%ymm8\n"
          "vsubps %%ymm1, %%ymm0, %%ymm9\n"
          "vaddps %%ymm3, %%ymm2, %%ymm10\n"
          "vsubps %%ymm3, %%ymm2, %%ymm11\n"
          "vaddps %%ymm10, %%ymm8, %%ymm0\n"
          "vsubps %%ymm10, %%ymm8, %%ymm2\n"
          "vaddps %%ymm11, %%ymm9, %%ymm1\n"
          "vsubps %%ymm11, %%ymm9, %%ymm3\n"
          "vmovups %%ymm0, (%0)\n"
          "vmovups %%ymm1, (%1)\n"
          "vmovups %%ymm2, (%2)\n"
          "vmovups %%ymm3, (%3)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 13) {
    /* Transform each 2048-float quarter, then radix-4 combine at stride
     * 2048 (strides 2048 and 4096) to finish the size-8192 transform. */
    helper_float_13_recursive(buf + 0, 11);
    helper_float_13_recursive(buf + 2048, 11);
    helper_float_13_recursive(buf + 4096, 11);
    helper_float_13_recursive(buf + 6144, 11);
    for (int j = 0; j < 8192; j += 8192) {
      for (int k = 0; k < 2048; k += 8) {
        __asm__ volatile (
          "vmovups (%0), %%ymm0\n"
          "vmovups (%1), %%ymm1\n"
          "vmovups (%2), %%ymm2\n"
          "vmovups (%3), %%ymm3\n"
          "vaddps %%ymm1, %%ymm0, %%ymm8\n"
          "vsubps %%ymm1, %%ymm0, %%ymm9\n"
          "vaddps %%ymm3, %%ymm2, %%ymm10\n"
          "vsubps %%ymm3, %%ymm2, %%ymm11\n"
          "vaddps %%ymm10, %%ymm8, %%ymm0\n"
          "vsubps %%ymm10, %%ymm8, %%ymm2\n"
          "vaddps %%ymm11, %%ymm9, %%ymm1\n"
          "vsubps %%ymm11, %%ymm9, %%ymm3\n"
          "vmovups %%ymm0, (%0)\n"
          "vmovups %%ymm1, (%1)\n"
          "vmovups %%ymm2, (%2)\n"
          "vmovups %%ymm3, (%3)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
}
2065 void helper_float_13(float *buf);
/* In-place, unnormalized Walsh-Hadamard transform of 8192 (= 2^13) floats.
 * Thin public entry point over the recursive AVX kernel. */
void helper_float_13(float *buf) {
  helper_float_13_recursive(buf, 13);
}
2069 void helper_float_14_recursive(float *buf, int depth);
/*
 * In-place, unnormalized Fast Walsh-Hadamard transform of 2^depth floats.
 *
 * Auto-generated AVX kernel. Only the depths reached from helper_float_14
 * are implemented (14, and the recursion leaf 12); any other depth falls
 * through and leaves buf untouched.
 *
 * Butterfly notation below: a "stride-s butterfly" replaces every pair
 * (buf[i], buf[i+s]) with (buf[i]+buf[i+s], buf[i]-buf[i+s]).
 *
 * depth == 12 transforms 4096 floats in three passes:
 *   1) a full size-64 transform inside each 64-float tile (strides 1..32),
 *      held in ymm0..ymm7;
 *   2) a radix-8 pass combining 8 blocks of 64 (strides 64/128/256);
 *   3) a radix-8 pass combining 8 blocks of 512 (strides 512/1024/2048).
 * depth == 14 runs four recursive size-4096 transforms on the quarters,
 * then one radix-4 combining pass at stride 4096.
 *
 * buf must hold 2^depth floats; vmovups imposes no alignment requirement.
 * NOTE(review): requires AVX at runtime -- no feature check is done here.
 */
void helper_float_14_recursive(float *buf, int depth) {
  if (depth == 12) {
    /* Pass 1: size-64 transform per 64-float tile. */
    for (int j = 0; j < 4096; j += 64) {
      for (int k = 0; k < 8; k += 8) {
        __asm__ volatile (
          /* Load one 64-float tile into ymm0..ymm7. */
          "vmovups (%0), %%ymm0\n"
          "vmovups (%1), %%ymm1\n"
          "vmovups (%2), %%ymm2\n"
          "vmovups (%3), %%ymm3\n"
          "vmovups (%4), %%ymm4\n"
          "vmovups (%5), %%ymm5\n"
          "vmovups (%6), %%ymm6\n"
          "vmovups (%7), %%ymm7\n"
          /* Stride-1 butterflies inside each register: duplicated even
           * lanes ($160), negated duplicated odd lanes ($245), then
           * vaddsubps leaves (a+b, a-b) in each lane pair. */
          "vpermilps $160, %%ymm0, %%ymm8\n"
          "vpermilps $245, %%ymm0, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm0\n"
          "vpermilps $160, %%ymm1, %%ymm8\n"
          "vpermilps $245, %%ymm1, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm1\n"
          "vpermilps $160, %%ymm2, %%ymm8\n"
          "vpermilps $245, %%ymm2, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm2\n"
          "vpermilps $160, %%ymm3, %%ymm8\n"
          "vpermilps $245, %%ymm3, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm3\n"
          "vpermilps $160, %%ymm4, %%ymm8\n"
          "vpermilps $245, %%ymm4, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm4\n"
          "vpermilps $160, %%ymm5, %%ymm8\n"
          "vpermilps $245, %%ymm5, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm5\n"
          "vpermilps $160, %%ymm6, %%ymm8\n"
          "vpermilps $245, %%ymm6, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm6\n"
          "vpermilps $160, %%ymm7, %%ymm8\n"
          "vpermilps $245, %%ymm7, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm7\n"
          /* Stride-2 butterflies inside each register: low/high pair
           * duplication ($68/$238) plus blend $204 of the negated high
           * pair, then one vaddps. */
          "vpermilps $68, %%ymm0, %%ymm8\n"
          "vpermilps $238, %%ymm0, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm0\n"
          "vpermilps $68, %%ymm1, %%ymm8\n"
          "vpermilps $238, %%ymm1, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm1\n"
          "vpermilps $68, %%ymm2, %%ymm8\n"
          "vpermilps $238, %%ymm2, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm2\n"
          "vpermilps $68, %%ymm3, %%ymm8\n"
          "vpermilps $238, %%ymm3, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm3\n"
          "vpermilps $68, %%ymm4, %%ymm8\n"
          "vpermilps $238, %%ymm4, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm4\n"
          "vpermilps $68, %%ymm5, %%ymm8\n"
          "vpermilps $238, %%ymm5, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm5\n"
          "vpermilps $68, %%ymm6, %%ymm8\n"
          "vpermilps $238, %%ymm6, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm6\n"
          "vpermilps $68, %%ymm7, %%ymm8\n"
          "vpermilps $238, %%ymm7, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm7\n"
          /* Stride-4 butterflies across the two 128-bit halves of each
           * register via vperm2f128 (low,low) + (high,-high). */
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm0, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm0\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm1, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm1\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm2, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm2\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm3, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm3\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm4, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm4\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm5, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm5\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm6, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm6\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm7, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm7\n"
          /* Strides 8/16/32: radix-8 butterfly network across the tile's
           * eight registers (three layers of add/sub). */
          "vaddps %%ymm1, %%ymm0, %%ymm8\n"
          "vsubps %%ymm1, %%ymm0, %%ymm9\n"
          "vaddps %%ymm3, %%ymm2, %%ymm10\n"
          "vsubps %%ymm3, %%ymm2, %%ymm11\n"
          "vaddps %%ymm5, %%ymm4, %%ymm12\n"
          "vsubps %%ymm5, %%ymm4, %%ymm13\n"
          "vaddps %%ymm7, %%ymm6, %%ymm14\n"
          "vsubps %%ymm7, %%ymm6, %%ymm15\n"
          "vaddps %%ymm10, %%ymm8, %%ymm0\n"
          "vsubps %%ymm10, %%ymm8, %%ymm2\n"
          "vaddps %%ymm11, %%ymm9, %%ymm1\n"
          "vsubps %%ymm11, %%ymm9, %%ymm3\n"
          "vaddps %%ymm14, %%ymm12, %%ymm4\n"
          "vsubps %%ymm14, %%ymm12, %%ymm6\n"
          "vaddps %%ymm15, %%ymm13, %%ymm5\n"
          "vsubps %%ymm15, %%ymm13, %%ymm7\n"
          "vaddps %%ymm4, %%ymm0, %%ymm8\n"
          "vsubps %%ymm4, %%ymm0, %%ymm12\n"
          "vaddps %%ymm5, %%ymm1, %%ymm9\n"
          "vsubps %%ymm5, %%ymm1, %%ymm13\n"
          "vaddps %%ymm6, %%ymm2, %%ymm10\n"
          "vsubps %%ymm6, %%ymm2, %%ymm14\n"
          "vaddps %%ymm7, %%ymm3, %%ymm11\n"
          "vsubps %%ymm7, %%ymm3, %%ymm15\n"
          /* Store the transformed tile back in place. */
          "vmovups %%ymm8, (%0)\n"
          "vmovups %%ymm9, (%1)\n"
          "vmovups %%ymm10, (%2)\n"
          "vmovups %%ymm11, (%3)\n"
          "vmovups %%ymm12, (%4)\n"
          "vmovups %%ymm13, (%5)\n"
          "vmovups %%ymm14, (%6)\n"
          "vmovups %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    /* Pass 2: radix-8 combine of eight 64-float blocks per 512-float
     * group (strides 64, 128, 256). */
    for (int j = 0; j < 4096; j += 512) {
      for (int k = 0; k < 64; k += 8) {
        __asm__ volatile (
          "vmovups (%0), %%ymm0\n"
          "vmovups (%1), %%ymm1\n"
          "vmovups (%2), %%ymm2\n"
          "vmovups (%3), %%ymm3\n"
          "vmovups (%4), %%ymm4\n"
          "vmovups (%5), %%ymm5\n"
          "vmovups (%6), %%ymm6\n"
          "vmovups (%7), %%ymm7\n"
          "vaddps %%ymm1, %%ymm0, %%ymm8\n"
          "vsubps %%ymm1, %%ymm0, %%ymm9\n"
          "vaddps %%ymm3, %%ymm2, %%ymm10\n"
          "vsubps %%ymm3, %%ymm2, %%ymm11\n"
          "vaddps %%ymm5, %%ymm4, %%ymm12\n"
          "vsubps %%ymm5, %%ymm4, %%ymm13\n"
          "vaddps %%ymm7, %%ymm6, %%ymm14\n"
          "vsubps %%ymm7, %%ymm6, %%ymm15\n"
          "vaddps %%ymm10, %%ymm8, %%ymm0\n"
          "vsubps %%ymm10, %%ymm8, %%ymm2\n"
          "vaddps %%ymm11, %%ymm9, %%ymm1\n"
          "vsubps %%ymm11, %%ymm9, %%ymm3\n"
          "vaddps %%ymm14, %%ymm12, %%ymm4\n"
          "vsubps %%ymm14, %%ymm12, %%ymm6\n"
          "vaddps %%ymm15, %%ymm13, %%ymm5\n"
          "vsubps %%ymm15, %%ymm13, %%ymm7\n"
          "vaddps %%ymm4, %%ymm0, %%ymm8\n"
          "vsubps %%ymm4, %%ymm0, %%ymm12\n"
          "vaddps %%ymm5, %%ymm1, %%ymm9\n"
          "vsubps %%ymm5, %%ymm1, %%ymm13\n"
          "vaddps %%ymm6, %%ymm2, %%ymm10\n"
          "vsubps %%ymm6, %%ymm2, %%ymm14\n"
          "vaddps %%ymm7, %%ymm3, %%ymm11\n"
          "vsubps %%ymm7, %%ymm3, %%ymm15\n"
          "vmovups %%ymm8, (%0)\n"
          "vmovups %%ymm9, (%1)\n"
          "vmovups %%ymm10, (%2)\n"
          "vmovups %%ymm11, (%3)\n"
          "vmovups %%ymm12, (%4)\n"
          "vmovups %%ymm13, (%5)\n"
          "vmovups %%ymm14, (%6)\n"
          "vmovups %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    /* Pass 3: radix-8 combine of eight 512-float blocks (strides 512,
     * 1024, 2048) -- completes the size-4096 transform. */
    for (int j = 0; j < 4096; j += 4096) {
      for (int k = 0; k < 512; k += 8) {
        __asm__ volatile (
          "vmovups (%0), %%ymm0\n"
          "vmovups (%1), %%ymm1\n"
          "vmovups (%2), %%ymm2\n"
          "vmovups (%3), %%ymm3\n"
          "vmovups (%4), %%ymm4\n"
          "vmovups (%5), %%ymm5\n"
          "vmovups (%6), %%ymm6\n"
          "vmovups (%7), %%ymm7\n"
          "vaddps %%ymm1, %%ymm0, %%ymm8\n"
          "vsubps %%ymm1, %%ymm0, %%ymm9\n"
          "vaddps %%ymm3, %%ymm2, %%ymm10\n"
          "vsubps %%ymm3, %%ymm2, %%ymm11\n"
          "vaddps %%ymm5, %%ymm4, %%ymm12\n"
          "vsubps %%ymm5, %%ymm4, %%ymm13\n"
          "vaddps %%ymm7, %%ymm6, %%ymm14\n"
          "vsubps %%ymm7, %%ymm6, %%ymm15\n"
          "vaddps %%ymm10, %%ymm8, %%ymm0\n"
          "vsubps %%ymm10, %%ymm8, %%ymm2\n"
          "vaddps %%ymm11, %%ymm9, %%ymm1\n"
          "vsubps %%ymm11, %%ymm9, %%ymm3\n"
          "vaddps %%ymm14, %%ymm12, %%ymm4\n"
          "vsubps %%ymm14, %%ymm12, %%ymm6\n"
          "vaddps %%ymm15, %%ymm13, %%ymm5\n"
          "vsubps %%ymm15, %%ymm13, %%ymm7\n"
          "vaddps %%ymm4, %%ymm0, %%ymm8\n"
          "vsubps %%ymm4, %%ymm0, %%ymm12\n"
          "vaddps %%ymm5, %%ymm1, %%ymm9\n"
          "vsubps %%ymm5, %%ymm1, %%ymm13\n"
          "vaddps %%ymm6, %%ymm2, %%ymm10\n"
          "vsubps %%ymm6, %%ymm2, %%ymm14\n"
          "vaddps %%ymm7, %%ymm3, %%ymm11\n"
          "vsubps %%ymm7, %%ymm3, %%ymm15\n"
          "vmovups %%ymm8, (%0)\n"
          "vmovups %%ymm9, (%1)\n"
          "vmovups %%ymm10, (%2)\n"
          "vmovups %%ymm11, (%3)\n"
          "vmovups %%ymm12, (%4)\n"
          "vmovups %%ymm13, (%5)\n"
          "vmovups %%ymm14, (%6)\n"
          "vmovups %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 14) {
    /* Transform each 4096-float quarter, then radix-4 combine at stride
     * 4096 (strides 4096 and 8192) to finish the size-16384 transform. */
    helper_float_14_recursive(buf + 0, 12);
    helper_float_14_recursive(buf + 4096, 12);
    helper_float_14_recursive(buf + 8192, 12);
    helper_float_14_recursive(buf + 12288, 12);
    for (int j = 0; j < 16384; j += 16384) {
      for (int k = 0; k < 4096; k += 8) {
        __asm__ volatile (
          "vmovups (%0), %%ymm0\n"
          "vmovups (%1), %%ymm1\n"
          "vmovups (%2), %%ymm2\n"
          "vmovups (%3), %%ymm3\n"
          "vaddps %%ymm1, %%ymm0, %%ymm8\n"
          "vsubps %%ymm1, %%ymm0, %%ymm9\n"
          "vaddps %%ymm3, %%ymm2, %%ymm10\n"
          "vsubps %%ymm3, %%ymm2, %%ymm11\n"
          "vaddps %%ymm10, %%ymm8, %%ymm0\n"
          "vsubps %%ymm10, %%ymm8, %%ymm2\n"
          "vaddps %%ymm11, %%ymm9, %%ymm1\n"
          "vsubps %%ymm11, %%ymm9, %%ymm3\n"
          "vmovups %%ymm0, (%0)\n"
          "vmovups %%ymm1, (%1)\n"
          "vmovups %%ymm2, (%2)\n"
          "vmovups %%ymm3, (%3)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
}
void helper_float_14(float *buf);
/*
 * Public entry point for the 2^14-point float transform.
 * buf must hold 16384 floats; it is transformed in place by delegating to
 * the recursive worker at full depth 14.
 * NOTE(review): presumably an unnormalized Fast Hadamard Transform, given
 * fht.h and the add/sub butterfly structure of the workers — confirm.
 */
void helper_float_14(float *buf) {
helper_float_14_recursive(buf, 14);
}
void helper_float_15_recursive(float *buf, int depth);
/*
 * Recursive worker for the 2^15-point in-place float transform.
 * Only two depth values are ever dispatched here:
 *   depth == 13: fully transforms a 8192-float sub-block with four fused
 *                butterfly passes (strides 1..8 inside registers, then
 *                strides 8..64, 64..512, 512..4096 across memory).
 *   depth == 15: transforms the four 8192-float quarters recursively, then
 *                combines them with a radix-4 butterfly pass at stride 8192.
 * All arithmetic is paired vaddps/vsubps (u+v, u-v) butterflies, i.e. an
 * unnormalized Hadamard-style transform. Generated code: the asm blocks are
 * kept byte-for-byte; only comments were added.
 */
void helper_float_15_recursive(float *buf, int depth) {
if (depth == 13) {
/* Pass 1: one asm block per 64-float chunk. Loads 8 ymm registers (64
 * floats), performs in-register butterflies at stride 1 (vpermilps
 * $160/$245 + vaddsubps), stride 2 (vpermilps $68/$238 + vblendps), and
 * stride 4 (vperm2f128 across 128-bit halves), then a radix-8 butterfly
 * network across the 8 registers (strides 8/16/32), and stores in place. */
for (int j = 0; j < 8192; j += 64) {
for (int k = 0; k < 8; k += 8) {
__asm__ volatile (
"vmovups (%0), %%ymm0\n"
"vmovups (%1), %%ymm1\n"
"vmovups (%2), %%ymm2\n"
"vmovups (%3), %%ymm3\n"
"vmovups (%4), %%ymm4\n"
"vmovups (%5), %%ymm5\n"
"vmovups (%6), %%ymm6\n"
"vmovups (%7), %%ymm7\n"
"vpermilps $160, %%ymm0, %%ymm8\n"
"vpermilps $245, %%ymm0, %%ymm9\n"
"vxorps %%ymm10, %%ymm10, %%ymm10\n"
"vsubps %%ymm9, %%ymm10, %%ymm11\n"
"vaddsubps %%ymm11, %%ymm8, %%ymm0\n"
"vpermilps $160, %%ymm1, %%ymm8\n"
"vpermilps $245, %%ymm1, %%ymm9\n"
"vxorps %%ymm10, %%ymm10, %%ymm10\n"
"vsubps %%ymm9, %%ymm10, %%ymm11\n"
"vaddsubps %%ymm11, %%ymm8, %%ymm1\n"
"vpermilps $160, %%ymm2, %%ymm8\n"
"vpermilps $245, %%ymm2, %%ymm9\n"
"vxorps %%ymm10, %%ymm10, %%ymm10\n"
"vsubps %%ymm9, %%ymm10, %%ymm11\n"
"vaddsubps %%ymm11, %%ymm8, %%ymm2\n"
"vpermilps $160, %%ymm3, %%ymm8\n"
"vpermilps $245, %%ymm3, %%ymm9\n"
"vxorps %%ymm10, %%ymm10, %%ymm10\n"
"vsubps %%ymm9, %%ymm10, %%ymm11\n"
"vaddsubps %%ymm11, %%ymm8, %%ymm3\n"
"vpermilps $160, %%ymm4, %%ymm8\n"
"vpermilps $245, %%ymm4, %%ymm9\n"
"vxorps %%ymm10, %%ymm10, %%ymm10\n"
"vsubps %%ymm9, %%ymm10, %%ymm11\n"
"vaddsubps %%ymm11, %%ymm8, %%ymm4\n"
"vpermilps $160, %%ymm5, %%ymm8\n"
"vpermilps $245, %%ymm5, %%ymm9\n"
"vxorps %%ymm10, %%ymm10, %%ymm10\n"
"vsubps %%ymm9, %%ymm10, %%ymm11\n"
"vaddsubps %%ymm11, %%ymm8, %%ymm5\n"
"vpermilps $160, %%ymm6, %%ymm8\n"
"vpermilps $245, %%ymm6, %%ymm9\n"
"vxorps %%ymm10, %%ymm10, %%ymm10\n"
"vsubps %%ymm9, %%ymm10, %%ymm11\n"
"vaddsubps %%ymm11, %%ymm8, %%ymm6\n"
"vpermilps $160, %%ymm7, %%ymm8\n"
"vpermilps $245, %%ymm7, %%ymm9\n"
"vxorps %%ymm10, %%ymm10, %%ymm10\n"
"vsubps %%ymm9, %%ymm10, %%ymm11\n"
"vaddsubps %%ymm11, %%ymm8, %%ymm7\n"
"vpermilps $68, %%ymm0, %%ymm8\n"
"vpermilps $238, %%ymm0, %%ymm9\n"
"vxorps %%ymm10, %%ymm10, %%ymm10\n"
"vsubps %%ymm9, %%ymm10, %%ymm11\n"
"vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
"vaddps %%ymm8, %%ymm12, %%ymm0\n"
"vpermilps $68, %%ymm1, %%ymm8\n"
"vpermilps $238, %%ymm1, %%ymm9\n"
"vxorps %%ymm10, %%ymm10, %%ymm10\n"
"vsubps %%ymm9, %%ymm10, %%ymm11\n"
"vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
"vaddps %%ymm8, %%ymm12, %%ymm1\n"
"vpermilps $68, %%ymm2, %%ymm8\n"
"vpermilps $238, %%ymm2, %%ymm9\n"
"vxorps %%ymm10, %%ymm10, %%ymm10\n"
"vsubps %%ymm9, %%ymm10, %%ymm11\n"
"vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
"vaddps %%ymm8, %%ymm12, %%ymm2\n"
"vpermilps $68, %%ymm3, %%ymm8\n"
"vpermilps $238, %%ymm3, %%ymm9\n"
"vxorps %%ymm10, %%ymm10, %%ymm10\n"
"vsubps %%ymm9, %%ymm10, %%ymm11\n"
"vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
"vaddps %%ymm8, %%ymm12, %%ymm3\n"
"vpermilps $68, %%ymm4, %%ymm8\n"
"vpermilps $238, %%ymm4, %%ymm9\n"
"vxorps %%ymm10, %%ymm10, %%ymm10\n"
"vsubps %%ymm9, %%ymm10, %%ymm11\n"
"vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
"vaddps %%ymm8, %%ymm12, %%ymm4\n"
"vpermilps $68, %%ymm5, %%ymm8\n"
"vpermilps $238, %%ymm5, %%ymm9\n"
"vxorps %%ymm10, %%ymm10, %%ymm10\n"
"vsubps %%ymm9, %%ymm10, %%ymm11\n"
"vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
"vaddps %%ymm8, %%ymm12, %%ymm5\n"
"vpermilps $68, %%ymm6, %%ymm8\n"
"vpermilps $238, %%ymm6, %%ymm9\n"
"vxorps %%ymm10, %%ymm10, %%ymm10\n"
"vsubps %%ymm9, %%ymm10, %%ymm11\n"
"vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
"vaddps %%ymm8, %%ymm12, %%ymm6\n"
"vpermilps $68, %%ymm7, %%ymm8\n"
"vpermilps $238, %%ymm7, %%ymm9\n"
"vxorps %%ymm10, %%ymm10, %%ymm10\n"
"vsubps %%ymm9, %%ymm10, %%ymm11\n"
"vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
"vaddps %%ymm8, %%ymm12, %%ymm7\n"
"vxorps %%ymm8, %%ymm8, %%ymm8\n"
"vsubps %%ymm0, %%ymm8, %%ymm9\n"
"vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n"
"vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n"
"vaddps %%ymm10, %%ymm11, %%ymm0\n"
"vxorps %%ymm8, %%ymm8, %%ymm8\n"
"vsubps %%ymm1, %%ymm8, %%ymm9\n"
"vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n"
"vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n"
"vaddps %%ymm10, %%ymm11, %%ymm1\n"
"vxorps %%ymm8, %%ymm8, %%ymm8\n"
"vsubps %%ymm2, %%ymm8, %%ymm9\n"
"vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n"
"vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n"
"vaddps %%ymm10, %%ymm11, %%ymm2\n"
"vxorps %%ymm8, %%ymm8, %%ymm8\n"
"vsubps %%ymm3, %%ymm8, %%ymm9\n"
"vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n"
"vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n"
"vaddps %%ymm10, %%ymm11, %%ymm3\n"
"vxorps %%ymm8, %%ymm8, %%ymm8\n"
"vsubps %%ymm4, %%ymm8, %%ymm9\n"
"vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n"
"vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n"
"vaddps %%ymm10, %%ymm11, %%ymm4\n"
"vxorps %%ymm8, %%ymm8, %%ymm8\n"
"vsubps %%ymm5, %%ymm8, %%ymm9\n"
"vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n"
"vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n"
"vaddps %%ymm10, %%ymm11, %%ymm5\n"
"vxorps %%ymm8, %%ymm8, %%ymm8\n"
"vsubps %%ymm6, %%ymm8, %%ymm9\n"
"vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n"
"vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n"
"vaddps %%ymm10, %%ymm11, %%ymm6\n"
"vxorps %%ymm8, %%ymm8, %%ymm8\n"
"vsubps %%ymm7, %%ymm8, %%ymm9\n"
"vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n"
"vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n"
"vaddps %%ymm10, %%ymm11, %%ymm7\n"
"vaddps %%ymm1, %%ymm0, %%ymm8\n"
"vsubps %%ymm1, %%ymm0, %%ymm9\n"
"vaddps %%ymm3, %%ymm2, %%ymm10\n"
"vsubps %%ymm3, %%ymm2, %%ymm11\n"
"vaddps %%ymm5, %%ymm4, %%ymm12\n"
"vsubps %%ymm5, %%ymm4, %%ymm13\n"
"vaddps %%ymm7, %%ymm6, %%ymm14\n"
"vsubps %%ymm7, %%ymm6, %%ymm15\n"
"vaddps %%ymm10, %%ymm8, %%ymm0\n"
"vsubps %%ymm10, %%ymm8, %%ymm2\n"
"vaddps %%ymm11, %%ymm9, %%ymm1\n"
"vsubps %%ymm11, %%ymm9, %%ymm3\n"
"vaddps %%ymm14, %%ymm12, %%ymm4\n"
"vsubps %%ymm14, %%ymm12, %%ymm6\n"
"vaddps %%ymm15, %%ymm13, %%ymm5\n"
"vsubps %%ymm15, %%ymm13, %%ymm7\n"
"vaddps %%ymm4, %%ymm0, %%ymm8\n"
"vsubps %%ymm4, %%ymm0, %%ymm12\n"
"vaddps %%ymm5, %%ymm1, %%ymm9\n"
"vsubps %%ymm5, %%ymm1, %%ymm13\n"
"vaddps %%ymm6, %%ymm2, %%ymm10\n"
"vsubps %%ymm6, %%ymm2, %%ymm14\n"
"vaddps %%ymm7, %%ymm3, %%ymm11\n"
"vsubps %%ymm7, %%ymm3, %%ymm15\n"
"vmovups %%ymm8, (%0)\n"
"vmovups %%ymm9, (%1)\n"
"vmovups %%ymm10, (%2)\n"
"vmovups %%ymm11, (%3)\n"
"vmovups %%ymm12, (%4)\n"
"vmovups %%ymm13, (%5)\n"
"vmovups %%ymm14, (%6)\n"
"vmovups %%ymm15, (%7)\n"
:: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
);
}
}
/* Pass 2: radix-8 butterfly across memory at stride 64 — combines eight
 * 64-float groups within each 512-float span (butterfly strides 64/128/256). */
for (int j = 0; j < 8192; j += 512) {
for (int k = 0; k < 64; k += 8) {
__asm__ volatile (
"vmovups (%0), %%ymm0\n"
"vmovups (%1), %%ymm1\n"
"vmovups (%2), %%ymm2\n"
"vmovups (%3), %%ymm3\n"
"vmovups (%4), %%ymm4\n"
"vmovups (%5), %%ymm5\n"
"vmovups (%6), %%ymm6\n"
"vmovups (%7), %%ymm7\n"
"vaddps %%ymm1, %%ymm0, %%ymm8\n"
"vsubps %%ymm1, %%ymm0, %%ymm9\n"
"vaddps %%ymm3, %%ymm2, %%ymm10\n"
"vsubps %%ymm3, %%ymm2, %%ymm11\n"
"vaddps %%ymm5, %%ymm4, %%ymm12\n"
"vsubps %%ymm5, %%ymm4, %%ymm13\n"
"vaddps %%ymm7, %%ymm6, %%ymm14\n"
"vsubps %%ymm7, %%ymm6, %%ymm15\n"
"vaddps %%ymm10, %%ymm8, %%ymm0\n"
"vsubps %%ymm10, %%ymm8, %%ymm2\n"
"vaddps %%ymm11, %%ymm9, %%ymm1\n"
"vsubps %%ymm11, %%ymm9, %%ymm3\n"
"vaddps %%ymm14, %%ymm12, %%ymm4\n"
"vsubps %%ymm14, %%ymm12, %%ymm6\n"
"vaddps %%ymm15, %%ymm13, %%ymm5\n"
"vsubps %%ymm15, %%ymm13, %%ymm7\n"
"vaddps %%ymm4, %%ymm0, %%ymm8\n"
"vsubps %%ymm4, %%ymm0, %%ymm12\n"
"vaddps %%ymm5, %%ymm1, %%ymm9\n"
"vsubps %%ymm5, %%ymm1, %%ymm13\n"
"vaddps %%ymm6, %%ymm2, %%ymm10\n"
"vsubps %%ymm6, %%ymm2, %%ymm14\n"
"vaddps %%ymm7, %%ymm3, %%ymm11\n"
"vsubps %%ymm7, %%ymm3, %%ymm15\n"
"vmovups %%ymm8, (%0)\n"
"vmovups %%ymm9, (%1)\n"
"vmovups %%ymm10, (%2)\n"
"vmovups %%ymm11, (%3)\n"
"vmovups %%ymm12, (%4)\n"
"vmovups %%ymm13, (%5)\n"
"vmovups %%ymm14, (%6)\n"
"vmovups %%ymm15, (%7)\n"
:: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
);
}
}
/* Pass 3: same radix-8 network at stride 512 within each 4096-float span. */
for (int j = 0; j < 8192; j += 4096) {
for (int k = 0; k < 512; k += 8) {
__asm__ volatile (
"vmovups (%0), %%ymm0\n"
"vmovups (%1), %%ymm1\n"
"vmovups (%2), %%ymm2\n"
"vmovups (%3), %%ymm3\n"
"vmovups (%4), %%ymm4\n"
"vmovups (%5), %%ymm5\n"
"vmovups (%6), %%ymm6\n"
"vmovups (%7), %%ymm7\n"
"vaddps %%ymm1, %%ymm0, %%ymm8\n"
"vsubps %%ymm1, %%ymm0, %%ymm9\n"
"vaddps %%ymm3, %%ymm2, %%ymm10\n"
"vsubps %%ymm3, %%ymm2, %%ymm11\n"
"vaddps %%ymm5, %%ymm4, %%ymm12\n"
"vsubps %%ymm5, %%ymm4, %%ymm13\n"
"vaddps %%ymm7, %%ymm6, %%ymm14\n"
"vsubps %%ymm7, %%ymm6, %%ymm15\n"
"vaddps %%ymm10, %%ymm8, %%ymm0\n"
"vsubps %%ymm10, %%ymm8, %%ymm2\n"
"vaddps %%ymm11, %%ymm9, %%ymm1\n"
"vsubps %%ymm11, %%ymm9, %%ymm3\n"
"vaddps %%ymm14, %%ymm12, %%ymm4\n"
"vsubps %%ymm14, %%ymm12, %%ymm6\n"
"vaddps %%ymm15, %%ymm13, %%ymm5\n"
"vsubps %%ymm15, %%ymm13, %%ymm7\n"
"vaddps %%ymm4, %%ymm0, %%ymm8\n"
"vsubps %%ymm4, %%ymm0, %%ymm12\n"
"vaddps %%ymm5, %%ymm1, %%ymm9\n"
"vsubps %%ymm5, %%ymm1, %%ymm13\n"
"vaddps %%ymm6, %%ymm2, %%ymm10\n"
"vsubps %%ymm6, %%ymm2, %%ymm14\n"
"vaddps %%ymm7, %%ymm3, %%ymm11\n"
"vsubps %%ymm7, %%ymm3, %%ymm15\n"
"vmovups %%ymm8, (%0)\n"
"vmovups %%ymm9, (%1)\n"
"vmovups %%ymm10, (%2)\n"
"vmovups %%ymm11, (%3)\n"
"vmovups %%ymm12, (%4)\n"
"vmovups %%ymm13, (%5)\n"
"vmovups %%ymm14, (%6)\n"
"vmovups %%ymm15, (%7)\n"
:: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
);
}
}
/* Pass 4: final radix-2 butterfly at stride 4096 — completes the depth-13
 * (8192-point) transform. 8 floats (one ymm) per iteration. */
for (int j = 0; j < 8192; j += 8192) {
for (int k = 0; k < 4096; k += 8) {
__asm__ volatile (
"vmovups (%0), %%ymm0\n"
"vmovups (%1), %%ymm1\n"
"vaddps %%ymm1, %%ymm0, %%ymm8\n"
"vsubps %%ymm1, %%ymm0, %%ymm9\n"
"vmovups %%ymm8, (%0)\n"
"vmovups %%ymm9, (%1)\n"
:: "r"(buf + j + k + 0), "r"(buf + j + k + 4096) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
);
}
}
return;
}
if (depth == 15) {
/* Transform the four 8192-float quarters, then merge with a radix-4
 * butterfly across strides 8192/16384 to finish the 32768-point result. */
helper_float_15_recursive(buf + 0, 13);
helper_float_15_recursive(buf + 8192, 13);
helper_float_15_recursive(buf + 16384, 13);
helper_float_15_recursive(buf + 24576, 13);
for (int j = 0; j < 32768; j += 32768) {
for (int k = 0; k < 8192; k += 8) {
__asm__ volatile (
"vmovups (%0), %%ymm0\n"
"vmovups (%1), %%ymm1\n"
"vmovups (%2), %%ymm2\n"
"vmovups (%3), %%ymm3\n"
"vaddps %%ymm1, %%ymm0, %%ymm8\n"
"vsubps %%ymm1, %%ymm0, %%ymm9\n"
"vaddps %%ymm3, %%ymm2, %%ymm10\n"
"vsubps %%ymm3, %%ymm2, %%ymm11\n"
"vaddps %%ymm10, %%ymm8, %%ymm0\n"
"vsubps %%ymm10, %%ymm8, %%ymm2\n"
"vaddps %%ymm11, %%ymm9, %%ymm1\n"
"vsubps %%ymm11, %%ymm9, %%ymm3\n"
"vmovups %%ymm0, (%0)\n"
"vmovups %%ymm1, (%1)\n"
"vmovups %%ymm2, (%2)\n"
"vmovups %%ymm3, (%3)\n"
:: "r"(buf + j + k + 0), "r"(buf + j + k + 8192), "r"(buf + j + k + 16384), "r"(buf + j + k + 24576) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
);
}
}
return;
}
}
void helper_float_15(float *buf);
/*
 * Public entry point for the 2^15-point float transform.
 * buf must hold 32768 floats; transformed in place via the depth-15 worker.
 */
void helper_float_15(float *buf) {
helper_float_15_recursive(buf, 15);
}
void helper_float_16_recursive(float *buf, int depth);
/*
 * Recursive worker for the 2^16-point in-place float transform.
 * Dispatched depth values:
 *   depth == 13: identical 8192-float base-case kernel as in
 *                helper_float_15_recursive — four fused butterfly passes
 *                (in-register strides 1..4, then memory strides 8..64,
 *                64..512, 512..4096).
 *   depth == 16: transforms the eight 8192-float eighths recursively, then
 *                merges them with one radix-8 butterfly pass at stride 8192.
 * All arithmetic is paired vaddps/vsubps (u+v, u-v) butterflies. Generated
 * code: asm blocks are byte-for-byte original; only comments were added.
 */
void helper_float_16_recursive(float *buf, int depth) {
if (depth == 13) {
/* Pass 1 (per 64-float chunk): in-register butterflies at strides 1, 2,
 * and 4, then a radix-8 network across the 8 loaded registers. */
for (int j = 0; j < 8192; j += 64) {
for (int k = 0; k < 8; k += 8) {
__asm__ volatile (
"vmovups (%0), %%ymm0\n"
"vmovups (%1), %%ymm1\n"
"vmovups (%2), %%ymm2\n"
"vmovups (%3), %%ymm3\n"
"vmovups (%4), %%ymm4\n"
"vmovups (%5), %%ymm5\n"
"vmovups (%6), %%ymm6\n"
"vmovups (%7), %%ymm7\n"
"vpermilps $160, %%ymm0, %%ymm8\n"
"vpermilps $245, %%ymm0, %%ymm9\n"
"vxorps %%ymm10, %%ymm10, %%ymm10\n"
"vsubps %%ymm9, %%ymm10, %%ymm11\n"
"vaddsubps %%ymm11, %%ymm8, %%ymm0\n"
"vpermilps $160, %%ymm1, %%ymm8\n"
"vpermilps $245, %%ymm1, %%ymm9\n"
"vxorps %%ymm10, %%ymm10, %%ymm10\n"
"vsubps %%ymm9, %%ymm10, %%ymm11\n"
"vaddsubps %%ymm11, %%ymm8, %%ymm1\n"
"vpermilps $160, %%ymm2, %%ymm8\n"
"vpermilps $245, %%ymm2, %%ymm9\n"
"vxorps %%ymm10, %%ymm10, %%ymm10\n"
"vsubps %%ymm9, %%ymm10, %%ymm11\n"
"vaddsubps %%ymm11, %%ymm8, %%ymm2\n"
"vpermilps $160, %%ymm3, %%ymm8\n"
"vpermilps $245, %%ymm3, %%ymm9\n"
"vxorps %%ymm10, %%ymm10, %%ymm10\n"
"vsubps %%ymm9, %%ymm10, %%ymm11\n"
"vaddsubps %%ymm11, %%ymm8, %%ymm3\n"
"vpermilps $160, %%ymm4, %%ymm8\n"
"vpermilps $245, %%ymm4, %%ymm9\n"
"vxorps %%ymm10, %%ymm10, %%ymm10\n"
"vsubps %%ymm9, %%ymm10, %%ymm11\n"
"vaddsubps %%ymm11, %%ymm8, %%ymm4\n"
"vpermilps $160, %%ymm5, %%ymm8\n"
"vpermilps $245, %%ymm5, %%ymm9\n"
"vxorps %%ymm10, %%ymm10, %%ymm10\n"
"vsubps %%ymm9, %%ymm10, %%ymm11\n"
"vaddsubps %%ymm11, %%ymm8, %%ymm5\n"
"vpermilps $160, %%ymm6, %%ymm8\n"
"vpermilps $245, %%ymm6, %%ymm9\n"
"vxorps %%ymm10, %%ymm10, %%ymm10\n"
"vsubps %%ymm9, %%ymm10, %%ymm11\n"
"vaddsubps %%ymm11, %%ymm8, %%ymm6\n"
"vpermilps $160, %%ymm7, %%ymm8\n"
"vpermilps $245, %%ymm7, %%ymm9\n"
"vxorps %%ymm10, %%ymm10, %%ymm10\n"
"vsubps %%ymm9, %%ymm10, %%ymm11\n"
"vaddsubps %%ymm11, %%ymm8, %%ymm7\n"
"vpermilps $68, %%ymm0, %%ymm8\n"
"vpermilps $238, %%ymm0, %%ymm9\n"
"vxorps %%ymm10, %%ymm10, %%ymm10\n"
"vsubps %%ymm9, %%ymm10, %%ymm11\n"
"vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
"vaddps %%ymm8, %%ymm12, %%ymm0\n"
"vpermilps $68, %%ymm1, %%ymm8\n"
"vpermilps $238, %%ymm1, %%ymm9\n"
"vxorps %%ymm10, %%ymm10, %%ymm10\n"
"vsubps %%ymm9, %%ymm10, %%ymm11\n"
"vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
"vaddps %%ymm8, %%ymm12, %%ymm1\n"
"vpermilps $68, %%ymm2, %%ymm8\n"
"vpermilps $238, %%ymm2, %%ymm9\n"
"vxorps %%ymm10, %%ymm10, %%ymm10\n"
"vsubps %%ymm9, %%ymm10, %%ymm11\n"
"vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
"vaddps %%ymm8, %%ymm12, %%ymm2\n"
"vpermilps $68, %%ymm3, %%ymm8\n"
"vpermilps $238, %%ymm3, %%ymm9\n"
"vxorps %%ymm10, %%ymm10, %%ymm10\n"
"vsubps %%ymm9, %%ymm10, %%ymm11\n"
"vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
"vaddps %%ymm8, %%ymm12, %%ymm3\n"
"vpermilps $68, %%ymm4, %%ymm8\n"
"vpermilps $238, %%ymm4, %%ymm9\n"
"vxorps %%ymm10, %%ymm10, %%ymm10\n"
"vsubps %%ymm9, %%ymm10, %%ymm11\n"
"vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
"vaddps %%ymm8, %%ymm12, %%ymm4\n"
"vpermilps $68, %%ymm5, %%ymm8\n"
"vpermilps $238, %%ymm5, %%ymm9\n"
"vxorps %%ymm10, %%ymm10, %%ymm10\n"
"vsubps %%ymm9, %%ymm10, %%ymm11\n"
"vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
"vaddps %%ymm8, %%ymm12, %%ymm5\n"
"vpermilps $68, %%ymm6, %%ymm8\n"
"vpermilps $238, %%ymm6, %%ymm9\n"
"vxorps %%ymm10, %%ymm10, %%ymm10\n"
"vsubps %%ymm9, %%ymm10, %%ymm11\n"
"vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
"vaddps %%ymm8, %%ymm12, %%ymm6\n"
"vpermilps $68, %%ymm7, %%ymm8\n"
"vpermilps $238, %%ymm7, %%ymm9\n"
"vxorps %%ymm10, %%ymm10, %%ymm10\n"
"vsubps %%ymm9, %%ymm10, %%ymm11\n"
"vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
"vaddps %%ymm8, %%ymm12, %%ymm7\n"
"vxorps %%ymm8, %%ymm8, %%ymm8\n"
"vsubps %%ymm0, %%ymm8, %%ymm9\n"
"vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n"
"vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n"
"vaddps %%ymm10, %%ymm11, %%ymm0\n"
"vxorps %%ymm8, %%ymm8, %%ymm8\n"
"vsubps %%ymm1, %%ymm8, %%ymm9\n"
"vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n"
"vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n"
"vaddps %%ymm10, %%ymm11, %%ymm1\n"
"vxorps %%ymm8, %%ymm8, %%ymm8\n"
"vsubps %%ymm2, %%ymm8, %%ymm9\n"
"vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n"
"vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n"
"vaddps %%ymm10, %%ymm11, %%ymm2\n"
"vxorps %%ymm8, %%ymm8, %%ymm8\n"
"vsubps %%ymm3, %%ymm8, %%ymm9\n"
"vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n"
"vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n"
"vaddps %%ymm10, %%ymm11, %%ymm3\n"
"vxorps %%ymm8, %%ymm8, %%ymm8\n"
"vsubps %%ymm4, %%ymm8, %%ymm9\n"
"vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n"
"vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n"
"vaddps %%ymm10, %%ymm11, %%ymm4\n"
"vxorps %%ymm8, %%ymm8, %%ymm8\n"
"vsubps %%ymm5, %%ymm8, %%ymm9\n"
"vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n"
"vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n"
"vaddps %%ymm10, %%ymm11, %%ymm5\n"
"vxorps %%ymm8, %%ymm8, %%ymm8\n"
"vsubps %%ymm6, %%ymm8, %%ymm9\n"
"vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n"
"vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n"
"vaddps %%ymm10, %%ymm11, %%ymm6\n"
"vxorps %%ymm8, %%ymm8, %%ymm8\n"
"vsubps %%ymm7, %%ymm8, %%ymm9\n"
"vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n"
"vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n"
"vaddps %%ymm10, %%ymm11, %%ymm7\n"
"vaddps %%ymm1, %%ymm0, %%ymm8\n"
"vsubps %%ymm1, %%ymm0, %%ymm9\n"
"vaddps %%ymm3, %%ymm2, %%ymm10\n"
"vsubps %%ymm3, %%ymm2, %%ymm11\n"
"vaddps %%ymm5, %%ymm4, %%ymm12\n"
"vsubps %%ymm5, %%ymm4, %%ymm13\n"
"vaddps %%ymm7, %%ymm6, %%ymm14\n"
"vsubps %%ymm7, %%ymm6, %%ymm15\n"
"vaddps %%ymm10, %%ymm8, %%ymm0\n"
"vsubps %%ymm10, %%ymm8, %%ymm2\n"
"vaddps %%ymm11, %%ymm9, %%ymm1\n"
"vsubps %%ymm11, %%ymm9, %%ymm3\n"
"vaddps %%ymm14, %%ymm12, %%ymm4\n"
"vsubps %%ymm14, %%ymm12, %%ymm6\n"
"vaddps %%ymm15, %%ymm13, %%ymm5\n"
"vsubps %%ymm15, %%ymm13, %%ymm7\n"
"vaddps %%ymm4, %%ymm0, %%ymm8\n"
"vsubps %%ymm4, %%ymm0, %%ymm12\n"
"vaddps %%ymm5, %%ymm1, %%ymm9\n"
"vsubps %%ymm5, %%ymm1, %%ymm13\n"
"vaddps %%ymm6, %%ymm2, %%ymm10\n"
"vsubps %%ymm6, %%ymm2, %%ymm14\n"
"vaddps %%ymm7, %%ymm3, %%ymm11\n"
"vsubps %%ymm7, %%ymm3, %%ymm15\n"
"vmovups %%ymm8, (%0)\n"
"vmovups %%ymm9, (%1)\n"
"vmovups %%ymm10, (%2)\n"
"vmovups %%ymm11, (%3)\n"
"vmovups %%ymm12, (%4)\n"
"vmovups %%ymm13, (%5)\n"
"vmovups %%ymm14, (%6)\n"
"vmovups %%ymm15, (%7)\n"
:: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
);
}
}
/* Pass 2: radix-8 butterfly across memory at stride 64. */
for (int j = 0; j < 8192; j += 512) {
for (int k = 0; k < 64; k += 8) {
__asm__ volatile (
"vmovups (%0), %%ymm0\n"
"vmovups (%1), %%ymm1\n"
"vmovups (%2), %%ymm2\n"
"vmovups (%3), %%ymm3\n"
"vmovups (%4), %%ymm4\n"
"vmovups (%5), %%ymm5\n"
"vmovups (%6), %%ymm6\n"
"vmovups (%7), %%ymm7\n"
"vaddps %%ymm1, %%ymm0, %%ymm8\n"
"vsubps %%ymm1, %%ymm0, %%ymm9\n"
"vaddps %%ymm3, %%ymm2, %%ymm10\n"
"vsubps %%ymm3, %%ymm2, %%ymm11\n"
"vaddps %%ymm5, %%ymm4, %%ymm12\n"
"vsubps %%ymm5, %%ymm4, %%ymm13\n"
"vaddps %%ymm7, %%ymm6, %%ymm14\n"
"vsubps %%ymm7, %%ymm6, %%ymm15\n"
"vaddps %%ymm10, %%ymm8, %%ymm0\n"
"vsubps %%ymm10, %%ymm8, %%ymm2\n"
"vaddps %%ymm11, %%ymm9, %%ymm1\n"
"vsubps %%ymm11, %%ymm9, %%ymm3\n"
"vaddps %%ymm14, %%ymm12, %%ymm4\n"
"vsubps %%ymm14, %%ymm12, %%ymm6\n"
"vaddps %%ymm15, %%ymm13, %%ymm5\n"
"vsubps %%ymm15, %%ymm13, %%ymm7\n"
"vaddps %%ymm4, %%ymm0, %%ymm8\n"
"vsubps %%ymm4, %%ymm0, %%ymm12\n"
"vaddps %%ymm5, %%ymm1, %%ymm9\n"
"vsubps %%ymm5, %%ymm1, %%ymm13\n"
"vaddps %%ymm6, %%ymm2, %%ymm10\n"
"vsubps %%ymm6, %%ymm2, %%ymm14\n"
"vaddps %%ymm7, %%ymm3, %%ymm11\n"
"vsubps %%ymm7, %%ymm3, %%ymm15\n"
"vmovups %%ymm8, (%0)\n"
"vmovups %%ymm9, (%1)\n"
"vmovups %%ymm10, (%2)\n"
"vmovups %%ymm11, (%3)\n"
"vmovups %%ymm12, (%4)\n"
"vmovups %%ymm13, (%5)\n"
"vmovups %%ymm14, (%6)\n"
"vmovups %%ymm15, (%7)\n"
:: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
);
}
}
/* Pass 3: radix-8 butterfly at stride 512 within each 4096-float span. */
for (int j = 0; j < 8192; j += 4096) {
for (int k = 0; k < 512; k += 8) {
__asm__ volatile (
"vmovups (%0), %%ymm0\n"
"vmovups (%1), %%ymm1\n"
"vmovups (%2), %%ymm2\n"
"vmovups (%3), %%ymm3\n"
"vmovups (%4), %%ymm4\n"
"vmovups (%5), %%ymm5\n"
"vmovups (%6), %%ymm6\n"
"vmovups (%7), %%ymm7\n"
"vaddps %%ymm1, %%ymm0, %%ymm8\n"
"vsubps %%ymm1, %%ymm0, %%ymm9\n"
"vaddps %%ymm3, %%ymm2, %%ymm10\n"
"vsubps %%ymm3, %%ymm2, %%ymm11\n"
"vaddps %%ymm5, %%ymm4, %%ymm12\n"
"vsubps %%ymm5, %%ymm4, %%ymm13\n"
"vaddps %%ymm7, %%ymm6, %%ymm14\n"
"vsubps %%ymm7, %%ymm6, %%ymm15\n"
"vaddps %%ymm10, %%ymm8, %%ymm0\n"
"vsubps %%ymm10, %%ymm8, %%ymm2\n"
"vaddps %%ymm11, %%ymm9, %%ymm1\n"
"vsubps %%ymm11, %%ymm9, %%ymm3\n"
"vaddps %%ymm14, %%ymm12, %%ymm4\n"
"vsubps %%ymm14, %%ymm12, %%ymm6\n"
"vaddps %%ymm15, %%ymm13, %%ymm5\n"
"vsubps %%ymm15, %%ymm13, %%ymm7\n"
"vaddps %%ymm4, %%ymm0, %%ymm8\n"
"vsubps %%ymm4, %%ymm0, %%ymm12\n"
"vaddps %%ymm5, %%ymm1, %%ymm9\n"
"vsubps %%ymm5, %%ymm1, %%ymm13\n"
"vaddps %%ymm6, %%ymm2, %%ymm10\n"
"vsubps %%ymm6, %%ymm2, %%ymm14\n"
"vaddps %%ymm7, %%ymm3, %%ymm11\n"
"vsubps %%ymm7, %%ymm3, %%ymm15\n"
"vmovups %%ymm8, (%0)\n"
"vmovups %%ymm9, (%1)\n"
"vmovups %%ymm10, (%2)\n"
"vmovups %%ymm11, (%3)\n"
"vmovups %%ymm12, (%4)\n"
"vmovups %%ymm13, (%5)\n"
"vmovups %%ymm14, (%6)\n"
"vmovups %%ymm15, (%7)\n"
:: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
);
}
}
/* Pass 4: final radix-2 butterfly at stride 4096 — completes the
 * depth-13 (8192-point) base case. */
for (int j = 0; j < 8192; j += 8192) {
for (int k = 0; k < 4096; k += 8) {
__asm__ volatile (
"vmovups (%0), %%ymm0\n"
"vmovups (%1), %%ymm1\n"
"vaddps %%ymm1, %%ymm0, %%ymm8\n"
"vsubps %%ymm1, %%ymm0, %%ymm9\n"
"vmovups %%ymm8, (%0)\n"
"vmovups %%ymm9, (%1)\n"
:: "r"(buf + j + k + 0), "r"(buf + j + k + 4096) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
);
}
}
return;
}
if (depth == 16) {
/* Transform the eight 8192-float eighths, then merge them with one
 * radix-8 butterfly pass at strides 8192/16384/32768 to finish the
 * 65536-point result. */
helper_float_16_recursive(buf + 0, 13);
helper_float_16_recursive(buf + 8192, 13);
helper_float_16_recursive(buf + 16384, 13);
helper_float_16_recursive(buf + 24576, 13);
helper_float_16_recursive(buf + 32768, 13);
helper_float_16_recursive(buf + 40960, 13);
helper_float_16_recursive(buf + 49152, 13);
helper_float_16_recursive(buf + 57344, 13);
for (int j = 0; j < 65536; j += 65536) {
for (int k = 0; k < 8192; k += 8) {
__asm__ volatile (
"vmovups (%0), %%ymm0\n"
"vmovups (%1), %%ymm1\n"
"vmovups (%2), %%ymm2\n"
"vmovups (%3), %%ymm3\n"
"vmovups (%4), %%ymm4\n"
"vmovups (%5), %%ymm5\n"
"vmovups (%6), %%ymm6\n"
"vmovups (%7), %%ymm7\n"
"vaddps %%ymm1, %%ymm0, %%ymm8\n"
"vsubps %%ymm1, %%ymm0, %%ymm9\n"
"vaddps %%ymm3, %%ymm2, %%ymm10\n"
"vsubps %%ymm3, %%ymm2, %%ymm11\n"
"vaddps %%ymm5, %%ymm4, %%ymm12\n"
"vsubps %%ymm5, %%ymm4, %%ymm13\n"
"vaddps %%ymm7, %%ymm6, %%ymm14\n"
"vsubps %%ymm7, %%ymm6, %%ymm15\n"
"vaddps %%ymm10, %%ymm8, %%ymm0\n"
"vsubps %%ymm10, %%ymm8, %%ymm2\n"
"vaddps %%ymm11, %%ymm9, %%ymm1\n"
"vsubps %%ymm11, %%ymm9, %%ymm3\n"
"vaddps %%ymm14, %%ymm12, %%ymm4\n"
"vsubps %%ymm14, %%ymm12, %%ymm6\n"
"vaddps %%ymm15, %%ymm13, %%ymm5\n"
"vsubps %%ymm15, %%ymm13, %%ymm7\n"
"vaddps %%ymm4, %%ymm0, %%ymm8\n"
"vsubps %%ymm4, %%ymm0, %%ymm12\n"
"vaddps %%ymm5, %%ymm1, %%ymm9\n"
"vsubps %%ymm5, %%ymm1, %%ymm13\n"
"vaddps %%ymm6, %%ymm2, %%ymm10\n"
"vsubps %%ymm6, %%ymm2, %%ymm14\n"
"vaddps %%ymm7, %%ymm3, %%ymm11\n"
"vsubps %%ymm7, %%ymm3, %%ymm15\n"
"vmovups %%ymm8, (%0)\n"
"vmovups %%ymm9, (%1)\n"
"vmovups %%ymm10, (%2)\n"
"vmovups %%ymm11, (%3)\n"
"vmovups %%ymm12, (%4)\n"
"vmovups %%ymm13, (%5)\n"
"vmovups %%ymm14, (%6)\n"
"vmovups %%ymm15, (%7)\n"
:: "r"(buf + j + k + 0), "r"(buf + j + k + 8192), "r"(buf + j + k + 16384), "r"(buf + j + k + 24576), "r"(buf + j + k + 32768), "r"(buf + j + k + 40960), "r"(buf + j + k + 49152), "r"(buf + j + k + 57344) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
);
}
}
return;
}
}
void helper_float_16(float *buf);
/*
 * Public entry point for the 2^16-point float transform.
 * buf must hold 65536 floats; transformed in place via the depth-16 worker.
 */
void helper_float_16(float *buf) {
helper_float_16_recursive(buf, 16);
}
3050 void helper_float_17_recursive(float *buf, int depth);
helper_float_17_recursive(float * buf,int depth)3051 void helper_float_17_recursive(float *buf, int depth) {
3052 if (depth == 12) {
3053 for (int j = 0; j < 4096; j += 64) {
3054 for (int k = 0; k < 8; k += 8) {
3055 __asm__ volatile (
3056 "vmovups (%0), %%ymm0\n"
3057 "vmovups (%1), %%ymm1\n"
3058 "vmovups (%2), %%ymm2\n"
3059 "vmovups (%3), %%ymm3\n"
3060 "vmovups (%4), %%ymm4\n"
3061 "vmovups (%5), %%ymm5\n"
3062 "vmovups (%6), %%ymm6\n"
3063 "vmovups (%7), %%ymm7\n"
3064 "vpermilps $160, %%ymm0, %%ymm8\n"
3065 "vpermilps $245, %%ymm0, %%ymm9\n"
3066 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3067 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3068 "vaddsubps %%ymm11, %%ymm8, %%ymm0\n"
3069 "vpermilps $160, %%ymm1, %%ymm8\n"
3070 "vpermilps $245, %%ymm1, %%ymm9\n"
3071 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3072 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3073 "vaddsubps %%ymm11, %%ymm8, %%ymm1\n"
3074 "vpermilps $160, %%ymm2, %%ymm8\n"
3075 "vpermilps $245, %%ymm2, %%ymm9\n"
3076 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3077 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3078 "vaddsubps %%ymm11, %%ymm8, %%ymm2\n"
3079 "vpermilps $160, %%ymm3, %%ymm8\n"
3080 "vpermilps $245, %%ymm3, %%ymm9\n"
3081 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3082 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3083 "vaddsubps %%ymm11, %%ymm8, %%ymm3\n"
3084 "vpermilps $160, %%ymm4, %%ymm8\n"
3085 "vpermilps $245, %%ymm4, %%ymm9\n"
3086 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3087 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3088 "vaddsubps %%ymm11, %%ymm8, %%ymm4\n"
3089 "vpermilps $160, %%ymm5, %%ymm8\n"
3090 "vpermilps $245, %%ymm5, %%ymm9\n"
3091 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3092 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3093 "vaddsubps %%ymm11, %%ymm8, %%ymm5\n"
3094 "vpermilps $160, %%ymm6, %%ymm8\n"
3095 "vpermilps $245, %%ymm6, %%ymm9\n"
3096 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3097 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3098 "vaddsubps %%ymm11, %%ymm8, %%ymm6\n"
3099 "vpermilps $160, %%ymm7, %%ymm8\n"
3100 "vpermilps $245, %%ymm7, %%ymm9\n"
3101 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3102 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3103 "vaddsubps %%ymm11, %%ymm8, %%ymm7\n"
3104 "vpermilps $68, %%ymm0, %%ymm8\n"
3105 "vpermilps $238, %%ymm0, %%ymm9\n"
3106 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3107 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3108 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
3109 "vaddps %%ymm8, %%ymm12, %%ymm0\n"
3110 "vpermilps $68, %%ymm1, %%ymm8\n"
3111 "vpermilps $238, %%ymm1, %%ymm9\n"
3112 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3113 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3114 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
3115 "vaddps %%ymm8, %%ymm12, %%ymm1\n"
3116 "vpermilps $68, %%ymm2, %%ymm8\n"
3117 "vpermilps $238, %%ymm2, %%ymm9\n"
3118 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3119 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3120 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
3121 "vaddps %%ymm8, %%ymm12, %%ymm2\n"
3122 "vpermilps $68, %%ymm3, %%ymm8\n"
3123 "vpermilps $238, %%ymm3, %%ymm9\n"
3124 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3125 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3126 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
3127 "vaddps %%ymm8, %%ymm12, %%ymm3\n"
3128 "vpermilps $68, %%ymm4, %%ymm8\n"
3129 "vpermilps $238, %%ymm4, %%ymm9\n"
3130 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3131 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3132 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
3133 "vaddps %%ymm8, %%ymm12, %%ymm4\n"
3134 "vpermilps $68, %%ymm5, %%ymm8\n"
3135 "vpermilps $238, %%ymm5, %%ymm9\n"
3136 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3137 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3138 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
3139 "vaddps %%ymm8, %%ymm12, %%ymm5\n"
3140 "vpermilps $68, %%ymm6, %%ymm8\n"
3141 "vpermilps $238, %%ymm6, %%ymm9\n"
3142 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3143 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3144 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
3145 "vaddps %%ymm8, %%ymm12, %%ymm6\n"
3146 "vpermilps $68, %%ymm7, %%ymm8\n"
3147 "vpermilps $238, %%ymm7, %%ymm9\n"
3148 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3149 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3150 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
3151 "vaddps %%ymm8, %%ymm12, %%ymm7\n"
3152 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
3153 "vsubps %%ymm0, %%ymm8, %%ymm9\n"
3154 "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n"
3155 "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n"
3156 "vaddps %%ymm10, %%ymm11, %%ymm0\n"
3157 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
3158 "vsubps %%ymm1, %%ymm8, %%ymm9\n"
3159 "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n"
3160 "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n"
3161 "vaddps %%ymm10, %%ymm11, %%ymm1\n"
3162 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
3163 "vsubps %%ymm2, %%ymm8, %%ymm9\n"
3164 "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n"
3165 "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n"
3166 "vaddps %%ymm10, %%ymm11, %%ymm2\n"
3167 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
3168 "vsubps %%ymm3, %%ymm8, %%ymm9\n"
3169 "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n"
3170 "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n"
3171 "vaddps %%ymm10, %%ymm11, %%ymm3\n"
3172 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
3173 "vsubps %%ymm4, %%ymm8, %%ymm9\n"
3174 "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n"
3175 "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n"
3176 "vaddps %%ymm10, %%ymm11, %%ymm4\n"
3177 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
3178 "vsubps %%ymm5, %%ymm8, %%ymm9\n"
3179 "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n"
3180 "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n"
3181 "vaddps %%ymm10, %%ymm11, %%ymm5\n"
3182 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
3183 "vsubps %%ymm6, %%ymm8, %%ymm9\n"
3184 "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n"
3185 "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n"
3186 "vaddps %%ymm10, %%ymm11, %%ymm6\n"
3187 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
3188 "vsubps %%ymm7, %%ymm8, %%ymm9\n"
3189 "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n"
3190 "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n"
3191 "vaddps %%ymm10, %%ymm11, %%ymm7\n"
3192 "vaddps %%ymm1, %%ymm0, %%ymm8\n"
3193 "vsubps %%ymm1, %%ymm0, %%ymm9\n"
3194 "vaddps %%ymm3, %%ymm2, %%ymm10\n"
3195 "vsubps %%ymm3, %%ymm2, %%ymm11\n"
3196 "vaddps %%ymm5, %%ymm4, %%ymm12\n"
3197 "vsubps %%ymm5, %%ymm4, %%ymm13\n"
3198 "vaddps %%ymm7, %%ymm6, %%ymm14\n"
3199 "vsubps %%ymm7, %%ymm6, %%ymm15\n"
3200 "vaddps %%ymm10, %%ymm8, %%ymm0\n"
3201 "vsubps %%ymm10, %%ymm8, %%ymm2\n"
3202 "vaddps %%ymm11, %%ymm9, %%ymm1\n"
3203 "vsubps %%ymm11, %%ymm9, %%ymm3\n"
3204 "vaddps %%ymm14, %%ymm12, %%ymm4\n"
3205 "vsubps %%ymm14, %%ymm12, %%ymm6\n"
3206 "vaddps %%ymm15, %%ymm13, %%ymm5\n"
3207 "vsubps %%ymm15, %%ymm13, %%ymm7\n"
3208 "vaddps %%ymm4, %%ymm0, %%ymm8\n"
3209 "vsubps %%ymm4, %%ymm0, %%ymm12\n"
3210 "vaddps %%ymm5, %%ymm1, %%ymm9\n"
3211 "vsubps %%ymm5, %%ymm1, %%ymm13\n"
3212 "vaddps %%ymm6, %%ymm2, %%ymm10\n"
3213 "vsubps %%ymm6, %%ymm2, %%ymm14\n"
3214 "vaddps %%ymm7, %%ymm3, %%ymm11\n"
3215 "vsubps %%ymm7, %%ymm3, %%ymm15\n"
3216 "vmovups %%ymm8, (%0)\n"
3217 "vmovups %%ymm9, (%1)\n"
3218 "vmovups %%ymm10, (%2)\n"
3219 "vmovups %%ymm11, (%3)\n"
3220 "vmovups %%ymm12, (%4)\n"
3221 "vmovups %%ymm13, (%5)\n"
3222 "vmovups %%ymm14, (%6)\n"
3223 "vmovups %%ymm15, (%7)\n"
3224 :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
3225 );
3226 }
3227 }
3228 for (int j = 0; j < 4096; j += 512) {
3229 for (int k = 0; k < 64; k += 8) {
3230 __asm__ volatile (
3231 "vmovups (%0), %%ymm0\n"
3232 "vmovups (%1), %%ymm1\n"
3233 "vmovups (%2), %%ymm2\n"
3234 "vmovups (%3), %%ymm3\n"
3235 "vmovups (%4), %%ymm4\n"
3236 "vmovups (%5), %%ymm5\n"
3237 "vmovups (%6), %%ymm6\n"
3238 "vmovups (%7), %%ymm7\n"
3239 "vaddps %%ymm1, %%ymm0, %%ymm8\n"
3240 "vsubps %%ymm1, %%ymm0, %%ymm9\n"
3241 "vaddps %%ymm3, %%ymm2, %%ymm10\n"
3242 "vsubps %%ymm3, %%ymm2, %%ymm11\n"
3243 "vaddps %%ymm5, %%ymm4, %%ymm12\n"
3244 "vsubps %%ymm5, %%ymm4, %%ymm13\n"
3245 "vaddps %%ymm7, %%ymm6, %%ymm14\n"
3246 "vsubps %%ymm7, %%ymm6, %%ymm15\n"
3247 "vaddps %%ymm10, %%ymm8, %%ymm0\n"
3248 "vsubps %%ymm10, %%ymm8, %%ymm2\n"
3249 "vaddps %%ymm11, %%ymm9, %%ymm1\n"
3250 "vsubps %%ymm11, %%ymm9, %%ymm3\n"
3251 "vaddps %%ymm14, %%ymm12, %%ymm4\n"
3252 "vsubps %%ymm14, %%ymm12, %%ymm6\n"
3253 "vaddps %%ymm15, %%ymm13, %%ymm5\n"
3254 "vsubps %%ymm15, %%ymm13, %%ymm7\n"
3255 "vaddps %%ymm4, %%ymm0, %%ymm8\n"
3256 "vsubps %%ymm4, %%ymm0, %%ymm12\n"
3257 "vaddps %%ymm5, %%ymm1, %%ymm9\n"
3258 "vsubps %%ymm5, %%ymm1, %%ymm13\n"
3259 "vaddps %%ymm6, %%ymm2, %%ymm10\n"
3260 "vsubps %%ymm6, %%ymm2, %%ymm14\n"
3261 "vaddps %%ymm7, %%ymm3, %%ymm11\n"
3262 "vsubps %%ymm7, %%ymm3, %%ymm15\n"
3263 "vmovups %%ymm8, (%0)\n"
3264 "vmovups %%ymm9, (%1)\n"
3265 "vmovups %%ymm10, (%2)\n"
3266 "vmovups %%ymm11, (%3)\n"
3267 "vmovups %%ymm12, (%4)\n"
3268 "vmovups %%ymm13, (%5)\n"
3269 "vmovups %%ymm14, (%6)\n"
3270 "vmovups %%ymm15, (%7)\n"
3271 :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
3272 );
3273 }
3274 }
3275 for (int j = 0; j < 4096; j += 4096) {
3276 for (int k = 0; k < 512; k += 8) {
3277 __asm__ volatile (
3278 "vmovups (%0), %%ymm0\n"
3279 "vmovups (%1), %%ymm1\n"
3280 "vmovups (%2), %%ymm2\n"
3281 "vmovups (%3), %%ymm3\n"
3282 "vmovups (%4), %%ymm4\n"
3283 "vmovups (%5), %%ymm5\n"
3284 "vmovups (%6), %%ymm6\n"
3285 "vmovups (%7), %%ymm7\n"
3286 "vaddps %%ymm1, %%ymm0, %%ymm8\n"
3287 "vsubps %%ymm1, %%ymm0, %%ymm9\n"
3288 "vaddps %%ymm3, %%ymm2, %%ymm10\n"
3289 "vsubps %%ymm3, %%ymm2, %%ymm11\n"
3290 "vaddps %%ymm5, %%ymm4, %%ymm12\n"
3291 "vsubps %%ymm5, %%ymm4, %%ymm13\n"
3292 "vaddps %%ymm7, %%ymm6, %%ymm14\n"
3293 "vsubps %%ymm7, %%ymm6, %%ymm15\n"
3294 "vaddps %%ymm10, %%ymm8, %%ymm0\n"
3295 "vsubps %%ymm10, %%ymm8, %%ymm2\n"
3296 "vaddps %%ymm11, %%ymm9, %%ymm1\n"
3297 "vsubps %%ymm11, %%ymm9, %%ymm3\n"
3298 "vaddps %%ymm14, %%ymm12, %%ymm4\n"
3299 "vsubps %%ymm14, %%ymm12, %%ymm6\n"
3300 "vaddps %%ymm15, %%ymm13, %%ymm5\n"
3301 "vsubps %%ymm15, %%ymm13, %%ymm7\n"
3302 "vaddps %%ymm4, %%ymm0, %%ymm8\n"
3303 "vsubps %%ymm4, %%ymm0, %%ymm12\n"
3304 "vaddps %%ymm5, %%ymm1, %%ymm9\n"
3305 "vsubps %%ymm5, %%ymm1, %%ymm13\n"
3306 "vaddps %%ymm6, %%ymm2, %%ymm10\n"
3307 "vsubps %%ymm6, %%ymm2, %%ymm14\n"
3308 "vaddps %%ymm7, %%ymm3, %%ymm11\n"
3309 "vsubps %%ymm7, %%ymm3, %%ymm15\n"
3310 "vmovups %%ymm8, (%0)\n"
3311 "vmovups %%ymm9, (%1)\n"
3312 "vmovups %%ymm10, (%2)\n"
3313 "vmovups %%ymm11, (%3)\n"
3314 "vmovups %%ymm12, (%4)\n"
3315 "vmovups %%ymm13, (%5)\n"
3316 "vmovups %%ymm14, (%6)\n"
3317 "vmovups %%ymm15, (%7)\n"
3318 :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
3319 );
3320 }
3321 }
3322 return;
3323 }
3324 if (depth == 15) {
3325 helper_float_17_recursive(buf + 0, 12);
3326 helper_float_17_recursive(buf + 4096, 12);
3327 helper_float_17_recursive(buf + 8192, 12);
3328 helper_float_17_recursive(buf + 12288, 12);
3329 helper_float_17_recursive(buf + 16384, 12);
3330 helper_float_17_recursive(buf + 20480, 12);
3331 helper_float_17_recursive(buf + 24576, 12);
3332 helper_float_17_recursive(buf + 28672, 12);
3333 for (int j = 0; j < 32768; j += 32768) {
3334 for (int k = 0; k < 4096; k += 8) {
3335 __asm__ volatile (
3336 "vmovups (%0), %%ymm0\n"
3337 "vmovups (%1), %%ymm1\n"
3338 "vmovups (%2), %%ymm2\n"
3339 "vmovups (%3), %%ymm3\n"
3340 "vmovups (%4), %%ymm4\n"
3341 "vmovups (%5), %%ymm5\n"
3342 "vmovups (%6), %%ymm6\n"
3343 "vmovups (%7), %%ymm7\n"
3344 "vaddps %%ymm1, %%ymm0, %%ymm8\n"
3345 "vsubps %%ymm1, %%ymm0, %%ymm9\n"
3346 "vaddps %%ymm3, %%ymm2, %%ymm10\n"
3347 "vsubps %%ymm3, %%ymm2, %%ymm11\n"
3348 "vaddps %%ymm5, %%ymm4, %%ymm12\n"
3349 "vsubps %%ymm5, %%ymm4, %%ymm13\n"
3350 "vaddps %%ymm7, %%ymm6, %%ymm14\n"
3351 "vsubps %%ymm7, %%ymm6, %%ymm15\n"
3352 "vaddps %%ymm10, %%ymm8, %%ymm0\n"
3353 "vsubps %%ymm10, %%ymm8, %%ymm2\n"
3354 "vaddps %%ymm11, %%ymm9, %%ymm1\n"
3355 "vsubps %%ymm11, %%ymm9, %%ymm3\n"
3356 "vaddps %%ymm14, %%ymm12, %%ymm4\n"
3357 "vsubps %%ymm14, %%ymm12, %%ymm6\n"
3358 "vaddps %%ymm15, %%ymm13, %%ymm5\n"
3359 "vsubps %%ymm15, %%ymm13, %%ymm7\n"
3360 "vaddps %%ymm4, %%ymm0, %%ymm8\n"
3361 "vsubps %%ymm4, %%ymm0, %%ymm12\n"
3362 "vaddps %%ymm5, %%ymm1, %%ymm9\n"
3363 "vsubps %%ymm5, %%ymm1, %%ymm13\n"
3364 "vaddps %%ymm6, %%ymm2, %%ymm10\n"
3365 "vsubps %%ymm6, %%ymm2, %%ymm14\n"
3366 "vaddps %%ymm7, %%ymm3, %%ymm11\n"
3367 "vsubps %%ymm7, %%ymm3, %%ymm15\n"
3368 "vmovups %%ymm8, (%0)\n"
3369 "vmovups %%ymm9, (%1)\n"
3370 "vmovups %%ymm10, (%2)\n"
3371 "vmovups %%ymm11, (%3)\n"
3372 "vmovups %%ymm12, (%4)\n"
3373 "vmovups %%ymm13, (%5)\n"
3374 "vmovups %%ymm14, (%6)\n"
3375 "vmovups %%ymm15, (%7)\n"
3376 :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
3377 );
3378 }
3379 }
3380 return;
3381 }
3382 if (depth == 17) {
3383 helper_float_17_recursive(buf + 0, 15);
3384 helper_float_17_recursive(buf + 32768, 15);
3385 helper_float_17_recursive(buf + 65536, 15);
3386 helper_float_17_recursive(buf + 98304, 15);
3387 for (int j = 0; j < 131072; j += 131072) {
3388 for (int k = 0; k < 32768; k += 8) {
3389 __asm__ volatile (
3390 "vmovups (%0), %%ymm0\n"
3391 "vmovups (%1), %%ymm1\n"
3392 "vmovups (%2), %%ymm2\n"
3393 "vmovups (%3), %%ymm3\n"
3394 "vaddps %%ymm1, %%ymm0, %%ymm8\n"
3395 "vsubps %%ymm1, %%ymm0, %%ymm9\n"
3396 "vaddps %%ymm3, %%ymm2, %%ymm10\n"
3397 "vsubps %%ymm3, %%ymm2, %%ymm11\n"
3398 "vaddps %%ymm10, %%ymm8, %%ymm0\n"
3399 "vsubps %%ymm10, %%ymm8, %%ymm2\n"
3400 "vaddps %%ymm11, %%ymm9, %%ymm1\n"
3401 "vsubps %%ymm11, %%ymm9, %%ymm3\n"
3402 "vmovups %%ymm0, (%0)\n"
3403 "vmovups %%ymm1, (%1)\n"
3404 "vmovups %%ymm2, (%2)\n"
3405 "vmovups %%ymm3, (%3)\n"
3406 :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
3407 );
3408 }
3409 }
3410 return;
3411 }
3412 }
void helper_float_17(float *buf);
/*
 * In-place unnormalized fast Hadamard transform of 2^17 = 131072 floats.
 *
 * Thin public entry point: all of the work is done by
 * helper_float_17_recursive, invoked here at the full depth of 17.
 * buf must hold (at least) 131072 floats; it is overwritten with the
 * transform result.  No normalization factor is applied.
 */
void helper_float_17(float *buf) {
  helper_float_17_recursive(buf, 17);
}
void helper_float_18_recursive(float *buf, int depth);

/*
 * One AVX radix-8 butterfly pass over eight ymm-sized (8-float) rows.
 *
 * Loads 8 floats from each of p0..p7, applies three layers of
 * unnormalized (u+v, u-v) butterflies across the eight rows, and stores
 * the results back in place.  This is the combine step shared by every
 * stage of the transform; it was previously duplicated verbatim in four
 * separate asm blocks below, differing only in the operand addresses.
 * Pointers need not be aligned (vmovups).  Requires AVX at runtime.
 */
static inline void helper_float_18_rad8(float *p0, float *p1, float *p2, float *p3,
                                        float *p4, float *p5, float *p6, float *p7) {
  __asm__ volatile (
    "vmovups (%0), %%ymm0\n"
    "vmovups (%1), %%ymm1\n"
    "vmovups (%2), %%ymm2\n"
    "vmovups (%3), %%ymm3\n"
    "vmovups (%4), %%ymm4\n"
    "vmovups (%5), %%ymm5\n"
    "vmovups (%6), %%ymm6\n"
    "vmovups (%7), %%ymm7\n"
    /* layer 1: pairs (0,1) (2,3) (4,5) (6,7) */
    "vaddps %%ymm1, %%ymm0, %%ymm8\n"
    "vsubps %%ymm1, %%ymm0, %%ymm9\n"
    "vaddps %%ymm3, %%ymm2, %%ymm10\n"
    "vsubps %%ymm3, %%ymm2, %%ymm11\n"
    "vaddps %%ymm5, %%ymm4, %%ymm12\n"
    "vsubps %%ymm5, %%ymm4, %%ymm13\n"
    "vaddps %%ymm7, %%ymm6, %%ymm14\n"
    "vsubps %%ymm7, %%ymm6, %%ymm15\n"
    /* layer 2: distance-2 combines */
    "vaddps %%ymm10, %%ymm8, %%ymm0\n"
    "vsubps %%ymm10, %%ymm8, %%ymm2\n"
    "vaddps %%ymm11, %%ymm9, %%ymm1\n"
    "vsubps %%ymm11, %%ymm9, %%ymm3\n"
    "vaddps %%ymm14, %%ymm12, %%ymm4\n"
    "vsubps %%ymm14, %%ymm12, %%ymm6\n"
    "vaddps %%ymm15, %%ymm13, %%ymm5\n"
    "vsubps %%ymm15, %%ymm13, %%ymm7\n"
    /* layer 3: distance-4 combines */
    "vaddps %%ymm4, %%ymm0, %%ymm8\n"
    "vsubps %%ymm4, %%ymm0, %%ymm12\n"
    "vaddps %%ymm5, %%ymm1, %%ymm9\n"
    "vsubps %%ymm5, %%ymm1, %%ymm13\n"
    "vaddps %%ymm6, %%ymm2, %%ymm10\n"
    "vsubps %%ymm6, %%ymm2, %%ymm14\n"
    "vaddps %%ymm7, %%ymm3, %%ymm11\n"
    "vsubps %%ymm7, %%ymm3, %%ymm15\n"
    "vmovups %%ymm8, (%0)\n"
    "vmovups %%ymm9, (%1)\n"
    "vmovups %%ymm10, (%2)\n"
    "vmovups %%ymm11, (%3)\n"
    "vmovups %%ymm12, (%4)\n"
    "vmovups %%ymm13, (%5)\n"
    "vmovups %%ymm14, (%6)\n"
    "vmovups %%ymm15, (%7)\n"
    :: "r"(p0), "r"(p1), "r"(p2), "r"(p3), "r"(p4), "r"(p5), "r"(p6), "r"(p7) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
  );
}

/*
 * Recursive worker for the in-place unnormalized fast Hadamard transform
 * of 2^18 floats.  `depth` selects the sub-problem size 2^depth; only the
 * depths the generator emits are handled:
 *
 *   depth == 12 : direct 4096-point transform in three passes
 *                 (strides 1..32 fully in registers, then radix-8
 *                 passes for strides 64..256 and 512..2048);
 *   depth == 15 : eight recursive 4096-point transforms + one radix-8
 *                 combine pass (strides 4096/8192/16384);
 *   depth == 18 : eight recursive 32768-point transforms + one radix-8
 *                 combine pass (strides 32768/65536/131072).
 *
 * Any other depth is deliberately a no-op (the generator never produces
 * such a call).  No normalization factor is applied.
 */
void helper_float_18_recursive(float *buf, int depth) {
  if (depth == 12) {
    /* Pass 1: full 64-point transform of each consecutive 64-float block.
     * Strides 1, 2 and 4 are done inside the ymm registers (vpermilps /
     * vaddsubps / vblendps shuffles and a vperm2f128 lane swap), then the
     * same three-layer radix-8 butterfly as helper_float_18_rad8 handles
     * strides 8, 16 and 32 across the eight registers. */
    for (int j = 0; j < 4096; j += 64) {
      for (int k = 0; k < 8; k += 8) {
        __asm__ volatile (
          "vmovups (%0), %%ymm0\n"
          "vmovups (%1), %%ymm1\n"
          "vmovups (%2), %%ymm2\n"
          "vmovups (%3), %%ymm3\n"
          "vmovups (%4), %%ymm4\n"
          "vmovups (%5), %%ymm5\n"
          "vmovups (%6), %%ymm6\n"
          "vmovups (%7), %%ymm7\n"
          /* stride-1 butterflies within each register */
          "vpermilps $160, %%ymm0, %%ymm8\n"
          "vpermilps $245, %%ymm0, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm0\n"
          "vpermilps $160, %%ymm1, %%ymm8\n"
          "vpermilps $245, %%ymm1, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm1\n"
          "vpermilps $160, %%ymm2, %%ymm8\n"
          "vpermilps $245, %%ymm2, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm2\n"
          "vpermilps $160, %%ymm3, %%ymm8\n"
          "vpermilps $245, %%ymm3, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm3\n"
          "vpermilps $160, %%ymm4, %%ymm8\n"
          "vpermilps $245, %%ymm4, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm4\n"
          "vpermilps $160, %%ymm5, %%ymm8\n"
          "vpermilps $245, %%ymm5, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm5\n"
          "vpermilps $160, %%ymm6, %%ymm8\n"
          "vpermilps $245, %%ymm6, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm6\n"
          "vpermilps $160, %%ymm7, %%ymm8\n"
          "vpermilps $245, %%ymm7, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm7\n"
          /* stride-2 butterflies within each register */
          "vpermilps $68, %%ymm0, %%ymm8\n"
          "vpermilps $238, %%ymm0, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm0\n"
          "vpermilps $68, %%ymm1, %%ymm8\n"
          "vpermilps $238, %%ymm1, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm1\n"
          "vpermilps $68, %%ymm2, %%ymm8\n"
          "vpermilps $238, %%ymm2, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm2\n"
          "vpermilps $68, %%ymm3, %%ymm8\n"
          "vpermilps $238, %%ymm3, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm3\n"
          "vpermilps $68, %%ymm4, %%ymm8\n"
          "vpermilps $238, %%ymm4, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm4\n"
          "vpermilps $68, %%ymm5, %%ymm8\n"
          "vpermilps $238, %%ymm5, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm5\n"
          "vpermilps $68, %%ymm6, %%ymm8\n"
          "vpermilps $238, %%ymm6, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm6\n"
          "vpermilps $68, %%ymm7, %%ymm8\n"
          "vpermilps $238, %%ymm7, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm7\n"
          /* stride-4 butterflies: cross-lane via vperm2f128 */
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm0, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm0\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm1, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm1\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm2, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm2\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm3, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm3\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm4, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm4\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm5, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm5\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm6, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm6\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm7, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm7\n"
          /* strides 8/16/32: radix-8 butterfly across the registers */
          "vaddps %%ymm1, %%ymm0, %%ymm8\n"
          "vsubps %%ymm1, %%ymm0, %%ymm9\n"
          "vaddps %%ymm3, %%ymm2, %%ymm10\n"
          "vsubps %%ymm3, %%ymm2, %%ymm11\n"
          "vaddps %%ymm5, %%ymm4, %%ymm12\n"
          "vsubps %%ymm5, %%ymm4, %%ymm13\n"
          "vaddps %%ymm7, %%ymm6, %%ymm14\n"
          "vsubps %%ymm7, %%ymm6, %%ymm15\n"
          "vaddps %%ymm10, %%ymm8, %%ymm0\n"
          "vsubps %%ymm10, %%ymm8, %%ymm2\n"
          "vaddps %%ymm11, %%ymm9, %%ymm1\n"
          "vsubps %%ymm11, %%ymm9, %%ymm3\n"
          "vaddps %%ymm14, %%ymm12, %%ymm4\n"
          "vsubps %%ymm14, %%ymm12, %%ymm6\n"
          "vaddps %%ymm15, %%ymm13, %%ymm5\n"
          "vsubps %%ymm15, %%ymm13, %%ymm7\n"
          "vaddps %%ymm4, %%ymm0, %%ymm8\n"
          "vsubps %%ymm4, %%ymm0, %%ymm12\n"
          "vaddps %%ymm5, %%ymm1, %%ymm9\n"
          "vsubps %%ymm5, %%ymm1, %%ymm13\n"
          "vaddps %%ymm6, %%ymm2, %%ymm10\n"
          "vsubps %%ymm6, %%ymm2, %%ymm14\n"
          "vaddps %%ymm7, %%ymm3, %%ymm11\n"
          "vsubps %%ymm7, %%ymm3, %%ymm15\n"
          "vmovups %%ymm8, (%0)\n"
          "vmovups %%ymm9, (%1)\n"
          "vmovups %%ymm10, (%2)\n"
          "vmovups %%ymm11, (%3)\n"
          "vmovups %%ymm12, (%4)\n"
          "vmovups %%ymm13, (%5)\n"
          "vmovups %%ymm14, (%6)\n"
          "vmovups %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    /* Pass 2: strides 64/128/256 within each 512-float block. */
    for (int j = 0; j < 4096; j += 512) {
      for (int k = 0; k < 64; k += 8) {
        helper_float_18_rad8(buf + j + k + 0, buf + j + k + 64,
                             buf + j + k + 128, buf + j + k + 192,
                             buf + j + k + 256, buf + j + k + 320,
                             buf + j + k + 384, buf + j + k + 448);
      }
    }
    /* Pass 3: strides 512/1024/2048 — completes the 4096-point transform. */
    for (int j = 0; j < 4096; j += 4096) {
      for (int k = 0; k < 512; k += 8) {
        helper_float_18_rad8(buf + j + k + 0, buf + j + k + 512,
                             buf + j + k + 1024, buf + j + k + 1536,
                             buf + j + k + 2048, buf + j + k + 2560,
                             buf + j + k + 3072, buf + j + k + 3584);
      }
    }
    return;
  }
  if (depth == 15) {
    /* Eight 4096-point sub-transforms... */
    helper_float_18_recursive(buf + 0, 12);
    helper_float_18_recursive(buf + 4096, 12);
    helper_float_18_recursive(buf + 8192, 12);
    helper_float_18_recursive(buf + 12288, 12);
    helper_float_18_recursive(buf + 16384, 12);
    helper_float_18_recursive(buf + 20480, 12);
    helper_float_18_recursive(buf + 24576, 12);
    helper_float_18_recursive(buf + 28672, 12);
    /* ...then one combine pass at strides 4096/8192/16384. */
    for (int j = 0; j < 32768; j += 32768) {
      for (int k = 0; k < 4096; k += 8) {
        helper_float_18_rad8(buf + j + k + 0, buf + j + k + 4096,
                             buf + j + k + 8192, buf + j + k + 12288,
                             buf + j + k + 16384, buf + j + k + 20480,
                             buf + j + k + 24576, buf + j + k + 28672);
      }
    }
    return;
  }
  if (depth == 18) {
    /* Eight 32768-point sub-transforms... */
    helper_float_18_recursive(buf + 0, 15);
    helper_float_18_recursive(buf + 32768, 15);
    helper_float_18_recursive(buf + 65536, 15);
    helper_float_18_recursive(buf + 98304, 15);
    helper_float_18_recursive(buf + 131072, 15);
    helper_float_18_recursive(buf + 163840, 15);
    helper_float_18_recursive(buf + 196608, 15);
    helper_float_18_recursive(buf + 229376, 15);
    /* ...then one combine pass at strides 32768/65536/131072. */
    for (int j = 0; j < 262144; j += 262144) {
      for (int k = 0; k < 32768; k += 8) {
        helper_float_18_rad8(buf + j + k + 0, buf + j + k + 32768,
                             buf + j + k + 65536, buf + j + k + 98304,
                             buf + j + k + 131072, buf + j + k + 163840,
                             buf + j + k + 196608, buf + j + k + 229376);
      }
    }
    return;
  }
}
void helper_float_18(float *buf);
/*
 * In-place unnormalized fast Hadamard transform of 2^18 = 262144 floats.
 *
 * Thin public entry point: all of the work is done by
 * helper_float_18_recursive, invoked here at the full depth of 18.
 * buf must hold (at least) 262144 floats; it is overwritten with the
 * transform result.  No normalization factor is applied.
 */
void helper_float_18(float *buf) {
  helper_float_18_recursive(buf, 18);
}
3812 void helper_float_19_recursive(float *buf, int depth);
helper_float_19_recursive(float * buf,int depth)3813 void helper_float_19_recursive(float *buf, int depth) {
3814 if (depth == 13) {
3815 for (int j = 0; j < 8192; j += 64) {
3816 for (int k = 0; k < 8; k += 8) {
3817 __asm__ volatile (
3818 "vmovups (%0), %%ymm0\n"
3819 "vmovups (%1), %%ymm1\n"
3820 "vmovups (%2), %%ymm2\n"
3821 "vmovups (%3), %%ymm3\n"
3822 "vmovups (%4), %%ymm4\n"
3823 "vmovups (%5), %%ymm5\n"
3824 "vmovups (%6), %%ymm6\n"
3825 "vmovups (%7), %%ymm7\n"
3826 "vpermilps $160, %%ymm0, %%ymm8\n"
3827 "vpermilps $245, %%ymm0, %%ymm9\n"
3828 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3829 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3830 "vaddsubps %%ymm11, %%ymm8, %%ymm0\n"
3831 "vpermilps $160, %%ymm1, %%ymm8\n"
3832 "vpermilps $245, %%ymm1, %%ymm9\n"
3833 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3834 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3835 "vaddsubps %%ymm11, %%ymm8, %%ymm1\n"
3836 "vpermilps $160, %%ymm2, %%ymm8\n"
3837 "vpermilps $245, %%ymm2, %%ymm9\n"
3838 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3839 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3840 "vaddsubps %%ymm11, %%ymm8, %%ymm2\n"
3841 "vpermilps $160, %%ymm3, %%ymm8\n"
3842 "vpermilps $245, %%ymm3, %%ymm9\n"
3843 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3844 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3845 "vaddsubps %%ymm11, %%ymm8, %%ymm3\n"
3846 "vpermilps $160, %%ymm4, %%ymm8\n"
3847 "vpermilps $245, %%ymm4, %%ymm9\n"
3848 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3849 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3850 "vaddsubps %%ymm11, %%ymm8, %%ymm4\n"
3851 "vpermilps $160, %%ymm5, %%ymm8\n"
3852 "vpermilps $245, %%ymm5, %%ymm9\n"
3853 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3854 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3855 "vaddsubps %%ymm11, %%ymm8, %%ymm5\n"
3856 "vpermilps $160, %%ymm6, %%ymm8\n"
3857 "vpermilps $245, %%ymm6, %%ymm9\n"
3858 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3859 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3860 "vaddsubps %%ymm11, %%ymm8, %%ymm6\n"
3861 "vpermilps $160, %%ymm7, %%ymm8\n"
3862 "vpermilps $245, %%ymm7, %%ymm9\n"
3863 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3864 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3865 "vaddsubps %%ymm11, %%ymm8, %%ymm7\n"
3866 "vpermilps $68, %%ymm0, %%ymm8\n"
3867 "vpermilps $238, %%ymm0, %%ymm9\n"
3868 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3869 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3870 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
3871 "vaddps %%ymm8, %%ymm12, %%ymm0\n"
3872 "vpermilps $68, %%ymm1, %%ymm8\n"
3873 "vpermilps $238, %%ymm1, %%ymm9\n"
3874 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3875 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3876 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
3877 "vaddps %%ymm8, %%ymm12, %%ymm1\n"
3878 "vpermilps $68, %%ymm2, %%ymm8\n"
3879 "vpermilps $238, %%ymm2, %%ymm9\n"
3880 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3881 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3882 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
3883 "vaddps %%ymm8, %%ymm12, %%ymm2\n"
3884 "vpermilps $68, %%ymm3, %%ymm8\n"
3885 "vpermilps $238, %%ymm3, %%ymm9\n"
3886 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3887 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3888 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
3889 "vaddps %%ymm8, %%ymm12, %%ymm3\n"
3890 "vpermilps $68, %%ymm4, %%ymm8\n"
3891 "vpermilps $238, %%ymm4, %%ymm9\n"
3892 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3893 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3894 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
3895 "vaddps %%ymm8, %%ymm12, %%ymm4\n"
3896 "vpermilps $68, %%ymm5, %%ymm8\n"
3897 "vpermilps $238, %%ymm5, %%ymm9\n"
3898 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3899 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3900 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
3901 "vaddps %%ymm8, %%ymm12, %%ymm5\n"
3902 "vpermilps $68, %%ymm6, %%ymm8\n"
3903 "vpermilps $238, %%ymm6, %%ymm9\n"
3904 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3905 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3906 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
3907 "vaddps %%ymm8, %%ymm12, %%ymm6\n"
3908 "vpermilps $68, %%ymm7, %%ymm8\n"
3909 "vpermilps $238, %%ymm7, %%ymm9\n"
3910 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
3911 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
3912 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
3913 "vaddps %%ymm8, %%ymm12, %%ymm7\n"
3914 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
3915 "vsubps %%ymm0, %%ymm8, %%ymm9\n"
3916 "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n"
3917 "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n"
3918 "vaddps %%ymm10, %%ymm11, %%ymm0\n"
3919 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
3920 "vsubps %%ymm1, %%ymm8, %%ymm9\n"
3921 "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n"
3922 "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n"
3923 "vaddps %%ymm10, %%ymm11, %%ymm1\n"
3924 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
3925 "vsubps %%ymm2, %%ymm8, %%ymm9\n"
3926 "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n"
3927 "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n"
3928 "vaddps %%ymm10, %%ymm11, %%ymm2\n"
3929 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
3930 "vsubps %%ymm3, %%ymm8, %%ymm9\n"
3931 "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n"
3932 "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n"
3933 "vaddps %%ymm10, %%ymm11, %%ymm3\n"
3934 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
3935 "vsubps %%ymm4, %%ymm8, %%ymm9\n"
3936 "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n"
3937 "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n"
3938 "vaddps %%ymm10, %%ymm11, %%ymm4\n"
3939 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
3940 "vsubps %%ymm5, %%ymm8, %%ymm9\n"
3941 "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n"
3942 "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n"
3943 "vaddps %%ymm10, %%ymm11, %%ymm5\n"
3944 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
3945 "vsubps %%ymm6, %%ymm8, %%ymm9\n"
3946 "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n"
3947 "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n"
3948 "vaddps %%ymm10, %%ymm11, %%ymm6\n"
3949 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
3950 "vsubps %%ymm7, %%ymm8, %%ymm9\n"
3951 "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n"
3952 "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n"
3953 "vaddps %%ymm10, %%ymm11, %%ymm7\n"
3954 "vaddps %%ymm1, %%ymm0, %%ymm8\n"
3955 "vsubps %%ymm1, %%ymm0, %%ymm9\n"
3956 "vaddps %%ymm3, %%ymm2, %%ymm10\n"
3957 "vsubps %%ymm3, %%ymm2, %%ymm11\n"
3958 "vaddps %%ymm5, %%ymm4, %%ymm12\n"
3959 "vsubps %%ymm5, %%ymm4, %%ymm13\n"
3960 "vaddps %%ymm7, %%ymm6, %%ymm14\n"
3961 "vsubps %%ymm7, %%ymm6, %%ymm15\n"
3962 "vaddps %%ymm10, %%ymm8, %%ymm0\n"
3963 "vsubps %%ymm10, %%ymm8, %%ymm2\n"
3964 "vaddps %%ymm11, %%ymm9, %%ymm1\n"
3965 "vsubps %%ymm11, %%ymm9, %%ymm3\n"
3966 "vaddps %%ymm14, %%ymm12, %%ymm4\n"
3967 "vsubps %%ymm14, %%ymm12, %%ymm6\n"
3968 "vaddps %%ymm15, %%ymm13, %%ymm5\n"
3969 "vsubps %%ymm15, %%ymm13, %%ymm7\n"
3970 "vaddps %%ymm4, %%ymm0, %%ymm8\n"
3971 "vsubps %%ymm4, %%ymm0, %%ymm12\n"
3972 "vaddps %%ymm5, %%ymm1, %%ymm9\n"
3973 "vsubps %%ymm5, %%ymm1, %%ymm13\n"
3974 "vaddps %%ymm6, %%ymm2, %%ymm10\n"
3975 "vsubps %%ymm6, %%ymm2, %%ymm14\n"
3976 "vaddps %%ymm7, %%ymm3, %%ymm11\n"
3977 "vsubps %%ymm7, %%ymm3, %%ymm15\n"
3978 "vmovups %%ymm8, (%0)\n"
3979 "vmovups %%ymm9, (%1)\n"
3980 "vmovups %%ymm10, (%2)\n"
3981 "vmovups %%ymm11, (%3)\n"
3982 "vmovups %%ymm12, (%4)\n"
3983 "vmovups %%ymm13, (%5)\n"
3984 "vmovups %%ymm14, (%6)\n"
3985 "vmovups %%ymm15, (%7)\n"
3986 :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
3987 );
3988 }
3989 }
3990 for (int j = 0; j < 8192; j += 512) {
3991 for (int k = 0; k < 64; k += 8) {
3992 __asm__ volatile (
3993 "vmovups (%0), %%ymm0\n"
3994 "vmovups (%1), %%ymm1\n"
3995 "vmovups (%2), %%ymm2\n"
3996 "vmovups (%3), %%ymm3\n"
3997 "vmovups (%4), %%ymm4\n"
3998 "vmovups (%5), %%ymm5\n"
3999 "vmovups (%6), %%ymm6\n"
4000 "vmovups (%7), %%ymm7\n"
4001 "vaddps %%ymm1, %%ymm0, %%ymm8\n"
4002 "vsubps %%ymm1, %%ymm0, %%ymm9\n"
4003 "vaddps %%ymm3, %%ymm2, %%ymm10\n"
4004 "vsubps %%ymm3, %%ymm2, %%ymm11\n"
4005 "vaddps %%ymm5, %%ymm4, %%ymm12\n"
4006 "vsubps %%ymm5, %%ymm4, %%ymm13\n"
4007 "vaddps %%ymm7, %%ymm6, %%ymm14\n"
4008 "vsubps %%ymm7, %%ymm6, %%ymm15\n"
4009 "vaddps %%ymm10, %%ymm8, %%ymm0\n"
4010 "vsubps %%ymm10, %%ymm8, %%ymm2\n"
4011 "vaddps %%ymm11, %%ymm9, %%ymm1\n"
4012 "vsubps %%ymm11, %%ymm9, %%ymm3\n"
4013 "vaddps %%ymm14, %%ymm12, %%ymm4\n"
4014 "vsubps %%ymm14, %%ymm12, %%ymm6\n"
4015 "vaddps %%ymm15, %%ymm13, %%ymm5\n"
4016 "vsubps %%ymm15, %%ymm13, %%ymm7\n"
4017 "vaddps %%ymm4, %%ymm0, %%ymm8\n"
4018 "vsubps %%ymm4, %%ymm0, %%ymm12\n"
4019 "vaddps %%ymm5, %%ymm1, %%ymm9\n"
4020 "vsubps %%ymm5, %%ymm1, %%ymm13\n"
4021 "vaddps %%ymm6, %%ymm2, %%ymm10\n"
4022 "vsubps %%ymm6, %%ymm2, %%ymm14\n"
4023 "vaddps %%ymm7, %%ymm3, %%ymm11\n"
4024 "vsubps %%ymm7, %%ymm3, %%ymm15\n"
4025 "vmovups %%ymm8, (%0)\n"
4026 "vmovups %%ymm9, (%1)\n"
4027 "vmovups %%ymm10, (%2)\n"
4028 "vmovups %%ymm11, (%3)\n"
4029 "vmovups %%ymm12, (%4)\n"
4030 "vmovups %%ymm13, (%5)\n"
4031 "vmovups %%ymm14, (%6)\n"
4032 "vmovups %%ymm15, (%7)\n"
4033 :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
4034 );
4035 }
4036 }
4037 for (int j = 0; j < 8192; j += 4096) {
4038 for (int k = 0; k < 512; k += 8) {
4039 __asm__ volatile (
4040 "vmovups (%0), %%ymm0\n"
4041 "vmovups (%1), %%ymm1\n"
4042 "vmovups (%2), %%ymm2\n"
4043 "vmovups (%3), %%ymm3\n"
4044 "vmovups (%4), %%ymm4\n"
4045 "vmovups (%5), %%ymm5\n"
4046 "vmovups (%6), %%ymm6\n"
4047 "vmovups (%7), %%ymm7\n"
4048 "vaddps %%ymm1, %%ymm0, %%ymm8\n"
4049 "vsubps %%ymm1, %%ymm0, %%ymm9\n"
4050 "vaddps %%ymm3, %%ymm2, %%ymm10\n"
4051 "vsubps %%ymm3, %%ymm2, %%ymm11\n"
4052 "vaddps %%ymm5, %%ymm4, %%ymm12\n"
4053 "vsubps %%ymm5, %%ymm4, %%ymm13\n"
4054 "vaddps %%ymm7, %%ymm6, %%ymm14\n"
4055 "vsubps %%ymm7, %%ymm6, %%ymm15\n"
4056 "vaddps %%ymm10, %%ymm8, %%ymm0\n"
4057 "vsubps %%ymm10, %%ymm8, %%ymm2\n"
4058 "vaddps %%ymm11, %%ymm9, %%ymm1\n"
4059 "vsubps %%ymm11, %%ymm9, %%ymm3\n"
4060 "vaddps %%ymm14, %%ymm12, %%ymm4\n"
4061 "vsubps %%ymm14, %%ymm12, %%ymm6\n"
4062 "vaddps %%ymm15, %%ymm13, %%ymm5\n"
4063 "vsubps %%ymm15, %%ymm13, %%ymm7\n"
4064 "vaddps %%ymm4, %%ymm0, %%ymm8\n"
4065 "vsubps %%ymm4, %%ymm0, %%ymm12\n"
4066 "vaddps %%ymm5, %%ymm1, %%ymm9\n"
4067 "vsubps %%ymm5, %%ymm1, %%ymm13\n"
4068 "vaddps %%ymm6, %%ymm2, %%ymm10\n"
4069 "vsubps %%ymm6, %%ymm2, %%ymm14\n"
4070 "vaddps %%ymm7, %%ymm3, %%ymm11\n"
4071 "vsubps %%ymm7, %%ymm3, %%ymm15\n"
4072 "vmovups %%ymm8, (%0)\n"
4073 "vmovups %%ymm9, (%1)\n"
4074 "vmovups %%ymm10, (%2)\n"
4075 "vmovups %%ymm11, (%3)\n"
4076 "vmovups %%ymm12, (%4)\n"
4077 "vmovups %%ymm13, (%5)\n"
4078 "vmovups %%ymm14, (%6)\n"
4079 "vmovups %%ymm15, (%7)\n"
4080 :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
4081 );
4082 }
4083 }
4084 for (int j = 0; j < 8192; j += 8192) {
4085 for (int k = 0; k < 4096; k += 8) {
4086 __asm__ volatile (
4087 "vmovups (%0), %%ymm0\n"
4088 "vmovups (%1), %%ymm1\n"
4089 "vaddps %%ymm1, %%ymm0, %%ymm8\n"
4090 "vsubps %%ymm1, %%ymm0, %%ymm9\n"
4091 "vmovups %%ymm8, (%0)\n"
4092 "vmovups %%ymm9, (%1)\n"
4093 :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
4094 );
4095 }
4096 }
4097 return;
4098 }
4099 if (depth == 16) {
4100 helper_float_19_recursive(buf + 0, 13);
4101 helper_float_19_recursive(buf + 8192, 13);
4102 helper_float_19_recursive(buf + 16384, 13);
4103 helper_float_19_recursive(buf + 24576, 13);
4104 helper_float_19_recursive(buf + 32768, 13);
4105 helper_float_19_recursive(buf + 40960, 13);
4106 helper_float_19_recursive(buf + 49152, 13);
4107 helper_float_19_recursive(buf + 57344, 13);
4108 for (int j = 0; j < 65536; j += 65536) {
4109 for (int k = 0; k < 8192; k += 8) {
4110 __asm__ volatile (
4111 "vmovups (%0), %%ymm0\n"
4112 "vmovups (%1), %%ymm1\n"
4113 "vmovups (%2), %%ymm2\n"
4114 "vmovups (%3), %%ymm3\n"
4115 "vmovups (%4), %%ymm4\n"
4116 "vmovups (%5), %%ymm5\n"
4117 "vmovups (%6), %%ymm6\n"
4118 "vmovups (%7), %%ymm7\n"
4119 "vaddps %%ymm1, %%ymm0, %%ymm8\n"
4120 "vsubps %%ymm1, %%ymm0, %%ymm9\n"
4121 "vaddps %%ymm3, %%ymm2, %%ymm10\n"
4122 "vsubps %%ymm3, %%ymm2, %%ymm11\n"
4123 "vaddps %%ymm5, %%ymm4, %%ymm12\n"
4124 "vsubps %%ymm5, %%ymm4, %%ymm13\n"
4125 "vaddps %%ymm7, %%ymm6, %%ymm14\n"
4126 "vsubps %%ymm7, %%ymm6, %%ymm15\n"
4127 "vaddps %%ymm10, %%ymm8, %%ymm0\n"
4128 "vsubps %%ymm10, %%ymm8, %%ymm2\n"
4129 "vaddps %%ymm11, %%ymm9, %%ymm1\n"
4130 "vsubps %%ymm11, %%ymm9, %%ymm3\n"
4131 "vaddps %%ymm14, %%ymm12, %%ymm4\n"
4132 "vsubps %%ymm14, %%ymm12, %%ymm6\n"
4133 "vaddps %%ymm15, %%ymm13, %%ymm5\n"
4134 "vsubps %%ymm15, %%ymm13, %%ymm7\n"
4135 "vaddps %%ymm4, %%ymm0, %%ymm8\n"
4136 "vsubps %%ymm4, %%ymm0, %%ymm12\n"
4137 "vaddps %%ymm5, %%ymm1, %%ymm9\n"
4138 "vsubps %%ymm5, %%ymm1, %%ymm13\n"
4139 "vaddps %%ymm6, %%ymm2, %%ymm10\n"
4140 "vsubps %%ymm6, %%ymm2, %%ymm14\n"
4141 "vaddps %%ymm7, %%ymm3, %%ymm11\n"
4142 "vsubps %%ymm7, %%ymm3, %%ymm15\n"
4143 "vmovups %%ymm8, (%0)\n"
4144 "vmovups %%ymm9, (%1)\n"
4145 "vmovups %%ymm10, (%2)\n"
4146 "vmovups %%ymm11, (%3)\n"
4147 "vmovups %%ymm12, (%4)\n"
4148 "vmovups %%ymm13, (%5)\n"
4149 "vmovups %%ymm14, (%6)\n"
4150 "vmovups %%ymm15, (%7)\n"
4151 :: "r"(buf + j + k + 0), "r"(buf + j + k + 8192), "r"(buf + j + k + 16384), "r"(buf + j + k + 24576), "r"(buf + j + k + 32768), "r"(buf + j + k + 40960), "r"(buf + j + k + 49152), "r"(buf + j + k + 57344) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
4152 );
4153 }
4154 }
4155 return;
4156 }
4157 if (depth == 19) {
4158 helper_float_19_recursive(buf + 0, 16);
4159 helper_float_19_recursive(buf + 65536, 16);
4160 helper_float_19_recursive(buf + 131072, 16);
4161 helper_float_19_recursive(buf + 196608, 16);
4162 helper_float_19_recursive(buf + 262144, 16);
4163 helper_float_19_recursive(buf + 327680, 16);
4164 helper_float_19_recursive(buf + 393216, 16);
4165 helper_float_19_recursive(buf + 458752, 16);
4166 for (int j = 0; j < 524288; j += 524288) {
4167 for (int k = 0; k < 65536; k += 8) {
4168 __asm__ volatile (
4169 "vmovups (%0), %%ymm0\n"
4170 "vmovups (%1), %%ymm1\n"
4171 "vmovups (%2), %%ymm2\n"
4172 "vmovups (%3), %%ymm3\n"
4173 "vmovups (%4), %%ymm4\n"
4174 "vmovups (%5), %%ymm5\n"
4175 "vmovups (%6), %%ymm6\n"
4176 "vmovups (%7), %%ymm7\n"
4177 "vaddps %%ymm1, %%ymm0, %%ymm8\n"
4178 "vsubps %%ymm1, %%ymm0, %%ymm9\n"
4179 "vaddps %%ymm3, %%ymm2, %%ymm10\n"
4180 "vsubps %%ymm3, %%ymm2, %%ymm11\n"
4181 "vaddps %%ymm5, %%ymm4, %%ymm12\n"
4182 "vsubps %%ymm5, %%ymm4, %%ymm13\n"
4183 "vaddps %%ymm7, %%ymm6, %%ymm14\n"
4184 "vsubps %%ymm7, %%ymm6, %%ymm15\n"
4185 "vaddps %%ymm10, %%ymm8, %%ymm0\n"
4186 "vsubps %%ymm10, %%ymm8, %%ymm2\n"
4187 "vaddps %%ymm11, %%ymm9, %%ymm1\n"
4188 "vsubps %%ymm11, %%ymm9, %%ymm3\n"
4189 "vaddps %%ymm14, %%ymm12, %%ymm4\n"
4190 "vsubps %%ymm14, %%ymm12, %%ymm6\n"
4191 "vaddps %%ymm15, %%ymm13, %%ymm5\n"
4192 "vsubps %%ymm15, %%ymm13, %%ymm7\n"
4193 "vaddps %%ymm4, %%ymm0, %%ymm8\n"
4194 "vsubps %%ymm4, %%ymm0, %%ymm12\n"
4195 "vaddps %%ymm5, %%ymm1, %%ymm9\n"
4196 "vsubps %%ymm5, %%ymm1, %%ymm13\n"
4197 "vaddps %%ymm6, %%ymm2, %%ymm10\n"
4198 "vsubps %%ymm6, %%ymm2, %%ymm14\n"
4199 "vaddps %%ymm7, %%ymm3, %%ymm11\n"
4200 "vsubps %%ymm7, %%ymm3, %%ymm15\n"
4201 "vmovups %%ymm8, (%0)\n"
4202 "vmovups %%ymm9, (%1)\n"
4203 "vmovups %%ymm10, (%2)\n"
4204 "vmovups %%ymm11, (%3)\n"
4205 "vmovups %%ymm12, (%4)\n"
4206 "vmovups %%ymm13, (%5)\n"
4207 "vmovups %%ymm14, (%6)\n"
4208 "vmovups %%ymm15, (%7)\n"
4209 :: "r"(buf + j + k + 0), "r"(buf + j + k + 65536), "r"(buf + j + k + 131072), "r"(buf + j + k + 196608), "r"(buf + j + k + 262144), "r"(buf + j + k + 327680), "r"(buf + j + k + 393216), "r"(buf + j + k + 458752) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
4210 );
4211 }
4212 }
4213 return;
4214 }
4215 }
void helper_float_19(float *buf);
/*
 * Public entry point: in-place, unnormalized fast Hadamard transform of
 * 2^19 = 524288 floats.
 *
 * buf - array of 2^19 floats; the transform uses unaligned 32-byte AVX
 *       loads/stores (vmovups), so no special alignment is required.
 *
 * Simply starts the recursive implementation at the full depth.
 */
void helper_float_19(float *buf) {
  helper_float_19_recursive(buf, 19);
}
void helper_float_20_recursive(float *buf, int depth);
/*
 * One level of a recursive, in-place, unnormalized fast Hadamard
 * (Walsh-Hadamard) transform over a buffer of 2^20 floats.
 *
 * buf   - start of the sub-block to transform; must hold 2^depth floats and
 *         be readable/writable with unaligned 32-byte AVX accesses (vmovups).
 * depth - log2 of this sub-transform's length.  Only the depths produced by
 *         the fixed recursion schedule are handled: 12 (base case), 15, 18
 *         and 20 (top level).  Any other depth falls through and does nothing.
 *
 * Every butterfly maps a pair (u, v) to (u + v, u - v) with no 1/sqrt(2)
 * scaling, so the overall result is the unnormalized WHT (compare the scalar
 * reference helpers at the top of this file).
 */
void helper_float_20_recursive(float *buf, int depth) {
  if (depth == 12) {
    /* Base case: full 2^12-point transform in three passes over the block.
     *
     * Pass 1 handles butterfly levels 1..6 of each 64-float tile:
     * levels 1-3 (strides 1, 2, 4) inside each 8-lane YMM register, then
     * levels 4-6 (strides 8, 16, 32) across the tile's eight registers. */
    for (int j = 0; j < 4096; j += 64) {
      /* Inner loop runs exactly once (k == 0); kept for generator symmetry
       * with the multi-iteration passes below. */
      for (int k = 0; k < 8; k += 8) {
        __asm__ volatile (
        /* Load one 64-float tile as eight YMM vectors. */
        "vmovups (%0), %%ymm0\n"
        "vmovups (%1), %%ymm1\n"
        "vmovups (%2), %%ymm2\n"
        "vmovups (%3), %%ymm3\n"
        "vmovups (%4), %%ymm4\n"
        "vmovups (%5), %%ymm5\n"
        "vmovups (%6), %%ymm6\n"
        "vmovups (%7), %%ymm7\n"
        /* Level 1 (stride 1), per register: $160/$245 duplicate the
         * even/odd lanes, the odd copy is negated, and vaddsubps
         * (even: subtract, odd: add) yields (u+v, u-v) per adjacent pair. */
        "vpermilps $160, %%ymm0, %%ymm8\n"
        "vpermilps $245, %%ymm0, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubps %%ymm11, %%ymm8, %%ymm0\n"
        "vpermilps $160, %%ymm1, %%ymm8\n"
        "vpermilps $245, %%ymm1, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubps %%ymm11, %%ymm8, %%ymm1\n"
        "vpermilps $160, %%ymm2, %%ymm8\n"
        "vpermilps $245, %%ymm2, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubps %%ymm11, %%ymm8, %%ymm2\n"
        "vpermilps $160, %%ymm3, %%ymm8\n"
        "vpermilps $245, %%ymm3, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubps %%ymm11, %%ymm8, %%ymm3\n"
        "vpermilps $160, %%ymm4, %%ymm8\n"
        "vpermilps $245, %%ymm4, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubps %%ymm11, %%ymm8, %%ymm4\n"
        "vpermilps $160, %%ymm5, %%ymm8\n"
        "vpermilps $245, %%ymm5, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubps %%ymm11, %%ymm8, %%ymm5\n"
        "vpermilps $160, %%ymm6, %%ymm8\n"
        "vpermilps $245, %%ymm6, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubps %%ymm11, %%ymm8, %%ymm6\n"
        "vpermilps $160, %%ymm7, %%ymm8\n"
        "vpermilps $245, %%ymm7, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubps %%ymm11, %%ymm8, %%ymm7\n"
        /* Level 2 (stride 2), per register: $68/$238 duplicate the low/high
         * element pair of each 128-bit lane; vblendps $204 keeps the high
         * pair positive in slots 0-1 and negated in slots 2-3, so the add
         * produces (a+c, b+d, a-c, b-d) per lane. */
        "vpermilps $68, %%ymm0, %%ymm8\n"
        "vpermilps $238, %%ymm0, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
        "vaddps %%ymm8, %%ymm12, %%ymm0\n"
        "vpermilps $68, %%ymm1, %%ymm8\n"
        "vpermilps $238, %%ymm1, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
        "vaddps %%ymm8, %%ymm12, %%ymm1\n"
        "vpermilps $68, %%ymm2, %%ymm8\n"
        "vpermilps $238, %%ymm2, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
        "vaddps %%ymm8, %%ymm12, %%ymm2\n"
        "vpermilps $68, %%ymm3, %%ymm8\n"
        "vpermilps $238, %%ymm3, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
        "vaddps %%ymm8, %%ymm12, %%ymm3\n"
        "vpermilps $68, %%ymm4, %%ymm8\n"
        "vpermilps $238, %%ymm4, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
        "vaddps %%ymm8, %%ymm12, %%ymm4\n"
        "vpermilps $68, %%ymm5, %%ymm8\n"
        "vpermilps $238, %%ymm5, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
        "vaddps %%ymm8, %%ymm12, %%ymm5\n"
        "vpermilps $68, %%ymm6, %%ymm8\n"
        "vpermilps $238, %%ymm6, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
        "vaddps %%ymm8, %%ymm12, %%ymm6\n"
        "vpermilps $68, %%ymm7, %%ymm8\n"
        "vpermilps $238, %%ymm7, %%ymm9\n"
        "vxorps %%ymm10, %%ymm10, %%ymm10\n"
        "vsubps %%ymm9, %%ymm10, %%ymm11\n"
        "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
        "vaddps %%ymm8, %%ymm12, %%ymm7\n"
        /* Level 3 (stride 4), per register: vperm2f128 $0 broadcasts the
         * low 128-bit lane, $49 pairs the high lane with the negated high
         * lane, so the add gives (lo+hi, lo-hi) across the two lanes. */
        "vxorps %%ymm8, %%ymm8, %%ymm8\n"
        "vsubps %%ymm0, %%ymm8, %%ymm9\n"
        "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n"
        "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n"
        "vaddps %%ymm10, %%ymm11, %%ymm0\n"
        "vxorps %%ymm8, %%ymm8, %%ymm8\n"
        "vsubps %%ymm1, %%ymm8, %%ymm9\n"
        "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n"
        "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n"
        "vaddps %%ymm10, %%ymm11, %%ymm1\n"
        "vxorps %%ymm8, %%ymm8, %%ymm8\n"
        "vsubps %%ymm2, %%ymm8, %%ymm9\n"
        "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n"
        "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n"
        "vaddps %%ymm10, %%ymm11, %%ymm2\n"
        "vxorps %%ymm8, %%ymm8, %%ymm8\n"
        "vsubps %%ymm3, %%ymm8, %%ymm9\n"
        "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n"
        "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n"
        "vaddps %%ymm10, %%ymm11, %%ymm3\n"
        "vxorps %%ymm8, %%ymm8, %%ymm8\n"
        "vsubps %%ymm4, %%ymm8, %%ymm9\n"
        "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n"
        "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n"
        "vaddps %%ymm10, %%ymm11, %%ymm4\n"
        "vxorps %%ymm8, %%ymm8, %%ymm8\n"
        "vsubps %%ymm5, %%ymm8, %%ymm9\n"
        "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n"
        "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n"
        "vaddps %%ymm10, %%ymm11, %%ymm5\n"
        "vxorps %%ymm8, %%ymm8, %%ymm8\n"
        "vsubps %%ymm6, %%ymm8, %%ymm9\n"
        "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n"
        "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n"
        "vaddps %%ymm10, %%ymm11, %%ymm6\n"
        "vxorps %%ymm8, %%ymm8, %%ymm8\n"
        "vsubps %%ymm7, %%ymm8, %%ymm9\n"
        "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n"
        "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n"
        "vaddps %%ymm10, %%ymm11, %%ymm7\n"
        /* Levels 4-6: an 8-input Hadamard across registers ymm0..ymm7
         * (three cross-register butterfly levels); results end up in
         * ymm8..ymm15 in natural order. */
        "vaddps %%ymm1, %%ymm0, %%ymm8\n"
        "vsubps %%ymm1, %%ymm0, %%ymm9\n"
        "vaddps %%ymm3, %%ymm2, %%ymm10\n"
        "vsubps %%ymm3, %%ymm2, %%ymm11\n"
        "vaddps %%ymm5, %%ymm4, %%ymm12\n"
        "vsubps %%ymm5, %%ymm4, %%ymm13\n"
        "vaddps %%ymm7, %%ymm6, %%ymm14\n"
        "vsubps %%ymm7, %%ymm6, %%ymm15\n"
        "vaddps %%ymm10, %%ymm8, %%ymm0\n"
        "vsubps %%ymm10, %%ymm8, %%ymm2\n"
        "vaddps %%ymm11, %%ymm9, %%ymm1\n"
        "vsubps %%ymm11, %%ymm9, %%ymm3\n"
        "vaddps %%ymm14, %%ymm12, %%ymm4\n"
        "vsubps %%ymm14, %%ymm12, %%ymm6\n"
        "vaddps %%ymm15, %%ymm13, %%ymm5\n"
        "vsubps %%ymm15, %%ymm13, %%ymm7\n"
        "vaddps %%ymm4, %%ymm0, %%ymm8\n"
        "vsubps %%ymm4, %%ymm0, %%ymm12\n"
        "vaddps %%ymm5, %%ymm1, %%ymm9\n"
        "vsubps %%ymm5, %%ymm1, %%ymm13\n"
        "vaddps %%ymm6, %%ymm2, %%ymm10\n"
        "vsubps %%ymm6, %%ymm2, %%ymm14\n"
        "vaddps %%ymm7, %%ymm3, %%ymm11\n"
        "vsubps %%ymm7, %%ymm3, %%ymm15\n"
        /* Store the transformed tile back in place. */
        "vmovups %%ymm8, (%0)\n"
        "vmovups %%ymm9, (%1)\n"
        "vmovups %%ymm10, (%2)\n"
        "vmovups %%ymm11, (%3)\n"
        "vmovups %%ymm12, (%4)\n"
        "vmovups %%ymm13, (%5)\n"
        "vmovups %%ymm14, (%6)\n"
        "vmovups %%ymm15, (%7)\n"
        :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    /* Pass 2: levels 7-9 - radix-8 butterflies at strides 64/128/256
     * within each 512-float group. */
    for (int j = 0; j < 4096; j += 512) {
      for (int k = 0; k < 64; k += 8) {
        __asm__ volatile (
        "vmovups (%0), %%ymm0\n"
        "vmovups (%1), %%ymm1\n"
        "vmovups (%2), %%ymm2\n"
        "vmovups (%3), %%ymm3\n"
        "vmovups (%4), %%ymm4\n"
        "vmovups (%5), %%ymm5\n"
        "vmovups (%6), %%ymm6\n"
        "vmovups (%7), %%ymm7\n"
        /* 8-input Hadamard across the eight strided vectors. */
        "vaddps %%ymm1, %%ymm0, %%ymm8\n"
        "vsubps %%ymm1, %%ymm0, %%ymm9\n"
        "vaddps %%ymm3, %%ymm2, %%ymm10\n"
        "vsubps %%ymm3, %%ymm2, %%ymm11\n"
        "vaddps %%ymm5, %%ymm4, %%ymm12\n"
        "vsubps %%ymm5, %%ymm4, %%ymm13\n"
        "vaddps %%ymm7, %%ymm6, %%ymm14\n"
        "vsubps %%ymm7, %%ymm6, %%ymm15\n"
        "vaddps %%ymm10, %%ymm8, %%ymm0\n"
        "vsubps %%ymm10, %%ymm8, %%ymm2\n"
        "vaddps %%ymm11, %%ymm9, %%ymm1\n"
        "vsubps %%ymm11, %%ymm9, %%ymm3\n"
        "vaddps %%ymm14, %%ymm12, %%ymm4\n"
        "vsubps %%ymm14, %%ymm12, %%ymm6\n"
        "vaddps %%ymm15, %%ymm13, %%ymm5\n"
        "vsubps %%ymm15, %%ymm13, %%ymm7\n"
        "vaddps %%ymm4, %%ymm0, %%ymm8\n"
        "vsubps %%ymm4, %%ymm0, %%ymm12\n"
        "vaddps %%ymm5, %%ymm1, %%ymm9\n"
        "vsubps %%ymm5, %%ymm1, %%ymm13\n"
        "vaddps %%ymm6, %%ymm2, %%ymm10\n"
        "vsubps %%ymm6, %%ymm2, %%ymm14\n"
        "vaddps %%ymm7, %%ymm3, %%ymm11\n"
        "vsubps %%ymm7, %%ymm3, %%ymm15\n"
        "vmovups %%ymm8, (%0)\n"
        "vmovups %%ymm9, (%1)\n"
        "vmovups %%ymm10, (%2)\n"
        "vmovups %%ymm11, (%3)\n"
        "vmovups %%ymm12, (%4)\n"
        "vmovups %%ymm13, (%5)\n"
        "vmovups %%ymm14, (%6)\n"
        "vmovups %%ymm15, (%7)\n"
        :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    /* Pass 3: levels 10-12 - radix-8 butterflies at strides 512/1024/2048
     * across the whole 4096-float block. */
    for (int j = 0; j < 4096; j += 4096) {
      for (int k = 0; k < 512; k += 8) {
        __asm__ volatile (
        "vmovups (%0), %%ymm0\n"
        "vmovups (%1), %%ymm1\n"
        "vmovups (%2), %%ymm2\n"
        "vmovups (%3), %%ymm3\n"
        "vmovups (%4), %%ymm4\n"
        "vmovups (%5), %%ymm5\n"
        "vmovups (%6), %%ymm6\n"
        "vmovups (%7), %%ymm7\n"
        "vaddps %%ymm1, %%ymm0, %%ymm8\n"
        "vsubps %%ymm1, %%ymm0, %%ymm9\n"
        "vaddps %%ymm3, %%ymm2, %%ymm10\n"
        "vsubps %%ymm3, %%ymm2, %%ymm11\n"
        "vaddps %%ymm5, %%ymm4, %%ymm12\n"
        "vsubps %%ymm5, %%ymm4, %%ymm13\n"
        "vaddps %%ymm7, %%ymm6, %%ymm14\n"
        "vsubps %%ymm7, %%ymm6, %%ymm15\n"
        "vaddps %%ymm10, %%ymm8, %%ymm0\n"
        "vsubps %%ymm10, %%ymm8, %%ymm2\n"
        "vaddps %%ymm11, %%ymm9, %%ymm1\n"
        "vsubps %%ymm11, %%ymm9, %%ymm3\n"
        "vaddps %%ymm14, %%ymm12, %%ymm4\n"
        "vsubps %%ymm14, %%ymm12, %%ymm6\n"
        "vaddps %%ymm15, %%ymm13, %%ymm5\n"
        "vsubps %%ymm15, %%ymm13, %%ymm7\n"
        "vaddps %%ymm4, %%ymm0, %%ymm8\n"
        "vsubps %%ymm4, %%ymm0, %%ymm12\n"
        "vaddps %%ymm5, %%ymm1, %%ymm9\n"
        "vsubps %%ymm5, %%ymm1, %%ymm13\n"
        "vaddps %%ymm6, %%ymm2, %%ymm10\n"
        "vsubps %%ymm6, %%ymm2, %%ymm14\n"
        "vaddps %%ymm7, %%ymm3, %%ymm11\n"
        "vsubps %%ymm7, %%ymm3, %%ymm15\n"
        "vmovups %%ymm8, (%0)\n"
        "vmovups %%ymm9, (%1)\n"
        "vmovups %%ymm10, (%2)\n"
        "vmovups %%ymm11, (%3)\n"
        "vmovups %%ymm12, (%4)\n"
        "vmovups %%ymm13, (%5)\n"
        "vmovups %%ymm14, (%6)\n"
        "vmovups %%ymm15, (%7)\n"
        :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 15) {
    /* 2^15 points = eight independent 2^12 sub-transforms followed by one
     * radix-8 combine at stride 4096 (butterfly levels 13-15). */
    helper_float_20_recursive(buf + 0, 12);
    helper_float_20_recursive(buf + 4096, 12);
    helper_float_20_recursive(buf + 8192, 12);
    helper_float_20_recursive(buf + 12288, 12);
    helper_float_20_recursive(buf + 16384, 12);
    helper_float_20_recursive(buf + 20480, 12);
    helper_float_20_recursive(buf + 24576, 12);
    helper_float_20_recursive(buf + 28672, 12);
    for (int j = 0; j < 32768; j += 32768) {
      for (int k = 0; k < 4096; k += 8) {
        __asm__ volatile (
        "vmovups (%0), %%ymm0\n"
        "vmovups (%1), %%ymm1\n"
        "vmovups (%2), %%ymm2\n"
        "vmovups (%3), %%ymm3\n"
        "vmovups (%4), %%ymm4\n"
        "vmovups (%5), %%ymm5\n"
        "vmovups (%6), %%ymm6\n"
        "vmovups (%7), %%ymm7\n"
        "vaddps %%ymm1, %%ymm0, %%ymm8\n"
        "vsubps %%ymm1, %%ymm0, %%ymm9\n"
        "vaddps %%ymm3, %%ymm2, %%ymm10\n"
        "vsubps %%ymm3, %%ymm2, %%ymm11\n"
        "vaddps %%ymm5, %%ymm4, %%ymm12\n"
        "vsubps %%ymm5, %%ymm4, %%ymm13\n"
        "vaddps %%ymm7, %%ymm6, %%ymm14\n"
        "vsubps %%ymm7, %%ymm6, %%ymm15\n"
        "vaddps %%ymm10, %%ymm8, %%ymm0\n"
        "vsubps %%ymm10, %%ymm8, %%ymm2\n"
        "vaddps %%ymm11, %%ymm9, %%ymm1\n"
        "vsubps %%ymm11, %%ymm9, %%ymm3\n"
        "vaddps %%ymm14, %%ymm12, %%ymm4\n"
        "vsubps %%ymm14, %%ymm12, %%ymm6\n"
        "vaddps %%ymm15, %%ymm13, %%ymm5\n"
        "vsubps %%ymm15, %%ymm13, %%ymm7\n"
        "vaddps %%ymm4, %%ymm0, %%ymm8\n"
        "vsubps %%ymm4, %%ymm0, %%ymm12\n"
        "vaddps %%ymm5, %%ymm1, %%ymm9\n"
        "vsubps %%ymm5, %%ymm1, %%ymm13\n"
        "vaddps %%ymm6, %%ymm2, %%ymm10\n"
        "vsubps %%ymm6, %%ymm2, %%ymm14\n"
        "vaddps %%ymm7, %%ymm3, %%ymm11\n"
        "vsubps %%ymm7, %%ymm3, %%ymm15\n"
        "vmovups %%ymm8, (%0)\n"
        "vmovups %%ymm9, (%1)\n"
        "vmovups %%ymm10, (%2)\n"
        "vmovups %%ymm11, (%3)\n"
        "vmovups %%ymm12, (%4)\n"
        "vmovups %%ymm13, (%5)\n"
        "vmovups %%ymm14, (%6)\n"
        "vmovups %%ymm15, (%7)\n"
        :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 18) {
    /* 2^18 points = eight 2^15 sub-transforms + radix-8 combine at
     * stride 32768 (levels 16-18). */
    helper_float_20_recursive(buf + 0, 15);
    helper_float_20_recursive(buf + 32768, 15);
    helper_float_20_recursive(buf + 65536, 15);
    helper_float_20_recursive(buf + 98304, 15);
    helper_float_20_recursive(buf + 131072, 15);
    helper_float_20_recursive(buf + 163840, 15);
    helper_float_20_recursive(buf + 196608, 15);
    helper_float_20_recursive(buf + 229376, 15);
    for (int j = 0; j < 262144; j += 262144) {
      for (int k = 0; k < 32768; k += 8) {
        __asm__ volatile (
        "vmovups (%0), %%ymm0\n"
        "vmovups (%1), %%ymm1\n"
        "vmovups (%2), %%ymm2\n"
        "vmovups (%3), %%ymm3\n"
        "vmovups (%4), %%ymm4\n"
        "vmovups (%5), %%ymm5\n"
        "vmovups (%6), %%ymm6\n"
        "vmovups (%7), %%ymm7\n"
        "vaddps %%ymm1, %%ymm0, %%ymm8\n"
        "vsubps %%ymm1, %%ymm0, %%ymm9\n"
        "vaddps %%ymm3, %%ymm2, %%ymm10\n"
        "vsubps %%ymm3, %%ymm2, %%ymm11\n"
        "vaddps %%ymm5, %%ymm4, %%ymm12\n"
        "vsubps %%ymm5, %%ymm4, %%ymm13\n"
        "vaddps %%ymm7, %%ymm6, %%ymm14\n"
        "vsubps %%ymm7, %%ymm6, %%ymm15\n"
        "vaddps %%ymm10, %%ymm8, %%ymm0\n"
        "vsubps %%ymm10, %%ymm8, %%ymm2\n"
        "vaddps %%ymm11, %%ymm9, %%ymm1\n"
        "vsubps %%ymm11, %%ymm9, %%ymm3\n"
        "vaddps %%ymm14, %%ymm12, %%ymm4\n"
        "vsubps %%ymm14, %%ymm12, %%ymm6\n"
        "vaddps %%ymm15, %%ymm13, %%ymm5\n"
        "vsubps %%ymm15, %%ymm13, %%ymm7\n"
        "vaddps %%ymm4, %%ymm0, %%ymm8\n"
        "vsubps %%ymm4, %%ymm0, %%ymm12\n"
        "vaddps %%ymm5, %%ymm1, %%ymm9\n"
        "vsubps %%ymm5, %%ymm1, %%ymm13\n"
        "vaddps %%ymm6, %%ymm2, %%ymm10\n"
        "vsubps %%ymm6, %%ymm2, %%ymm14\n"
        "vaddps %%ymm7, %%ymm3, %%ymm11\n"
        "vsubps %%ymm7, %%ymm3, %%ymm15\n"
        "vmovups %%ymm8, (%0)\n"
        "vmovups %%ymm9, (%1)\n"
        "vmovups %%ymm10, (%2)\n"
        "vmovups %%ymm11, (%3)\n"
        "vmovups %%ymm12, (%4)\n"
        "vmovups %%ymm13, (%5)\n"
        "vmovups %%ymm14, (%6)\n"
        "vmovups %%ymm15, (%7)\n"
        :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 20) {
    /* Top level: 2^20 points = four 2^18 sub-transforms + one radix-4
     * combine at stride 262144 (levels 19-20).  Results of the two
     * butterfly levels land back in ymm0..ymm3.  The clobber list still
     * conservatively names all sixteen YMM registers. */
    helper_float_20_recursive(buf + 0, 18);
    helper_float_20_recursive(buf + 262144, 18);
    helper_float_20_recursive(buf + 524288, 18);
    helper_float_20_recursive(buf + 786432, 18);
    for (int j = 0; j < 1048576; j += 1048576) {
      for (int k = 0; k < 262144; k += 8) {
        __asm__ volatile (
        "vmovups (%0), %%ymm0\n"
        "vmovups (%1), %%ymm1\n"
        "vmovups (%2), %%ymm2\n"
        "vmovups (%3), %%ymm3\n"
        "vaddps %%ymm1, %%ymm0, %%ymm8\n"
        "vsubps %%ymm1, %%ymm0, %%ymm9\n"
        "vaddps %%ymm3, %%ymm2, %%ymm10\n"
        "vsubps %%ymm3, %%ymm2, %%ymm11\n"
        "vaddps %%ymm10, %%ymm8, %%ymm0\n"
        "vsubps %%ymm10, %%ymm8, %%ymm2\n"
        "vaddps %%ymm11, %%ymm9, %%ymm1\n"
        "vsubps %%ymm11, %%ymm9, %%ymm3\n"
        "vmovups %%ymm0, (%0)\n"
        "vmovups %%ymm1, (%1)\n"
        "vmovups %%ymm2, (%2)\n"
        "vmovups %%ymm3, (%3)\n"
        :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
}
void helper_float_20(float *buf);
/*
 * Public entry point: in-place, unnormalized fast Hadamard transform of
 * 2^20 = 1048576 floats.
 *
 * buf - array of 2^20 floats; the transform uses unaligned 32-byte AVX
 *       loads/stores (vmovups), so no special alignment is required.
 *
 * Simply starts the recursive implementation at the full depth.
 */
void helper_float_20(float *buf) {
  helper_float_20_recursive(buf, 20);
}
/* Forward declaration: the function below calls itself. */
void helper_float_21_recursive(float *buf, int depth);
/*
 * One level of an in-place, unnormalized fast Walsh-Hadamard-style
 * transform over 2^depth floats (every stage is the butterfly
 * u+v / u-v, as in the scalar helpers at the top of this file).
 *
 * Structure (machine-generated, radix-8 decimation):
 *  - depth == 9 is the base case: 512 floats handled entirely in AVX
 *    inline assembly (strides 1..256).
 *  - depth == 12/15/18/21 recurse into 8 sub-transforms of depth-3,
 *    then run one radix-8 combine pass across the 8 sub-blocks.
 *
 * NOTE(review): any depth not in {9, 12, 15, 18, 21} silently does
 * nothing (no default branch) -- the only intended top-level call is
 * helper_float_21(buf), i.e. depth == 21 with 1 << 21 floats in buf.
 */
void helper_float_21_recursive(float *buf, int depth) {
if (depth == 9) {
/* Base case: each j-iteration transforms one 64-float chunk held in
 * ymm0..ymm7 (strides 1, 2, 4 inside registers, then 8, 16, 32 across
 * registers); the inner k-loop runs exactly once. */
for (int j = 0; j < 512; j += 64) {
for (int k = 0; k < 8; k += 8) {
__asm__ volatile (
/* Load 8 vectors of 8 floats each (unaligned loads). */
"vmovups (%0), %%ymm0\n"
"vmovups (%1), %%ymm1\n"
"vmovups (%2), %%ymm2\n"
"vmovups (%3), %%ymm3\n"
"vmovups (%4), %%ymm4\n"
"vmovups (%5), %%ymm5\n"
"vmovups (%6), %%ymm6\n"
"vmovups (%7), %%ymm7\n"
/* Stride-1 butterflies inside each register: duplicate even lanes
 * ($160), duplicate odd lanes ($245), negate the odd copy, and
 * vaddsubps produces (even+odd, even-odd) per pair. */
"vpermilps $160, %%ymm0, %%ymm8\n"
"vpermilps $245, %%ymm0, %%ymm9\n"
"vxorps %%ymm10, %%ymm10, %%ymm10\n"
"vsubps %%ymm9, %%ymm10, %%ymm11\n"
"vaddsubps %%ymm11, %%ymm8, %%ymm0\n"
"vpermilps $160, %%ymm1, %%ymm8\n"
"vpermilps $245, %%ymm1, %%ymm9\n"
"vxorps %%ymm10, %%ymm10, %%ymm10\n"
"vsubps %%ymm9, %%ymm10, %%ymm11\n"
"vaddsubps %%ymm11, %%ymm8, %%ymm1\n"
"vpermilps $160, %%ymm2, %%ymm8\n"
"vpermilps $245, %%ymm2, %%ymm9\n"
"vxorps %%ymm10, %%ymm10, %%ymm10\n"
"vsubps %%ymm9, %%ymm10, %%ymm11\n"
"vaddsubps %%ymm11, %%ymm8, %%ymm2\n"
"vpermilps $160, %%ymm3, %%ymm8\n"
"vpermilps $245, %%ymm3, %%ymm9\n"
"vxorps %%ymm10, %%ymm10, %%ymm10\n"
"vsubps %%ymm9, %%ymm10, %%ymm11\n"
"vaddsubps %%ymm11, %%ymm8, %%ymm3\n"
"vpermilps $160, %%ymm4, %%ymm8\n"
"vpermilps $245, %%ymm4, %%ymm9\n"
"vxorps %%ymm10, %%ymm10, %%ymm10\n"
"vsubps %%ymm9, %%ymm10, %%ymm11\n"
"vaddsubps %%ymm11, %%ymm8, %%ymm4\n"
"vpermilps $160, %%ymm5, %%ymm8\n"
"vpermilps $245, %%ymm5, %%ymm9\n"
"vxorps %%ymm10, %%ymm10, %%ymm10\n"
"vsubps %%ymm9, %%ymm10, %%ymm11\n"
"vaddsubps %%ymm11, %%ymm8, %%ymm5\n"
"vpermilps $160, %%ymm6, %%ymm8\n"
"vpermilps $245, %%ymm6, %%ymm9\n"
"vxorps %%ymm10, %%ymm10, %%ymm10\n"
"vsubps %%ymm9, %%ymm10, %%ymm11\n"
"vaddsubps %%ymm11, %%ymm8, %%ymm6\n"
"vpermilps $160, %%ymm7, %%ymm8\n"
"vpermilps $245, %%ymm7, %%ymm9\n"
"vxorps %%ymm10, %%ymm10, %%ymm10\n"
"vsubps %%ymm9, %%ymm10, %%ymm11\n"
"vaddsubps %%ymm11, %%ymm8, %%ymm7\n"
/* Stride-2 butterflies inside each register: low pair dup ($68),
 * high pair dup ($238); vblendps $204 merges (+high, -high) so a
 * single vaddps yields (low+high, low-high) per 4-lane group. */
"vpermilps $68, %%ymm0, %%ymm8\n"
"vpermilps $238, %%ymm0, %%ymm9\n"
"vxorps %%ymm10, %%ymm10, %%ymm10\n"
"vsubps %%ymm9, %%ymm10, %%ymm11\n"
"vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
"vaddps %%ymm8, %%ymm12, %%ymm0\n"
"vpermilps $68, %%ymm1, %%ymm8\n"
"vpermilps $238, %%ymm1, %%ymm9\n"
"vxorps %%ymm10, %%ymm10, %%ymm10\n"
"vsubps %%ymm9, %%ymm10, %%ymm11\n"
"vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
"vaddps %%ymm8, %%ymm12, %%ymm1\n"
"vpermilps $68, %%ymm2, %%ymm8\n"
"vpermilps $238, %%ymm2, %%ymm9\n"
"vxorps %%ymm10, %%ymm10, %%ymm10\n"
"vsubps %%ymm9, %%ymm10, %%ymm11\n"
"vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
"vaddps %%ymm8, %%ymm12, %%ymm2\n"
"vpermilps $68, %%ymm3, %%ymm8\n"
"vpermilps $238, %%ymm3, %%ymm9\n"
"vxorps %%ymm10, %%ymm10, %%ymm10\n"
"vsubps %%ymm9, %%ymm10, %%ymm11\n"
"vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
"vaddps %%ymm8, %%ymm12, %%ymm3\n"
"vpermilps $68, %%ymm4, %%ymm8\n"
"vpermilps $238, %%ymm4, %%ymm9\n"
"vxorps %%ymm10, %%ymm10, %%ymm10\n"
"vsubps %%ymm9, %%ymm10, %%ymm11\n"
"vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
"vaddps %%ymm8, %%ymm12, %%ymm4\n"
"vpermilps $68, %%ymm5, %%ymm8\n"
"vpermilps $238, %%ymm5, %%ymm9\n"
"vxorps %%ymm10, %%ymm10, %%ymm10\n"
"vsubps %%ymm9, %%ymm10, %%ymm11\n"
"vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
"vaddps %%ymm8, %%ymm12, %%ymm5\n"
"vpermilps $68, %%ymm6, %%ymm8\n"
"vpermilps $238, %%ymm6, %%ymm9\n"
"vxorps %%ymm10, %%ymm10, %%ymm10\n"
"vsubps %%ymm9, %%ymm10, %%ymm11\n"
"vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
"vaddps %%ymm8, %%ymm12, %%ymm6\n"
"vpermilps $68, %%ymm7, %%ymm8\n"
"vpermilps $238, %%ymm7, %%ymm9\n"
"vxorps %%ymm10, %%ymm10, %%ymm10\n"
"vsubps %%ymm9, %%ymm10, %%ymm11\n"
"vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
"vaddps %%ymm8, %%ymm12, %%ymm7\n"
/* Stride-4 butterflies across the two 128-bit halves of each
 * register: vperm2f128 $0 broadcasts the low half, $49 pairs the
 * high half with its negation, then one vaddps combines them. */
"vxorps %%ymm8, %%ymm8, %%ymm8\n"
"vsubps %%ymm0, %%ymm8, %%ymm9\n"
"vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n"
"vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n"
"vaddps %%ymm10, %%ymm11, %%ymm0\n"
"vxorps %%ymm8, %%ymm8, %%ymm8\n"
"vsubps %%ymm1, %%ymm8, %%ymm9\n"
"vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n"
"vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n"
"vaddps %%ymm10, %%ymm11, %%ymm1\n"
"vxorps %%ymm8, %%ymm8, %%ymm8\n"
"vsubps %%ymm2, %%ymm8, %%ymm9\n"
"vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n"
"vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n"
"vaddps %%ymm10, %%ymm11, %%ymm2\n"
"vxorps %%ymm8, %%ymm8, %%ymm8\n"
"vsubps %%ymm3, %%ymm8, %%ymm9\n"
"vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n"
"vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n"
"vaddps %%ymm10, %%ymm11, %%ymm3\n"
"vxorps %%ymm8, %%ymm8, %%ymm8\n"
"vsubps %%ymm4, %%ymm8, %%ymm9\n"
"vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n"
"vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n"
"vaddps %%ymm10, %%ymm11, %%ymm4\n"
"vxorps %%ymm8, %%ymm8, %%ymm8\n"
"vsubps %%ymm5, %%ymm8, %%ymm9\n"
"vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n"
"vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n"
"vaddps %%ymm10, %%ymm11, %%ymm5\n"
"vxorps %%ymm8, %%ymm8, %%ymm8\n"
"vsubps %%ymm6, %%ymm8, %%ymm9\n"
"vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n"
"vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n"
"vaddps %%ymm10, %%ymm11, %%ymm6\n"
"vxorps %%ymm8, %%ymm8, %%ymm8\n"
"vsubps %%ymm7, %%ymm8, %%ymm9\n"
"vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n"
"vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n"
"vaddps %%ymm10, %%ymm11, %%ymm7\n"
/* Radix-8 stage across the 8 registers (strides 8, 16, 32 floats):
 * three rounds of add/sub butterflies; results end in ymm8..ymm15. */
"vaddps %%ymm1, %%ymm0, %%ymm8\n"
"vsubps %%ymm1, %%ymm0, %%ymm9\n"
"vaddps %%ymm3, %%ymm2, %%ymm10\n"
"vsubps %%ymm3, %%ymm2, %%ymm11\n"
"vaddps %%ymm5, %%ymm4, %%ymm12\n"
"vsubps %%ymm5, %%ymm4, %%ymm13\n"
"vaddps %%ymm7, %%ymm6, %%ymm14\n"
"vsubps %%ymm7, %%ymm6, %%ymm15\n"
"vaddps %%ymm10, %%ymm8, %%ymm0\n"
"vsubps %%ymm10, %%ymm8, %%ymm2\n"
"vaddps %%ymm11, %%ymm9, %%ymm1\n"
"vsubps %%ymm11, %%ymm9, %%ymm3\n"
"vaddps %%ymm14, %%ymm12, %%ymm4\n"
"vsubps %%ymm14, %%ymm12, %%ymm6\n"
"vaddps %%ymm15, %%ymm13, %%ymm5\n"
"vsubps %%ymm15, %%ymm13, %%ymm7\n"
"vaddps %%ymm4, %%ymm0, %%ymm8\n"
"vsubps %%ymm4, %%ymm0, %%ymm12\n"
"vaddps %%ymm5, %%ymm1, %%ymm9\n"
"vsubps %%ymm5, %%ymm1, %%ymm13\n"
"vaddps %%ymm6, %%ymm2, %%ymm10\n"
"vsubps %%ymm6, %%ymm2, %%ymm14\n"
"vaddps %%ymm7, %%ymm3, %%ymm11\n"
"vsubps %%ymm7, %%ymm3, %%ymm15\n"
/* Store back in place. */
"vmovups %%ymm8, (%0)\n"
"vmovups %%ymm9, (%1)\n"
"vmovups %%ymm10, (%2)\n"
"vmovups %%ymm11, (%3)\n"
"vmovups %%ymm12, (%4)\n"
"vmovups %%ymm13, (%5)\n"
"vmovups %%ymm14, (%6)\n"
"vmovups %%ymm15, (%7)\n"
:: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
);
}
}
/* Combine the eight 64-float chunks (strides 64, 128, 256): the same
 * three-round radix-8 butterfly, now with memory operands 64 apart. */
for (int j = 0; j < 512; j += 512) {
for (int k = 0; k < 64; k += 8) {
__asm__ volatile (
"vmovups (%0), %%ymm0\n"
"vmovups (%1), %%ymm1\n"
"vmovups (%2), %%ymm2\n"
"vmovups (%3), %%ymm3\n"
"vmovups (%4), %%ymm4\n"
"vmovups (%5), %%ymm5\n"
"vmovups (%6), %%ymm6\n"
"vmovups (%7), %%ymm7\n"
"vaddps %%ymm1, %%ymm0, %%ymm8\n"
"vsubps %%ymm1, %%ymm0, %%ymm9\n"
"vaddps %%ymm3, %%ymm2, %%ymm10\n"
"vsubps %%ymm3, %%ymm2, %%ymm11\n"
"vaddps %%ymm5, %%ymm4, %%ymm12\n"
"vsubps %%ymm5, %%ymm4, %%ymm13\n"
"vaddps %%ymm7, %%ymm6, %%ymm14\n"
"vsubps %%ymm7, %%ymm6, %%ymm15\n"
"vaddps %%ymm10, %%ymm8, %%ymm0\n"
"vsubps %%ymm10, %%ymm8, %%ymm2\n"
"vaddps %%ymm11, %%ymm9, %%ymm1\n"
"vsubps %%ymm11, %%ymm9, %%ymm3\n"
"vaddps %%ymm14, %%ymm12, %%ymm4\n"
"vsubps %%ymm14, %%ymm12, %%ymm6\n"
"vaddps %%ymm15, %%ymm13, %%ymm5\n"
"vsubps %%ymm15, %%ymm13, %%ymm7\n"
"vaddps %%ymm4, %%ymm0, %%ymm8\n"
"vsubps %%ymm4, %%ymm0, %%ymm12\n"
"vaddps %%ymm5, %%ymm1, %%ymm9\n"
"vsubps %%ymm5, %%ymm1, %%ymm13\n"
"vaddps %%ymm6, %%ymm2, %%ymm10\n"
"vsubps %%ymm6, %%ymm2, %%ymm14\n"
"vaddps %%ymm7, %%ymm3, %%ymm11\n"
"vsubps %%ymm7, %%ymm3, %%ymm15\n"
"vmovups %%ymm8, (%0)\n"
"vmovups %%ymm9, (%1)\n"
"vmovups %%ymm10, (%2)\n"
"vmovups %%ymm11, (%3)\n"
"vmovups %%ymm12, (%4)\n"
"vmovups %%ymm13, (%5)\n"
"vmovups %%ymm14, (%6)\n"
"vmovups %%ymm15, (%7)\n"
:: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
);
}
}
return;
}
/* depth 12 = 4096 floats: eight depth-9 sub-transforms (512 apart),
 * then one radix-8 combine pass at strides 512/1024/2048. */
if (depth == 12) {
helper_float_21_recursive(buf + 0, 9);
helper_float_21_recursive(buf + 512, 9);
helper_float_21_recursive(buf + 1024, 9);
helper_float_21_recursive(buf + 1536, 9);
helper_float_21_recursive(buf + 2048, 9);
helper_float_21_recursive(buf + 2560, 9);
helper_float_21_recursive(buf + 3072, 9);
helper_float_21_recursive(buf + 3584, 9);
for (int j = 0; j < 4096; j += 4096) {
for (int k = 0; k < 512; k += 8) {
__asm__ volatile (
"vmovups (%0), %%ymm0\n"
"vmovups (%1), %%ymm1\n"
"vmovups (%2), %%ymm2\n"
"vmovups (%3), %%ymm3\n"
"vmovups (%4), %%ymm4\n"
"vmovups (%5), %%ymm5\n"
"vmovups (%6), %%ymm6\n"
"vmovups (%7), %%ymm7\n"
"vaddps %%ymm1, %%ymm0, %%ymm8\n"
"vsubps %%ymm1, %%ymm0, %%ymm9\n"
"vaddps %%ymm3, %%ymm2, %%ymm10\n"
"vsubps %%ymm3, %%ymm2, %%ymm11\n"
"vaddps %%ymm5, %%ymm4, %%ymm12\n"
"vsubps %%ymm5, %%ymm4, %%ymm13\n"
"vaddps %%ymm7, %%ymm6, %%ymm14\n"
"vsubps %%ymm7, %%ymm6, %%ymm15\n"
"vaddps %%ymm10, %%ymm8, %%ymm0\n"
"vsubps %%ymm10, %%ymm8, %%ymm2\n"
"vaddps %%ymm11, %%ymm9, %%ymm1\n"
"vsubps %%ymm11, %%ymm9, %%ymm3\n"
"vaddps %%ymm14, %%ymm12, %%ymm4\n"
"vsubps %%ymm14, %%ymm12, %%ymm6\n"
"vaddps %%ymm15, %%ymm13, %%ymm5\n"
"vsubps %%ymm15, %%ymm13, %%ymm7\n"
"vaddps %%ymm4, %%ymm0, %%ymm8\n"
"vsubps %%ymm4, %%ymm0, %%ymm12\n"
"vaddps %%ymm5, %%ymm1, %%ymm9\n"
"vsubps %%ymm5, %%ymm1, %%ymm13\n"
"vaddps %%ymm6, %%ymm2, %%ymm10\n"
"vsubps %%ymm6, %%ymm2, %%ymm14\n"
"vaddps %%ymm7, %%ymm3, %%ymm11\n"
"vsubps %%ymm7, %%ymm3, %%ymm15\n"
"vmovups %%ymm8, (%0)\n"
"vmovups %%ymm9, (%1)\n"
"vmovups %%ymm10, (%2)\n"
"vmovups %%ymm11, (%3)\n"
"vmovups %%ymm12, (%4)\n"
"vmovups %%ymm13, (%5)\n"
"vmovups %%ymm14, (%6)\n"
"vmovups %%ymm15, (%7)\n"
:: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
);
}
}
return;
}
/* depth 15 = 32768 floats: eight depth-12 sub-transforms (4096 apart),
 * then one radix-8 combine pass at strides 4096/8192/16384. */
if (depth == 15) {
helper_float_21_recursive(buf + 0, 12);
helper_float_21_recursive(buf + 4096, 12);
helper_float_21_recursive(buf + 8192, 12);
helper_float_21_recursive(buf + 12288, 12);
helper_float_21_recursive(buf + 16384, 12);
helper_float_21_recursive(buf + 20480, 12);
helper_float_21_recursive(buf + 24576, 12);
helper_float_21_recursive(buf + 28672, 12);
for (int j = 0; j < 32768; j += 32768) {
for (int k = 0; k < 4096; k += 8) {
__asm__ volatile (
"vmovups (%0), %%ymm0\n"
"vmovups (%1), %%ymm1\n"
"vmovups (%2), %%ymm2\n"
"vmovups (%3), %%ymm3\n"
"vmovups (%4), %%ymm4\n"
"vmovups (%5), %%ymm5\n"
"vmovups (%6), %%ymm6\n"
"vmovups (%7), %%ymm7\n"
"vaddps %%ymm1, %%ymm0, %%ymm8\n"
"vsubps %%ymm1, %%ymm0, %%ymm9\n"
"vaddps %%ymm3, %%ymm2, %%ymm10\n"
"vsubps %%ymm3, %%ymm2, %%ymm11\n"
"vaddps %%ymm5, %%ymm4, %%ymm12\n"
"vsubps %%ymm5, %%ymm4, %%ymm13\n"
"vaddps %%ymm7, %%ymm6, %%ymm14\n"
"vsubps %%ymm7, %%ymm6, %%ymm15\n"
"vaddps %%ymm10, %%ymm8, %%ymm0\n"
"vsubps %%ymm10, %%ymm8, %%ymm2\n"
"vaddps %%ymm11, %%ymm9, %%ymm1\n"
"vsubps %%ymm11, %%ymm9, %%ymm3\n"
"vaddps %%ymm14, %%ymm12, %%ymm4\n"
"vsubps %%ymm14, %%ymm12, %%ymm6\n"
"vaddps %%ymm15, %%ymm13, %%ymm5\n"
"vsubps %%ymm15, %%ymm13, %%ymm7\n"
"vaddps %%ymm4, %%ymm0, %%ymm8\n"
"vsubps %%ymm4, %%ymm0, %%ymm12\n"
"vaddps %%ymm5, %%ymm1, %%ymm9\n"
"vsubps %%ymm5, %%ymm1, %%ymm13\n"
"vaddps %%ymm6, %%ymm2, %%ymm10\n"
"vsubps %%ymm6, %%ymm2, %%ymm14\n"
"vaddps %%ymm7, %%ymm3, %%ymm11\n"
"vsubps %%ymm7, %%ymm3, %%ymm15\n"
"vmovups %%ymm8, (%0)\n"
"vmovups %%ymm9, (%1)\n"
"vmovups %%ymm10, (%2)\n"
"vmovups %%ymm11, (%3)\n"
"vmovups %%ymm12, (%4)\n"
"vmovups %%ymm13, (%5)\n"
"vmovups %%ymm14, (%6)\n"
"vmovups %%ymm15, (%7)\n"
:: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
);
}
}
return;
}
/* depth 18 = 262144 floats: eight depth-15 sub-transforms (32768 apart),
 * then one radix-8 combine pass at strides 32768/65536/131072. */
if (depth == 18) {
helper_float_21_recursive(buf + 0, 15);
helper_float_21_recursive(buf + 32768, 15);
helper_float_21_recursive(buf + 65536, 15);
helper_float_21_recursive(buf + 98304, 15);
helper_float_21_recursive(buf + 131072, 15);
helper_float_21_recursive(buf + 163840, 15);
helper_float_21_recursive(buf + 196608, 15);
helper_float_21_recursive(buf + 229376, 15);
for (int j = 0; j < 262144; j += 262144) {
for (int k = 0; k < 32768; k += 8) {
__asm__ volatile (
"vmovups (%0), %%ymm0\n"
"vmovups (%1), %%ymm1\n"
"vmovups (%2), %%ymm2\n"
"vmovups (%3), %%ymm3\n"
"vmovups (%4), %%ymm4\n"
"vmovups (%5), %%ymm5\n"
"vmovups (%6), %%ymm6\n"
"vmovups (%7), %%ymm7\n"
"vaddps %%ymm1, %%ymm0, %%ymm8\n"
"vsubps %%ymm1, %%ymm0, %%ymm9\n"
"vaddps %%ymm3, %%ymm2, %%ymm10\n"
"vsubps %%ymm3, %%ymm2, %%ymm11\n"
"vaddps %%ymm5, %%ymm4, %%ymm12\n"
"vsubps %%ymm5, %%ymm4, %%ymm13\n"
"vaddps %%ymm7, %%ymm6, %%ymm14\n"
"vsubps %%ymm7, %%ymm6, %%ymm15\n"
"vaddps %%ymm10, %%ymm8, %%ymm0\n"
"vsubps %%ymm10, %%ymm8, %%ymm2\n"
"vaddps %%ymm11, %%ymm9, %%ymm1\n"
"vsubps %%ymm11, %%ymm9, %%ymm3\n"
"vaddps %%ymm14, %%ymm12, %%ymm4\n"
"vsubps %%ymm14, %%ymm12, %%ymm6\n"
"vaddps %%ymm15, %%ymm13, %%ymm5\n"
"vsubps %%ymm15, %%ymm13, %%ymm7\n"
"vaddps %%ymm4, %%ymm0, %%ymm8\n"
"vsubps %%ymm4, %%ymm0, %%ymm12\n"
"vaddps %%ymm5, %%ymm1, %%ymm9\n"
"vsubps %%ymm5, %%ymm1, %%ymm13\n"
"vaddps %%ymm6, %%ymm2, %%ymm10\n"
"vsubps %%ymm6, %%ymm2, %%ymm14\n"
"vaddps %%ymm7, %%ymm3, %%ymm11\n"
"vsubps %%ymm7, %%ymm3, %%ymm15\n"
"vmovups %%ymm8, (%0)\n"
"vmovups %%ymm9, (%1)\n"
"vmovups %%ymm10, (%2)\n"
"vmovups %%ymm11, (%3)\n"
"vmovups %%ymm12, (%4)\n"
"vmovups %%ymm13, (%5)\n"
"vmovups %%ymm14, (%6)\n"
"vmovups %%ymm15, (%7)\n"
:: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
);
}
}
return;
}
/* depth 21 = 2097152 floats (top level): eight depth-18 sub-transforms
 * (262144 apart), then the final radix-8 combine pass. */
if (depth == 21) {
helper_float_21_recursive(buf + 0, 18);
helper_float_21_recursive(buf + 262144, 18);
helper_float_21_recursive(buf + 524288, 18);
helper_float_21_recursive(buf + 786432, 18);
helper_float_21_recursive(buf + 1048576, 18);
helper_float_21_recursive(buf + 1310720, 18);
helper_float_21_recursive(buf + 1572864, 18);
helper_float_21_recursive(buf + 1835008, 18);
for (int j = 0; j < 2097152; j += 2097152) {
for (int k = 0; k < 262144; k += 8) {
__asm__ volatile (
"vmovups (%0), %%ymm0\n"
"vmovups (%1), %%ymm1\n"
"vmovups (%2), %%ymm2\n"
"vmovups (%3), %%ymm3\n"
"vmovups (%4), %%ymm4\n"
"vmovups (%5), %%ymm5\n"
"vmovups (%6), %%ymm6\n"
"vmovups (%7), %%ymm7\n"
"vaddps %%ymm1, %%ymm0, %%ymm8\n"
"vsubps %%ymm1, %%ymm0, %%ymm9\n"
"vaddps %%ymm3, %%ymm2, %%ymm10\n"
"vsubps %%ymm3, %%ymm2, %%ymm11\n"
"vaddps %%ymm5, %%ymm4, %%ymm12\n"
"vsubps %%ymm5, %%ymm4, %%ymm13\n"
"vaddps %%ymm7, %%ymm6, %%ymm14\n"
"vsubps %%ymm7, %%ymm6, %%ymm15\n"
"vaddps %%ymm10, %%ymm8, %%ymm0\n"
"vsubps %%ymm10, %%ymm8, %%ymm2\n"
"vaddps %%ymm11, %%ymm9, %%ymm1\n"
"vsubps %%ymm11, %%ymm9, %%ymm3\n"
"vaddps %%ymm14, %%ymm12, %%ymm4\n"
"vsubps %%ymm14, %%ymm12, %%ymm6\n"
"vaddps %%ymm15, %%ymm13, %%ymm5\n"
"vsubps %%ymm15, %%ymm13, %%ymm7\n"
"vaddps %%ymm4, %%ymm0, %%ymm8\n"
"vsubps %%ymm4, %%ymm0, %%ymm12\n"
"vaddps %%ymm5, %%ymm1, %%ymm9\n"
"vsubps %%ymm5, %%ymm1, %%ymm13\n"
"vaddps %%ymm6, %%ymm2, %%ymm10\n"
"vsubps %%ymm6, %%ymm2, %%ymm14\n"
"vaddps %%ymm7, %%ymm3, %%ymm11\n"
"vsubps %%ymm7, %%ymm3, %%ymm15\n"
"vmovups %%ymm8, (%0)\n"
"vmovups %%ymm9, (%1)\n"
"vmovups %%ymm10, (%2)\n"
"vmovups %%ymm11, (%3)\n"
"vmovups %%ymm12, (%4)\n"
"vmovups %%ymm13, (%5)\n"
"vmovups %%ymm14, (%6)\n"
"vmovups %%ymm15, (%7)\n"
:: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1310720), "r"(buf + j + k + 1572864), "r"(buf + j + k + 1835008) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
);
}
}
return;
}
/* Unsupported depth: intentionally (by the generator) a no-op. */
}
void helper_float_21(float *buf);
/*
 * Apply the in-place transform to a buffer of 2^21 floats by delegating
 * to the recursive driver at its top depth.
 * NOTE(review): presumably `buf` must hold 1 << 21 contiguous floats,
 * 32-byte-alignment not required (driver uses unaligned vmovups) --
 * confirm against callers.
 */
void helper_float_21(float *buf) {
enum { LOG2_N = 21 }; /* transform length is 2^LOG2_N elements */
helper_float_21_recursive(buf, LOG2_N);
}
5109 void helper_float_22_recursive(float *buf, int depth);
helper_float_22_recursive(float * buf,int depth)5110 void helper_float_22_recursive(float *buf, int depth) {
5111 if (depth == 11) {
5112 for (int j = 0; j < 2048; j += 64) {
5113 for (int k = 0; k < 8; k += 8) {
5114 __asm__ volatile (
5115 "vmovups (%0), %%ymm0\n"
5116 "vmovups (%1), %%ymm1\n"
5117 "vmovups (%2), %%ymm2\n"
5118 "vmovups (%3), %%ymm3\n"
5119 "vmovups (%4), %%ymm4\n"
5120 "vmovups (%5), %%ymm5\n"
5121 "vmovups (%6), %%ymm6\n"
5122 "vmovups (%7), %%ymm7\n"
5123 "vpermilps $160, %%ymm0, %%ymm8\n"
5124 "vpermilps $245, %%ymm0, %%ymm9\n"
5125 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
5126 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
5127 "vaddsubps %%ymm11, %%ymm8, %%ymm0\n"
5128 "vpermilps $160, %%ymm1, %%ymm8\n"
5129 "vpermilps $245, %%ymm1, %%ymm9\n"
5130 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
5131 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
5132 "vaddsubps %%ymm11, %%ymm8, %%ymm1\n"
5133 "vpermilps $160, %%ymm2, %%ymm8\n"
5134 "vpermilps $245, %%ymm2, %%ymm9\n"
5135 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
5136 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
5137 "vaddsubps %%ymm11, %%ymm8, %%ymm2\n"
5138 "vpermilps $160, %%ymm3, %%ymm8\n"
5139 "vpermilps $245, %%ymm3, %%ymm9\n"
5140 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
5141 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
5142 "vaddsubps %%ymm11, %%ymm8, %%ymm3\n"
5143 "vpermilps $160, %%ymm4, %%ymm8\n"
5144 "vpermilps $245, %%ymm4, %%ymm9\n"
5145 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
5146 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
5147 "vaddsubps %%ymm11, %%ymm8, %%ymm4\n"
5148 "vpermilps $160, %%ymm5, %%ymm8\n"
5149 "vpermilps $245, %%ymm5, %%ymm9\n"
5150 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
5151 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
5152 "vaddsubps %%ymm11, %%ymm8, %%ymm5\n"
5153 "vpermilps $160, %%ymm6, %%ymm8\n"
5154 "vpermilps $245, %%ymm6, %%ymm9\n"
5155 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
5156 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
5157 "vaddsubps %%ymm11, %%ymm8, %%ymm6\n"
5158 "vpermilps $160, %%ymm7, %%ymm8\n"
5159 "vpermilps $245, %%ymm7, %%ymm9\n"
5160 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
5161 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
5162 "vaddsubps %%ymm11, %%ymm8, %%ymm7\n"
5163 "vpermilps $68, %%ymm0, %%ymm8\n"
5164 "vpermilps $238, %%ymm0, %%ymm9\n"
5165 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
5166 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
5167 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
5168 "vaddps %%ymm8, %%ymm12, %%ymm0\n"
5169 "vpermilps $68, %%ymm1, %%ymm8\n"
5170 "vpermilps $238, %%ymm1, %%ymm9\n"
5171 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
5172 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
5173 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
5174 "vaddps %%ymm8, %%ymm12, %%ymm1\n"
5175 "vpermilps $68, %%ymm2, %%ymm8\n"
5176 "vpermilps $238, %%ymm2, %%ymm9\n"
5177 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
5178 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
5179 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
5180 "vaddps %%ymm8, %%ymm12, %%ymm2\n"
5181 "vpermilps $68, %%ymm3, %%ymm8\n"
5182 "vpermilps $238, %%ymm3, %%ymm9\n"
5183 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
5184 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
5185 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
5186 "vaddps %%ymm8, %%ymm12, %%ymm3\n"
5187 "vpermilps $68, %%ymm4, %%ymm8\n"
5188 "vpermilps $238, %%ymm4, %%ymm9\n"
5189 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
5190 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
5191 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
5192 "vaddps %%ymm8, %%ymm12, %%ymm4\n"
5193 "vpermilps $68, %%ymm5, %%ymm8\n"
5194 "vpermilps $238, %%ymm5, %%ymm9\n"
5195 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
5196 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
5197 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
5198 "vaddps %%ymm8, %%ymm12, %%ymm5\n"
5199 "vpermilps $68, %%ymm6, %%ymm8\n"
5200 "vpermilps $238, %%ymm6, %%ymm9\n"
5201 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
5202 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
5203 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
5204 "vaddps %%ymm8, %%ymm12, %%ymm6\n"
5205 "vpermilps $68, %%ymm7, %%ymm8\n"
5206 "vpermilps $238, %%ymm7, %%ymm9\n"
5207 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
5208 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
5209 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
5210 "vaddps %%ymm8, %%ymm12, %%ymm7\n"
5211 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
5212 "vsubps %%ymm0, %%ymm8, %%ymm9\n"
5213 "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n"
5214 "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n"
5215 "vaddps %%ymm10, %%ymm11, %%ymm0\n"
5216 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
5217 "vsubps %%ymm1, %%ymm8, %%ymm9\n"
5218 "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n"
5219 "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n"
5220 "vaddps %%ymm10, %%ymm11, %%ymm1\n"
5221 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
5222 "vsubps %%ymm2, %%ymm8, %%ymm9\n"
5223 "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n"
5224 "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n"
5225 "vaddps %%ymm10, %%ymm11, %%ymm2\n"
5226 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
5227 "vsubps %%ymm3, %%ymm8, %%ymm9\n"
5228 "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n"
5229 "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n"
5230 "vaddps %%ymm10, %%ymm11, %%ymm3\n"
5231 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
5232 "vsubps %%ymm4, %%ymm8, %%ymm9\n"
5233 "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n"
5234 "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n"
5235 "vaddps %%ymm10, %%ymm11, %%ymm4\n"
5236 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
5237 "vsubps %%ymm5, %%ymm8, %%ymm9\n"
5238 "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n"
5239 "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n"
5240 "vaddps %%ymm10, %%ymm11, %%ymm5\n"
5241 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
5242 "vsubps %%ymm6, %%ymm8, %%ymm9\n"
5243 "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n"
5244 "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n"
5245 "vaddps %%ymm10, %%ymm11, %%ymm6\n"
5246 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
5247 "vsubps %%ymm7, %%ymm8, %%ymm9\n"
5248 "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n"
5249 "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n"
5250 "vaddps %%ymm10, %%ymm11, %%ymm7\n"
5251 "vaddps %%ymm1, %%ymm0, %%ymm8\n"
5252 "vsubps %%ymm1, %%ymm0, %%ymm9\n"
5253 "vaddps %%ymm3, %%ymm2, %%ymm10\n"
5254 "vsubps %%ymm3, %%ymm2, %%ymm11\n"
5255 "vaddps %%ymm5, %%ymm4, %%ymm12\n"
5256 "vsubps %%ymm5, %%ymm4, %%ymm13\n"
5257 "vaddps %%ymm7, %%ymm6, %%ymm14\n"
5258 "vsubps %%ymm7, %%ymm6, %%ymm15\n"
5259 "vaddps %%ymm10, %%ymm8, %%ymm0\n"
5260 "vsubps %%ymm10, %%ymm8, %%ymm2\n"
5261 "vaddps %%ymm11, %%ymm9, %%ymm1\n"
5262 "vsubps %%ymm11, %%ymm9, %%ymm3\n"
5263 "vaddps %%ymm14, %%ymm12, %%ymm4\n"
5264 "vsubps %%ymm14, %%ymm12, %%ymm6\n"
5265 "vaddps %%ymm15, %%ymm13, %%ymm5\n"
5266 "vsubps %%ymm15, %%ymm13, %%ymm7\n"
5267 "vaddps %%ymm4, %%ymm0, %%ymm8\n"
5268 "vsubps %%ymm4, %%ymm0, %%ymm12\n"
5269 "vaddps %%ymm5, %%ymm1, %%ymm9\n"
5270 "vsubps %%ymm5, %%ymm1, %%ymm13\n"
5271 "vaddps %%ymm6, %%ymm2, %%ymm10\n"
5272 "vsubps %%ymm6, %%ymm2, %%ymm14\n"
5273 "vaddps %%ymm7, %%ymm3, %%ymm11\n"
5274 "vsubps %%ymm7, %%ymm3, %%ymm15\n"
5275 "vmovups %%ymm8, (%0)\n"
5276 "vmovups %%ymm9, (%1)\n"
5277 "vmovups %%ymm10, (%2)\n"
5278 "vmovups %%ymm11, (%3)\n"
5279 "vmovups %%ymm12, (%4)\n"
5280 "vmovups %%ymm13, (%5)\n"
5281 "vmovups %%ymm14, (%6)\n"
5282 "vmovups %%ymm15, (%7)\n"
5283 :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
5284 );
5285 }
5286 }
5287 for (int j = 0; j < 2048; j += 512) {
5288 for (int k = 0; k < 64; k += 8) {
5289 __asm__ volatile (
5290 "vmovups (%0), %%ymm0\n"
5291 "vmovups (%1), %%ymm1\n"
5292 "vmovups (%2), %%ymm2\n"
5293 "vmovups (%3), %%ymm3\n"
5294 "vmovups (%4), %%ymm4\n"
5295 "vmovups (%5), %%ymm5\n"
5296 "vmovups (%6), %%ymm6\n"
5297 "vmovups (%7), %%ymm7\n"
5298 "vaddps %%ymm1, %%ymm0, %%ymm8\n"
5299 "vsubps %%ymm1, %%ymm0, %%ymm9\n"
5300 "vaddps %%ymm3, %%ymm2, %%ymm10\n"
5301 "vsubps %%ymm3, %%ymm2, %%ymm11\n"
5302 "vaddps %%ymm5, %%ymm4, %%ymm12\n"
5303 "vsubps %%ymm5, %%ymm4, %%ymm13\n"
5304 "vaddps %%ymm7, %%ymm6, %%ymm14\n"
5305 "vsubps %%ymm7, %%ymm6, %%ymm15\n"
5306 "vaddps %%ymm10, %%ymm8, %%ymm0\n"
5307 "vsubps %%ymm10, %%ymm8, %%ymm2\n"
5308 "vaddps %%ymm11, %%ymm9, %%ymm1\n"
5309 "vsubps %%ymm11, %%ymm9, %%ymm3\n"
5310 "vaddps %%ymm14, %%ymm12, %%ymm4\n"
5311 "vsubps %%ymm14, %%ymm12, %%ymm6\n"
5312 "vaddps %%ymm15, %%ymm13, %%ymm5\n"
5313 "vsubps %%ymm15, %%ymm13, %%ymm7\n"
5314 "vaddps %%ymm4, %%ymm0, %%ymm8\n"
5315 "vsubps %%ymm4, %%ymm0, %%ymm12\n"
5316 "vaddps %%ymm5, %%ymm1, %%ymm9\n"
5317 "vsubps %%ymm5, %%ymm1, %%ymm13\n"
5318 "vaddps %%ymm6, %%ymm2, %%ymm10\n"
5319 "vsubps %%ymm6, %%ymm2, %%ymm14\n"
5320 "vaddps %%ymm7, %%ymm3, %%ymm11\n"
5321 "vsubps %%ymm7, %%ymm3, %%ymm15\n"
5322 "vmovups %%ymm8, (%0)\n"
5323 "vmovups %%ymm9, (%1)\n"
5324 "vmovups %%ymm10, (%2)\n"
5325 "vmovups %%ymm11, (%3)\n"
5326 "vmovups %%ymm12, (%4)\n"
5327 "vmovups %%ymm13, (%5)\n"
5328 "vmovups %%ymm14, (%6)\n"
5329 "vmovups %%ymm15, (%7)\n"
5330 :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
5331 );
5332 }
5333 }
5334 for (int j = 0; j < 2048; j += 2048) {
5335 for (int k = 0; k < 512; k += 8) {
5336 __asm__ volatile (
5337 "vmovups (%0), %%ymm0\n"
5338 "vmovups (%1), %%ymm1\n"
5339 "vmovups (%2), %%ymm2\n"
5340 "vmovups (%3), %%ymm3\n"
5341 "vaddps %%ymm1, %%ymm0, %%ymm8\n"
5342 "vsubps %%ymm1, %%ymm0, %%ymm9\n"
5343 "vaddps %%ymm3, %%ymm2, %%ymm10\n"
5344 "vsubps %%ymm3, %%ymm2, %%ymm11\n"
5345 "vaddps %%ymm10, %%ymm8, %%ymm0\n"
5346 "vsubps %%ymm10, %%ymm8, %%ymm2\n"
5347 "vaddps %%ymm11, %%ymm9, %%ymm1\n"
5348 "vsubps %%ymm11, %%ymm9, %%ymm3\n"
5349 "vmovups %%ymm0, (%0)\n"
5350 "vmovups %%ymm1, (%1)\n"
5351 "vmovups %%ymm2, (%2)\n"
5352 "vmovups %%ymm3, (%3)\n"
5353 :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
5354 );
5355 }
5356 }
5357 return;
5358 }
5359 if (depth == 14) {
5360 helper_float_22_recursive(buf + 0, 11);
5361 helper_float_22_recursive(buf + 2048, 11);
5362 helper_float_22_recursive(buf + 4096, 11);
5363 helper_float_22_recursive(buf + 6144, 11);
5364 helper_float_22_recursive(buf + 8192, 11);
5365 helper_float_22_recursive(buf + 10240, 11);
5366 helper_float_22_recursive(buf + 12288, 11);
5367 helper_float_22_recursive(buf + 14336, 11);
5368 for (int j = 0; j < 16384; j += 16384) {
5369 for (int k = 0; k < 2048; k += 8) {
5370 __asm__ volatile (
5371 "vmovups (%0), %%ymm0\n"
5372 "vmovups (%1), %%ymm1\n"
5373 "vmovups (%2), %%ymm2\n"
5374 "vmovups (%3), %%ymm3\n"
5375 "vmovups (%4), %%ymm4\n"
5376 "vmovups (%5), %%ymm5\n"
5377 "vmovups (%6), %%ymm6\n"
5378 "vmovups (%7), %%ymm7\n"
5379 "vaddps %%ymm1, %%ymm0, %%ymm8\n"
5380 "vsubps %%ymm1, %%ymm0, %%ymm9\n"
5381 "vaddps %%ymm3, %%ymm2, %%ymm10\n"
5382 "vsubps %%ymm3, %%ymm2, %%ymm11\n"
5383 "vaddps %%ymm5, %%ymm4, %%ymm12\n"
5384 "vsubps %%ymm5, %%ymm4, %%ymm13\n"
5385 "vaddps %%ymm7, %%ymm6, %%ymm14\n"
5386 "vsubps %%ymm7, %%ymm6, %%ymm15\n"
5387 "vaddps %%ymm10, %%ymm8, %%ymm0\n"
5388 "vsubps %%ymm10, %%ymm8, %%ymm2\n"
5389 "vaddps %%ymm11, %%ymm9, %%ymm1\n"
5390 "vsubps %%ymm11, %%ymm9, %%ymm3\n"
5391 "vaddps %%ymm14, %%ymm12, %%ymm4\n"
5392 "vsubps %%ymm14, %%ymm12, %%ymm6\n"
5393 "vaddps %%ymm15, %%ymm13, %%ymm5\n"
5394 "vsubps %%ymm15, %%ymm13, %%ymm7\n"
5395 "vaddps %%ymm4, %%ymm0, %%ymm8\n"
5396 "vsubps %%ymm4, %%ymm0, %%ymm12\n"
5397 "vaddps %%ymm5, %%ymm1, %%ymm9\n"
5398 "vsubps %%ymm5, %%ymm1, %%ymm13\n"
5399 "vaddps %%ymm6, %%ymm2, %%ymm10\n"
5400 "vsubps %%ymm6, %%ymm2, %%ymm14\n"
5401 "vaddps %%ymm7, %%ymm3, %%ymm11\n"
5402 "vsubps %%ymm7, %%ymm3, %%ymm15\n"
5403 "vmovups %%ymm8, (%0)\n"
5404 "vmovups %%ymm9, (%1)\n"
5405 "vmovups %%ymm10, (%2)\n"
5406 "vmovups %%ymm11, (%3)\n"
5407 "vmovups %%ymm12, (%4)\n"
5408 "vmovups %%ymm13, (%5)\n"
5409 "vmovups %%ymm14, (%6)\n"
5410 "vmovups %%ymm15, (%7)\n"
5411 :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
5412 );
5413 }
5414 }
5415 return;
5416 }
5417 if (depth == 17) {
5418 helper_float_22_recursive(buf + 0, 14);
5419 helper_float_22_recursive(buf + 16384, 14);
5420 helper_float_22_recursive(buf + 32768, 14);
5421 helper_float_22_recursive(buf + 49152, 14);
5422 helper_float_22_recursive(buf + 65536, 14);
5423 helper_float_22_recursive(buf + 81920, 14);
5424 helper_float_22_recursive(buf + 98304, 14);
5425 helper_float_22_recursive(buf + 114688, 14);
5426 for (int j = 0; j < 131072; j += 131072) {
5427 for (int k = 0; k < 16384; k += 8) {
5428 __asm__ volatile (
5429 "vmovups (%0), %%ymm0\n"
5430 "vmovups (%1), %%ymm1\n"
5431 "vmovups (%2), %%ymm2\n"
5432 "vmovups (%3), %%ymm3\n"
5433 "vmovups (%4), %%ymm4\n"
5434 "vmovups (%5), %%ymm5\n"
5435 "vmovups (%6), %%ymm6\n"
5436 "vmovups (%7), %%ymm7\n"
5437 "vaddps %%ymm1, %%ymm0, %%ymm8\n"
5438 "vsubps %%ymm1, %%ymm0, %%ymm9\n"
5439 "vaddps %%ymm3, %%ymm2, %%ymm10\n"
5440 "vsubps %%ymm3, %%ymm2, %%ymm11\n"
5441 "vaddps %%ymm5, %%ymm4, %%ymm12\n"
5442 "vsubps %%ymm5, %%ymm4, %%ymm13\n"
5443 "vaddps %%ymm7, %%ymm6, %%ymm14\n"
5444 "vsubps %%ymm7, %%ymm6, %%ymm15\n"
5445 "vaddps %%ymm10, %%ymm8, %%ymm0\n"
5446 "vsubps %%ymm10, %%ymm8, %%ymm2\n"
5447 "vaddps %%ymm11, %%ymm9, %%ymm1\n"
5448 "vsubps %%ymm11, %%ymm9, %%ymm3\n"
5449 "vaddps %%ymm14, %%ymm12, %%ymm4\n"
5450 "vsubps %%ymm14, %%ymm12, %%ymm6\n"
5451 "vaddps %%ymm15, %%ymm13, %%ymm5\n"
5452 "vsubps %%ymm15, %%ymm13, %%ymm7\n"
5453 "vaddps %%ymm4, %%ymm0, %%ymm8\n"
5454 "vsubps %%ymm4, %%ymm0, %%ymm12\n"
5455 "vaddps %%ymm5, %%ymm1, %%ymm9\n"
5456 "vsubps %%ymm5, %%ymm1, %%ymm13\n"
5457 "vaddps %%ymm6, %%ymm2, %%ymm10\n"
5458 "vsubps %%ymm6, %%ymm2, %%ymm14\n"
5459 "vaddps %%ymm7, %%ymm3, %%ymm11\n"
5460 "vsubps %%ymm7, %%ymm3, %%ymm15\n"
5461 "vmovups %%ymm8, (%0)\n"
5462 "vmovups %%ymm9, (%1)\n"
5463 "vmovups %%ymm10, (%2)\n"
5464 "vmovups %%ymm11, (%3)\n"
5465 "vmovups %%ymm12, (%4)\n"
5466 "vmovups %%ymm13, (%5)\n"
5467 "vmovups %%ymm14, (%6)\n"
5468 "vmovups %%ymm15, (%7)\n"
5469 :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152), "r"(buf + j + k + 65536), "r"(buf + j + k + 81920), "r"(buf + j + k + 98304), "r"(buf + j + k + 114688) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
5470 );
5471 }
5472 }
5473 return;
5474 }
5475 if (depth == 20) {
5476 helper_float_22_recursive(buf + 0, 17);
5477 helper_float_22_recursive(buf + 131072, 17);
5478 helper_float_22_recursive(buf + 262144, 17);
5479 helper_float_22_recursive(buf + 393216, 17);
5480 helper_float_22_recursive(buf + 524288, 17);
5481 helper_float_22_recursive(buf + 655360, 17);
5482 helper_float_22_recursive(buf + 786432, 17);
5483 helper_float_22_recursive(buf + 917504, 17);
5484 for (int j = 0; j < 1048576; j += 1048576) {
5485 for (int k = 0; k < 131072; k += 8) {
5486 __asm__ volatile (
5487 "vmovups (%0), %%ymm0\n"
5488 "vmovups (%1), %%ymm1\n"
5489 "vmovups (%2), %%ymm2\n"
5490 "vmovups (%3), %%ymm3\n"
5491 "vmovups (%4), %%ymm4\n"
5492 "vmovups (%5), %%ymm5\n"
5493 "vmovups (%6), %%ymm6\n"
5494 "vmovups (%7), %%ymm7\n"
5495 "vaddps %%ymm1, %%ymm0, %%ymm8\n"
5496 "vsubps %%ymm1, %%ymm0, %%ymm9\n"
5497 "vaddps %%ymm3, %%ymm2, %%ymm10\n"
5498 "vsubps %%ymm3, %%ymm2, %%ymm11\n"
5499 "vaddps %%ymm5, %%ymm4, %%ymm12\n"
5500 "vsubps %%ymm5, %%ymm4, %%ymm13\n"
5501 "vaddps %%ymm7, %%ymm6, %%ymm14\n"
5502 "vsubps %%ymm7, %%ymm6, %%ymm15\n"
5503 "vaddps %%ymm10, %%ymm8, %%ymm0\n"
5504 "vsubps %%ymm10, %%ymm8, %%ymm2\n"
5505 "vaddps %%ymm11, %%ymm9, %%ymm1\n"
5506 "vsubps %%ymm11, %%ymm9, %%ymm3\n"
5507 "vaddps %%ymm14, %%ymm12, %%ymm4\n"
5508 "vsubps %%ymm14, %%ymm12, %%ymm6\n"
5509 "vaddps %%ymm15, %%ymm13, %%ymm5\n"
5510 "vsubps %%ymm15, %%ymm13, %%ymm7\n"
5511 "vaddps %%ymm4, %%ymm0, %%ymm8\n"
5512 "vsubps %%ymm4, %%ymm0, %%ymm12\n"
5513 "vaddps %%ymm5, %%ymm1, %%ymm9\n"
5514 "vsubps %%ymm5, %%ymm1, %%ymm13\n"
5515 "vaddps %%ymm6, %%ymm2, %%ymm10\n"
5516 "vsubps %%ymm6, %%ymm2, %%ymm14\n"
5517 "vaddps %%ymm7, %%ymm3, %%ymm11\n"
5518 "vsubps %%ymm7, %%ymm3, %%ymm15\n"
5519 "vmovups %%ymm8, (%0)\n"
5520 "vmovups %%ymm9, (%1)\n"
5521 "vmovups %%ymm10, (%2)\n"
5522 "vmovups %%ymm11, (%3)\n"
5523 "vmovups %%ymm12, (%4)\n"
5524 "vmovups %%ymm13, (%5)\n"
5525 "vmovups %%ymm14, (%6)\n"
5526 "vmovups %%ymm15, (%7)\n"
5527 :: "r"(buf + j + k + 0), "r"(buf + j + k + 131072), "r"(buf + j + k + 262144), "r"(buf + j + k + 393216), "r"(buf + j + k + 524288), "r"(buf + j + k + 655360), "r"(buf + j + k + 786432), "r"(buf + j + k + 917504) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
5528 );
5529 }
5530 }
5531 return;
5532 }
5533 if (depth == 22) {
5534 helper_float_22_recursive(buf + 0, 20);
5535 helper_float_22_recursive(buf + 1048576, 20);
5536 helper_float_22_recursive(buf + 2097152, 20);
5537 helper_float_22_recursive(buf + 3145728, 20);
5538 for (int j = 0; j < 4194304; j += 4194304) {
5539 for (int k = 0; k < 1048576; k += 8) {
5540 __asm__ volatile (
5541 "vmovups (%0), %%ymm0\n"
5542 "vmovups (%1), %%ymm1\n"
5543 "vmovups (%2), %%ymm2\n"
5544 "vmovups (%3), %%ymm3\n"
5545 "vaddps %%ymm1, %%ymm0, %%ymm8\n"
5546 "vsubps %%ymm1, %%ymm0, %%ymm9\n"
5547 "vaddps %%ymm3, %%ymm2, %%ymm10\n"
5548 "vsubps %%ymm3, %%ymm2, %%ymm11\n"
5549 "vaddps %%ymm10, %%ymm8, %%ymm0\n"
5550 "vsubps %%ymm10, %%ymm8, %%ymm2\n"
5551 "vaddps %%ymm11, %%ymm9, %%ymm1\n"
5552 "vsubps %%ymm11, %%ymm9, %%ymm3\n"
5553 "vmovups %%ymm0, (%0)\n"
5554 "vmovups %%ymm1, (%1)\n"
5555 "vmovups %%ymm2, (%2)\n"
5556 "vmovups %%ymm3, (%3)\n"
5557 :: "r"(buf + j + k + 0), "r"(buf + j + k + 1048576), "r"(buf + j + k + 2097152), "r"(buf + j + k + 3145728) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
5558 );
5559 }
5560 }
5561 return;
5562 }
5563 }
void helper_float_22(float *buf);
/*
 * In-place fast (Walsh–)Hadamard transform of 2^22 = 4194304 floats.
 *
 * buf: pointer to the 4194304-element float array, transformed in place.
 *      vmovups is used throughout the recursive kernel, so no particular
 *      alignment of buf is required.
 *
 * Simply kicks off the recursive kernel at the full depth (22).  No
 * normalization/scaling is applied anywhere in the kernel, so the result
 * is the unnormalized transform.
 */
void helper_float_22(float *buf) {
  helper_float_22_recursive(buf, 22);
}
void helper_float_23_recursive(float *buf, int depth);
/*
 * Recursive kernel for an in-place fast (Walsh–)Hadamard transform of
 * 2^23 floats, written with AVX inline assembly.
 *
 * buf:   pointer to the sub-array to transform in place (2^depth floats).
 * depth: log2 of the sub-transform size.  Only the depths reachable from
 *        the top-level call with depth==23 are handled: 23 -> 21 -> 18 ->
 *        15 -> 12 -> 9 (base case).  Any other depth falls through and
 *        does nothing.
 *
 * Structure: the base case (depth 9) computes size-64 transforms entirely
 * inside AVX registers, then combines them; every other level recurses on
 * eight (or, for depth 23, four) equal sub-blocks and then merges them
 * with a radix-8 (radix-4) butterfly pass over strided vectors.
 *
 * All loads/stores use vmovups, so buf needs no special alignment.  No
 * scaling is applied at any level (unnormalized transform).
 */
void helper_float_23_recursive(float *buf, int depth) {
  if (depth == 9) {
    /* Base case: each 64-float tile gets a full size-64 transform.
     * The first pass does stages 1..6 for one tile of 8 vectors:
     *   - stages 1..3 within each ymm register (permute + add/sub),
     *   - stages 4..6 as a radix-8 butterfly across the 8 registers. */
    for (int j = 0; j < 512; j += 64) {
      for (int k = 0; k < 8; k += 8) {
        __asm__ volatile (
          /* Load the eight 8-float rows of this 64-element tile. */
          "vmovups (%0), %%ymm0\n"
          "vmovups (%1), %%ymm1\n"
          "vmovups (%2), %%ymm2\n"
          "vmovups (%3), %%ymm3\n"
          "vmovups (%4), %%ymm4\n"
          "vmovups (%5), %%ymm5\n"
          "vmovups (%6), %%ymm6\n"
          "vmovups (%7), %%ymm7\n"
          /* Stage 1 (stride 1): size-2 butterflies on adjacent pairs.
           * $160 duplicates even elements, $245 duplicates odd ones;
           * vaddsubps with the negated odd vector yields (u+v, u-v). */
          "vpermilps $160, %%ymm0, %%ymm8\n"
          "vpermilps $245, %%ymm0, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm0\n"
          "vpermilps $160, %%ymm1, %%ymm8\n"
          "vpermilps $245, %%ymm1, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm1\n"
          "vpermilps $160, %%ymm2, %%ymm8\n"
          "vpermilps $245, %%ymm2, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm2\n"
          "vpermilps $160, %%ymm3, %%ymm8\n"
          "vpermilps $245, %%ymm3, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm3\n"
          "vpermilps $160, %%ymm4, %%ymm8\n"
          "vpermilps $245, %%ymm4, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm4\n"
          "vpermilps $160, %%ymm5, %%ymm8\n"
          "vpermilps $245, %%ymm5, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm5\n"
          "vpermilps $160, %%ymm6, %%ymm8\n"
          "vpermilps $245, %%ymm6, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm6\n"
          "vpermilps $160, %%ymm7, %%ymm8\n"
          "vpermilps $245, %%ymm7, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm7\n"
          /* Stage 2 (stride 2): size-4 butterflies inside each 128-bit
           * lane.  $68/$238 split low/high pairs; blend $204 negates the
           * high half so the add produces (a+b, a-b) per pair-of-pairs. */
          "vpermilps $68, %%ymm0, %%ymm8\n"
          "vpermilps $238, %%ymm0, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm0\n"
          "vpermilps $68, %%ymm1, %%ymm8\n"
          "vpermilps $238, %%ymm1, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm1\n"
          "vpermilps $68, %%ymm2, %%ymm8\n"
          "vpermilps $238, %%ymm2, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm2\n"
          "vpermilps $68, %%ymm3, %%ymm8\n"
          "vpermilps $238, %%ymm3, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm3\n"
          "vpermilps $68, %%ymm4, %%ymm8\n"
          "vpermilps $238, %%ymm4, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm4\n"
          "vpermilps $68, %%ymm5, %%ymm8\n"
          "vpermilps $238, %%ymm5, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm5\n"
          "vpermilps $68, %%ymm6, %%ymm8\n"
          "vpermilps $238, %%ymm6, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm6\n"
          "vpermilps $68, %%ymm7, %%ymm8\n"
          "vpermilps $238, %%ymm7, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm7\n"
          /* Stage 3 (stride 4): size-8 butterfly across the two 128-bit
           * lanes of each register, via vperm2f128 with a negated copy. */
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm0, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm0\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm1, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm1\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm2, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm2\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm3, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm3\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm4, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm4\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm5, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm5\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm6, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm6\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm7, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm7\n"
          /* Stages 4..6 (strides 8/16/32): radix-8 butterfly across the
           * eight registers — three rounds of pairwise add/sub. */
          "vaddps %%ymm1, %%ymm0, %%ymm8\n"
          "vsubps %%ymm1, %%ymm0, %%ymm9\n"
          "vaddps %%ymm3, %%ymm2, %%ymm10\n"
          "vsubps %%ymm3, %%ymm2, %%ymm11\n"
          "vaddps %%ymm5, %%ymm4, %%ymm12\n"
          "vsubps %%ymm5, %%ymm4, %%ymm13\n"
          "vaddps %%ymm7, %%ymm6, %%ymm14\n"
          "vsubps %%ymm7, %%ymm6, %%ymm15\n"
          "vaddps %%ymm10, %%ymm8, %%ymm0\n"
          "vsubps %%ymm10, %%ymm8, %%ymm2\n"
          "vaddps %%ymm11, %%ymm9, %%ymm1\n"
          "vsubps %%ymm11, %%ymm9, %%ymm3\n"
          "vaddps %%ymm14, %%ymm12, %%ymm4\n"
          "vsubps %%ymm14, %%ymm12, %%ymm6\n"
          "vaddps %%ymm15, %%ymm13, %%ymm5\n"
          "vsubps %%ymm15, %%ymm13, %%ymm7\n"
          "vaddps %%ymm4, %%ymm0, %%ymm8\n"
          "vsubps %%ymm4, %%ymm0, %%ymm12\n"
          "vaddps %%ymm5, %%ymm1, %%ymm9\n"
          "vsubps %%ymm5, %%ymm1, %%ymm13\n"
          "vaddps %%ymm6, %%ymm2, %%ymm10\n"
          "vsubps %%ymm6, %%ymm2, %%ymm14\n"
          "vaddps %%ymm7, %%ymm3, %%ymm11\n"
          "vsubps %%ymm7, %%ymm3, %%ymm15\n"
          /* Store the transformed tile back in place. */
          "vmovups %%ymm8, (%0)\n"
          "vmovups %%ymm9, (%1)\n"
          "vmovups %%ymm10, (%2)\n"
          "vmovups %%ymm11, (%3)\n"
          "vmovups %%ymm12, (%4)\n"
          "vmovups %%ymm13, (%5)\n"
          "vmovups %%ymm14, (%6)\n"
          "vmovups %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    /* Stages 7..9 (strides 64/128/256): radix-8 butterfly combining the
     * eight size-64 tiles into the full 512-element transform. */
    for (int j = 0; j < 512; j += 512) {
      for (int k = 0; k < 64; k += 8) {
        __asm__ volatile (
          "vmovups (%0), %%ymm0\n"
          "vmovups (%1), %%ymm1\n"
          "vmovups (%2), %%ymm2\n"
          "vmovups (%3), %%ymm3\n"
          "vmovups (%4), %%ymm4\n"
          "vmovups (%5), %%ymm5\n"
          "vmovups (%6), %%ymm6\n"
          "vmovups (%7), %%ymm7\n"
          "vaddps %%ymm1, %%ymm0, %%ymm8\n"
          "vsubps %%ymm1, %%ymm0, %%ymm9\n"
          "vaddps %%ymm3, %%ymm2, %%ymm10\n"
          "vsubps %%ymm3, %%ymm2, %%ymm11\n"
          "vaddps %%ymm5, %%ymm4, %%ymm12\n"
          "vsubps %%ymm5, %%ymm4, %%ymm13\n"
          "vaddps %%ymm7, %%ymm6, %%ymm14\n"
          "vsubps %%ymm7, %%ymm6, %%ymm15\n"
          "vaddps %%ymm10, %%ymm8, %%ymm0\n"
          "vsubps %%ymm10, %%ymm8, %%ymm2\n"
          "vaddps %%ymm11, %%ymm9, %%ymm1\n"
          "vsubps %%ymm11, %%ymm9, %%ymm3\n"
          "vaddps %%ymm14, %%ymm12, %%ymm4\n"
          "vsubps %%ymm14, %%ymm12, %%ymm6\n"
          "vaddps %%ymm15, %%ymm13, %%ymm5\n"
          "vsubps %%ymm15, %%ymm13, %%ymm7\n"
          "vaddps %%ymm4, %%ymm0, %%ymm8\n"
          "vsubps %%ymm4, %%ymm0, %%ymm12\n"
          "vaddps %%ymm5, %%ymm1, %%ymm9\n"
          "vsubps %%ymm5, %%ymm1, %%ymm13\n"
          "vaddps %%ymm6, %%ymm2, %%ymm10\n"
          "vsubps %%ymm6, %%ymm2, %%ymm14\n"
          "vaddps %%ymm7, %%ymm3, %%ymm11\n"
          "vsubps %%ymm7, %%ymm3, %%ymm15\n"
          "vmovups %%ymm8, (%0)\n"
          "vmovups %%ymm9, (%1)\n"
          "vmovups %%ymm10, (%2)\n"
          "vmovups %%ymm11, (%3)\n"
          "vmovups %%ymm12, (%4)\n"
          "vmovups %%ymm13, (%5)\n"
          "vmovups %%ymm14, (%6)\n"
          "vmovups %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 12) {
    /* Transform the eight 512-element sub-blocks, then merge them with a
     * radix-8 butterfly at strides 512/1024/2048. */
    helper_float_23_recursive(buf + 0, 9);
    helper_float_23_recursive(buf + 512, 9);
    helper_float_23_recursive(buf + 1024, 9);
    helper_float_23_recursive(buf + 1536, 9);
    helper_float_23_recursive(buf + 2048, 9);
    helper_float_23_recursive(buf + 2560, 9);
    helper_float_23_recursive(buf + 3072, 9);
    helper_float_23_recursive(buf + 3584, 9);
    for (int j = 0; j < 4096; j += 4096) {
      for (int k = 0; k < 512; k += 8) {
        __asm__ volatile (
          "vmovups (%0), %%ymm0\n"
          "vmovups (%1), %%ymm1\n"
          "vmovups (%2), %%ymm2\n"
          "vmovups (%3), %%ymm3\n"
          "vmovups (%4), %%ymm4\n"
          "vmovups (%5), %%ymm5\n"
          "vmovups (%6), %%ymm6\n"
          "vmovups (%7), %%ymm7\n"
          "vaddps %%ymm1, %%ymm0, %%ymm8\n"
          "vsubps %%ymm1, %%ymm0, %%ymm9\n"
          "vaddps %%ymm3, %%ymm2, %%ymm10\n"
          "vsubps %%ymm3, %%ymm2, %%ymm11\n"
          "vaddps %%ymm5, %%ymm4, %%ymm12\n"
          "vsubps %%ymm5, %%ymm4, %%ymm13\n"
          "vaddps %%ymm7, %%ymm6, %%ymm14\n"
          "vsubps %%ymm7, %%ymm6, %%ymm15\n"
          "vaddps %%ymm10, %%ymm8, %%ymm0\n"
          "vsubps %%ymm10, %%ymm8, %%ymm2\n"
          "vaddps %%ymm11, %%ymm9, %%ymm1\n"
          "vsubps %%ymm11, %%ymm9, %%ymm3\n"
          "vaddps %%ymm14, %%ymm12, %%ymm4\n"
          "vsubps %%ymm14, %%ymm12, %%ymm6\n"
          "vaddps %%ymm15, %%ymm13, %%ymm5\n"
          "vsubps %%ymm15, %%ymm13, %%ymm7\n"
          "vaddps %%ymm4, %%ymm0, %%ymm8\n"
          "vsubps %%ymm4, %%ymm0, %%ymm12\n"
          "vaddps %%ymm5, %%ymm1, %%ymm9\n"
          "vsubps %%ymm5, %%ymm1, %%ymm13\n"
          "vaddps %%ymm6, %%ymm2, %%ymm10\n"
          "vsubps %%ymm6, %%ymm2, %%ymm14\n"
          "vaddps %%ymm7, %%ymm3, %%ymm11\n"
          "vsubps %%ymm7, %%ymm3, %%ymm15\n"
          "vmovups %%ymm8, (%0)\n"
          "vmovups %%ymm9, (%1)\n"
          "vmovups %%ymm10, (%2)\n"
          "vmovups %%ymm11, (%3)\n"
          "vmovups %%ymm12, (%4)\n"
          "vmovups %%ymm13, (%5)\n"
          "vmovups %%ymm14, (%6)\n"
          "vmovups %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 15) {
    /* Eight depth-12 sub-transforms, then radix-8 merge at stride 4096. */
    helper_float_23_recursive(buf + 0, 12);
    helper_float_23_recursive(buf + 4096, 12);
    helper_float_23_recursive(buf + 8192, 12);
    helper_float_23_recursive(buf + 12288, 12);
    helper_float_23_recursive(buf + 16384, 12);
    helper_float_23_recursive(buf + 20480, 12);
    helper_float_23_recursive(buf + 24576, 12);
    helper_float_23_recursive(buf + 28672, 12);
    for (int j = 0; j < 32768; j += 32768) {
      for (int k = 0; k < 4096; k += 8) {
        __asm__ volatile (
          "vmovups (%0), %%ymm0\n"
          "vmovups (%1), %%ymm1\n"
          "vmovups (%2), %%ymm2\n"
          "vmovups (%3), %%ymm3\n"
          "vmovups (%4), %%ymm4\n"
          "vmovups (%5), %%ymm5\n"
          "vmovups (%6), %%ymm6\n"
          "vmovups (%7), %%ymm7\n"
          "vaddps %%ymm1, %%ymm0, %%ymm8\n"
          "vsubps %%ymm1, %%ymm0, %%ymm9\n"
          "vaddps %%ymm3, %%ymm2, %%ymm10\n"
          "vsubps %%ymm3, %%ymm2, %%ymm11\n"
          "vaddps %%ymm5, %%ymm4, %%ymm12\n"
          "vsubps %%ymm5, %%ymm4, %%ymm13\n"
          "vaddps %%ymm7, %%ymm6, %%ymm14\n"
          "vsubps %%ymm7, %%ymm6, %%ymm15\n"
          "vaddps %%ymm10, %%ymm8, %%ymm0\n"
          "vsubps %%ymm10, %%ymm8, %%ymm2\n"
          "vaddps %%ymm11, %%ymm9, %%ymm1\n"
          "vsubps %%ymm11, %%ymm9, %%ymm3\n"
          "vaddps %%ymm14, %%ymm12, %%ymm4\n"
          "vsubps %%ymm14, %%ymm12, %%ymm6\n"
          "vaddps %%ymm15, %%ymm13, %%ymm5\n"
          "vsubps %%ymm15, %%ymm13, %%ymm7\n"
          "vaddps %%ymm4, %%ymm0, %%ymm8\n"
          "vsubps %%ymm4, %%ymm0, %%ymm12\n"
          "vaddps %%ymm5, %%ymm1, %%ymm9\n"
          "vsubps %%ymm5, %%ymm1, %%ymm13\n"
          "vaddps %%ymm6, %%ymm2, %%ymm10\n"
          "vsubps %%ymm6, %%ymm2, %%ymm14\n"
          "vaddps %%ymm7, %%ymm3, %%ymm11\n"
          "vsubps %%ymm7, %%ymm3, %%ymm15\n"
          "vmovups %%ymm8, (%0)\n"
          "vmovups %%ymm9, (%1)\n"
          "vmovups %%ymm10, (%2)\n"
          "vmovups %%ymm11, (%3)\n"
          "vmovups %%ymm12, (%4)\n"
          "vmovups %%ymm13, (%5)\n"
          "vmovups %%ymm14, (%6)\n"
          "vmovups %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 18) {
    /* Eight depth-15 sub-transforms, then radix-8 merge at stride 32768. */
    helper_float_23_recursive(buf + 0, 15);
    helper_float_23_recursive(buf + 32768, 15);
    helper_float_23_recursive(buf + 65536, 15);
    helper_float_23_recursive(buf + 98304, 15);
    helper_float_23_recursive(buf + 131072, 15);
    helper_float_23_recursive(buf + 163840, 15);
    helper_float_23_recursive(buf + 196608, 15);
    helper_float_23_recursive(buf + 229376, 15);
    for (int j = 0; j < 262144; j += 262144) {
      for (int k = 0; k < 32768; k += 8) {
        __asm__ volatile (
          "vmovups (%0), %%ymm0\n"
          "vmovups (%1), %%ymm1\n"
          "vmovups (%2), %%ymm2\n"
          "vmovups (%3), %%ymm3\n"
          "vmovups (%4), %%ymm4\n"
          "vmovups (%5), %%ymm5\n"
          "vmovups (%6), %%ymm6\n"
          "vmovups (%7), %%ymm7\n"
          "vaddps %%ymm1, %%ymm0, %%ymm8\n"
          "vsubps %%ymm1, %%ymm0, %%ymm9\n"
          "vaddps %%ymm3, %%ymm2, %%ymm10\n"
          "vsubps %%ymm3, %%ymm2, %%ymm11\n"
          "vaddps %%ymm5, %%ymm4, %%ymm12\n"
          "vsubps %%ymm5, %%ymm4, %%ymm13\n"
          "vaddps %%ymm7, %%ymm6, %%ymm14\n"
          "vsubps %%ymm7, %%ymm6, %%ymm15\n"
          "vaddps %%ymm10, %%ymm8, %%ymm0\n"
          "vsubps %%ymm10, %%ymm8, %%ymm2\n"
          "vaddps %%ymm11, %%ymm9, %%ymm1\n"
          "vsubps %%ymm11, %%ymm9, %%ymm3\n"
          "vaddps %%ymm14, %%ymm12, %%ymm4\n"
          "vsubps %%ymm14, %%ymm12, %%ymm6\n"
          "vaddps %%ymm15, %%ymm13, %%ymm5\n"
          "vsubps %%ymm15, %%ymm13, %%ymm7\n"
          "vaddps %%ymm4, %%ymm0, %%ymm8\n"
          "vsubps %%ymm4, %%ymm0, %%ymm12\n"
          "vaddps %%ymm5, %%ymm1, %%ymm9\n"
          "vsubps %%ymm5, %%ymm1, %%ymm13\n"
          "vaddps %%ymm6, %%ymm2, %%ymm10\n"
          "vsubps %%ymm6, %%ymm2, %%ymm14\n"
          "vaddps %%ymm7, %%ymm3, %%ymm11\n"
          "vsubps %%ymm7, %%ymm3, %%ymm15\n"
          "vmovups %%ymm8, (%0)\n"
          "vmovups %%ymm9, (%1)\n"
          "vmovups %%ymm10, (%2)\n"
          "vmovups %%ymm11, (%3)\n"
          "vmovups %%ymm12, (%4)\n"
          "vmovups %%ymm13, (%5)\n"
          "vmovups %%ymm14, (%6)\n"
          "vmovups %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 21) {
    /* Eight depth-18 sub-transforms, then radix-8 merge at stride 262144. */
    helper_float_23_recursive(buf + 0, 18);
    helper_float_23_recursive(buf + 262144, 18);
    helper_float_23_recursive(buf + 524288, 18);
    helper_float_23_recursive(buf + 786432, 18);
    helper_float_23_recursive(buf + 1048576, 18);
    helper_float_23_recursive(buf + 1310720, 18);
    helper_float_23_recursive(buf + 1572864, 18);
    helper_float_23_recursive(buf + 1835008, 18);
    for (int j = 0; j < 2097152; j += 2097152) {
      for (int k = 0; k < 262144; k += 8) {
        __asm__ volatile (
          "vmovups (%0), %%ymm0\n"
          "vmovups (%1), %%ymm1\n"
          "vmovups (%2), %%ymm2\n"
          "vmovups (%3), %%ymm3\n"
          "vmovups (%4), %%ymm4\n"
          "vmovups (%5), %%ymm5\n"
          "vmovups (%6), %%ymm6\n"
          "vmovups (%7), %%ymm7\n"
          "vaddps %%ymm1, %%ymm0, %%ymm8\n"
          "vsubps %%ymm1, %%ymm0, %%ymm9\n"
          "vaddps %%ymm3, %%ymm2, %%ymm10\n"
          "vsubps %%ymm3, %%ymm2, %%ymm11\n"
          "vaddps %%ymm5, %%ymm4, %%ymm12\n"
          "vsubps %%ymm5, %%ymm4, %%ymm13\n"
          "vaddps %%ymm7, %%ymm6, %%ymm14\n"
          "vsubps %%ymm7, %%ymm6, %%ymm15\n"
          "vaddps %%ymm10, %%ymm8, %%ymm0\n"
          "vsubps %%ymm10, %%ymm8, %%ymm2\n"
          "vaddps %%ymm11, %%ymm9, %%ymm1\n"
          "vsubps %%ymm11, %%ymm9, %%ymm3\n"
          "vaddps %%ymm14, %%ymm12, %%ymm4\n"
          "vsubps %%ymm14, %%ymm12, %%ymm6\n"
          "vaddps %%ymm15, %%ymm13, %%ymm5\n"
          "vsubps %%ymm15, %%ymm13, %%ymm7\n"
          "vaddps %%ymm4, %%ymm0, %%ymm8\n"
          "vsubps %%ymm4, %%ymm0, %%ymm12\n"
          "vaddps %%ymm5, %%ymm1, %%ymm9\n"
          "vsubps %%ymm5, %%ymm1, %%ymm13\n"
          "vaddps %%ymm6, %%ymm2, %%ymm10\n"
          "vsubps %%ymm6, %%ymm2, %%ymm14\n"
          "vaddps %%ymm7, %%ymm3, %%ymm11\n"
          "vsubps %%ymm7, %%ymm3, %%ymm15\n"
          "vmovups %%ymm8, (%0)\n"
          "vmovups %%ymm9, (%1)\n"
          "vmovups %%ymm10, (%2)\n"
          "vmovups %%ymm11, (%3)\n"
          "vmovups %%ymm12, (%4)\n"
          "vmovups %%ymm13, (%5)\n"
          "vmovups %%ymm14, (%6)\n"
          "vmovups %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1310720), "r"(buf + j + k + 1572864), "r"(buf + j + k + 1835008) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 23) {
    /* Top level: 23 = 21 + 2, so only four sub-blocks remain; merge them
     * with a radix-4 butterfly (two rounds of add/sub) at stride 2^21. */
    helper_float_23_recursive(buf + 0, 21);
    helper_float_23_recursive(buf + 2097152, 21);
    helper_float_23_recursive(buf + 4194304, 21);
    helper_float_23_recursive(buf + 6291456, 21);
    for (int j = 0; j < 8388608; j += 8388608) {
      for (int k = 0; k < 2097152; k += 8) {
        __asm__ volatile (
          "vmovups (%0), %%ymm0\n"
          "vmovups (%1), %%ymm1\n"
          "vmovups (%2), %%ymm2\n"
          "vmovups (%3), %%ymm3\n"
          "vaddps %%ymm1, %%ymm0, %%ymm8\n"
          "vsubps %%ymm1, %%ymm0, %%ymm9\n"
          "vaddps %%ymm3, %%ymm2, %%ymm10\n"
          "vsubps %%ymm3, %%ymm2, %%ymm11\n"
          "vaddps %%ymm10, %%ymm8, %%ymm0\n"
          "vsubps %%ymm10, %%ymm8, %%ymm2\n"
          "vaddps %%ymm11, %%ymm9, %%ymm1\n"
          "vsubps %%ymm11, %%ymm9, %%ymm3\n"
          "vmovups %%ymm0, (%0)\n"
          "vmovups %%ymm1, (%1)\n"
          "vmovups %%ymm2, (%2)\n"
          "vmovups %%ymm3, (%3)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 2097152), "r"(buf + j + k + 4194304), "r"(buf + j + k + 6291456) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
}
void helper_float_23(float *buf);
/*
 * In-place fast (Walsh–)Hadamard transform of 2^23 = 8388608 floats.
 *
 * buf: pointer to the 8388608-element float array, transformed in place.
 *      The recursive kernel uses unaligned AVX loads/stores (vmovups),
 *      so buf needs no particular alignment.
 *
 * Delegates to the recursive kernel at full depth (23); no scaling is
 * applied, so the result is the unnormalized transform.
 */
void helper_float_23(float *buf) {
  helper_float_23_recursive(buf, 23);
}
6062 void helper_float_24_recursive(float *buf, int depth);
helper_float_24_recursive(float * buf,int depth)6063 void helper_float_24_recursive(float *buf, int depth) {
6064 if (depth == 12) {
6065 for (int j = 0; j < 4096; j += 64) {
6066 for (int k = 0; k < 8; k += 8) {
6067 __asm__ volatile (
6068 "vmovups (%0), %%ymm0\n"
6069 "vmovups (%1), %%ymm1\n"
6070 "vmovups (%2), %%ymm2\n"
6071 "vmovups (%3), %%ymm3\n"
6072 "vmovups (%4), %%ymm4\n"
6073 "vmovups (%5), %%ymm5\n"
6074 "vmovups (%6), %%ymm6\n"
6075 "vmovups (%7), %%ymm7\n"
6076 "vpermilps $160, %%ymm0, %%ymm8\n"
6077 "vpermilps $245, %%ymm0, %%ymm9\n"
6078 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
6079 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
6080 "vaddsubps %%ymm11, %%ymm8, %%ymm0\n"
6081 "vpermilps $160, %%ymm1, %%ymm8\n"
6082 "vpermilps $245, %%ymm1, %%ymm9\n"
6083 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
6084 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
6085 "vaddsubps %%ymm11, %%ymm8, %%ymm1\n"
6086 "vpermilps $160, %%ymm2, %%ymm8\n"
6087 "vpermilps $245, %%ymm2, %%ymm9\n"
6088 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
6089 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
6090 "vaddsubps %%ymm11, %%ymm8, %%ymm2\n"
6091 "vpermilps $160, %%ymm3, %%ymm8\n"
6092 "vpermilps $245, %%ymm3, %%ymm9\n"
6093 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
6094 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
6095 "vaddsubps %%ymm11, %%ymm8, %%ymm3\n"
6096 "vpermilps $160, %%ymm4, %%ymm8\n"
6097 "vpermilps $245, %%ymm4, %%ymm9\n"
6098 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
6099 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
6100 "vaddsubps %%ymm11, %%ymm8, %%ymm4\n"
6101 "vpermilps $160, %%ymm5, %%ymm8\n"
6102 "vpermilps $245, %%ymm5, %%ymm9\n"
6103 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
6104 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
6105 "vaddsubps %%ymm11, %%ymm8, %%ymm5\n"
6106 "vpermilps $160, %%ymm6, %%ymm8\n"
6107 "vpermilps $245, %%ymm6, %%ymm9\n"
6108 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
6109 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
6110 "vaddsubps %%ymm11, %%ymm8, %%ymm6\n"
6111 "vpermilps $160, %%ymm7, %%ymm8\n"
6112 "vpermilps $245, %%ymm7, %%ymm9\n"
6113 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
6114 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
6115 "vaddsubps %%ymm11, %%ymm8, %%ymm7\n"
6116 "vpermilps $68, %%ymm0, %%ymm8\n"
6117 "vpermilps $238, %%ymm0, %%ymm9\n"
6118 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
6119 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
6120 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
6121 "vaddps %%ymm8, %%ymm12, %%ymm0\n"
6122 "vpermilps $68, %%ymm1, %%ymm8\n"
6123 "vpermilps $238, %%ymm1, %%ymm9\n"
6124 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
6125 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
6126 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
6127 "vaddps %%ymm8, %%ymm12, %%ymm1\n"
6128 "vpermilps $68, %%ymm2, %%ymm8\n"
6129 "vpermilps $238, %%ymm2, %%ymm9\n"
6130 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
6131 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
6132 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
6133 "vaddps %%ymm8, %%ymm12, %%ymm2\n"
6134 "vpermilps $68, %%ymm3, %%ymm8\n"
6135 "vpermilps $238, %%ymm3, %%ymm9\n"
6136 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
6137 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
6138 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
6139 "vaddps %%ymm8, %%ymm12, %%ymm3\n"
6140 "vpermilps $68, %%ymm4, %%ymm8\n"
6141 "vpermilps $238, %%ymm4, %%ymm9\n"
6142 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
6143 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
6144 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
6145 "vaddps %%ymm8, %%ymm12, %%ymm4\n"
6146 "vpermilps $68, %%ymm5, %%ymm8\n"
6147 "vpermilps $238, %%ymm5, %%ymm9\n"
6148 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
6149 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
6150 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
6151 "vaddps %%ymm8, %%ymm12, %%ymm5\n"
6152 "vpermilps $68, %%ymm6, %%ymm8\n"
6153 "vpermilps $238, %%ymm6, %%ymm9\n"
6154 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
6155 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
6156 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
6157 "vaddps %%ymm8, %%ymm12, %%ymm6\n"
6158 "vpermilps $68, %%ymm7, %%ymm8\n"
6159 "vpermilps $238, %%ymm7, %%ymm9\n"
6160 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
6161 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
6162 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
6163 "vaddps %%ymm8, %%ymm12, %%ymm7\n"
6164 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
6165 "vsubps %%ymm0, %%ymm8, %%ymm9\n"
6166 "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n"
6167 "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n"
6168 "vaddps %%ymm10, %%ymm11, %%ymm0\n"
6169 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
6170 "vsubps %%ymm1, %%ymm8, %%ymm9\n"
6171 "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n"
6172 "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n"
6173 "vaddps %%ymm10, %%ymm11, %%ymm1\n"
6174 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
6175 "vsubps %%ymm2, %%ymm8, %%ymm9\n"
6176 "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n"
6177 "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n"
6178 "vaddps %%ymm10, %%ymm11, %%ymm2\n"
6179 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
6180 "vsubps %%ymm3, %%ymm8, %%ymm9\n"
6181 "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n"
6182 "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n"
6183 "vaddps %%ymm10, %%ymm11, %%ymm3\n"
6184 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
6185 "vsubps %%ymm4, %%ymm8, %%ymm9\n"
6186 "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n"
6187 "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n"
6188 "vaddps %%ymm10, %%ymm11, %%ymm4\n"
6189 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
6190 "vsubps %%ymm5, %%ymm8, %%ymm9\n"
6191 "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n"
6192 "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n"
6193 "vaddps %%ymm10, %%ymm11, %%ymm5\n"
6194 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
6195 "vsubps %%ymm6, %%ymm8, %%ymm9\n"
6196 "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n"
6197 "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n"
6198 "vaddps %%ymm10, %%ymm11, %%ymm6\n"
6199 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
6200 "vsubps %%ymm7, %%ymm8, %%ymm9\n"
6201 "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n"
6202 "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n"
6203 "vaddps %%ymm10, %%ymm11, %%ymm7\n"
6204 "vaddps %%ymm1, %%ymm0, %%ymm8\n"
6205 "vsubps %%ymm1, %%ymm0, %%ymm9\n"
6206 "vaddps %%ymm3, %%ymm2, %%ymm10\n"
6207 "vsubps %%ymm3, %%ymm2, %%ymm11\n"
6208 "vaddps %%ymm5, %%ymm4, %%ymm12\n"
6209 "vsubps %%ymm5, %%ymm4, %%ymm13\n"
6210 "vaddps %%ymm7, %%ymm6, %%ymm14\n"
6211 "vsubps %%ymm7, %%ymm6, %%ymm15\n"
6212 "vaddps %%ymm10, %%ymm8, %%ymm0\n"
6213 "vsubps %%ymm10, %%ymm8, %%ymm2\n"
6214 "vaddps %%ymm11, %%ymm9, %%ymm1\n"
6215 "vsubps %%ymm11, %%ymm9, %%ymm3\n"
6216 "vaddps %%ymm14, %%ymm12, %%ymm4\n"
6217 "vsubps %%ymm14, %%ymm12, %%ymm6\n"
6218 "vaddps %%ymm15, %%ymm13, %%ymm5\n"
6219 "vsubps %%ymm15, %%ymm13, %%ymm7\n"
6220 "vaddps %%ymm4, %%ymm0, %%ymm8\n"
6221 "vsubps %%ymm4, %%ymm0, %%ymm12\n"
6222 "vaddps %%ymm5, %%ymm1, %%ymm9\n"
6223 "vsubps %%ymm5, %%ymm1, %%ymm13\n"
6224 "vaddps %%ymm6, %%ymm2, %%ymm10\n"
6225 "vsubps %%ymm6, %%ymm2, %%ymm14\n"
6226 "vaddps %%ymm7, %%ymm3, %%ymm11\n"
6227 "vsubps %%ymm7, %%ymm3, %%ymm15\n"
6228 "vmovups %%ymm8, (%0)\n"
6229 "vmovups %%ymm9, (%1)\n"
6230 "vmovups %%ymm10, (%2)\n"
6231 "vmovups %%ymm11, (%3)\n"
6232 "vmovups %%ymm12, (%4)\n"
6233 "vmovups %%ymm13, (%5)\n"
6234 "vmovups %%ymm14, (%6)\n"
6235 "vmovups %%ymm15, (%7)\n"
6236 :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
6237 );
6238 }
6239 }
6240 for (int j = 0; j < 4096; j += 512) {
6241 for (int k = 0; k < 64; k += 8) {
6242 __asm__ volatile (
6243 "vmovups (%0), %%ymm0\n"
6244 "vmovups (%1), %%ymm1\n"
6245 "vmovups (%2), %%ymm2\n"
6246 "vmovups (%3), %%ymm3\n"
6247 "vmovups (%4), %%ymm4\n"
6248 "vmovups (%5), %%ymm5\n"
6249 "vmovups (%6), %%ymm6\n"
6250 "vmovups (%7), %%ymm7\n"
6251 "vaddps %%ymm1, %%ymm0, %%ymm8\n"
6252 "vsubps %%ymm1, %%ymm0, %%ymm9\n"
6253 "vaddps %%ymm3, %%ymm2, %%ymm10\n"
6254 "vsubps %%ymm3, %%ymm2, %%ymm11\n"
6255 "vaddps %%ymm5, %%ymm4, %%ymm12\n"
6256 "vsubps %%ymm5, %%ymm4, %%ymm13\n"
6257 "vaddps %%ymm7, %%ymm6, %%ymm14\n"
6258 "vsubps %%ymm7, %%ymm6, %%ymm15\n"
6259 "vaddps %%ymm10, %%ymm8, %%ymm0\n"
6260 "vsubps %%ymm10, %%ymm8, %%ymm2\n"
6261 "vaddps %%ymm11, %%ymm9, %%ymm1\n"
6262 "vsubps %%ymm11, %%ymm9, %%ymm3\n"
6263 "vaddps %%ymm14, %%ymm12, %%ymm4\n"
6264 "vsubps %%ymm14, %%ymm12, %%ymm6\n"
6265 "vaddps %%ymm15, %%ymm13, %%ymm5\n"
6266 "vsubps %%ymm15, %%ymm13, %%ymm7\n"
6267 "vaddps %%ymm4, %%ymm0, %%ymm8\n"
6268 "vsubps %%ymm4, %%ymm0, %%ymm12\n"
6269 "vaddps %%ymm5, %%ymm1, %%ymm9\n"
6270 "vsubps %%ymm5, %%ymm1, %%ymm13\n"
6271 "vaddps %%ymm6, %%ymm2, %%ymm10\n"
6272 "vsubps %%ymm6, %%ymm2, %%ymm14\n"
6273 "vaddps %%ymm7, %%ymm3, %%ymm11\n"
6274 "vsubps %%ymm7, %%ymm3, %%ymm15\n"
6275 "vmovups %%ymm8, (%0)\n"
6276 "vmovups %%ymm9, (%1)\n"
6277 "vmovups %%ymm10, (%2)\n"
6278 "vmovups %%ymm11, (%3)\n"
6279 "vmovups %%ymm12, (%4)\n"
6280 "vmovups %%ymm13, (%5)\n"
6281 "vmovups %%ymm14, (%6)\n"
6282 "vmovups %%ymm15, (%7)\n"
6283 :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
6284 );
6285 }
6286 }
6287 for (int j = 0; j < 4096; j += 4096) {
6288 for (int k = 0; k < 512; k += 8) {
6289 __asm__ volatile (
6290 "vmovups (%0), %%ymm0\n"
6291 "vmovups (%1), %%ymm1\n"
6292 "vmovups (%2), %%ymm2\n"
6293 "vmovups (%3), %%ymm3\n"
6294 "vmovups (%4), %%ymm4\n"
6295 "vmovups (%5), %%ymm5\n"
6296 "vmovups (%6), %%ymm6\n"
6297 "vmovups (%7), %%ymm7\n"
6298 "vaddps %%ymm1, %%ymm0, %%ymm8\n"
6299 "vsubps %%ymm1, %%ymm0, %%ymm9\n"
6300 "vaddps %%ymm3, %%ymm2, %%ymm10\n"
6301 "vsubps %%ymm3, %%ymm2, %%ymm11\n"
6302 "vaddps %%ymm5, %%ymm4, %%ymm12\n"
6303 "vsubps %%ymm5, %%ymm4, %%ymm13\n"
6304 "vaddps %%ymm7, %%ymm6, %%ymm14\n"
6305 "vsubps %%ymm7, %%ymm6, %%ymm15\n"
6306 "vaddps %%ymm10, %%ymm8, %%ymm0\n"
6307 "vsubps %%ymm10, %%ymm8, %%ymm2\n"
6308 "vaddps %%ymm11, %%ymm9, %%ymm1\n"
6309 "vsubps %%ymm11, %%ymm9, %%ymm3\n"
6310 "vaddps %%ymm14, %%ymm12, %%ymm4\n"
6311 "vsubps %%ymm14, %%ymm12, %%ymm6\n"
6312 "vaddps %%ymm15, %%ymm13, %%ymm5\n"
6313 "vsubps %%ymm15, %%ymm13, %%ymm7\n"
6314 "vaddps %%ymm4, %%ymm0, %%ymm8\n"
6315 "vsubps %%ymm4, %%ymm0, %%ymm12\n"
6316 "vaddps %%ymm5, %%ymm1, %%ymm9\n"
6317 "vsubps %%ymm5, %%ymm1, %%ymm13\n"
6318 "vaddps %%ymm6, %%ymm2, %%ymm10\n"
6319 "vsubps %%ymm6, %%ymm2, %%ymm14\n"
6320 "vaddps %%ymm7, %%ymm3, %%ymm11\n"
6321 "vsubps %%ymm7, %%ymm3, %%ymm15\n"
6322 "vmovups %%ymm8, (%0)\n"
6323 "vmovups %%ymm9, (%1)\n"
6324 "vmovups %%ymm10, (%2)\n"
6325 "vmovups %%ymm11, (%3)\n"
6326 "vmovups %%ymm12, (%4)\n"
6327 "vmovups %%ymm13, (%5)\n"
6328 "vmovups %%ymm14, (%6)\n"
6329 "vmovups %%ymm15, (%7)\n"
6330 :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
6331 );
6332 }
6333 }
6334 return;
6335 }
6336 if (depth == 15) {
6337 helper_float_24_recursive(buf + 0, 12);
6338 helper_float_24_recursive(buf + 4096, 12);
6339 helper_float_24_recursive(buf + 8192, 12);
6340 helper_float_24_recursive(buf + 12288, 12);
6341 helper_float_24_recursive(buf + 16384, 12);
6342 helper_float_24_recursive(buf + 20480, 12);
6343 helper_float_24_recursive(buf + 24576, 12);
6344 helper_float_24_recursive(buf + 28672, 12);
6345 for (int j = 0; j < 32768; j += 32768) {
6346 for (int k = 0; k < 4096; k += 8) {
6347 __asm__ volatile (
6348 "vmovups (%0), %%ymm0\n"
6349 "vmovups (%1), %%ymm1\n"
6350 "vmovups (%2), %%ymm2\n"
6351 "vmovups (%3), %%ymm3\n"
6352 "vmovups (%4), %%ymm4\n"
6353 "vmovups (%5), %%ymm5\n"
6354 "vmovups (%6), %%ymm6\n"
6355 "vmovups (%7), %%ymm7\n"
6356 "vaddps %%ymm1, %%ymm0, %%ymm8\n"
6357 "vsubps %%ymm1, %%ymm0, %%ymm9\n"
6358 "vaddps %%ymm3, %%ymm2, %%ymm10\n"
6359 "vsubps %%ymm3, %%ymm2, %%ymm11\n"
6360 "vaddps %%ymm5, %%ymm4, %%ymm12\n"
6361 "vsubps %%ymm5, %%ymm4, %%ymm13\n"
6362 "vaddps %%ymm7, %%ymm6, %%ymm14\n"
6363 "vsubps %%ymm7, %%ymm6, %%ymm15\n"
6364 "vaddps %%ymm10, %%ymm8, %%ymm0\n"
6365 "vsubps %%ymm10, %%ymm8, %%ymm2\n"
6366 "vaddps %%ymm11, %%ymm9, %%ymm1\n"
6367 "vsubps %%ymm11, %%ymm9, %%ymm3\n"
6368 "vaddps %%ymm14, %%ymm12, %%ymm4\n"
6369 "vsubps %%ymm14, %%ymm12, %%ymm6\n"
6370 "vaddps %%ymm15, %%ymm13, %%ymm5\n"
6371 "vsubps %%ymm15, %%ymm13, %%ymm7\n"
6372 "vaddps %%ymm4, %%ymm0, %%ymm8\n"
6373 "vsubps %%ymm4, %%ymm0, %%ymm12\n"
6374 "vaddps %%ymm5, %%ymm1, %%ymm9\n"
6375 "vsubps %%ymm5, %%ymm1, %%ymm13\n"
6376 "vaddps %%ymm6, %%ymm2, %%ymm10\n"
6377 "vsubps %%ymm6, %%ymm2, %%ymm14\n"
6378 "vaddps %%ymm7, %%ymm3, %%ymm11\n"
6379 "vsubps %%ymm7, %%ymm3, %%ymm15\n"
6380 "vmovups %%ymm8, (%0)\n"
6381 "vmovups %%ymm9, (%1)\n"
6382 "vmovups %%ymm10, (%2)\n"
6383 "vmovups %%ymm11, (%3)\n"
6384 "vmovups %%ymm12, (%4)\n"
6385 "vmovups %%ymm13, (%5)\n"
6386 "vmovups %%ymm14, (%6)\n"
6387 "vmovups %%ymm15, (%7)\n"
6388 :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
6389 );
6390 }
6391 }
6392 return;
6393 }
6394 if (depth == 18) {
6395 helper_float_24_recursive(buf + 0, 15);
6396 helper_float_24_recursive(buf + 32768, 15);
6397 helper_float_24_recursive(buf + 65536, 15);
6398 helper_float_24_recursive(buf + 98304, 15);
6399 helper_float_24_recursive(buf + 131072, 15);
6400 helper_float_24_recursive(buf + 163840, 15);
6401 helper_float_24_recursive(buf + 196608, 15);
6402 helper_float_24_recursive(buf + 229376, 15);
6403 for (int j = 0; j < 262144; j += 262144) {
6404 for (int k = 0; k < 32768; k += 8) {
6405 __asm__ volatile (
6406 "vmovups (%0), %%ymm0\n"
6407 "vmovups (%1), %%ymm1\n"
6408 "vmovups (%2), %%ymm2\n"
6409 "vmovups (%3), %%ymm3\n"
6410 "vmovups (%4), %%ymm4\n"
6411 "vmovups (%5), %%ymm5\n"
6412 "vmovups (%6), %%ymm6\n"
6413 "vmovups (%7), %%ymm7\n"
6414 "vaddps %%ymm1, %%ymm0, %%ymm8\n"
6415 "vsubps %%ymm1, %%ymm0, %%ymm9\n"
6416 "vaddps %%ymm3, %%ymm2, %%ymm10\n"
6417 "vsubps %%ymm3, %%ymm2, %%ymm11\n"
6418 "vaddps %%ymm5, %%ymm4, %%ymm12\n"
6419 "vsubps %%ymm5, %%ymm4, %%ymm13\n"
6420 "vaddps %%ymm7, %%ymm6, %%ymm14\n"
6421 "vsubps %%ymm7, %%ymm6, %%ymm15\n"
6422 "vaddps %%ymm10, %%ymm8, %%ymm0\n"
6423 "vsubps %%ymm10, %%ymm8, %%ymm2\n"
6424 "vaddps %%ymm11, %%ymm9, %%ymm1\n"
6425 "vsubps %%ymm11, %%ymm9, %%ymm3\n"
6426 "vaddps %%ymm14, %%ymm12, %%ymm4\n"
6427 "vsubps %%ymm14, %%ymm12, %%ymm6\n"
6428 "vaddps %%ymm15, %%ymm13, %%ymm5\n"
6429 "vsubps %%ymm15, %%ymm13, %%ymm7\n"
6430 "vaddps %%ymm4, %%ymm0, %%ymm8\n"
6431 "vsubps %%ymm4, %%ymm0, %%ymm12\n"
6432 "vaddps %%ymm5, %%ymm1, %%ymm9\n"
6433 "vsubps %%ymm5, %%ymm1, %%ymm13\n"
6434 "vaddps %%ymm6, %%ymm2, %%ymm10\n"
6435 "vsubps %%ymm6, %%ymm2, %%ymm14\n"
6436 "vaddps %%ymm7, %%ymm3, %%ymm11\n"
6437 "vsubps %%ymm7, %%ymm3, %%ymm15\n"
6438 "vmovups %%ymm8, (%0)\n"
6439 "vmovups %%ymm9, (%1)\n"
6440 "vmovups %%ymm10, (%2)\n"
6441 "vmovups %%ymm11, (%3)\n"
6442 "vmovups %%ymm12, (%4)\n"
6443 "vmovups %%ymm13, (%5)\n"
6444 "vmovups %%ymm14, (%6)\n"
6445 "vmovups %%ymm15, (%7)\n"
6446 :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
6447 );
6448 }
6449 }
6450 return;
6451 }
6452 if (depth == 21) {
6453 helper_float_24_recursive(buf + 0, 18);
6454 helper_float_24_recursive(buf + 262144, 18);
6455 helper_float_24_recursive(buf + 524288, 18);
6456 helper_float_24_recursive(buf + 786432, 18);
6457 helper_float_24_recursive(buf + 1048576, 18);
6458 helper_float_24_recursive(buf + 1310720, 18);
6459 helper_float_24_recursive(buf + 1572864, 18);
6460 helper_float_24_recursive(buf + 1835008, 18);
6461 for (int j = 0; j < 2097152; j += 2097152) {
6462 for (int k = 0; k < 262144; k += 8) {
6463 __asm__ volatile (
6464 "vmovups (%0), %%ymm0\n"
6465 "vmovups (%1), %%ymm1\n"
6466 "vmovups (%2), %%ymm2\n"
6467 "vmovups (%3), %%ymm3\n"
6468 "vmovups (%4), %%ymm4\n"
6469 "vmovups (%5), %%ymm5\n"
6470 "vmovups (%6), %%ymm6\n"
6471 "vmovups (%7), %%ymm7\n"
6472 "vaddps %%ymm1, %%ymm0, %%ymm8\n"
6473 "vsubps %%ymm1, %%ymm0, %%ymm9\n"
6474 "vaddps %%ymm3, %%ymm2, %%ymm10\n"
6475 "vsubps %%ymm3, %%ymm2, %%ymm11\n"
6476 "vaddps %%ymm5, %%ymm4, %%ymm12\n"
6477 "vsubps %%ymm5, %%ymm4, %%ymm13\n"
6478 "vaddps %%ymm7, %%ymm6, %%ymm14\n"
6479 "vsubps %%ymm7, %%ymm6, %%ymm15\n"
6480 "vaddps %%ymm10, %%ymm8, %%ymm0\n"
6481 "vsubps %%ymm10, %%ymm8, %%ymm2\n"
6482 "vaddps %%ymm11, %%ymm9, %%ymm1\n"
6483 "vsubps %%ymm11, %%ymm9, %%ymm3\n"
6484 "vaddps %%ymm14, %%ymm12, %%ymm4\n"
6485 "vsubps %%ymm14, %%ymm12, %%ymm6\n"
6486 "vaddps %%ymm15, %%ymm13, %%ymm5\n"
6487 "vsubps %%ymm15, %%ymm13, %%ymm7\n"
6488 "vaddps %%ymm4, %%ymm0, %%ymm8\n"
6489 "vsubps %%ymm4, %%ymm0, %%ymm12\n"
6490 "vaddps %%ymm5, %%ymm1, %%ymm9\n"
6491 "vsubps %%ymm5, %%ymm1, %%ymm13\n"
6492 "vaddps %%ymm6, %%ymm2, %%ymm10\n"
6493 "vsubps %%ymm6, %%ymm2, %%ymm14\n"
6494 "vaddps %%ymm7, %%ymm3, %%ymm11\n"
6495 "vsubps %%ymm7, %%ymm3, %%ymm15\n"
6496 "vmovups %%ymm8, (%0)\n"
6497 "vmovups %%ymm9, (%1)\n"
6498 "vmovups %%ymm10, (%2)\n"
6499 "vmovups %%ymm11, (%3)\n"
6500 "vmovups %%ymm12, (%4)\n"
6501 "vmovups %%ymm13, (%5)\n"
6502 "vmovups %%ymm14, (%6)\n"
6503 "vmovups %%ymm15, (%7)\n"
6504 :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1310720), "r"(buf + j + k + 1572864), "r"(buf + j + k + 1835008) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
6505 );
6506 }
6507 }
6508 return;
6509 }
6510 if (depth == 24) {
6511 helper_float_24_recursive(buf + 0, 21);
6512 helper_float_24_recursive(buf + 2097152, 21);
6513 helper_float_24_recursive(buf + 4194304, 21);
6514 helper_float_24_recursive(buf + 6291456, 21);
6515 helper_float_24_recursive(buf + 8388608, 21);
6516 helper_float_24_recursive(buf + 10485760, 21);
6517 helper_float_24_recursive(buf + 12582912, 21);
6518 helper_float_24_recursive(buf + 14680064, 21);
6519 for (int j = 0; j < 16777216; j += 16777216) {
6520 for (int k = 0; k < 2097152; k += 8) {
6521 __asm__ volatile (
6522 "vmovups (%0), %%ymm0\n"
6523 "vmovups (%1), %%ymm1\n"
6524 "vmovups (%2), %%ymm2\n"
6525 "vmovups (%3), %%ymm3\n"
6526 "vmovups (%4), %%ymm4\n"
6527 "vmovups (%5), %%ymm5\n"
6528 "vmovups (%6), %%ymm6\n"
6529 "vmovups (%7), %%ymm7\n"
6530 "vaddps %%ymm1, %%ymm0, %%ymm8\n"
6531 "vsubps %%ymm1, %%ymm0, %%ymm9\n"
6532 "vaddps %%ymm3, %%ymm2, %%ymm10\n"
6533 "vsubps %%ymm3, %%ymm2, %%ymm11\n"
6534 "vaddps %%ymm5, %%ymm4, %%ymm12\n"
6535 "vsubps %%ymm5, %%ymm4, %%ymm13\n"
6536 "vaddps %%ymm7, %%ymm6, %%ymm14\n"
6537 "vsubps %%ymm7, %%ymm6, %%ymm15\n"
6538 "vaddps %%ymm10, %%ymm8, %%ymm0\n"
6539 "vsubps %%ymm10, %%ymm8, %%ymm2\n"
6540 "vaddps %%ymm11, %%ymm9, %%ymm1\n"
6541 "vsubps %%ymm11, %%ymm9, %%ymm3\n"
6542 "vaddps %%ymm14, %%ymm12, %%ymm4\n"
6543 "vsubps %%ymm14, %%ymm12, %%ymm6\n"
6544 "vaddps %%ymm15, %%ymm13, %%ymm5\n"
6545 "vsubps %%ymm15, %%ymm13, %%ymm7\n"
6546 "vaddps %%ymm4, %%ymm0, %%ymm8\n"
6547 "vsubps %%ymm4, %%ymm0, %%ymm12\n"
6548 "vaddps %%ymm5, %%ymm1, %%ymm9\n"
6549 "vsubps %%ymm5, %%ymm1, %%ymm13\n"
6550 "vaddps %%ymm6, %%ymm2, %%ymm10\n"
6551 "vsubps %%ymm6, %%ymm2, %%ymm14\n"
6552 "vaddps %%ymm7, %%ymm3, %%ymm11\n"
6553 "vsubps %%ymm7, %%ymm3, %%ymm15\n"
6554 "vmovups %%ymm8, (%0)\n"
6555 "vmovups %%ymm9, (%1)\n"
6556 "vmovups %%ymm10, (%2)\n"
6557 "vmovups %%ymm11, (%3)\n"
6558 "vmovups %%ymm12, (%4)\n"
6559 "vmovups %%ymm13, (%5)\n"
6560 "vmovups %%ymm14, (%6)\n"
6561 "vmovups %%ymm15, (%7)\n"
6562 :: "r"(buf + j + k + 0), "r"(buf + j + k + 2097152), "r"(buf + j + k + 4194304), "r"(buf + j + k + 6291456), "r"(buf + j + k + 8388608), "r"(buf + j + k + 10485760), "r"(buf + j + k + 12582912), "r"(buf + j + k + 14680064) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
6563 );
6564 }
6565 }
6566 return;
6567 }
6568 }
void helper_float_24(float *buf);
/*
 * Entry point for the size-2^24 transform (presumably a Fast Hadamard
 * Transform, given fht.h and the add/sub butterfly kernels above —
 * TODO confirm against fht.h).
 *
 * Operates in place on the 16777216 floats in `buf` by dispatching to
 * the depth-indexed recursive AVX implementation at its top-level depth.
 */
void helper_float_24(float *buf) {
  /* log2 of the transform length this helper handles. */
  enum { HELPER_24_LOG2_N = 24 };
  helper_float_24_recursive(buf, HELPER_24_LOG2_N);
}
6573 void helper_float_25_recursive(float *buf, int depth);
helper_float_25_recursive(float * buf,int depth)6574 void helper_float_25_recursive(float *buf, int depth) {
6575 if (depth == 7) {
6576 for (int j = 0; j < 128; j += 64) {
6577 for (int k = 0; k < 8; k += 8) {
6578 __asm__ volatile (
6579 "vmovups (%0), %%ymm0\n"
6580 "vmovups (%1), %%ymm1\n"
6581 "vmovups (%2), %%ymm2\n"
6582 "vmovups (%3), %%ymm3\n"
6583 "vmovups (%4), %%ymm4\n"
6584 "vmovups (%5), %%ymm5\n"
6585 "vmovups (%6), %%ymm6\n"
6586 "vmovups (%7), %%ymm7\n"
6587 "vpermilps $160, %%ymm0, %%ymm8\n"
6588 "vpermilps $245, %%ymm0, %%ymm9\n"
6589 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
6590 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
6591 "vaddsubps %%ymm11, %%ymm8, %%ymm0\n"
6592 "vpermilps $160, %%ymm1, %%ymm8\n"
6593 "vpermilps $245, %%ymm1, %%ymm9\n"
6594 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
6595 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
6596 "vaddsubps %%ymm11, %%ymm8, %%ymm1\n"
6597 "vpermilps $160, %%ymm2, %%ymm8\n"
6598 "vpermilps $245, %%ymm2, %%ymm9\n"
6599 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
6600 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
6601 "vaddsubps %%ymm11, %%ymm8, %%ymm2\n"
6602 "vpermilps $160, %%ymm3, %%ymm8\n"
6603 "vpermilps $245, %%ymm3, %%ymm9\n"
6604 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
6605 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
6606 "vaddsubps %%ymm11, %%ymm8, %%ymm3\n"
6607 "vpermilps $160, %%ymm4, %%ymm8\n"
6608 "vpermilps $245, %%ymm4, %%ymm9\n"
6609 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
6610 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
6611 "vaddsubps %%ymm11, %%ymm8, %%ymm4\n"
6612 "vpermilps $160, %%ymm5, %%ymm8\n"
6613 "vpermilps $245, %%ymm5, %%ymm9\n"
6614 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
6615 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
6616 "vaddsubps %%ymm11, %%ymm8, %%ymm5\n"
6617 "vpermilps $160, %%ymm6, %%ymm8\n"
6618 "vpermilps $245, %%ymm6, %%ymm9\n"
6619 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
6620 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
6621 "vaddsubps %%ymm11, %%ymm8, %%ymm6\n"
6622 "vpermilps $160, %%ymm7, %%ymm8\n"
6623 "vpermilps $245, %%ymm7, %%ymm9\n"
6624 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
6625 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
6626 "vaddsubps %%ymm11, %%ymm8, %%ymm7\n"
6627 "vpermilps $68, %%ymm0, %%ymm8\n"
6628 "vpermilps $238, %%ymm0, %%ymm9\n"
6629 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
6630 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
6631 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
6632 "vaddps %%ymm8, %%ymm12, %%ymm0\n"
6633 "vpermilps $68, %%ymm1, %%ymm8\n"
6634 "vpermilps $238, %%ymm1, %%ymm9\n"
6635 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
6636 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
6637 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
6638 "vaddps %%ymm8, %%ymm12, %%ymm1\n"
6639 "vpermilps $68, %%ymm2, %%ymm8\n"
6640 "vpermilps $238, %%ymm2, %%ymm9\n"
6641 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
6642 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
6643 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
6644 "vaddps %%ymm8, %%ymm12, %%ymm2\n"
6645 "vpermilps $68, %%ymm3, %%ymm8\n"
6646 "vpermilps $238, %%ymm3, %%ymm9\n"
6647 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
6648 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
6649 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
6650 "vaddps %%ymm8, %%ymm12, %%ymm3\n"
6651 "vpermilps $68, %%ymm4, %%ymm8\n"
6652 "vpermilps $238, %%ymm4, %%ymm9\n"
6653 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
6654 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
6655 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
6656 "vaddps %%ymm8, %%ymm12, %%ymm4\n"
6657 "vpermilps $68, %%ymm5, %%ymm8\n"
6658 "vpermilps $238, %%ymm5, %%ymm9\n"
6659 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
6660 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
6661 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
6662 "vaddps %%ymm8, %%ymm12, %%ymm5\n"
6663 "vpermilps $68, %%ymm6, %%ymm8\n"
6664 "vpermilps $238, %%ymm6, %%ymm9\n"
6665 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
6666 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
6667 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
6668 "vaddps %%ymm8, %%ymm12, %%ymm6\n"
6669 "vpermilps $68, %%ymm7, %%ymm8\n"
6670 "vpermilps $238, %%ymm7, %%ymm9\n"
6671 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
6672 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
6673 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
6674 "vaddps %%ymm8, %%ymm12, %%ymm7\n"
6675 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
6676 "vsubps %%ymm0, %%ymm8, %%ymm9\n"
6677 "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n"
6678 "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n"
6679 "vaddps %%ymm10, %%ymm11, %%ymm0\n"
6680 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
6681 "vsubps %%ymm1, %%ymm8, %%ymm9\n"
6682 "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n"
6683 "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n"
6684 "vaddps %%ymm10, %%ymm11, %%ymm1\n"
6685 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
6686 "vsubps %%ymm2, %%ymm8, %%ymm9\n"
6687 "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n"
6688 "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n"
6689 "vaddps %%ymm10, %%ymm11, %%ymm2\n"
6690 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
6691 "vsubps %%ymm3, %%ymm8, %%ymm9\n"
6692 "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n"
6693 "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n"
6694 "vaddps %%ymm10, %%ymm11, %%ymm3\n"
6695 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
6696 "vsubps %%ymm4, %%ymm8, %%ymm9\n"
6697 "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n"
6698 "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n"
6699 "vaddps %%ymm10, %%ymm11, %%ymm4\n"
6700 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
6701 "vsubps %%ymm5, %%ymm8, %%ymm9\n"
6702 "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n"
6703 "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n"
6704 "vaddps %%ymm10, %%ymm11, %%ymm5\n"
6705 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
6706 "vsubps %%ymm6, %%ymm8, %%ymm9\n"
6707 "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n"
6708 "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n"
6709 "vaddps %%ymm10, %%ymm11, %%ymm6\n"
6710 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
6711 "vsubps %%ymm7, %%ymm8, %%ymm9\n"
6712 "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n"
6713 "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n"
6714 "vaddps %%ymm10, %%ymm11, %%ymm7\n"
6715 "vaddps %%ymm1, %%ymm0, %%ymm8\n"
6716 "vsubps %%ymm1, %%ymm0, %%ymm9\n"
6717 "vaddps %%ymm3, %%ymm2, %%ymm10\n"
6718 "vsubps %%ymm3, %%ymm2, %%ymm11\n"
6719 "vaddps %%ymm5, %%ymm4, %%ymm12\n"
6720 "vsubps %%ymm5, %%ymm4, %%ymm13\n"
6721 "vaddps %%ymm7, %%ymm6, %%ymm14\n"
6722 "vsubps %%ymm7, %%ymm6, %%ymm15\n"
6723 "vaddps %%ymm10, %%ymm8, %%ymm0\n"
6724 "vsubps %%ymm10, %%ymm8, %%ymm2\n"
6725 "vaddps %%ymm11, %%ymm9, %%ymm1\n"
6726 "vsubps %%ymm11, %%ymm9, %%ymm3\n"
6727 "vaddps %%ymm14, %%ymm12, %%ymm4\n"
6728 "vsubps %%ymm14, %%ymm12, %%ymm6\n"
6729 "vaddps %%ymm15, %%ymm13, %%ymm5\n"
6730 "vsubps %%ymm15, %%ymm13, %%ymm7\n"
6731 "vaddps %%ymm4, %%ymm0, %%ymm8\n"
6732 "vsubps %%ymm4, %%ymm0, %%ymm12\n"
6733 "vaddps %%ymm5, %%ymm1, %%ymm9\n"
6734 "vsubps %%ymm5, %%ymm1, %%ymm13\n"
6735 "vaddps %%ymm6, %%ymm2, %%ymm10\n"
6736 "vsubps %%ymm6, %%ymm2, %%ymm14\n"
6737 "vaddps %%ymm7, %%ymm3, %%ymm11\n"
6738 "vsubps %%ymm7, %%ymm3, %%ymm15\n"
6739 "vmovups %%ymm8, (%0)\n"
6740 "vmovups %%ymm9, (%1)\n"
6741 "vmovups %%ymm10, (%2)\n"
6742 "vmovups %%ymm11, (%3)\n"
6743 "vmovups %%ymm12, (%4)\n"
6744 "vmovups %%ymm13, (%5)\n"
6745 "vmovups %%ymm14, (%6)\n"
6746 "vmovups %%ymm15, (%7)\n"
6747 :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
6748 );
6749 }
6750 }
6751 for (int j = 0; j < 128; j += 128) {
6752 for (int k = 0; k < 64; k += 8) {
6753 __asm__ volatile (
6754 "vmovups (%0), %%ymm0\n"
6755 "vmovups (%1), %%ymm1\n"
6756 "vaddps %%ymm1, %%ymm0, %%ymm8\n"
6757 "vsubps %%ymm1, %%ymm0, %%ymm9\n"
6758 "vmovups %%ymm8, (%0)\n"
6759 "vmovups %%ymm9, (%1)\n"
6760 :: "r"(buf + j + k + 0), "r"(buf + j + k + 64) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
6761 );
6762 }
6763 }
6764 return;
6765 }
6766 if (depth == 10) {
6767 helper_float_25_recursive(buf + 0, 7);
6768 helper_float_25_recursive(buf + 128, 7);
6769 helper_float_25_recursive(buf + 256, 7);
6770 helper_float_25_recursive(buf + 384, 7);
6771 helper_float_25_recursive(buf + 512, 7);
6772 helper_float_25_recursive(buf + 640, 7);
6773 helper_float_25_recursive(buf + 768, 7);
6774 helper_float_25_recursive(buf + 896, 7);
6775 for (int j = 0; j < 1024; j += 1024) {
6776 for (int k = 0; k < 128; k += 8) {
6777 __asm__ volatile (
6778 "vmovups (%0), %%ymm0\n"
6779 "vmovups (%1), %%ymm1\n"
6780 "vmovups (%2), %%ymm2\n"
6781 "vmovups (%3), %%ymm3\n"
6782 "vmovups (%4), %%ymm4\n"
6783 "vmovups (%5), %%ymm5\n"
6784 "vmovups (%6), %%ymm6\n"
6785 "vmovups (%7), %%ymm7\n"
6786 "vaddps %%ymm1, %%ymm0, %%ymm8\n"
6787 "vsubps %%ymm1, %%ymm0, %%ymm9\n"
6788 "vaddps %%ymm3, %%ymm2, %%ymm10\n"
6789 "vsubps %%ymm3, %%ymm2, %%ymm11\n"
6790 "vaddps %%ymm5, %%ymm4, %%ymm12\n"
6791 "vsubps %%ymm5, %%ymm4, %%ymm13\n"
6792 "vaddps %%ymm7, %%ymm6, %%ymm14\n"
6793 "vsubps %%ymm7, %%ymm6, %%ymm15\n"
6794 "vaddps %%ymm10, %%ymm8, %%ymm0\n"
6795 "vsubps %%ymm10, %%ymm8, %%ymm2\n"
6796 "vaddps %%ymm11, %%ymm9, %%ymm1\n"
6797 "vsubps %%ymm11, %%ymm9, %%ymm3\n"
6798 "vaddps %%ymm14, %%ymm12, %%ymm4\n"
6799 "vsubps %%ymm14, %%ymm12, %%ymm6\n"
6800 "vaddps %%ymm15, %%ymm13, %%ymm5\n"
6801 "vsubps %%ymm15, %%ymm13, %%ymm7\n"
6802 "vaddps %%ymm4, %%ymm0, %%ymm8\n"
6803 "vsubps %%ymm4, %%ymm0, %%ymm12\n"
6804 "vaddps %%ymm5, %%ymm1, %%ymm9\n"
6805 "vsubps %%ymm5, %%ymm1, %%ymm13\n"
6806 "vaddps %%ymm6, %%ymm2, %%ymm10\n"
6807 "vsubps %%ymm6, %%ymm2, %%ymm14\n"
6808 "vaddps %%ymm7, %%ymm3, %%ymm11\n"
6809 "vsubps %%ymm7, %%ymm3, %%ymm15\n"
6810 "vmovups %%ymm8, (%0)\n"
6811 "vmovups %%ymm9, (%1)\n"
6812 "vmovups %%ymm10, (%2)\n"
6813 "vmovups %%ymm11, (%3)\n"
6814 "vmovups %%ymm12, (%4)\n"
6815 "vmovups %%ymm13, (%5)\n"
6816 "vmovups %%ymm14, (%6)\n"
6817 "vmovups %%ymm15, (%7)\n"
6818 :: "r"(buf + j + k + 0), "r"(buf + j + k + 128), "r"(buf + j + k + 256), "r"(buf + j + k + 384), "r"(buf + j + k + 512), "r"(buf + j + k + 640), "r"(buf + j + k + 768), "r"(buf + j + k + 896) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
6819 );
6820 }
6821 }
6822 return;
6823 }
6824 if (depth == 13) {
6825 helper_float_25_recursive(buf + 0, 10);
6826 helper_float_25_recursive(buf + 1024, 10);
6827 helper_float_25_recursive(buf + 2048, 10);
6828 helper_float_25_recursive(buf + 3072, 10);
6829 helper_float_25_recursive(buf + 4096, 10);
6830 helper_float_25_recursive(buf + 5120, 10);
6831 helper_float_25_recursive(buf + 6144, 10);
6832 helper_float_25_recursive(buf + 7168, 10);
6833 for (int j = 0; j < 8192; j += 8192) {
6834 for (int k = 0; k < 1024; k += 8) {
6835 __asm__ volatile (
6836 "vmovups (%0), %%ymm0\n"
6837 "vmovups (%1), %%ymm1\n"
6838 "vmovups (%2), %%ymm2\n"
6839 "vmovups (%3), %%ymm3\n"
6840 "vmovups (%4), %%ymm4\n"
6841 "vmovups (%5), %%ymm5\n"
6842 "vmovups (%6), %%ymm6\n"
6843 "vmovups (%7), %%ymm7\n"
6844 "vaddps %%ymm1, %%ymm0, %%ymm8\n"
6845 "vsubps %%ymm1, %%ymm0, %%ymm9\n"
6846 "vaddps %%ymm3, %%ymm2, %%ymm10\n"
6847 "vsubps %%ymm3, %%ymm2, %%ymm11\n"
6848 "vaddps %%ymm5, %%ymm4, %%ymm12\n"
6849 "vsubps %%ymm5, %%ymm4, %%ymm13\n"
6850 "vaddps %%ymm7, %%ymm6, %%ymm14\n"
6851 "vsubps %%ymm7, %%ymm6, %%ymm15\n"
6852 "vaddps %%ymm10, %%ymm8, %%ymm0\n"
6853 "vsubps %%ymm10, %%ymm8, %%ymm2\n"
6854 "vaddps %%ymm11, %%ymm9, %%ymm1\n"
6855 "vsubps %%ymm11, %%ymm9, %%ymm3\n"
6856 "vaddps %%ymm14, %%ymm12, %%ymm4\n"
6857 "vsubps %%ymm14, %%ymm12, %%ymm6\n"
6858 "vaddps %%ymm15, %%ymm13, %%ymm5\n"
6859 "vsubps %%ymm15, %%ymm13, %%ymm7\n"
6860 "vaddps %%ymm4, %%ymm0, %%ymm8\n"
6861 "vsubps %%ymm4, %%ymm0, %%ymm12\n"
6862 "vaddps %%ymm5, %%ymm1, %%ymm9\n"
6863 "vsubps %%ymm5, %%ymm1, %%ymm13\n"
6864 "vaddps %%ymm6, %%ymm2, %%ymm10\n"
6865 "vsubps %%ymm6, %%ymm2, %%ymm14\n"
6866 "vaddps %%ymm7, %%ymm3, %%ymm11\n"
6867 "vsubps %%ymm7, %%ymm3, %%ymm15\n"
6868 "vmovups %%ymm8, (%0)\n"
6869 "vmovups %%ymm9, (%1)\n"
6870 "vmovups %%ymm10, (%2)\n"
6871 "vmovups %%ymm11, (%3)\n"
6872 "vmovups %%ymm12, (%4)\n"
6873 "vmovups %%ymm13, (%5)\n"
6874 "vmovups %%ymm14, (%6)\n"
6875 "vmovups %%ymm15, (%7)\n"
6876 :: "r"(buf + j + k + 0), "r"(buf + j + k + 1024), "r"(buf + j + k + 2048), "r"(buf + j + k + 3072), "r"(buf + j + k + 4096), "r"(buf + j + k + 5120), "r"(buf + j + k + 6144), "r"(buf + j + k + 7168) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
6877 );
6878 }
6879 }
6880 return;
6881 }
6882 if (depth == 16) {
6883 helper_float_25_recursive(buf + 0, 13);
6884 helper_float_25_recursive(buf + 8192, 13);
6885 helper_float_25_recursive(buf + 16384, 13);
6886 helper_float_25_recursive(buf + 24576, 13);
6887 helper_float_25_recursive(buf + 32768, 13);
6888 helper_float_25_recursive(buf + 40960, 13);
6889 helper_float_25_recursive(buf + 49152, 13);
6890 helper_float_25_recursive(buf + 57344, 13);
6891 for (int j = 0; j < 65536; j += 65536) {
6892 for (int k = 0; k < 8192; k += 8) {
6893 __asm__ volatile (
6894 "vmovups (%0), %%ymm0\n"
6895 "vmovups (%1), %%ymm1\n"
6896 "vmovups (%2), %%ymm2\n"
6897 "vmovups (%3), %%ymm3\n"
6898 "vmovups (%4), %%ymm4\n"
6899 "vmovups (%5), %%ymm5\n"
6900 "vmovups (%6), %%ymm6\n"
6901 "vmovups (%7), %%ymm7\n"
6902 "vaddps %%ymm1, %%ymm0, %%ymm8\n"
6903 "vsubps %%ymm1, %%ymm0, %%ymm9\n"
6904 "vaddps %%ymm3, %%ymm2, %%ymm10\n"
6905 "vsubps %%ymm3, %%ymm2, %%ymm11\n"
6906 "vaddps %%ymm5, %%ymm4, %%ymm12\n"
6907 "vsubps %%ymm5, %%ymm4, %%ymm13\n"
6908 "vaddps %%ymm7, %%ymm6, %%ymm14\n"
6909 "vsubps %%ymm7, %%ymm6, %%ymm15\n"
6910 "vaddps %%ymm10, %%ymm8, %%ymm0\n"
6911 "vsubps %%ymm10, %%ymm8, %%ymm2\n"
6912 "vaddps %%ymm11, %%ymm9, %%ymm1\n"
6913 "vsubps %%ymm11, %%ymm9, %%ymm3\n"
6914 "vaddps %%ymm14, %%ymm12, %%ymm4\n"
6915 "vsubps %%ymm14, %%ymm12, %%ymm6\n"
6916 "vaddps %%ymm15, %%ymm13, %%ymm5\n"
6917 "vsubps %%ymm15, %%ymm13, %%ymm7\n"
6918 "vaddps %%ymm4, %%ymm0, %%ymm8\n"
6919 "vsubps %%ymm4, %%ymm0, %%ymm12\n"
6920 "vaddps %%ymm5, %%ymm1, %%ymm9\n"
6921 "vsubps %%ymm5, %%ymm1, %%ymm13\n"
6922 "vaddps %%ymm6, %%ymm2, %%ymm10\n"
6923 "vsubps %%ymm6, %%ymm2, %%ymm14\n"
6924 "vaddps %%ymm7, %%ymm3, %%ymm11\n"
6925 "vsubps %%ymm7, %%ymm3, %%ymm15\n"
6926 "vmovups %%ymm8, (%0)\n"
6927 "vmovups %%ymm9, (%1)\n"
6928 "vmovups %%ymm10, (%2)\n"
6929 "vmovups %%ymm11, (%3)\n"
6930 "vmovups %%ymm12, (%4)\n"
6931 "vmovups %%ymm13, (%5)\n"
6932 "vmovups %%ymm14, (%6)\n"
6933 "vmovups %%ymm15, (%7)\n"
6934 :: "r"(buf + j + k + 0), "r"(buf + j + k + 8192), "r"(buf + j + k + 16384), "r"(buf + j + k + 24576), "r"(buf + j + k + 32768), "r"(buf + j + k + 40960), "r"(buf + j + k + 49152), "r"(buf + j + k + 57344) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
6935 );
6936 }
6937 }
6938 return;
6939 }
6940 if (depth == 19) {
6941 helper_float_25_recursive(buf + 0, 16);
6942 helper_float_25_recursive(buf + 65536, 16);
6943 helper_float_25_recursive(buf + 131072, 16);
6944 helper_float_25_recursive(buf + 196608, 16);
6945 helper_float_25_recursive(buf + 262144, 16);
6946 helper_float_25_recursive(buf + 327680, 16);
6947 helper_float_25_recursive(buf + 393216, 16);
6948 helper_float_25_recursive(buf + 458752, 16);
6949 for (int j = 0; j < 524288; j += 524288) {
6950 for (int k = 0; k < 65536; k += 8) {
6951 __asm__ volatile (
6952 "vmovups (%0), %%ymm0\n"
6953 "vmovups (%1), %%ymm1\n"
6954 "vmovups (%2), %%ymm2\n"
6955 "vmovups (%3), %%ymm3\n"
6956 "vmovups (%4), %%ymm4\n"
6957 "vmovups (%5), %%ymm5\n"
6958 "vmovups (%6), %%ymm6\n"
6959 "vmovups (%7), %%ymm7\n"
6960 "vaddps %%ymm1, %%ymm0, %%ymm8\n"
6961 "vsubps %%ymm1, %%ymm0, %%ymm9\n"
6962 "vaddps %%ymm3, %%ymm2, %%ymm10\n"
6963 "vsubps %%ymm3, %%ymm2, %%ymm11\n"
6964 "vaddps %%ymm5, %%ymm4, %%ymm12\n"
6965 "vsubps %%ymm5, %%ymm4, %%ymm13\n"
6966 "vaddps %%ymm7, %%ymm6, %%ymm14\n"
6967 "vsubps %%ymm7, %%ymm6, %%ymm15\n"
6968 "vaddps %%ymm10, %%ymm8, %%ymm0\n"
6969 "vsubps %%ymm10, %%ymm8, %%ymm2\n"
6970 "vaddps %%ymm11, %%ymm9, %%ymm1\n"
6971 "vsubps %%ymm11, %%ymm9, %%ymm3\n"
6972 "vaddps %%ymm14, %%ymm12, %%ymm4\n"
6973 "vsubps %%ymm14, %%ymm12, %%ymm6\n"
6974 "vaddps %%ymm15, %%ymm13, %%ymm5\n"
6975 "vsubps %%ymm15, %%ymm13, %%ymm7\n"
6976 "vaddps %%ymm4, %%ymm0, %%ymm8\n"
6977 "vsubps %%ymm4, %%ymm0, %%ymm12\n"
6978 "vaddps %%ymm5, %%ymm1, %%ymm9\n"
6979 "vsubps %%ymm5, %%ymm1, %%ymm13\n"
6980 "vaddps %%ymm6, %%ymm2, %%ymm10\n"
6981 "vsubps %%ymm6, %%ymm2, %%ymm14\n"
6982 "vaddps %%ymm7, %%ymm3, %%ymm11\n"
6983 "vsubps %%ymm7, %%ymm3, %%ymm15\n"
6984 "vmovups %%ymm8, (%0)\n"
6985 "vmovups %%ymm9, (%1)\n"
6986 "vmovups %%ymm10, (%2)\n"
6987 "vmovups %%ymm11, (%3)\n"
6988 "vmovups %%ymm12, (%4)\n"
6989 "vmovups %%ymm13, (%5)\n"
6990 "vmovups %%ymm14, (%6)\n"
6991 "vmovups %%ymm15, (%7)\n"
6992 :: "r"(buf + j + k + 0), "r"(buf + j + k + 65536), "r"(buf + j + k + 131072), "r"(buf + j + k + 196608), "r"(buf + j + k + 262144), "r"(buf + j + k + 327680), "r"(buf + j + k + 393216), "r"(buf + j + k + 458752) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
6993 );
6994 }
6995 }
6996 return;
6997 }
6998 if (depth == 22) {
6999 helper_float_25_recursive(buf + 0, 19);
7000 helper_float_25_recursive(buf + 524288, 19);
7001 helper_float_25_recursive(buf + 1048576, 19);
7002 helper_float_25_recursive(buf + 1572864, 19);
7003 helper_float_25_recursive(buf + 2097152, 19);
7004 helper_float_25_recursive(buf + 2621440, 19);
7005 helper_float_25_recursive(buf + 3145728, 19);
7006 helper_float_25_recursive(buf + 3670016, 19);
7007 for (int j = 0; j < 4194304; j += 4194304) {
7008 for (int k = 0; k < 524288; k += 8) {
7009 __asm__ volatile (
7010 "vmovups (%0), %%ymm0\n"
7011 "vmovups (%1), %%ymm1\n"
7012 "vmovups (%2), %%ymm2\n"
7013 "vmovups (%3), %%ymm3\n"
7014 "vmovups (%4), %%ymm4\n"
7015 "vmovups (%5), %%ymm5\n"
7016 "vmovups (%6), %%ymm6\n"
7017 "vmovups (%7), %%ymm7\n"
7018 "vaddps %%ymm1, %%ymm0, %%ymm8\n"
7019 "vsubps %%ymm1, %%ymm0, %%ymm9\n"
7020 "vaddps %%ymm3, %%ymm2, %%ymm10\n"
7021 "vsubps %%ymm3, %%ymm2, %%ymm11\n"
7022 "vaddps %%ymm5, %%ymm4, %%ymm12\n"
7023 "vsubps %%ymm5, %%ymm4, %%ymm13\n"
7024 "vaddps %%ymm7, %%ymm6, %%ymm14\n"
7025 "vsubps %%ymm7, %%ymm6, %%ymm15\n"
7026 "vaddps %%ymm10, %%ymm8, %%ymm0\n"
7027 "vsubps %%ymm10, %%ymm8, %%ymm2\n"
7028 "vaddps %%ymm11, %%ymm9, %%ymm1\n"
7029 "vsubps %%ymm11, %%ymm9, %%ymm3\n"
7030 "vaddps %%ymm14, %%ymm12, %%ymm4\n"
7031 "vsubps %%ymm14, %%ymm12, %%ymm6\n"
7032 "vaddps %%ymm15, %%ymm13, %%ymm5\n"
7033 "vsubps %%ymm15, %%ymm13, %%ymm7\n"
7034 "vaddps %%ymm4, %%ymm0, %%ymm8\n"
7035 "vsubps %%ymm4, %%ymm0, %%ymm12\n"
7036 "vaddps %%ymm5, %%ymm1, %%ymm9\n"
7037 "vsubps %%ymm5, %%ymm1, %%ymm13\n"
7038 "vaddps %%ymm6, %%ymm2, %%ymm10\n"
7039 "vsubps %%ymm6, %%ymm2, %%ymm14\n"
7040 "vaddps %%ymm7, %%ymm3, %%ymm11\n"
7041 "vsubps %%ymm7, %%ymm3, %%ymm15\n"
7042 "vmovups %%ymm8, (%0)\n"
7043 "vmovups %%ymm9, (%1)\n"
7044 "vmovups %%ymm10, (%2)\n"
7045 "vmovups %%ymm11, (%3)\n"
7046 "vmovups %%ymm12, (%4)\n"
7047 "vmovups %%ymm13, (%5)\n"
7048 "vmovups %%ymm14, (%6)\n"
7049 "vmovups %%ymm15, (%7)\n"
7050 :: "r"(buf + j + k + 0), "r"(buf + j + k + 524288), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1572864), "r"(buf + j + k + 2097152), "r"(buf + j + k + 2621440), "r"(buf + j + k + 3145728), "r"(buf + j + k + 3670016) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
7051 );
7052 }
7053 }
7054 return;
7055 }
7056 if (depth == 25) {
7057 helper_float_25_recursive(buf + 0, 22);
7058 helper_float_25_recursive(buf + 4194304, 22);
7059 helper_float_25_recursive(buf + 8388608, 22);
7060 helper_float_25_recursive(buf + 12582912, 22);
7061 helper_float_25_recursive(buf + 16777216, 22);
7062 helper_float_25_recursive(buf + 20971520, 22);
7063 helper_float_25_recursive(buf + 25165824, 22);
7064 helper_float_25_recursive(buf + 29360128, 22);
7065 for (int j = 0; j < 33554432; j += 33554432) {
7066 for (int k = 0; k < 4194304; k += 8) {
7067 __asm__ volatile (
7068 "vmovups (%0), %%ymm0\n"
7069 "vmovups (%1), %%ymm1\n"
7070 "vmovups (%2), %%ymm2\n"
7071 "vmovups (%3), %%ymm3\n"
7072 "vmovups (%4), %%ymm4\n"
7073 "vmovups (%5), %%ymm5\n"
7074 "vmovups (%6), %%ymm6\n"
7075 "vmovups (%7), %%ymm7\n"
7076 "vaddps %%ymm1, %%ymm0, %%ymm8\n"
7077 "vsubps %%ymm1, %%ymm0, %%ymm9\n"
7078 "vaddps %%ymm3, %%ymm2, %%ymm10\n"
7079 "vsubps %%ymm3, %%ymm2, %%ymm11\n"
7080 "vaddps %%ymm5, %%ymm4, %%ymm12\n"
7081 "vsubps %%ymm5, %%ymm4, %%ymm13\n"
7082 "vaddps %%ymm7, %%ymm6, %%ymm14\n"
7083 "vsubps %%ymm7, %%ymm6, %%ymm15\n"
7084 "vaddps %%ymm10, %%ymm8, %%ymm0\n"
7085 "vsubps %%ymm10, %%ymm8, %%ymm2\n"
7086 "vaddps %%ymm11, %%ymm9, %%ymm1\n"
7087 "vsubps %%ymm11, %%ymm9, %%ymm3\n"
7088 "vaddps %%ymm14, %%ymm12, %%ymm4\n"
7089 "vsubps %%ymm14, %%ymm12, %%ymm6\n"
7090 "vaddps %%ymm15, %%ymm13, %%ymm5\n"
7091 "vsubps %%ymm15, %%ymm13, %%ymm7\n"
7092 "vaddps %%ymm4, %%ymm0, %%ymm8\n"
7093 "vsubps %%ymm4, %%ymm0, %%ymm12\n"
7094 "vaddps %%ymm5, %%ymm1, %%ymm9\n"
7095 "vsubps %%ymm5, %%ymm1, %%ymm13\n"
7096 "vaddps %%ymm6, %%ymm2, %%ymm10\n"
7097 "vsubps %%ymm6, %%ymm2, %%ymm14\n"
7098 "vaddps %%ymm7, %%ymm3, %%ymm11\n"
7099 "vsubps %%ymm7, %%ymm3, %%ymm15\n"
7100 "vmovups %%ymm8, (%0)\n"
7101 "vmovups %%ymm9, (%1)\n"
7102 "vmovups %%ymm10, (%2)\n"
7103 "vmovups %%ymm11, (%3)\n"
7104 "vmovups %%ymm12, (%4)\n"
7105 "vmovups %%ymm13, (%5)\n"
7106 "vmovups %%ymm14, (%6)\n"
7107 "vmovups %%ymm15, (%7)\n"
7108 :: "r"(buf + j + k + 0), "r"(buf + j + k + 4194304), "r"(buf + j + k + 8388608), "r"(buf + j + k + 12582912), "r"(buf + j + k + 16777216), "r"(buf + j + k + 20971520), "r"(buf + j + k + 25165824), "r"(buf + j + k + 29360128) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
7109 );
7110 }
7111 }
7112 return;
7113 }
7114 }
/* Public entry point: unnormalized, in-place Walsh-Hadamard transform of
 * 2^25 floats. Thin dispatcher over the recursive kernel. */
void helper_float_25(float *buf);
void helper_float_25(float *buf) {
  enum { LOG2_N25 = 25 }; /* transform length is 2^25 elements */
  helper_float_25_recursive(buf, LOG2_N25);
}
7119 void helper_float_26_recursive(float *buf, int depth);
helper_float_26_recursive(float * buf,int depth)7120 void helper_float_26_recursive(float *buf, int depth) {
7121 if (depth == 12) {
7122 for (int j = 0; j < 4096; j += 64) {
7123 for (int k = 0; k < 8; k += 8) {
7124 __asm__ volatile (
7125 "vmovups (%0), %%ymm0\n"
7126 "vmovups (%1), %%ymm1\n"
7127 "vmovups (%2), %%ymm2\n"
7128 "vmovups (%3), %%ymm3\n"
7129 "vmovups (%4), %%ymm4\n"
7130 "vmovups (%5), %%ymm5\n"
7131 "vmovups (%6), %%ymm6\n"
7132 "vmovups (%7), %%ymm7\n"
7133 "vpermilps $160, %%ymm0, %%ymm8\n"
7134 "vpermilps $245, %%ymm0, %%ymm9\n"
7135 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
7136 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
7137 "vaddsubps %%ymm11, %%ymm8, %%ymm0\n"
7138 "vpermilps $160, %%ymm1, %%ymm8\n"
7139 "vpermilps $245, %%ymm1, %%ymm9\n"
7140 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
7141 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
7142 "vaddsubps %%ymm11, %%ymm8, %%ymm1\n"
7143 "vpermilps $160, %%ymm2, %%ymm8\n"
7144 "vpermilps $245, %%ymm2, %%ymm9\n"
7145 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
7146 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
7147 "vaddsubps %%ymm11, %%ymm8, %%ymm2\n"
7148 "vpermilps $160, %%ymm3, %%ymm8\n"
7149 "vpermilps $245, %%ymm3, %%ymm9\n"
7150 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
7151 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
7152 "vaddsubps %%ymm11, %%ymm8, %%ymm3\n"
7153 "vpermilps $160, %%ymm4, %%ymm8\n"
7154 "vpermilps $245, %%ymm4, %%ymm9\n"
7155 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
7156 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
7157 "vaddsubps %%ymm11, %%ymm8, %%ymm4\n"
7158 "vpermilps $160, %%ymm5, %%ymm8\n"
7159 "vpermilps $245, %%ymm5, %%ymm9\n"
7160 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
7161 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
7162 "vaddsubps %%ymm11, %%ymm8, %%ymm5\n"
7163 "vpermilps $160, %%ymm6, %%ymm8\n"
7164 "vpermilps $245, %%ymm6, %%ymm9\n"
7165 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
7166 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
7167 "vaddsubps %%ymm11, %%ymm8, %%ymm6\n"
7168 "vpermilps $160, %%ymm7, %%ymm8\n"
7169 "vpermilps $245, %%ymm7, %%ymm9\n"
7170 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
7171 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
7172 "vaddsubps %%ymm11, %%ymm8, %%ymm7\n"
7173 "vpermilps $68, %%ymm0, %%ymm8\n"
7174 "vpermilps $238, %%ymm0, %%ymm9\n"
7175 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
7176 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
7177 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
7178 "vaddps %%ymm8, %%ymm12, %%ymm0\n"
7179 "vpermilps $68, %%ymm1, %%ymm8\n"
7180 "vpermilps $238, %%ymm1, %%ymm9\n"
7181 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
7182 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
7183 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
7184 "vaddps %%ymm8, %%ymm12, %%ymm1\n"
7185 "vpermilps $68, %%ymm2, %%ymm8\n"
7186 "vpermilps $238, %%ymm2, %%ymm9\n"
7187 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
7188 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
7189 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
7190 "vaddps %%ymm8, %%ymm12, %%ymm2\n"
7191 "vpermilps $68, %%ymm3, %%ymm8\n"
7192 "vpermilps $238, %%ymm3, %%ymm9\n"
7193 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
7194 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
7195 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
7196 "vaddps %%ymm8, %%ymm12, %%ymm3\n"
7197 "vpermilps $68, %%ymm4, %%ymm8\n"
7198 "vpermilps $238, %%ymm4, %%ymm9\n"
7199 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
7200 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
7201 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
7202 "vaddps %%ymm8, %%ymm12, %%ymm4\n"
7203 "vpermilps $68, %%ymm5, %%ymm8\n"
7204 "vpermilps $238, %%ymm5, %%ymm9\n"
7205 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
7206 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
7207 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
7208 "vaddps %%ymm8, %%ymm12, %%ymm5\n"
7209 "vpermilps $68, %%ymm6, %%ymm8\n"
7210 "vpermilps $238, %%ymm6, %%ymm9\n"
7211 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
7212 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
7213 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
7214 "vaddps %%ymm8, %%ymm12, %%ymm6\n"
7215 "vpermilps $68, %%ymm7, %%ymm8\n"
7216 "vpermilps $238, %%ymm7, %%ymm9\n"
7217 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
7218 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
7219 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
7220 "vaddps %%ymm8, %%ymm12, %%ymm7\n"
7221 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
7222 "vsubps %%ymm0, %%ymm8, %%ymm9\n"
7223 "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n"
7224 "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n"
7225 "vaddps %%ymm10, %%ymm11, %%ymm0\n"
7226 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
7227 "vsubps %%ymm1, %%ymm8, %%ymm9\n"
7228 "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n"
7229 "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n"
7230 "vaddps %%ymm10, %%ymm11, %%ymm1\n"
7231 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
7232 "vsubps %%ymm2, %%ymm8, %%ymm9\n"
7233 "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n"
7234 "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n"
7235 "vaddps %%ymm10, %%ymm11, %%ymm2\n"
7236 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
7237 "vsubps %%ymm3, %%ymm8, %%ymm9\n"
7238 "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n"
7239 "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n"
7240 "vaddps %%ymm10, %%ymm11, %%ymm3\n"
7241 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
7242 "vsubps %%ymm4, %%ymm8, %%ymm9\n"
7243 "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n"
7244 "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n"
7245 "vaddps %%ymm10, %%ymm11, %%ymm4\n"
7246 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
7247 "vsubps %%ymm5, %%ymm8, %%ymm9\n"
7248 "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n"
7249 "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n"
7250 "vaddps %%ymm10, %%ymm11, %%ymm5\n"
7251 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
7252 "vsubps %%ymm6, %%ymm8, %%ymm9\n"
7253 "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n"
7254 "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n"
7255 "vaddps %%ymm10, %%ymm11, %%ymm6\n"
7256 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
7257 "vsubps %%ymm7, %%ymm8, %%ymm9\n"
7258 "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n"
7259 "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n"
7260 "vaddps %%ymm10, %%ymm11, %%ymm7\n"
7261 "vaddps %%ymm1, %%ymm0, %%ymm8\n"
7262 "vsubps %%ymm1, %%ymm0, %%ymm9\n"
7263 "vaddps %%ymm3, %%ymm2, %%ymm10\n"
7264 "vsubps %%ymm3, %%ymm2, %%ymm11\n"
7265 "vaddps %%ymm5, %%ymm4, %%ymm12\n"
7266 "vsubps %%ymm5, %%ymm4, %%ymm13\n"
7267 "vaddps %%ymm7, %%ymm6, %%ymm14\n"
7268 "vsubps %%ymm7, %%ymm6, %%ymm15\n"
7269 "vaddps %%ymm10, %%ymm8, %%ymm0\n"
7270 "vsubps %%ymm10, %%ymm8, %%ymm2\n"
7271 "vaddps %%ymm11, %%ymm9, %%ymm1\n"
7272 "vsubps %%ymm11, %%ymm9, %%ymm3\n"
7273 "vaddps %%ymm14, %%ymm12, %%ymm4\n"
7274 "vsubps %%ymm14, %%ymm12, %%ymm6\n"
7275 "vaddps %%ymm15, %%ymm13, %%ymm5\n"
7276 "vsubps %%ymm15, %%ymm13, %%ymm7\n"
7277 "vaddps %%ymm4, %%ymm0, %%ymm8\n"
7278 "vsubps %%ymm4, %%ymm0, %%ymm12\n"
7279 "vaddps %%ymm5, %%ymm1, %%ymm9\n"
7280 "vsubps %%ymm5, %%ymm1, %%ymm13\n"
7281 "vaddps %%ymm6, %%ymm2, %%ymm10\n"
7282 "vsubps %%ymm6, %%ymm2, %%ymm14\n"
7283 "vaddps %%ymm7, %%ymm3, %%ymm11\n"
7284 "vsubps %%ymm7, %%ymm3, %%ymm15\n"
7285 "vmovups %%ymm8, (%0)\n"
7286 "vmovups %%ymm9, (%1)\n"
7287 "vmovups %%ymm10, (%2)\n"
7288 "vmovups %%ymm11, (%3)\n"
7289 "vmovups %%ymm12, (%4)\n"
7290 "vmovups %%ymm13, (%5)\n"
7291 "vmovups %%ymm14, (%6)\n"
7292 "vmovups %%ymm15, (%7)\n"
7293 :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
7294 );
7295 }
7296 }
7297 for (int j = 0; j < 4096; j += 512) {
7298 for (int k = 0; k < 64; k += 8) {
7299 __asm__ volatile (
7300 "vmovups (%0), %%ymm0\n"
7301 "vmovups (%1), %%ymm1\n"
7302 "vmovups (%2), %%ymm2\n"
7303 "vmovups (%3), %%ymm3\n"
7304 "vmovups (%4), %%ymm4\n"
7305 "vmovups (%5), %%ymm5\n"
7306 "vmovups (%6), %%ymm6\n"
7307 "vmovups (%7), %%ymm7\n"
7308 "vaddps %%ymm1, %%ymm0, %%ymm8\n"
7309 "vsubps %%ymm1, %%ymm0, %%ymm9\n"
7310 "vaddps %%ymm3, %%ymm2, %%ymm10\n"
7311 "vsubps %%ymm3, %%ymm2, %%ymm11\n"
7312 "vaddps %%ymm5, %%ymm4, %%ymm12\n"
7313 "vsubps %%ymm5, %%ymm4, %%ymm13\n"
7314 "vaddps %%ymm7, %%ymm6, %%ymm14\n"
7315 "vsubps %%ymm7, %%ymm6, %%ymm15\n"
7316 "vaddps %%ymm10, %%ymm8, %%ymm0\n"
7317 "vsubps %%ymm10, %%ymm8, %%ymm2\n"
7318 "vaddps %%ymm11, %%ymm9, %%ymm1\n"
7319 "vsubps %%ymm11, %%ymm9, %%ymm3\n"
7320 "vaddps %%ymm14, %%ymm12, %%ymm4\n"
7321 "vsubps %%ymm14, %%ymm12, %%ymm6\n"
7322 "vaddps %%ymm15, %%ymm13, %%ymm5\n"
7323 "vsubps %%ymm15, %%ymm13, %%ymm7\n"
7324 "vaddps %%ymm4, %%ymm0, %%ymm8\n"
7325 "vsubps %%ymm4, %%ymm0, %%ymm12\n"
7326 "vaddps %%ymm5, %%ymm1, %%ymm9\n"
7327 "vsubps %%ymm5, %%ymm1, %%ymm13\n"
7328 "vaddps %%ymm6, %%ymm2, %%ymm10\n"
7329 "vsubps %%ymm6, %%ymm2, %%ymm14\n"
7330 "vaddps %%ymm7, %%ymm3, %%ymm11\n"
7331 "vsubps %%ymm7, %%ymm3, %%ymm15\n"
7332 "vmovups %%ymm8, (%0)\n"
7333 "vmovups %%ymm9, (%1)\n"
7334 "vmovups %%ymm10, (%2)\n"
7335 "vmovups %%ymm11, (%3)\n"
7336 "vmovups %%ymm12, (%4)\n"
7337 "vmovups %%ymm13, (%5)\n"
7338 "vmovups %%ymm14, (%6)\n"
7339 "vmovups %%ymm15, (%7)\n"
7340 :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
7341 );
7342 }
7343 }
7344 for (int j = 0; j < 4096; j += 4096) {
7345 for (int k = 0; k < 512; k += 8) {
7346 __asm__ volatile (
7347 "vmovups (%0), %%ymm0\n"
7348 "vmovups (%1), %%ymm1\n"
7349 "vmovups (%2), %%ymm2\n"
7350 "vmovups (%3), %%ymm3\n"
7351 "vmovups (%4), %%ymm4\n"
7352 "vmovups (%5), %%ymm5\n"
7353 "vmovups (%6), %%ymm6\n"
7354 "vmovups (%7), %%ymm7\n"
7355 "vaddps %%ymm1, %%ymm0, %%ymm8\n"
7356 "vsubps %%ymm1, %%ymm0, %%ymm9\n"
7357 "vaddps %%ymm3, %%ymm2, %%ymm10\n"
7358 "vsubps %%ymm3, %%ymm2, %%ymm11\n"
7359 "vaddps %%ymm5, %%ymm4, %%ymm12\n"
7360 "vsubps %%ymm5, %%ymm4, %%ymm13\n"
7361 "vaddps %%ymm7, %%ymm6, %%ymm14\n"
7362 "vsubps %%ymm7, %%ymm6, %%ymm15\n"
7363 "vaddps %%ymm10, %%ymm8, %%ymm0\n"
7364 "vsubps %%ymm10, %%ymm8, %%ymm2\n"
7365 "vaddps %%ymm11, %%ymm9, %%ymm1\n"
7366 "vsubps %%ymm11, %%ymm9, %%ymm3\n"
7367 "vaddps %%ymm14, %%ymm12, %%ymm4\n"
7368 "vsubps %%ymm14, %%ymm12, %%ymm6\n"
7369 "vaddps %%ymm15, %%ymm13, %%ymm5\n"
7370 "vsubps %%ymm15, %%ymm13, %%ymm7\n"
7371 "vaddps %%ymm4, %%ymm0, %%ymm8\n"
7372 "vsubps %%ymm4, %%ymm0, %%ymm12\n"
7373 "vaddps %%ymm5, %%ymm1, %%ymm9\n"
7374 "vsubps %%ymm5, %%ymm1, %%ymm13\n"
7375 "vaddps %%ymm6, %%ymm2, %%ymm10\n"
7376 "vsubps %%ymm6, %%ymm2, %%ymm14\n"
7377 "vaddps %%ymm7, %%ymm3, %%ymm11\n"
7378 "vsubps %%ymm7, %%ymm3, %%ymm15\n"
7379 "vmovups %%ymm8, (%0)\n"
7380 "vmovups %%ymm9, (%1)\n"
7381 "vmovups %%ymm10, (%2)\n"
7382 "vmovups %%ymm11, (%3)\n"
7383 "vmovups %%ymm12, (%4)\n"
7384 "vmovups %%ymm13, (%5)\n"
7385 "vmovups %%ymm14, (%6)\n"
7386 "vmovups %%ymm15, (%7)\n"
7387 :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
7388 );
7389 }
7390 }
7391 return;
7392 }
7393 if (depth == 15) {
7394 helper_float_26_recursive(buf + 0, 12);
7395 helper_float_26_recursive(buf + 4096, 12);
7396 helper_float_26_recursive(buf + 8192, 12);
7397 helper_float_26_recursive(buf + 12288, 12);
7398 helper_float_26_recursive(buf + 16384, 12);
7399 helper_float_26_recursive(buf + 20480, 12);
7400 helper_float_26_recursive(buf + 24576, 12);
7401 helper_float_26_recursive(buf + 28672, 12);
7402 for (int j = 0; j < 32768; j += 32768) {
7403 for (int k = 0; k < 4096; k += 8) {
7404 __asm__ volatile (
7405 "vmovups (%0), %%ymm0\n"
7406 "vmovups (%1), %%ymm1\n"
7407 "vmovups (%2), %%ymm2\n"
7408 "vmovups (%3), %%ymm3\n"
7409 "vmovups (%4), %%ymm4\n"
7410 "vmovups (%5), %%ymm5\n"
7411 "vmovups (%6), %%ymm6\n"
7412 "vmovups (%7), %%ymm7\n"
7413 "vaddps %%ymm1, %%ymm0, %%ymm8\n"
7414 "vsubps %%ymm1, %%ymm0, %%ymm9\n"
7415 "vaddps %%ymm3, %%ymm2, %%ymm10\n"
7416 "vsubps %%ymm3, %%ymm2, %%ymm11\n"
7417 "vaddps %%ymm5, %%ymm4, %%ymm12\n"
7418 "vsubps %%ymm5, %%ymm4, %%ymm13\n"
7419 "vaddps %%ymm7, %%ymm6, %%ymm14\n"
7420 "vsubps %%ymm7, %%ymm6, %%ymm15\n"
7421 "vaddps %%ymm10, %%ymm8, %%ymm0\n"
7422 "vsubps %%ymm10, %%ymm8, %%ymm2\n"
7423 "vaddps %%ymm11, %%ymm9, %%ymm1\n"
7424 "vsubps %%ymm11, %%ymm9, %%ymm3\n"
7425 "vaddps %%ymm14, %%ymm12, %%ymm4\n"
7426 "vsubps %%ymm14, %%ymm12, %%ymm6\n"
7427 "vaddps %%ymm15, %%ymm13, %%ymm5\n"
7428 "vsubps %%ymm15, %%ymm13, %%ymm7\n"
7429 "vaddps %%ymm4, %%ymm0, %%ymm8\n"
7430 "vsubps %%ymm4, %%ymm0, %%ymm12\n"
7431 "vaddps %%ymm5, %%ymm1, %%ymm9\n"
7432 "vsubps %%ymm5, %%ymm1, %%ymm13\n"
7433 "vaddps %%ymm6, %%ymm2, %%ymm10\n"
7434 "vsubps %%ymm6, %%ymm2, %%ymm14\n"
7435 "vaddps %%ymm7, %%ymm3, %%ymm11\n"
7436 "vsubps %%ymm7, %%ymm3, %%ymm15\n"
7437 "vmovups %%ymm8, (%0)\n"
7438 "vmovups %%ymm9, (%1)\n"
7439 "vmovups %%ymm10, (%2)\n"
7440 "vmovups %%ymm11, (%3)\n"
7441 "vmovups %%ymm12, (%4)\n"
7442 "vmovups %%ymm13, (%5)\n"
7443 "vmovups %%ymm14, (%6)\n"
7444 "vmovups %%ymm15, (%7)\n"
7445 :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
7446 );
7447 }
7448 }
7449 return;
7450 }
7451 if (depth == 18) {
7452 helper_float_26_recursive(buf + 0, 15);
7453 helper_float_26_recursive(buf + 32768, 15);
7454 helper_float_26_recursive(buf + 65536, 15);
7455 helper_float_26_recursive(buf + 98304, 15);
7456 helper_float_26_recursive(buf + 131072, 15);
7457 helper_float_26_recursive(buf + 163840, 15);
7458 helper_float_26_recursive(buf + 196608, 15);
7459 helper_float_26_recursive(buf + 229376, 15);
7460 for (int j = 0; j < 262144; j += 262144) {
7461 for (int k = 0; k < 32768; k += 8) {
7462 __asm__ volatile (
7463 "vmovups (%0), %%ymm0\n"
7464 "vmovups (%1), %%ymm1\n"
7465 "vmovups (%2), %%ymm2\n"
7466 "vmovups (%3), %%ymm3\n"
7467 "vmovups (%4), %%ymm4\n"
7468 "vmovups (%5), %%ymm5\n"
7469 "vmovups (%6), %%ymm6\n"
7470 "vmovups (%7), %%ymm7\n"
7471 "vaddps %%ymm1, %%ymm0, %%ymm8\n"
7472 "vsubps %%ymm1, %%ymm0, %%ymm9\n"
7473 "vaddps %%ymm3, %%ymm2, %%ymm10\n"
7474 "vsubps %%ymm3, %%ymm2, %%ymm11\n"
7475 "vaddps %%ymm5, %%ymm4, %%ymm12\n"
7476 "vsubps %%ymm5, %%ymm4, %%ymm13\n"
7477 "vaddps %%ymm7, %%ymm6, %%ymm14\n"
7478 "vsubps %%ymm7, %%ymm6, %%ymm15\n"
7479 "vaddps %%ymm10, %%ymm8, %%ymm0\n"
7480 "vsubps %%ymm10, %%ymm8, %%ymm2\n"
7481 "vaddps %%ymm11, %%ymm9, %%ymm1\n"
7482 "vsubps %%ymm11, %%ymm9, %%ymm3\n"
7483 "vaddps %%ymm14, %%ymm12, %%ymm4\n"
7484 "vsubps %%ymm14, %%ymm12, %%ymm6\n"
7485 "vaddps %%ymm15, %%ymm13, %%ymm5\n"
7486 "vsubps %%ymm15, %%ymm13, %%ymm7\n"
7487 "vaddps %%ymm4, %%ymm0, %%ymm8\n"
7488 "vsubps %%ymm4, %%ymm0, %%ymm12\n"
7489 "vaddps %%ymm5, %%ymm1, %%ymm9\n"
7490 "vsubps %%ymm5, %%ymm1, %%ymm13\n"
7491 "vaddps %%ymm6, %%ymm2, %%ymm10\n"
7492 "vsubps %%ymm6, %%ymm2, %%ymm14\n"
7493 "vaddps %%ymm7, %%ymm3, %%ymm11\n"
7494 "vsubps %%ymm7, %%ymm3, %%ymm15\n"
7495 "vmovups %%ymm8, (%0)\n"
7496 "vmovups %%ymm9, (%1)\n"
7497 "vmovups %%ymm10, (%2)\n"
7498 "vmovups %%ymm11, (%3)\n"
7499 "vmovups %%ymm12, (%4)\n"
7500 "vmovups %%ymm13, (%5)\n"
7501 "vmovups %%ymm14, (%6)\n"
7502 "vmovups %%ymm15, (%7)\n"
7503 :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
7504 );
7505 }
7506 }
7507 return;
7508 }
7509 if (depth == 21) {
7510 helper_float_26_recursive(buf + 0, 18);
7511 helper_float_26_recursive(buf + 262144, 18);
7512 helper_float_26_recursive(buf + 524288, 18);
7513 helper_float_26_recursive(buf + 786432, 18);
7514 helper_float_26_recursive(buf + 1048576, 18);
7515 helper_float_26_recursive(buf + 1310720, 18);
7516 helper_float_26_recursive(buf + 1572864, 18);
7517 helper_float_26_recursive(buf + 1835008, 18);
7518 for (int j = 0; j < 2097152; j += 2097152) {
7519 for (int k = 0; k < 262144; k += 8) {
7520 __asm__ volatile (
7521 "vmovups (%0), %%ymm0\n"
7522 "vmovups (%1), %%ymm1\n"
7523 "vmovups (%2), %%ymm2\n"
7524 "vmovups (%3), %%ymm3\n"
7525 "vmovups (%4), %%ymm4\n"
7526 "vmovups (%5), %%ymm5\n"
7527 "vmovups (%6), %%ymm6\n"
7528 "vmovups (%7), %%ymm7\n"
7529 "vaddps %%ymm1, %%ymm0, %%ymm8\n"
7530 "vsubps %%ymm1, %%ymm0, %%ymm9\n"
7531 "vaddps %%ymm3, %%ymm2, %%ymm10\n"
7532 "vsubps %%ymm3, %%ymm2, %%ymm11\n"
7533 "vaddps %%ymm5, %%ymm4, %%ymm12\n"
7534 "vsubps %%ymm5, %%ymm4, %%ymm13\n"
7535 "vaddps %%ymm7, %%ymm6, %%ymm14\n"
7536 "vsubps %%ymm7, %%ymm6, %%ymm15\n"
7537 "vaddps %%ymm10, %%ymm8, %%ymm0\n"
7538 "vsubps %%ymm10, %%ymm8, %%ymm2\n"
7539 "vaddps %%ymm11, %%ymm9, %%ymm1\n"
7540 "vsubps %%ymm11, %%ymm9, %%ymm3\n"
7541 "vaddps %%ymm14, %%ymm12, %%ymm4\n"
7542 "vsubps %%ymm14, %%ymm12, %%ymm6\n"
7543 "vaddps %%ymm15, %%ymm13, %%ymm5\n"
7544 "vsubps %%ymm15, %%ymm13, %%ymm7\n"
7545 "vaddps %%ymm4, %%ymm0, %%ymm8\n"
7546 "vsubps %%ymm4, %%ymm0, %%ymm12\n"
7547 "vaddps %%ymm5, %%ymm1, %%ymm9\n"
7548 "vsubps %%ymm5, %%ymm1, %%ymm13\n"
7549 "vaddps %%ymm6, %%ymm2, %%ymm10\n"
7550 "vsubps %%ymm6, %%ymm2, %%ymm14\n"
7551 "vaddps %%ymm7, %%ymm3, %%ymm11\n"
7552 "vsubps %%ymm7, %%ymm3, %%ymm15\n"
7553 "vmovups %%ymm8, (%0)\n"
7554 "vmovups %%ymm9, (%1)\n"
7555 "vmovups %%ymm10, (%2)\n"
7556 "vmovups %%ymm11, (%3)\n"
7557 "vmovups %%ymm12, (%4)\n"
7558 "vmovups %%ymm13, (%5)\n"
7559 "vmovups %%ymm14, (%6)\n"
7560 "vmovups %%ymm15, (%7)\n"
7561 :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1310720), "r"(buf + j + k + 1572864), "r"(buf + j + k + 1835008) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
7562 );
7563 }
7564 }
7565 return;
7566 }
7567 if (depth == 24) {
7568 helper_float_26_recursive(buf + 0, 21);
7569 helper_float_26_recursive(buf + 2097152, 21);
7570 helper_float_26_recursive(buf + 4194304, 21);
7571 helper_float_26_recursive(buf + 6291456, 21);
7572 helper_float_26_recursive(buf + 8388608, 21);
7573 helper_float_26_recursive(buf + 10485760, 21);
7574 helper_float_26_recursive(buf + 12582912, 21);
7575 helper_float_26_recursive(buf + 14680064, 21);
7576 for (int j = 0; j < 16777216; j += 16777216) {
7577 for (int k = 0; k < 2097152; k += 8) {
7578 __asm__ volatile (
7579 "vmovups (%0), %%ymm0\n"
7580 "vmovups (%1), %%ymm1\n"
7581 "vmovups (%2), %%ymm2\n"
7582 "vmovups (%3), %%ymm3\n"
7583 "vmovups (%4), %%ymm4\n"
7584 "vmovups (%5), %%ymm5\n"
7585 "vmovups (%6), %%ymm6\n"
7586 "vmovups (%7), %%ymm7\n"
7587 "vaddps %%ymm1, %%ymm0, %%ymm8\n"
7588 "vsubps %%ymm1, %%ymm0, %%ymm9\n"
7589 "vaddps %%ymm3, %%ymm2, %%ymm10\n"
7590 "vsubps %%ymm3, %%ymm2, %%ymm11\n"
7591 "vaddps %%ymm5, %%ymm4, %%ymm12\n"
7592 "vsubps %%ymm5, %%ymm4, %%ymm13\n"
7593 "vaddps %%ymm7, %%ymm6, %%ymm14\n"
7594 "vsubps %%ymm7, %%ymm6, %%ymm15\n"
7595 "vaddps %%ymm10, %%ymm8, %%ymm0\n"
7596 "vsubps %%ymm10, %%ymm8, %%ymm2\n"
7597 "vaddps %%ymm11, %%ymm9, %%ymm1\n"
7598 "vsubps %%ymm11, %%ymm9, %%ymm3\n"
7599 "vaddps %%ymm14, %%ymm12, %%ymm4\n"
7600 "vsubps %%ymm14, %%ymm12, %%ymm6\n"
7601 "vaddps %%ymm15, %%ymm13, %%ymm5\n"
7602 "vsubps %%ymm15, %%ymm13, %%ymm7\n"
7603 "vaddps %%ymm4, %%ymm0, %%ymm8\n"
7604 "vsubps %%ymm4, %%ymm0, %%ymm12\n"
7605 "vaddps %%ymm5, %%ymm1, %%ymm9\n"
7606 "vsubps %%ymm5, %%ymm1, %%ymm13\n"
7607 "vaddps %%ymm6, %%ymm2, %%ymm10\n"
7608 "vsubps %%ymm6, %%ymm2, %%ymm14\n"
7609 "vaddps %%ymm7, %%ymm3, %%ymm11\n"
7610 "vsubps %%ymm7, %%ymm3, %%ymm15\n"
7611 "vmovups %%ymm8, (%0)\n"
7612 "vmovups %%ymm9, (%1)\n"
7613 "vmovups %%ymm10, (%2)\n"
7614 "vmovups %%ymm11, (%3)\n"
7615 "vmovups %%ymm12, (%4)\n"
7616 "vmovups %%ymm13, (%5)\n"
7617 "vmovups %%ymm14, (%6)\n"
7618 "vmovups %%ymm15, (%7)\n"
7619 :: "r"(buf + j + k + 0), "r"(buf + j + k + 2097152), "r"(buf + j + k + 4194304), "r"(buf + j + k + 6291456), "r"(buf + j + k + 8388608), "r"(buf + j + k + 10485760), "r"(buf + j + k + 12582912), "r"(buf + j + k + 14680064) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
7620 );
7621 }
7622 }
7623 return;
7624 }
7625 if (depth == 26) {
7626 helper_float_26_recursive(buf + 0, 24);
7627 helper_float_26_recursive(buf + 16777216, 24);
7628 helper_float_26_recursive(buf + 33554432, 24);
7629 helper_float_26_recursive(buf + 50331648, 24);
7630 for (int j = 0; j < 67108864; j += 67108864) {
7631 for (int k = 0; k < 16777216; k += 8) {
7632 __asm__ volatile (
7633 "vmovups (%0), %%ymm0\n"
7634 "vmovups (%1), %%ymm1\n"
7635 "vmovups (%2), %%ymm2\n"
7636 "vmovups (%3), %%ymm3\n"
7637 "vaddps %%ymm1, %%ymm0, %%ymm8\n"
7638 "vsubps %%ymm1, %%ymm0, %%ymm9\n"
7639 "vaddps %%ymm3, %%ymm2, %%ymm10\n"
7640 "vsubps %%ymm3, %%ymm2, %%ymm11\n"
7641 "vaddps %%ymm10, %%ymm8, %%ymm0\n"
7642 "vsubps %%ymm10, %%ymm8, %%ymm2\n"
7643 "vaddps %%ymm11, %%ymm9, %%ymm1\n"
7644 "vsubps %%ymm11, %%ymm9, %%ymm3\n"
7645 "vmovups %%ymm0, (%0)\n"
7646 "vmovups %%ymm1, (%1)\n"
7647 "vmovups %%ymm2, (%2)\n"
7648 "vmovups %%ymm3, (%3)\n"
7649 :: "r"(buf + j + k + 0), "r"(buf + j + k + 16777216), "r"(buf + j + k + 33554432), "r"(buf + j + k + 50331648) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
7650 );
7651 }
7652 }
7653 return;
7654 }
7655 }
/* Public entry point: unnormalized, in-place Walsh-Hadamard transform of
 * 2^26 floats. Thin dispatcher over the recursive kernel. */
void helper_float_26(float *buf);
void helper_float_26(float *buf) {
  enum { LOG2_N26 = 26 }; /* transform length is 2^26 elements */
  helper_float_26_recursive(buf, LOG2_N26);
}
7660 void helper_float_27_recursive(float *buf, int depth);
helper_float_27_recursive(float * buf,int depth)7661 void helper_float_27_recursive(float *buf, int depth) {
7662 if (depth == 12) {
7663 for (int j = 0; j < 4096; j += 64) {
7664 for (int k = 0; k < 8; k += 8) {
7665 __asm__ volatile (
7666 "vmovups (%0), %%ymm0\n"
7667 "vmovups (%1), %%ymm1\n"
7668 "vmovups (%2), %%ymm2\n"
7669 "vmovups (%3), %%ymm3\n"
7670 "vmovups (%4), %%ymm4\n"
7671 "vmovups (%5), %%ymm5\n"
7672 "vmovups (%6), %%ymm6\n"
7673 "vmovups (%7), %%ymm7\n"
7674 "vpermilps $160, %%ymm0, %%ymm8\n"
7675 "vpermilps $245, %%ymm0, %%ymm9\n"
7676 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
7677 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
7678 "vaddsubps %%ymm11, %%ymm8, %%ymm0\n"
7679 "vpermilps $160, %%ymm1, %%ymm8\n"
7680 "vpermilps $245, %%ymm1, %%ymm9\n"
7681 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
7682 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
7683 "vaddsubps %%ymm11, %%ymm8, %%ymm1\n"
7684 "vpermilps $160, %%ymm2, %%ymm8\n"
7685 "vpermilps $245, %%ymm2, %%ymm9\n"
7686 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
7687 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
7688 "vaddsubps %%ymm11, %%ymm8, %%ymm2\n"
7689 "vpermilps $160, %%ymm3, %%ymm8\n"
7690 "vpermilps $245, %%ymm3, %%ymm9\n"
7691 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
7692 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
7693 "vaddsubps %%ymm11, %%ymm8, %%ymm3\n"
7694 "vpermilps $160, %%ymm4, %%ymm8\n"
7695 "vpermilps $245, %%ymm4, %%ymm9\n"
7696 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
7697 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
7698 "vaddsubps %%ymm11, %%ymm8, %%ymm4\n"
7699 "vpermilps $160, %%ymm5, %%ymm8\n"
7700 "vpermilps $245, %%ymm5, %%ymm9\n"
7701 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
7702 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
7703 "vaddsubps %%ymm11, %%ymm8, %%ymm5\n"
7704 "vpermilps $160, %%ymm6, %%ymm8\n"
7705 "vpermilps $245, %%ymm6, %%ymm9\n"
7706 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
7707 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
7708 "vaddsubps %%ymm11, %%ymm8, %%ymm6\n"
7709 "vpermilps $160, %%ymm7, %%ymm8\n"
7710 "vpermilps $245, %%ymm7, %%ymm9\n"
7711 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
7712 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
7713 "vaddsubps %%ymm11, %%ymm8, %%ymm7\n"
7714 "vpermilps $68, %%ymm0, %%ymm8\n"
7715 "vpermilps $238, %%ymm0, %%ymm9\n"
7716 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
7717 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
7718 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
7719 "vaddps %%ymm8, %%ymm12, %%ymm0\n"
7720 "vpermilps $68, %%ymm1, %%ymm8\n"
7721 "vpermilps $238, %%ymm1, %%ymm9\n"
7722 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
7723 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
7724 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
7725 "vaddps %%ymm8, %%ymm12, %%ymm1\n"
7726 "vpermilps $68, %%ymm2, %%ymm8\n"
7727 "vpermilps $238, %%ymm2, %%ymm9\n"
7728 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
7729 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
7730 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
7731 "vaddps %%ymm8, %%ymm12, %%ymm2\n"
7732 "vpermilps $68, %%ymm3, %%ymm8\n"
7733 "vpermilps $238, %%ymm3, %%ymm9\n"
7734 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
7735 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
7736 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
7737 "vaddps %%ymm8, %%ymm12, %%ymm3\n"
7738 "vpermilps $68, %%ymm4, %%ymm8\n"
7739 "vpermilps $238, %%ymm4, %%ymm9\n"
7740 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
7741 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
7742 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
7743 "vaddps %%ymm8, %%ymm12, %%ymm4\n"
7744 "vpermilps $68, %%ymm5, %%ymm8\n"
7745 "vpermilps $238, %%ymm5, %%ymm9\n"
7746 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
7747 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
7748 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
7749 "vaddps %%ymm8, %%ymm12, %%ymm5\n"
7750 "vpermilps $68, %%ymm6, %%ymm8\n"
7751 "vpermilps $238, %%ymm6, %%ymm9\n"
7752 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
7753 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
7754 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
7755 "vaddps %%ymm8, %%ymm12, %%ymm6\n"
7756 "vpermilps $68, %%ymm7, %%ymm8\n"
7757 "vpermilps $238, %%ymm7, %%ymm9\n"
7758 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
7759 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
7760 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
7761 "vaddps %%ymm8, %%ymm12, %%ymm7\n"
7762 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
7763 "vsubps %%ymm0, %%ymm8, %%ymm9\n"
7764 "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n"
7765 "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n"
7766 "vaddps %%ymm10, %%ymm11, %%ymm0\n"
7767 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
7768 "vsubps %%ymm1, %%ymm8, %%ymm9\n"
7769 "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n"
7770 "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n"
7771 "vaddps %%ymm10, %%ymm11, %%ymm1\n"
7772 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
7773 "vsubps %%ymm2, %%ymm8, %%ymm9\n"
7774 "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n"
7775 "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n"
7776 "vaddps %%ymm10, %%ymm11, %%ymm2\n"
7777 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
7778 "vsubps %%ymm3, %%ymm8, %%ymm9\n"
7779 "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n"
7780 "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n"
7781 "vaddps %%ymm10, %%ymm11, %%ymm3\n"
7782 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
7783 "vsubps %%ymm4, %%ymm8, %%ymm9\n"
7784 "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n"
7785 "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n"
7786 "vaddps %%ymm10, %%ymm11, %%ymm4\n"
7787 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
7788 "vsubps %%ymm5, %%ymm8, %%ymm9\n"
7789 "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n"
7790 "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n"
7791 "vaddps %%ymm10, %%ymm11, %%ymm5\n"
7792 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
7793 "vsubps %%ymm6, %%ymm8, %%ymm9\n"
7794 "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n"
7795 "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n"
7796 "vaddps %%ymm10, %%ymm11, %%ymm6\n"
7797 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
7798 "vsubps %%ymm7, %%ymm8, %%ymm9\n"
7799 "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n"
7800 "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n"
7801 "vaddps %%ymm10, %%ymm11, %%ymm7\n"
7802 "vaddps %%ymm1, %%ymm0, %%ymm8\n"
7803 "vsubps %%ymm1, %%ymm0, %%ymm9\n"
7804 "vaddps %%ymm3, %%ymm2, %%ymm10\n"
7805 "vsubps %%ymm3, %%ymm2, %%ymm11\n"
7806 "vaddps %%ymm5, %%ymm4, %%ymm12\n"
7807 "vsubps %%ymm5, %%ymm4, %%ymm13\n"
7808 "vaddps %%ymm7, %%ymm6, %%ymm14\n"
7809 "vsubps %%ymm7, %%ymm6, %%ymm15\n"
7810 "vaddps %%ymm10, %%ymm8, %%ymm0\n"
7811 "vsubps %%ymm10, %%ymm8, %%ymm2\n"
7812 "vaddps %%ymm11, %%ymm9, %%ymm1\n"
7813 "vsubps %%ymm11, %%ymm9, %%ymm3\n"
7814 "vaddps %%ymm14, %%ymm12, %%ymm4\n"
7815 "vsubps %%ymm14, %%ymm12, %%ymm6\n"
7816 "vaddps %%ymm15, %%ymm13, %%ymm5\n"
7817 "vsubps %%ymm15, %%ymm13, %%ymm7\n"
7818 "vaddps %%ymm4, %%ymm0, %%ymm8\n"
7819 "vsubps %%ymm4, %%ymm0, %%ymm12\n"
7820 "vaddps %%ymm5, %%ymm1, %%ymm9\n"
7821 "vsubps %%ymm5, %%ymm1, %%ymm13\n"
7822 "vaddps %%ymm6, %%ymm2, %%ymm10\n"
7823 "vsubps %%ymm6, %%ymm2, %%ymm14\n"
7824 "vaddps %%ymm7, %%ymm3, %%ymm11\n"
7825 "vsubps %%ymm7, %%ymm3, %%ymm15\n"
7826 "vmovups %%ymm8, (%0)\n"
7827 "vmovups %%ymm9, (%1)\n"
7828 "vmovups %%ymm10, (%2)\n"
7829 "vmovups %%ymm11, (%3)\n"
7830 "vmovups %%ymm12, (%4)\n"
7831 "vmovups %%ymm13, (%5)\n"
7832 "vmovups %%ymm14, (%6)\n"
7833 "vmovups %%ymm15, (%7)\n"
7834 :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
7835 );
7836 }
7837 }
7838 for (int j = 0; j < 4096; j += 512) {
7839 for (int k = 0; k < 64; k += 8) {
7840 __asm__ volatile (
7841 "vmovups (%0), %%ymm0\n"
7842 "vmovups (%1), %%ymm1\n"
7843 "vmovups (%2), %%ymm2\n"
7844 "vmovups (%3), %%ymm3\n"
7845 "vmovups (%4), %%ymm4\n"
7846 "vmovups (%5), %%ymm5\n"
7847 "vmovups (%6), %%ymm6\n"
7848 "vmovups (%7), %%ymm7\n"
7849 "vaddps %%ymm1, %%ymm0, %%ymm8\n"
7850 "vsubps %%ymm1, %%ymm0, %%ymm9\n"
7851 "vaddps %%ymm3, %%ymm2, %%ymm10\n"
7852 "vsubps %%ymm3, %%ymm2, %%ymm11\n"
7853 "vaddps %%ymm5, %%ymm4, %%ymm12\n"
7854 "vsubps %%ymm5, %%ymm4, %%ymm13\n"
7855 "vaddps %%ymm7, %%ymm6, %%ymm14\n"
7856 "vsubps %%ymm7, %%ymm6, %%ymm15\n"
7857 "vaddps %%ymm10, %%ymm8, %%ymm0\n"
7858 "vsubps %%ymm10, %%ymm8, %%ymm2\n"
7859 "vaddps %%ymm11, %%ymm9, %%ymm1\n"
7860 "vsubps %%ymm11, %%ymm9, %%ymm3\n"
7861 "vaddps %%ymm14, %%ymm12, %%ymm4\n"
7862 "vsubps %%ymm14, %%ymm12, %%ymm6\n"
7863 "vaddps %%ymm15, %%ymm13, %%ymm5\n"
7864 "vsubps %%ymm15, %%ymm13, %%ymm7\n"
7865 "vaddps %%ymm4, %%ymm0, %%ymm8\n"
7866 "vsubps %%ymm4, %%ymm0, %%ymm12\n"
7867 "vaddps %%ymm5, %%ymm1, %%ymm9\n"
7868 "vsubps %%ymm5, %%ymm1, %%ymm13\n"
7869 "vaddps %%ymm6, %%ymm2, %%ymm10\n"
7870 "vsubps %%ymm6, %%ymm2, %%ymm14\n"
7871 "vaddps %%ymm7, %%ymm3, %%ymm11\n"
7872 "vsubps %%ymm7, %%ymm3, %%ymm15\n"
7873 "vmovups %%ymm8, (%0)\n"
7874 "vmovups %%ymm9, (%1)\n"
7875 "vmovups %%ymm10, (%2)\n"
7876 "vmovups %%ymm11, (%3)\n"
7877 "vmovups %%ymm12, (%4)\n"
7878 "vmovups %%ymm13, (%5)\n"
7879 "vmovups %%ymm14, (%6)\n"
7880 "vmovups %%ymm15, (%7)\n"
7881 :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
7882 );
7883 }
7884 }
7885 for (int j = 0; j < 4096; j += 4096) {
7886 for (int k = 0; k < 512; k += 8) {
7887 __asm__ volatile (
7888 "vmovups (%0), %%ymm0\n"
7889 "vmovups (%1), %%ymm1\n"
7890 "vmovups (%2), %%ymm2\n"
7891 "vmovups (%3), %%ymm3\n"
7892 "vmovups (%4), %%ymm4\n"
7893 "vmovups (%5), %%ymm5\n"
7894 "vmovups (%6), %%ymm6\n"
7895 "vmovups (%7), %%ymm7\n"
7896 "vaddps %%ymm1, %%ymm0, %%ymm8\n"
7897 "vsubps %%ymm1, %%ymm0, %%ymm9\n"
7898 "vaddps %%ymm3, %%ymm2, %%ymm10\n"
7899 "vsubps %%ymm3, %%ymm2, %%ymm11\n"
7900 "vaddps %%ymm5, %%ymm4, %%ymm12\n"
7901 "vsubps %%ymm5, %%ymm4, %%ymm13\n"
7902 "vaddps %%ymm7, %%ymm6, %%ymm14\n"
7903 "vsubps %%ymm7, %%ymm6, %%ymm15\n"
7904 "vaddps %%ymm10, %%ymm8, %%ymm0\n"
7905 "vsubps %%ymm10, %%ymm8, %%ymm2\n"
7906 "vaddps %%ymm11, %%ymm9, %%ymm1\n"
7907 "vsubps %%ymm11, %%ymm9, %%ymm3\n"
7908 "vaddps %%ymm14, %%ymm12, %%ymm4\n"
7909 "vsubps %%ymm14, %%ymm12, %%ymm6\n"
7910 "vaddps %%ymm15, %%ymm13, %%ymm5\n"
7911 "vsubps %%ymm15, %%ymm13, %%ymm7\n"
7912 "vaddps %%ymm4, %%ymm0, %%ymm8\n"
7913 "vsubps %%ymm4, %%ymm0, %%ymm12\n"
7914 "vaddps %%ymm5, %%ymm1, %%ymm9\n"
7915 "vsubps %%ymm5, %%ymm1, %%ymm13\n"
7916 "vaddps %%ymm6, %%ymm2, %%ymm10\n"
7917 "vsubps %%ymm6, %%ymm2, %%ymm14\n"
7918 "vaddps %%ymm7, %%ymm3, %%ymm11\n"
7919 "vsubps %%ymm7, %%ymm3, %%ymm15\n"
7920 "vmovups %%ymm8, (%0)\n"
7921 "vmovups %%ymm9, (%1)\n"
7922 "vmovups %%ymm10, (%2)\n"
7923 "vmovups %%ymm11, (%3)\n"
7924 "vmovups %%ymm12, (%4)\n"
7925 "vmovups %%ymm13, (%5)\n"
7926 "vmovups %%ymm14, (%6)\n"
7927 "vmovups %%ymm15, (%7)\n"
7928 :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
7929 );
7930 }
7931 }
7932 return;
7933 }
7934 if (depth == 15) {
7935 helper_float_27_recursive(buf + 0, 12);
7936 helper_float_27_recursive(buf + 4096, 12);
7937 helper_float_27_recursive(buf + 8192, 12);
7938 helper_float_27_recursive(buf + 12288, 12);
7939 helper_float_27_recursive(buf + 16384, 12);
7940 helper_float_27_recursive(buf + 20480, 12);
7941 helper_float_27_recursive(buf + 24576, 12);
7942 helper_float_27_recursive(buf + 28672, 12);
7943 for (int j = 0; j < 32768; j += 32768) {
7944 for (int k = 0; k < 4096; k += 8) {
7945 __asm__ volatile (
7946 "vmovups (%0), %%ymm0\n"
7947 "vmovups (%1), %%ymm1\n"
7948 "vmovups (%2), %%ymm2\n"
7949 "vmovups (%3), %%ymm3\n"
7950 "vmovups (%4), %%ymm4\n"
7951 "vmovups (%5), %%ymm5\n"
7952 "vmovups (%6), %%ymm6\n"
7953 "vmovups (%7), %%ymm7\n"
7954 "vaddps %%ymm1, %%ymm0, %%ymm8\n"
7955 "vsubps %%ymm1, %%ymm0, %%ymm9\n"
7956 "vaddps %%ymm3, %%ymm2, %%ymm10\n"
7957 "vsubps %%ymm3, %%ymm2, %%ymm11\n"
7958 "vaddps %%ymm5, %%ymm4, %%ymm12\n"
7959 "vsubps %%ymm5, %%ymm4, %%ymm13\n"
7960 "vaddps %%ymm7, %%ymm6, %%ymm14\n"
7961 "vsubps %%ymm7, %%ymm6, %%ymm15\n"
7962 "vaddps %%ymm10, %%ymm8, %%ymm0\n"
7963 "vsubps %%ymm10, %%ymm8, %%ymm2\n"
7964 "vaddps %%ymm11, %%ymm9, %%ymm1\n"
7965 "vsubps %%ymm11, %%ymm9, %%ymm3\n"
7966 "vaddps %%ymm14, %%ymm12, %%ymm4\n"
7967 "vsubps %%ymm14, %%ymm12, %%ymm6\n"
7968 "vaddps %%ymm15, %%ymm13, %%ymm5\n"
7969 "vsubps %%ymm15, %%ymm13, %%ymm7\n"
7970 "vaddps %%ymm4, %%ymm0, %%ymm8\n"
7971 "vsubps %%ymm4, %%ymm0, %%ymm12\n"
7972 "vaddps %%ymm5, %%ymm1, %%ymm9\n"
7973 "vsubps %%ymm5, %%ymm1, %%ymm13\n"
7974 "vaddps %%ymm6, %%ymm2, %%ymm10\n"
7975 "vsubps %%ymm6, %%ymm2, %%ymm14\n"
7976 "vaddps %%ymm7, %%ymm3, %%ymm11\n"
7977 "vsubps %%ymm7, %%ymm3, %%ymm15\n"
7978 "vmovups %%ymm8, (%0)\n"
7979 "vmovups %%ymm9, (%1)\n"
7980 "vmovups %%ymm10, (%2)\n"
7981 "vmovups %%ymm11, (%3)\n"
7982 "vmovups %%ymm12, (%4)\n"
7983 "vmovups %%ymm13, (%5)\n"
7984 "vmovups %%ymm14, (%6)\n"
7985 "vmovups %%ymm15, (%7)\n"
7986 :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
7987 );
7988 }
7989 }
7990 return;
7991 }
7992 if (depth == 18) {
7993 helper_float_27_recursive(buf + 0, 15);
7994 helper_float_27_recursive(buf + 32768, 15);
7995 helper_float_27_recursive(buf + 65536, 15);
7996 helper_float_27_recursive(buf + 98304, 15);
7997 helper_float_27_recursive(buf + 131072, 15);
7998 helper_float_27_recursive(buf + 163840, 15);
7999 helper_float_27_recursive(buf + 196608, 15);
8000 helper_float_27_recursive(buf + 229376, 15);
8001 for (int j = 0; j < 262144; j += 262144) {
8002 for (int k = 0; k < 32768; k += 8) {
8003 __asm__ volatile (
8004 "vmovups (%0), %%ymm0\n"
8005 "vmovups (%1), %%ymm1\n"
8006 "vmovups (%2), %%ymm2\n"
8007 "vmovups (%3), %%ymm3\n"
8008 "vmovups (%4), %%ymm4\n"
8009 "vmovups (%5), %%ymm5\n"
8010 "vmovups (%6), %%ymm6\n"
8011 "vmovups (%7), %%ymm7\n"
8012 "vaddps %%ymm1, %%ymm0, %%ymm8\n"
8013 "vsubps %%ymm1, %%ymm0, %%ymm9\n"
8014 "vaddps %%ymm3, %%ymm2, %%ymm10\n"
8015 "vsubps %%ymm3, %%ymm2, %%ymm11\n"
8016 "vaddps %%ymm5, %%ymm4, %%ymm12\n"
8017 "vsubps %%ymm5, %%ymm4, %%ymm13\n"
8018 "vaddps %%ymm7, %%ymm6, %%ymm14\n"
8019 "vsubps %%ymm7, %%ymm6, %%ymm15\n"
8020 "vaddps %%ymm10, %%ymm8, %%ymm0\n"
8021 "vsubps %%ymm10, %%ymm8, %%ymm2\n"
8022 "vaddps %%ymm11, %%ymm9, %%ymm1\n"
8023 "vsubps %%ymm11, %%ymm9, %%ymm3\n"
8024 "vaddps %%ymm14, %%ymm12, %%ymm4\n"
8025 "vsubps %%ymm14, %%ymm12, %%ymm6\n"
8026 "vaddps %%ymm15, %%ymm13, %%ymm5\n"
8027 "vsubps %%ymm15, %%ymm13, %%ymm7\n"
8028 "vaddps %%ymm4, %%ymm0, %%ymm8\n"
8029 "vsubps %%ymm4, %%ymm0, %%ymm12\n"
8030 "vaddps %%ymm5, %%ymm1, %%ymm9\n"
8031 "vsubps %%ymm5, %%ymm1, %%ymm13\n"
8032 "vaddps %%ymm6, %%ymm2, %%ymm10\n"
8033 "vsubps %%ymm6, %%ymm2, %%ymm14\n"
8034 "vaddps %%ymm7, %%ymm3, %%ymm11\n"
8035 "vsubps %%ymm7, %%ymm3, %%ymm15\n"
8036 "vmovups %%ymm8, (%0)\n"
8037 "vmovups %%ymm9, (%1)\n"
8038 "vmovups %%ymm10, (%2)\n"
8039 "vmovups %%ymm11, (%3)\n"
8040 "vmovups %%ymm12, (%4)\n"
8041 "vmovups %%ymm13, (%5)\n"
8042 "vmovups %%ymm14, (%6)\n"
8043 "vmovups %%ymm15, (%7)\n"
8044 :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
8045 );
8046 }
8047 }
8048 return;
8049 }
8050 if (depth == 21) {
8051 helper_float_27_recursive(buf + 0, 18);
8052 helper_float_27_recursive(buf + 262144, 18);
8053 helper_float_27_recursive(buf + 524288, 18);
8054 helper_float_27_recursive(buf + 786432, 18);
8055 helper_float_27_recursive(buf + 1048576, 18);
8056 helper_float_27_recursive(buf + 1310720, 18);
8057 helper_float_27_recursive(buf + 1572864, 18);
8058 helper_float_27_recursive(buf + 1835008, 18);
8059 for (int j = 0; j < 2097152; j += 2097152) {
8060 for (int k = 0; k < 262144; k += 8) {
8061 __asm__ volatile (
8062 "vmovups (%0), %%ymm0\n"
8063 "vmovups (%1), %%ymm1\n"
8064 "vmovups (%2), %%ymm2\n"
8065 "vmovups (%3), %%ymm3\n"
8066 "vmovups (%4), %%ymm4\n"
8067 "vmovups (%5), %%ymm5\n"
8068 "vmovups (%6), %%ymm6\n"
8069 "vmovups (%7), %%ymm7\n"
8070 "vaddps %%ymm1, %%ymm0, %%ymm8\n"
8071 "vsubps %%ymm1, %%ymm0, %%ymm9\n"
8072 "vaddps %%ymm3, %%ymm2, %%ymm10\n"
8073 "vsubps %%ymm3, %%ymm2, %%ymm11\n"
8074 "vaddps %%ymm5, %%ymm4, %%ymm12\n"
8075 "vsubps %%ymm5, %%ymm4, %%ymm13\n"
8076 "vaddps %%ymm7, %%ymm6, %%ymm14\n"
8077 "vsubps %%ymm7, %%ymm6, %%ymm15\n"
8078 "vaddps %%ymm10, %%ymm8, %%ymm0\n"
8079 "vsubps %%ymm10, %%ymm8, %%ymm2\n"
8080 "vaddps %%ymm11, %%ymm9, %%ymm1\n"
8081 "vsubps %%ymm11, %%ymm9, %%ymm3\n"
8082 "vaddps %%ymm14, %%ymm12, %%ymm4\n"
8083 "vsubps %%ymm14, %%ymm12, %%ymm6\n"
8084 "vaddps %%ymm15, %%ymm13, %%ymm5\n"
8085 "vsubps %%ymm15, %%ymm13, %%ymm7\n"
8086 "vaddps %%ymm4, %%ymm0, %%ymm8\n"
8087 "vsubps %%ymm4, %%ymm0, %%ymm12\n"
8088 "vaddps %%ymm5, %%ymm1, %%ymm9\n"
8089 "vsubps %%ymm5, %%ymm1, %%ymm13\n"
8090 "vaddps %%ymm6, %%ymm2, %%ymm10\n"
8091 "vsubps %%ymm6, %%ymm2, %%ymm14\n"
8092 "vaddps %%ymm7, %%ymm3, %%ymm11\n"
8093 "vsubps %%ymm7, %%ymm3, %%ymm15\n"
8094 "vmovups %%ymm8, (%0)\n"
8095 "vmovups %%ymm9, (%1)\n"
8096 "vmovups %%ymm10, (%2)\n"
8097 "vmovups %%ymm11, (%3)\n"
8098 "vmovups %%ymm12, (%4)\n"
8099 "vmovups %%ymm13, (%5)\n"
8100 "vmovups %%ymm14, (%6)\n"
8101 "vmovups %%ymm15, (%7)\n"
8102 :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1310720), "r"(buf + j + k + 1572864), "r"(buf + j + k + 1835008) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
8103 );
8104 }
8105 }
8106 return;
8107 }
8108 if (depth == 24) {
8109 helper_float_27_recursive(buf + 0, 21);
8110 helper_float_27_recursive(buf + 2097152, 21);
8111 helper_float_27_recursive(buf + 4194304, 21);
8112 helper_float_27_recursive(buf + 6291456, 21);
8113 helper_float_27_recursive(buf + 8388608, 21);
8114 helper_float_27_recursive(buf + 10485760, 21);
8115 helper_float_27_recursive(buf + 12582912, 21);
8116 helper_float_27_recursive(buf + 14680064, 21);
8117 for (int j = 0; j < 16777216; j += 16777216) {
8118 for (int k = 0; k < 2097152; k += 8) {
8119 __asm__ volatile (
8120 "vmovups (%0), %%ymm0\n"
8121 "vmovups (%1), %%ymm1\n"
8122 "vmovups (%2), %%ymm2\n"
8123 "vmovups (%3), %%ymm3\n"
8124 "vmovups (%4), %%ymm4\n"
8125 "vmovups (%5), %%ymm5\n"
8126 "vmovups (%6), %%ymm6\n"
8127 "vmovups (%7), %%ymm7\n"
8128 "vaddps %%ymm1, %%ymm0, %%ymm8\n"
8129 "vsubps %%ymm1, %%ymm0, %%ymm9\n"
8130 "vaddps %%ymm3, %%ymm2, %%ymm10\n"
8131 "vsubps %%ymm3, %%ymm2, %%ymm11\n"
8132 "vaddps %%ymm5, %%ymm4, %%ymm12\n"
8133 "vsubps %%ymm5, %%ymm4, %%ymm13\n"
8134 "vaddps %%ymm7, %%ymm6, %%ymm14\n"
8135 "vsubps %%ymm7, %%ymm6, %%ymm15\n"
8136 "vaddps %%ymm10, %%ymm8, %%ymm0\n"
8137 "vsubps %%ymm10, %%ymm8, %%ymm2\n"
8138 "vaddps %%ymm11, %%ymm9, %%ymm1\n"
8139 "vsubps %%ymm11, %%ymm9, %%ymm3\n"
8140 "vaddps %%ymm14, %%ymm12, %%ymm4\n"
8141 "vsubps %%ymm14, %%ymm12, %%ymm6\n"
8142 "vaddps %%ymm15, %%ymm13, %%ymm5\n"
8143 "vsubps %%ymm15, %%ymm13, %%ymm7\n"
8144 "vaddps %%ymm4, %%ymm0, %%ymm8\n"
8145 "vsubps %%ymm4, %%ymm0, %%ymm12\n"
8146 "vaddps %%ymm5, %%ymm1, %%ymm9\n"
8147 "vsubps %%ymm5, %%ymm1, %%ymm13\n"
8148 "vaddps %%ymm6, %%ymm2, %%ymm10\n"
8149 "vsubps %%ymm6, %%ymm2, %%ymm14\n"
8150 "vaddps %%ymm7, %%ymm3, %%ymm11\n"
8151 "vsubps %%ymm7, %%ymm3, %%ymm15\n"
8152 "vmovups %%ymm8, (%0)\n"
8153 "vmovups %%ymm9, (%1)\n"
8154 "vmovups %%ymm10, (%2)\n"
8155 "vmovups %%ymm11, (%3)\n"
8156 "vmovups %%ymm12, (%4)\n"
8157 "vmovups %%ymm13, (%5)\n"
8158 "vmovups %%ymm14, (%6)\n"
8159 "vmovups %%ymm15, (%7)\n"
8160 :: "r"(buf + j + k + 0), "r"(buf + j + k + 2097152), "r"(buf + j + k + 4194304), "r"(buf + j + k + 6291456), "r"(buf + j + k + 8388608), "r"(buf + j + k + 10485760), "r"(buf + j + k + 12582912), "r"(buf + j + k + 14680064) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
8161 );
8162 }
8163 }
8164 return;
8165 }
8166 if (depth == 27) {
8167 helper_float_27_recursive(buf + 0, 24);
8168 helper_float_27_recursive(buf + 16777216, 24);
8169 helper_float_27_recursive(buf + 33554432, 24);
8170 helper_float_27_recursive(buf + 50331648, 24);
8171 helper_float_27_recursive(buf + 67108864, 24);
8172 helper_float_27_recursive(buf + 83886080, 24);
8173 helper_float_27_recursive(buf + 100663296, 24);
8174 helper_float_27_recursive(buf + 117440512, 24);
8175 for (int j = 0; j < 134217728; j += 134217728) {
8176 for (int k = 0; k < 16777216; k += 8) {
8177 __asm__ volatile (
8178 "vmovups (%0), %%ymm0\n"
8179 "vmovups (%1), %%ymm1\n"
8180 "vmovups (%2), %%ymm2\n"
8181 "vmovups (%3), %%ymm3\n"
8182 "vmovups (%4), %%ymm4\n"
8183 "vmovups (%5), %%ymm5\n"
8184 "vmovups (%6), %%ymm6\n"
8185 "vmovups (%7), %%ymm7\n"
8186 "vaddps %%ymm1, %%ymm0, %%ymm8\n"
8187 "vsubps %%ymm1, %%ymm0, %%ymm9\n"
8188 "vaddps %%ymm3, %%ymm2, %%ymm10\n"
8189 "vsubps %%ymm3, %%ymm2, %%ymm11\n"
8190 "vaddps %%ymm5, %%ymm4, %%ymm12\n"
8191 "vsubps %%ymm5, %%ymm4, %%ymm13\n"
8192 "vaddps %%ymm7, %%ymm6, %%ymm14\n"
8193 "vsubps %%ymm7, %%ymm6, %%ymm15\n"
8194 "vaddps %%ymm10, %%ymm8, %%ymm0\n"
8195 "vsubps %%ymm10, %%ymm8, %%ymm2\n"
8196 "vaddps %%ymm11, %%ymm9, %%ymm1\n"
8197 "vsubps %%ymm11, %%ymm9, %%ymm3\n"
8198 "vaddps %%ymm14, %%ymm12, %%ymm4\n"
8199 "vsubps %%ymm14, %%ymm12, %%ymm6\n"
8200 "vaddps %%ymm15, %%ymm13, %%ymm5\n"
8201 "vsubps %%ymm15, %%ymm13, %%ymm7\n"
8202 "vaddps %%ymm4, %%ymm0, %%ymm8\n"
8203 "vsubps %%ymm4, %%ymm0, %%ymm12\n"
8204 "vaddps %%ymm5, %%ymm1, %%ymm9\n"
8205 "vsubps %%ymm5, %%ymm1, %%ymm13\n"
8206 "vaddps %%ymm6, %%ymm2, %%ymm10\n"
8207 "vsubps %%ymm6, %%ymm2, %%ymm14\n"
8208 "vaddps %%ymm7, %%ymm3, %%ymm11\n"
8209 "vsubps %%ymm7, %%ymm3, %%ymm15\n"
8210 "vmovups %%ymm8, (%0)\n"
8211 "vmovups %%ymm9, (%1)\n"
8212 "vmovups %%ymm10, (%2)\n"
8213 "vmovups %%ymm11, (%3)\n"
8214 "vmovups %%ymm12, (%4)\n"
8215 "vmovups %%ymm13, (%5)\n"
8216 "vmovups %%ymm14, (%6)\n"
8217 "vmovups %%ymm15, (%7)\n"
8218 :: "r"(buf + j + k + 0), "r"(buf + j + k + 16777216), "r"(buf + j + k + 33554432), "r"(buf + j + k + 50331648), "r"(buf + j + k + 67108864), "r"(buf + j + k + 83886080), "r"(buf + j + k + 100663296), "r"(buf + j + k + 117440512) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
8219 );
8220 }
8221 }
8222 return;
8223 }
8224 }
8225 void helper_float_27(float *buf);
void helper_float_27(float *buf) {
  /* Entry point for the size-2^27 float transform: hand the buffer to the
   * recursive AVX kernel at full depth.  `buf` must hold 2^27 floats and is
   * transformed in place. */
  enum { FULL_DEPTH = 27 };  /* log2 of the transform length */
  helper_float_27_recursive(buf, FULL_DEPTH);
}
8229 void helper_float_28_recursive(float *buf, int depth);
helper_float_28_recursive(float * buf,int depth)8230 void helper_float_28_recursive(float *buf, int depth) {
8231 if (depth == 7) {
8232 for (int j = 0; j < 128; j += 64) {
8233 for (int k = 0; k < 8; k += 8) {
8234 __asm__ volatile (
8235 "vmovups (%0), %%ymm0\n"
8236 "vmovups (%1), %%ymm1\n"
8237 "vmovups (%2), %%ymm2\n"
8238 "vmovups (%3), %%ymm3\n"
8239 "vmovups (%4), %%ymm4\n"
8240 "vmovups (%5), %%ymm5\n"
8241 "vmovups (%6), %%ymm6\n"
8242 "vmovups (%7), %%ymm7\n"
8243 "vpermilps $160, %%ymm0, %%ymm8\n"
8244 "vpermilps $245, %%ymm0, %%ymm9\n"
8245 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
8246 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
8247 "vaddsubps %%ymm11, %%ymm8, %%ymm0\n"
8248 "vpermilps $160, %%ymm1, %%ymm8\n"
8249 "vpermilps $245, %%ymm1, %%ymm9\n"
8250 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
8251 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
8252 "vaddsubps %%ymm11, %%ymm8, %%ymm1\n"
8253 "vpermilps $160, %%ymm2, %%ymm8\n"
8254 "vpermilps $245, %%ymm2, %%ymm9\n"
8255 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
8256 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
8257 "vaddsubps %%ymm11, %%ymm8, %%ymm2\n"
8258 "vpermilps $160, %%ymm3, %%ymm8\n"
8259 "vpermilps $245, %%ymm3, %%ymm9\n"
8260 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
8261 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
8262 "vaddsubps %%ymm11, %%ymm8, %%ymm3\n"
8263 "vpermilps $160, %%ymm4, %%ymm8\n"
8264 "vpermilps $245, %%ymm4, %%ymm9\n"
8265 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
8266 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
8267 "vaddsubps %%ymm11, %%ymm8, %%ymm4\n"
8268 "vpermilps $160, %%ymm5, %%ymm8\n"
8269 "vpermilps $245, %%ymm5, %%ymm9\n"
8270 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
8271 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
8272 "vaddsubps %%ymm11, %%ymm8, %%ymm5\n"
8273 "vpermilps $160, %%ymm6, %%ymm8\n"
8274 "vpermilps $245, %%ymm6, %%ymm9\n"
8275 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
8276 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
8277 "vaddsubps %%ymm11, %%ymm8, %%ymm6\n"
8278 "vpermilps $160, %%ymm7, %%ymm8\n"
8279 "vpermilps $245, %%ymm7, %%ymm9\n"
8280 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
8281 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
8282 "vaddsubps %%ymm11, %%ymm8, %%ymm7\n"
8283 "vpermilps $68, %%ymm0, %%ymm8\n"
8284 "vpermilps $238, %%ymm0, %%ymm9\n"
8285 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
8286 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
8287 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
8288 "vaddps %%ymm8, %%ymm12, %%ymm0\n"
8289 "vpermilps $68, %%ymm1, %%ymm8\n"
8290 "vpermilps $238, %%ymm1, %%ymm9\n"
8291 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
8292 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
8293 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
8294 "vaddps %%ymm8, %%ymm12, %%ymm1\n"
8295 "vpermilps $68, %%ymm2, %%ymm8\n"
8296 "vpermilps $238, %%ymm2, %%ymm9\n"
8297 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
8298 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
8299 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
8300 "vaddps %%ymm8, %%ymm12, %%ymm2\n"
8301 "vpermilps $68, %%ymm3, %%ymm8\n"
8302 "vpermilps $238, %%ymm3, %%ymm9\n"
8303 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
8304 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
8305 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
8306 "vaddps %%ymm8, %%ymm12, %%ymm3\n"
8307 "vpermilps $68, %%ymm4, %%ymm8\n"
8308 "vpermilps $238, %%ymm4, %%ymm9\n"
8309 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
8310 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
8311 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
8312 "vaddps %%ymm8, %%ymm12, %%ymm4\n"
8313 "vpermilps $68, %%ymm5, %%ymm8\n"
8314 "vpermilps $238, %%ymm5, %%ymm9\n"
8315 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
8316 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
8317 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
8318 "vaddps %%ymm8, %%ymm12, %%ymm5\n"
8319 "vpermilps $68, %%ymm6, %%ymm8\n"
8320 "vpermilps $238, %%ymm6, %%ymm9\n"
8321 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
8322 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
8323 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
8324 "vaddps %%ymm8, %%ymm12, %%ymm6\n"
8325 "vpermilps $68, %%ymm7, %%ymm8\n"
8326 "vpermilps $238, %%ymm7, %%ymm9\n"
8327 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
8328 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
8329 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
8330 "vaddps %%ymm8, %%ymm12, %%ymm7\n"
8331 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
8332 "vsubps %%ymm0, %%ymm8, %%ymm9\n"
8333 "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n"
8334 "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n"
8335 "vaddps %%ymm10, %%ymm11, %%ymm0\n"
8336 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
8337 "vsubps %%ymm1, %%ymm8, %%ymm9\n"
8338 "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n"
8339 "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n"
8340 "vaddps %%ymm10, %%ymm11, %%ymm1\n"
8341 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
8342 "vsubps %%ymm2, %%ymm8, %%ymm9\n"
8343 "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n"
8344 "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n"
8345 "vaddps %%ymm10, %%ymm11, %%ymm2\n"
8346 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
8347 "vsubps %%ymm3, %%ymm8, %%ymm9\n"
8348 "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n"
8349 "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n"
8350 "vaddps %%ymm10, %%ymm11, %%ymm3\n"
8351 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
8352 "vsubps %%ymm4, %%ymm8, %%ymm9\n"
8353 "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n"
8354 "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n"
8355 "vaddps %%ymm10, %%ymm11, %%ymm4\n"
8356 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
8357 "vsubps %%ymm5, %%ymm8, %%ymm9\n"
8358 "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n"
8359 "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n"
8360 "vaddps %%ymm10, %%ymm11, %%ymm5\n"
8361 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
8362 "vsubps %%ymm6, %%ymm8, %%ymm9\n"
8363 "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n"
8364 "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n"
8365 "vaddps %%ymm10, %%ymm11, %%ymm6\n"
8366 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
8367 "vsubps %%ymm7, %%ymm8, %%ymm9\n"
8368 "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n"
8369 "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n"
8370 "vaddps %%ymm10, %%ymm11, %%ymm7\n"
8371 "vaddps %%ymm1, %%ymm0, %%ymm8\n"
8372 "vsubps %%ymm1, %%ymm0, %%ymm9\n"
8373 "vaddps %%ymm3, %%ymm2, %%ymm10\n"
8374 "vsubps %%ymm3, %%ymm2, %%ymm11\n"
8375 "vaddps %%ymm5, %%ymm4, %%ymm12\n"
8376 "vsubps %%ymm5, %%ymm4, %%ymm13\n"
8377 "vaddps %%ymm7, %%ymm6, %%ymm14\n"
8378 "vsubps %%ymm7, %%ymm6, %%ymm15\n"
8379 "vaddps %%ymm10, %%ymm8, %%ymm0\n"
8380 "vsubps %%ymm10, %%ymm8, %%ymm2\n"
8381 "vaddps %%ymm11, %%ymm9, %%ymm1\n"
8382 "vsubps %%ymm11, %%ymm9, %%ymm3\n"
8383 "vaddps %%ymm14, %%ymm12, %%ymm4\n"
8384 "vsubps %%ymm14, %%ymm12, %%ymm6\n"
8385 "vaddps %%ymm15, %%ymm13, %%ymm5\n"
8386 "vsubps %%ymm15, %%ymm13, %%ymm7\n"
8387 "vaddps %%ymm4, %%ymm0, %%ymm8\n"
8388 "vsubps %%ymm4, %%ymm0, %%ymm12\n"
8389 "vaddps %%ymm5, %%ymm1, %%ymm9\n"
8390 "vsubps %%ymm5, %%ymm1, %%ymm13\n"
8391 "vaddps %%ymm6, %%ymm2, %%ymm10\n"
8392 "vsubps %%ymm6, %%ymm2, %%ymm14\n"
8393 "vaddps %%ymm7, %%ymm3, %%ymm11\n"
8394 "vsubps %%ymm7, %%ymm3, %%ymm15\n"
8395 "vmovups %%ymm8, (%0)\n"
8396 "vmovups %%ymm9, (%1)\n"
8397 "vmovups %%ymm10, (%2)\n"
8398 "vmovups %%ymm11, (%3)\n"
8399 "vmovups %%ymm12, (%4)\n"
8400 "vmovups %%ymm13, (%5)\n"
8401 "vmovups %%ymm14, (%6)\n"
8402 "vmovups %%ymm15, (%7)\n"
8403 :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
8404 );
8405 }
8406 }
8407 for (int j = 0; j < 128; j += 128) {
8408 for (int k = 0; k < 64; k += 8) {
8409 __asm__ volatile (
8410 "vmovups (%0), %%ymm0\n"
8411 "vmovups (%1), %%ymm1\n"
8412 "vaddps %%ymm1, %%ymm0, %%ymm8\n"
8413 "vsubps %%ymm1, %%ymm0, %%ymm9\n"
8414 "vmovups %%ymm8, (%0)\n"
8415 "vmovups %%ymm9, (%1)\n"
8416 :: "r"(buf + j + k + 0), "r"(buf + j + k + 64) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
8417 );
8418 }
8419 }
8420 return;
8421 }
8422 if (depth == 10) {
8423 helper_float_28_recursive(buf + 0, 7);
8424 helper_float_28_recursive(buf + 128, 7);
8425 helper_float_28_recursive(buf + 256, 7);
8426 helper_float_28_recursive(buf + 384, 7);
8427 helper_float_28_recursive(buf + 512, 7);
8428 helper_float_28_recursive(buf + 640, 7);
8429 helper_float_28_recursive(buf + 768, 7);
8430 helper_float_28_recursive(buf + 896, 7);
8431 for (int j = 0; j < 1024; j += 1024) {
8432 for (int k = 0; k < 128; k += 8) {
8433 __asm__ volatile (
8434 "vmovups (%0), %%ymm0\n"
8435 "vmovups (%1), %%ymm1\n"
8436 "vmovups (%2), %%ymm2\n"
8437 "vmovups (%3), %%ymm3\n"
8438 "vmovups (%4), %%ymm4\n"
8439 "vmovups (%5), %%ymm5\n"
8440 "vmovups (%6), %%ymm6\n"
8441 "vmovups (%7), %%ymm7\n"
8442 "vaddps %%ymm1, %%ymm0, %%ymm8\n"
8443 "vsubps %%ymm1, %%ymm0, %%ymm9\n"
8444 "vaddps %%ymm3, %%ymm2, %%ymm10\n"
8445 "vsubps %%ymm3, %%ymm2, %%ymm11\n"
8446 "vaddps %%ymm5, %%ymm4, %%ymm12\n"
8447 "vsubps %%ymm5, %%ymm4, %%ymm13\n"
8448 "vaddps %%ymm7, %%ymm6, %%ymm14\n"
8449 "vsubps %%ymm7, %%ymm6, %%ymm15\n"
8450 "vaddps %%ymm10, %%ymm8, %%ymm0\n"
8451 "vsubps %%ymm10, %%ymm8, %%ymm2\n"
8452 "vaddps %%ymm11, %%ymm9, %%ymm1\n"
8453 "vsubps %%ymm11, %%ymm9, %%ymm3\n"
8454 "vaddps %%ymm14, %%ymm12, %%ymm4\n"
8455 "vsubps %%ymm14, %%ymm12, %%ymm6\n"
8456 "vaddps %%ymm15, %%ymm13, %%ymm5\n"
8457 "vsubps %%ymm15, %%ymm13, %%ymm7\n"
8458 "vaddps %%ymm4, %%ymm0, %%ymm8\n"
8459 "vsubps %%ymm4, %%ymm0, %%ymm12\n"
8460 "vaddps %%ymm5, %%ymm1, %%ymm9\n"
8461 "vsubps %%ymm5, %%ymm1, %%ymm13\n"
8462 "vaddps %%ymm6, %%ymm2, %%ymm10\n"
8463 "vsubps %%ymm6, %%ymm2, %%ymm14\n"
8464 "vaddps %%ymm7, %%ymm3, %%ymm11\n"
8465 "vsubps %%ymm7, %%ymm3, %%ymm15\n"
8466 "vmovups %%ymm8, (%0)\n"
8467 "vmovups %%ymm9, (%1)\n"
8468 "vmovups %%ymm10, (%2)\n"
8469 "vmovups %%ymm11, (%3)\n"
8470 "vmovups %%ymm12, (%4)\n"
8471 "vmovups %%ymm13, (%5)\n"
8472 "vmovups %%ymm14, (%6)\n"
8473 "vmovups %%ymm15, (%7)\n"
8474 :: "r"(buf + j + k + 0), "r"(buf + j + k + 128), "r"(buf + j + k + 256), "r"(buf + j + k + 384), "r"(buf + j + k + 512), "r"(buf + j + k + 640), "r"(buf + j + k + 768), "r"(buf + j + k + 896) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
8475 );
8476 }
8477 }
8478 return;
8479 }
8480 if (depth == 13) {
8481 helper_float_28_recursive(buf + 0, 10);
8482 helper_float_28_recursive(buf + 1024, 10);
8483 helper_float_28_recursive(buf + 2048, 10);
8484 helper_float_28_recursive(buf + 3072, 10);
8485 helper_float_28_recursive(buf + 4096, 10);
8486 helper_float_28_recursive(buf + 5120, 10);
8487 helper_float_28_recursive(buf + 6144, 10);
8488 helper_float_28_recursive(buf + 7168, 10);
8489 for (int j = 0; j < 8192; j += 8192) {
8490 for (int k = 0; k < 1024; k += 8) {
8491 __asm__ volatile (
8492 "vmovups (%0), %%ymm0\n"
8493 "vmovups (%1), %%ymm1\n"
8494 "vmovups (%2), %%ymm2\n"
8495 "vmovups (%3), %%ymm3\n"
8496 "vmovups (%4), %%ymm4\n"
8497 "vmovups (%5), %%ymm5\n"
8498 "vmovups (%6), %%ymm6\n"
8499 "vmovups (%7), %%ymm7\n"
8500 "vaddps %%ymm1, %%ymm0, %%ymm8\n"
8501 "vsubps %%ymm1, %%ymm0, %%ymm9\n"
8502 "vaddps %%ymm3, %%ymm2, %%ymm10\n"
8503 "vsubps %%ymm3, %%ymm2, %%ymm11\n"
8504 "vaddps %%ymm5, %%ymm4, %%ymm12\n"
8505 "vsubps %%ymm5, %%ymm4, %%ymm13\n"
8506 "vaddps %%ymm7, %%ymm6, %%ymm14\n"
8507 "vsubps %%ymm7, %%ymm6, %%ymm15\n"
8508 "vaddps %%ymm10, %%ymm8, %%ymm0\n"
8509 "vsubps %%ymm10, %%ymm8, %%ymm2\n"
8510 "vaddps %%ymm11, %%ymm9, %%ymm1\n"
8511 "vsubps %%ymm11, %%ymm9, %%ymm3\n"
8512 "vaddps %%ymm14, %%ymm12, %%ymm4\n"
8513 "vsubps %%ymm14, %%ymm12, %%ymm6\n"
8514 "vaddps %%ymm15, %%ymm13, %%ymm5\n"
8515 "vsubps %%ymm15, %%ymm13, %%ymm7\n"
8516 "vaddps %%ymm4, %%ymm0, %%ymm8\n"
8517 "vsubps %%ymm4, %%ymm0, %%ymm12\n"
8518 "vaddps %%ymm5, %%ymm1, %%ymm9\n"
8519 "vsubps %%ymm5, %%ymm1, %%ymm13\n"
8520 "vaddps %%ymm6, %%ymm2, %%ymm10\n"
8521 "vsubps %%ymm6, %%ymm2, %%ymm14\n"
8522 "vaddps %%ymm7, %%ymm3, %%ymm11\n"
8523 "vsubps %%ymm7, %%ymm3, %%ymm15\n"
8524 "vmovups %%ymm8, (%0)\n"
8525 "vmovups %%ymm9, (%1)\n"
8526 "vmovups %%ymm10, (%2)\n"
8527 "vmovups %%ymm11, (%3)\n"
8528 "vmovups %%ymm12, (%4)\n"
8529 "vmovups %%ymm13, (%5)\n"
8530 "vmovups %%ymm14, (%6)\n"
8531 "vmovups %%ymm15, (%7)\n"
8532 :: "r"(buf + j + k + 0), "r"(buf + j + k + 1024), "r"(buf + j + k + 2048), "r"(buf + j + k + 3072), "r"(buf + j + k + 4096), "r"(buf + j + k + 5120), "r"(buf + j + k + 6144), "r"(buf + j + k + 7168) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
8533 );
8534 }
8535 }
8536 return;
8537 }
8538 if (depth == 16) {
8539 helper_float_28_recursive(buf + 0, 13);
8540 helper_float_28_recursive(buf + 8192, 13);
8541 helper_float_28_recursive(buf + 16384, 13);
8542 helper_float_28_recursive(buf + 24576, 13);
8543 helper_float_28_recursive(buf + 32768, 13);
8544 helper_float_28_recursive(buf + 40960, 13);
8545 helper_float_28_recursive(buf + 49152, 13);
8546 helper_float_28_recursive(buf + 57344, 13);
8547 for (int j = 0; j < 65536; j += 65536) {
8548 for (int k = 0; k < 8192; k += 8) {
8549 __asm__ volatile (
8550 "vmovups (%0), %%ymm0\n"
8551 "vmovups (%1), %%ymm1\n"
8552 "vmovups (%2), %%ymm2\n"
8553 "vmovups (%3), %%ymm3\n"
8554 "vmovups (%4), %%ymm4\n"
8555 "vmovups (%5), %%ymm5\n"
8556 "vmovups (%6), %%ymm6\n"
8557 "vmovups (%7), %%ymm7\n"
8558 "vaddps %%ymm1, %%ymm0, %%ymm8\n"
8559 "vsubps %%ymm1, %%ymm0, %%ymm9\n"
8560 "vaddps %%ymm3, %%ymm2, %%ymm10\n"
8561 "vsubps %%ymm3, %%ymm2, %%ymm11\n"
8562 "vaddps %%ymm5, %%ymm4, %%ymm12\n"
8563 "vsubps %%ymm5, %%ymm4, %%ymm13\n"
8564 "vaddps %%ymm7, %%ymm6, %%ymm14\n"
8565 "vsubps %%ymm7, %%ymm6, %%ymm15\n"
8566 "vaddps %%ymm10, %%ymm8, %%ymm0\n"
8567 "vsubps %%ymm10, %%ymm8, %%ymm2\n"
8568 "vaddps %%ymm11, %%ymm9, %%ymm1\n"
8569 "vsubps %%ymm11, %%ymm9, %%ymm3\n"
8570 "vaddps %%ymm14, %%ymm12, %%ymm4\n"
8571 "vsubps %%ymm14, %%ymm12, %%ymm6\n"
8572 "vaddps %%ymm15, %%ymm13, %%ymm5\n"
8573 "vsubps %%ymm15, %%ymm13, %%ymm7\n"
8574 "vaddps %%ymm4, %%ymm0, %%ymm8\n"
8575 "vsubps %%ymm4, %%ymm0, %%ymm12\n"
8576 "vaddps %%ymm5, %%ymm1, %%ymm9\n"
8577 "vsubps %%ymm5, %%ymm1, %%ymm13\n"
8578 "vaddps %%ymm6, %%ymm2, %%ymm10\n"
8579 "vsubps %%ymm6, %%ymm2, %%ymm14\n"
8580 "vaddps %%ymm7, %%ymm3, %%ymm11\n"
8581 "vsubps %%ymm7, %%ymm3, %%ymm15\n"
8582 "vmovups %%ymm8, (%0)\n"
8583 "vmovups %%ymm9, (%1)\n"
8584 "vmovups %%ymm10, (%2)\n"
8585 "vmovups %%ymm11, (%3)\n"
8586 "vmovups %%ymm12, (%4)\n"
8587 "vmovups %%ymm13, (%5)\n"
8588 "vmovups %%ymm14, (%6)\n"
8589 "vmovups %%ymm15, (%7)\n"
8590 :: "r"(buf + j + k + 0), "r"(buf + j + k + 8192), "r"(buf + j + k + 16384), "r"(buf + j + k + 24576), "r"(buf + j + k + 32768), "r"(buf + j + k + 40960), "r"(buf + j + k + 49152), "r"(buf + j + k + 57344) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
8591 );
8592 }
8593 }
8594 return;
8595 }
8596 if (depth == 19) {
8597 helper_float_28_recursive(buf + 0, 16);
8598 helper_float_28_recursive(buf + 65536, 16);
8599 helper_float_28_recursive(buf + 131072, 16);
8600 helper_float_28_recursive(buf + 196608, 16);
8601 helper_float_28_recursive(buf + 262144, 16);
8602 helper_float_28_recursive(buf + 327680, 16);
8603 helper_float_28_recursive(buf + 393216, 16);
8604 helper_float_28_recursive(buf + 458752, 16);
8605 for (int j = 0; j < 524288; j += 524288) {
8606 for (int k = 0; k < 65536; k += 8) {
8607 __asm__ volatile (
8608 "vmovups (%0), %%ymm0\n"
8609 "vmovups (%1), %%ymm1\n"
8610 "vmovups (%2), %%ymm2\n"
8611 "vmovups (%3), %%ymm3\n"
8612 "vmovups (%4), %%ymm4\n"
8613 "vmovups (%5), %%ymm5\n"
8614 "vmovups (%6), %%ymm6\n"
8615 "vmovups (%7), %%ymm7\n"
8616 "vaddps %%ymm1, %%ymm0, %%ymm8\n"
8617 "vsubps %%ymm1, %%ymm0, %%ymm9\n"
8618 "vaddps %%ymm3, %%ymm2, %%ymm10\n"
8619 "vsubps %%ymm3, %%ymm2, %%ymm11\n"
8620 "vaddps %%ymm5, %%ymm4, %%ymm12\n"
8621 "vsubps %%ymm5, %%ymm4, %%ymm13\n"
8622 "vaddps %%ymm7, %%ymm6, %%ymm14\n"
8623 "vsubps %%ymm7, %%ymm6, %%ymm15\n"
8624 "vaddps %%ymm10, %%ymm8, %%ymm0\n"
8625 "vsubps %%ymm10, %%ymm8, %%ymm2\n"
8626 "vaddps %%ymm11, %%ymm9, %%ymm1\n"
8627 "vsubps %%ymm11, %%ymm9, %%ymm3\n"
8628 "vaddps %%ymm14, %%ymm12, %%ymm4\n"
8629 "vsubps %%ymm14, %%ymm12, %%ymm6\n"
8630 "vaddps %%ymm15, %%ymm13, %%ymm5\n"
8631 "vsubps %%ymm15, %%ymm13, %%ymm7\n"
8632 "vaddps %%ymm4, %%ymm0, %%ymm8\n"
8633 "vsubps %%ymm4, %%ymm0, %%ymm12\n"
8634 "vaddps %%ymm5, %%ymm1, %%ymm9\n"
8635 "vsubps %%ymm5, %%ymm1, %%ymm13\n"
8636 "vaddps %%ymm6, %%ymm2, %%ymm10\n"
8637 "vsubps %%ymm6, %%ymm2, %%ymm14\n"
8638 "vaddps %%ymm7, %%ymm3, %%ymm11\n"
8639 "vsubps %%ymm7, %%ymm3, %%ymm15\n"
8640 "vmovups %%ymm8, (%0)\n"
8641 "vmovups %%ymm9, (%1)\n"
8642 "vmovups %%ymm10, (%2)\n"
8643 "vmovups %%ymm11, (%3)\n"
8644 "vmovups %%ymm12, (%4)\n"
8645 "vmovups %%ymm13, (%5)\n"
8646 "vmovups %%ymm14, (%6)\n"
8647 "vmovups %%ymm15, (%7)\n"
8648 :: "r"(buf + j + k + 0), "r"(buf + j + k + 65536), "r"(buf + j + k + 131072), "r"(buf + j + k + 196608), "r"(buf + j + k + 262144), "r"(buf + j + k + 327680), "r"(buf + j + k + 393216), "r"(buf + j + k + 458752) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
8649 );
8650 }
8651 }
8652 return;
8653 }
8654 if (depth == 22) {
8655 helper_float_28_recursive(buf + 0, 19);
8656 helper_float_28_recursive(buf + 524288, 19);
8657 helper_float_28_recursive(buf + 1048576, 19);
8658 helper_float_28_recursive(buf + 1572864, 19);
8659 helper_float_28_recursive(buf + 2097152, 19);
8660 helper_float_28_recursive(buf + 2621440, 19);
8661 helper_float_28_recursive(buf + 3145728, 19);
8662 helper_float_28_recursive(buf + 3670016, 19);
8663 for (int j = 0; j < 4194304; j += 4194304) {
8664 for (int k = 0; k < 524288; k += 8) {
8665 __asm__ volatile (
8666 "vmovups (%0), %%ymm0\n"
8667 "vmovups (%1), %%ymm1\n"
8668 "vmovups (%2), %%ymm2\n"
8669 "vmovups (%3), %%ymm3\n"
8670 "vmovups (%4), %%ymm4\n"
8671 "vmovups (%5), %%ymm5\n"
8672 "vmovups (%6), %%ymm6\n"
8673 "vmovups (%7), %%ymm7\n"
8674 "vaddps %%ymm1, %%ymm0, %%ymm8\n"
8675 "vsubps %%ymm1, %%ymm0, %%ymm9\n"
8676 "vaddps %%ymm3, %%ymm2, %%ymm10\n"
8677 "vsubps %%ymm3, %%ymm2, %%ymm11\n"
8678 "vaddps %%ymm5, %%ymm4, %%ymm12\n"
8679 "vsubps %%ymm5, %%ymm4, %%ymm13\n"
8680 "vaddps %%ymm7, %%ymm6, %%ymm14\n"
8681 "vsubps %%ymm7, %%ymm6, %%ymm15\n"
8682 "vaddps %%ymm10, %%ymm8, %%ymm0\n"
8683 "vsubps %%ymm10, %%ymm8, %%ymm2\n"
8684 "vaddps %%ymm11, %%ymm9, %%ymm1\n"
8685 "vsubps %%ymm11, %%ymm9, %%ymm3\n"
8686 "vaddps %%ymm14, %%ymm12, %%ymm4\n"
8687 "vsubps %%ymm14, %%ymm12, %%ymm6\n"
8688 "vaddps %%ymm15, %%ymm13, %%ymm5\n"
8689 "vsubps %%ymm15, %%ymm13, %%ymm7\n"
8690 "vaddps %%ymm4, %%ymm0, %%ymm8\n"
8691 "vsubps %%ymm4, %%ymm0, %%ymm12\n"
8692 "vaddps %%ymm5, %%ymm1, %%ymm9\n"
8693 "vsubps %%ymm5, %%ymm1, %%ymm13\n"
8694 "vaddps %%ymm6, %%ymm2, %%ymm10\n"
8695 "vsubps %%ymm6, %%ymm2, %%ymm14\n"
8696 "vaddps %%ymm7, %%ymm3, %%ymm11\n"
8697 "vsubps %%ymm7, %%ymm3, %%ymm15\n"
8698 "vmovups %%ymm8, (%0)\n"
8699 "vmovups %%ymm9, (%1)\n"
8700 "vmovups %%ymm10, (%2)\n"
8701 "vmovups %%ymm11, (%3)\n"
8702 "vmovups %%ymm12, (%4)\n"
8703 "vmovups %%ymm13, (%5)\n"
8704 "vmovups %%ymm14, (%6)\n"
8705 "vmovups %%ymm15, (%7)\n"
8706 :: "r"(buf + j + k + 0), "r"(buf + j + k + 524288), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1572864), "r"(buf + j + k + 2097152), "r"(buf + j + k + 2621440), "r"(buf + j + k + 3145728), "r"(buf + j + k + 3670016) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
8707 );
8708 }
8709 }
8710 return;
8711 }
8712 if (depth == 25) {
8713 helper_float_28_recursive(buf + 0, 22);
8714 helper_float_28_recursive(buf + 4194304, 22);
8715 helper_float_28_recursive(buf + 8388608, 22);
8716 helper_float_28_recursive(buf + 12582912, 22);
8717 helper_float_28_recursive(buf + 16777216, 22);
8718 helper_float_28_recursive(buf + 20971520, 22);
8719 helper_float_28_recursive(buf + 25165824, 22);
8720 helper_float_28_recursive(buf + 29360128, 22);
8721 for (int j = 0; j < 33554432; j += 33554432) {
8722 for (int k = 0; k < 4194304; k += 8) {
8723 __asm__ volatile (
8724 "vmovups (%0), %%ymm0\n"
8725 "vmovups (%1), %%ymm1\n"
8726 "vmovups (%2), %%ymm2\n"
8727 "vmovups (%3), %%ymm3\n"
8728 "vmovups (%4), %%ymm4\n"
8729 "vmovups (%5), %%ymm5\n"
8730 "vmovups (%6), %%ymm6\n"
8731 "vmovups (%7), %%ymm7\n"
8732 "vaddps %%ymm1, %%ymm0, %%ymm8\n"
8733 "vsubps %%ymm1, %%ymm0, %%ymm9\n"
8734 "vaddps %%ymm3, %%ymm2, %%ymm10\n"
8735 "vsubps %%ymm3, %%ymm2, %%ymm11\n"
8736 "vaddps %%ymm5, %%ymm4, %%ymm12\n"
8737 "vsubps %%ymm5, %%ymm4, %%ymm13\n"
8738 "vaddps %%ymm7, %%ymm6, %%ymm14\n"
8739 "vsubps %%ymm7, %%ymm6, %%ymm15\n"
8740 "vaddps %%ymm10, %%ymm8, %%ymm0\n"
8741 "vsubps %%ymm10, %%ymm8, %%ymm2\n"
8742 "vaddps %%ymm11, %%ymm9, %%ymm1\n"
8743 "vsubps %%ymm11, %%ymm9, %%ymm3\n"
8744 "vaddps %%ymm14, %%ymm12, %%ymm4\n"
8745 "vsubps %%ymm14, %%ymm12, %%ymm6\n"
8746 "vaddps %%ymm15, %%ymm13, %%ymm5\n"
8747 "vsubps %%ymm15, %%ymm13, %%ymm7\n"
8748 "vaddps %%ymm4, %%ymm0, %%ymm8\n"
8749 "vsubps %%ymm4, %%ymm0, %%ymm12\n"
8750 "vaddps %%ymm5, %%ymm1, %%ymm9\n"
8751 "vsubps %%ymm5, %%ymm1, %%ymm13\n"
8752 "vaddps %%ymm6, %%ymm2, %%ymm10\n"
8753 "vsubps %%ymm6, %%ymm2, %%ymm14\n"
8754 "vaddps %%ymm7, %%ymm3, %%ymm11\n"
8755 "vsubps %%ymm7, %%ymm3, %%ymm15\n"
8756 "vmovups %%ymm8, (%0)\n"
8757 "vmovups %%ymm9, (%1)\n"
8758 "vmovups %%ymm10, (%2)\n"
8759 "vmovups %%ymm11, (%3)\n"
8760 "vmovups %%ymm12, (%4)\n"
8761 "vmovups %%ymm13, (%5)\n"
8762 "vmovups %%ymm14, (%6)\n"
8763 "vmovups %%ymm15, (%7)\n"
8764 :: "r"(buf + j + k + 0), "r"(buf + j + k + 4194304), "r"(buf + j + k + 8388608), "r"(buf + j + k + 12582912), "r"(buf + j + k + 16777216), "r"(buf + j + k + 20971520), "r"(buf + j + k + 25165824), "r"(buf + j + k + 29360128) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
8765 );
8766 }
8767 }
8768 return;
8769 }
8770 if (depth == 28) {
8771 helper_float_28_recursive(buf + 0, 25);
8772 helper_float_28_recursive(buf + 33554432, 25);
8773 helper_float_28_recursive(buf + 67108864, 25);
8774 helper_float_28_recursive(buf + 100663296, 25);
8775 helper_float_28_recursive(buf + 134217728, 25);
8776 helper_float_28_recursive(buf + 167772160, 25);
8777 helper_float_28_recursive(buf + 201326592, 25);
8778 helper_float_28_recursive(buf + 234881024, 25);
8779 for (int j = 0; j < 268435456; j += 268435456) {
8780 for (int k = 0; k < 33554432; k += 8) {
8781 __asm__ volatile (
8782 "vmovups (%0), %%ymm0\n"
8783 "vmovups (%1), %%ymm1\n"
8784 "vmovups (%2), %%ymm2\n"
8785 "vmovups (%3), %%ymm3\n"
8786 "vmovups (%4), %%ymm4\n"
8787 "vmovups (%5), %%ymm5\n"
8788 "vmovups (%6), %%ymm6\n"
8789 "vmovups (%7), %%ymm7\n"
8790 "vaddps %%ymm1, %%ymm0, %%ymm8\n"
8791 "vsubps %%ymm1, %%ymm0, %%ymm9\n"
8792 "vaddps %%ymm3, %%ymm2, %%ymm10\n"
8793 "vsubps %%ymm3, %%ymm2, %%ymm11\n"
8794 "vaddps %%ymm5, %%ymm4, %%ymm12\n"
8795 "vsubps %%ymm5, %%ymm4, %%ymm13\n"
8796 "vaddps %%ymm7, %%ymm6, %%ymm14\n"
8797 "vsubps %%ymm7, %%ymm6, %%ymm15\n"
8798 "vaddps %%ymm10, %%ymm8, %%ymm0\n"
8799 "vsubps %%ymm10, %%ymm8, %%ymm2\n"
8800 "vaddps %%ymm11, %%ymm9, %%ymm1\n"
8801 "vsubps %%ymm11, %%ymm9, %%ymm3\n"
8802 "vaddps %%ymm14, %%ymm12, %%ymm4\n"
8803 "vsubps %%ymm14, %%ymm12, %%ymm6\n"
8804 "vaddps %%ymm15, %%ymm13, %%ymm5\n"
8805 "vsubps %%ymm15, %%ymm13, %%ymm7\n"
8806 "vaddps %%ymm4, %%ymm0, %%ymm8\n"
8807 "vsubps %%ymm4, %%ymm0, %%ymm12\n"
8808 "vaddps %%ymm5, %%ymm1, %%ymm9\n"
8809 "vsubps %%ymm5, %%ymm1, %%ymm13\n"
8810 "vaddps %%ymm6, %%ymm2, %%ymm10\n"
8811 "vsubps %%ymm6, %%ymm2, %%ymm14\n"
8812 "vaddps %%ymm7, %%ymm3, %%ymm11\n"
8813 "vsubps %%ymm7, %%ymm3, %%ymm15\n"
8814 "vmovups %%ymm8, (%0)\n"
8815 "vmovups %%ymm9, (%1)\n"
8816 "vmovups %%ymm10, (%2)\n"
8817 "vmovups %%ymm11, (%3)\n"
8818 "vmovups %%ymm12, (%4)\n"
8819 "vmovups %%ymm13, (%5)\n"
8820 "vmovups %%ymm14, (%6)\n"
8821 "vmovups %%ymm15, (%7)\n"
8822 :: "r"(buf + j + k + 0), "r"(buf + j + k + 33554432), "r"(buf + j + k + 67108864), "r"(buf + j + k + 100663296), "r"(buf + j + k + 134217728), "r"(buf + j + k + 167772160), "r"(buf + j + k + 201326592), "r"(buf + j + k + 234881024) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
8823 );
8824 }
8825 }
8826 return;
8827 }
8828 }
8829 void helper_float_28(float *buf);
/* In-place fast Hadamard transform over a buffer of 2^28 floats.
 * Thin public entry point: delegates to the recursive helper, which
 * dispatches on depth to the AVX butterfly kernels.
 * buf must hold at least 2^28 floats; contents are overwritten. */
void helper_float_28(float *buf) {
  enum { LOG2_N = 28 }; /* log2 of the transform length handled here */
  helper_float_28_recursive(buf, LOG2_N);
}
8833 void helper_float_29_recursive(float *buf, int depth);
helper_float_29_recursive(float * buf,int depth)8834 void helper_float_29_recursive(float *buf, int depth) {
8835 if (depth == 12) {
8836 for (int j = 0; j < 4096; j += 64) {
8837 for (int k = 0; k < 8; k += 8) {
8838 __asm__ volatile (
8839 "vmovups (%0), %%ymm0\n"
8840 "vmovups (%1), %%ymm1\n"
8841 "vmovups (%2), %%ymm2\n"
8842 "vmovups (%3), %%ymm3\n"
8843 "vmovups (%4), %%ymm4\n"
8844 "vmovups (%5), %%ymm5\n"
8845 "vmovups (%6), %%ymm6\n"
8846 "vmovups (%7), %%ymm7\n"
8847 "vpermilps $160, %%ymm0, %%ymm8\n"
8848 "vpermilps $245, %%ymm0, %%ymm9\n"
8849 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
8850 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
8851 "vaddsubps %%ymm11, %%ymm8, %%ymm0\n"
8852 "vpermilps $160, %%ymm1, %%ymm8\n"
8853 "vpermilps $245, %%ymm1, %%ymm9\n"
8854 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
8855 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
8856 "vaddsubps %%ymm11, %%ymm8, %%ymm1\n"
8857 "vpermilps $160, %%ymm2, %%ymm8\n"
8858 "vpermilps $245, %%ymm2, %%ymm9\n"
8859 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
8860 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
8861 "vaddsubps %%ymm11, %%ymm8, %%ymm2\n"
8862 "vpermilps $160, %%ymm3, %%ymm8\n"
8863 "vpermilps $245, %%ymm3, %%ymm9\n"
8864 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
8865 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
8866 "vaddsubps %%ymm11, %%ymm8, %%ymm3\n"
8867 "vpermilps $160, %%ymm4, %%ymm8\n"
8868 "vpermilps $245, %%ymm4, %%ymm9\n"
8869 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
8870 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
8871 "vaddsubps %%ymm11, %%ymm8, %%ymm4\n"
8872 "vpermilps $160, %%ymm5, %%ymm8\n"
8873 "vpermilps $245, %%ymm5, %%ymm9\n"
8874 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
8875 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
8876 "vaddsubps %%ymm11, %%ymm8, %%ymm5\n"
8877 "vpermilps $160, %%ymm6, %%ymm8\n"
8878 "vpermilps $245, %%ymm6, %%ymm9\n"
8879 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
8880 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
8881 "vaddsubps %%ymm11, %%ymm8, %%ymm6\n"
8882 "vpermilps $160, %%ymm7, %%ymm8\n"
8883 "vpermilps $245, %%ymm7, %%ymm9\n"
8884 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
8885 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
8886 "vaddsubps %%ymm11, %%ymm8, %%ymm7\n"
8887 "vpermilps $68, %%ymm0, %%ymm8\n"
8888 "vpermilps $238, %%ymm0, %%ymm9\n"
8889 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
8890 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
8891 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
8892 "vaddps %%ymm8, %%ymm12, %%ymm0\n"
8893 "vpermilps $68, %%ymm1, %%ymm8\n"
8894 "vpermilps $238, %%ymm1, %%ymm9\n"
8895 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
8896 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
8897 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
8898 "vaddps %%ymm8, %%ymm12, %%ymm1\n"
8899 "vpermilps $68, %%ymm2, %%ymm8\n"
8900 "vpermilps $238, %%ymm2, %%ymm9\n"
8901 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
8902 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
8903 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
8904 "vaddps %%ymm8, %%ymm12, %%ymm2\n"
8905 "vpermilps $68, %%ymm3, %%ymm8\n"
8906 "vpermilps $238, %%ymm3, %%ymm9\n"
8907 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
8908 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
8909 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
8910 "vaddps %%ymm8, %%ymm12, %%ymm3\n"
8911 "vpermilps $68, %%ymm4, %%ymm8\n"
8912 "vpermilps $238, %%ymm4, %%ymm9\n"
8913 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
8914 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
8915 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
8916 "vaddps %%ymm8, %%ymm12, %%ymm4\n"
8917 "vpermilps $68, %%ymm5, %%ymm8\n"
8918 "vpermilps $238, %%ymm5, %%ymm9\n"
8919 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
8920 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
8921 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
8922 "vaddps %%ymm8, %%ymm12, %%ymm5\n"
8923 "vpermilps $68, %%ymm6, %%ymm8\n"
8924 "vpermilps $238, %%ymm6, %%ymm9\n"
8925 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
8926 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
8927 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
8928 "vaddps %%ymm8, %%ymm12, %%ymm6\n"
8929 "vpermilps $68, %%ymm7, %%ymm8\n"
8930 "vpermilps $238, %%ymm7, %%ymm9\n"
8931 "vxorps %%ymm10, %%ymm10, %%ymm10\n"
8932 "vsubps %%ymm9, %%ymm10, %%ymm11\n"
8933 "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
8934 "vaddps %%ymm8, %%ymm12, %%ymm7\n"
8935 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
8936 "vsubps %%ymm0, %%ymm8, %%ymm9\n"
8937 "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n"
8938 "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n"
8939 "vaddps %%ymm10, %%ymm11, %%ymm0\n"
8940 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
8941 "vsubps %%ymm1, %%ymm8, %%ymm9\n"
8942 "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n"
8943 "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n"
8944 "vaddps %%ymm10, %%ymm11, %%ymm1\n"
8945 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
8946 "vsubps %%ymm2, %%ymm8, %%ymm9\n"
8947 "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n"
8948 "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n"
8949 "vaddps %%ymm10, %%ymm11, %%ymm2\n"
8950 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
8951 "vsubps %%ymm3, %%ymm8, %%ymm9\n"
8952 "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n"
8953 "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n"
8954 "vaddps %%ymm10, %%ymm11, %%ymm3\n"
8955 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
8956 "vsubps %%ymm4, %%ymm8, %%ymm9\n"
8957 "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n"
8958 "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n"
8959 "vaddps %%ymm10, %%ymm11, %%ymm4\n"
8960 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
8961 "vsubps %%ymm5, %%ymm8, %%ymm9\n"
8962 "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n"
8963 "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n"
8964 "vaddps %%ymm10, %%ymm11, %%ymm5\n"
8965 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
8966 "vsubps %%ymm6, %%ymm8, %%ymm9\n"
8967 "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n"
8968 "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n"
8969 "vaddps %%ymm10, %%ymm11, %%ymm6\n"
8970 "vxorps %%ymm8, %%ymm8, %%ymm8\n"
8971 "vsubps %%ymm7, %%ymm8, %%ymm9\n"
8972 "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n"
8973 "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n"
8974 "vaddps %%ymm10, %%ymm11, %%ymm7\n"
8975 "vaddps %%ymm1, %%ymm0, %%ymm8\n"
8976 "vsubps %%ymm1, %%ymm0, %%ymm9\n"
8977 "vaddps %%ymm3, %%ymm2, %%ymm10\n"
8978 "vsubps %%ymm3, %%ymm2, %%ymm11\n"
8979 "vaddps %%ymm5, %%ymm4, %%ymm12\n"
8980 "vsubps %%ymm5, %%ymm4, %%ymm13\n"
8981 "vaddps %%ymm7, %%ymm6, %%ymm14\n"
8982 "vsubps %%ymm7, %%ymm6, %%ymm15\n"
8983 "vaddps %%ymm10, %%ymm8, %%ymm0\n"
8984 "vsubps %%ymm10, %%ymm8, %%ymm2\n"
8985 "vaddps %%ymm11, %%ymm9, %%ymm1\n"
8986 "vsubps %%ymm11, %%ymm9, %%ymm3\n"
8987 "vaddps %%ymm14, %%ymm12, %%ymm4\n"
8988 "vsubps %%ymm14, %%ymm12, %%ymm6\n"
8989 "vaddps %%ymm15, %%ymm13, %%ymm5\n"
8990 "vsubps %%ymm15, %%ymm13, %%ymm7\n"
8991 "vaddps %%ymm4, %%ymm0, %%ymm8\n"
8992 "vsubps %%ymm4, %%ymm0, %%ymm12\n"
8993 "vaddps %%ymm5, %%ymm1, %%ymm9\n"
8994 "vsubps %%ymm5, %%ymm1, %%ymm13\n"
8995 "vaddps %%ymm6, %%ymm2, %%ymm10\n"
8996 "vsubps %%ymm6, %%ymm2, %%ymm14\n"
8997 "vaddps %%ymm7, %%ymm3, %%ymm11\n"
8998 "vsubps %%ymm7, %%ymm3, %%ymm15\n"
8999 "vmovups %%ymm8, (%0)\n"
9000 "vmovups %%ymm9, (%1)\n"
9001 "vmovups %%ymm10, (%2)\n"
9002 "vmovups %%ymm11, (%3)\n"
9003 "vmovups %%ymm12, (%4)\n"
9004 "vmovups %%ymm13, (%5)\n"
9005 "vmovups %%ymm14, (%6)\n"
9006 "vmovups %%ymm15, (%7)\n"
9007 :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
9008 );
9009 }
9010 }
9011 for (int j = 0; j < 4096; j += 512) {
9012 for (int k = 0; k < 64; k += 8) {
9013 __asm__ volatile (
9014 "vmovups (%0), %%ymm0\n"
9015 "vmovups (%1), %%ymm1\n"
9016 "vmovups (%2), %%ymm2\n"
9017 "vmovups (%3), %%ymm3\n"
9018 "vmovups (%4), %%ymm4\n"
9019 "vmovups (%5), %%ymm5\n"
9020 "vmovups (%6), %%ymm6\n"
9021 "vmovups (%7), %%ymm7\n"
9022 "vaddps %%ymm1, %%ymm0, %%ymm8\n"
9023 "vsubps %%ymm1, %%ymm0, %%ymm9\n"
9024 "vaddps %%ymm3, %%ymm2, %%ymm10\n"
9025 "vsubps %%ymm3, %%ymm2, %%ymm11\n"
9026 "vaddps %%ymm5, %%ymm4, %%ymm12\n"
9027 "vsubps %%ymm5, %%ymm4, %%ymm13\n"
9028 "vaddps %%ymm7, %%ymm6, %%ymm14\n"
9029 "vsubps %%ymm7, %%ymm6, %%ymm15\n"
9030 "vaddps %%ymm10, %%ymm8, %%ymm0\n"
9031 "vsubps %%ymm10, %%ymm8, %%ymm2\n"
9032 "vaddps %%ymm11, %%ymm9, %%ymm1\n"
9033 "vsubps %%ymm11, %%ymm9, %%ymm3\n"
9034 "vaddps %%ymm14, %%ymm12, %%ymm4\n"
9035 "vsubps %%ymm14, %%ymm12, %%ymm6\n"
9036 "vaddps %%ymm15, %%ymm13, %%ymm5\n"
9037 "vsubps %%ymm15, %%ymm13, %%ymm7\n"
9038 "vaddps %%ymm4, %%ymm0, %%ymm8\n"
9039 "vsubps %%ymm4, %%ymm0, %%ymm12\n"
9040 "vaddps %%ymm5, %%ymm1, %%ymm9\n"
9041 "vsubps %%ymm5, %%ymm1, %%ymm13\n"
9042 "vaddps %%ymm6, %%ymm2, %%ymm10\n"
9043 "vsubps %%ymm6, %%ymm2, %%ymm14\n"
9044 "vaddps %%ymm7, %%ymm3, %%ymm11\n"
9045 "vsubps %%ymm7, %%ymm3, %%ymm15\n"
9046 "vmovups %%ymm8, (%0)\n"
9047 "vmovups %%ymm9, (%1)\n"
9048 "vmovups %%ymm10, (%2)\n"
9049 "vmovups %%ymm11, (%3)\n"
9050 "vmovups %%ymm12, (%4)\n"
9051 "vmovups %%ymm13, (%5)\n"
9052 "vmovups %%ymm14, (%6)\n"
9053 "vmovups %%ymm15, (%7)\n"
9054 :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
9055 );
9056 }
9057 }
9058 for (int j = 0; j < 4096; j += 4096) {
9059 for (int k = 0; k < 512; k += 8) {
9060 __asm__ volatile (
9061 "vmovups (%0), %%ymm0\n"
9062 "vmovups (%1), %%ymm1\n"
9063 "vmovups (%2), %%ymm2\n"
9064 "vmovups (%3), %%ymm3\n"
9065 "vmovups (%4), %%ymm4\n"
9066 "vmovups (%5), %%ymm5\n"
9067 "vmovups (%6), %%ymm6\n"
9068 "vmovups (%7), %%ymm7\n"
9069 "vaddps %%ymm1, %%ymm0, %%ymm8\n"
9070 "vsubps %%ymm1, %%ymm0, %%ymm9\n"
9071 "vaddps %%ymm3, %%ymm2, %%ymm10\n"
9072 "vsubps %%ymm3, %%ymm2, %%ymm11\n"
9073 "vaddps %%ymm5, %%ymm4, %%ymm12\n"
9074 "vsubps %%ymm5, %%ymm4, %%ymm13\n"
9075 "vaddps %%ymm7, %%ymm6, %%ymm14\n"
9076 "vsubps %%ymm7, %%ymm6, %%ymm15\n"
9077 "vaddps %%ymm10, %%ymm8, %%ymm0\n"
9078 "vsubps %%ymm10, %%ymm8, %%ymm2\n"
9079 "vaddps %%ymm11, %%ymm9, %%ymm1\n"
9080 "vsubps %%ymm11, %%ymm9, %%ymm3\n"
9081 "vaddps %%ymm14, %%ymm12, %%ymm4\n"
9082 "vsubps %%ymm14, %%ymm12, %%ymm6\n"
9083 "vaddps %%ymm15, %%ymm13, %%ymm5\n"
9084 "vsubps %%ymm15, %%ymm13, %%ymm7\n"
9085 "vaddps %%ymm4, %%ymm0, %%ymm8\n"
9086 "vsubps %%ymm4, %%ymm0, %%ymm12\n"
9087 "vaddps %%ymm5, %%ymm1, %%ymm9\n"
9088 "vsubps %%ymm5, %%ymm1, %%ymm13\n"
9089 "vaddps %%ymm6, %%ymm2, %%ymm10\n"
9090 "vsubps %%ymm6, %%ymm2, %%ymm14\n"
9091 "vaddps %%ymm7, %%ymm3, %%ymm11\n"
9092 "vsubps %%ymm7, %%ymm3, %%ymm15\n"
9093 "vmovups %%ymm8, (%0)\n"
9094 "vmovups %%ymm9, (%1)\n"
9095 "vmovups %%ymm10, (%2)\n"
9096 "vmovups %%ymm11, (%3)\n"
9097 "vmovups %%ymm12, (%4)\n"
9098 "vmovups %%ymm13, (%5)\n"
9099 "vmovups %%ymm14, (%6)\n"
9100 "vmovups %%ymm15, (%7)\n"
9101 :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
9102 );
9103 }
9104 }
9105 return;
9106 }
9107 if (depth == 15) {
9108 helper_float_29_recursive(buf + 0, 12);
9109 helper_float_29_recursive(buf + 4096, 12);
9110 helper_float_29_recursive(buf + 8192, 12);
9111 helper_float_29_recursive(buf + 12288, 12);
9112 helper_float_29_recursive(buf + 16384, 12);
9113 helper_float_29_recursive(buf + 20480, 12);
9114 helper_float_29_recursive(buf + 24576, 12);
9115 helper_float_29_recursive(buf + 28672, 12);
9116 for (int j = 0; j < 32768; j += 32768) {
9117 for (int k = 0; k < 4096; k += 8) {
9118 __asm__ volatile (
9119 "vmovups (%0), %%ymm0\n"
9120 "vmovups (%1), %%ymm1\n"
9121 "vmovups (%2), %%ymm2\n"
9122 "vmovups (%3), %%ymm3\n"
9123 "vmovups (%4), %%ymm4\n"
9124 "vmovups (%5), %%ymm5\n"
9125 "vmovups (%6), %%ymm6\n"
9126 "vmovups (%7), %%ymm7\n"
9127 "vaddps %%ymm1, %%ymm0, %%ymm8\n"
9128 "vsubps %%ymm1, %%ymm0, %%ymm9\n"
9129 "vaddps %%ymm3, %%ymm2, %%ymm10\n"
9130 "vsubps %%ymm3, %%ymm2, %%ymm11\n"
9131 "vaddps %%ymm5, %%ymm4, %%ymm12\n"
9132 "vsubps %%ymm5, %%ymm4, %%ymm13\n"
9133 "vaddps %%ymm7, %%ymm6, %%ymm14\n"
9134 "vsubps %%ymm7, %%ymm6, %%ymm15\n"
9135 "vaddps %%ymm10, %%ymm8, %%ymm0\n"
9136 "vsubps %%ymm10, %%ymm8, %%ymm2\n"
9137 "vaddps %%ymm11, %%ymm9, %%ymm1\n"
9138 "vsubps %%ymm11, %%ymm9, %%ymm3\n"
9139 "vaddps %%ymm14, %%ymm12, %%ymm4\n"
9140 "vsubps %%ymm14, %%ymm12, %%ymm6\n"
9141 "vaddps %%ymm15, %%ymm13, %%ymm5\n"
9142 "vsubps %%ymm15, %%ymm13, %%ymm7\n"
9143 "vaddps %%ymm4, %%ymm0, %%ymm8\n"
9144 "vsubps %%ymm4, %%ymm0, %%ymm12\n"
9145 "vaddps %%ymm5, %%ymm1, %%ymm9\n"
9146 "vsubps %%ymm5, %%ymm1, %%ymm13\n"
9147 "vaddps %%ymm6, %%ymm2, %%ymm10\n"
9148 "vsubps %%ymm6, %%ymm2, %%ymm14\n"
9149 "vaddps %%ymm7, %%ymm3, %%ymm11\n"
9150 "vsubps %%ymm7, %%ymm3, %%ymm15\n"
9151 "vmovups %%ymm8, (%0)\n"
9152 "vmovups %%ymm9, (%1)\n"
9153 "vmovups %%ymm10, (%2)\n"
9154 "vmovups %%ymm11, (%3)\n"
9155 "vmovups %%ymm12, (%4)\n"
9156 "vmovups %%ymm13, (%5)\n"
9157 "vmovups %%ymm14, (%6)\n"
9158 "vmovups %%ymm15, (%7)\n"
9159 :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
9160 );
9161 }
9162 }
9163 return;
9164 }
9165 if (depth == 18) {
9166 helper_float_29_recursive(buf + 0, 15);
9167 helper_float_29_recursive(buf + 32768, 15);
9168 helper_float_29_recursive(buf + 65536, 15);
9169 helper_float_29_recursive(buf + 98304, 15);
9170 helper_float_29_recursive(buf + 131072, 15);
9171 helper_float_29_recursive(buf + 163840, 15);
9172 helper_float_29_recursive(buf + 196608, 15);
9173 helper_float_29_recursive(buf + 229376, 15);
9174 for (int j = 0; j < 262144; j += 262144) {
9175 for (int k = 0; k < 32768; k += 8) {
9176 __asm__ volatile (
9177 "vmovups (%0), %%ymm0\n"
9178 "vmovups (%1), %%ymm1\n"
9179 "vmovups (%2), %%ymm2\n"
9180 "vmovups (%3), %%ymm3\n"
9181 "vmovups (%4), %%ymm4\n"
9182 "vmovups (%5), %%ymm5\n"
9183 "vmovups (%6), %%ymm6\n"
9184 "vmovups (%7), %%ymm7\n"
9185 "vaddps %%ymm1, %%ymm0, %%ymm8\n"
9186 "vsubps %%ymm1, %%ymm0, %%ymm9\n"
9187 "vaddps %%ymm3, %%ymm2, %%ymm10\n"
9188 "vsubps %%ymm3, %%ymm2, %%ymm11\n"
9189 "vaddps %%ymm5, %%ymm4, %%ymm12\n"
9190 "vsubps %%ymm5, %%ymm4, %%ymm13\n"
9191 "vaddps %%ymm7, %%ymm6, %%ymm14\n"
9192 "vsubps %%ymm7, %%ymm6, %%ymm15\n"
9193 "vaddps %%ymm10, %%ymm8, %%ymm0\n"
9194 "vsubps %%ymm10, %%ymm8, %%ymm2\n"
9195 "vaddps %%ymm11, %%ymm9, %%ymm1\n"
9196 "vsubps %%ymm11, %%ymm9, %%ymm3\n"
9197 "vaddps %%ymm14, %%ymm12, %%ymm4\n"
9198 "vsubps %%ymm14, %%ymm12, %%ymm6\n"
9199 "vaddps %%ymm15, %%ymm13, %%ymm5\n"
9200 "vsubps %%ymm15, %%ymm13, %%ymm7\n"
9201 "vaddps %%ymm4, %%ymm0, %%ymm8\n"
9202 "vsubps %%ymm4, %%ymm0, %%ymm12\n"
9203 "vaddps %%ymm5, %%ymm1, %%ymm9\n"
9204 "vsubps %%ymm5, %%ymm1, %%ymm13\n"
9205 "vaddps %%ymm6, %%ymm2, %%ymm10\n"
9206 "vsubps %%ymm6, %%ymm2, %%ymm14\n"
9207 "vaddps %%ymm7, %%ymm3, %%ymm11\n"
9208 "vsubps %%ymm7, %%ymm3, %%ymm15\n"
9209 "vmovups %%ymm8, (%0)\n"
9210 "vmovups %%ymm9, (%1)\n"
9211 "vmovups %%ymm10, (%2)\n"
9212 "vmovups %%ymm11, (%3)\n"
9213 "vmovups %%ymm12, (%4)\n"
9214 "vmovups %%ymm13, (%5)\n"
9215 "vmovups %%ymm14, (%6)\n"
9216 "vmovups %%ymm15, (%7)\n"
9217 :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
9218 );
9219 }
9220 }
9221 return;
9222 }
9223 if (depth == 21) {
9224 helper_float_29_recursive(buf + 0, 18);
9225 helper_float_29_recursive(buf + 262144, 18);
9226 helper_float_29_recursive(buf + 524288, 18);
9227 helper_float_29_recursive(buf + 786432, 18);
9228 helper_float_29_recursive(buf + 1048576, 18);
9229 helper_float_29_recursive(buf + 1310720, 18);
9230 helper_float_29_recursive(buf + 1572864, 18);
9231 helper_float_29_recursive(buf + 1835008, 18);
9232 for (int j = 0; j < 2097152; j += 2097152) {
9233 for (int k = 0; k < 262144; k += 8) {
9234 __asm__ volatile (
9235 "vmovups (%0), %%ymm0\n"
9236 "vmovups (%1), %%ymm1\n"
9237 "vmovups (%2), %%ymm2\n"
9238 "vmovups (%3), %%ymm3\n"
9239 "vmovups (%4), %%ymm4\n"
9240 "vmovups (%5), %%ymm5\n"
9241 "vmovups (%6), %%ymm6\n"
9242 "vmovups (%7), %%ymm7\n"
9243 "vaddps %%ymm1, %%ymm0, %%ymm8\n"
9244 "vsubps %%ymm1, %%ymm0, %%ymm9\n"
9245 "vaddps %%ymm3, %%ymm2, %%ymm10\n"
9246 "vsubps %%ymm3, %%ymm2, %%ymm11\n"
9247 "vaddps %%ymm5, %%ymm4, %%ymm12\n"
9248 "vsubps %%ymm5, %%ymm4, %%ymm13\n"
9249 "vaddps %%ymm7, %%ymm6, %%ymm14\n"
9250 "vsubps %%ymm7, %%ymm6, %%ymm15\n"
9251 "vaddps %%ymm10, %%ymm8, %%ymm0\n"
9252 "vsubps %%ymm10, %%ymm8, %%ymm2\n"
9253 "vaddps %%ymm11, %%ymm9, %%ymm1\n"
9254 "vsubps %%ymm11, %%ymm9, %%ymm3\n"
9255 "vaddps %%ymm14, %%ymm12, %%ymm4\n"
9256 "vsubps %%ymm14, %%ymm12, %%ymm6\n"
9257 "vaddps %%ymm15, %%ymm13, %%ymm5\n"
9258 "vsubps %%ymm15, %%ymm13, %%ymm7\n"
9259 "vaddps %%ymm4, %%ymm0, %%ymm8\n"
9260 "vsubps %%ymm4, %%ymm0, %%ymm12\n"
9261 "vaddps %%ymm5, %%ymm1, %%ymm9\n"
9262 "vsubps %%ymm5, %%ymm1, %%ymm13\n"
9263 "vaddps %%ymm6, %%ymm2, %%ymm10\n"
9264 "vsubps %%ymm6, %%ymm2, %%ymm14\n"
9265 "vaddps %%ymm7, %%ymm3, %%ymm11\n"
9266 "vsubps %%ymm7, %%ymm3, %%ymm15\n"
9267 "vmovups %%ymm8, (%0)\n"
9268 "vmovups %%ymm9, (%1)\n"
9269 "vmovups %%ymm10, (%2)\n"
9270 "vmovups %%ymm11, (%3)\n"
9271 "vmovups %%ymm12, (%4)\n"
9272 "vmovups %%ymm13, (%5)\n"
9273 "vmovups %%ymm14, (%6)\n"
9274 "vmovups %%ymm15, (%7)\n"
9275 :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1310720), "r"(buf + j + k + 1572864), "r"(buf + j + k + 1835008) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
9276 );
9277 }
9278 }
9279 return;
9280 }
9281 if (depth == 24) {
9282 helper_float_29_recursive(buf + 0, 21);
9283 helper_float_29_recursive(buf + 2097152, 21);
9284 helper_float_29_recursive(buf + 4194304, 21);
9285 helper_float_29_recursive(buf + 6291456, 21);
9286 helper_float_29_recursive(buf + 8388608, 21);
9287 helper_float_29_recursive(buf + 10485760, 21);
9288 helper_float_29_recursive(buf + 12582912, 21);
9289 helper_float_29_recursive(buf + 14680064, 21);
9290 for (int j = 0; j < 16777216; j += 16777216) {
9291 for (int k = 0; k < 2097152; k += 8) {
9292 __asm__ volatile (
9293 "vmovups (%0), %%ymm0\n"
9294 "vmovups (%1), %%ymm1\n"
9295 "vmovups (%2), %%ymm2\n"
9296 "vmovups (%3), %%ymm3\n"
9297 "vmovups (%4), %%ymm4\n"
9298 "vmovups (%5), %%ymm5\n"
9299 "vmovups (%6), %%ymm6\n"
9300 "vmovups (%7), %%ymm7\n"
9301 "vaddps %%ymm1, %%ymm0, %%ymm8\n"
9302 "vsubps %%ymm1, %%ymm0, %%ymm9\n"
9303 "vaddps %%ymm3, %%ymm2, %%ymm10\n"
9304 "vsubps %%ymm3, %%ymm2, %%ymm11\n"
9305 "vaddps %%ymm5, %%ymm4, %%ymm12\n"
9306 "vsubps %%ymm5, %%ymm4, %%ymm13\n"
9307 "vaddps %%ymm7, %%ymm6, %%ymm14\n"
9308 "vsubps %%ymm7, %%ymm6, %%ymm15\n"
9309 "vaddps %%ymm10, %%ymm8, %%ymm0\n"
9310 "vsubps %%ymm10, %%ymm8, %%ymm2\n"
9311 "vaddps %%ymm11, %%ymm9, %%ymm1\n"
9312 "vsubps %%ymm11, %%ymm9, %%ymm3\n"
9313 "vaddps %%ymm14, %%ymm12, %%ymm4\n"
9314 "vsubps %%ymm14, %%ymm12, %%ymm6\n"
9315 "vaddps %%ymm15, %%ymm13, %%ymm5\n"
9316 "vsubps %%ymm15, %%ymm13, %%ymm7\n"
9317 "vaddps %%ymm4, %%ymm0, %%ymm8\n"
9318 "vsubps %%ymm4, %%ymm0, %%ymm12\n"
9319 "vaddps %%ymm5, %%ymm1, %%ymm9\n"
9320 "vsubps %%ymm5, %%ymm1, %%ymm13\n"
9321 "vaddps %%ymm6, %%ymm2, %%ymm10\n"
9322 "vsubps %%ymm6, %%ymm2, %%ymm14\n"
9323 "vaddps %%ymm7, %%ymm3, %%ymm11\n"
9324 "vsubps %%ymm7, %%ymm3, %%ymm15\n"
9325 "vmovups %%ymm8, (%0)\n"
9326 "vmovups %%ymm9, (%1)\n"
9327 "vmovups %%ymm10, (%2)\n"
9328 "vmovups %%ymm11, (%3)\n"
9329 "vmovups %%ymm12, (%4)\n"
9330 "vmovups %%ymm13, (%5)\n"
9331 "vmovups %%ymm14, (%6)\n"
9332 "vmovups %%ymm15, (%7)\n"
9333 :: "r"(buf + j + k + 0), "r"(buf + j + k + 2097152), "r"(buf + j + k + 4194304), "r"(buf + j + k + 6291456), "r"(buf + j + k + 8388608), "r"(buf + j + k + 10485760), "r"(buf + j + k + 12582912), "r"(buf + j + k + 14680064) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
9334 );
9335 }
9336 }
9337 return;
9338 }
9339 if (depth == 27) {
9340 helper_float_29_recursive(buf + 0, 24);
9341 helper_float_29_recursive(buf + 16777216, 24);
9342 helper_float_29_recursive(buf + 33554432, 24);
9343 helper_float_29_recursive(buf + 50331648, 24);
9344 helper_float_29_recursive(buf + 67108864, 24);
9345 helper_float_29_recursive(buf + 83886080, 24);
9346 helper_float_29_recursive(buf + 100663296, 24);
9347 helper_float_29_recursive(buf + 117440512, 24);
9348 for (int j = 0; j < 134217728; j += 134217728) {
9349 for (int k = 0; k < 16777216; k += 8) {
9350 __asm__ volatile (
9351 "vmovups (%0), %%ymm0\n"
9352 "vmovups (%1), %%ymm1\n"
9353 "vmovups (%2), %%ymm2\n"
9354 "vmovups (%3), %%ymm3\n"
9355 "vmovups (%4), %%ymm4\n"
9356 "vmovups (%5), %%ymm5\n"
9357 "vmovups (%6), %%ymm6\n"
9358 "vmovups (%7), %%ymm7\n"
9359 "vaddps %%ymm1, %%ymm0, %%ymm8\n"
9360 "vsubps %%ymm1, %%ymm0, %%ymm9\n"
9361 "vaddps %%ymm3, %%ymm2, %%ymm10\n"
9362 "vsubps %%ymm3, %%ymm2, %%ymm11\n"
9363 "vaddps %%ymm5, %%ymm4, %%ymm12\n"
9364 "vsubps %%ymm5, %%ymm4, %%ymm13\n"
9365 "vaddps %%ymm7, %%ymm6, %%ymm14\n"
9366 "vsubps %%ymm7, %%ymm6, %%ymm15\n"
9367 "vaddps %%ymm10, %%ymm8, %%ymm0\n"
9368 "vsubps %%ymm10, %%ymm8, %%ymm2\n"
9369 "vaddps %%ymm11, %%ymm9, %%ymm1\n"
9370 "vsubps %%ymm11, %%ymm9, %%ymm3\n"
9371 "vaddps %%ymm14, %%ymm12, %%ymm4\n"
9372 "vsubps %%ymm14, %%ymm12, %%ymm6\n"
9373 "vaddps %%ymm15, %%ymm13, %%ymm5\n"
9374 "vsubps %%ymm15, %%ymm13, %%ymm7\n"
9375 "vaddps %%ymm4, %%ymm0, %%ymm8\n"
9376 "vsubps %%ymm4, %%ymm0, %%ymm12\n"
9377 "vaddps %%ymm5, %%ymm1, %%ymm9\n"
9378 "vsubps %%ymm5, %%ymm1, %%ymm13\n"
9379 "vaddps %%ymm6, %%ymm2, %%ymm10\n"
9380 "vsubps %%ymm6, %%ymm2, %%ymm14\n"
9381 "vaddps %%ymm7, %%ymm3, %%ymm11\n"
9382 "vsubps %%ymm7, %%ymm3, %%ymm15\n"
9383 "vmovups %%ymm8, (%0)\n"
9384 "vmovups %%ymm9, (%1)\n"
9385 "vmovups %%ymm10, (%2)\n"
9386 "vmovups %%ymm11, (%3)\n"
9387 "vmovups %%ymm12, (%4)\n"
9388 "vmovups %%ymm13, (%5)\n"
9389 "vmovups %%ymm14, (%6)\n"
9390 "vmovups %%ymm15, (%7)\n"
9391 :: "r"(buf + j + k + 0), "r"(buf + j + k + 16777216), "r"(buf + j + k + 33554432), "r"(buf + j + k + 50331648), "r"(buf + j + k + 67108864), "r"(buf + j + k + 83886080), "r"(buf + j + k + 100663296), "r"(buf + j + k + 117440512) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
9392 );
9393 }
9394 }
9395 return;
9396 }
9397 if (depth == 29) {
9398 helper_float_29_recursive(buf + 0, 27);
9399 helper_float_29_recursive(buf + 134217728, 27);
9400 helper_float_29_recursive(buf + 268435456, 27);
9401 helper_float_29_recursive(buf + 402653184, 27);
9402 for (int j = 0; j < 536870912; j += 536870912) {
9403 for (int k = 0; k < 134217728; k += 8) {
9404 __asm__ volatile (
9405 "vmovups (%0), %%ymm0\n"
9406 "vmovups (%1), %%ymm1\n"
9407 "vmovups (%2), %%ymm2\n"
9408 "vmovups (%3), %%ymm3\n"
9409 "vaddps %%ymm1, %%ymm0, %%ymm8\n"
9410 "vsubps %%ymm1, %%ymm0, %%ymm9\n"
9411 "vaddps %%ymm3, %%ymm2, %%ymm10\n"
9412 "vsubps %%ymm3, %%ymm2, %%ymm11\n"
9413 "vaddps %%ymm10, %%ymm8, %%ymm0\n"
9414 "vsubps %%ymm10, %%ymm8, %%ymm2\n"
9415 "vaddps %%ymm11, %%ymm9, %%ymm1\n"
9416 "vsubps %%ymm11, %%ymm9, %%ymm3\n"
9417 "vmovups %%ymm0, (%0)\n"
9418 "vmovups %%ymm1, (%1)\n"
9419 "vmovups %%ymm2, (%2)\n"
9420 "vmovups %%ymm3, (%3)\n"
9421 :: "r"(buf + j + k + 0), "r"(buf + j + k + 134217728), "r"(buf + j + k + 268435456), "r"(buf + j + k + 402653184) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
9422 );
9423 }
9424 }
9425 return;
9426 }
9427 }
void helper_float_29(float *buf);
void helper_float_29(float *buf) {
  /* In-place fast Hadamard transform of 2^29 floats: hand off to the
   * recursive driver, starting at the full depth of 29 levels. */
  helper_float_29_recursive(buf, 29);
}
void helper_float_30_recursive(float *buf, int depth);
/*
 * Recursive AVX kernel for an in-place fast Hadamard transform of up to
 * 2^30 floats (auto-generated; the recursion is unrolled in steps of 3
 * levels, i.e. radix-8 stages).
 *
 * `depth` selects the sub-transform size (2^depth floats starting at buf).
 * Only depth in {6, 9, 12, 15, 18, 21, 24, 27, 30} is handled; any other
 * value falls through all the `if`s and the call is a silent no-op.
 * Callers (helper_float_30 and the recursive calls below) only ever pass
 * values from that set.
 *
 * buf: pointer to 2^depth floats, overwritten with the (unnormalized)
 * Hadamard transform.  32-byte alignment is not required — all asm
 * accesses use unaligned vmovups loads/stores.
 */
void helper_float_30_recursive(float *buf, int depth) {
  if (depth == 6) {
    /* Base case: 64-point transform.  One asm block processes all 64
     * floats held in ymm0..ymm7.  First three stages are done inside each
     * register (stride-1, stride-2, stride-4 butterflies via vpermilps /
     * vaddsubps / vblendps / vperm2f128 shuffles), then three more stages
     * combine the eight registers (strides 8, 16, 32) with plain
     * vaddps/vsubps pairs. */
    for (int j = 0; j < 64; j += 64) {
      for (int k = 0; k < 8; k += 8) {
        __asm__ volatile (
          "vmovups (%0), %%ymm0\n"
          "vmovups (%1), %%ymm1\n"
          "vmovups (%2), %%ymm2\n"
          "vmovups (%3), %%ymm3\n"
          "vmovups (%4), %%ymm4\n"
          "vmovups (%5), %%ymm5\n"
          "vmovups (%6), %%ymm6\n"
          "vmovups (%7), %%ymm7\n"
          "vpermilps $160, %%ymm0, %%ymm8\n"
          "vpermilps $245, %%ymm0, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm0\n"
          "vpermilps $160, %%ymm1, %%ymm8\n"
          "vpermilps $245, %%ymm1, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm1\n"
          "vpermilps $160, %%ymm2, %%ymm8\n"
          "vpermilps $245, %%ymm2, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm2\n"
          "vpermilps $160, %%ymm3, %%ymm8\n"
          "vpermilps $245, %%ymm3, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm3\n"
          "vpermilps $160, %%ymm4, %%ymm8\n"
          "vpermilps $245, %%ymm4, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm4\n"
          "vpermilps $160, %%ymm5, %%ymm8\n"
          "vpermilps $245, %%ymm5, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm5\n"
          "vpermilps $160, %%ymm6, %%ymm8\n"
          "vpermilps $245, %%ymm6, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm6\n"
          "vpermilps $160, %%ymm7, %%ymm8\n"
          "vpermilps $245, %%ymm7, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubps %%ymm11, %%ymm8, %%ymm7\n"
          "vpermilps $68, %%ymm0, %%ymm8\n"
          "vpermilps $238, %%ymm0, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm0\n"
          "vpermilps $68, %%ymm1, %%ymm8\n"
          "vpermilps $238, %%ymm1, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm1\n"
          "vpermilps $68, %%ymm2, %%ymm8\n"
          "vpermilps $238, %%ymm2, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm2\n"
          "vpermilps $68, %%ymm3, %%ymm8\n"
          "vpermilps $238, %%ymm3, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm3\n"
          "vpermilps $68, %%ymm4, %%ymm8\n"
          "vpermilps $238, %%ymm4, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm4\n"
          "vpermilps $68, %%ymm5, %%ymm8\n"
          "vpermilps $238, %%ymm5, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm5\n"
          "vpermilps $68, %%ymm6, %%ymm8\n"
          "vpermilps $238, %%ymm6, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm6\n"
          "vpermilps $68, %%ymm7, %%ymm8\n"
          "vpermilps $238, %%ymm7, %%ymm9\n"
          "vxorps %%ymm10, %%ymm10, %%ymm10\n"
          "vsubps %%ymm9, %%ymm10, %%ymm11\n"
          "vblendps $204, %%ymm11, %%ymm9, %%ymm12\n"
          "vaddps %%ymm8, %%ymm12, %%ymm7\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm0, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm0, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm0\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm1, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm1, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm1\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm2, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm2, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm2\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm3, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm3, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm3\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm4, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm4, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm4\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm5, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm5, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm5\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm6, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm6, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm6\n"
          "vxorps %%ymm8, %%ymm8, %%ymm8\n"
          "vsubps %%ymm7, %%ymm8, %%ymm9\n"
          "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm10\n"
          "vperm2f128 $49, %%ymm9, %%ymm7, %%ymm11\n"
          "vaddps %%ymm10, %%ymm11, %%ymm7\n"
          "vaddps %%ymm1, %%ymm0, %%ymm8\n"
          "vsubps %%ymm1, %%ymm0, %%ymm9\n"
          "vaddps %%ymm3, %%ymm2, %%ymm10\n"
          "vsubps %%ymm3, %%ymm2, %%ymm11\n"
          "vaddps %%ymm5, %%ymm4, %%ymm12\n"
          "vsubps %%ymm5, %%ymm4, %%ymm13\n"
          "vaddps %%ymm7, %%ymm6, %%ymm14\n"
          "vsubps %%ymm7, %%ymm6, %%ymm15\n"
          "vaddps %%ymm10, %%ymm8, %%ymm0\n"
          "vsubps %%ymm10, %%ymm8, %%ymm2\n"
          "vaddps %%ymm11, %%ymm9, %%ymm1\n"
          "vsubps %%ymm11, %%ymm9, %%ymm3\n"
          "vaddps %%ymm14, %%ymm12, %%ymm4\n"
          "vsubps %%ymm14, %%ymm12, %%ymm6\n"
          "vaddps %%ymm15, %%ymm13, %%ymm5\n"
          "vsubps %%ymm15, %%ymm13, %%ymm7\n"
          "vaddps %%ymm4, %%ymm0, %%ymm8\n"
          "vsubps %%ymm4, %%ymm0, %%ymm12\n"
          "vaddps %%ymm5, %%ymm1, %%ymm9\n"
          "vsubps %%ymm5, %%ymm1, %%ymm13\n"
          "vaddps %%ymm6, %%ymm2, %%ymm10\n"
          "vsubps %%ymm6, %%ymm2, %%ymm14\n"
          "vaddps %%ymm7, %%ymm3, %%ymm11\n"
          "vsubps %%ymm7, %%ymm3, %%ymm15\n"
          "vmovups %%ymm8, (%0)\n"
          "vmovups %%ymm9, (%1)\n"
          "vmovups %%ymm10, (%2)\n"
          "vmovups %%ymm11, (%3)\n"
          "vmovups %%ymm12, (%4)\n"
          "vmovups %%ymm13, (%5)\n"
          "vmovups %%ymm14, (%6)\n"
          "vmovups %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 8), "r"(buf + j + k + 16), "r"(buf + j + k + 24), "r"(buf + j + k + 32), "r"(buf + j + k + 40), "r"(buf + j + k + 48), "r"(buf + j + k + 56) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  /* All remaining cases share one shape: transform the 8 sub-blocks of
   * size 2^(depth-3) recursively, then run three radix-2 combining stages
   * (radix-8 butterfly) across the 8 sub-blocks, 8 floats per operand per
   * asm invocation.  The asm body is identical in every case; only the
   * operand offsets (the sub-block stride) differ. */
  if (depth == 9) {
    helper_float_30_recursive(buf + 0, 6);
    helper_float_30_recursive(buf + 64, 6);
    helper_float_30_recursive(buf + 128, 6);
    helper_float_30_recursive(buf + 192, 6);
    helper_float_30_recursive(buf + 256, 6);
    helper_float_30_recursive(buf + 320, 6);
    helper_float_30_recursive(buf + 384, 6);
    helper_float_30_recursive(buf + 448, 6);
    for (int j = 0; j < 512; j += 512) {
      for (int k = 0; k < 64; k += 8) {
        __asm__ volatile (
          "vmovups (%0), %%ymm0\n"
          "vmovups (%1), %%ymm1\n"
          "vmovups (%2), %%ymm2\n"
          "vmovups (%3), %%ymm3\n"
          "vmovups (%4), %%ymm4\n"
          "vmovups (%5), %%ymm5\n"
          "vmovups (%6), %%ymm6\n"
          "vmovups (%7), %%ymm7\n"
          "vaddps %%ymm1, %%ymm0, %%ymm8\n"
          "vsubps %%ymm1, %%ymm0, %%ymm9\n"
          "vaddps %%ymm3, %%ymm2, %%ymm10\n"
          "vsubps %%ymm3, %%ymm2, %%ymm11\n"
          "vaddps %%ymm5, %%ymm4, %%ymm12\n"
          "vsubps %%ymm5, %%ymm4, %%ymm13\n"
          "vaddps %%ymm7, %%ymm6, %%ymm14\n"
          "vsubps %%ymm7, %%ymm6, %%ymm15\n"
          "vaddps %%ymm10, %%ymm8, %%ymm0\n"
          "vsubps %%ymm10, %%ymm8, %%ymm2\n"
          "vaddps %%ymm11, %%ymm9, %%ymm1\n"
          "vsubps %%ymm11, %%ymm9, %%ymm3\n"
          "vaddps %%ymm14, %%ymm12, %%ymm4\n"
          "vsubps %%ymm14, %%ymm12, %%ymm6\n"
          "vaddps %%ymm15, %%ymm13, %%ymm5\n"
          "vsubps %%ymm15, %%ymm13, %%ymm7\n"
          "vaddps %%ymm4, %%ymm0, %%ymm8\n"
          "vsubps %%ymm4, %%ymm0, %%ymm12\n"
          "vaddps %%ymm5, %%ymm1, %%ymm9\n"
          "vsubps %%ymm5, %%ymm1, %%ymm13\n"
          "vaddps %%ymm6, %%ymm2, %%ymm10\n"
          "vsubps %%ymm6, %%ymm2, %%ymm14\n"
          "vaddps %%ymm7, %%ymm3, %%ymm11\n"
          "vsubps %%ymm7, %%ymm3, %%ymm15\n"
          "vmovups %%ymm8, (%0)\n"
          "vmovups %%ymm9, (%1)\n"
          "vmovups %%ymm10, (%2)\n"
          "vmovups %%ymm11, (%3)\n"
          "vmovups %%ymm12, (%4)\n"
          "vmovups %%ymm13, (%5)\n"
          "vmovups %%ymm14, (%6)\n"
          "vmovups %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 64), "r"(buf + j + k + 128), "r"(buf + j + k + 192), "r"(buf + j + k + 256), "r"(buf + j + k + 320), "r"(buf + j + k + 384), "r"(buf + j + k + 448) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 12) {
    helper_float_30_recursive(buf + 0, 9);
    helper_float_30_recursive(buf + 512, 9);
    helper_float_30_recursive(buf + 1024, 9);
    helper_float_30_recursive(buf + 1536, 9);
    helper_float_30_recursive(buf + 2048, 9);
    helper_float_30_recursive(buf + 2560, 9);
    helper_float_30_recursive(buf + 3072, 9);
    helper_float_30_recursive(buf + 3584, 9);
    for (int j = 0; j < 4096; j += 4096) {
      for (int k = 0; k < 512; k += 8) {
        __asm__ volatile (
          "vmovups (%0), %%ymm0\n"
          "vmovups (%1), %%ymm1\n"
          "vmovups (%2), %%ymm2\n"
          "vmovups (%3), %%ymm3\n"
          "vmovups (%4), %%ymm4\n"
          "vmovups (%5), %%ymm5\n"
          "vmovups (%6), %%ymm6\n"
          "vmovups (%7), %%ymm7\n"
          "vaddps %%ymm1, %%ymm0, %%ymm8\n"
          "vsubps %%ymm1, %%ymm0, %%ymm9\n"
          "vaddps %%ymm3, %%ymm2, %%ymm10\n"
          "vsubps %%ymm3, %%ymm2, %%ymm11\n"
          "vaddps %%ymm5, %%ymm4, %%ymm12\n"
          "vsubps %%ymm5, %%ymm4, %%ymm13\n"
          "vaddps %%ymm7, %%ymm6, %%ymm14\n"
          "vsubps %%ymm7, %%ymm6, %%ymm15\n"
          "vaddps %%ymm10, %%ymm8, %%ymm0\n"
          "vsubps %%ymm10, %%ymm8, %%ymm2\n"
          "vaddps %%ymm11, %%ymm9, %%ymm1\n"
          "vsubps %%ymm11, %%ymm9, %%ymm3\n"
          "vaddps %%ymm14, %%ymm12, %%ymm4\n"
          "vsubps %%ymm14, %%ymm12, %%ymm6\n"
          "vaddps %%ymm15, %%ymm13, %%ymm5\n"
          "vsubps %%ymm15, %%ymm13, %%ymm7\n"
          "vaddps %%ymm4, %%ymm0, %%ymm8\n"
          "vsubps %%ymm4, %%ymm0, %%ymm12\n"
          "vaddps %%ymm5, %%ymm1, %%ymm9\n"
          "vsubps %%ymm5, %%ymm1, %%ymm13\n"
          "vaddps %%ymm6, %%ymm2, %%ymm10\n"
          "vsubps %%ymm6, %%ymm2, %%ymm14\n"
          "vaddps %%ymm7, %%ymm3, %%ymm11\n"
          "vsubps %%ymm7, %%ymm3, %%ymm15\n"
          "vmovups %%ymm8, (%0)\n"
          "vmovups %%ymm9, (%1)\n"
          "vmovups %%ymm10, (%2)\n"
          "vmovups %%ymm11, (%3)\n"
          "vmovups %%ymm12, (%4)\n"
          "vmovups %%ymm13, (%5)\n"
          "vmovups %%ymm14, (%6)\n"
          "vmovups %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 15) {
    helper_float_30_recursive(buf + 0, 12);
    helper_float_30_recursive(buf + 4096, 12);
    helper_float_30_recursive(buf + 8192, 12);
    helper_float_30_recursive(buf + 12288, 12);
    helper_float_30_recursive(buf + 16384, 12);
    helper_float_30_recursive(buf + 20480, 12);
    helper_float_30_recursive(buf + 24576, 12);
    helper_float_30_recursive(buf + 28672, 12);
    for (int j = 0; j < 32768; j += 32768) {
      for (int k = 0; k < 4096; k += 8) {
        __asm__ volatile (
          "vmovups (%0), %%ymm0\n"
          "vmovups (%1), %%ymm1\n"
          "vmovups (%2), %%ymm2\n"
          "vmovups (%3), %%ymm3\n"
          "vmovups (%4), %%ymm4\n"
          "vmovups (%5), %%ymm5\n"
          "vmovups (%6), %%ymm6\n"
          "vmovups (%7), %%ymm7\n"
          "vaddps %%ymm1, %%ymm0, %%ymm8\n"
          "vsubps %%ymm1, %%ymm0, %%ymm9\n"
          "vaddps %%ymm3, %%ymm2, %%ymm10\n"
          "vsubps %%ymm3, %%ymm2, %%ymm11\n"
          "vaddps %%ymm5, %%ymm4, %%ymm12\n"
          "vsubps %%ymm5, %%ymm4, %%ymm13\n"
          "vaddps %%ymm7, %%ymm6, %%ymm14\n"
          "vsubps %%ymm7, %%ymm6, %%ymm15\n"
          "vaddps %%ymm10, %%ymm8, %%ymm0\n"
          "vsubps %%ymm10, %%ymm8, %%ymm2\n"
          "vaddps %%ymm11, %%ymm9, %%ymm1\n"
          "vsubps %%ymm11, %%ymm9, %%ymm3\n"
          "vaddps %%ymm14, %%ymm12, %%ymm4\n"
          "vsubps %%ymm14, %%ymm12, %%ymm6\n"
          "vaddps %%ymm15, %%ymm13, %%ymm5\n"
          "vsubps %%ymm15, %%ymm13, %%ymm7\n"
          "vaddps %%ymm4, %%ymm0, %%ymm8\n"
          "vsubps %%ymm4, %%ymm0, %%ymm12\n"
          "vaddps %%ymm5, %%ymm1, %%ymm9\n"
          "vsubps %%ymm5, %%ymm1, %%ymm13\n"
          "vaddps %%ymm6, %%ymm2, %%ymm10\n"
          "vsubps %%ymm6, %%ymm2, %%ymm14\n"
          "vaddps %%ymm7, %%ymm3, %%ymm11\n"
          "vsubps %%ymm7, %%ymm3, %%ymm15\n"
          "vmovups %%ymm8, (%0)\n"
          "vmovups %%ymm9, (%1)\n"
          "vmovups %%ymm10, (%2)\n"
          "vmovups %%ymm11, (%3)\n"
          "vmovups %%ymm12, (%4)\n"
          "vmovups %%ymm13, (%5)\n"
          "vmovups %%ymm14, (%6)\n"
          "vmovups %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 18) {
    helper_float_30_recursive(buf + 0, 15);
    helper_float_30_recursive(buf + 32768, 15);
    helper_float_30_recursive(buf + 65536, 15);
    helper_float_30_recursive(buf + 98304, 15);
    helper_float_30_recursive(buf + 131072, 15);
    helper_float_30_recursive(buf + 163840, 15);
    helper_float_30_recursive(buf + 196608, 15);
    helper_float_30_recursive(buf + 229376, 15);
    for (int j = 0; j < 262144; j += 262144) {
      for (int k = 0; k < 32768; k += 8) {
        __asm__ volatile (
          "vmovups (%0), %%ymm0\n"
          "vmovups (%1), %%ymm1\n"
          "vmovups (%2), %%ymm2\n"
          "vmovups (%3), %%ymm3\n"
          "vmovups (%4), %%ymm4\n"
          "vmovups (%5), %%ymm5\n"
          "vmovups (%6), %%ymm6\n"
          "vmovups (%7), %%ymm7\n"
          "vaddps %%ymm1, %%ymm0, %%ymm8\n"
          "vsubps %%ymm1, %%ymm0, %%ymm9\n"
          "vaddps %%ymm3, %%ymm2, %%ymm10\n"
          "vsubps %%ymm3, %%ymm2, %%ymm11\n"
          "vaddps %%ymm5, %%ymm4, %%ymm12\n"
          "vsubps %%ymm5, %%ymm4, %%ymm13\n"
          "vaddps %%ymm7, %%ymm6, %%ymm14\n"
          "vsubps %%ymm7, %%ymm6, %%ymm15\n"
          "vaddps %%ymm10, %%ymm8, %%ymm0\n"
          "vsubps %%ymm10, %%ymm8, %%ymm2\n"
          "vaddps %%ymm11, %%ymm9, %%ymm1\n"
          "vsubps %%ymm11, %%ymm9, %%ymm3\n"
          "vaddps %%ymm14, %%ymm12, %%ymm4\n"
          "vsubps %%ymm14, %%ymm12, %%ymm6\n"
          "vaddps %%ymm15, %%ymm13, %%ymm5\n"
          "vsubps %%ymm15, %%ymm13, %%ymm7\n"
          "vaddps %%ymm4, %%ymm0, %%ymm8\n"
          "vsubps %%ymm4, %%ymm0, %%ymm12\n"
          "vaddps %%ymm5, %%ymm1, %%ymm9\n"
          "vsubps %%ymm5, %%ymm1, %%ymm13\n"
          "vaddps %%ymm6, %%ymm2, %%ymm10\n"
          "vsubps %%ymm6, %%ymm2, %%ymm14\n"
          "vaddps %%ymm7, %%ymm3, %%ymm11\n"
          "vsubps %%ymm7, %%ymm3, %%ymm15\n"
          "vmovups %%ymm8, (%0)\n"
          "vmovups %%ymm9, (%1)\n"
          "vmovups %%ymm10, (%2)\n"
          "vmovups %%ymm11, (%3)\n"
          "vmovups %%ymm12, (%4)\n"
          "vmovups %%ymm13, (%5)\n"
          "vmovups %%ymm14, (%6)\n"
          "vmovups %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 21) {
    helper_float_30_recursive(buf + 0, 18);
    helper_float_30_recursive(buf + 262144, 18);
    helper_float_30_recursive(buf + 524288, 18);
    helper_float_30_recursive(buf + 786432, 18);
    helper_float_30_recursive(buf + 1048576, 18);
    helper_float_30_recursive(buf + 1310720, 18);
    helper_float_30_recursive(buf + 1572864, 18);
    helper_float_30_recursive(buf + 1835008, 18);
    for (int j = 0; j < 2097152; j += 2097152) {
      for (int k = 0; k < 262144; k += 8) {
        __asm__ volatile (
          "vmovups (%0), %%ymm0\n"
          "vmovups (%1), %%ymm1\n"
          "vmovups (%2), %%ymm2\n"
          "vmovups (%3), %%ymm3\n"
          "vmovups (%4), %%ymm4\n"
          "vmovups (%5), %%ymm5\n"
          "vmovups (%6), %%ymm6\n"
          "vmovups (%7), %%ymm7\n"
          "vaddps %%ymm1, %%ymm0, %%ymm8\n"
          "vsubps %%ymm1, %%ymm0, %%ymm9\n"
          "vaddps %%ymm3, %%ymm2, %%ymm10\n"
          "vsubps %%ymm3, %%ymm2, %%ymm11\n"
          "vaddps %%ymm5, %%ymm4, %%ymm12\n"
          "vsubps %%ymm5, %%ymm4, %%ymm13\n"
          "vaddps %%ymm7, %%ymm6, %%ymm14\n"
          "vsubps %%ymm7, %%ymm6, %%ymm15\n"
          "vaddps %%ymm10, %%ymm8, %%ymm0\n"
          "vsubps %%ymm10, %%ymm8, %%ymm2\n"
          "vaddps %%ymm11, %%ymm9, %%ymm1\n"
          "vsubps %%ymm11, %%ymm9, %%ymm3\n"
          "vaddps %%ymm14, %%ymm12, %%ymm4\n"
          "vsubps %%ymm14, %%ymm12, %%ymm6\n"
          "vaddps %%ymm15, %%ymm13, %%ymm5\n"
          "vsubps %%ymm15, %%ymm13, %%ymm7\n"
          "vaddps %%ymm4, %%ymm0, %%ymm8\n"
          "vsubps %%ymm4, %%ymm0, %%ymm12\n"
          "vaddps %%ymm5, %%ymm1, %%ymm9\n"
          "vsubps %%ymm5, %%ymm1, %%ymm13\n"
          "vaddps %%ymm6, %%ymm2, %%ymm10\n"
          "vsubps %%ymm6, %%ymm2, %%ymm14\n"
          "vaddps %%ymm7, %%ymm3, %%ymm11\n"
          "vsubps %%ymm7, %%ymm3, %%ymm15\n"
          "vmovups %%ymm8, (%0)\n"
          "vmovups %%ymm9, (%1)\n"
          "vmovups %%ymm10, (%2)\n"
          "vmovups %%ymm11, (%3)\n"
          "vmovups %%ymm12, (%4)\n"
          "vmovups %%ymm13, (%5)\n"
          "vmovups %%ymm14, (%6)\n"
          "vmovups %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1310720), "r"(buf + j + k + 1572864), "r"(buf + j + k + 1835008) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 24) {
    helper_float_30_recursive(buf + 0, 21);
    helper_float_30_recursive(buf + 2097152, 21);
    helper_float_30_recursive(buf + 4194304, 21);
    helper_float_30_recursive(buf + 6291456, 21);
    helper_float_30_recursive(buf + 8388608, 21);
    helper_float_30_recursive(buf + 10485760, 21);
    helper_float_30_recursive(buf + 12582912, 21);
    helper_float_30_recursive(buf + 14680064, 21);
    for (int j = 0; j < 16777216; j += 16777216) {
      for (int k = 0; k < 2097152; k += 8) {
        __asm__ volatile (
          "vmovups (%0), %%ymm0\n"
          "vmovups (%1), %%ymm1\n"
          "vmovups (%2), %%ymm2\n"
          "vmovups (%3), %%ymm3\n"
          "vmovups (%4), %%ymm4\n"
          "vmovups (%5), %%ymm5\n"
          "vmovups (%6), %%ymm6\n"
          "vmovups (%7), %%ymm7\n"
          "vaddps %%ymm1, %%ymm0, %%ymm8\n"
          "vsubps %%ymm1, %%ymm0, %%ymm9\n"
          "vaddps %%ymm3, %%ymm2, %%ymm10\n"
          "vsubps %%ymm3, %%ymm2, %%ymm11\n"
          "vaddps %%ymm5, %%ymm4, %%ymm12\n"
          "vsubps %%ymm5, %%ymm4, %%ymm13\n"
          "vaddps %%ymm7, %%ymm6, %%ymm14\n"
          "vsubps %%ymm7, %%ymm6, %%ymm15\n"
          "vaddps %%ymm10, %%ymm8, %%ymm0\n"
          "vsubps %%ymm10, %%ymm8, %%ymm2\n"
          "vaddps %%ymm11, %%ymm9, %%ymm1\n"
          "vsubps %%ymm11, %%ymm9, %%ymm3\n"
          "vaddps %%ymm14, %%ymm12, %%ymm4\n"
          "vsubps %%ymm14, %%ymm12, %%ymm6\n"
          "vaddps %%ymm15, %%ymm13, %%ymm5\n"
          "vsubps %%ymm15, %%ymm13, %%ymm7\n"
          "vaddps %%ymm4, %%ymm0, %%ymm8\n"
          "vsubps %%ymm4, %%ymm0, %%ymm12\n"
          "vaddps %%ymm5, %%ymm1, %%ymm9\n"
          "vsubps %%ymm5, %%ymm1, %%ymm13\n"
          "vaddps %%ymm6, %%ymm2, %%ymm10\n"
          "vsubps %%ymm6, %%ymm2, %%ymm14\n"
          "vaddps %%ymm7, %%ymm3, %%ymm11\n"
          "vsubps %%ymm7, %%ymm3, %%ymm15\n"
          "vmovups %%ymm8, (%0)\n"
          "vmovups %%ymm9, (%1)\n"
          "vmovups %%ymm10, (%2)\n"
          "vmovups %%ymm11, (%3)\n"
          "vmovups %%ymm12, (%4)\n"
          "vmovups %%ymm13, (%5)\n"
          "vmovups %%ymm14, (%6)\n"
          "vmovups %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 2097152), "r"(buf + j + k + 4194304), "r"(buf + j + k + 6291456), "r"(buf + j + k + 8388608), "r"(buf + j + k + 10485760), "r"(buf + j + k + 12582912), "r"(buf + j + k + 14680064) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 27) {
    helper_float_30_recursive(buf + 0, 24);
    helper_float_30_recursive(buf + 16777216, 24);
    helper_float_30_recursive(buf + 33554432, 24);
    helper_float_30_recursive(buf + 50331648, 24);
    helper_float_30_recursive(buf + 67108864, 24);
    helper_float_30_recursive(buf + 83886080, 24);
    helper_float_30_recursive(buf + 100663296, 24);
    helper_float_30_recursive(buf + 117440512, 24);
    for (int j = 0; j < 134217728; j += 134217728) {
      for (int k = 0; k < 16777216; k += 8) {
        __asm__ volatile (
          "vmovups (%0), %%ymm0\n"
          "vmovups (%1), %%ymm1\n"
          "vmovups (%2), %%ymm2\n"
          "vmovups (%3), %%ymm3\n"
          "vmovups (%4), %%ymm4\n"
          "vmovups (%5), %%ymm5\n"
          "vmovups (%6), %%ymm6\n"
          "vmovups (%7), %%ymm7\n"
          "vaddps %%ymm1, %%ymm0, %%ymm8\n"
          "vsubps %%ymm1, %%ymm0, %%ymm9\n"
          "vaddps %%ymm3, %%ymm2, %%ymm10\n"
          "vsubps %%ymm3, %%ymm2, %%ymm11\n"
          "vaddps %%ymm5, %%ymm4, %%ymm12\n"
          "vsubps %%ymm5, %%ymm4, %%ymm13\n"
          "vaddps %%ymm7, %%ymm6, %%ymm14\n"
          "vsubps %%ymm7, %%ymm6, %%ymm15\n"
          "vaddps %%ymm10, %%ymm8, %%ymm0\n"
          "vsubps %%ymm10, %%ymm8, %%ymm2\n"
          "vaddps %%ymm11, %%ymm9, %%ymm1\n"
          "vsubps %%ymm11, %%ymm9, %%ymm3\n"
          "vaddps %%ymm14, %%ymm12, %%ymm4\n"
          "vsubps %%ymm14, %%ymm12, %%ymm6\n"
          "vaddps %%ymm15, %%ymm13, %%ymm5\n"
          "vsubps %%ymm15, %%ymm13, %%ymm7\n"
          "vaddps %%ymm4, %%ymm0, %%ymm8\n"
          "vsubps %%ymm4, %%ymm0, %%ymm12\n"
          "vaddps %%ymm5, %%ymm1, %%ymm9\n"
          "vsubps %%ymm5, %%ymm1, %%ymm13\n"
          "vaddps %%ymm6, %%ymm2, %%ymm10\n"
          "vsubps %%ymm6, %%ymm2, %%ymm14\n"
          "vaddps %%ymm7, %%ymm3, %%ymm11\n"
          "vsubps %%ymm7, %%ymm3, %%ymm15\n"
          "vmovups %%ymm8, (%0)\n"
          "vmovups %%ymm9, (%1)\n"
          "vmovups %%ymm10, (%2)\n"
          "vmovups %%ymm11, (%3)\n"
          "vmovups %%ymm12, (%4)\n"
          "vmovups %%ymm13, (%5)\n"
          "vmovups %%ymm14, (%6)\n"
          "vmovups %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 16777216), "r"(buf + j + k + 33554432), "r"(buf + j + k + 50331648), "r"(buf + j + k + 67108864), "r"(buf + j + k + 83886080), "r"(buf + j + k + 100663296), "r"(buf + j + k + 117440512) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 30) {
    /* Top level: 2^30 floats = 8 sub-transforms of 2^27, then the final
     * three combining stages. */
    helper_float_30_recursive(buf + 0, 27);
    helper_float_30_recursive(buf + 134217728, 27);
    helper_float_30_recursive(buf + 268435456, 27);
    helper_float_30_recursive(buf + 402653184, 27);
    helper_float_30_recursive(buf + 536870912, 27);
    helper_float_30_recursive(buf + 671088640, 27);
    helper_float_30_recursive(buf + 805306368, 27);
    helper_float_30_recursive(buf + 939524096, 27);
    for (int j = 0; j < 1073741824; j += 1073741824) {
      for (int k = 0; k < 134217728; k += 8) {
        __asm__ volatile (
          "vmovups (%0), %%ymm0\n"
          "vmovups (%1), %%ymm1\n"
          "vmovups (%2), %%ymm2\n"
          "vmovups (%3), %%ymm3\n"
          "vmovups (%4), %%ymm4\n"
          "vmovups (%5), %%ymm5\n"
          "vmovups (%6), %%ymm6\n"
          "vmovups (%7), %%ymm7\n"
          "vaddps %%ymm1, %%ymm0, %%ymm8\n"
          "vsubps %%ymm1, %%ymm0, %%ymm9\n"
          "vaddps %%ymm3, %%ymm2, %%ymm10\n"
          "vsubps %%ymm3, %%ymm2, %%ymm11\n"
          "vaddps %%ymm5, %%ymm4, %%ymm12\n"
          "vsubps %%ymm5, %%ymm4, %%ymm13\n"
          "vaddps %%ymm7, %%ymm6, %%ymm14\n"
          "vsubps %%ymm7, %%ymm6, %%ymm15\n"
          "vaddps %%ymm10, %%ymm8, %%ymm0\n"
          "vsubps %%ymm10, %%ymm8, %%ymm2\n"
          "vaddps %%ymm11, %%ymm9, %%ymm1\n"
          "vsubps %%ymm11, %%ymm9, %%ymm3\n"
          "vaddps %%ymm14, %%ymm12, %%ymm4\n"
          "vsubps %%ymm14, %%ymm12, %%ymm6\n"
          "vaddps %%ymm15, %%ymm13, %%ymm5\n"
          "vsubps %%ymm15, %%ymm13, %%ymm7\n"
          "vaddps %%ymm4, %%ymm0, %%ymm8\n"
          "vsubps %%ymm4, %%ymm0, %%ymm12\n"
          "vaddps %%ymm5, %%ymm1, %%ymm9\n"
          "vsubps %%ymm5, %%ymm1, %%ymm13\n"
          "vaddps %%ymm6, %%ymm2, %%ymm10\n"
          "vsubps %%ymm6, %%ymm2, %%ymm14\n"
          "vaddps %%ymm7, %%ymm3, %%ymm11\n"
          "vsubps %%ymm7, %%ymm3, %%ymm15\n"
          "vmovups %%ymm8, (%0)\n"
          "vmovups %%ymm9, (%1)\n"
          "vmovups %%ymm10, (%2)\n"
          "vmovups %%ymm11, (%3)\n"
          "vmovups %%ymm12, (%4)\n"
          "vmovups %%ymm13, (%5)\n"
          "vmovups %%ymm14, (%6)\n"
          "vmovups %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 134217728), "r"(buf + j + k + 268435456), "r"(buf + j + k + 402653184), "r"(buf + j + k + 536870912), "r"(buf + j + k + 671088640), "r"(buf + j + k + 805306368), "r"(buf + j + k + 939524096) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
}
void helper_float_30(float *buf);
void helper_float_30(float *buf) {
  /* In-place fast Hadamard transform of 2^30 floats: hand off to the
   * recursive driver, starting at the full depth of 30 levels. */
  helper_float_30_recursive(buf, 30);
}
/*
 * Public entry point: in-place fast Hadamard transform of 2^log_n floats.
 *
 * buf   - pointer to 2^log_n floats, overwritten with the transform.
 * log_n - log2 of the transform size; supported range is 0..30.
 *
 * Returns 0 on success, 1 if log_n is outside the supported range
 * (negative or greater than 30).  log_n == 0 is the identity transform,
 * so nothing is done.
 */
int fht_float(float *buf, int log_n) {
  switch (log_n) {
  case 0:
    /* Length-1 transform: identity, nothing to do. */
    return 0;
  case 1:  helper_float_1(buf);  return 0;
  case 2:  helper_float_2(buf);  return 0;
  case 3:  helper_float_3(buf);  return 0;
  case 4:  helper_float_4(buf);  return 0;
  case 5:  helper_float_5(buf);  return 0;
  case 6:  helper_float_6(buf);  return 0;
  case 7:  helper_float_7(buf);  return 0;
  case 8:  helper_float_8(buf);  return 0;
  case 9:  helper_float_9(buf);  return 0;
  case 10: helper_float_10(buf); return 0;
  case 11: helper_float_11(buf); return 0;
  case 12: helper_float_12(buf); return 0;
  case 13: helper_float_13(buf); return 0;
  case 14: helper_float_14(buf); return 0;
  case 15: helper_float_15(buf); return 0;
  case 16: helper_float_16(buf); return 0;
  case 17: helper_float_17(buf); return 0;
  case 18: helper_float_18(buf); return 0;
  case 19: helper_float_19(buf); return 0;
  case 20: helper_float_20(buf); return 0;
  case 21: helper_float_21(buf); return 0;
  case 22: helper_float_22(buf); return 0;
  case 23: helper_float_23(buf); return 0;
  case 24: helper_float_24(buf); return 0;
  case 25: helper_float_25(buf); return 0;
  case 26: helper_float_26(buf); return 0;
  case 27: helper_float_27(buf); return 0;
  case 28: helper_float_28(buf); return 0;
  case 29: helper_float_29(buf); return 0;
  case 30: helper_float_30(buf); return 0;
  default:
    /* Unsupported size (log_n < 0 or log_n > 30). */
    return 1;
  }
}
static inline void helper_double_1(double *buf);
/* In-place unnormalized Hadamard transform of 2 doubles (log_n = 1):
 * the single butterfly (a, b) -> (a + b, a - b).
 * The generated loop bounds run exactly once, so the loops are flattened. */
static inline void helper_double_1(double *buf) {
  const double a = buf[0];
  const double b = buf[1];
  buf[0] = a + b;
  buf[1] = a - b;
}
static inline void helper_double_2(double *buf);
/* In-place unnormalized Hadamard transform of 4 doubles (log_n = 2), AVX.
 * The whole vector fits in one ymm register; both butterfly stages are done
 * with in-register shuffles. buf may be unaligned (vmovupd).
 * NOTE(review): requires an AVX-capable CPU — confirm build targets guarantee this. */
static inline void helper_double_2(double *buf) {
  for (int j = 0; j < 4; j += 4) {
    __asm__ volatile (
      "vmovupd (%0), %%ymm0\n"
      /* Stage 1: butterflies between adjacent elements within each 128-bit lane
         (negate the odd duplicates, then vaddsubpd forms sums/differences). */
      "vpermilpd $0, %%ymm0, %%ymm8\n"
      "vpermilpd $15, %%ymm0, %%ymm9\n"
      "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
      "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
      "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n"
      /* Stage 2: butterflies between the two 128-bit lanes. */
      "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n"
      "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
      "vsubpd %%ymm0, %%ymm9, %%ymm10\n"
      "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n"
      "vaddpd %%ymm11, %%ymm8, %%ymm0\n"
      "vmovupd %%ymm0, (%0)\n"
      :: "r"(buf + j) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
    );
  }
}
static inline void helper_double_3(double *buf);
/* In-place unnormalized Hadamard transform of 8 doubles (log_n = 3), AVX.
 * Two 4-wide registers hold the whole vector; stages 1-2 are in-register
 * shuffles, stage 3 is a butterfly between the two registers.
 * NOTE(review): requires an AVX-capable CPU. */
static inline void helper_double_3(double *buf) {
  for (int j = 0; j < 8; j += 8) {
    for (int k = 0; k < 4; k += 4) {
      __asm__ volatile (
        /* Load buf[0..3] and buf[4..7]. */
        "vmovupd (%0), %%ymm0\n"
        "vmovupd (%1), %%ymm1\n"
        /* Stage 1: adjacent-element butterflies within each 128-bit lane. */
        "vpermilpd $0, %%ymm0, %%ymm8\n"
        "vpermilpd $15, %%ymm0, %%ymm9\n"
        "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
        "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n"
        "vpermilpd $0, %%ymm1, %%ymm8\n"
        "vpermilpd $15, %%ymm1, %%ymm9\n"
        "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
        "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n"
        /* Stage 2: butterflies between the two 128-bit lanes of each register. */
        "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n"
        "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
        "vsubpd %%ymm0, %%ymm9, %%ymm10\n"
        "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n"
        "vaddpd %%ymm11, %%ymm8, %%ymm0\n"
        "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n"
        "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
        "vsubpd %%ymm1, %%ymm9, %%ymm10\n"
        "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n"
        "vaddpd %%ymm11, %%ymm8, %%ymm1\n"
        /* Stage 3: distance-4 butterfly between the two registers. */
        "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
        "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
        "vmovupd %%ymm8, (%0)\n"
        "vmovupd %%ymm9, (%1)\n"
        :: "r"(buf + j + k + 0), "r"(buf + j + k + 4) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
      );
    }
  }
}
void helper_double_4_recursive(double *buf, int depth);
/* In-place unnormalized Hadamard transform of 16 doubles (log_n = 4), AVX.
 * Four 4-wide registers cover the vector; stages 1-2 are in-register
 * shuffles, stages 3-4 combine registers at distances 4 and 8.
 * NOTE(review): only the depth == 4 base case exists here — any other depth
 * is a silent no-op; helper_double_4 always passes depth = 4. */
void helper_double_4_recursive(double *buf, int depth) {
  if (depth == 4) {
    for (int j = 0; j < 16; j += 16) {
      for (int k = 0; k < 4; k += 4) {
        __asm__ volatile (
          /* Load the four 4-double chunks. */
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vmovupd (%2), %%ymm2\n"
          "vmovupd (%3), %%ymm3\n"
          /* Stage 1: adjacent-element butterflies within 128-bit lanes. */
          "vpermilpd $0, %%ymm0, %%ymm8\n"
          "vpermilpd $15, %%ymm0, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n"
          "vpermilpd $0, %%ymm1, %%ymm8\n"
          "vpermilpd $15, %%ymm1, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n"
          "vpermilpd $0, %%ymm2, %%ymm8\n"
          "vpermilpd $15, %%ymm2, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n"
          "vpermilpd $0, %%ymm3, %%ymm8\n"
          "vpermilpd $15, %%ymm3, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n"
          /* Stage 2: butterflies between the 128-bit lanes of each register. */
          "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm0, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm0\n"
          "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm1, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm1\n"
          "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm2, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm2\n"
          "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm3, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm3\n"
          /* Stage 3: distance-4 butterflies between register pairs. */
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
          /* Stage 4: distance-8 butterflies. */
          "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
          "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
          "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
          "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
          /* Store results back in place. */
          "vmovupd %%ymm0, (%0)\n"
          "vmovupd %%ymm1, (%1)\n"
          "vmovupd %%ymm2, (%2)\n"
          "vmovupd %%ymm3, (%3)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
}
void helper_double_4(double *buf);
/* In-place unnormalized Hadamard transform of 16 doubles (log_n = 4).
   Thin wrapper: delegates to the recursive implementation with the full depth. */
void helper_double_4(double *buf) {
  helper_double_4_recursive(buf, 4);
}
static inline void helper_double_5(double *buf);
/* In-place unnormalized Hadamard transform of 32 doubles (log_n = 5), AVX.
 * Eight 4-wide registers hold the whole vector; all five butterfly stages
 * are performed in registers before storing back.
 * NOTE(review): requires an AVX-capable CPU; buf may be unaligned (vmovupd). */
static inline void helper_double_5(double *buf) {
  for (int j = 0; j < 32; j += 32) {
    for (int k = 0; k < 4; k += 4) {
      __asm__ volatile (
        /* Load all 32 doubles into ymm0-ymm7. */
        "vmovupd (%0), %%ymm0\n"
        "vmovupd (%1), %%ymm1\n"
        "vmovupd (%2), %%ymm2\n"
        "vmovupd (%3), %%ymm3\n"
        "vmovupd (%4), %%ymm4\n"
        "vmovupd (%5), %%ymm5\n"
        "vmovupd (%6), %%ymm6\n"
        "vmovupd (%7), %%ymm7\n"
        /* Stage 1: adjacent-element butterflies within 128-bit lanes. */
        "vpermilpd $0, %%ymm0, %%ymm8\n"
        "vpermilpd $15, %%ymm0, %%ymm9\n"
        "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
        "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n"
        "vpermilpd $0, %%ymm1, %%ymm8\n"
        "vpermilpd $15, %%ymm1, %%ymm9\n"
        "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
        "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n"
        "vpermilpd $0, %%ymm2, %%ymm8\n"
        "vpermilpd $15, %%ymm2, %%ymm9\n"
        "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
        "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n"
        "vpermilpd $0, %%ymm3, %%ymm8\n"
        "vpermilpd $15, %%ymm3, %%ymm9\n"
        "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
        "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n"
        "vpermilpd $0, %%ymm4, %%ymm8\n"
        "vpermilpd $15, %%ymm4, %%ymm9\n"
        "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
        "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n"
        "vpermilpd $0, %%ymm5, %%ymm8\n"
        "vpermilpd $15, %%ymm5, %%ymm9\n"
        "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
        "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n"
        "vpermilpd $0, %%ymm6, %%ymm8\n"
        "vpermilpd $15, %%ymm6, %%ymm9\n"
        "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
        "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n"
        "vpermilpd $0, %%ymm7, %%ymm8\n"
        "vpermilpd $15, %%ymm7, %%ymm9\n"
        "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
        "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n"
        /* Stage 2: butterflies between the 128-bit lanes of each register. */
        "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n"
        "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
        "vsubpd %%ymm0, %%ymm9, %%ymm10\n"
        "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n"
        "vaddpd %%ymm11, %%ymm8, %%ymm0\n"
        "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n"
        "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
        "vsubpd %%ymm1, %%ymm9, %%ymm10\n"
        "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n"
        "vaddpd %%ymm11, %%ymm8, %%ymm1\n"
        "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n"
        "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
        "vsubpd %%ymm2, %%ymm9, %%ymm10\n"
        "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n"
        "vaddpd %%ymm11, %%ymm8, %%ymm2\n"
        "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n"
        "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
        "vsubpd %%ymm3, %%ymm9, %%ymm10\n"
        "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n"
        "vaddpd %%ymm11, %%ymm8, %%ymm3\n"
        "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n"
        "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
        "vsubpd %%ymm4, %%ymm9, %%ymm10\n"
        "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n"
        "vaddpd %%ymm11, %%ymm8, %%ymm4\n"
        "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n"
        "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
        "vsubpd %%ymm5, %%ymm9, %%ymm10\n"
        "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n"
        "vaddpd %%ymm11, %%ymm8, %%ymm5\n"
        "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n"
        "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
        "vsubpd %%ymm6, %%ymm9, %%ymm10\n"
        "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n"
        "vaddpd %%ymm11, %%ymm8, %%ymm6\n"
        "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n"
        "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
        "vsubpd %%ymm7, %%ymm9, %%ymm10\n"
        "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n"
        "vaddpd %%ymm11, %%ymm8, %%ymm7\n"
        /* Stage 3: distance-4 butterflies between adjacent registers. */
        "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
        "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
        "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
        "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
        "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
        "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
        "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
        "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
        /* Stage 4: distance-8 butterflies. */
        "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
        "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
        "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
        "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
        "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
        "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
        "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
        "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
        /* Stage 5: distance-16 butterflies. */
        "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
        "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
        "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
        "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
        "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
        "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
        "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
        "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
        /* Store results back in place. */
        "vmovupd %%ymm8, (%0)\n"
        "vmovupd %%ymm9, (%1)\n"
        "vmovupd %%ymm10, (%2)\n"
        "vmovupd %%ymm11, (%3)\n"
        "vmovupd %%ymm12, (%4)\n"
        "vmovupd %%ymm13, (%5)\n"
        "vmovupd %%ymm14, (%6)\n"
        "vmovupd %%ymm15, (%7)\n"
        :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
      );
    }
  }
}
static inline void helper_double_6(double *buf);
/* In-place unnormalized Hadamard transform of 64 doubles (log_n = 6), AVX.
 * Pass 1 applies the 32-element in-register kernel to each half;
 * pass 2 combines the two halves with distance-32 butterflies.
 * NOTE(review): requires an AVX-capable CPU; buf may be unaligned (vmovupd). */
static inline void helper_double_6(double *buf) {
  /* Pass 1: 32-element kernel on each 32-double half (stages 1-5). */
  for (int j = 0; j < 64; j += 32) {
    for (int k = 0; k < 4; k += 4) {
      __asm__ volatile (
        /* Load 32 doubles into ymm0-ymm7. */
        "vmovupd (%0), %%ymm0\n"
        "vmovupd (%1), %%ymm1\n"
        "vmovupd (%2), %%ymm2\n"
        "vmovupd (%3), %%ymm3\n"
        "vmovupd (%4), %%ymm4\n"
        "vmovupd (%5), %%ymm5\n"
        "vmovupd (%6), %%ymm6\n"
        "vmovupd (%7), %%ymm7\n"
        /* Stage 1: adjacent-element butterflies within 128-bit lanes. */
        "vpermilpd $0, %%ymm0, %%ymm8\n"
        "vpermilpd $15, %%ymm0, %%ymm9\n"
        "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
        "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n"
        "vpermilpd $0, %%ymm1, %%ymm8\n"
        "vpermilpd $15, %%ymm1, %%ymm9\n"
        "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
        "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n"
        "vpermilpd $0, %%ymm2, %%ymm8\n"
        "vpermilpd $15, %%ymm2, %%ymm9\n"
        "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
        "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n"
        "vpermilpd $0, %%ymm3, %%ymm8\n"
        "vpermilpd $15, %%ymm3, %%ymm9\n"
        "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
        "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n"
        "vpermilpd $0, %%ymm4, %%ymm8\n"
        "vpermilpd $15, %%ymm4, %%ymm9\n"
        "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
        "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n"
        "vpermilpd $0, %%ymm5, %%ymm8\n"
        "vpermilpd $15, %%ymm5, %%ymm9\n"
        "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
        "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n"
        "vpermilpd $0, %%ymm6, %%ymm8\n"
        "vpermilpd $15, %%ymm6, %%ymm9\n"
        "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
        "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n"
        "vpermilpd $0, %%ymm7, %%ymm8\n"
        "vpermilpd $15, %%ymm7, %%ymm9\n"
        "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
        "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n"
        /* Stage 2: butterflies between the 128-bit lanes of each register. */
        "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n"
        "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
        "vsubpd %%ymm0, %%ymm9, %%ymm10\n"
        "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n"
        "vaddpd %%ymm11, %%ymm8, %%ymm0\n"
        "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n"
        "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
        "vsubpd %%ymm1, %%ymm9, %%ymm10\n"
        "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n"
        "vaddpd %%ymm11, %%ymm8, %%ymm1\n"
        "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n"
        "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
        "vsubpd %%ymm2, %%ymm9, %%ymm10\n"
        "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n"
        "vaddpd %%ymm11, %%ymm8, %%ymm2\n"
        "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n"
        "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
        "vsubpd %%ymm3, %%ymm9, %%ymm10\n"
        "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n"
        "vaddpd %%ymm11, %%ymm8, %%ymm3\n"
        "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n"
        "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
        "vsubpd %%ymm4, %%ymm9, %%ymm10\n"
        "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n"
        "vaddpd %%ymm11, %%ymm8, %%ymm4\n"
        "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n"
        "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
        "vsubpd %%ymm5, %%ymm9, %%ymm10\n"
        "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n"
        "vaddpd %%ymm11, %%ymm8, %%ymm5\n"
        "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n"
        "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
        "vsubpd %%ymm6, %%ymm9, %%ymm10\n"
        "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n"
        "vaddpd %%ymm11, %%ymm8, %%ymm6\n"
        "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n"
        "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
        "vsubpd %%ymm7, %%ymm9, %%ymm10\n"
        "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n"
        "vaddpd %%ymm11, %%ymm8, %%ymm7\n"
        /* Stage 3: distance-4 butterflies between adjacent registers. */
        "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
        "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
        "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
        "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
        "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
        "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
        "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
        "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
        /* Stage 4: distance-8 butterflies. */
        "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
        "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
        "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
        "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
        "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
        "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
        "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
        "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
        /* Stage 5: distance-16 butterflies. */
        "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
        "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
        "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
        "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
        "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
        "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
        "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
        "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
        /* Store results back in place. */
        "vmovupd %%ymm8, (%0)\n"
        "vmovupd %%ymm9, (%1)\n"
        "vmovupd %%ymm10, (%2)\n"
        "vmovupd %%ymm11, (%3)\n"
        "vmovupd %%ymm12, (%4)\n"
        "vmovupd %%ymm13, (%5)\n"
        "vmovupd %%ymm14, (%6)\n"
        "vmovupd %%ymm15, (%7)\n"
        :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
      );
    }
  }
  /* Pass 2 (stage 6): distance-32 butterflies combining the two halves. */
  for (int j = 0; j < 64; j += 64) {
    for (int k = 0; k < 32; k += 4) {
      __asm__ volatile (
        "vmovupd (%0), %%ymm0\n"
        "vmovupd (%1), %%ymm1\n"
        "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
        "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
        "vmovupd %%ymm8, (%0)\n"
        "vmovupd %%ymm9, (%1)\n"
        :: "r"(buf + j + k + 0), "r"(buf + j + k + 32) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
      );
    }
  }
}
static inline void helper_double_7(double *buf);
/* In-place unnormalized Hadamard transform of 128 doubles (log_n = 7), AVX.
 * Pass 1 applies the 32-element in-register kernel to each quarter;
 * pass 2 combines the four quarters with distance-32 and distance-64
 * butterflies (a radix-4 step).
 * NOTE(review): requires an AVX-capable CPU; buf may be unaligned (vmovupd). */
static inline void helper_double_7(double *buf) {
  /* Pass 1: 32-element kernel on each 32-double block (stages 1-5). */
  for (int j = 0; j < 128; j += 32) {
    for (int k = 0; k < 4; k += 4) {
      __asm__ volatile (
        /* Load 32 doubles into ymm0-ymm7. */
        "vmovupd (%0), %%ymm0\n"
        "vmovupd (%1), %%ymm1\n"
        "vmovupd (%2), %%ymm2\n"
        "vmovupd (%3), %%ymm3\n"
        "vmovupd (%4), %%ymm4\n"
        "vmovupd (%5), %%ymm5\n"
        "vmovupd (%6), %%ymm6\n"
        "vmovupd (%7), %%ymm7\n"
        /* Stage 1: adjacent-element butterflies within 128-bit lanes. */
        "vpermilpd $0, %%ymm0, %%ymm8\n"
        "vpermilpd $15, %%ymm0, %%ymm9\n"
        "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
        "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n"
        "vpermilpd $0, %%ymm1, %%ymm8\n"
        "vpermilpd $15, %%ymm1, %%ymm9\n"
        "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
        "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n"
        "vpermilpd $0, %%ymm2, %%ymm8\n"
        "vpermilpd $15, %%ymm2, %%ymm9\n"
        "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
        "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n"
        "vpermilpd $0, %%ymm3, %%ymm8\n"
        "vpermilpd $15, %%ymm3, %%ymm9\n"
        "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
        "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n"
        "vpermilpd $0, %%ymm4, %%ymm8\n"
        "vpermilpd $15, %%ymm4, %%ymm9\n"
        "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
        "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n"
        "vpermilpd $0, %%ymm5, %%ymm8\n"
        "vpermilpd $15, %%ymm5, %%ymm9\n"
        "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
        "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n"
        "vpermilpd $0, %%ymm6, %%ymm8\n"
        "vpermilpd $15, %%ymm6, %%ymm9\n"
        "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
        "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n"
        "vpermilpd $0, %%ymm7, %%ymm8\n"
        "vpermilpd $15, %%ymm7, %%ymm9\n"
        "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
        "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n"
        /* Stage 2: butterflies between the 128-bit lanes of each register. */
        "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n"
        "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
        "vsubpd %%ymm0, %%ymm9, %%ymm10\n"
        "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n"
        "vaddpd %%ymm11, %%ymm8, %%ymm0\n"
        "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n"
        "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
        "vsubpd %%ymm1, %%ymm9, %%ymm10\n"
        "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n"
        "vaddpd %%ymm11, %%ymm8, %%ymm1\n"
        "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n"
        "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
        "vsubpd %%ymm2, %%ymm9, %%ymm10\n"
        "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n"
        "vaddpd %%ymm11, %%ymm8, %%ymm2\n"
        "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n"
        "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
        "vsubpd %%ymm3, %%ymm9, %%ymm10\n"
        "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n"
        "vaddpd %%ymm11, %%ymm8, %%ymm3\n"
        "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n"
        "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
        "vsubpd %%ymm4, %%ymm9, %%ymm10\n"
        "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n"
        "vaddpd %%ymm11, %%ymm8, %%ymm4\n"
        "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n"
        "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
        "vsubpd %%ymm5, %%ymm9, %%ymm10\n"
        "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n"
        "vaddpd %%ymm11, %%ymm8, %%ymm5\n"
        "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n"
        "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
        "vsubpd %%ymm6, %%ymm9, %%ymm10\n"
        "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n"
        "vaddpd %%ymm11, %%ymm8, %%ymm6\n"
        "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n"
        "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
        "vsubpd %%ymm7, %%ymm9, %%ymm10\n"
        "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n"
        "vaddpd %%ymm11, %%ymm8, %%ymm7\n"
        /* Stage 3: distance-4 butterflies between adjacent registers. */
        "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
        "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
        "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
        "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
        "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
        "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
        "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
        "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
        /* Stage 4: distance-8 butterflies. */
        "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
        "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
        "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
        "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
        "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
        "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
        "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
        "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
        /* Stage 5: distance-16 butterflies. */
        "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
        "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
        "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
        "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
        "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
        "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
        "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
        "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
        /* Store results back in place. */
        "vmovupd %%ymm8, (%0)\n"
        "vmovupd %%ymm9, (%1)\n"
        "vmovupd %%ymm10, (%2)\n"
        "vmovupd %%ymm11, (%3)\n"
        "vmovupd %%ymm12, (%4)\n"
        "vmovupd %%ymm13, (%5)\n"
        "vmovupd %%ymm14, (%6)\n"
        "vmovupd %%ymm15, (%7)\n"
        :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
      );
    }
  }
  /* Pass 2 (stages 6-7): radix-4 combine of the four 32-double blocks
     (butterflies at distances 32 and 64). */
  for (int j = 0; j < 128; j += 128) {
    for (int k = 0; k < 32; k += 4) {
      __asm__ volatile (
        "vmovupd (%0), %%ymm0\n"
        "vmovupd (%1), %%ymm1\n"
        "vmovupd (%2), %%ymm2\n"
        "vmovupd (%3), %%ymm3\n"
        "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
        "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
        "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
        "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
        "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
        "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
        "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
        "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
        "vmovupd %%ymm0, (%0)\n"
        "vmovupd %%ymm1, (%1)\n"
        "vmovupd %%ymm2, (%2)\n"
        "vmovupd %%ymm3, (%3)\n"
        :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
      );
    }
  }
}
static inline void helper_double_8(double *buf);
/* In-place unnormalized Hadamard transform of 256 doubles (log_n = 8), AVX.
 * Pass 1 applies the 32-element in-register kernel to each of the eight
 * 32-double blocks; pass 2 combines the blocks with butterflies at
 * distances 32, 64, and 128 (a radix-8 step).
 * NOTE(review): requires an AVX-capable CPU; buf may be unaligned (vmovupd). */
static inline void helper_double_8(double *buf) {
  /* Pass 1: 32-element kernel on each 32-double block (stages 1-5). */
  for (int j = 0; j < 256; j += 32) {
    for (int k = 0; k < 4; k += 4) {
      __asm__ volatile (
        /* Load 32 doubles into ymm0-ymm7. */
        "vmovupd (%0), %%ymm0\n"
        "vmovupd (%1), %%ymm1\n"
        "vmovupd (%2), %%ymm2\n"
        "vmovupd (%3), %%ymm3\n"
        "vmovupd (%4), %%ymm4\n"
        "vmovupd (%5), %%ymm5\n"
        "vmovupd (%6), %%ymm6\n"
        "vmovupd (%7), %%ymm7\n"
        /* Stage 1: adjacent-element butterflies within 128-bit lanes. */
        "vpermilpd $0, %%ymm0, %%ymm8\n"
        "vpermilpd $15, %%ymm0, %%ymm9\n"
        "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
        "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n"
        "vpermilpd $0, %%ymm1, %%ymm8\n"
        "vpermilpd $15, %%ymm1, %%ymm9\n"
        "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
        "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n"
        "vpermilpd $0, %%ymm2, %%ymm8\n"
        "vpermilpd $15, %%ymm2, %%ymm9\n"
        "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
        "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n"
        "vpermilpd $0, %%ymm3, %%ymm8\n"
        "vpermilpd $15, %%ymm3, %%ymm9\n"
        "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
        "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n"
        "vpermilpd $0, %%ymm4, %%ymm8\n"
        "vpermilpd $15, %%ymm4, %%ymm9\n"
        "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
        "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n"
        "vpermilpd $0, %%ymm5, %%ymm8\n"
        "vpermilpd $15, %%ymm5, %%ymm9\n"
        "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
        "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n"
        "vpermilpd $0, %%ymm6, %%ymm8\n"
        "vpermilpd $15, %%ymm6, %%ymm9\n"
        "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
        "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n"
        "vpermilpd $0, %%ymm7, %%ymm8\n"
        "vpermilpd $15, %%ymm7, %%ymm9\n"
        "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
        "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n"
        /* Stage 2: butterflies between the 128-bit lanes of each register. */
        "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n"
        "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
        "vsubpd %%ymm0, %%ymm9, %%ymm10\n"
        "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n"
        "vaddpd %%ymm11, %%ymm8, %%ymm0\n"
        "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n"
        "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
        "vsubpd %%ymm1, %%ymm9, %%ymm10\n"
        "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n"
        "vaddpd %%ymm11, %%ymm8, %%ymm1\n"
        "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n"
        "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
        "vsubpd %%ymm2, %%ymm9, %%ymm10\n"
        "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n"
        "vaddpd %%ymm11, %%ymm8, %%ymm2\n"
        "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n"
        "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
        "vsubpd %%ymm3, %%ymm9, %%ymm10\n"
        "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n"
        "vaddpd %%ymm11, %%ymm8, %%ymm3\n"
        "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n"
        "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
        "vsubpd %%ymm4, %%ymm9, %%ymm10\n"
        "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n"
        "vaddpd %%ymm11, %%ymm8, %%ymm4\n"
        "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n"
        "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
        "vsubpd %%ymm5, %%ymm9, %%ymm10\n"
        "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n"
        "vaddpd %%ymm11, %%ymm8, %%ymm5\n"
        "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n"
        "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
        "vsubpd %%ymm6, %%ymm9, %%ymm10\n"
        "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n"
        "vaddpd %%ymm11, %%ymm8, %%ymm6\n"
        "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n"
        "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
        "vsubpd %%ymm7, %%ymm9, %%ymm10\n"
        "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n"
        "vaddpd %%ymm11, %%ymm8, %%ymm7\n"
        /* Stage 3: distance-4 butterflies between adjacent registers. */
        "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
        "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
        "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
        "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
        "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
        "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
        "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
        "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
        /* Stage 4: distance-8 butterflies. */
        "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
        "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
        "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
        "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
        "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
        "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
        "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
        "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
        /* Stage 5: distance-16 butterflies. */
        "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
        "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
        "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
        "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
        "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
        "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
        "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
        "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
        /* Store results back in place. */
        "vmovupd %%ymm8, (%0)\n"
        "vmovupd %%ymm9, (%1)\n"
        "vmovupd %%ymm10, (%2)\n"
        "vmovupd %%ymm11, (%3)\n"
        "vmovupd %%ymm12, (%4)\n"
        "vmovupd %%ymm13, (%5)\n"
        "vmovupd %%ymm14, (%6)\n"
        "vmovupd %%ymm15, (%7)\n"
        :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
      );
    }
  }
  /* Pass 2 (stages 6-8): radix-8 combine of the eight 32-double blocks
     (butterflies at distances 32, 64, and 128). */
  for (int j = 0; j < 256; j += 256) {
    for (int k = 0; k < 32; k += 4) {
      __asm__ volatile (
        "vmovupd (%0), %%ymm0\n"
        "vmovupd (%1), %%ymm1\n"
        "vmovupd (%2), %%ymm2\n"
        "vmovupd (%3), %%ymm3\n"
        "vmovupd (%4), %%ymm4\n"
        "vmovupd (%5), %%ymm5\n"
        "vmovupd (%6), %%ymm6\n"
        "vmovupd (%7), %%ymm7\n"
        /* Distance-32 butterflies. */
        "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
        "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
        "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
        "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
        "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
        "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
        "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
        "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
        /* Distance-64 butterflies. */
        "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
        "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
        "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
        "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
        "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
        "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
        "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
        "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
        /* Distance-128 butterflies. */
        "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
        "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
        "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
        "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
        "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
        "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
        "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
        "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
        "vmovupd %%ymm8, (%0)\n"
        "vmovupd %%ymm9, (%1)\n"
        "vmovupd %%ymm10, (%2)\n"
        "vmovupd %%ymm11, (%3)\n"
        "vmovupd %%ymm12, (%4)\n"
        "vmovupd %%ymm13, (%5)\n"
        "vmovupd %%ymm14, (%6)\n"
        "vmovupd %%ymm15, (%7)\n"
        :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
      );
    }
  }
}
static inline void helper_double_9(double *buf);
/*
 * In-place, unnormalized 512-point (2^9) Walsh-Hadamard transform of
 * buf[0..511].  Every stage applies the butterfly (u, v) -> (u + v, u - v)
 * at a power-of-two stride; the nine stages are grouped into three passes
 * so each pass keeps its working set in the sixteen ymm registers.
 * Loads/stores use vmovupd, so buf need not be 32-byte aligned.
 * NOTE(review): the inner `k` loops execute exactly one iteration; they are
 * an artifact of the code generator.
 */
static inline void helper_double_9(double *buf) {
    /* Pass 1: strides 1, 2, 4, 8, 16 -- a complete 32-point transform of
     * each 32-double segment.  Eight ymm registers hold one segment
     * (4 doubles per register, registers 4 doubles apart in memory). */
    for (int j = 0; j < 512; j += 32) {
        for (int k = 0; k < 4; k += 4) {
            __asm__ volatile (
                "vmovupd (%0), %%ymm0\n"
                "vmovupd (%1), %%ymm1\n"
                "vmovupd (%2), %%ymm2\n"
                "vmovupd (%3), %%ymm3\n"
                "vmovupd (%4), %%ymm4\n"
                "vmovupd (%5), %%ymm5\n"
                "vmovupd (%6), %%ymm6\n"
                "vmovupd (%7), %%ymm7\n"
                /* Stride-1 butterflies inside each register:
                 * vpermilpd $0 duplicates the low element of each 128-bit
                 * lane, $15 duplicates the high one; the high copy is
                 * negated and vaddsubpd (subtract in even slots, add in odd
                 * slots) turns [a,b,c,d] into [a+b, a-b, c+d, c-d]. */
                "vpermilpd $0, %%ymm0, %%ymm8\n"
                "vpermilpd $15, %%ymm0, %%ymm9\n"
                "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
                "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
                "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n"
                "vpermilpd $0, %%ymm1, %%ymm8\n"
                "vpermilpd $15, %%ymm1, %%ymm9\n"
                "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
                "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
                "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n"
                "vpermilpd $0, %%ymm2, %%ymm8\n"
                "vpermilpd $15, %%ymm2, %%ymm9\n"
                "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
                "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
                "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n"
                "vpermilpd $0, %%ymm3, %%ymm8\n"
                "vpermilpd $15, %%ymm3, %%ymm9\n"
                "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
                "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
                "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n"
                "vpermilpd $0, %%ymm4, %%ymm8\n"
                "vpermilpd $15, %%ymm4, %%ymm9\n"
                "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
                "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
                "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n"
                "vpermilpd $0, %%ymm5, %%ymm8\n"
                "vpermilpd $15, %%ymm5, %%ymm9\n"
                "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
                "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
                "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n"
                "vpermilpd $0, %%ymm6, %%ymm8\n"
                "vpermilpd $15, %%ymm6, %%ymm9\n"
                "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
                "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
                "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n"
                "vpermilpd $0, %%ymm7, %%ymm8\n"
                "vpermilpd $15, %%ymm7, %%ymm9\n"
                "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
                "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
                "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n"
                /* Stride-2 butterflies inside each register:
                 * vperm2f128 $0 broadcasts the low 128-bit lane,
                 * vperm2f128 $49 builds [high, -high]; adding them turns
                 * [a,b,c,d] into [a+c, b+d, a-c, b-d]. */
                "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n"
                "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
                "vsubpd %%ymm0, %%ymm9, %%ymm10\n"
                "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n"
                "vaddpd %%ymm11, %%ymm8, %%ymm0\n"
                "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n"
                "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
                "vsubpd %%ymm1, %%ymm9, %%ymm10\n"
                "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n"
                "vaddpd %%ymm11, %%ymm8, %%ymm1\n"
                "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n"
                "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
                "vsubpd %%ymm2, %%ymm9, %%ymm10\n"
                "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n"
                "vaddpd %%ymm11, %%ymm8, %%ymm2\n"
                "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n"
                "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
                "vsubpd %%ymm3, %%ymm9, %%ymm10\n"
                "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n"
                "vaddpd %%ymm11, %%ymm8, %%ymm3\n"
                "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n"
                "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
                "vsubpd %%ymm4, %%ymm9, %%ymm10\n"
                "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n"
                "vaddpd %%ymm11, %%ymm8, %%ymm4\n"
                "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n"
                "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
                "vsubpd %%ymm5, %%ymm9, %%ymm10\n"
                "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n"
                "vaddpd %%ymm11, %%ymm8, %%ymm5\n"
                "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n"
                "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
                "vsubpd %%ymm6, %%ymm9, %%ymm10\n"
                "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n"
                "vaddpd %%ymm11, %%ymm8, %%ymm6\n"
                "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n"
                "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
                "vsubpd %%ymm7, %%ymm9, %%ymm10\n"
                "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n"
                "vaddpd %%ymm11, %%ymm8, %%ymm7\n"
                /* Strides 4, 8, 16: radix-8 butterfly network across the
                 * eight registers (three add/sub levels). */
                "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
                "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
                "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
                "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
                "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
                "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
                "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
                "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
                "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
                "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
                "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
                "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
                "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
                "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
                "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
                "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
                "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
                "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
                "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
                "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
                "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
                "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
                "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
                "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
                "vmovupd %%ymm8, (%0)\n"
                "vmovupd %%ymm9, (%1)\n"
                "vmovupd %%ymm10, (%2)\n"
                "vmovupd %%ymm11, (%3)\n"
                "vmovupd %%ymm12, (%4)\n"
                "vmovupd %%ymm13, (%5)\n"
                "vmovupd %%ymm14, (%6)\n"
                "vmovupd %%ymm15, (%7)\n"
                :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
            );
        }
    }
    /* Pass 2: strides 32, 64, 128 -- radix-8 combine of the eight 32-double
     * blocks inside each 256-double segment; k walks the 32 doubles of a
     * block, 4 at a time. */
    for (int j = 0; j < 512; j += 256) {
        for (int k = 0; k < 32; k += 4) {
            __asm__ volatile (
                "vmovupd (%0), %%ymm0\n"
                "vmovupd (%1), %%ymm1\n"
                "vmovupd (%2), %%ymm2\n"
                "vmovupd (%3), %%ymm3\n"
                "vmovupd (%4), %%ymm4\n"
                "vmovupd (%5), %%ymm5\n"
                "vmovupd (%6), %%ymm6\n"
                "vmovupd (%7), %%ymm7\n"
                "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
                "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
                "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
                "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
                "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
                "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
                "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
                "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
                "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
                "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
                "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
                "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
                "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
                "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
                "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
                "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
                "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
                "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
                "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
                "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
                "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
                "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
                "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
                "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
                "vmovupd %%ymm8, (%0)\n"
                "vmovupd %%ymm9, (%1)\n"
                "vmovupd %%ymm10, (%2)\n"
                "vmovupd %%ymm11, (%3)\n"
                "vmovupd %%ymm12, (%4)\n"
                "vmovupd %%ymm13, (%5)\n"
                "vmovupd %%ymm14, (%6)\n"
                "vmovupd %%ymm15, (%7)\n"
                :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
            );
        }
    }
    /* Pass 3: stride 256 -- final radix-2 combine of the two 256-double
     * halves of the buffer. */
    for (int j = 0; j < 512; j += 512) {
        for (int k = 0; k < 256; k += 4) {
            __asm__ volatile (
                "vmovupd (%0), %%ymm0\n"
                "vmovupd (%1), %%ymm1\n"
                "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
                "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
                "vmovupd %%ymm8, (%0)\n"
                "vmovupd %%ymm9, (%1)\n"
                :: "r"(buf + j + k + 0), "r"(buf + j + k + 256) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
            );
        }
    }
}
static inline void helper_double_10(double *buf);
/*
 * In-place, unnormalized 1024-point (2^10) Walsh-Hadamard transform of
 * buf[0..1023].  Same scheme as helper_double_9: (u, v) -> (u + v, u - v)
 * butterflies at strides 1..512, grouped into three register-blocked
 * passes.  vmovupd is used throughout, so buf need not be aligned.
 * NOTE(review): the `k` loop of pass 1 runs exactly one iteration; it is a
 * code-generator artifact.
 */
static inline void helper_double_10(double *buf) {
    /* Pass 1: strides 1, 2, 4, 8, 16 -- full 32-point transform of each
     * 32-double segment, held in eight ymm registers. */
    for (int j = 0; j < 1024; j += 32) {
        for (int k = 0; k < 4; k += 4) {
            __asm__ volatile (
                "vmovupd (%0), %%ymm0\n"
                "vmovupd (%1), %%ymm1\n"
                "vmovupd (%2), %%ymm2\n"
                "vmovupd (%3), %%ymm3\n"
                "vmovupd (%4), %%ymm4\n"
                "vmovupd (%5), %%ymm5\n"
                "vmovupd (%6), %%ymm6\n"
                "vmovupd (%7), %%ymm7\n"
                /* Stride-1 butterflies in each register: duplicate low/high
                 * lane elements (vpermilpd $0 / $15), negate the high copy,
                 * and vaddsubpd produces [a+b, a-b, c+d, c-d]. */
                "vpermilpd $0, %%ymm0, %%ymm8\n"
                "vpermilpd $15, %%ymm0, %%ymm9\n"
                "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
                "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
                "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n"
                "vpermilpd $0, %%ymm1, %%ymm8\n"
                "vpermilpd $15, %%ymm1, %%ymm9\n"
                "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
                "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
                "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n"
                "vpermilpd $0, %%ymm2, %%ymm8\n"
                "vpermilpd $15, %%ymm2, %%ymm9\n"
                "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
                "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
                "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n"
                "vpermilpd $0, %%ymm3, %%ymm8\n"
                "vpermilpd $15, %%ymm3, %%ymm9\n"
                "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
                "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
                "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n"
                "vpermilpd $0, %%ymm4, %%ymm8\n"
                "vpermilpd $15, %%ymm4, %%ymm9\n"
                "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
                "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
                "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n"
                "vpermilpd $0, %%ymm5, %%ymm8\n"
                "vpermilpd $15, %%ymm5, %%ymm9\n"
                "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
                "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
                "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n"
                "vpermilpd $0, %%ymm6, %%ymm8\n"
                "vpermilpd $15, %%ymm6, %%ymm9\n"
                "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
                "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
                "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n"
                "vpermilpd $0, %%ymm7, %%ymm8\n"
                "vpermilpd $15, %%ymm7, %%ymm9\n"
                "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
                "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
                "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n"
                /* Stride-2 butterflies in each register via 128-bit lane
                 * swap: [a,b,c,d] -> [a+c, b+d, a-c, b-d]. */
                "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n"
                "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
                "vsubpd %%ymm0, %%ymm9, %%ymm10\n"
                "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n"
                "vaddpd %%ymm11, %%ymm8, %%ymm0\n"
                "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n"
                "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
                "vsubpd %%ymm1, %%ymm9, %%ymm10\n"
                "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n"
                "vaddpd %%ymm11, %%ymm8, %%ymm1\n"
                "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n"
                "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
                "vsubpd %%ymm2, %%ymm9, %%ymm10\n"
                "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n"
                "vaddpd %%ymm11, %%ymm8, %%ymm2\n"
                "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n"
                "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
                "vsubpd %%ymm3, %%ymm9, %%ymm10\n"
                "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n"
                "vaddpd %%ymm11, %%ymm8, %%ymm3\n"
                "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n"
                "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
                "vsubpd %%ymm4, %%ymm9, %%ymm10\n"
                "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n"
                "vaddpd %%ymm11, %%ymm8, %%ymm4\n"
                "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n"
                "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
                "vsubpd %%ymm5, %%ymm9, %%ymm10\n"
                "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n"
                "vaddpd %%ymm11, %%ymm8, %%ymm5\n"
                "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n"
                "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
                "vsubpd %%ymm6, %%ymm9, %%ymm10\n"
                "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n"
                "vaddpd %%ymm11, %%ymm8, %%ymm6\n"
                "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n"
                "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
                "vsubpd %%ymm7, %%ymm9, %%ymm10\n"
                "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n"
                "vaddpd %%ymm11, %%ymm8, %%ymm7\n"
                /* Strides 4, 8, 16: radix-8 butterfly network across the
                 * eight registers. */
                "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
                "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
                "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
                "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
                "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
                "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
                "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
                "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
                "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
                "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
                "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
                "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
                "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
                "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
                "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
                "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
                "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
                "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
                "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
                "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
                "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
                "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
                "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
                "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
                "vmovupd %%ymm8, (%0)\n"
                "vmovupd %%ymm9, (%1)\n"
                "vmovupd %%ymm10, (%2)\n"
                "vmovupd %%ymm11, (%3)\n"
                "vmovupd %%ymm12, (%4)\n"
                "vmovupd %%ymm13, (%5)\n"
                "vmovupd %%ymm14, (%6)\n"
                "vmovupd %%ymm15, (%7)\n"
                :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
            );
        }
    }
    /* Pass 2: strides 32, 64, 128 -- radix-8 combine of the eight 32-double
     * blocks inside each 256-double segment. */
    for (int j = 0; j < 1024; j += 256) {
        for (int k = 0; k < 32; k += 4) {
            __asm__ volatile (
                "vmovupd (%0), %%ymm0\n"
                "vmovupd (%1), %%ymm1\n"
                "vmovupd (%2), %%ymm2\n"
                "vmovupd (%3), %%ymm3\n"
                "vmovupd (%4), %%ymm4\n"
                "vmovupd (%5), %%ymm5\n"
                "vmovupd (%6), %%ymm6\n"
                "vmovupd (%7), %%ymm7\n"
                "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
                "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
                "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
                "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
                "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
                "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
                "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
                "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
                "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
                "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
                "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
                "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
                "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
                "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
                "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
                "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
                "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
                "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
                "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
                "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
                "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
                "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
                "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
                "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
                "vmovupd %%ymm8, (%0)\n"
                "vmovupd %%ymm9, (%1)\n"
                "vmovupd %%ymm10, (%2)\n"
                "vmovupd %%ymm11, (%3)\n"
                "vmovupd %%ymm12, (%4)\n"
                "vmovupd %%ymm13, (%5)\n"
                "vmovupd %%ymm14, (%6)\n"
                "vmovupd %%ymm15, (%7)\n"
                :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
            );
        }
    }
    /* Pass 3: strides 256 and 512 -- radix-4 combine of the four
     * 256-double quarters of the buffer. */
    for (int j = 0; j < 1024; j += 1024) {
        for (int k = 0; k < 256; k += 4) {
            __asm__ volatile (
                "vmovupd (%0), %%ymm0\n"
                "vmovupd (%1), %%ymm1\n"
                "vmovupd (%2), %%ymm2\n"
                "vmovupd (%3), %%ymm3\n"
                "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
                "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
                "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
                "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
                "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
                "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
                "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
                "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
                "vmovupd %%ymm0, (%0)\n"
                "vmovupd %%ymm1, (%1)\n"
                "vmovupd %%ymm2, (%2)\n"
                "vmovupd %%ymm3, (%3)\n"
                :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
            );
        }
    }
}
static inline void helper_double_11(double *buf);
/*
 * In-place, unnormalized 2048-point (2^11) Walsh-Hadamard transform of
 * buf[0..2047].  Same scheme as helper_double_9/10: (u, v) -> (u + v, u - v)
 * butterflies at strides 1..1024, grouped into three register-blocked
 * passes.  vmovupd is used throughout, so buf need not be aligned.
 * NOTE(review): the `k` loop of pass 1 runs exactly one iteration; it is a
 * code-generator artifact.
 */
static inline void helper_double_11(double *buf) {
    /* Pass 1: strides 1, 2, 4, 8, 16 -- full 32-point transform of each
     * 32-double segment, held in eight ymm registers. */
    for (int j = 0; j < 2048; j += 32) {
        for (int k = 0; k < 4; k += 4) {
            __asm__ volatile (
                "vmovupd (%0), %%ymm0\n"
                "vmovupd (%1), %%ymm1\n"
                "vmovupd (%2), %%ymm2\n"
                "vmovupd (%3), %%ymm3\n"
                "vmovupd (%4), %%ymm4\n"
                "vmovupd (%5), %%ymm5\n"
                "vmovupd (%6), %%ymm6\n"
                "vmovupd (%7), %%ymm7\n"
                /* Stride-1 butterflies in each register: duplicate low/high
                 * lane elements (vpermilpd $0 / $15), negate the high copy,
                 * and vaddsubpd produces [a+b, a-b, c+d, c-d]. */
                "vpermilpd $0, %%ymm0, %%ymm8\n"
                "vpermilpd $15, %%ymm0, %%ymm9\n"
                "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
                "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
                "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n"
                "vpermilpd $0, %%ymm1, %%ymm8\n"
                "vpermilpd $15, %%ymm1, %%ymm9\n"
                "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
                "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
                "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n"
                "vpermilpd $0, %%ymm2, %%ymm8\n"
                "vpermilpd $15, %%ymm2, %%ymm9\n"
                "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
                "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
                "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n"
                "vpermilpd $0, %%ymm3, %%ymm8\n"
                "vpermilpd $15, %%ymm3, %%ymm9\n"
                "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
                "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
                "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n"
                "vpermilpd $0, %%ymm4, %%ymm8\n"
                "vpermilpd $15, %%ymm4, %%ymm9\n"
                "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
                "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
                "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n"
                "vpermilpd $0, %%ymm5, %%ymm8\n"
                "vpermilpd $15, %%ymm5, %%ymm9\n"
                "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
                "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
                "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n"
                "vpermilpd $0, %%ymm6, %%ymm8\n"
                "vpermilpd $15, %%ymm6, %%ymm9\n"
                "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
                "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
                "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n"
                "vpermilpd $0, %%ymm7, %%ymm8\n"
                "vpermilpd $15, %%ymm7, %%ymm9\n"
                "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
                "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
                "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n"
                /* Stride-2 butterflies in each register via 128-bit lane
                 * swap: [a,b,c,d] -> [a+c, b+d, a-c, b-d]. */
                "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n"
                "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
                "vsubpd %%ymm0, %%ymm9, %%ymm10\n"
                "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n"
                "vaddpd %%ymm11, %%ymm8, %%ymm0\n"
                "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n"
                "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
                "vsubpd %%ymm1, %%ymm9, %%ymm10\n"
                "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n"
                "vaddpd %%ymm11, %%ymm8, %%ymm1\n"
                "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n"
                "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
                "vsubpd %%ymm2, %%ymm9, %%ymm10\n"
                "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n"
                "vaddpd %%ymm11, %%ymm8, %%ymm2\n"
                "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n"
                "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
                "vsubpd %%ymm3, %%ymm9, %%ymm10\n"
                "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n"
                "vaddpd %%ymm11, %%ymm8, %%ymm3\n"
                "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n"
                "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
                "vsubpd %%ymm4, %%ymm9, %%ymm10\n"
                "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n"
                "vaddpd %%ymm11, %%ymm8, %%ymm4\n"
                "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n"
                "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
                "vsubpd %%ymm5, %%ymm9, %%ymm10\n"
                "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n"
                "vaddpd %%ymm11, %%ymm8, %%ymm5\n"
                "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n"
                "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
                "vsubpd %%ymm6, %%ymm9, %%ymm10\n"
                "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n"
                "vaddpd %%ymm11, %%ymm8, %%ymm6\n"
                "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n"
                "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
                "vsubpd %%ymm7, %%ymm9, %%ymm10\n"
                "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n"
                "vaddpd %%ymm11, %%ymm8, %%ymm7\n"
                /* Strides 4, 8, 16: radix-8 butterfly network across the
                 * eight registers. */
                "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
                "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
                "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
                "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
                "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
                "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
                "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
                "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
                "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
                "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
                "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
                "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
                "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
                "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
                "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
                "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
                "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
                "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
                "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
                "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
                "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
                "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
                "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
                "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
                "vmovupd %%ymm8, (%0)\n"
                "vmovupd %%ymm9, (%1)\n"
                "vmovupd %%ymm10, (%2)\n"
                "vmovupd %%ymm11, (%3)\n"
                "vmovupd %%ymm12, (%4)\n"
                "vmovupd %%ymm13, (%5)\n"
                "vmovupd %%ymm14, (%6)\n"
                "vmovupd %%ymm15, (%7)\n"
                :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
            );
        }
    }
    /* Pass 2: strides 32, 64, 128 -- radix-8 combine of the eight 32-double
     * blocks inside each 256-double segment. */
    for (int j = 0; j < 2048; j += 256) {
        for (int k = 0; k < 32; k += 4) {
            __asm__ volatile (
                "vmovupd (%0), %%ymm0\n"
                "vmovupd (%1), %%ymm1\n"
                "vmovupd (%2), %%ymm2\n"
                "vmovupd (%3), %%ymm3\n"
                "vmovupd (%4), %%ymm4\n"
                "vmovupd (%5), %%ymm5\n"
                "vmovupd (%6), %%ymm6\n"
                "vmovupd (%7), %%ymm7\n"
                "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
                "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
                "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
                "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
                "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
                "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
                "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
                "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
                "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
                "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
                "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
                "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
                "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
                "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
                "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
                "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
                "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
                "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
                "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
                "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
                "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
                "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
                "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
                "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
                "vmovupd %%ymm8, (%0)\n"
                "vmovupd %%ymm9, (%1)\n"
                "vmovupd %%ymm10, (%2)\n"
                "vmovupd %%ymm11, (%3)\n"
                "vmovupd %%ymm12, (%4)\n"
                "vmovupd %%ymm13, (%5)\n"
                "vmovupd %%ymm14, (%6)\n"
                "vmovupd %%ymm15, (%7)\n"
                :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
            );
        }
    }
    /* Pass 3: strides 256, 512, 1024 -- radix-8 combine of the eight
     * 256-double blocks of the buffer. */
    for (int j = 0; j < 2048; j += 2048) {
        for (int k = 0; k < 256; k += 4) {
            __asm__ volatile (
                "vmovupd (%0), %%ymm0\n"
                "vmovupd (%1), %%ymm1\n"
                "vmovupd (%2), %%ymm2\n"
                "vmovupd (%3), %%ymm3\n"
                "vmovupd (%4), %%ymm4\n"
                "vmovupd (%5), %%ymm5\n"
                "vmovupd (%6), %%ymm6\n"
                "vmovupd (%7), %%ymm7\n"
                "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
                "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
                "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
                "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
                "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
                "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
                "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
                "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
                "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
                "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
                "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
                "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
                "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
                "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
                "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
                "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
                "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
                "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
                "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
                "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
                "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
                "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
                "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
                "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
                "vmovupd %%ymm8, (%0)\n"
                "vmovupd %%ymm9, (%1)\n"
                "vmovupd %%ymm10, (%2)\n"
                "vmovupd %%ymm11, (%3)\n"
                "vmovupd %%ymm12, (%4)\n"
                "vmovupd %%ymm13, (%5)\n"
                "vmovupd %%ymm14, (%6)\n"
                "vmovupd %%ymm15, (%7)\n"
                :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
            );
        }
    }
}
11564 void helper_double_12_recursive(double *buf, int depth);
helper_double_12_recursive(double * buf,int depth)11565 void helper_double_12_recursive(double *buf, int depth) {
11566 if (depth == 11) {
11567 for (int j = 0; j < 2048; j += 32) {
11568 for (int k = 0; k < 4; k += 4) {
11569 __asm__ volatile (
11570 "vmovupd (%0), %%ymm0\n"
11571 "vmovupd (%1), %%ymm1\n"
11572 "vmovupd (%2), %%ymm2\n"
11573 "vmovupd (%3), %%ymm3\n"
11574 "vmovupd (%4), %%ymm4\n"
11575 "vmovupd (%5), %%ymm5\n"
11576 "vmovupd (%6), %%ymm6\n"
11577 "vmovupd (%7), %%ymm7\n"
11578 "vpermilpd $0, %%ymm0, %%ymm8\n"
11579 "vpermilpd $15, %%ymm0, %%ymm9\n"
11580 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
11581 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
11582 "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n"
11583 "vpermilpd $0, %%ymm1, %%ymm8\n"
11584 "vpermilpd $15, %%ymm1, %%ymm9\n"
11585 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
11586 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
11587 "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n"
11588 "vpermilpd $0, %%ymm2, %%ymm8\n"
11589 "vpermilpd $15, %%ymm2, %%ymm9\n"
11590 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
11591 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
11592 "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n"
11593 "vpermilpd $0, %%ymm3, %%ymm8\n"
11594 "vpermilpd $15, %%ymm3, %%ymm9\n"
11595 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
11596 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
11597 "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n"
11598 "vpermilpd $0, %%ymm4, %%ymm8\n"
11599 "vpermilpd $15, %%ymm4, %%ymm9\n"
11600 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
11601 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
11602 "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n"
11603 "vpermilpd $0, %%ymm5, %%ymm8\n"
11604 "vpermilpd $15, %%ymm5, %%ymm9\n"
11605 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
11606 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
11607 "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n"
11608 "vpermilpd $0, %%ymm6, %%ymm8\n"
11609 "vpermilpd $15, %%ymm6, %%ymm9\n"
11610 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
11611 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
11612 "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n"
11613 "vpermilpd $0, %%ymm7, %%ymm8\n"
11614 "vpermilpd $15, %%ymm7, %%ymm9\n"
11615 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
11616 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
11617 "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n"
11618 "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n"
11619 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
11620 "vsubpd %%ymm0, %%ymm9, %%ymm10\n"
11621 "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n"
11622 "vaddpd %%ymm11, %%ymm8, %%ymm0\n"
11623 "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n"
11624 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
11625 "vsubpd %%ymm1, %%ymm9, %%ymm10\n"
11626 "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n"
11627 "vaddpd %%ymm11, %%ymm8, %%ymm1\n"
11628 "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n"
11629 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
11630 "vsubpd %%ymm2, %%ymm9, %%ymm10\n"
11631 "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n"
11632 "vaddpd %%ymm11, %%ymm8, %%ymm2\n"
11633 "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n"
11634 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
11635 "vsubpd %%ymm3, %%ymm9, %%ymm10\n"
11636 "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n"
11637 "vaddpd %%ymm11, %%ymm8, %%ymm3\n"
11638 "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n"
11639 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
11640 "vsubpd %%ymm4, %%ymm9, %%ymm10\n"
11641 "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n"
11642 "vaddpd %%ymm11, %%ymm8, %%ymm4\n"
11643 "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n"
11644 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
11645 "vsubpd %%ymm5, %%ymm9, %%ymm10\n"
11646 "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n"
11647 "vaddpd %%ymm11, %%ymm8, %%ymm5\n"
11648 "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n"
11649 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
11650 "vsubpd %%ymm6, %%ymm9, %%ymm10\n"
11651 "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n"
11652 "vaddpd %%ymm11, %%ymm8, %%ymm6\n"
11653 "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n"
11654 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
11655 "vsubpd %%ymm7, %%ymm9, %%ymm10\n"
11656 "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n"
11657 "vaddpd %%ymm11, %%ymm8, %%ymm7\n"
11658 "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
11659 "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
11660 "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
11661 "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
11662 "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
11663 "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
11664 "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
11665 "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
11666 "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
11667 "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
11668 "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
11669 "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
11670 "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
11671 "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
11672 "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
11673 "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
11674 "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
11675 "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
11676 "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
11677 "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
11678 "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
11679 "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
11680 "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
11681 "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
11682 "vmovupd %%ymm8, (%0)\n"
11683 "vmovupd %%ymm9, (%1)\n"
11684 "vmovupd %%ymm10, (%2)\n"
11685 "vmovupd %%ymm11, (%3)\n"
11686 "vmovupd %%ymm12, (%4)\n"
11687 "vmovupd %%ymm13, (%5)\n"
11688 "vmovupd %%ymm14, (%6)\n"
11689 "vmovupd %%ymm15, (%7)\n"
11690 :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
11691 );
11692 }
11693 }
11694 for (int j = 0; j < 2048; j += 256) {
11695 for (int k = 0; k < 32; k += 4) {
11696 __asm__ volatile (
11697 "vmovupd (%0), %%ymm0\n"
11698 "vmovupd (%1), %%ymm1\n"
11699 "vmovupd (%2), %%ymm2\n"
11700 "vmovupd (%3), %%ymm3\n"
11701 "vmovupd (%4), %%ymm4\n"
11702 "vmovupd (%5), %%ymm5\n"
11703 "vmovupd (%6), %%ymm6\n"
11704 "vmovupd (%7), %%ymm7\n"
11705 "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
11706 "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
11707 "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
11708 "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
11709 "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
11710 "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
11711 "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
11712 "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
11713 "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
11714 "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
11715 "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
11716 "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
11717 "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
11718 "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
11719 "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
11720 "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
11721 "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
11722 "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
11723 "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
11724 "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
11725 "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
11726 "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
11727 "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
11728 "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
11729 "vmovupd %%ymm8, (%0)\n"
11730 "vmovupd %%ymm9, (%1)\n"
11731 "vmovupd %%ymm10, (%2)\n"
11732 "vmovupd %%ymm11, (%3)\n"
11733 "vmovupd %%ymm12, (%4)\n"
11734 "vmovupd %%ymm13, (%5)\n"
11735 "vmovupd %%ymm14, (%6)\n"
11736 "vmovupd %%ymm15, (%7)\n"
11737 :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
11738 );
11739 }
11740 }
11741 for (int j = 0; j < 2048; j += 2048) {
11742 for (int k = 0; k < 256; k += 4) {
11743 __asm__ volatile (
11744 "vmovupd (%0), %%ymm0\n"
11745 "vmovupd (%1), %%ymm1\n"
11746 "vmovupd (%2), %%ymm2\n"
11747 "vmovupd (%3), %%ymm3\n"
11748 "vmovupd (%4), %%ymm4\n"
11749 "vmovupd (%5), %%ymm5\n"
11750 "vmovupd (%6), %%ymm6\n"
11751 "vmovupd (%7), %%ymm7\n"
11752 "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
11753 "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
11754 "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
11755 "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
11756 "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
11757 "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
11758 "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
11759 "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
11760 "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
11761 "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
11762 "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
11763 "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
11764 "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
11765 "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
11766 "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
11767 "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
11768 "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
11769 "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
11770 "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
11771 "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
11772 "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
11773 "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
11774 "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
11775 "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
11776 "vmovupd %%ymm8, (%0)\n"
11777 "vmovupd %%ymm9, (%1)\n"
11778 "vmovupd %%ymm10, (%2)\n"
11779 "vmovupd %%ymm11, (%3)\n"
11780 "vmovupd %%ymm12, (%4)\n"
11781 "vmovupd %%ymm13, (%5)\n"
11782 "vmovupd %%ymm14, (%6)\n"
11783 "vmovupd %%ymm15, (%7)\n"
11784 :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
11785 );
11786 }
11787 }
11788 return;
11789 }
11790 if (depth == 12) {
11791 helper_double_12_recursive(buf + 0, 11);
11792 helper_double_12_recursive(buf + 2048, 11);
11793 for (int j = 0; j < 4096; j += 4096) {
11794 for (int k = 0; k < 2048; k += 4) {
11795 __asm__ volatile (
11796 "vmovupd (%0), %%ymm0\n"
11797 "vmovupd (%1), %%ymm1\n"
11798 "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
11799 "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
11800 "vmovupd %%ymm8, (%0)\n"
11801 "vmovupd %%ymm9, (%1)\n"
11802 :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
11803 );
11804 }
11805 }
11806 return;
11807 }
11808 }
void helper_double_12(double *buf);
/* In-place unnormalized Walsh-Hadamard transform of 4096 (= 2^12) doubles. */
void helper_double_12(double *buf) {
  /* The recursive kernel does all the work; start it at the full depth. */
  enum { LOG2_N = 12 };
  helper_double_12_recursive(buf, LOG2_N);
}
void helper_double_13_recursive(double *buf, int depth);
/*
 * In-place unnormalized Fast Walsh-Hadamard Transform kernel (AVX inline asm).
 *
 * buf   - pointer to 2^depth doubles, transformed in place.
 * depth - log2 of the transform length. Only two values are handled here:
 *           depth == 11: iterative base case over 2048 doubles;
 *           depth == 13: four depth-11 passes on the quarters of the
 *                        8192-double buffer, then a radix-4 combine.
 *         Any other depth falls through and is a silent no-op
 *         (NOTE(review): callers appear to pass only 13 and 11 - confirm).
 */
void helper_double_13_recursive(double *buf, int depth) {
  if (depth == 11) {
    /* Stage group 1: butterflies at strides 1, 2, 4, 8 and 16, performed
       entirely in registers on each 32-double tile (8 ymm regs x 4 doubles). */
    for (int j = 0; j < 2048; j += 32) {
      for (int k = 0; k < 4; k += 4) {
        __asm__ volatile (
          /* Load the 32-double tile into ymm0..ymm7. */
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vmovupd (%2), %%ymm2\n"
          "vmovupd (%3), %%ymm3\n"
          "vmovupd (%4), %%ymm4\n"
          "vmovupd (%5), %%ymm5\n"
          "vmovupd (%6), %%ymm6\n"
          "vmovupd (%7), %%ymm7\n"
          /* Stride-1 butterflies within each register: for each adjacent
             pair (a,b) compute (a+b, a-b).  $0 duplicates the even lanes,
             $15 the odd lanes; the odd copy is negated and vaddsubpd
             (subtract in even slots, add in odd) yields a+b / a-b. */
          "vpermilpd $0, %%ymm0, %%ymm8\n"
          "vpermilpd $15, %%ymm0, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n"
          "vpermilpd $0, %%ymm1, %%ymm8\n"
          "vpermilpd $15, %%ymm1, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n"
          "vpermilpd $0, %%ymm2, %%ymm8\n"
          "vpermilpd $15, %%ymm2, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n"
          "vpermilpd $0, %%ymm3, %%ymm8\n"
          "vpermilpd $15, %%ymm3, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n"
          "vpermilpd $0, %%ymm4, %%ymm8\n"
          "vpermilpd $15, %%ymm4, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n"
          "vpermilpd $0, %%ymm5, %%ymm8\n"
          "vpermilpd $15, %%ymm5, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n"
          "vpermilpd $0, %%ymm6, %%ymm8\n"
          "vpermilpd $15, %%ymm6, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n"
          "vpermilpd $0, %%ymm7, %%ymm8\n"
          "vpermilpd $15, %%ymm7, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n"
          /* Stride-2 butterflies within each register: combine the low and
             high 128-bit halves (low half broadcast + [high | -high]). */
          "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm0, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm0\n"
          "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm1, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm1\n"
          "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm2, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm2\n"
          "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm3, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm3\n"
          "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm4, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm4\n"
          "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm5, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm5\n"
          "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm6, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm6\n"
          "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm7, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm7\n"
          /* Strides 4, 8 and 16: three butterfly levels across the eight
             registers (radix-8 combine held entirely in registers). */
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
          "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
          "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
          "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
          "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
          "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
          "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
          "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
          "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
          "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
          "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
          "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
          "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
          "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
          "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
          "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
          "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
          "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
          /* Store the tile back (results ended up in ymm8..ymm15). */
          "vmovupd %%ymm8, (%0)\n"
          "vmovupd %%ymm9, (%1)\n"
          "vmovupd %%ymm10, (%2)\n"
          "vmovupd %%ymm11, (%3)\n"
          "vmovupd %%ymm12, (%4)\n"
          "vmovupd %%ymm13, (%5)\n"
          "vmovupd %%ymm14, (%6)\n"
          "vmovupd %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    /* Stage group 2: butterflies at strides 32, 64 and 128 (pointers are
       32 doubles apart; three levels per asm block). */
    for (int j = 0; j < 2048; j += 256) {
      for (int k = 0; k < 32; k += 4) {
        __asm__ volatile (
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vmovupd (%2), %%ymm2\n"
          "vmovupd (%3), %%ymm3\n"
          "vmovupd (%4), %%ymm4\n"
          "vmovupd (%5), %%ymm5\n"
          "vmovupd (%6), %%ymm6\n"
          "vmovupd (%7), %%ymm7\n"
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
          "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
          "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
          "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
          "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
          "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
          "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
          "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
          "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
          "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
          "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
          "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
          "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
          "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
          "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
          "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
          "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
          "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
          "vmovupd %%ymm8, (%0)\n"
          "vmovupd %%ymm9, (%1)\n"
          "vmovupd %%ymm10, (%2)\n"
          "vmovupd %%ymm11, (%3)\n"
          "vmovupd %%ymm12, (%4)\n"
          "vmovupd %%ymm13, (%5)\n"
          "vmovupd %%ymm14, (%6)\n"
          "vmovupd %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    /* Stage group 3: butterflies at strides 256, 512 and 1024 - the final
       three levels of the 2048-point (depth-11) transform. */
    for (int j = 0; j < 2048; j += 2048) {
      for (int k = 0; k < 256; k += 4) {
        __asm__ volatile (
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vmovupd (%2), %%ymm2\n"
          "vmovupd (%3), %%ymm3\n"
          "vmovupd (%4), %%ymm4\n"
          "vmovupd (%5), %%ymm5\n"
          "vmovupd (%6), %%ymm6\n"
          "vmovupd (%7), %%ymm7\n"
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
          "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
          "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
          "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
          "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
          "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
          "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
          "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
          "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
          "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
          "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
          "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
          "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
          "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
          "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
          "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
          "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
          "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
          "vmovupd %%ymm8, (%0)\n"
          "vmovupd %%ymm9, (%1)\n"
          "vmovupd %%ymm10, (%2)\n"
          "vmovupd %%ymm11, (%3)\n"
          "vmovupd %%ymm12, (%4)\n"
          "vmovupd %%ymm13, (%5)\n"
          "vmovupd %%ymm14, (%6)\n"
          "vmovupd %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 13) {
    /* Transform each 2048-double quarter independently, then merge them
       with a radix-4 butterfly (strides 2048 and 4096 in one pass). */
    helper_double_13_recursive(buf + 0, 11);
    helper_double_13_recursive(buf + 2048, 11);
    helper_double_13_recursive(buf + 4096, 11);
    helper_double_13_recursive(buf + 6144, 11);
    for (int j = 0; j < 8192; j += 8192) {
      for (int k = 0; k < 2048; k += 4) {
        __asm__ volatile (
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vmovupd (%2), %%ymm2\n"
          "vmovupd (%3), %%ymm3\n"
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
          "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
          "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
          "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
          "vmovupd %%ymm0, (%0)\n"
          "vmovupd %%ymm1, (%1)\n"
          "vmovupd %%ymm2, (%2)\n"
          "vmovupd %%ymm3, (%3)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
}
void helper_double_13(double *buf);
/* In-place unnormalized Walsh-Hadamard transform of 8192 (= 2^13) doubles. */
void helper_double_13(double *buf) {
  /* The recursive kernel does all the work; start it at the full depth. */
  enum { LOG2_N = 13 };
  helper_double_13_recursive(buf, LOG2_N);
}
void helper_double_14_recursive(double *buf, int depth);
/*
 * In-place unnormalized Fast Walsh-Hadamard Transform kernel (AVX inline asm).
 *
 * buf   - pointer to 2^depth doubles, transformed in place.
 * depth - log2 of the transform length. Only two values are handled here:
 *           depth == 12: iterative base case over 4096 doubles;
 *           depth == 14: four depth-12 passes on the quarters of the
 *                        16384-double buffer, then a radix-4 combine.
 *         Any other depth falls through and is a silent no-op
 *         (NOTE(review): callers appear to pass only 14 and 12 - confirm).
 */
void helper_double_14_recursive(double *buf, int depth) {
  if (depth == 12) {
    /* Stage group 1: butterflies at strides 1, 2, 4, 8 and 16, performed
       entirely in registers on each 32-double tile (8 ymm regs x 4 doubles). */
    for (int j = 0; j < 4096; j += 32) {
      for (int k = 0; k < 4; k += 4) {
        __asm__ volatile (
          /* Load the 32-double tile into ymm0..ymm7. */
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vmovupd (%2), %%ymm2\n"
          "vmovupd (%3), %%ymm3\n"
          "vmovupd (%4), %%ymm4\n"
          "vmovupd (%5), %%ymm5\n"
          "vmovupd (%6), %%ymm6\n"
          "vmovupd (%7), %%ymm7\n"
          /* Stride-1 butterflies within each register: for each adjacent
             pair (a,b) compute (a+b, a-b) via duplicate-even /
             duplicate-odd permutes, a negate, and vaddsubpd. */
          "vpermilpd $0, %%ymm0, %%ymm8\n"
          "vpermilpd $15, %%ymm0, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n"
          "vpermilpd $0, %%ymm1, %%ymm8\n"
          "vpermilpd $15, %%ymm1, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n"
          "vpermilpd $0, %%ymm2, %%ymm8\n"
          "vpermilpd $15, %%ymm2, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n"
          "vpermilpd $0, %%ymm3, %%ymm8\n"
          "vpermilpd $15, %%ymm3, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n"
          "vpermilpd $0, %%ymm4, %%ymm8\n"
          "vpermilpd $15, %%ymm4, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n"
          "vpermilpd $0, %%ymm5, %%ymm8\n"
          "vpermilpd $15, %%ymm5, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n"
          "vpermilpd $0, %%ymm6, %%ymm8\n"
          "vpermilpd $15, %%ymm6, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n"
          "vpermilpd $0, %%ymm7, %%ymm8\n"
          "vpermilpd $15, %%ymm7, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n"
          /* Stride-2 butterflies within each register: combine the low and
             high 128-bit halves (low half broadcast + [high | -high]). */
          "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm0, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm0\n"
          "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm1, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm1\n"
          "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm2, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm2\n"
          "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm3, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm3\n"
          "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm4, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm4\n"
          "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm5, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm5\n"
          "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm6, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm6\n"
          "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm7, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm7\n"
          /* Strides 4, 8 and 16: three butterfly levels across the eight
             registers (radix-8 combine held entirely in registers). */
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
          "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
          "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
          "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
          "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
          "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
          "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
          "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
          "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
          "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
          "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
          "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
          "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
          "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
          "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
          "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
          "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
          "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
          /* Store the tile back (results ended up in ymm8..ymm15). */
          "vmovupd %%ymm8, (%0)\n"
          "vmovupd %%ymm9, (%1)\n"
          "vmovupd %%ymm10, (%2)\n"
          "vmovupd %%ymm11, (%3)\n"
          "vmovupd %%ymm12, (%4)\n"
          "vmovupd %%ymm13, (%5)\n"
          "vmovupd %%ymm14, (%6)\n"
          "vmovupd %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    /* Stage group 2: butterflies at strides 32, 64 and 128 (pointers are
       32 doubles apart; three levels per asm block). */
    for (int j = 0; j < 4096; j += 256) {
      for (int k = 0; k < 32; k += 4) {
        __asm__ volatile (
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vmovupd (%2), %%ymm2\n"
          "vmovupd (%3), %%ymm3\n"
          "vmovupd (%4), %%ymm4\n"
          "vmovupd (%5), %%ymm5\n"
          "vmovupd (%6), %%ymm6\n"
          "vmovupd (%7), %%ymm7\n"
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
          "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
          "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
          "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
          "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
          "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
          "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
          "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
          "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
          "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
          "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
          "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
          "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
          "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
          "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
          "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
          "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
          "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
          "vmovupd %%ymm8, (%0)\n"
          "vmovupd %%ymm9, (%1)\n"
          "vmovupd %%ymm10, (%2)\n"
          "vmovupd %%ymm11, (%3)\n"
          "vmovupd %%ymm12, (%4)\n"
          "vmovupd %%ymm13, (%5)\n"
          "vmovupd %%ymm14, (%6)\n"
          "vmovupd %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    /* Stage group 3: butterflies at strides 256, 512 and 1024 within each
       2048-double half of the buffer. */
    for (int j = 0; j < 4096; j += 2048) {
      for (int k = 0; k < 256; k += 4) {
        __asm__ volatile (
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vmovupd (%2), %%ymm2\n"
          "vmovupd (%3), %%ymm3\n"
          "vmovupd (%4), %%ymm4\n"
          "vmovupd (%5), %%ymm5\n"
          "vmovupd (%6), %%ymm6\n"
          "vmovupd (%7), %%ymm7\n"
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
          "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
          "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
          "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
          "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
          "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
          "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
          "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
          "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
          "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
          "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
          "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
          "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
          "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
          "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
          "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
          "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
          "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
          "vmovupd %%ymm8, (%0)\n"
          "vmovupd %%ymm9, (%1)\n"
          "vmovupd %%ymm10, (%2)\n"
          "vmovupd %%ymm11, (%3)\n"
          "vmovupd %%ymm12, (%4)\n"
          "vmovupd %%ymm13, (%5)\n"
          "vmovupd %%ymm14, (%6)\n"
          "vmovupd %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    /* Stage 4: final stride-2048 butterfly across the two halves,
       completing the depth-12 transform. */
    for (int j = 0; j < 4096; j += 4096) {
      for (int k = 0; k < 2048; k += 4) {
        __asm__ volatile (
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vmovupd %%ymm8, (%0)\n"
          "vmovupd %%ymm9, (%1)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 14) {
    /* Transform each 4096-double quarter independently, then merge them
       with a radix-4 butterfly (strides 4096 and 8192 in one pass). */
    helper_double_14_recursive(buf + 0, 12);
    helper_double_14_recursive(buf + 4096, 12);
    helper_double_14_recursive(buf + 8192, 12);
    helper_double_14_recursive(buf + 12288, 12);
    for (int j = 0; j < 16384; j += 16384) {
      for (int k = 0; k < 4096; k += 4) {
        __asm__ volatile (
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vmovupd (%2), %%ymm2\n"
          "vmovupd (%3), %%ymm3\n"
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
          "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
          "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
          "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
          "vmovupd %%ymm0, (%0)\n"
          "vmovupd %%ymm1, (%1)\n"
          "vmovupd %%ymm2, (%2)\n"
          "vmovupd %%ymm3, (%3)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
}
void helper_double_14(double *buf);
/* In-place unnormalized Walsh-Hadamard transform of 16384 (= 2^14) doubles. */
void helper_double_14(double *buf) {
  /* The recursive kernel does all the work; start it at the full depth. */
  enum { LOG2_N = 14 };
  helper_double_14_recursive(buf, LOG2_N);
}
12348 void helper_double_15_recursive(double *buf, int depth);
helper_double_15_recursive(double * buf,int depth)12349 void helper_double_15_recursive(double *buf, int depth) {
12350 if (depth == 12) {
12351 for (int j = 0; j < 4096; j += 32) {
12352 for (int k = 0; k < 4; k += 4) {
12353 __asm__ volatile (
12354 "vmovupd (%0), %%ymm0\n"
12355 "vmovupd (%1), %%ymm1\n"
12356 "vmovupd (%2), %%ymm2\n"
12357 "vmovupd (%3), %%ymm3\n"
12358 "vmovupd (%4), %%ymm4\n"
12359 "vmovupd (%5), %%ymm5\n"
12360 "vmovupd (%6), %%ymm6\n"
12361 "vmovupd (%7), %%ymm7\n"
12362 "vpermilpd $0, %%ymm0, %%ymm8\n"
12363 "vpermilpd $15, %%ymm0, %%ymm9\n"
12364 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
12365 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
12366 "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n"
12367 "vpermilpd $0, %%ymm1, %%ymm8\n"
12368 "vpermilpd $15, %%ymm1, %%ymm9\n"
12369 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
12370 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
12371 "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n"
12372 "vpermilpd $0, %%ymm2, %%ymm8\n"
12373 "vpermilpd $15, %%ymm2, %%ymm9\n"
12374 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
12375 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
12376 "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n"
12377 "vpermilpd $0, %%ymm3, %%ymm8\n"
12378 "vpermilpd $15, %%ymm3, %%ymm9\n"
12379 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
12380 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
12381 "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n"
12382 "vpermilpd $0, %%ymm4, %%ymm8\n"
12383 "vpermilpd $15, %%ymm4, %%ymm9\n"
12384 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
12385 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
12386 "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n"
12387 "vpermilpd $0, %%ymm5, %%ymm8\n"
12388 "vpermilpd $15, %%ymm5, %%ymm9\n"
12389 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
12390 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
12391 "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n"
12392 "vpermilpd $0, %%ymm6, %%ymm8\n"
12393 "vpermilpd $15, %%ymm6, %%ymm9\n"
12394 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
12395 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
12396 "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n"
12397 "vpermilpd $0, %%ymm7, %%ymm8\n"
12398 "vpermilpd $15, %%ymm7, %%ymm9\n"
12399 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
12400 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
12401 "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n"
12402 "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n"
12403 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
12404 "vsubpd %%ymm0, %%ymm9, %%ymm10\n"
12405 "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n"
12406 "vaddpd %%ymm11, %%ymm8, %%ymm0\n"
12407 "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n"
12408 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
12409 "vsubpd %%ymm1, %%ymm9, %%ymm10\n"
12410 "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n"
12411 "vaddpd %%ymm11, %%ymm8, %%ymm1\n"
12412 "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n"
12413 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
12414 "vsubpd %%ymm2, %%ymm9, %%ymm10\n"
12415 "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n"
12416 "vaddpd %%ymm11, %%ymm8, %%ymm2\n"
12417 "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n"
12418 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
12419 "vsubpd %%ymm3, %%ymm9, %%ymm10\n"
12420 "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n"
12421 "vaddpd %%ymm11, %%ymm8, %%ymm3\n"
12422 "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n"
12423 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
12424 "vsubpd %%ymm4, %%ymm9, %%ymm10\n"
12425 "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n"
12426 "vaddpd %%ymm11, %%ymm8, %%ymm4\n"
12427 "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n"
12428 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
12429 "vsubpd %%ymm5, %%ymm9, %%ymm10\n"
12430 "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n"
12431 "vaddpd %%ymm11, %%ymm8, %%ymm5\n"
12432 "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n"
12433 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
12434 "vsubpd %%ymm6, %%ymm9, %%ymm10\n"
12435 "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n"
12436 "vaddpd %%ymm11, %%ymm8, %%ymm6\n"
12437 "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n"
12438 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
12439 "vsubpd %%ymm7, %%ymm9, %%ymm10\n"
12440 "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n"
12441 "vaddpd %%ymm11, %%ymm8, %%ymm7\n"
12442 "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
12443 "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
12444 "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
12445 "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
12446 "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
12447 "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
12448 "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
12449 "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
12450 "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
12451 "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
12452 "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
12453 "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
12454 "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
12455 "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
12456 "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
12457 "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
12458 "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
12459 "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
12460 "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
12461 "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
12462 "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
12463 "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
12464 "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
12465 "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
12466 "vmovupd %%ymm8, (%0)\n"
12467 "vmovupd %%ymm9, (%1)\n"
12468 "vmovupd %%ymm10, (%2)\n"
12469 "vmovupd %%ymm11, (%3)\n"
12470 "vmovupd %%ymm12, (%4)\n"
12471 "vmovupd %%ymm13, (%5)\n"
12472 "vmovupd %%ymm14, (%6)\n"
12473 "vmovupd %%ymm15, (%7)\n"
12474 :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
12475 );
12476 }
12477 }
12478 for (int j = 0; j < 4096; j += 256) {
12479 for (int k = 0; k < 32; k += 4) {
12480 __asm__ volatile (
12481 "vmovupd (%0), %%ymm0\n"
12482 "vmovupd (%1), %%ymm1\n"
12483 "vmovupd (%2), %%ymm2\n"
12484 "vmovupd (%3), %%ymm3\n"
12485 "vmovupd (%4), %%ymm4\n"
12486 "vmovupd (%5), %%ymm5\n"
12487 "vmovupd (%6), %%ymm6\n"
12488 "vmovupd (%7), %%ymm7\n"
12489 "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
12490 "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
12491 "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
12492 "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
12493 "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
12494 "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
12495 "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
12496 "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
12497 "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
12498 "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
12499 "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
12500 "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
12501 "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
12502 "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
12503 "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
12504 "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
12505 "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
12506 "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
12507 "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
12508 "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
12509 "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
12510 "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
12511 "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
12512 "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
12513 "vmovupd %%ymm8, (%0)\n"
12514 "vmovupd %%ymm9, (%1)\n"
12515 "vmovupd %%ymm10, (%2)\n"
12516 "vmovupd %%ymm11, (%3)\n"
12517 "vmovupd %%ymm12, (%4)\n"
12518 "vmovupd %%ymm13, (%5)\n"
12519 "vmovupd %%ymm14, (%6)\n"
12520 "vmovupd %%ymm15, (%7)\n"
12521 :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
12522 );
12523 }
12524 }
12525 for (int j = 0; j < 4096; j += 2048) {
12526 for (int k = 0; k < 256; k += 4) {
12527 __asm__ volatile (
12528 "vmovupd (%0), %%ymm0\n"
12529 "vmovupd (%1), %%ymm1\n"
12530 "vmovupd (%2), %%ymm2\n"
12531 "vmovupd (%3), %%ymm3\n"
12532 "vmovupd (%4), %%ymm4\n"
12533 "vmovupd (%5), %%ymm5\n"
12534 "vmovupd (%6), %%ymm6\n"
12535 "vmovupd (%7), %%ymm7\n"
12536 "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
12537 "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
12538 "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
12539 "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
12540 "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
12541 "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
12542 "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
12543 "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
12544 "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
12545 "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
12546 "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
12547 "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
12548 "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
12549 "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
12550 "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
12551 "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
12552 "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
12553 "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
12554 "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
12555 "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
12556 "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
12557 "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
12558 "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
12559 "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
12560 "vmovupd %%ymm8, (%0)\n"
12561 "vmovupd %%ymm9, (%1)\n"
12562 "vmovupd %%ymm10, (%2)\n"
12563 "vmovupd %%ymm11, (%3)\n"
12564 "vmovupd %%ymm12, (%4)\n"
12565 "vmovupd %%ymm13, (%5)\n"
12566 "vmovupd %%ymm14, (%6)\n"
12567 "vmovupd %%ymm15, (%7)\n"
12568 :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
12569 );
12570 }
12571 }
12572 for (int j = 0; j < 4096; j += 4096) {
12573 for (int k = 0; k < 2048; k += 4) {
12574 __asm__ volatile (
12575 "vmovupd (%0), %%ymm0\n"
12576 "vmovupd (%1), %%ymm1\n"
12577 "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
12578 "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
12579 "vmovupd %%ymm8, (%0)\n"
12580 "vmovupd %%ymm9, (%1)\n"
12581 :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
12582 );
12583 }
12584 }
12585 return;
12586 }
12587 if (depth == 15) {
12588 helper_double_15_recursive(buf + 0, 12);
12589 helper_double_15_recursive(buf + 4096, 12);
12590 helper_double_15_recursive(buf + 8192, 12);
12591 helper_double_15_recursive(buf + 12288, 12);
12592 helper_double_15_recursive(buf + 16384, 12);
12593 helper_double_15_recursive(buf + 20480, 12);
12594 helper_double_15_recursive(buf + 24576, 12);
12595 helper_double_15_recursive(buf + 28672, 12);
12596 for (int j = 0; j < 32768; j += 32768) {
12597 for (int k = 0; k < 4096; k += 4) {
12598 __asm__ volatile (
12599 "vmovupd (%0), %%ymm0\n"
12600 "vmovupd (%1), %%ymm1\n"
12601 "vmovupd (%2), %%ymm2\n"
12602 "vmovupd (%3), %%ymm3\n"
12603 "vmovupd (%4), %%ymm4\n"
12604 "vmovupd (%5), %%ymm5\n"
12605 "vmovupd (%6), %%ymm6\n"
12606 "vmovupd (%7), %%ymm7\n"
12607 "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
12608 "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
12609 "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
12610 "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
12611 "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
12612 "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
12613 "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
12614 "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
12615 "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
12616 "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
12617 "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
12618 "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
12619 "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
12620 "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
12621 "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
12622 "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
12623 "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
12624 "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
12625 "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
12626 "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
12627 "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
12628 "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
12629 "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
12630 "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
12631 "vmovupd %%ymm8, (%0)\n"
12632 "vmovupd %%ymm9, (%1)\n"
12633 "vmovupd %%ymm10, (%2)\n"
12634 "vmovupd %%ymm11, (%3)\n"
12635 "vmovupd %%ymm12, (%4)\n"
12636 "vmovupd %%ymm13, (%5)\n"
12637 "vmovupd %%ymm14, (%6)\n"
12638 "vmovupd %%ymm15, (%7)\n"
12639 :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
12640 );
12641 }
12642 }
12643 return;
12644 }
12645 }
void helper_double_15(double *buf);
/*
 * Unnormalized fast Walsh-Hadamard transform of 2^15 = 32768 doubles,
 * performed in place on buf. Thin entry point that dispatches into the
 * depth-driven recursive kernel.
 */
void helper_double_15(double *buf) {
  helper_double_15_recursive(buf, 15);
}
void helper_double_16_recursive(double *buf, int depth);
/*
 * Recursive kernel of the unnormalized fast Walsh-Hadamard transform for
 * 2^16 doubles. `depth` selects which sub-problem size (2^depth elements,
 * starting at buf) this invocation handles; only depths 11, 14 and 16 are
 * generated. Each butterfly computes (u+v, u-v) pairs, exactly like the
 * scalar helper_float_* codelets at the top of this file, but 4 doubles at
 * a time in 256-bit ymm registers via inline AVX assembly.
 *
 * NOTE(review): all asm blocks list every ymm register plus "memory" as
 * clobbers and take the buffer pointers as "r" inputs, so the compiler may
 * not cache buf contents across them.
 */
void helper_double_16_recursive(double *buf, int depth) {
  if (depth == 11) {
    /* Base case: transform 2048 doubles (11 butterfly levels) directly. */
    /* Stage A: 32-element tiles. Handles strides 1..16 (levels 1-5):
     * - the vpermilpd/vaddsubpd group is the stride-1 butterfly inside
     *   each ymm register (pairs of adjacent doubles),
     * - the vperm2f128 group is the stride-2 butterfly across the two
     *   128-bit halves of each register,
     * - the add/sub network over ymm0-ymm7 covers strides 4, 8 and 16
     *   across the eight registers. */
    for (int j = 0; j < 2048; j += 32) {
      for (int k = 0; k < 4; k += 4) {
        __asm__ volatile (
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vmovupd (%2), %%ymm2\n"
          "vmovupd (%3), %%ymm3\n"
          "vmovupd (%4), %%ymm4\n"
          "vmovupd (%5), %%ymm5\n"
          "vmovupd (%6), %%ymm6\n"
          "vmovupd (%7), %%ymm7\n"
          /* stride-1 butterfly within each register: duplicate even/odd
           * lanes, negate the odd copy, then vaddsubpd produces
           * (even+odd, even-odd) per pair. */
          "vpermilpd $0, %%ymm0, %%ymm8\n"
          "vpermilpd $15, %%ymm0, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n"
          "vpermilpd $0, %%ymm1, %%ymm8\n"
          "vpermilpd $15, %%ymm1, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n"
          "vpermilpd $0, %%ymm2, %%ymm8\n"
          "vpermilpd $15, %%ymm2, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n"
          "vpermilpd $0, %%ymm3, %%ymm8\n"
          "vpermilpd $15, %%ymm3, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n"
          "vpermilpd $0, %%ymm4, %%ymm8\n"
          "vpermilpd $15, %%ymm4, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n"
          "vpermilpd $0, %%ymm5, %%ymm8\n"
          "vpermilpd $15, %%ymm5, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n"
          "vpermilpd $0, %%ymm6, %%ymm8\n"
          "vpermilpd $15, %%ymm6, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n"
          "vpermilpd $0, %%ymm7, %%ymm8\n"
          "vpermilpd $15, %%ymm7, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n"
          /* stride-2 butterfly across the 128-bit halves of each register:
           * (lo, lo) + (hi, -hi) = (lo+hi, lo-hi). */
          "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm0, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm0\n"
          "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm1, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm1\n"
          "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm2, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm2\n"
          "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm3, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm3\n"
          "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm4, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm4\n"
          "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm5, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm5\n"
          "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm6, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm6\n"
          "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm7, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm7\n"
          /* 8-register add/sub network: three butterfly levels across
           * registers (register strides 1, 2, 4 = element strides 4, 8, 16). */
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
          "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
          "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
          "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
          "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
          "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
          "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
          "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
          "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
          "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
          "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
          "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
          "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
          "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
          "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
          "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
          "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
          "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
          "vmovupd %%ymm8, (%0)\n"
          "vmovupd %%ymm9, (%1)\n"
          "vmovupd %%ymm10, (%2)\n"
          "vmovupd %%ymm11, (%3)\n"
          "vmovupd %%ymm12, (%4)\n"
          "vmovupd %%ymm13, (%5)\n"
          "vmovupd %%ymm14, (%6)\n"
          "vmovupd %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    /* Stage B: element strides 32, 64, 128 (levels 6-8) within each
     * 256-element chunk, using the same 8-register butterfly network. */
    for (int j = 0; j < 2048; j += 256) {
      for (int k = 0; k < 32; k += 4) {
        __asm__ volatile (
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vmovupd (%2), %%ymm2\n"
          "vmovupd (%3), %%ymm3\n"
          "vmovupd (%4), %%ymm4\n"
          "vmovupd (%5), %%ymm5\n"
          "vmovupd (%6), %%ymm6\n"
          "vmovupd (%7), %%ymm7\n"
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
          "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
          "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
          "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
          "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
          "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
          "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
          "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
          "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
          "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
          "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
          "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
          "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
          "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
          "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
          "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
          "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
          "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
          "vmovupd %%ymm8, (%0)\n"
          "vmovupd %%ymm9, (%1)\n"
          "vmovupd %%ymm10, (%2)\n"
          "vmovupd %%ymm11, (%3)\n"
          "vmovupd %%ymm12, (%4)\n"
          "vmovupd %%ymm13, (%5)\n"
          "vmovupd %%ymm14, (%6)\n"
          "vmovupd %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    /* Stage C: element strides 256, 512, 1024 (levels 9-11) across the
     * whole 2048-element block (the outer loop runs exactly once). */
    for (int j = 0; j < 2048; j += 2048) {
      for (int k = 0; k < 256; k += 4) {
        __asm__ volatile (
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vmovupd (%2), %%ymm2\n"
          "vmovupd (%3), %%ymm3\n"
          "vmovupd (%4), %%ymm4\n"
          "vmovupd (%5), %%ymm5\n"
          "vmovupd (%6), %%ymm6\n"
          "vmovupd (%7), %%ymm7\n"
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
          "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
          "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
          "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
          "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
          "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
          "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
          "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
          "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
          "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
          "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
          "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
          "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
          "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
          "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
          "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
          "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
          "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
          "vmovupd %%ymm8, (%0)\n"
          "vmovupd %%ymm9, (%1)\n"
          "vmovupd %%ymm10, (%2)\n"
          "vmovupd %%ymm11, (%3)\n"
          "vmovupd %%ymm12, (%4)\n"
          "vmovupd %%ymm13, (%5)\n"
          "vmovupd %%ymm14, (%6)\n"
          "vmovupd %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 14) {
    /* Transform eight 2048-element sub-blocks, then combine them with an
     * 8-way butterfly (element strides 2048, 4096, 8192 = levels 12-14). */
    helper_double_16_recursive(buf + 0, 11);
    helper_double_16_recursive(buf + 2048, 11);
    helper_double_16_recursive(buf + 4096, 11);
    helper_double_16_recursive(buf + 6144, 11);
    helper_double_16_recursive(buf + 8192, 11);
    helper_double_16_recursive(buf + 10240, 11);
    helper_double_16_recursive(buf + 12288, 11);
    helper_double_16_recursive(buf + 14336, 11);
    for (int j = 0; j < 16384; j += 16384) {
      for (int k = 0; k < 2048; k += 4) {
        __asm__ volatile (
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vmovupd (%2), %%ymm2\n"
          "vmovupd (%3), %%ymm3\n"
          "vmovupd (%4), %%ymm4\n"
          "vmovupd (%5), %%ymm5\n"
          "vmovupd (%6), %%ymm6\n"
          "vmovupd (%7), %%ymm7\n"
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
          "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
          "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
          "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
          "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
          "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
          "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
          "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
          "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
          "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
          "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
          "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
          "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
          "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
          "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
          "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
          "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
          "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
          "vmovupd %%ymm8, (%0)\n"
          "vmovupd %%ymm9, (%1)\n"
          "vmovupd %%ymm10, (%2)\n"
          "vmovupd %%ymm11, (%3)\n"
          "vmovupd %%ymm12, (%4)\n"
          "vmovupd %%ymm13, (%5)\n"
          "vmovupd %%ymm14, (%6)\n"
          "vmovupd %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 16) {
    /* Transform four 16384-element sub-blocks, then combine them with a
     * 4-way butterfly (element strides 16384, 32768 = levels 15-16). */
    helper_double_16_recursive(buf + 0, 14);
    helper_double_16_recursive(buf + 16384, 14);
    helper_double_16_recursive(buf + 32768, 14);
    helper_double_16_recursive(buf + 49152, 14);
    for (int j = 0; j < 65536; j += 65536) {
      for (int k = 0; k < 16384; k += 4) {
        __asm__ volatile (
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vmovupd (%2), %%ymm2\n"
          "vmovupd (%3), %%ymm3\n"
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
          "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
          "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
          "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
          "vmovupd %%ymm0, (%0)\n"
          "vmovupd %%ymm1, (%1)\n"
          "vmovupd %%ymm2, (%2)\n"
          "vmovupd %%ymm3, (%3)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
}
void helper_double_16(double *buf);
/*
 * Unnormalized fast Walsh-Hadamard transform of 2^16 = 65536 doubles,
 * performed in place on buf. Thin entry point that dispatches into the
 * depth-driven recursive kernel.
 */
void helper_double_16(double *buf) {
  helper_double_16_recursive(buf, 16);
}
12969 void helper_double_17_recursive(double *buf, int depth);
helper_double_17_recursive(double * buf,int depth)12970 void helper_double_17_recursive(double *buf, int depth) {
12971 if (depth == 11) {
12972 for (int j = 0; j < 2048; j += 32) {
12973 for (int k = 0; k < 4; k += 4) {
12974 __asm__ volatile (
12975 "vmovupd (%0), %%ymm0\n"
12976 "vmovupd (%1), %%ymm1\n"
12977 "vmovupd (%2), %%ymm2\n"
12978 "vmovupd (%3), %%ymm3\n"
12979 "vmovupd (%4), %%ymm4\n"
12980 "vmovupd (%5), %%ymm5\n"
12981 "vmovupd (%6), %%ymm6\n"
12982 "vmovupd (%7), %%ymm7\n"
12983 "vpermilpd $0, %%ymm0, %%ymm8\n"
12984 "vpermilpd $15, %%ymm0, %%ymm9\n"
12985 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
12986 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
12987 "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n"
12988 "vpermilpd $0, %%ymm1, %%ymm8\n"
12989 "vpermilpd $15, %%ymm1, %%ymm9\n"
12990 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
12991 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
12992 "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n"
12993 "vpermilpd $0, %%ymm2, %%ymm8\n"
12994 "vpermilpd $15, %%ymm2, %%ymm9\n"
12995 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
12996 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
12997 "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n"
12998 "vpermilpd $0, %%ymm3, %%ymm8\n"
12999 "vpermilpd $15, %%ymm3, %%ymm9\n"
13000 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
13001 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
13002 "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n"
13003 "vpermilpd $0, %%ymm4, %%ymm8\n"
13004 "vpermilpd $15, %%ymm4, %%ymm9\n"
13005 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
13006 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
13007 "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n"
13008 "vpermilpd $0, %%ymm5, %%ymm8\n"
13009 "vpermilpd $15, %%ymm5, %%ymm9\n"
13010 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
13011 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
13012 "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n"
13013 "vpermilpd $0, %%ymm6, %%ymm8\n"
13014 "vpermilpd $15, %%ymm6, %%ymm9\n"
13015 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
13016 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
13017 "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n"
13018 "vpermilpd $0, %%ymm7, %%ymm8\n"
13019 "vpermilpd $15, %%ymm7, %%ymm9\n"
13020 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
13021 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
13022 "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n"
13023 "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n"
13024 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
13025 "vsubpd %%ymm0, %%ymm9, %%ymm10\n"
13026 "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n"
13027 "vaddpd %%ymm11, %%ymm8, %%ymm0\n"
13028 "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n"
13029 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
13030 "vsubpd %%ymm1, %%ymm9, %%ymm10\n"
13031 "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n"
13032 "vaddpd %%ymm11, %%ymm8, %%ymm1\n"
13033 "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n"
13034 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
13035 "vsubpd %%ymm2, %%ymm9, %%ymm10\n"
13036 "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n"
13037 "vaddpd %%ymm11, %%ymm8, %%ymm2\n"
13038 "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n"
13039 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
13040 "vsubpd %%ymm3, %%ymm9, %%ymm10\n"
13041 "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n"
13042 "vaddpd %%ymm11, %%ymm8, %%ymm3\n"
13043 "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n"
13044 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
13045 "vsubpd %%ymm4, %%ymm9, %%ymm10\n"
13046 "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n"
13047 "vaddpd %%ymm11, %%ymm8, %%ymm4\n"
13048 "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n"
13049 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
13050 "vsubpd %%ymm5, %%ymm9, %%ymm10\n"
13051 "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n"
13052 "vaddpd %%ymm11, %%ymm8, %%ymm5\n"
13053 "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n"
13054 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
13055 "vsubpd %%ymm6, %%ymm9, %%ymm10\n"
13056 "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n"
13057 "vaddpd %%ymm11, %%ymm8, %%ymm6\n"
13058 "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n"
13059 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
13060 "vsubpd %%ymm7, %%ymm9, %%ymm10\n"
13061 "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n"
13062 "vaddpd %%ymm11, %%ymm8, %%ymm7\n"
13063 "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
13064 "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
13065 "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
13066 "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
13067 "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
13068 "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
13069 "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
13070 "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
13071 "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
13072 "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
13073 "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
13074 "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
13075 "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
13076 "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
13077 "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
13078 "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
13079 "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
13080 "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
13081 "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
13082 "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
13083 "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
13084 "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
13085 "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
13086 "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
13087 "vmovupd %%ymm8, (%0)\n"
13088 "vmovupd %%ymm9, (%1)\n"
13089 "vmovupd %%ymm10, (%2)\n"
13090 "vmovupd %%ymm11, (%3)\n"
13091 "vmovupd %%ymm12, (%4)\n"
13092 "vmovupd %%ymm13, (%5)\n"
13093 "vmovupd %%ymm14, (%6)\n"
13094 "vmovupd %%ymm15, (%7)\n"
13095 :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
13096 );
13097 }
13098 }
13099 for (int j = 0; j < 2048; j += 256) {
13100 for (int k = 0; k < 32; k += 4) {
13101 __asm__ volatile (
13102 "vmovupd (%0), %%ymm0\n"
13103 "vmovupd (%1), %%ymm1\n"
13104 "vmovupd (%2), %%ymm2\n"
13105 "vmovupd (%3), %%ymm3\n"
13106 "vmovupd (%4), %%ymm4\n"
13107 "vmovupd (%5), %%ymm5\n"
13108 "vmovupd (%6), %%ymm6\n"
13109 "vmovupd (%7), %%ymm7\n"
13110 "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
13111 "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
13112 "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
13113 "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
13114 "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
13115 "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
13116 "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
13117 "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
13118 "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
13119 "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
13120 "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
13121 "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
13122 "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
13123 "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
13124 "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
13125 "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
13126 "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
13127 "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
13128 "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
13129 "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
13130 "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
13131 "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
13132 "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
13133 "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
13134 "vmovupd %%ymm8, (%0)\n"
13135 "vmovupd %%ymm9, (%1)\n"
13136 "vmovupd %%ymm10, (%2)\n"
13137 "vmovupd %%ymm11, (%3)\n"
13138 "vmovupd %%ymm12, (%4)\n"
13139 "vmovupd %%ymm13, (%5)\n"
13140 "vmovupd %%ymm14, (%6)\n"
13141 "vmovupd %%ymm15, (%7)\n"
13142 :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
13143 );
13144 }
13145 }
13146 for (int j = 0; j < 2048; j += 2048) {
13147 for (int k = 0; k < 256; k += 4) {
13148 __asm__ volatile (
13149 "vmovupd (%0), %%ymm0\n"
13150 "vmovupd (%1), %%ymm1\n"
13151 "vmovupd (%2), %%ymm2\n"
13152 "vmovupd (%3), %%ymm3\n"
13153 "vmovupd (%4), %%ymm4\n"
13154 "vmovupd (%5), %%ymm5\n"
13155 "vmovupd (%6), %%ymm6\n"
13156 "vmovupd (%7), %%ymm7\n"
13157 "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
13158 "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
13159 "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
13160 "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
13161 "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
13162 "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
13163 "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
13164 "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
13165 "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
13166 "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
13167 "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
13168 "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
13169 "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
13170 "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
13171 "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
13172 "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
13173 "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
13174 "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
13175 "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
13176 "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
13177 "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
13178 "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
13179 "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
13180 "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
13181 "vmovupd %%ymm8, (%0)\n"
13182 "vmovupd %%ymm9, (%1)\n"
13183 "vmovupd %%ymm10, (%2)\n"
13184 "vmovupd %%ymm11, (%3)\n"
13185 "vmovupd %%ymm12, (%4)\n"
13186 "vmovupd %%ymm13, (%5)\n"
13187 "vmovupd %%ymm14, (%6)\n"
13188 "vmovupd %%ymm15, (%7)\n"
13189 :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
13190 );
13191 }
13192 }
13193 return;
13194 }
13195 if (depth == 14) {
13196 helper_double_17_recursive(buf + 0, 11);
13197 helper_double_17_recursive(buf + 2048, 11);
13198 helper_double_17_recursive(buf + 4096, 11);
13199 helper_double_17_recursive(buf + 6144, 11);
13200 helper_double_17_recursive(buf + 8192, 11);
13201 helper_double_17_recursive(buf + 10240, 11);
13202 helper_double_17_recursive(buf + 12288, 11);
13203 helper_double_17_recursive(buf + 14336, 11);
13204 for (int j = 0; j < 16384; j += 16384) {
13205 for (int k = 0; k < 2048; k += 4) {
13206 __asm__ volatile (
13207 "vmovupd (%0), %%ymm0\n"
13208 "vmovupd (%1), %%ymm1\n"
13209 "vmovupd (%2), %%ymm2\n"
13210 "vmovupd (%3), %%ymm3\n"
13211 "vmovupd (%4), %%ymm4\n"
13212 "vmovupd (%5), %%ymm5\n"
13213 "vmovupd (%6), %%ymm6\n"
13214 "vmovupd (%7), %%ymm7\n"
13215 "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
13216 "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
13217 "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
13218 "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
13219 "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
13220 "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
13221 "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
13222 "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
13223 "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
13224 "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
13225 "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
13226 "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
13227 "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
13228 "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
13229 "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
13230 "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
13231 "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
13232 "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
13233 "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
13234 "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
13235 "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
13236 "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
13237 "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
13238 "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
13239 "vmovupd %%ymm8, (%0)\n"
13240 "vmovupd %%ymm9, (%1)\n"
13241 "vmovupd %%ymm10, (%2)\n"
13242 "vmovupd %%ymm11, (%3)\n"
13243 "vmovupd %%ymm12, (%4)\n"
13244 "vmovupd %%ymm13, (%5)\n"
13245 "vmovupd %%ymm14, (%6)\n"
13246 "vmovupd %%ymm15, (%7)\n"
13247 :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
13248 );
13249 }
13250 }
13251 return;
13252 }
13253 if (depth == 17) {
13254 helper_double_17_recursive(buf + 0, 14);
13255 helper_double_17_recursive(buf + 16384, 14);
13256 helper_double_17_recursive(buf + 32768, 14);
13257 helper_double_17_recursive(buf + 49152, 14);
13258 helper_double_17_recursive(buf + 65536, 14);
13259 helper_double_17_recursive(buf + 81920, 14);
13260 helper_double_17_recursive(buf + 98304, 14);
13261 helper_double_17_recursive(buf + 114688, 14);
13262 for (int j = 0; j < 131072; j += 131072) {
13263 for (int k = 0; k < 16384; k += 4) {
13264 __asm__ volatile (
13265 "vmovupd (%0), %%ymm0\n"
13266 "vmovupd (%1), %%ymm1\n"
13267 "vmovupd (%2), %%ymm2\n"
13268 "vmovupd (%3), %%ymm3\n"
13269 "vmovupd (%4), %%ymm4\n"
13270 "vmovupd (%5), %%ymm5\n"
13271 "vmovupd (%6), %%ymm6\n"
13272 "vmovupd (%7), %%ymm7\n"
13273 "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
13274 "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
13275 "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
13276 "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
13277 "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
13278 "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
13279 "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
13280 "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
13281 "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
13282 "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
13283 "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
13284 "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
13285 "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
13286 "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
13287 "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
13288 "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
13289 "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
13290 "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
13291 "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
13292 "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
13293 "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
13294 "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
13295 "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
13296 "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
13297 "vmovupd %%ymm8, (%0)\n"
13298 "vmovupd %%ymm9, (%1)\n"
13299 "vmovupd %%ymm10, (%2)\n"
13300 "vmovupd %%ymm11, (%3)\n"
13301 "vmovupd %%ymm12, (%4)\n"
13302 "vmovupd %%ymm13, (%5)\n"
13303 "vmovupd %%ymm14, (%6)\n"
13304 "vmovupd %%ymm15, (%7)\n"
13305 :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152), "r"(buf + j + k + 65536), "r"(buf + j + k + 81920), "r"(buf + j + k + 98304), "r"(buf + j + k + 114688) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
13306 );
13307 }
13308 }
13309 return;
13310 }
13311 }
void helper_double_17(double *buf);
/* Public entry point: in-place transform of 2^17 doubles.
 * Delegates to the recursive worker at full depth. */
void helper_double_17(double *buf) {
  const int full_depth = 17; /* log2 of the transform length */
  helper_double_17_recursive(buf, full_depth);
}
void helper_double_18_recursive(double *buf, int depth);
/*
 * One level of a recursive, in-place Walsh-Hadamard-style transform over
 * doubles, specialised for a total length of 2^18 (see helper_double_18).
 * Only the depths reached by that recursion (12, 15, 18) are handled;
 * any other depth falls through and performs no work.
 *
 * buf   - pointer to 2^depth doubles, transformed in place (no scaling).
 * depth - log2 of the number of elements this call must transform.
 *
 * All assembly uses AVX with unaligned loads/stores (vmovupd), so buf
 * need not be 32-byte aligned.  Every asm block lists all ymm registers
 * plus "memory" as clobbers, so the compiler does not cache buf contents
 * across the statements.
 */
void helper_double_18_recursive(double *buf, int depth) {
  if (depth == 12) {
    /* Base case: transform each 4096-element range iteratively, in four
     * passes of increasing butterfly stride. */

    /* Pass 1: for every 32-double chunk, load it into ymm0..ymm7 and apply
     * five butterfly levels at once:
     *   - stride 1 inside each 128-bit lane: vpermilpd $0 duplicates the
     *     even element, $15 the odd one; the odd copy is negated and
     *     vaddsubpd produces (u+v, u-v) per pair;
     *   - stride 2 across lanes of one register via vperm2f128 (low half
     *     duplicated, high half negated, then summed);
     *   - strides 4, 8, 16 across the eight registers via the
     *     vaddpd/vsubpd ladder at the end.
     * The inner k-loop runs exactly once (k == 0); it is kept for symmetry
     * with the other passes of this generated code. */
    for (int j = 0; j < 4096; j += 32) {
      for (int k = 0; k < 4; k += 4) {
        __asm__ volatile (
        "vmovupd (%0), %%ymm0\n"
        "vmovupd (%1), %%ymm1\n"
        "vmovupd (%2), %%ymm2\n"
        "vmovupd (%3), %%ymm3\n"
        "vmovupd (%4), %%ymm4\n"
        "vmovupd (%5), %%ymm5\n"
        "vmovupd (%6), %%ymm6\n"
        "vmovupd (%7), %%ymm7\n"
        "vpermilpd $0, %%ymm0, %%ymm8\n"
        "vpermilpd $15, %%ymm0, %%ymm9\n"
        "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
        "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n"
        "vpermilpd $0, %%ymm1, %%ymm8\n"
        "vpermilpd $15, %%ymm1, %%ymm9\n"
        "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
        "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n"
        "vpermilpd $0, %%ymm2, %%ymm8\n"
        "vpermilpd $15, %%ymm2, %%ymm9\n"
        "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
        "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n"
        "vpermilpd $0, %%ymm3, %%ymm8\n"
        "vpermilpd $15, %%ymm3, %%ymm9\n"
        "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
        "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n"
        "vpermilpd $0, %%ymm4, %%ymm8\n"
        "vpermilpd $15, %%ymm4, %%ymm9\n"
        "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
        "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n"
        "vpermilpd $0, %%ymm5, %%ymm8\n"
        "vpermilpd $15, %%ymm5, %%ymm9\n"
        "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
        "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n"
        "vpermilpd $0, %%ymm6, %%ymm8\n"
        "vpermilpd $15, %%ymm6, %%ymm9\n"
        "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
        "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n"
        "vpermilpd $0, %%ymm7, %%ymm8\n"
        "vpermilpd $15, %%ymm7, %%ymm9\n"
        "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
        "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
        "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n"
        "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n"
        "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
        "vsubpd %%ymm0, %%ymm9, %%ymm10\n"
        "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n"
        "vaddpd %%ymm11, %%ymm8, %%ymm0\n"
        "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n"
        "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
        "vsubpd %%ymm1, %%ymm9, %%ymm10\n"
        "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n"
        "vaddpd %%ymm11, %%ymm8, %%ymm1\n"
        "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n"
        "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
        "vsubpd %%ymm2, %%ymm9, %%ymm10\n"
        "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n"
        "vaddpd %%ymm11, %%ymm8, %%ymm2\n"
        "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n"
        "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
        "vsubpd %%ymm3, %%ymm9, %%ymm10\n"
        "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n"
        "vaddpd %%ymm11, %%ymm8, %%ymm3\n"
        "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n"
        "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
        "vsubpd %%ymm4, %%ymm9, %%ymm10\n"
        "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n"
        "vaddpd %%ymm11, %%ymm8, %%ymm4\n"
        "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n"
        "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
        "vsubpd %%ymm5, %%ymm9, %%ymm10\n"
        "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n"
        "vaddpd %%ymm11, %%ymm8, %%ymm5\n"
        "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n"
        "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
        "vsubpd %%ymm6, %%ymm9, %%ymm10\n"
        "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n"
        "vaddpd %%ymm11, %%ymm8, %%ymm6\n"
        "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n"
        "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
        "vsubpd %%ymm7, %%ymm9, %%ymm10\n"
        "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n"
        "vaddpd %%ymm11, %%ymm8, %%ymm7\n"
        "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
        "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
        "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
        "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
        "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
        "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
        "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
        "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
        "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
        "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
        "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
        "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
        "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
        "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
        "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
        "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
        "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
        "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
        "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
        "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
        "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
        "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
        "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
        "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
        "vmovupd %%ymm8, (%0)\n"
        "vmovupd %%ymm9, (%1)\n"
        "vmovupd %%ymm10, (%2)\n"
        "vmovupd %%ymm11, (%3)\n"
        "vmovupd %%ymm12, (%4)\n"
        "vmovupd %%ymm13, (%5)\n"
        "vmovupd %%ymm14, (%6)\n"
        "vmovupd %%ymm15, (%7)\n"
        :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }

    /* Pass 2: butterfly strides 32, 64, 128 across registers loaded from
     * eight vectors spaced 32 doubles apart (three levels per asm block). */
    for (int j = 0; j < 4096; j += 256) {
      for (int k = 0; k < 32; k += 4) {
        __asm__ volatile (
        "vmovupd (%0), %%ymm0\n"
        "vmovupd (%1), %%ymm1\n"
        "vmovupd (%2), %%ymm2\n"
        "vmovupd (%3), %%ymm3\n"
        "vmovupd (%4), %%ymm4\n"
        "vmovupd (%5), %%ymm5\n"
        "vmovupd (%6), %%ymm6\n"
        "vmovupd (%7), %%ymm7\n"
        "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
        "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
        "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
        "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
        "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
        "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
        "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
        "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
        "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
        "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
        "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
        "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
        "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
        "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
        "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
        "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
        "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
        "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
        "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
        "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
        "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
        "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
        "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
        "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
        "vmovupd %%ymm8, (%0)\n"
        "vmovupd %%ymm9, (%1)\n"
        "vmovupd %%ymm10, (%2)\n"
        "vmovupd %%ymm11, (%3)\n"
        "vmovupd %%ymm12, (%4)\n"
        "vmovupd %%ymm13, (%5)\n"
        "vmovupd %%ymm14, (%6)\n"
        "vmovupd %%ymm15, (%7)\n"
        :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }

    /* Pass 3: same three-level ladder at strides 256, 512, 1024
     * (j-loop runs once; vectors spaced 256 doubles apart). */
    for (int j = 0; j < 4096; j += 2048) {
      for (int k = 0; k < 256; k += 4) {
        __asm__ volatile (
        "vmovupd (%0), %%ymm0\n"
        "vmovupd (%1), %%ymm1\n"
        "vmovupd (%2), %%ymm2\n"
        "vmovupd (%3), %%ymm3\n"
        "vmovupd (%4), %%ymm4\n"
        "vmovupd (%5), %%ymm5\n"
        "vmovupd (%6), %%ymm6\n"
        "vmovupd (%7), %%ymm7\n"
        "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
        "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
        "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
        "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
        "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
        "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
        "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
        "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
        "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
        "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
        "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
        "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
        "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
        "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
        "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
        "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
        "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
        "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
        "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
        "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
        "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
        "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
        "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
        "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
        "vmovupd %%ymm8, (%0)\n"
        "vmovupd %%ymm9, (%1)\n"
        "vmovupd %%ymm10, (%2)\n"
        "vmovupd %%ymm11, (%3)\n"
        "vmovupd %%ymm12, (%4)\n"
        "vmovupd %%ymm13, (%5)\n"
        "vmovupd %%ymm14, (%6)\n"
        "vmovupd %%ymm15, (%7)\n"
        :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }

    /* Pass 4: final single-level butterfly at stride 2048, combining the
     * two 2048-element halves into the full 4096-element result. */
    for (int j = 0; j < 4096; j += 4096) {
      for (int k = 0; k < 2048; k += 4) {
        __asm__ volatile (
        "vmovupd (%0), %%ymm0\n"
        "vmovupd (%1), %%ymm1\n"
        "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
        "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
        "vmovupd %%ymm8, (%0)\n"
        "vmovupd %%ymm9, (%1)\n"
        :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 15) {
    /* Transform the eight 2^12-element sub-blocks, then combine them with
     * three butterfly levels (strides 4096, 8192, 16384). */
    helper_double_18_recursive(buf + 0, 12);
    helper_double_18_recursive(buf + 4096, 12);
    helper_double_18_recursive(buf + 8192, 12);
    helper_double_18_recursive(buf + 12288, 12);
    helper_double_18_recursive(buf + 16384, 12);
    helper_double_18_recursive(buf + 20480, 12);
    helper_double_18_recursive(buf + 24576, 12);
    helper_double_18_recursive(buf + 28672, 12);
    for (int j = 0; j < 32768; j += 32768) {
      for (int k = 0; k < 4096; k += 4) {
        __asm__ volatile (
        "vmovupd (%0), %%ymm0\n"
        "vmovupd (%1), %%ymm1\n"
        "vmovupd (%2), %%ymm2\n"
        "vmovupd (%3), %%ymm3\n"
        "vmovupd (%4), %%ymm4\n"
        "vmovupd (%5), %%ymm5\n"
        "vmovupd (%6), %%ymm6\n"
        "vmovupd (%7), %%ymm7\n"
        "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
        "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
        "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
        "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
        "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
        "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
        "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
        "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
        "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
        "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
        "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
        "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
        "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
        "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
        "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
        "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
        "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
        "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
        "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
        "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
        "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
        "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
        "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
        "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
        "vmovupd %%ymm8, (%0)\n"
        "vmovupd %%ymm9, (%1)\n"
        "vmovupd %%ymm10, (%2)\n"
        "vmovupd %%ymm11, (%3)\n"
        "vmovupd %%ymm12, (%4)\n"
        "vmovupd %%ymm13, (%5)\n"
        "vmovupd %%ymm14, (%6)\n"
        "vmovupd %%ymm15, (%7)\n"
        :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 18) {
    /* Top level: transform the eight 2^15-element sub-blocks, then combine
     * them with three butterfly levels (strides 32768, 65536, 131072). */
    helper_double_18_recursive(buf + 0, 15);
    helper_double_18_recursive(buf + 32768, 15);
    helper_double_18_recursive(buf + 65536, 15);
    helper_double_18_recursive(buf + 98304, 15);
    helper_double_18_recursive(buf + 131072, 15);
    helper_double_18_recursive(buf + 163840, 15);
    helper_double_18_recursive(buf + 196608, 15);
    helper_double_18_recursive(buf + 229376, 15);
    for (int j = 0; j < 262144; j += 262144) {
      for (int k = 0; k < 32768; k += 4) {
        __asm__ volatile (
        "vmovupd (%0), %%ymm0\n"
        "vmovupd (%1), %%ymm1\n"
        "vmovupd (%2), %%ymm2\n"
        "vmovupd (%3), %%ymm3\n"
        "vmovupd (%4), %%ymm4\n"
        "vmovupd (%5), %%ymm5\n"
        "vmovupd (%6), %%ymm6\n"
        "vmovupd (%7), %%ymm7\n"
        "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
        "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
        "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
        "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
        "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
        "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
        "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
        "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
        "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
        "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
        "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
        "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
        "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
        "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
        "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
        "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
        "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
        "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
        "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
        "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
        "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
        "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
        "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
        "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
        "vmovupd %%ymm8, (%0)\n"
        "vmovupd %%ymm9, (%1)\n"
        "vmovupd %%ymm10, (%2)\n"
        "vmovupd %%ymm11, (%3)\n"
        "vmovupd %%ymm12, (%4)\n"
        "vmovupd %%ymm13, (%5)\n"
        "vmovupd %%ymm14, (%6)\n"
        "vmovupd %%ymm15, (%7)\n"
        :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
}
void helper_double_18(double *buf);
/* Public entry point: in-place transform of 2^18 doubles.
 * Delegates to the recursive worker at full depth. */
void helper_double_18(double *buf) {
  const int full_depth = 18; /* log2 of the transform length */
  helper_double_18_recursive(buf, full_depth);
}
13676 void helper_double_19_recursive(double *buf, int depth);
helper_double_19_recursive(double * buf,int depth)13677 void helper_double_19_recursive(double *buf, int depth) {
13678 if (depth == 11) {
13679 for (int j = 0; j < 2048; j += 32) {
13680 for (int k = 0; k < 4; k += 4) {
13681 __asm__ volatile (
13682 "vmovupd (%0), %%ymm0\n"
13683 "vmovupd (%1), %%ymm1\n"
13684 "vmovupd (%2), %%ymm2\n"
13685 "vmovupd (%3), %%ymm3\n"
13686 "vmovupd (%4), %%ymm4\n"
13687 "vmovupd (%5), %%ymm5\n"
13688 "vmovupd (%6), %%ymm6\n"
13689 "vmovupd (%7), %%ymm7\n"
13690 "vpermilpd $0, %%ymm0, %%ymm8\n"
13691 "vpermilpd $15, %%ymm0, %%ymm9\n"
13692 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
13693 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
13694 "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n"
13695 "vpermilpd $0, %%ymm1, %%ymm8\n"
13696 "vpermilpd $15, %%ymm1, %%ymm9\n"
13697 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
13698 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
13699 "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n"
13700 "vpermilpd $0, %%ymm2, %%ymm8\n"
13701 "vpermilpd $15, %%ymm2, %%ymm9\n"
13702 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
13703 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
13704 "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n"
13705 "vpermilpd $0, %%ymm3, %%ymm8\n"
13706 "vpermilpd $15, %%ymm3, %%ymm9\n"
13707 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
13708 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
13709 "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n"
13710 "vpermilpd $0, %%ymm4, %%ymm8\n"
13711 "vpermilpd $15, %%ymm4, %%ymm9\n"
13712 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
13713 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
13714 "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n"
13715 "vpermilpd $0, %%ymm5, %%ymm8\n"
13716 "vpermilpd $15, %%ymm5, %%ymm9\n"
13717 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
13718 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
13719 "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n"
13720 "vpermilpd $0, %%ymm6, %%ymm8\n"
13721 "vpermilpd $15, %%ymm6, %%ymm9\n"
13722 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
13723 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
13724 "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n"
13725 "vpermilpd $0, %%ymm7, %%ymm8\n"
13726 "vpermilpd $15, %%ymm7, %%ymm9\n"
13727 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
13728 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
13729 "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n"
13730 "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n"
13731 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
13732 "vsubpd %%ymm0, %%ymm9, %%ymm10\n"
13733 "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n"
13734 "vaddpd %%ymm11, %%ymm8, %%ymm0\n"
13735 "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n"
13736 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
13737 "vsubpd %%ymm1, %%ymm9, %%ymm10\n"
13738 "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n"
13739 "vaddpd %%ymm11, %%ymm8, %%ymm1\n"
13740 "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n"
13741 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
13742 "vsubpd %%ymm2, %%ymm9, %%ymm10\n"
13743 "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n"
13744 "vaddpd %%ymm11, %%ymm8, %%ymm2\n"
13745 "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n"
13746 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
13747 "vsubpd %%ymm3, %%ymm9, %%ymm10\n"
13748 "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n"
13749 "vaddpd %%ymm11, %%ymm8, %%ymm3\n"
13750 "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n"
13751 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
13752 "vsubpd %%ymm4, %%ymm9, %%ymm10\n"
13753 "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n"
13754 "vaddpd %%ymm11, %%ymm8, %%ymm4\n"
13755 "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n"
13756 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
13757 "vsubpd %%ymm5, %%ymm9, %%ymm10\n"
13758 "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n"
13759 "vaddpd %%ymm11, %%ymm8, %%ymm5\n"
13760 "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n"
13761 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
13762 "vsubpd %%ymm6, %%ymm9, %%ymm10\n"
13763 "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n"
13764 "vaddpd %%ymm11, %%ymm8, %%ymm6\n"
13765 "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n"
13766 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
13767 "vsubpd %%ymm7, %%ymm9, %%ymm10\n"
13768 "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n"
13769 "vaddpd %%ymm11, %%ymm8, %%ymm7\n"
13770 "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
13771 "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
13772 "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
13773 "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
13774 "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
13775 "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
13776 "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
13777 "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
13778 "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
13779 "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
13780 "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
13781 "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
13782 "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
13783 "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
13784 "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
13785 "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
13786 "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
13787 "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
13788 "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
13789 "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
13790 "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
13791 "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
13792 "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
13793 "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
13794 "vmovupd %%ymm8, (%0)\n"
13795 "vmovupd %%ymm9, (%1)\n"
13796 "vmovupd %%ymm10, (%2)\n"
13797 "vmovupd %%ymm11, (%3)\n"
13798 "vmovupd %%ymm12, (%4)\n"
13799 "vmovupd %%ymm13, (%5)\n"
13800 "vmovupd %%ymm14, (%6)\n"
13801 "vmovupd %%ymm15, (%7)\n"
13802 :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
13803 );
13804 }
13805 }
13806 for (int j = 0; j < 2048; j += 256) {
13807 for (int k = 0; k < 32; k += 4) {
13808 __asm__ volatile (
13809 "vmovupd (%0), %%ymm0\n"
13810 "vmovupd (%1), %%ymm1\n"
13811 "vmovupd (%2), %%ymm2\n"
13812 "vmovupd (%3), %%ymm3\n"
13813 "vmovupd (%4), %%ymm4\n"
13814 "vmovupd (%5), %%ymm5\n"
13815 "vmovupd (%6), %%ymm6\n"
13816 "vmovupd (%7), %%ymm7\n"
13817 "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
13818 "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
13819 "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
13820 "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
13821 "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
13822 "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
13823 "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
13824 "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
13825 "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
13826 "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
13827 "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
13828 "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
13829 "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
13830 "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
13831 "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
13832 "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
13833 "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
13834 "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
13835 "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
13836 "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
13837 "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
13838 "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
13839 "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
13840 "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
13841 "vmovupd %%ymm8, (%0)\n"
13842 "vmovupd %%ymm9, (%1)\n"
13843 "vmovupd %%ymm10, (%2)\n"
13844 "vmovupd %%ymm11, (%3)\n"
13845 "vmovupd %%ymm12, (%4)\n"
13846 "vmovupd %%ymm13, (%5)\n"
13847 "vmovupd %%ymm14, (%6)\n"
13848 "vmovupd %%ymm15, (%7)\n"
13849 :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
13850 );
13851 }
13852 }
13853 for (int j = 0; j < 2048; j += 2048) {
13854 for (int k = 0; k < 256; k += 4) {
13855 __asm__ volatile (
13856 "vmovupd (%0), %%ymm0\n"
13857 "vmovupd (%1), %%ymm1\n"
13858 "vmovupd (%2), %%ymm2\n"
13859 "vmovupd (%3), %%ymm3\n"
13860 "vmovupd (%4), %%ymm4\n"
13861 "vmovupd (%5), %%ymm5\n"
13862 "vmovupd (%6), %%ymm6\n"
13863 "vmovupd (%7), %%ymm7\n"
13864 "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
13865 "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
13866 "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
13867 "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
13868 "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
13869 "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
13870 "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
13871 "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
13872 "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
13873 "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
13874 "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
13875 "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
13876 "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
13877 "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
13878 "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
13879 "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
13880 "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
13881 "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
13882 "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
13883 "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
13884 "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
13885 "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
13886 "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
13887 "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
13888 "vmovupd %%ymm8, (%0)\n"
13889 "vmovupd %%ymm9, (%1)\n"
13890 "vmovupd %%ymm10, (%2)\n"
13891 "vmovupd %%ymm11, (%3)\n"
13892 "vmovupd %%ymm12, (%4)\n"
13893 "vmovupd %%ymm13, (%5)\n"
13894 "vmovupd %%ymm14, (%6)\n"
13895 "vmovupd %%ymm15, (%7)\n"
13896 :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
13897 );
13898 }
13899 }
13900 return;
13901 }
13902 if (depth == 14) {
13903 helper_double_19_recursive(buf + 0, 11);
13904 helper_double_19_recursive(buf + 2048, 11);
13905 helper_double_19_recursive(buf + 4096, 11);
13906 helper_double_19_recursive(buf + 6144, 11);
13907 helper_double_19_recursive(buf + 8192, 11);
13908 helper_double_19_recursive(buf + 10240, 11);
13909 helper_double_19_recursive(buf + 12288, 11);
13910 helper_double_19_recursive(buf + 14336, 11);
13911 for (int j = 0; j < 16384; j += 16384) {
13912 for (int k = 0; k < 2048; k += 4) {
13913 __asm__ volatile (
13914 "vmovupd (%0), %%ymm0\n"
13915 "vmovupd (%1), %%ymm1\n"
13916 "vmovupd (%2), %%ymm2\n"
13917 "vmovupd (%3), %%ymm3\n"
13918 "vmovupd (%4), %%ymm4\n"
13919 "vmovupd (%5), %%ymm5\n"
13920 "vmovupd (%6), %%ymm6\n"
13921 "vmovupd (%7), %%ymm7\n"
13922 "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
13923 "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
13924 "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
13925 "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
13926 "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
13927 "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
13928 "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
13929 "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
13930 "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
13931 "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
13932 "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
13933 "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
13934 "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
13935 "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
13936 "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
13937 "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
13938 "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
13939 "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
13940 "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
13941 "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
13942 "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
13943 "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
13944 "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
13945 "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
13946 "vmovupd %%ymm8, (%0)\n"
13947 "vmovupd %%ymm9, (%1)\n"
13948 "vmovupd %%ymm10, (%2)\n"
13949 "vmovupd %%ymm11, (%3)\n"
13950 "vmovupd %%ymm12, (%4)\n"
13951 "vmovupd %%ymm13, (%5)\n"
13952 "vmovupd %%ymm14, (%6)\n"
13953 "vmovupd %%ymm15, (%7)\n"
13954 :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
13955 );
13956 }
13957 }
13958 return;
13959 }
13960 if (depth == 17) {
13961 helper_double_19_recursive(buf + 0, 14);
13962 helper_double_19_recursive(buf + 16384, 14);
13963 helper_double_19_recursive(buf + 32768, 14);
13964 helper_double_19_recursive(buf + 49152, 14);
13965 helper_double_19_recursive(buf + 65536, 14);
13966 helper_double_19_recursive(buf + 81920, 14);
13967 helper_double_19_recursive(buf + 98304, 14);
13968 helper_double_19_recursive(buf + 114688, 14);
13969 for (int j = 0; j < 131072; j += 131072) {
13970 for (int k = 0; k < 16384; k += 4) {
13971 __asm__ volatile (
13972 "vmovupd (%0), %%ymm0\n"
13973 "vmovupd (%1), %%ymm1\n"
13974 "vmovupd (%2), %%ymm2\n"
13975 "vmovupd (%3), %%ymm3\n"
13976 "vmovupd (%4), %%ymm4\n"
13977 "vmovupd (%5), %%ymm5\n"
13978 "vmovupd (%6), %%ymm6\n"
13979 "vmovupd (%7), %%ymm7\n"
13980 "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
13981 "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
13982 "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
13983 "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
13984 "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
13985 "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
13986 "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
13987 "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
13988 "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
13989 "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
13990 "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
13991 "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
13992 "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
13993 "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
13994 "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
13995 "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
13996 "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
13997 "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
13998 "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
13999 "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
14000 "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
14001 "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
14002 "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
14003 "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
14004 "vmovupd %%ymm8, (%0)\n"
14005 "vmovupd %%ymm9, (%1)\n"
14006 "vmovupd %%ymm10, (%2)\n"
14007 "vmovupd %%ymm11, (%3)\n"
14008 "vmovupd %%ymm12, (%4)\n"
14009 "vmovupd %%ymm13, (%5)\n"
14010 "vmovupd %%ymm14, (%6)\n"
14011 "vmovupd %%ymm15, (%7)\n"
14012 :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152), "r"(buf + j + k + 65536), "r"(buf + j + k + 81920), "r"(buf + j + k + 98304), "r"(buf + j + k + 114688) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
14013 );
14014 }
14015 }
14016 return;
14017 }
14018 if (depth == 19) {
14019 helper_double_19_recursive(buf + 0, 17);
14020 helper_double_19_recursive(buf + 131072, 17);
14021 helper_double_19_recursive(buf + 262144, 17);
14022 helper_double_19_recursive(buf + 393216, 17);
14023 for (int j = 0; j < 524288; j += 524288) {
14024 for (int k = 0; k < 131072; k += 4) {
14025 __asm__ volatile (
14026 "vmovupd (%0), %%ymm0\n"
14027 "vmovupd (%1), %%ymm1\n"
14028 "vmovupd (%2), %%ymm2\n"
14029 "vmovupd (%3), %%ymm3\n"
14030 "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
14031 "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
14032 "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
14033 "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
14034 "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
14035 "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
14036 "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
14037 "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
14038 "vmovupd %%ymm0, (%0)\n"
14039 "vmovupd %%ymm1, (%1)\n"
14040 "vmovupd %%ymm2, (%2)\n"
14041 "vmovupd %%ymm3, (%3)\n"
14042 :: "r"(buf + j + k + 0), "r"(buf + j + k + 131072), "r"(buf + j + k + 262144), "r"(buf + j + k + 393216) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
14043 );
14044 }
14045 }
14046 return;
14047 }
14048 }
void helper_double_19(double *buf);
/*
 * In-place unnormalized Walsh–Hadamard-style butterfly transform over
 * 2^19 = 524288 doubles starting at buf (the depth==19 branch of the
 * recursive helper iterates over indices 0..524287).  No scaling is
 * applied anywhere in the visible kernels.
 * NOTE(review): buf is presumably required to be allocated with at least
 * 524288 doubles; alignment is not required (kernels use vmovupd) —
 * confirm against callers.
 */
void helper_double_19(double *buf) {
helper_double_19_recursive(buf, 19);
}
void helper_double_20_recursive(double *buf, int depth);
/*
 * Recursive kernel for the size-2^20 transform: applies in-place add/sub
 * butterflies (u, v) -> (u + v, u - v) over buf.  `depth` selects which
 * pre-generated stage this call handles; valid values visible here are
 * 9, 12, 15, 18 and 20 (any other depth falls through and does nothing).
 * Each branch first recurses on sub-blocks and then combines them with
 * AVX butterflies, 4 doubles (one ymm register) per pointer operand.
 * All loads/stores use vmovupd, so no particular alignment of buf is
 * required by the asm itself.
 */
void helper_double_20_recursive(double *buf, int depth) {
if (depth == 9) {
/*
 * Base case: 512 doubles.  First pass handles one 32-double chunk per
 * iteration and performs five butterfly levels (strides 1, 2, 4, 8, 16):
 * strides 1 and 2 are done inside each ymm register with permutes, and
 * strides 4/8/16 across the eight registers loaded from offsets
 * 0,4,...,28.
 */
for (int j = 0; j < 512; j += 32) {
for (int k = 0; k < 4; k += 4) {
__asm__ volatile (
"vmovupd (%0), %%ymm0\n"
"vmovupd (%1), %%ymm1\n"
"vmovupd (%2), %%ymm2\n"
"vmovupd (%3), %%ymm3\n"
"vmovupd (%4), %%ymm4\n"
"vmovupd (%5), %%ymm5\n"
"vmovupd (%6), %%ymm6\n"
"vmovupd (%7), %%ymm7\n"
/*
 * Stride-1 butterflies within each register: duplicate even lanes
 * (vpermilpd $0) and odd lanes (vpermilpd $15), negate the odd copy,
 * then vaddsubpd produces (even+odd, even-odd) per pair.
 */
"vpermilpd $0, %%ymm0, %%ymm8\n"
"vpermilpd $15, %%ymm0, %%ymm9\n"
"vxorpd %%ymm10, %%ymm10, %%ymm10\n"
"vsubpd %%ymm9, %%ymm10, %%ymm11\n"
"vaddsubpd %%ymm11, %%ymm8, %%ymm0\n"
"vpermilpd $0, %%ymm1, %%ymm8\n"
"vpermilpd $15, %%ymm1, %%ymm9\n"
"vxorpd %%ymm10, %%ymm10, %%ymm10\n"
"vsubpd %%ymm9, %%ymm10, %%ymm11\n"
"vaddsubpd %%ymm11, %%ymm8, %%ymm1\n"
"vpermilpd $0, %%ymm2, %%ymm8\n"
"vpermilpd $15, %%ymm2, %%ymm9\n"
"vxorpd %%ymm10, %%ymm10, %%ymm10\n"
"vsubpd %%ymm9, %%ymm10, %%ymm11\n"
"vaddsubpd %%ymm11, %%ymm8, %%ymm2\n"
"vpermilpd $0, %%ymm3, %%ymm8\n"
"vpermilpd $15, %%ymm3, %%ymm9\n"
"vxorpd %%ymm10, %%ymm10, %%ymm10\n"
"vsubpd %%ymm9, %%ymm10, %%ymm11\n"
"vaddsubpd %%ymm11, %%ymm8, %%ymm3\n"
"vpermilpd $0, %%ymm4, %%ymm8\n"
"vpermilpd $15, %%ymm4, %%ymm9\n"
"vxorpd %%ymm10, %%ymm10, %%ymm10\n"
"vsubpd %%ymm9, %%ymm10, %%ymm11\n"
"vaddsubpd %%ymm11, %%ymm8, %%ymm4\n"
"vpermilpd $0, %%ymm5, %%ymm8\n"
"vpermilpd $15, %%ymm5, %%ymm9\n"
"vxorpd %%ymm10, %%ymm10, %%ymm10\n"
"vsubpd %%ymm9, %%ymm10, %%ymm11\n"
"vaddsubpd %%ymm11, %%ymm8, %%ymm5\n"
"vpermilpd $0, %%ymm6, %%ymm8\n"
"vpermilpd $15, %%ymm6, %%ymm9\n"
"vxorpd %%ymm10, %%ymm10, %%ymm10\n"
"vsubpd %%ymm9, %%ymm10, %%ymm11\n"
"vaddsubpd %%ymm11, %%ymm8, %%ymm6\n"
"vpermilpd $0, %%ymm7, %%ymm8\n"
"vpermilpd $15, %%ymm7, %%ymm9\n"
"vxorpd %%ymm10, %%ymm10, %%ymm10\n"
"vsubpd %%ymm9, %%ymm10, %%ymm11\n"
"vaddsubpd %%ymm11, %%ymm8, %%ymm7\n"
/*
 * Stride-2 butterflies within each register: broadcast the low 128-bit
 * half (vperm2f128 $0), build (high, -high) with vperm2f128 $49 on the
 * negated copy, then one vaddpd yields (lo+hi, lo-hi) per half.
 */
"vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n"
"vxorpd %%ymm9, %%ymm9, %%ymm9\n"
"vsubpd %%ymm0, %%ymm9, %%ymm10\n"
"vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n"
"vaddpd %%ymm11, %%ymm8, %%ymm0\n"
"vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n"
"vxorpd %%ymm9, %%ymm9, %%ymm9\n"
"vsubpd %%ymm1, %%ymm9, %%ymm10\n"
"vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n"
"vaddpd %%ymm11, %%ymm8, %%ymm1\n"
"vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n"
"vxorpd %%ymm9, %%ymm9, %%ymm9\n"
"vsubpd %%ymm2, %%ymm9, %%ymm10\n"
"vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n"
"vaddpd %%ymm11, %%ymm8, %%ymm2\n"
"vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n"
"vxorpd %%ymm9, %%ymm9, %%ymm9\n"
"vsubpd %%ymm3, %%ymm9, %%ymm10\n"
"vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n"
"vaddpd %%ymm11, %%ymm8, %%ymm3\n"
"vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n"
"vxorpd %%ymm9, %%ymm9, %%ymm9\n"
"vsubpd %%ymm4, %%ymm9, %%ymm10\n"
"vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n"
"vaddpd %%ymm11, %%ymm8, %%ymm4\n"
"vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n"
"vxorpd %%ymm9, %%ymm9, %%ymm9\n"
"vsubpd %%ymm5, %%ymm9, %%ymm10\n"
"vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n"
"vaddpd %%ymm11, %%ymm8, %%ymm5\n"
"vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n"
"vxorpd %%ymm9, %%ymm9, %%ymm9\n"
"vsubpd %%ymm6, %%ymm9, %%ymm10\n"
"vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n"
"vaddpd %%ymm11, %%ymm8, %%ymm6\n"
"vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n"
"vxorpd %%ymm9, %%ymm9, %%ymm9\n"
"vsubpd %%ymm7, %%ymm9, %%ymm10\n"
"vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n"
"vaddpd %%ymm11, %%ymm8, %%ymm7\n"
/*
 * Three butterfly levels across the eight registers (strides 4, 8, 16
 * in doubles): standard add/sub network, results land in ymm8..ymm15.
 */
"vaddpd %%ymm1, %%ymm0, %%ymm8\n"
"vsubpd %%ymm1, %%ymm0, %%ymm9\n"
"vaddpd %%ymm3, %%ymm2, %%ymm10\n"
"vsubpd %%ymm3, %%ymm2, %%ymm11\n"
"vaddpd %%ymm5, %%ymm4, %%ymm12\n"
"vsubpd %%ymm5, %%ymm4, %%ymm13\n"
"vaddpd %%ymm7, %%ymm6, %%ymm14\n"
"vsubpd %%ymm7, %%ymm6, %%ymm15\n"
"vaddpd %%ymm10, %%ymm8, %%ymm0\n"
"vsubpd %%ymm10, %%ymm8, %%ymm2\n"
"vaddpd %%ymm11, %%ymm9, %%ymm1\n"
"vsubpd %%ymm11, %%ymm9, %%ymm3\n"
"vaddpd %%ymm14, %%ymm12, %%ymm4\n"
"vsubpd %%ymm14, %%ymm12, %%ymm6\n"
"vaddpd %%ymm15, %%ymm13, %%ymm5\n"
"vsubpd %%ymm15, %%ymm13, %%ymm7\n"
"vaddpd %%ymm4, %%ymm0, %%ymm8\n"
"vsubpd %%ymm4, %%ymm0, %%ymm12\n"
"vaddpd %%ymm5, %%ymm1, %%ymm9\n"
"vsubpd %%ymm5, %%ymm1, %%ymm13\n"
"vaddpd %%ymm6, %%ymm2, %%ymm10\n"
"vsubpd %%ymm6, %%ymm2, %%ymm14\n"
"vaddpd %%ymm7, %%ymm3, %%ymm11\n"
"vsubpd %%ymm7, %%ymm3, %%ymm15\n"
"vmovupd %%ymm8, (%0)\n"
"vmovupd %%ymm9, (%1)\n"
"vmovupd %%ymm10, (%2)\n"
"vmovupd %%ymm11, (%3)\n"
"vmovupd %%ymm12, (%4)\n"
"vmovupd %%ymm13, (%5)\n"
"vmovupd %%ymm14, (%6)\n"
"vmovupd %%ymm15, (%7)\n"
:: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
);
}
}
/* Butterfly levels at strides 32, 64 and 128 across eight 32-double lanes. */
for (int j = 0; j < 512; j += 256) {
for (int k = 0; k < 32; k += 4) {
__asm__ volatile (
"vmovupd (%0), %%ymm0\n"
"vmovupd (%1), %%ymm1\n"
"vmovupd (%2), %%ymm2\n"
"vmovupd (%3), %%ymm3\n"
"vmovupd (%4), %%ymm4\n"
"vmovupd (%5), %%ymm5\n"
"vmovupd (%6), %%ymm6\n"
"vmovupd (%7), %%ymm7\n"
"vaddpd %%ymm1, %%ymm0, %%ymm8\n"
"vsubpd %%ymm1, %%ymm0, %%ymm9\n"
"vaddpd %%ymm3, %%ymm2, %%ymm10\n"
"vsubpd %%ymm3, %%ymm2, %%ymm11\n"
"vaddpd %%ymm5, %%ymm4, %%ymm12\n"
"vsubpd %%ymm5, %%ymm4, %%ymm13\n"
"vaddpd %%ymm7, %%ymm6, %%ymm14\n"
"vsubpd %%ymm7, %%ymm6, %%ymm15\n"
"vaddpd %%ymm10, %%ymm8, %%ymm0\n"
"vsubpd %%ymm10, %%ymm8, %%ymm2\n"
"vaddpd %%ymm11, %%ymm9, %%ymm1\n"
"vsubpd %%ymm11, %%ymm9, %%ymm3\n"
"vaddpd %%ymm14, %%ymm12, %%ymm4\n"
"vsubpd %%ymm14, %%ymm12, %%ymm6\n"
"vaddpd %%ymm15, %%ymm13, %%ymm5\n"
"vsubpd %%ymm15, %%ymm13, %%ymm7\n"
"vaddpd %%ymm4, %%ymm0, %%ymm8\n"
"vsubpd %%ymm4, %%ymm0, %%ymm12\n"
"vaddpd %%ymm5, %%ymm1, %%ymm9\n"
"vsubpd %%ymm5, %%ymm1, %%ymm13\n"
"vaddpd %%ymm6, %%ymm2, %%ymm10\n"
"vsubpd %%ymm6, %%ymm2, %%ymm14\n"
"vaddpd %%ymm7, %%ymm3, %%ymm11\n"
"vsubpd %%ymm7, %%ymm3, %%ymm15\n"
"vmovupd %%ymm8, (%0)\n"
"vmovupd %%ymm9, (%1)\n"
"vmovupd %%ymm10, (%2)\n"
"vmovupd %%ymm11, (%3)\n"
"vmovupd %%ymm12, (%4)\n"
"vmovupd %%ymm13, (%5)\n"
"vmovupd %%ymm14, (%6)\n"
"vmovupd %%ymm15, (%7)\n"
:: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
);
}
}
/* Final level for the 512-block: single butterfly at stride 256. */
for (int j = 0; j < 512; j += 512) {
for (int k = 0; k < 256; k += 4) {
__asm__ volatile (
"vmovupd (%0), %%ymm0\n"
"vmovupd (%1), %%ymm1\n"
"vaddpd %%ymm1, %%ymm0, %%ymm8\n"
"vsubpd %%ymm1, %%ymm0, %%ymm9\n"
"vmovupd %%ymm8, (%0)\n"
"vmovupd %%ymm9, (%1)\n"
:: "r"(buf + j + k + 0), "r"(buf + j + k + 256) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
);
}
}
return;
}
if (depth == 12) {
/* 4096 doubles: eight depth-9 sub-transforms, then three combining
 * levels (strides 512, 1024, 2048) with the 8-way butterfly network. */
helper_double_20_recursive(buf + 0, 9);
helper_double_20_recursive(buf + 512, 9);
helper_double_20_recursive(buf + 1024, 9);
helper_double_20_recursive(buf + 1536, 9);
helper_double_20_recursive(buf + 2048, 9);
helper_double_20_recursive(buf + 2560, 9);
helper_double_20_recursive(buf + 3072, 9);
helper_double_20_recursive(buf + 3584, 9);
for (int j = 0; j < 4096; j += 4096) {
for (int k = 0; k < 512; k += 4) {
__asm__ volatile (
"vmovupd (%0), %%ymm0\n"
"vmovupd (%1), %%ymm1\n"
"vmovupd (%2), %%ymm2\n"
"vmovupd (%3), %%ymm3\n"
"vmovupd (%4), %%ymm4\n"
"vmovupd (%5), %%ymm5\n"
"vmovupd (%6), %%ymm6\n"
"vmovupd (%7), %%ymm7\n"
"vaddpd %%ymm1, %%ymm0, %%ymm8\n"
"vsubpd %%ymm1, %%ymm0, %%ymm9\n"
"vaddpd %%ymm3, %%ymm2, %%ymm10\n"
"vsubpd %%ymm3, %%ymm2, %%ymm11\n"
"vaddpd %%ymm5, %%ymm4, %%ymm12\n"
"vsubpd %%ymm5, %%ymm4, %%ymm13\n"
"vaddpd %%ymm7, %%ymm6, %%ymm14\n"
"vsubpd %%ymm7, %%ymm6, %%ymm15\n"
"vaddpd %%ymm10, %%ymm8, %%ymm0\n"
"vsubpd %%ymm10, %%ymm8, %%ymm2\n"
"vaddpd %%ymm11, %%ymm9, %%ymm1\n"
"vsubpd %%ymm11, %%ymm9, %%ymm3\n"
"vaddpd %%ymm14, %%ymm12, %%ymm4\n"
"vsubpd %%ymm14, %%ymm12, %%ymm6\n"
"vaddpd %%ymm15, %%ymm13, %%ymm5\n"
"vsubpd %%ymm15, %%ymm13, %%ymm7\n"
"vaddpd %%ymm4, %%ymm0, %%ymm8\n"
"vsubpd %%ymm4, %%ymm0, %%ymm12\n"
"vaddpd %%ymm5, %%ymm1, %%ymm9\n"
"vsubpd %%ymm5, %%ymm1, %%ymm13\n"
"vaddpd %%ymm6, %%ymm2, %%ymm10\n"
"vsubpd %%ymm6, %%ymm2, %%ymm14\n"
"vaddpd %%ymm7, %%ymm3, %%ymm11\n"
"vsubpd %%ymm7, %%ymm3, %%ymm15\n"
"vmovupd %%ymm8, (%0)\n"
"vmovupd %%ymm9, (%1)\n"
"vmovupd %%ymm10, (%2)\n"
"vmovupd %%ymm11, (%3)\n"
"vmovupd %%ymm12, (%4)\n"
"vmovupd %%ymm13, (%5)\n"
"vmovupd %%ymm14, (%6)\n"
"vmovupd %%ymm15, (%7)\n"
:: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
);
}
}
return;
}
if (depth == 15) {
/* 32768 doubles: eight depth-12 sub-transforms, then three combining
 * levels (strides 4096, 8192, 16384). */
helper_double_20_recursive(buf + 0, 12);
helper_double_20_recursive(buf + 4096, 12);
helper_double_20_recursive(buf + 8192, 12);
helper_double_20_recursive(buf + 12288, 12);
helper_double_20_recursive(buf + 16384, 12);
helper_double_20_recursive(buf + 20480, 12);
helper_double_20_recursive(buf + 24576, 12);
helper_double_20_recursive(buf + 28672, 12);
for (int j = 0; j < 32768; j += 32768) {
for (int k = 0; k < 4096; k += 4) {
__asm__ volatile (
"vmovupd (%0), %%ymm0\n"
"vmovupd (%1), %%ymm1\n"
"vmovupd (%2), %%ymm2\n"
"vmovupd (%3), %%ymm3\n"
"vmovupd (%4), %%ymm4\n"
"vmovupd (%5), %%ymm5\n"
"vmovupd (%6), %%ymm6\n"
"vmovupd (%7), %%ymm7\n"
"vaddpd %%ymm1, %%ymm0, %%ymm8\n"
"vsubpd %%ymm1, %%ymm0, %%ymm9\n"
"vaddpd %%ymm3, %%ymm2, %%ymm10\n"
"vsubpd %%ymm3, %%ymm2, %%ymm11\n"
"vaddpd %%ymm5, %%ymm4, %%ymm12\n"
"vsubpd %%ymm5, %%ymm4, %%ymm13\n"
"vaddpd %%ymm7, %%ymm6, %%ymm14\n"
"vsubpd %%ymm7, %%ymm6, %%ymm15\n"
"vaddpd %%ymm10, %%ymm8, %%ymm0\n"
"vsubpd %%ymm10, %%ymm8, %%ymm2\n"
"vaddpd %%ymm11, %%ymm9, %%ymm1\n"
"vsubpd %%ymm11, %%ymm9, %%ymm3\n"
"vaddpd %%ymm14, %%ymm12, %%ymm4\n"
"vsubpd %%ymm14, %%ymm12, %%ymm6\n"
"vaddpd %%ymm15, %%ymm13, %%ymm5\n"
"vsubpd %%ymm15, %%ymm13, %%ymm7\n"
"vaddpd %%ymm4, %%ymm0, %%ymm8\n"
"vsubpd %%ymm4, %%ymm0, %%ymm12\n"
"vaddpd %%ymm5, %%ymm1, %%ymm9\n"
"vsubpd %%ymm5, %%ymm1, %%ymm13\n"
"vaddpd %%ymm6, %%ymm2, %%ymm10\n"
"vsubpd %%ymm6, %%ymm2, %%ymm14\n"
"vaddpd %%ymm7, %%ymm3, %%ymm11\n"
"vsubpd %%ymm7, %%ymm3, %%ymm15\n"
"vmovupd %%ymm8, (%0)\n"
"vmovupd %%ymm9, (%1)\n"
"vmovupd %%ymm10, (%2)\n"
"vmovupd %%ymm11, (%3)\n"
"vmovupd %%ymm12, (%4)\n"
"vmovupd %%ymm13, (%5)\n"
"vmovupd %%ymm14, (%6)\n"
"vmovupd %%ymm15, (%7)\n"
:: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
);
}
}
return;
}
if (depth == 18) {
/* 262144 doubles: eight depth-15 sub-transforms, then three combining
 * levels (strides 32768, 65536, 131072). */
helper_double_20_recursive(buf + 0, 15);
helper_double_20_recursive(buf + 32768, 15);
helper_double_20_recursive(buf + 65536, 15);
helper_double_20_recursive(buf + 98304, 15);
helper_double_20_recursive(buf + 131072, 15);
helper_double_20_recursive(buf + 163840, 15);
helper_double_20_recursive(buf + 196608, 15);
helper_double_20_recursive(buf + 229376, 15);
for (int j = 0; j < 262144; j += 262144) {
for (int k = 0; k < 32768; k += 4) {
__asm__ volatile (
"vmovupd (%0), %%ymm0\n"
"vmovupd (%1), %%ymm1\n"
"vmovupd (%2), %%ymm2\n"
"vmovupd (%3), %%ymm3\n"
"vmovupd (%4), %%ymm4\n"
"vmovupd (%5), %%ymm5\n"
"vmovupd (%6), %%ymm6\n"
"vmovupd (%7), %%ymm7\n"
"vaddpd %%ymm1, %%ymm0, %%ymm8\n"
"vsubpd %%ymm1, %%ymm0, %%ymm9\n"
"vaddpd %%ymm3, %%ymm2, %%ymm10\n"
"vsubpd %%ymm3, %%ymm2, %%ymm11\n"
"vaddpd %%ymm5, %%ymm4, %%ymm12\n"
"vsubpd %%ymm5, %%ymm4, %%ymm13\n"
"vaddpd %%ymm7, %%ymm6, %%ymm14\n"
"vsubpd %%ymm7, %%ymm6, %%ymm15\n"
"vaddpd %%ymm10, %%ymm8, %%ymm0\n"
"vsubpd %%ymm10, %%ymm8, %%ymm2\n"
"vaddpd %%ymm11, %%ymm9, %%ymm1\n"
"vsubpd %%ymm11, %%ymm9, %%ymm3\n"
"vaddpd %%ymm14, %%ymm12, %%ymm4\n"
"vsubpd %%ymm14, %%ymm12, %%ymm6\n"
"vaddpd %%ymm15, %%ymm13, %%ymm5\n"
"vsubpd %%ymm15, %%ymm13, %%ymm7\n"
"vaddpd %%ymm4, %%ymm0, %%ymm8\n"
"vsubpd %%ymm4, %%ymm0, %%ymm12\n"
"vaddpd %%ymm5, %%ymm1, %%ymm9\n"
"vsubpd %%ymm5, %%ymm1, %%ymm13\n"
"vaddpd %%ymm6, %%ymm2, %%ymm10\n"
"vsubpd %%ymm6, %%ymm2, %%ymm14\n"
"vaddpd %%ymm7, %%ymm3, %%ymm11\n"
"vsubpd %%ymm7, %%ymm3, %%ymm15\n"
"vmovupd %%ymm8, (%0)\n"
"vmovupd %%ymm9, (%1)\n"
"vmovupd %%ymm10, (%2)\n"
"vmovupd %%ymm11, (%3)\n"
"vmovupd %%ymm12, (%4)\n"
"vmovupd %%ymm13, (%5)\n"
"vmovupd %%ymm14, (%6)\n"
"vmovupd %%ymm15, (%7)\n"
:: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
);
}
}
return;
}
if (depth == 20) {
/* Top level, 1048576 doubles: four depth-18 sub-transforms, then the
 * last two levels (strides 262144 and 524288) with a 4-way butterfly. */
helper_double_20_recursive(buf + 0, 18);
helper_double_20_recursive(buf + 262144, 18);
helper_double_20_recursive(buf + 524288, 18);
helper_double_20_recursive(buf + 786432, 18);
for (int j = 0; j < 1048576; j += 1048576) {
for (int k = 0; k < 262144; k += 4) {
__asm__ volatile (
"vmovupd (%0), %%ymm0\n"
"vmovupd (%1), %%ymm1\n"
"vmovupd (%2), %%ymm2\n"
"vmovupd (%3), %%ymm3\n"
"vaddpd %%ymm1, %%ymm0, %%ymm8\n"
"vsubpd %%ymm1, %%ymm0, %%ymm9\n"
"vaddpd %%ymm3, %%ymm2, %%ymm10\n"
"vsubpd %%ymm3, %%ymm2, %%ymm11\n"
"vaddpd %%ymm10, %%ymm8, %%ymm0\n"
"vsubpd %%ymm10, %%ymm8, %%ymm2\n"
"vaddpd %%ymm11, %%ymm9, %%ymm1\n"
"vsubpd %%ymm11, %%ymm9, %%ymm3\n"
"vmovupd %%ymm0, (%0)\n"
"vmovupd %%ymm1, (%1)\n"
"vmovupd %%ymm2, (%2)\n"
"vmovupd %%ymm3, (%3)\n"
:: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
);
}
}
return;
}
}
void helper_double_20(double *buf);
/*
 * In-place unnormalized Walsh–Hadamard-style butterfly transform over
 * 2^20 = 1048576 doubles starting at buf (the depth==20 branch of the
 * recursive helper iterates over indices 0..1048575).  No scaling is
 * applied anywhere in the visible kernels.
 * NOTE(review): buf is presumably required to be allocated with at least
 * 1048576 doubles; alignment is not required (kernels use vmovupd) —
 * confirm against callers.
 */
void helper_double_20(double *buf) {
helper_double_20_recursive(buf, 20);
}
14454 void helper_double_21_recursive(double *buf, int depth);
helper_double_21_recursive(double * buf,int depth)14455 void helper_double_21_recursive(double *buf, int depth) {
14456 if (depth == 7) {
14457 for (int j = 0; j < 128; j += 32) {
14458 for (int k = 0; k < 4; k += 4) {
14459 __asm__ volatile (
14460 "vmovupd (%0), %%ymm0\n"
14461 "vmovupd (%1), %%ymm1\n"
14462 "vmovupd (%2), %%ymm2\n"
14463 "vmovupd (%3), %%ymm3\n"
14464 "vmovupd (%4), %%ymm4\n"
14465 "vmovupd (%5), %%ymm5\n"
14466 "vmovupd (%6), %%ymm6\n"
14467 "vmovupd (%7), %%ymm7\n"
14468 "vpermilpd $0, %%ymm0, %%ymm8\n"
14469 "vpermilpd $15, %%ymm0, %%ymm9\n"
14470 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
14471 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
14472 "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n"
14473 "vpermilpd $0, %%ymm1, %%ymm8\n"
14474 "vpermilpd $15, %%ymm1, %%ymm9\n"
14475 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
14476 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
14477 "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n"
14478 "vpermilpd $0, %%ymm2, %%ymm8\n"
14479 "vpermilpd $15, %%ymm2, %%ymm9\n"
14480 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
14481 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
14482 "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n"
14483 "vpermilpd $0, %%ymm3, %%ymm8\n"
14484 "vpermilpd $15, %%ymm3, %%ymm9\n"
14485 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
14486 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
14487 "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n"
14488 "vpermilpd $0, %%ymm4, %%ymm8\n"
14489 "vpermilpd $15, %%ymm4, %%ymm9\n"
14490 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
14491 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
14492 "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n"
14493 "vpermilpd $0, %%ymm5, %%ymm8\n"
14494 "vpermilpd $15, %%ymm5, %%ymm9\n"
14495 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
14496 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
14497 "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n"
14498 "vpermilpd $0, %%ymm6, %%ymm8\n"
14499 "vpermilpd $15, %%ymm6, %%ymm9\n"
14500 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
14501 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
14502 "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n"
14503 "vpermilpd $0, %%ymm7, %%ymm8\n"
14504 "vpermilpd $15, %%ymm7, %%ymm9\n"
14505 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
14506 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
14507 "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n"
14508 "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n"
14509 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
14510 "vsubpd %%ymm0, %%ymm9, %%ymm10\n"
14511 "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n"
14512 "vaddpd %%ymm11, %%ymm8, %%ymm0\n"
14513 "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n"
14514 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
14515 "vsubpd %%ymm1, %%ymm9, %%ymm10\n"
14516 "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n"
14517 "vaddpd %%ymm11, %%ymm8, %%ymm1\n"
14518 "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n"
14519 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
14520 "vsubpd %%ymm2, %%ymm9, %%ymm10\n"
14521 "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n"
14522 "vaddpd %%ymm11, %%ymm8, %%ymm2\n"
14523 "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n"
14524 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
14525 "vsubpd %%ymm3, %%ymm9, %%ymm10\n"
14526 "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n"
14527 "vaddpd %%ymm11, %%ymm8, %%ymm3\n"
14528 "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n"
14529 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
14530 "vsubpd %%ymm4, %%ymm9, %%ymm10\n"
14531 "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n"
14532 "vaddpd %%ymm11, %%ymm8, %%ymm4\n"
14533 "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n"
14534 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
14535 "vsubpd %%ymm5, %%ymm9, %%ymm10\n"
14536 "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n"
14537 "vaddpd %%ymm11, %%ymm8, %%ymm5\n"
14538 "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n"
14539 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
14540 "vsubpd %%ymm6, %%ymm9, %%ymm10\n"
14541 "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n"
14542 "vaddpd %%ymm11, %%ymm8, %%ymm6\n"
14543 "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n"
14544 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
14545 "vsubpd %%ymm7, %%ymm9, %%ymm10\n"
14546 "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n"
14547 "vaddpd %%ymm11, %%ymm8, %%ymm7\n"
14548 "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
14549 "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
14550 "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
14551 "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
14552 "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
14553 "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
14554 "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
14555 "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
14556 "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
14557 "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
14558 "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
14559 "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
14560 "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
14561 "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
14562 "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
14563 "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
14564 "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
14565 "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
14566 "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
14567 "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
14568 "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
14569 "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
14570 "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
14571 "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
14572 "vmovupd %%ymm8, (%0)\n"
14573 "vmovupd %%ymm9, (%1)\n"
14574 "vmovupd %%ymm10, (%2)\n"
14575 "vmovupd %%ymm11, (%3)\n"
14576 "vmovupd %%ymm12, (%4)\n"
14577 "vmovupd %%ymm13, (%5)\n"
14578 "vmovupd %%ymm14, (%6)\n"
14579 "vmovupd %%ymm15, (%7)\n"
14580 :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
14581 );
14582 }
14583 }
14584 for (int j = 0; j < 128; j += 128) {
14585 for (int k = 0; k < 32; k += 4) {
14586 __asm__ volatile (
14587 "vmovupd (%0), %%ymm0\n"
14588 "vmovupd (%1), %%ymm1\n"
14589 "vmovupd (%2), %%ymm2\n"
14590 "vmovupd (%3), %%ymm3\n"
14591 "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
14592 "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
14593 "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
14594 "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
14595 "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
14596 "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
14597 "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
14598 "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
14599 "vmovupd %%ymm0, (%0)\n"
14600 "vmovupd %%ymm1, (%1)\n"
14601 "vmovupd %%ymm2, (%2)\n"
14602 "vmovupd %%ymm3, (%3)\n"
14603 :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
14604 );
14605 }
14606 }
14607 return;
14608 }
14609 if (depth == 10) {
14610 helper_double_21_recursive(buf + 0, 7);
14611 helper_double_21_recursive(buf + 128, 7);
14612 helper_double_21_recursive(buf + 256, 7);
14613 helper_double_21_recursive(buf + 384, 7);
14614 helper_double_21_recursive(buf + 512, 7);
14615 helper_double_21_recursive(buf + 640, 7);
14616 helper_double_21_recursive(buf + 768, 7);
14617 helper_double_21_recursive(buf + 896, 7);
14618 for (int j = 0; j < 1024; j += 1024) {
14619 for (int k = 0; k < 128; k += 4) {
14620 __asm__ volatile (
14621 "vmovupd (%0), %%ymm0\n"
14622 "vmovupd (%1), %%ymm1\n"
14623 "vmovupd (%2), %%ymm2\n"
14624 "vmovupd (%3), %%ymm3\n"
14625 "vmovupd (%4), %%ymm4\n"
14626 "vmovupd (%5), %%ymm5\n"
14627 "vmovupd (%6), %%ymm6\n"
14628 "vmovupd (%7), %%ymm7\n"
14629 "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
14630 "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
14631 "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
14632 "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
14633 "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
14634 "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
14635 "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
14636 "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
14637 "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
14638 "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
14639 "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
14640 "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
14641 "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
14642 "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
14643 "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
14644 "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
14645 "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
14646 "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
14647 "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
14648 "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
14649 "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
14650 "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
14651 "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
14652 "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
14653 "vmovupd %%ymm8, (%0)\n"
14654 "vmovupd %%ymm9, (%1)\n"
14655 "vmovupd %%ymm10, (%2)\n"
14656 "vmovupd %%ymm11, (%3)\n"
14657 "vmovupd %%ymm12, (%4)\n"
14658 "vmovupd %%ymm13, (%5)\n"
14659 "vmovupd %%ymm14, (%6)\n"
14660 "vmovupd %%ymm15, (%7)\n"
14661 :: "r"(buf + j + k + 0), "r"(buf + j + k + 128), "r"(buf + j + k + 256), "r"(buf + j + k + 384), "r"(buf + j + k + 512), "r"(buf + j + k + 640), "r"(buf + j + k + 768), "r"(buf + j + k + 896) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
14662 );
14663 }
14664 }
14665 return;
14666 }
14667 if (depth == 13) {
14668 helper_double_21_recursive(buf + 0, 10);
14669 helper_double_21_recursive(buf + 1024, 10);
14670 helper_double_21_recursive(buf + 2048, 10);
14671 helper_double_21_recursive(buf + 3072, 10);
14672 helper_double_21_recursive(buf + 4096, 10);
14673 helper_double_21_recursive(buf + 5120, 10);
14674 helper_double_21_recursive(buf + 6144, 10);
14675 helper_double_21_recursive(buf + 7168, 10);
14676 for (int j = 0; j < 8192; j += 8192) {
14677 for (int k = 0; k < 1024; k += 4) {
14678 __asm__ volatile (
14679 "vmovupd (%0), %%ymm0\n"
14680 "vmovupd (%1), %%ymm1\n"
14681 "vmovupd (%2), %%ymm2\n"
14682 "vmovupd (%3), %%ymm3\n"
14683 "vmovupd (%4), %%ymm4\n"
14684 "vmovupd (%5), %%ymm5\n"
14685 "vmovupd (%6), %%ymm6\n"
14686 "vmovupd (%7), %%ymm7\n"
14687 "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
14688 "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
14689 "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
14690 "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
14691 "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
14692 "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
14693 "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
14694 "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
14695 "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
14696 "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
14697 "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
14698 "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
14699 "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
14700 "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
14701 "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
14702 "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
14703 "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
14704 "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
14705 "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
14706 "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
14707 "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
14708 "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
14709 "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
14710 "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
14711 "vmovupd %%ymm8, (%0)\n"
14712 "vmovupd %%ymm9, (%1)\n"
14713 "vmovupd %%ymm10, (%2)\n"
14714 "vmovupd %%ymm11, (%3)\n"
14715 "vmovupd %%ymm12, (%4)\n"
14716 "vmovupd %%ymm13, (%5)\n"
14717 "vmovupd %%ymm14, (%6)\n"
14718 "vmovupd %%ymm15, (%7)\n"
14719 :: "r"(buf + j + k + 0), "r"(buf + j + k + 1024), "r"(buf + j + k + 2048), "r"(buf + j + k + 3072), "r"(buf + j + k + 4096), "r"(buf + j + k + 5120), "r"(buf + j + k + 6144), "r"(buf + j + k + 7168) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
14720 );
14721 }
14722 }
14723 return;
14724 }
14725 if (depth == 16) {
14726 helper_double_21_recursive(buf + 0, 13);
14727 helper_double_21_recursive(buf + 8192, 13);
14728 helper_double_21_recursive(buf + 16384, 13);
14729 helper_double_21_recursive(buf + 24576, 13);
14730 helper_double_21_recursive(buf + 32768, 13);
14731 helper_double_21_recursive(buf + 40960, 13);
14732 helper_double_21_recursive(buf + 49152, 13);
14733 helper_double_21_recursive(buf + 57344, 13);
14734 for (int j = 0; j < 65536; j += 65536) {
14735 for (int k = 0; k < 8192; k += 4) {
14736 __asm__ volatile (
14737 "vmovupd (%0), %%ymm0\n"
14738 "vmovupd (%1), %%ymm1\n"
14739 "vmovupd (%2), %%ymm2\n"
14740 "vmovupd (%3), %%ymm3\n"
14741 "vmovupd (%4), %%ymm4\n"
14742 "vmovupd (%5), %%ymm5\n"
14743 "vmovupd (%6), %%ymm6\n"
14744 "vmovupd (%7), %%ymm7\n"
14745 "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
14746 "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
14747 "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
14748 "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
14749 "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
14750 "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
14751 "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
14752 "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
14753 "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
14754 "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
14755 "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
14756 "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
14757 "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
14758 "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
14759 "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
14760 "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
14761 "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
14762 "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
14763 "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
14764 "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
14765 "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
14766 "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
14767 "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
14768 "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
14769 "vmovupd %%ymm8, (%0)\n"
14770 "vmovupd %%ymm9, (%1)\n"
14771 "vmovupd %%ymm10, (%2)\n"
14772 "vmovupd %%ymm11, (%3)\n"
14773 "vmovupd %%ymm12, (%4)\n"
14774 "vmovupd %%ymm13, (%5)\n"
14775 "vmovupd %%ymm14, (%6)\n"
14776 "vmovupd %%ymm15, (%7)\n"
14777 :: "r"(buf + j + k + 0), "r"(buf + j + k + 8192), "r"(buf + j + k + 16384), "r"(buf + j + k + 24576), "r"(buf + j + k + 32768), "r"(buf + j + k + 40960), "r"(buf + j + k + 49152), "r"(buf + j + k + 57344) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
14778 );
14779 }
14780 }
14781 return;
14782 }
14783 if (depth == 19) {
14784 helper_double_21_recursive(buf + 0, 16);
14785 helper_double_21_recursive(buf + 65536, 16);
14786 helper_double_21_recursive(buf + 131072, 16);
14787 helper_double_21_recursive(buf + 196608, 16);
14788 helper_double_21_recursive(buf + 262144, 16);
14789 helper_double_21_recursive(buf + 327680, 16);
14790 helper_double_21_recursive(buf + 393216, 16);
14791 helper_double_21_recursive(buf + 458752, 16);
14792 for (int j = 0; j < 524288; j += 524288) {
14793 for (int k = 0; k < 65536; k += 4) {
14794 __asm__ volatile (
14795 "vmovupd (%0), %%ymm0\n"
14796 "vmovupd (%1), %%ymm1\n"
14797 "vmovupd (%2), %%ymm2\n"
14798 "vmovupd (%3), %%ymm3\n"
14799 "vmovupd (%4), %%ymm4\n"
14800 "vmovupd (%5), %%ymm5\n"
14801 "vmovupd (%6), %%ymm6\n"
14802 "vmovupd (%7), %%ymm7\n"
14803 "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
14804 "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
14805 "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
14806 "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
14807 "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
14808 "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
14809 "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
14810 "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
14811 "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
14812 "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
14813 "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
14814 "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
14815 "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
14816 "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
14817 "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
14818 "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
14819 "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
14820 "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
14821 "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
14822 "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
14823 "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
14824 "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
14825 "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
14826 "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
14827 "vmovupd %%ymm8, (%0)\n"
14828 "vmovupd %%ymm9, (%1)\n"
14829 "vmovupd %%ymm10, (%2)\n"
14830 "vmovupd %%ymm11, (%3)\n"
14831 "vmovupd %%ymm12, (%4)\n"
14832 "vmovupd %%ymm13, (%5)\n"
14833 "vmovupd %%ymm14, (%6)\n"
14834 "vmovupd %%ymm15, (%7)\n"
14835 :: "r"(buf + j + k + 0), "r"(buf + j + k + 65536), "r"(buf + j + k + 131072), "r"(buf + j + k + 196608), "r"(buf + j + k + 262144), "r"(buf + j + k + 327680), "r"(buf + j + k + 393216), "r"(buf + j + k + 458752) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
14836 );
14837 }
14838 }
14839 return;
14840 }
14841 if (depth == 21) {
14842 helper_double_21_recursive(buf + 0, 19);
14843 helper_double_21_recursive(buf + 524288, 19);
14844 helper_double_21_recursive(buf + 1048576, 19);
14845 helper_double_21_recursive(buf + 1572864, 19);
14846 for (int j = 0; j < 2097152; j += 2097152) {
14847 for (int k = 0; k < 524288; k += 4) {
14848 __asm__ volatile (
14849 "vmovupd (%0), %%ymm0\n"
14850 "vmovupd (%1), %%ymm1\n"
14851 "vmovupd (%2), %%ymm2\n"
14852 "vmovupd (%3), %%ymm3\n"
14853 "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
14854 "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
14855 "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
14856 "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
14857 "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
14858 "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
14859 "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
14860 "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
14861 "vmovupd %%ymm0, (%0)\n"
14862 "vmovupd %%ymm1, (%1)\n"
14863 "vmovupd %%ymm2, (%2)\n"
14864 "vmovupd %%ymm3, (%3)\n"
14865 :: "r"(buf + j + k + 0), "r"(buf + j + k + 524288), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1572864) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
14866 );
14867 }
14868 }
14869 return;
14870 }
14871 }
void helper_double_21(double *buf);
/*
 * In-place transform of a 2^21-element double buffer.
 *
 * Thin entry point: dispatches to helper_double_21_recursive at the top
 * depth (21).  buf must hold 2^21 = 2097152 doubles (the recursive helper
 * indexes up to that bound at depth 21).  No normalization is applied.
 */
void helper_double_21(double *buf) {
  helper_double_21_recursive(buf, 21);
}
void helper_double_22_recursive(double *buf, int depth);
/*
 * In-place unnormalized Fast Walsh-Hadamard transform of n = 2^depth
 * doubles stored in buf.
 *
 * This is a portable re-expression of the unrolled AVX2 version.  The
 * original dispatch chain only performed work for depth 11 (its base
 * case) and the recursively reachable depths 14, 17, 20 and 22; every
 * other depth fell through and left buf untouched, and that behavior is
 * preserved here.
 *
 * Each level applies the butterfly (u, v) -> (u + v, u - v) at stride
 * `len`, from len = 1 up to len = n / 2.  The original code performed
 * exactly the same per-element additions/subtractions in the same
 * association order (intra-register shuffle stages for len = 1 and 2,
 * then radix-8/radix-4 combines for the higher levels), so the
 * floating-point results are bit-for-bit identical.
 */
void helper_double_22_recursive(double *buf, int depth) {
  /* Mirror the original: silently do nothing for unsupported depths. */
  if (depth != 11 && depth != 14 && depth != 17 && depth != 20 && depth != 22) {
    return;
  }
  const long n = 1L << depth;
  for (long len = 1; len < n; len <<= 1) {
    for (long base = 0; base < n; base += 2 * len) {
      for (long i = base; i < base + len; ++i) {
        double u = buf[i];
        double v = buf[i + len];
        buf[i] = u + v;
        buf[i + len] = u - v;
      }
    }
  }
}
void helper_double_22(double *buf);
/*
 * In-place transform of a 2^22-element double buffer.
 *
 * Thin entry point: dispatches to helper_double_22_recursive at the top
 * depth (22).  buf must hold 2^22 = 4194304 doubles (the recursive helper
 * indexes up to that bound at depth 22).  No normalization is applied.
 */
void helper_double_22(double *buf) {
  helper_double_22_recursive(buf, 22);
}
15311 void helper_double_23_recursive(double *buf, int depth);
helper_double_23_recursive(double * buf,int depth)15312 void helper_double_23_recursive(double *buf, int depth) {
15313 if (depth == 11) {
15314 for (int j = 0; j < 2048; j += 32) {
15315 for (int k = 0; k < 4; k += 4) {
15316 __asm__ volatile (
15317 "vmovupd (%0), %%ymm0\n"
15318 "vmovupd (%1), %%ymm1\n"
15319 "vmovupd (%2), %%ymm2\n"
15320 "vmovupd (%3), %%ymm3\n"
15321 "vmovupd (%4), %%ymm4\n"
15322 "vmovupd (%5), %%ymm5\n"
15323 "vmovupd (%6), %%ymm6\n"
15324 "vmovupd (%7), %%ymm7\n"
15325 "vpermilpd $0, %%ymm0, %%ymm8\n"
15326 "vpermilpd $15, %%ymm0, %%ymm9\n"
15327 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
15328 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
15329 "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n"
15330 "vpermilpd $0, %%ymm1, %%ymm8\n"
15331 "vpermilpd $15, %%ymm1, %%ymm9\n"
15332 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
15333 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
15334 "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n"
15335 "vpermilpd $0, %%ymm2, %%ymm8\n"
15336 "vpermilpd $15, %%ymm2, %%ymm9\n"
15337 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
15338 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
15339 "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n"
15340 "vpermilpd $0, %%ymm3, %%ymm8\n"
15341 "vpermilpd $15, %%ymm3, %%ymm9\n"
15342 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
15343 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
15344 "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n"
15345 "vpermilpd $0, %%ymm4, %%ymm8\n"
15346 "vpermilpd $15, %%ymm4, %%ymm9\n"
15347 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
15348 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
15349 "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n"
15350 "vpermilpd $0, %%ymm5, %%ymm8\n"
15351 "vpermilpd $15, %%ymm5, %%ymm9\n"
15352 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
15353 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
15354 "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n"
15355 "vpermilpd $0, %%ymm6, %%ymm8\n"
15356 "vpermilpd $15, %%ymm6, %%ymm9\n"
15357 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
15358 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
15359 "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n"
15360 "vpermilpd $0, %%ymm7, %%ymm8\n"
15361 "vpermilpd $15, %%ymm7, %%ymm9\n"
15362 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
15363 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
15364 "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n"
15365 "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n"
15366 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
15367 "vsubpd %%ymm0, %%ymm9, %%ymm10\n"
15368 "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n"
15369 "vaddpd %%ymm11, %%ymm8, %%ymm0\n"
15370 "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n"
15371 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
15372 "vsubpd %%ymm1, %%ymm9, %%ymm10\n"
15373 "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n"
15374 "vaddpd %%ymm11, %%ymm8, %%ymm1\n"
15375 "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n"
15376 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
15377 "vsubpd %%ymm2, %%ymm9, %%ymm10\n"
15378 "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n"
15379 "vaddpd %%ymm11, %%ymm8, %%ymm2\n"
15380 "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n"
15381 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
15382 "vsubpd %%ymm3, %%ymm9, %%ymm10\n"
15383 "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n"
15384 "vaddpd %%ymm11, %%ymm8, %%ymm3\n"
15385 "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n"
15386 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
15387 "vsubpd %%ymm4, %%ymm9, %%ymm10\n"
15388 "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n"
15389 "vaddpd %%ymm11, %%ymm8, %%ymm4\n"
15390 "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n"
15391 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
15392 "vsubpd %%ymm5, %%ymm9, %%ymm10\n"
15393 "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n"
15394 "vaddpd %%ymm11, %%ymm8, %%ymm5\n"
15395 "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n"
15396 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
15397 "vsubpd %%ymm6, %%ymm9, %%ymm10\n"
15398 "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n"
15399 "vaddpd %%ymm11, %%ymm8, %%ymm6\n"
15400 "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n"
15401 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
15402 "vsubpd %%ymm7, %%ymm9, %%ymm10\n"
15403 "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n"
15404 "vaddpd %%ymm11, %%ymm8, %%ymm7\n"
15405 "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
15406 "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
15407 "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
15408 "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
15409 "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
15410 "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
15411 "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
15412 "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
15413 "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
15414 "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
15415 "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
15416 "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
15417 "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
15418 "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
15419 "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
15420 "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
15421 "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
15422 "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
15423 "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
15424 "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
15425 "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
15426 "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
15427 "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
15428 "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
15429 "vmovupd %%ymm8, (%0)\n"
15430 "vmovupd %%ymm9, (%1)\n"
15431 "vmovupd %%ymm10, (%2)\n"
15432 "vmovupd %%ymm11, (%3)\n"
15433 "vmovupd %%ymm12, (%4)\n"
15434 "vmovupd %%ymm13, (%5)\n"
15435 "vmovupd %%ymm14, (%6)\n"
15436 "vmovupd %%ymm15, (%7)\n"
15437 :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
15438 );
15439 }
15440 }
15441 for (int j = 0; j < 2048; j += 256) {
15442 for (int k = 0; k < 32; k += 4) {
15443 __asm__ volatile (
15444 "vmovupd (%0), %%ymm0\n"
15445 "vmovupd (%1), %%ymm1\n"
15446 "vmovupd (%2), %%ymm2\n"
15447 "vmovupd (%3), %%ymm3\n"
15448 "vmovupd (%4), %%ymm4\n"
15449 "vmovupd (%5), %%ymm5\n"
15450 "vmovupd (%6), %%ymm6\n"
15451 "vmovupd (%7), %%ymm7\n"
15452 "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
15453 "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
15454 "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
15455 "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
15456 "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
15457 "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
15458 "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
15459 "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
15460 "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
15461 "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
15462 "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
15463 "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
15464 "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
15465 "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
15466 "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
15467 "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
15468 "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
15469 "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
15470 "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
15471 "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
15472 "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
15473 "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
15474 "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
15475 "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
15476 "vmovupd %%ymm8, (%0)\n"
15477 "vmovupd %%ymm9, (%1)\n"
15478 "vmovupd %%ymm10, (%2)\n"
15479 "vmovupd %%ymm11, (%3)\n"
15480 "vmovupd %%ymm12, (%4)\n"
15481 "vmovupd %%ymm13, (%5)\n"
15482 "vmovupd %%ymm14, (%6)\n"
15483 "vmovupd %%ymm15, (%7)\n"
15484 :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
15485 );
15486 }
15487 }
15488 for (int j = 0; j < 2048; j += 2048) {
15489 for (int k = 0; k < 256; k += 4) {
15490 __asm__ volatile (
15491 "vmovupd (%0), %%ymm0\n"
15492 "vmovupd (%1), %%ymm1\n"
15493 "vmovupd (%2), %%ymm2\n"
15494 "vmovupd (%3), %%ymm3\n"
15495 "vmovupd (%4), %%ymm4\n"
15496 "vmovupd (%5), %%ymm5\n"
15497 "vmovupd (%6), %%ymm6\n"
15498 "vmovupd (%7), %%ymm7\n"
15499 "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
15500 "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
15501 "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
15502 "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
15503 "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
15504 "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
15505 "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
15506 "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
15507 "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
15508 "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
15509 "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
15510 "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
15511 "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
15512 "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
15513 "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
15514 "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
15515 "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
15516 "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
15517 "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
15518 "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
15519 "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
15520 "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
15521 "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
15522 "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
15523 "vmovupd %%ymm8, (%0)\n"
15524 "vmovupd %%ymm9, (%1)\n"
15525 "vmovupd %%ymm10, (%2)\n"
15526 "vmovupd %%ymm11, (%3)\n"
15527 "vmovupd %%ymm12, (%4)\n"
15528 "vmovupd %%ymm13, (%5)\n"
15529 "vmovupd %%ymm14, (%6)\n"
15530 "vmovupd %%ymm15, (%7)\n"
15531 :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
15532 );
15533 }
15534 }
15535 return;
15536 }
15537 if (depth == 14) {
15538 helper_double_23_recursive(buf + 0, 11);
15539 helper_double_23_recursive(buf + 2048, 11);
15540 helper_double_23_recursive(buf + 4096, 11);
15541 helper_double_23_recursive(buf + 6144, 11);
15542 helper_double_23_recursive(buf + 8192, 11);
15543 helper_double_23_recursive(buf + 10240, 11);
15544 helper_double_23_recursive(buf + 12288, 11);
15545 helper_double_23_recursive(buf + 14336, 11);
15546 for (int j = 0; j < 16384; j += 16384) {
15547 for (int k = 0; k < 2048; k += 4) {
15548 __asm__ volatile (
15549 "vmovupd (%0), %%ymm0\n"
15550 "vmovupd (%1), %%ymm1\n"
15551 "vmovupd (%2), %%ymm2\n"
15552 "vmovupd (%3), %%ymm3\n"
15553 "vmovupd (%4), %%ymm4\n"
15554 "vmovupd (%5), %%ymm5\n"
15555 "vmovupd (%6), %%ymm6\n"
15556 "vmovupd (%7), %%ymm7\n"
15557 "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
15558 "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
15559 "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
15560 "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
15561 "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
15562 "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
15563 "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
15564 "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
15565 "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
15566 "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
15567 "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
15568 "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
15569 "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
15570 "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
15571 "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
15572 "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
15573 "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
15574 "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
15575 "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
15576 "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
15577 "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
15578 "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
15579 "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
15580 "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
15581 "vmovupd %%ymm8, (%0)\n"
15582 "vmovupd %%ymm9, (%1)\n"
15583 "vmovupd %%ymm10, (%2)\n"
15584 "vmovupd %%ymm11, (%3)\n"
15585 "vmovupd %%ymm12, (%4)\n"
15586 "vmovupd %%ymm13, (%5)\n"
15587 "vmovupd %%ymm14, (%6)\n"
15588 "vmovupd %%ymm15, (%7)\n"
15589 :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
15590 );
15591 }
15592 }
15593 return;
15594 }
15595 if (depth == 17) {
15596 helper_double_23_recursive(buf + 0, 14);
15597 helper_double_23_recursive(buf + 16384, 14);
15598 helper_double_23_recursive(buf + 32768, 14);
15599 helper_double_23_recursive(buf + 49152, 14);
15600 helper_double_23_recursive(buf + 65536, 14);
15601 helper_double_23_recursive(buf + 81920, 14);
15602 helper_double_23_recursive(buf + 98304, 14);
15603 helper_double_23_recursive(buf + 114688, 14);
15604 for (int j = 0; j < 131072; j += 131072) {
15605 for (int k = 0; k < 16384; k += 4) {
15606 __asm__ volatile (
15607 "vmovupd (%0), %%ymm0\n"
15608 "vmovupd (%1), %%ymm1\n"
15609 "vmovupd (%2), %%ymm2\n"
15610 "vmovupd (%3), %%ymm3\n"
15611 "vmovupd (%4), %%ymm4\n"
15612 "vmovupd (%5), %%ymm5\n"
15613 "vmovupd (%6), %%ymm6\n"
15614 "vmovupd (%7), %%ymm7\n"
15615 "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
15616 "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
15617 "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
15618 "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
15619 "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
15620 "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
15621 "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
15622 "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
15623 "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
15624 "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
15625 "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
15626 "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
15627 "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
15628 "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
15629 "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
15630 "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
15631 "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
15632 "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
15633 "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
15634 "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
15635 "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
15636 "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
15637 "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
15638 "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
15639 "vmovupd %%ymm8, (%0)\n"
15640 "vmovupd %%ymm9, (%1)\n"
15641 "vmovupd %%ymm10, (%2)\n"
15642 "vmovupd %%ymm11, (%3)\n"
15643 "vmovupd %%ymm12, (%4)\n"
15644 "vmovupd %%ymm13, (%5)\n"
15645 "vmovupd %%ymm14, (%6)\n"
15646 "vmovupd %%ymm15, (%7)\n"
15647 :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152), "r"(buf + j + k + 65536), "r"(buf + j + k + 81920), "r"(buf + j + k + 98304), "r"(buf + j + k + 114688) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
15648 );
15649 }
15650 }
15651 return;
15652 }
15653 if (depth == 20) {
15654 helper_double_23_recursive(buf + 0, 17);
15655 helper_double_23_recursive(buf + 131072, 17);
15656 helper_double_23_recursive(buf + 262144, 17);
15657 helper_double_23_recursive(buf + 393216, 17);
15658 helper_double_23_recursive(buf + 524288, 17);
15659 helper_double_23_recursive(buf + 655360, 17);
15660 helper_double_23_recursive(buf + 786432, 17);
15661 helper_double_23_recursive(buf + 917504, 17);
15662 for (int j = 0; j < 1048576; j += 1048576) {
15663 for (int k = 0; k < 131072; k += 4) {
15664 __asm__ volatile (
15665 "vmovupd (%0), %%ymm0\n"
15666 "vmovupd (%1), %%ymm1\n"
15667 "vmovupd (%2), %%ymm2\n"
15668 "vmovupd (%3), %%ymm3\n"
15669 "vmovupd (%4), %%ymm4\n"
15670 "vmovupd (%5), %%ymm5\n"
15671 "vmovupd (%6), %%ymm6\n"
15672 "vmovupd (%7), %%ymm7\n"
15673 "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
15674 "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
15675 "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
15676 "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
15677 "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
15678 "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
15679 "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
15680 "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
15681 "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
15682 "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
15683 "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
15684 "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
15685 "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
15686 "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
15687 "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
15688 "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
15689 "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
15690 "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
15691 "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
15692 "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
15693 "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
15694 "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
15695 "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
15696 "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
15697 "vmovupd %%ymm8, (%0)\n"
15698 "vmovupd %%ymm9, (%1)\n"
15699 "vmovupd %%ymm10, (%2)\n"
15700 "vmovupd %%ymm11, (%3)\n"
15701 "vmovupd %%ymm12, (%4)\n"
15702 "vmovupd %%ymm13, (%5)\n"
15703 "vmovupd %%ymm14, (%6)\n"
15704 "vmovupd %%ymm15, (%7)\n"
15705 :: "r"(buf + j + k + 0), "r"(buf + j + k + 131072), "r"(buf + j + k + 262144), "r"(buf + j + k + 393216), "r"(buf + j + k + 524288), "r"(buf + j + k + 655360), "r"(buf + j + k + 786432), "r"(buf + j + k + 917504) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
15706 );
15707 }
15708 }
15709 return;
15710 }
15711 if (depth == 23) {
15712 helper_double_23_recursive(buf + 0, 20);
15713 helper_double_23_recursive(buf + 1048576, 20);
15714 helper_double_23_recursive(buf + 2097152, 20);
15715 helper_double_23_recursive(buf + 3145728, 20);
15716 helper_double_23_recursive(buf + 4194304, 20);
15717 helper_double_23_recursive(buf + 5242880, 20);
15718 helper_double_23_recursive(buf + 6291456, 20);
15719 helper_double_23_recursive(buf + 7340032, 20);
15720 for (int j = 0; j < 8388608; j += 8388608) {
15721 for (int k = 0; k < 1048576; k += 4) {
15722 __asm__ volatile (
15723 "vmovupd (%0), %%ymm0\n"
15724 "vmovupd (%1), %%ymm1\n"
15725 "vmovupd (%2), %%ymm2\n"
15726 "vmovupd (%3), %%ymm3\n"
15727 "vmovupd (%4), %%ymm4\n"
15728 "vmovupd (%5), %%ymm5\n"
15729 "vmovupd (%6), %%ymm6\n"
15730 "vmovupd (%7), %%ymm7\n"
15731 "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
15732 "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
15733 "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
15734 "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
15735 "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
15736 "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
15737 "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
15738 "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
15739 "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
15740 "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
15741 "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
15742 "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
15743 "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
15744 "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
15745 "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
15746 "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
15747 "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
15748 "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
15749 "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
15750 "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
15751 "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
15752 "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
15753 "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
15754 "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
15755 "vmovupd %%ymm8, (%0)\n"
15756 "vmovupd %%ymm9, (%1)\n"
15757 "vmovupd %%ymm10, (%2)\n"
15758 "vmovupd %%ymm11, (%3)\n"
15759 "vmovupd %%ymm12, (%4)\n"
15760 "vmovupd %%ymm13, (%5)\n"
15761 "vmovupd %%ymm14, (%6)\n"
15762 "vmovupd %%ymm15, (%7)\n"
15763 :: "r"(buf + j + k + 0), "r"(buf + j + k + 1048576), "r"(buf + j + k + 2097152), "r"(buf + j + k + 3145728), "r"(buf + j + k + 4194304), "r"(buf + j + k + 5242880), "r"(buf + j + k + 6291456), "r"(buf + j + k + 7340032) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
15764 );
15765 }
15766 }
15767 return;
15768 }
15769 }
/* Public entry point: in-place, unnormalized Fast Hadamard Transform over a
 * contiguous block of 2^23 doubles.  The caller owns `buf` and must supply at
 * least 8388608 elements; the result overwrites the input. */
void helper_double_23(double *buf);
void helper_double_23(double *buf) {
    /* Kick off the recursive kernel at the full depth (log2 of the length). */
    helper_double_23_recursive(buf, 23);
}
/* Recursive kernel for an in-place, unnormalized Fast Hadamard Transform over
 * 2^24 doubles (driven by helper_double_24 below).  `depth` is the log2 of the
 * current sub-transform length and, as generated, only ever takes the values
 * {10, 13, 16, 19, 22, 24}: each branch handles exactly one depth, recurses on
 * eight (four for the final depth-24 merge) equal sub-blocks, then combines the
 * sub-results with a radix-8 (radix-4) AVX butterfly pass.  Any other `depth`
 * falls through and does nothing.
 *
 * Every asm block processes 4 doubles per ymm register with unaligned loads
 * (vmovupd), so `buf` needs no 32-byte alignment, and clobbers ymm0-ymm15 plus
 * memory.  NOTE(review): `buf` must hold at least 2^depth doubles -- not
 * checked here; presumably guaranteed by the generator's callers. */
void helper_double_24_recursive(double *buf, int depth);
void helper_double_24_recursive(double *buf, int depth) {
  if (depth == 10) {
    /* Base case: fully transform each 32-double chunk while it sits in
     * registers (stages of stride 1, 2, 4, 8, 16 within the chunk). */
    for (int j = 0; j < 1024; j += 32) {
      for (int k = 0; k < 4; k += 4) {
        __asm__ volatile (
          /* Load the 32-double chunk into ymm0-ymm7. */
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vmovupd (%2), %%ymm2\n"
          "vmovupd (%3), %%ymm3\n"
          "vmovupd (%4), %%ymm4\n"
          "vmovupd (%5), %%ymm5\n"
          "vmovupd (%6), %%ymm6\n"
          "vmovupd (%7), %%ymm7\n"
          /* Stride-1 stage: for each register, duplicate the even elements
           * (vpermilpd $0) and odd elements (vpermilpd $15), negate the odd
           * copy, and vaddsubpd to produce (u+v, u-v) on adjacent pairs. */
          "vpermilpd $0, %%ymm0, %%ymm8\n"
          "vpermilpd $15, %%ymm0, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n"
          "vpermilpd $0, %%ymm1, %%ymm8\n"
          "vpermilpd $15, %%ymm1, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n"
          "vpermilpd $0, %%ymm2, %%ymm8\n"
          "vpermilpd $15, %%ymm2, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n"
          "vpermilpd $0, %%ymm3, %%ymm8\n"
          "vpermilpd $15, %%ymm3, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n"
          "vpermilpd $0, %%ymm4, %%ymm8\n"
          "vpermilpd $15, %%ymm4, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n"
          "vpermilpd $0, %%ymm5, %%ymm8\n"
          "vpermilpd $15, %%ymm5, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n"
          "vpermilpd $0, %%ymm6, %%ymm8\n"
          "vpermilpd $15, %%ymm6, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n"
          "vpermilpd $0, %%ymm7, %%ymm8\n"
          "vpermilpd $15, %%ymm7, %%ymm9\n"
          "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
          "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
          "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n"
          /* Stride-2 stage: butterfly between the two 128-bit halves of each
           * register -- vperm2f128 $0 broadcasts the low half, $49 pairs the
           * original high half with its negation, and vaddpd combines them
           * into (lo+hi, lo-hi). */
          "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm0, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm0\n"
          "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm1, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm1\n"
          "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm2, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm2\n"
          "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm3, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm3\n"
          "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm4, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm4\n"
          "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm5, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm5\n"
          "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm6, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm6\n"
          "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n"
          "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
          "vsubpd %%ymm7, %%ymm9, %%ymm10\n"
          "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n"
          "vaddpd %%ymm11, %%ymm8, %%ymm7\n"
          /* Strides 4/8/16: three radix-2 butterfly stages across the eight
           * registers (pairing distance 1, then 2, then 4 in register space),
           * leaving the results in ymm8-ymm15. */
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
          "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
          "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
          "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
          "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
          "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
          "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
          "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
          "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
          "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
          "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
          "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
          "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
          "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
          "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
          "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
          "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
          "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
          /* Store the transformed chunk back in place. */
          "vmovupd %%ymm8, (%0)\n"
          "vmovupd %%ymm9, (%1)\n"
          "vmovupd %%ymm10, (%2)\n"
          "vmovupd %%ymm11, (%3)\n"
          "vmovupd %%ymm12, (%4)\n"
          "vmovupd %%ymm13, (%5)\n"
          "vmovupd %%ymm14, (%6)\n"
          "vmovupd %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    /* Strides 32/64/128: radix-8 butterfly merging eight 32-double blocks
     * within each 256-double group. */
    for (int j = 0; j < 1024; j += 256) {
      for (int k = 0; k < 32; k += 4) {
        __asm__ volatile (
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vmovupd (%2), %%ymm2\n"
          "vmovupd (%3), %%ymm3\n"
          "vmovupd (%4), %%ymm4\n"
          "vmovupd (%5), %%ymm5\n"
          "vmovupd (%6), %%ymm6\n"
          "vmovupd (%7), %%ymm7\n"
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
          "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
          "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
          "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
          "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
          "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
          "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
          "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
          "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
          "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
          "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
          "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
          "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
          "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
          "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
          "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
          "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
          "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
          "vmovupd %%ymm8, (%0)\n"
          "vmovupd %%ymm9, (%1)\n"
          "vmovupd %%ymm10, (%2)\n"
          "vmovupd %%ymm11, (%3)\n"
          "vmovupd %%ymm12, (%4)\n"
          "vmovupd %%ymm13, (%5)\n"
          "vmovupd %%ymm14, (%6)\n"
          "vmovupd %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    /* Strides 256/512: radix-4 butterfly merging four 256-double blocks into
     * the full 1024-double sub-transform (only 4 lanes left at this depth). */
    for (int j = 0; j < 1024; j += 1024) {
      for (int k = 0; k < 256; k += 4) {
        __asm__ volatile (
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vmovupd (%2), %%ymm2\n"
          "vmovupd (%3), %%ymm3\n"
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
          "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
          "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
          "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
          "vmovupd %%ymm0, (%0)\n"
          "vmovupd %%ymm1, (%1)\n"
          "vmovupd %%ymm2, (%2)\n"
          "vmovupd %%ymm3, (%3)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 13) {
    /* Transform eight 2^10 sub-blocks, then merge with a radix-8 butterfly
     * at strides 1024/2048/4096. */
    helper_double_24_recursive(buf + 0, 10);
    helper_double_24_recursive(buf + 1024, 10);
    helper_double_24_recursive(buf + 2048, 10);
    helper_double_24_recursive(buf + 3072, 10);
    helper_double_24_recursive(buf + 4096, 10);
    helper_double_24_recursive(buf + 5120, 10);
    helper_double_24_recursive(buf + 6144, 10);
    helper_double_24_recursive(buf + 7168, 10);
    for (int j = 0; j < 8192; j += 8192) {
      for (int k = 0; k < 1024; k += 4) {
        __asm__ volatile (
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vmovupd (%2), %%ymm2\n"
          "vmovupd (%3), %%ymm3\n"
          "vmovupd (%4), %%ymm4\n"
          "vmovupd (%5), %%ymm5\n"
          "vmovupd (%6), %%ymm6\n"
          "vmovupd (%7), %%ymm7\n"
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
          "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
          "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
          "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
          "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
          "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
          "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
          "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
          "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
          "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
          "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
          "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
          "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
          "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
          "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
          "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
          "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
          "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
          "vmovupd %%ymm8, (%0)\n"
          "vmovupd %%ymm9, (%1)\n"
          "vmovupd %%ymm10, (%2)\n"
          "vmovupd %%ymm11, (%3)\n"
          "vmovupd %%ymm12, (%4)\n"
          "vmovupd %%ymm13, (%5)\n"
          "vmovupd %%ymm14, (%6)\n"
          "vmovupd %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 1024), "r"(buf + j + k + 2048), "r"(buf + j + k + 3072), "r"(buf + j + k + 4096), "r"(buf + j + k + 5120), "r"(buf + j + k + 6144), "r"(buf + j + k + 7168) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 16) {
    /* Eight 2^13 sub-transforms merged at strides 8192/16384/32768. */
    helper_double_24_recursive(buf + 0, 13);
    helper_double_24_recursive(buf + 8192, 13);
    helper_double_24_recursive(buf + 16384, 13);
    helper_double_24_recursive(buf + 24576, 13);
    helper_double_24_recursive(buf + 32768, 13);
    helper_double_24_recursive(buf + 40960, 13);
    helper_double_24_recursive(buf + 49152, 13);
    helper_double_24_recursive(buf + 57344, 13);
    for (int j = 0; j < 65536; j += 65536) {
      for (int k = 0; k < 8192; k += 4) {
        __asm__ volatile (
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vmovupd (%2), %%ymm2\n"
          "vmovupd (%3), %%ymm3\n"
          "vmovupd (%4), %%ymm4\n"
          "vmovupd (%5), %%ymm5\n"
          "vmovupd (%6), %%ymm6\n"
          "vmovupd (%7), %%ymm7\n"
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
          "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
          "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
          "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
          "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
          "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
          "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
          "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
          "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
          "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
          "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
          "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
          "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
          "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
          "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
          "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
          "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
          "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
          "vmovupd %%ymm8, (%0)\n"
          "vmovupd %%ymm9, (%1)\n"
          "vmovupd %%ymm10, (%2)\n"
          "vmovupd %%ymm11, (%3)\n"
          "vmovupd %%ymm12, (%4)\n"
          "vmovupd %%ymm13, (%5)\n"
          "vmovupd %%ymm14, (%6)\n"
          "vmovupd %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 8192), "r"(buf + j + k + 16384), "r"(buf + j + k + 24576), "r"(buf + j + k + 32768), "r"(buf + j + k + 40960), "r"(buf + j + k + 49152), "r"(buf + j + k + 57344) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 19) {
    /* Eight 2^16 sub-transforms merged at strides 65536/131072/262144. */
    helper_double_24_recursive(buf + 0, 16);
    helper_double_24_recursive(buf + 65536, 16);
    helper_double_24_recursive(buf + 131072, 16);
    helper_double_24_recursive(buf + 196608, 16);
    helper_double_24_recursive(buf + 262144, 16);
    helper_double_24_recursive(buf + 327680, 16);
    helper_double_24_recursive(buf + 393216, 16);
    helper_double_24_recursive(buf + 458752, 16);
    for (int j = 0; j < 524288; j += 524288) {
      for (int k = 0; k < 65536; k += 4) {
        __asm__ volatile (
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vmovupd (%2), %%ymm2\n"
          "vmovupd (%3), %%ymm3\n"
          "vmovupd (%4), %%ymm4\n"
          "vmovupd (%5), %%ymm5\n"
          "vmovupd (%6), %%ymm6\n"
          "vmovupd (%7), %%ymm7\n"
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
          "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
          "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
          "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
          "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
          "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
          "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
          "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
          "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
          "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
          "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
          "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
          "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
          "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
          "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
          "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
          "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
          "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
          "vmovupd %%ymm8, (%0)\n"
          "vmovupd %%ymm9, (%1)\n"
          "vmovupd %%ymm10, (%2)\n"
          "vmovupd %%ymm11, (%3)\n"
          "vmovupd %%ymm12, (%4)\n"
          "vmovupd %%ymm13, (%5)\n"
          "vmovupd %%ymm14, (%6)\n"
          "vmovupd %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 65536), "r"(buf + j + k + 131072), "r"(buf + j + k + 196608), "r"(buf + j + k + 262144), "r"(buf + j + k + 327680), "r"(buf + j + k + 393216), "r"(buf + j + k + 458752) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 22) {
    /* Eight 2^19 sub-transforms merged at strides 524288/1048576/2097152. */
    helper_double_24_recursive(buf + 0, 19);
    helper_double_24_recursive(buf + 524288, 19);
    helper_double_24_recursive(buf + 1048576, 19);
    helper_double_24_recursive(buf + 1572864, 19);
    helper_double_24_recursive(buf + 2097152, 19);
    helper_double_24_recursive(buf + 2621440, 19);
    helper_double_24_recursive(buf + 3145728, 19);
    helper_double_24_recursive(buf + 3670016, 19);
    for (int j = 0; j < 4194304; j += 4194304) {
      for (int k = 0; k < 524288; k += 4) {
        __asm__ volatile (
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vmovupd (%2), %%ymm2\n"
          "vmovupd (%3), %%ymm3\n"
          "vmovupd (%4), %%ymm4\n"
          "vmovupd (%5), %%ymm5\n"
          "vmovupd (%6), %%ymm6\n"
          "vmovupd (%7), %%ymm7\n"
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
          "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
          "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
          "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
          "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
          "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
          "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
          "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
          "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
          "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
          "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
          "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
          "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
          "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
          "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
          "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
          "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
          "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
          "vmovupd %%ymm8, (%0)\n"
          "vmovupd %%ymm9, (%1)\n"
          "vmovupd %%ymm10, (%2)\n"
          "vmovupd %%ymm11, (%3)\n"
          "vmovupd %%ymm12, (%4)\n"
          "vmovupd %%ymm13, (%5)\n"
          "vmovupd %%ymm14, (%6)\n"
          "vmovupd %%ymm15, (%7)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 524288), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1572864), "r"(buf + j + k + 2097152), "r"(buf + j + k + 2621440), "r"(buf + j + k + 3145728), "r"(buf + j + k + 3670016) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 24) {
    /* Top level: 24 = 22 + 2, so only four sub-blocks remain; merge them
     * with a radix-4 butterfly at strides 4194304/8388608. */
    helper_double_24_recursive(buf + 0, 22);
    helper_double_24_recursive(buf + 4194304, 22);
    helper_double_24_recursive(buf + 8388608, 22);
    helper_double_24_recursive(buf + 12582912, 22);
    for (int j = 0; j < 16777216; j += 16777216) {
      for (int k = 0; k < 4194304; k += 4) {
        __asm__ volatile (
          "vmovupd (%0), %%ymm0\n"
          "vmovupd (%1), %%ymm1\n"
          "vmovupd (%2), %%ymm2\n"
          "vmovupd (%3), %%ymm3\n"
          "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
          "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
          "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
          "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
          "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
          "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
          "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
          "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
          "vmovupd %%ymm0, (%0)\n"
          "vmovupd %%ymm1, (%1)\n"
          "vmovupd %%ymm2, (%2)\n"
          "vmovupd %%ymm3, (%3)\n"
          :: "r"(buf + j + k + 0), "r"(buf + j + k + 4194304), "r"(buf + j + k + 8388608), "r"(buf + j + k + 12582912) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
}
/* Public entry point: in-place, unnormalized Fast Hadamard Transform over a
 * contiguous block of 2^24 doubles.  The caller owns `buf` and must supply at
 * least 16777216 elements; the result overwrites the input. */
void helper_double_24(double *buf);
void helper_double_24(double *buf) {
    /* Kick off the recursive kernel at the full depth (log2 of the length). */
    helper_double_24_recursive(buf, 24);
}
void helper_double_25_recursive(double *buf, int depth);

/* In-register Hadamard pass over 32 consecutive doubles (5 butterfly
 * levels).  Loads eight ymm vectors from out[0..31]; for each vector it
 * first does the stride-1 butterfly inside each 128-bit lane
 * (vpermilpd + vaddsubpd), then the stride-2 butterfly across lanes
 * (vperm2f128), and finally combines the eight vectors with a radix-8
 * add/sub network (strides 4, 8, 16) before storing them back. */
static inline void helper_double_25_block32(double *out) {
  __asm__ volatile (
    "vmovupd (%0), %%ymm0\n"
    "vmovupd (%1), %%ymm1\n"
    "vmovupd (%2), %%ymm2\n"
    "vmovupd (%3), %%ymm3\n"
    "vmovupd (%4), %%ymm4\n"
    "vmovupd (%5), %%ymm5\n"
    "vmovupd (%6), %%ymm6\n"
    "vmovupd (%7), %%ymm7\n"
    /* stride-1 butterfly within each 128-bit lane of every vector */
    "vpermilpd $0, %%ymm0, %%ymm8\n"
    "vpermilpd $15, %%ymm0, %%ymm9\n"
    "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
    "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
    "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n"
    "vpermilpd $0, %%ymm1, %%ymm8\n"
    "vpermilpd $15, %%ymm1, %%ymm9\n"
    "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
    "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
    "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n"
    "vpermilpd $0, %%ymm2, %%ymm8\n"
    "vpermilpd $15, %%ymm2, %%ymm9\n"
    "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
    "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
    "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n"
    "vpermilpd $0, %%ymm3, %%ymm8\n"
    "vpermilpd $15, %%ymm3, %%ymm9\n"
    "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
    "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
    "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n"
    "vpermilpd $0, %%ymm4, %%ymm8\n"
    "vpermilpd $15, %%ymm4, %%ymm9\n"
    "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
    "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
    "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n"
    "vpermilpd $0, %%ymm5, %%ymm8\n"
    "vpermilpd $15, %%ymm5, %%ymm9\n"
    "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
    "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
    "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n"
    "vpermilpd $0, %%ymm6, %%ymm8\n"
    "vpermilpd $15, %%ymm6, %%ymm9\n"
    "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
    "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
    "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n"
    "vpermilpd $0, %%ymm7, %%ymm8\n"
    "vpermilpd $15, %%ymm7, %%ymm9\n"
    "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
    "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
    "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n"
    /* stride-2 butterfly across the two 128-bit lanes of every vector */
    "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n"
    "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
    "vsubpd %%ymm0, %%ymm9, %%ymm10\n"
    "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n"
    "vaddpd %%ymm11, %%ymm8, %%ymm0\n"
    "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n"
    "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
    "vsubpd %%ymm1, %%ymm9, %%ymm10\n"
    "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n"
    "vaddpd %%ymm11, %%ymm8, %%ymm1\n"
    "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n"
    "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
    "vsubpd %%ymm2, %%ymm9, %%ymm10\n"
    "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n"
    "vaddpd %%ymm11, %%ymm8, %%ymm2\n"
    "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n"
    "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
    "vsubpd %%ymm3, %%ymm9, %%ymm10\n"
    "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n"
    "vaddpd %%ymm11, %%ymm8, %%ymm3\n"
    "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n"
    "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
    "vsubpd %%ymm4, %%ymm9, %%ymm10\n"
    "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n"
    "vaddpd %%ymm11, %%ymm8, %%ymm4\n"
    "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n"
    "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
    "vsubpd %%ymm5, %%ymm9, %%ymm10\n"
    "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n"
    "vaddpd %%ymm11, %%ymm8, %%ymm5\n"
    "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n"
    "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
    "vsubpd %%ymm6, %%ymm9, %%ymm10\n"
    "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n"
    "vaddpd %%ymm11, %%ymm8, %%ymm6\n"
    "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n"
    "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
    "vsubpd %%ymm7, %%ymm9, %%ymm10\n"
    "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n"
    "vaddpd %%ymm11, %%ymm8, %%ymm7\n"
    /* radix-8 combine across the eight vectors (strides 4, 8, 16) */
    "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
    "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
    "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
    "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
    "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
    "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
    "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
    "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
    "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
    "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
    "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
    "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
    "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
    "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
    "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
    "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
    "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
    "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
    "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
    "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
    "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
    "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
    "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
    "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
    "vmovupd %%ymm8, (%0)\n"
    "vmovupd %%ymm9, (%1)\n"
    "vmovupd %%ymm10, (%2)\n"
    "vmovupd %%ymm11, (%3)\n"
    "vmovupd %%ymm12, (%4)\n"
    "vmovupd %%ymm13, (%5)\n"
    "vmovupd %%ymm14, (%6)\n"
    "vmovupd %%ymm15, (%7)\n"
    :: "r"(out + 0), "r"(out + 4), "r"(out + 8), "r"(out + 12), "r"(out + 16), "r"(out + 20), "r"(out + 24), "r"(out + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
  );
}

/* Three-level (radix-8) Hadamard combine of eight 4-double vectors held
 * at b0..b7.  Loads, runs the add/sub butterfly network, stores in place.
 * This single kernel replaces the six hand-expanded copies that the
 * generated code repeated for every depth, differing only in offsets. */
static inline void helper_double_25_butterfly8(double *b0, double *b1,
                                               double *b2, double *b3,
                                               double *b4, double *b5,
                                               double *b6, double *b7) {
  __asm__ volatile (
    "vmovupd (%0), %%ymm0\n"
    "vmovupd (%1), %%ymm1\n"
    "vmovupd (%2), %%ymm2\n"
    "vmovupd (%3), %%ymm3\n"
    "vmovupd (%4), %%ymm4\n"
    "vmovupd (%5), %%ymm5\n"
    "vmovupd (%6), %%ymm6\n"
    "vmovupd (%7), %%ymm7\n"
    "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
    "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
    "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
    "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
    "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
    "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
    "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
    "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
    "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
    "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
    "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
    "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
    "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
    "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
    "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
    "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
    "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
    "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
    "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
    "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
    "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
    "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
    "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
    "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
    "vmovupd %%ymm8, (%0)\n"
    "vmovupd %%ymm9, (%1)\n"
    "vmovupd %%ymm10, (%2)\n"
    "vmovupd %%ymm11, (%3)\n"
    "vmovupd %%ymm12, (%4)\n"
    "vmovupd %%ymm13, (%5)\n"
    "vmovupd %%ymm14, (%6)\n"
    "vmovupd %%ymm15, (%7)\n"
    :: "r"(b0), "r"(b1), "r"(b2), "r"(b3), "r"(b4), "r"(b5), "r"(b6), "r"(b7) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
  );
}

/* Two-level (radix-4) Hadamard combine of four 4-double vectors held at
 * b0..b3; used only for the final pass of the depth-25 transform. */
static inline void helper_double_25_butterfly4(double *b0, double *b1,
                                               double *b2, double *b3) {
  __asm__ volatile (
    "vmovupd (%0), %%ymm0\n"
    "vmovupd (%1), %%ymm1\n"
    "vmovupd (%2), %%ymm2\n"
    "vmovupd (%3), %%ymm3\n"
    "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
    "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
    "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
    "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
    "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
    "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
    "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
    "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
    "vmovupd %%ymm0, (%0)\n"
    "vmovupd %%ymm1, (%1)\n"
    "vmovupd %%ymm2, (%2)\n"
    "vmovupd %%ymm3, (%3)\n"
    :: "r"(b0), "r"(b1), "r"(b2), "r"(b3) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
  );
}

/* Recursive worker for the in-place, unnormalized Hadamard transform of
 * 2^25 doubles.
 *
 * depth selects which stage is run on buf (which must hold 2^depth
 * doubles):
 *   - depth == 8: base case.  Each 32-double chunk is transformed fully
 *     in registers, then one radix-8 pass at stride 32 completes the
 *     256-double transform.
 *   - depth in {11, 14, 17, 20, 23}: recurse into eight sub-blocks of
 *     size 2^(depth-3), then combine them with a radix-8 pass at stride
 *     2^(depth-3).  (These are the exact depth values the original
 *     generated code handled; other values remain a no-op.)
 *   - depth == 25: recurse into four sub-blocks of size 2^23, then
 *     combine with a radix-4 pass at stride 2^23.
 * Any other depth leaves buf untouched, matching the generated code. */
void helper_double_25_recursive(double *buf, int depth) {
  if (depth == 8) {
    for (int j = 0; j < 256; j += 32) {
      helper_double_25_block32(buf + j);
    }
    for (int k = 0; k < 32; k += 4) {
      helper_double_25_butterfly8(buf + k, buf + k + 32, buf + k + 64,
                                  buf + k + 96, buf + k + 128, buf + k + 160,
                                  buf + k + 192, buf + k + 224);
    }
    return;
  }
  if (depth == 11 || depth == 14 || depth == 17 || depth == 20 || depth == 23) {
    /* eight recursive sub-transforms, then a radix-8 combine */
    const long stride = 1L << (depth - 3);
    for (int i = 0; i < 8; ++i) {
      helper_double_25_recursive(buf + (long)i * stride, depth - 3);
    }
    for (long k = 0; k < stride; k += 4) {
      helper_double_25_butterfly8(buf + k, buf + k + stride,
                                  buf + k + 2 * stride, buf + k + 3 * stride,
                                  buf + k + 4 * stride, buf + k + 5 * stride,
                                  buf + k + 6 * stride, buf + k + 7 * stride);
    }
    return;
  }
  if (depth == 25) {
    /* four recursive sub-transforms of 2^23, then a radix-4 combine */
    const long quarter = 1L << 23;
    for (int i = 0; i < 4; ++i) {
      helper_double_25_recursive(buf + (long)i * quarter, 23);
    }
    for (long k = 0; k < quarter; k += 4) {
      helper_double_25_butterfly4(buf + k, buf + k + quarter,
                                  buf + k + 2 * quarter,
                                  buf + k + 3 * quarter);
    }
    return;
  }
}
void helper_double_25(double *buf);
/* In-place, unnormalized fast Hadamard transform of 2^25 doubles.
 * Thin entry point: hands the whole buffer to the recursive worker at the
 * top-level depth of 25.  buf must hold 2^25 (33554432) doubles; no
 * scaling factor is applied by the butterfly passes. */
void helper_double_25(double *buf) {
  helper_double_25_recursive(buf, 25);
}
16747 void helper_double_26_recursive(double *buf, int depth);
helper_double_26_recursive(double * buf,int depth)16748 void helper_double_26_recursive(double *buf, int depth) {
16749 if (depth == 11) {
16750 for (int j = 0; j < 2048; j += 32) {
16751 for (int k = 0; k < 4; k += 4) {
16752 __asm__ volatile (
16753 "vmovupd (%0), %%ymm0\n"
16754 "vmovupd (%1), %%ymm1\n"
16755 "vmovupd (%2), %%ymm2\n"
16756 "vmovupd (%3), %%ymm3\n"
16757 "vmovupd (%4), %%ymm4\n"
16758 "vmovupd (%5), %%ymm5\n"
16759 "vmovupd (%6), %%ymm6\n"
16760 "vmovupd (%7), %%ymm7\n"
16761 "vpermilpd $0, %%ymm0, %%ymm8\n"
16762 "vpermilpd $15, %%ymm0, %%ymm9\n"
16763 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
16764 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
16765 "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n"
16766 "vpermilpd $0, %%ymm1, %%ymm8\n"
16767 "vpermilpd $15, %%ymm1, %%ymm9\n"
16768 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
16769 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
16770 "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n"
16771 "vpermilpd $0, %%ymm2, %%ymm8\n"
16772 "vpermilpd $15, %%ymm2, %%ymm9\n"
16773 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
16774 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
16775 "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n"
16776 "vpermilpd $0, %%ymm3, %%ymm8\n"
16777 "vpermilpd $15, %%ymm3, %%ymm9\n"
16778 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
16779 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
16780 "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n"
16781 "vpermilpd $0, %%ymm4, %%ymm8\n"
16782 "vpermilpd $15, %%ymm4, %%ymm9\n"
16783 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
16784 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
16785 "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n"
16786 "vpermilpd $0, %%ymm5, %%ymm8\n"
16787 "vpermilpd $15, %%ymm5, %%ymm9\n"
16788 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
16789 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
16790 "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n"
16791 "vpermilpd $0, %%ymm6, %%ymm8\n"
16792 "vpermilpd $15, %%ymm6, %%ymm9\n"
16793 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
16794 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
16795 "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n"
16796 "vpermilpd $0, %%ymm7, %%ymm8\n"
16797 "vpermilpd $15, %%ymm7, %%ymm9\n"
16798 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
16799 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
16800 "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n"
16801 "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n"
16802 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
16803 "vsubpd %%ymm0, %%ymm9, %%ymm10\n"
16804 "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n"
16805 "vaddpd %%ymm11, %%ymm8, %%ymm0\n"
16806 "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n"
16807 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
16808 "vsubpd %%ymm1, %%ymm9, %%ymm10\n"
16809 "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n"
16810 "vaddpd %%ymm11, %%ymm8, %%ymm1\n"
16811 "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n"
16812 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
16813 "vsubpd %%ymm2, %%ymm9, %%ymm10\n"
16814 "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n"
16815 "vaddpd %%ymm11, %%ymm8, %%ymm2\n"
16816 "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n"
16817 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
16818 "vsubpd %%ymm3, %%ymm9, %%ymm10\n"
16819 "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n"
16820 "vaddpd %%ymm11, %%ymm8, %%ymm3\n"
16821 "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n"
16822 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
16823 "vsubpd %%ymm4, %%ymm9, %%ymm10\n"
16824 "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n"
16825 "vaddpd %%ymm11, %%ymm8, %%ymm4\n"
16826 "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n"
16827 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
16828 "vsubpd %%ymm5, %%ymm9, %%ymm10\n"
16829 "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n"
16830 "vaddpd %%ymm11, %%ymm8, %%ymm5\n"
16831 "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n"
16832 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
16833 "vsubpd %%ymm6, %%ymm9, %%ymm10\n"
16834 "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n"
16835 "vaddpd %%ymm11, %%ymm8, %%ymm6\n"
16836 "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n"
16837 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
16838 "vsubpd %%ymm7, %%ymm9, %%ymm10\n"
16839 "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n"
16840 "vaddpd %%ymm11, %%ymm8, %%ymm7\n"
16841 "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
16842 "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
16843 "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
16844 "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
16845 "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
16846 "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
16847 "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
16848 "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
16849 "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
16850 "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
16851 "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
16852 "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
16853 "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
16854 "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
16855 "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
16856 "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
16857 "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
16858 "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
16859 "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
16860 "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
16861 "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
16862 "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
16863 "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
16864 "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
16865 "vmovupd %%ymm8, (%0)\n"
16866 "vmovupd %%ymm9, (%1)\n"
16867 "vmovupd %%ymm10, (%2)\n"
16868 "vmovupd %%ymm11, (%3)\n"
16869 "vmovupd %%ymm12, (%4)\n"
16870 "vmovupd %%ymm13, (%5)\n"
16871 "vmovupd %%ymm14, (%6)\n"
16872 "vmovupd %%ymm15, (%7)\n"
16873 :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
16874 );
16875 }
16876 }
16877 for (int j = 0; j < 2048; j += 256) {
16878 for (int k = 0; k < 32; k += 4) {
16879 __asm__ volatile (
16880 "vmovupd (%0), %%ymm0\n"
16881 "vmovupd (%1), %%ymm1\n"
16882 "vmovupd (%2), %%ymm2\n"
16883 "vmovupd (%3), %%ymm3\n"
16884 "vmovupd (%4), %%ymm4\n"
16885 "vmovupd (%5), %%ymm5\n"
16886 "vmovupd (%6), %%ymm6\n"
16887 "vmovupd (%7), %%ymm7\n"
16888 "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
16889 "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
16890 "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
16891 "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
16892 "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
16893 "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
16894 "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
16895 "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
16896 "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
16897 "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
16898 "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
16899 "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
16900 "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
16901 "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
16902 "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
16903 "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
16904 "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
16905 "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
16906 "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
16907 "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
16908 "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
16909 "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
16910 "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
16911 "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
16912 "vmovupd %%ymm8, (%0)\n"
16913 "vmovupd %%ymm9, (%1)\n"
16914 "vmovupd %%ymm10, (%2)\n"
16915 "vmovupd %%ymm11, (%3)\n"
16916 "vmovupd %%ymm12, (%4)\n"
16917 "vmovupd %%ymm13, (%5)\n"
16918 "vmovupd %%ymm14, (%6)\n"
16919 "vmovupd %%ymm15, (%7)\n"
16920 :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
16921 );
16922 }
16923 }
16924 for (int j = 0; j < 2048; j += 2048) {
16925 for (int k = 0; k < 256; k += 4) {
16926 __asm__ volatile (
16927 "vmovupd (%0), %%ymm0\n"
16928 "vmovupd (%1), %%ymm1\n"
16929 "vmovupd (%2), %%ymm2\n"
16930 "vmovupd (%3), %%ymm3\n"
16931 "vmovupd (%4), %%ymm4\n"
16932 "vmovupd (%5), %%ymm5\n"
16933 "vmovupd (%6), %%ymm6\n"
16934 "vmovupd (%7), %%ymm7\n"
16935 "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
16936 "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
16937 "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
16938 "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
16939 "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
16940 "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
16941 "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
16942 "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
16943 "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
16944 "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
16945 "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
16946 "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
16947 "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
16948 "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
16949 "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
16950 "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
16951 "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
16952 "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
16953 "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
16954 "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
16955 "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
16956 "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
16957 "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
16958 "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
16959 "vmovupd %%ymm8, (%0)\n"
16960 "vmovupd %%ymm9, (%1)\n"
16961 "vmovupd %%ymm10, (%2)\n"
16962 "vmovupd %%ymm11, (%3)\n"
16963 "vmovupd %%ymm12, (%4)\n"
16964 "vmovupd %%ymm13, (%5)\n"
16965 "vmovupd %%ymm14, (%6)\n"
16966 "vmovupd %%ymm15, (%7)\n"
16967 :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
16968 );
16969 }
16970 }
16971 return;
16972 }
16973 if (depth == 14) {
16974 helper_double_26_recursive(buf + 0, 11);
16975 helper_double_26_recursive(buf + 2048, 11);
16976 helper_double_26_recursive(buf + 4096, 11);
16977 helper_double_26_recursive(buf + 6144, 11);
16978 helper_double_26_recursive(buf + 8192, 11);
16979 helper_double_26_recursive(buf + 10240, 11);
16980 helper_double_26_recursive(buf + 12288, 11);
16981 helper_double_26_recursive(buf + 14336, 11);
16982 for (int j = 0; j < 16384; j += 16384) {
16983 for (int k = 0; k < 2048; k += 4) {
16984 __asm__ volatile (
16985 "vmovupd (%0), %%ymm0\n"
16986 "vmovupd (%1), %%ymm1\n"
16987 "vmovupd (%2), %%ymm2\n"
16988 "vmovupd (%3), %%ymm3\n"
16989 "vmovupd (%4), %%ymm4\n"
16990 "vmovupd (%5), %%ymm5\n"
16991 "vmovupd (%6), %%ymm6\n"
16992 "vmovupd (%7), %%ymm7\n"
16993 "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
16994 "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
16995 "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
16996 "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
16997 "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
16998 "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
16999 "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
17000 "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
17001 "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
17002 "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
17003 "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
17004 "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
17005 "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
17006 "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
17007 "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
17008 "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
17009 "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
17010 "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
17011 "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
17012 "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
17013 "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
17014 "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
17015 "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
17016 "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
17017 "vmovupd %%ymm8, (%0)\n"
17018 "vmovupd %%ymm9, (%1)\n"
17019 "vmovupd %%ymm10, (%2)\n"
17020 "vmovupd %%ymm11, (%3)\n"
17021 "vmovupd %%ymm12, (%4)\n"
17022 "vmovupd %%ymm13, (%5)\n"
17023 "vmovupd %%ymm14, (%6)\n"
17024 "vmovupd %%ymm15, (%7)\n"
17025 :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
17026 );
17027 }
17028 }
17029 return;
17030 }
17031 if (depth == 17) {
17032 helper_double_26_recursive(buf + 0, 14);
17033 helper_double_26_recursive(buf + 16384, 14);
17034 helper_double_26_recursive(buf + 32768, 14);
17035 helper_double_26_recursive(buf + 49152, 14);
17036 helper_double_26_recursive(buf + 65536, 14);
17037 helper_double_26_recursive(buf + 81920, 14);
17038 helper_double_26_recursive(buf + 98304, 14);
17039 helper_double_26_recursive(buf + 114688, 14);
17040 for (int j = 0; j < 131072; j += 131072) {
17041 for (int k = 0; k < 16384; k += 4) {
17042 __asm__ volatile (
17043 "vmovupd (%0), %%ymm0\n"
17044 "vmovupd (%1), %%ymm1\n"
17045 "vmovupd (%2), %%ymm2\n"
17046 "vmovupd (%3), %%ymm3\n"
17047 "vmovupd (%4), %%ymm4\n"
17048 "vmovupd (%5), %%ymm5\n"
17049 "vmovupd (%6), %%ymm6\n"
17050 "vmovupd (%7), %%ymm7\n"
17051 "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
17052 "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
17053 "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
17054 "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
17055 "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
17056 "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
17057 "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
17058 "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
17059 "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
17060 "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
17061 "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
17062 "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
17063 "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
17064 "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
17065 "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
17066 "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
17067 "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
17068 "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
17069 "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
17070 "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
17071 "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
17072 "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
17073 "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
17074 "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
17075 "vmovupd %%ymm8, (%0)\n"
17076 "vmovupd %%ymm9, (%1)\n"
17077 "vmovupd %%ymm10, (%2)\n"
17078 "vmovupd %%ymm11, (%3)\n"
17079 "vmovupd %%ymm12, (%4)\n"
17080 "vmovupd %%ymm13, (%5)\n"
17081 "vmovupd %%ymm14, (%6)\n"
17082 "vmovupd %%ymm15, (%7)\n"
17083 :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152), "r"(buf + j + k + 65536), "r"(buf + j + k + 81920), "r"(buf + j + k + 98304), "r"(buf + j + k + 114688) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
17084 );
17085 }
17086 }
17087 return;
17088 }
17089 if (depth == 20) {
17090 helper_double_26_recursive(buf + 0, 17);
17091 helper_double_26_recursive(buf + 131072, 17);
17092 helper_double_26_recursive(buf + 262144, 17);
17093 helper_double_26_recursive(buf + 393216, 17);
17094 helper_double_26_recursive(buf + 524288, 17);
17095 helper_double_26_recursive(buf + 655360, 17);
17096 helper_double_26_recursive(buf + 786432, 17);
17097 helper_double_26_recursive(buf + 917504, 17);
17098 for (int j = 0; j < 1048576; j += 1048576) {
17099 for (int k = 0; k < 131072; k += 4) {
17100 __asm__ volatile (
17101 "vmovupd (%0), %%ymm0\n"
17102 "vmovupd (%1), %%ymm1\n"
17103 "vmovupd (%2), %%ymm2\n"
17104 "vmovupd (%3), %%ymm3\n"
17105 "vmovupd (%4), %%ymm4\n"
17106 "vmovupd (%5), %%ymm5\n"
17107 "vmovupd (%6), %%ymm6\n"
17108 "vmovupd (%7), %%ymm7\n"
17109 "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
17110 "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
17111 "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
17112 "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
17113 "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
17114 "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
17115 "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
17116 "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
17117 "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
17118 "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
17119 "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
17120 "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
17121 "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
17122 "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
17123 "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
17124 "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
17125 "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
17126 "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
17127 "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
17128 "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
17129 "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
17130 "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
17131 "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
17132 "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
17133 "vmovupd %%ymm8, (%0)\n"
17134 "vmovupd %%ymm9, (%1)\n"
17135 "vmovupd %%ymm10, (%2)\n"
17136 "vmovupd %%ymm11, (%3)\n"
17137 "vmovupd %%ymm12, (%4)\n"
17138 "vmovupd %%ymm13, (%5)\n"
17139 "vmovupd %%ymm14, (%6)\n"
17140 "vmovupd %%ymm15, (%7)\n"
17141 :: "r"(buf + j + k + 0), "r"(buf + j + k + 131072), "r"(buf + j + k + 262144), "r"(buf + j + k + 393216), "r"(buf + j + k + 524288), "r"(buf + j + k + 655360), "r"(buf + j + k + 786432), "r"(buf + j + k + 917504) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
17142 );
17143 }
17144 }
17145 return;
17146 }
17147 if (depth == 23) {
17148 helper_double_26_recursive(buf + 0, 20);
17149 helper_double_26_recursive(buf + 1048576, 20);
17150 helper_double_26_recursive(buf + 2097152, 20);
17151 helper_double_26_recursive(buf + 3145728, 20);
17152 helper_double_26_recursive(buf + 4194304, 20);
17153 helper_double_26_recursive(buf + 5242880, 20);
17154 helper_double_26_recursive(buf + 6291456, 20);
17155 helper_double_26_recursive(buf + 7340032, 20);
17156 for (int j = 0; j < 8388608; j += 8388608) {
17157 for (int k = 0; k < 1048576; k += 4) {
17158 __asm__ volatile (
17159 "vmovupd (%0), %%ymm0\n"
17160 "vmovupd (%1), %%ymm1\n"
17161 "vmovupd (%2), %%ymm2\n"
17162 "vmovupd (%3), %%ymm3\n"
17163 "vmovupd (%4), %%ymm4\n"
17164 "vmovupd (%5), %%ymm5\n"
17165 "vmovupd (%6), %%ymm6\n"
17166 "vmovupd (%7), %%ymm7\n"
17167 "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
17168 "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
17169 "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
17170 "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
17171 "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
17172 "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
17173 "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
17174 "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
17175 "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
17176 "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
17177 "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
17178 "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
17179 "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
17180 "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
17181 "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
17182 "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
17183 "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
17184 "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
17185 "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
17186 "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
17187 "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
17188 "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
17189 "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
17190 "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
17191 "vmovupd %%ymm8, (%0)\n"
17192 "vmovupd %%ymm9, (%1)\n"
17193 "vmovupd %%ymm10, (%2)\n"
17194 "vmovupd %%ymm11, (%3)\n"
17195 "vmovupd %%ymm12, (%4)\n"
17196 "vmovupd %%ymm13, (%5)\n"
17197 "vmovupd %%ymm14, (%6)\n"
17198 "vmovupd %%ymm15, (%7)\n"
17199 :: "r"(buf + j + k + 0), "r"(buf + j + k + 1048576), "r"(buf + j + k + 2097152), "r"(buf + j + k + 3145728), "r"(buf + j + k + 4194304), "r"(buf + j + k + 5242880), "r"(buf + j + k + 6291456), "r"(buf + j + k + 7340032) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
17200 );
17201 }
17202 }
17203 return;
17204 }
17205 if (depth == 26) {
17206 helper_double_26_recursive(buf + 0, 23);
17207 helper_double_26_recursive(buf + 8388608, 23);
17208 helper_double_26_recursive(buf + 16777216, 23);
17209 helper_double_26_recursive(buf + 25165824, 23);
17210 helper_double_26_recursive(buf + 33554432, 23);
17211 helper_double_26_recursive(buf + 41943040, 23);
17212 helper_double_26_recursive(buf + 50331648, 23);
17213 helper_double_26_recursive(buf + 58720256, 23);
17214 for (int j = 0; j < 67108864; j += 67108864) {
17215 for (int k = 0; k < 8388608; k += 4) {
17216 __asm__ volatile (
17217 "vmovupd (%0), %%ymm0\n"
17218 "vmovupd (%1), %%ymm1\n"
17219 "vmovupd (%2), %%ymm2\n"
17220 "vmovupd (%3), %%ymm3\n"
17221 "vmovupd (%4), %%ymm4\n"
17222 "vmovupd (%5), %%ymm5\n"
17223 "vmovupd (%6), %%ymm6\n"
17224 "vmovupd (%7), %%ymm7\n"
17225 "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
17226 "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
17227 "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
17228 "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
17229 "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
17230 "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
17231 "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
17232 "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
17233 "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
17234 "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
17235 "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
17236 "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
17237 "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
17238 "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
17239 "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
17240 "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
17241 "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
17242 "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
17243 "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
17244 "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
17245 "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
17246 "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
17247 "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
17248 "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
17249 "vmovupd %%ymm8, (%0)\n"
17250 "vmovupd %%ymm9, (%1)\n"
17251 "vmovupd %%ymm10, (%2)\n"
17252 "vmovupd %%ymm11, (%3)\n"
17253 "vmovupd %%ymm12, (%4)\n"
17254 "vmovupd %%ymm13, (%5)\n"
17255 "vmovupd %%ymm14, (%6)\n"
17256 "vmovupd %%ymm15, (%7)\n"
17257 :: "r"(buf + j + k + 0), "r"(buf + j + k + 8388608), "r"(buf + j + k + 16777216), "r"(buf + j + k + 25165824), "r"(buf + j + k + 33554432), "r"(buf + j + k + 41943040), "r"(buf + j + k + 50331648), "r"(buf + j + k + 58720256) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
17258 );
17259 }
17260 }
17261 return;
17262 }
17263 }
17264 void helper_double_26(double *buf);
/*
 * Apply the in-place size-2^26 (67,108,864-element) double-precision
 * Walsh-Hadamard transform to buf.
 *
 * Thin public entry point: all of the actual butterfly work lives in
 * helper_double_26_recursive(), which dispatches on the log2 transform
 * size. buf must point to at least 2^26 doubles; the result is the
 * unnormalized transform written back over the input.
 */
void helper_double_26(double *buf) {
  /* Top-level call: the full transform depth for this helper is 26. */
  const int log2_size = 26;
  helper_double_26_recursive(buf, log2_size);
}
17268 void helper_double_27_recursive(double *buf, int depth);
helper_double_27_recursive(double * buf,int depth)17269 void helper_double_27_recursive(double *buf, int depth) {
17270 if (depth == 9) {
17271 for (int j = 0; j < 512; j += 32) {
17272 for (int k = 0; k < 4; k += 4) {
17273 __asm__ volatile (
17274 "vmovupd (%0), %%ymm0\n"
17275 "vmovupd (%1), %%ymm1\n"
17276 "vmovupd (%2), %%ymm2\n"
17277 "vmovupd (%3), %%ymm3\n"
17278 "vmovupd (%4), %%ymm4\n"
17279 "vmovupd (%5), %%ymm5\n"
17280 "vmovupd (%6), %%ymm6\n"
17281 "vmovupd (%7), %%ymm7\n"
17282 "vpermilpd $0, %%ymm0, %%ymm8\n"
17283 "vpermilpd $15, %%ymm0, %%ymm9\n"
17284 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
17285 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
17286 "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n"
17287 "vpermilpd $0, %%ymm1, %%ymm8\n"
17288 "vpermilpd $15, %%ymm1, %%ymm9\n"
17289 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
17290 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
17291 "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n"
17292 "vpermilpd $0, %%ymm2, %%ymm8\n"
17293 "vpermilpd $15, %%ymm2, %%ymm9\n"
17294 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
17295 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
17296 "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n"
17297 "vpermilpd $0, %%ymm3, %%ymm8\n"
17298 "vpermilpd $15, %%ymm3, %%ymm9\n"
17299 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
17300 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
17301 "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n"
17302 "vpermilpd $0, %%ymm4, %%ymm8\n"
17303 "vpermilpd $15, %%ymm4, %%ymm9\n"
17304 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
17305 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
17306 "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n"
17307 "vpermilpd $0, %%ymm5, %%ymm8\n"
17308 "vpermilpd $15, %%ymm5, %%ymm9\n"
17309 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
17310 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
17311 "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n"
17312 "vpermilpd $0, %%ymm6, %%ymm8\n"
17313 "vpermilpd $15, %%ymm6, %%ymm9\n"
17314 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
17315 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
17316 "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n"
17317 "vpermilpd $0, %%ymm7, %%ymm8\n"
17318 "vpermilpd $15, %%ymm7, %%ymm9\n"
17319 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
17320 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
17321 "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n"
17322 "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n"
17323 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
17324 "vsubpd %%ymm0, %%ymm9, %%ymm10\n"
17325 "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n"
17326 "vaddpd %%ymm11, %%ymm8, %%ymm0\n"
17327 "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n"
17328 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
17329 "vsubpd %%ymm1, %%ymm9, %%ymm10\n"
17330 "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n"
17331 "vaddpd %%ymm11, %%ymm8, %%ymm1\n"
17332 "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n"
17333 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
17334 "vsubpd %%ymm2, %%ymm9, %%ymm10\n"
17335 "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n"
17336 "vaddpd %%ymm11, %%ymm8, %%ymm2\n"
17337 "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n"
17338 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
17339 "vsubpd %%ymm3, %%ymm9, %%ymm10\n"
17340 "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n"
17341 "vaddpd %%ymm11, %%ymm8, %%ymm3\n"
17342 "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n"
17343 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
17344 "vsubpd %%ymm4, %%ymm9, %%ymm10\n"
17345 "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n"
17346 "vaddpd %%ymm11, %%ymm8, %%ymm4\n"
17347 "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n"
17348 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
17349 "vsubpd %%ymm5, %%ymm9, %%ymm10\n"
17350 "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n"
17351 "vaddpd %%ymm11, %%ymm8, %%ymm5\n"
17352 "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n"
17353 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
17354 "vsubpd %%ymm6, %%ymm9, %%ymm10\n"
17355 "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n"
17356 "vaddpd %%ymm11, %%ymm8, %%ymm6\n"
17357 "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n"
17358 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
17359 "vsubpd %%ymm7, %%ymm9, %%ymm10\n"
17360 "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n"
17361 "vaddpd %%ymm11, %%ymm8, %%ymm7\n"
17362 "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
17363 "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
17364 "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
17365 "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
17366 "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
17367 "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
17368 "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
17369 "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
17370 "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
17371 "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
17372 "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
17373 "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
17374 "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
17375 "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
17376 "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
17377 "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
17378 "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
17379 "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
17380 "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
17381 "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
17382 "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
17383 "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
17384 "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
17385 "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
17386 "vmovupd %%ymm8, (%0)\n"
17387 "vmovupd %%ymm9, (%1)\n"
17388 "vmovupd %%ymm10, (%2)\n"
17389 "vmovupd %%ymm11, (%3)\n"
17390 "vmovupd %%ymm12, (%4)\n"
17391 "vmovupd %%ymm13, (%5)\n"
17392 "vmovupd %%ymm14, (%6)\n"
17393 "vmovupd %%ymm15, (%7)\n"
17394 :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
17395 );
17396 }
17397 }
17398 for (int j = 0; j < 512; j += 256) {
17399 for (int k = 0; k < 32; k += 4) {
17400 __asm__ volatile (
17401 "vmovupd (%0), %%ymm0\n"
17402 "vmovupd (%1), %%ymm1\n"
17403 "vmovupd (%2), %%ymm2\n"
17404 "vmovupd (%3), %%ymm3\n"
17405 "vmovupd (%4), %%ymm4\n"
17406 "vmovupd (%5), %%ymm5\n"
17407 "vmovupd (%6), %%ymm6\n"
17408 "vmovupd (%7), %%ymm7\n"
17409 "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
17410 "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
17411 "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
17412 "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
17413 "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
17414 "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
17415 "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
17416 "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
17417 "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
17418 "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
17419 "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
17420 "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
17421 "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
17422 "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
17423 "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
17424 "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
17425 "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
17426 "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
17427 "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
17428 "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
17429 "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
17430 "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
17431 "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
17432 "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
17433 "vmovupd %%ymm8, (%0)\n"
17434 "vmovupd %%ymm9, (%1)\n"
17435 "vmovupd %%ymm10, (%2)\n"
17436 "vmovupd %%ymm11, (%3)\n"
17437 "vmovupd %%ymm12, (%4)\n"
17438 "vmovupd %%ymm13, (%5)\n"
17439 "vmovupd %%ymm14, (%6)\n"
17440 "vmovupd %%ymm15, (%7)\n"
17441 :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
17442 );
17443 }
17444 }
17445 for (int j = 0; j < 512; j += 512) {
17446 for (int k = 0; k < 256; k += 4) {
17447 __asm__ volatile (
17448 "vmovupd (%0), %%ymm0\n"
17449 "vmovupd (%1), %%ymm1\n"
17450 "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
17451 "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
17452 "vmovupd %%ymm8, (%0)\n"
17453 "vmovupd %%ymm9, (%1)\n"
17454 :: "r"(buf + j + k + 0), "r"(buf + j + k + 256) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
17455 );
17456 }
17457 }
17458 return;
17459 }
17460 if (depth == 12) {
17461 helper_double_27_recursive(buf + 0, 9);
17462 helper_double_27_recursive(buf + 512, 9);
17463 helper_double_27_recursive(buf + 1024, 9);
17464 helper_double_27_recursive(buf + 1536, 9);
17465 helper_double_27_recursive(buf + 2048, 9);
17466 helper_double_27_recursive(buf + 2560, 9);
17467 helper_double_27_recursive(buf + 3072, 9);
17468 helper_double_27_recursive(buf + 3584, 9);
17469 for (int j = 0; j < 4096; j += 4096) {
17470 for (int k = 0; k < 512; k += 4) {
17471 __asm__ volatile (
17472 "vmovupd (%0), %%ymm0\n"
17473 "vmovupd (%1), %%ymm1\n"
17474 "vmovupd (%2), %%ymm2\n"
17475 "vmovupd (%3), %%ymm3\n"
17476 "vmovupd (%4), %%ymm4\n"
17477 "vmovupd (%5), %%ymm5\n"
17478 "vmovupd (%6), %%ymm6\n"
17479 "vmovupd (%7), %%ymm7\n"
17480 "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
17481 "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
17482 "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
17483 "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
17484 "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
17485 "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
17486 "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
17487 "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
17488 "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
17489 "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
17490 "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
17491 "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
17492 "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
17493 "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
17494 "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
17495 "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
17496 "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
17497 "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
17498 "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
17499 "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
17500 "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
17501 "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
17502 "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
17503 "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
17504 "vmovupd %%ymm8, (%0)\n"
17505 "vmovupd %%ymm9, (%1)\n"
17506 "vmovupd %%ymm10, (%2)\n"
17507 "vmovupd %%ymm11, (%3)\n"
17508 "vmovupd %%ymm12, (%4)\n"
17509 "vmovupd %%ymm13, (%5)\n"
17510 "vmovupd %%ymm14, (%6)\n"
17511 "vmovupd %%ymm15, (%7)\n"
17512 :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
17513 );
17514 }
17515 }
17516 return;
17517 }
17518 if (depth == 15) {
17519 helper_double_27_recursive(buf + 0, 12);
17520 helper_double_27_recursive(buf + 4096, 12);
17521 helper_double_27_recursive(buf + 8192, 12);
17522 helper_double_27_recursive(buf + 12288, 12);
17523 helper_double_27_recursive(buf + 16384, 12);
17524 helper_double_27_recursive(buf + 20480, 12);
17525 helper_double_27_recursive(buf + 24576, 12);
17526 helper_double_27_recursive(buf + 28672, 12);
17527 for (int j = 0; j < 32768; j += 32768) {
17528 for (int k = 0; k < 4096; k += 4) {
17529 __asm__ volatile (
17530 "vmovupd (%0), %%ymm0\n"
17531 "vmovupd (%1), %%ymm1\n"
17532 "vmovupd (%2), %%ymm2\n"
17533 "vmovupd (%3), %%ymm3\n"
17534 "vmovupd (%4), %%ymm4\n"
17535 "vmovupd (%5), %%ymm5\n"
17536 "vmovupd (%6), %%ymm6\n"
17537 "vmovupd (%7), %%ymm7\n"
17538 "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
17539 "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
17540 "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
17541 "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
17542 "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
17543 "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
17544 "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
17545 "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
17546 "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
17547 "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
17548 "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
17549 "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
17550 "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
17551 "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
17552 "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
17553 "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
17554 "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
17555 "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
17556 "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
17557 "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
17558 "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
17559 "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
17560 "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
17561 "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
17562 "vmovupd %%ymm8, (%0)\n"
17563 "vmovupd %%ymm9, (%1)\n"
17564 "vmovupd %%ymm10, (%2)\n"
17565 "vmovupd %%ymm11, (%3)\n"
17566 "vmovupd %%ymm12, (%4)\n"
17567 "vmovupd %%ymm13, (%5)\n"
17568 "vmovupd %%ymm14, (%6)\n"
17569 "vmovupd %%ymm15, (%7)\n"
17570 :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
17571 );
17572 }
17573 }
17574 return;
17575 }
17576 if (depth == 18) {
17577 helper_double_27_recursive(buf + 0, 15);
17578 helper_double_27_recursive(buf + 32768, 15);
17579 helper_double_27_recursive(buf + 65536, 15);
17580 helper_double_27_recursive(buf + 98304, 15);
17581 helper_double_27_recursive(buf + 131072, 15);
17582 helper_double_27_recursive(buf + 163840, 15);
17583 helper_double_27_recursive(buf + 196608, 15);
17584 helper_double_27_recursive(buf + 229376, 15);
17585 for (int j = 0; j < 262144; j += 262144) {
17586 for (int k = 0; k < 32768; k += 4) {
17587 __asm__ volatile (
17588 "vmovupd (%0), %%ymm0\n"
17589 "vmovupd (%1), %%ymm1\n"
17590 "vmovupd (%2), %%ymm2\n"
17591 "vmovupd (%3), %%ymm3\n"
17592 "vmovupd (%4), %%ymm4\n"
17593 "vmovupd (%5), %%ymm5\n"
17594 "vmovupd (%6), %%ymm6\n"
17595 "vmovupd (%7), %%ymm7\n"
17596 "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
17597 "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
17598 "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
17599 "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
17600 "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
17601 "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
17602 "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
17603 "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
17604 "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
17605 "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
17606 "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
17607 "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
17608 "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
17609 "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
17610 "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
17611 "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
17612 "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
17613 "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
17614 "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
17615 "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
17616 "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
17617 "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
17618 "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
17619 "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
17620 "vmovupd %%ymm8, (%0)\n"
17621 "vmovupd %%ymm9, (%1)\n"
17622 "vmovupd %%ymm10, (%2)\n"
17623 "vmovupd %%ymm11, (%3)\n"
17624 "vmovupd %%ymm12, (%4)\n"
17625 "vmovupd %%ymm13, (%5)\n"
17626 "vmovupd %%ymm14, (%6)\n"
17627 "vmovupd %%ymm15, (%7)\n"
17628 :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
17629 );
17630 }
17631 }
17632 return;
17633 }
17634 if (depth == 21) {
17635 helper_double_27_recursive(buf + 0, 18);
17636 helper_double_27_recursive(buf + 262144, 18);
17637 helper_double_27_recursive(buf + 524288, 18);
17638 helper_double_27_recursive(buf + 786432, 18);
17639 helper_double_27_recursive(buf + 1048576, 18);
17640 helper_double_27_recursive(buf + 1310720, 18);
17641 helper_double_27_recursive(buf + 1572864, 18);
17642 helper_double_27_recursive(buf + 1835008, 18);
17643 for (int j = 0; j < 2097152; j += 2097152) {
17644 for (int k = 0; k < 262144; k += 4) {
17645 __asm__ volatile (
17646 "vmovupd (%0), %%ymm0\n"
17647 "vmovupd (%1), %%ymm1\n"
17648 "vmovupd (%2), %%ymm2\n"
17649 "vmovupd (%3), %%ymm3\n"
17650 "vmovupd (%4), %%ymm4\n"
17651 "vmovupd (%5), %%ymm5\n"
17652 "vmovupd (%6), %%ymm6\n"
17653 "vmovupd (%7), %%ymm7\n"
17654 "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
17655 "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
17656 "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
17657 "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
17658 "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
17659 "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
17660 "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
17661 "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
17662 "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
17663 "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
17664 "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
17665 "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
17666 "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
17667 "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
17668 "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
17669 "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
17670 "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
17671 "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
17672 "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
17673 "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
17674 "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
17675 "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
17676 "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
17677 "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
17678 "vmovupd %%ymm8, (%0)\n"
17679 "vmovupd %%ymm9, (%1)\n"
17680 "vmovupd %%ymm10, (%2)\n"
17681 "vmovupd %%ymm11, (%3)\n"
17682 "vmovupd %%ymm12, (%4)\n"
17683 "vmovupd %%ymm13, (%5)\n"
17684 "vmovupd %%ymm14, (%6)\n"
17685 "vmovupd %%ymm15, (%7)\n"
17686 :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1310720), "r"(buf + j + k + 1572864), "r"(buf + j + k + 1835008) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
17687 );
17688 }
17689 }
17690 return;
17691 }
17692 if (depth == 24) {
17693 helper_double_27_recursive(buf + 0, 21);
17694 helper_double_27_recursive(buf + 2097152, 21);
17695 helper_double_27_recursive(buf + 4194304, 21);
17696 helper_double_27_recursive(buf + 6291456, 21);
17697 helper_double_27_recursive(buf + 8388608, 21);
17698 helper_double_27_recursive(buf + 10485760, 21);
17699 helper_double_27_recursive(buf + 12582912, 21);
17700 helper_double_27_recursive(buf + 14680064, 21);
17701 for (int j = 0; j < 16777216; j += 16777216) {
17702 for (int k = 0; k < 2097152; k += 4) {
17703 __asm__ volatile (
17704 "vmovupd (%0), %%ymm0\n"
17705 "vmovupd (%1), %%ymm1\n"
17706 "vmovupd (%2), %%ymm2\n"
17707 "vmovupd (%3), %%ymm3\n"
17708 "vmovupd (%4), %%ymm4\n"
17709 "vmovupd (%5), %%ymm5\n"
17710 "vmovupd (%6), %%ymm6\n"
17711 "vmovupd (%7), %%ymm7\n"
17712 "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
17713 "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
17714 "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
17715 "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
17716 "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
17717 "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
17718 "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
17719 "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
17720 "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
17721 "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
17722 "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
17723 "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
17724 "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
17725 "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
17726 "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
17727 "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
17728 "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
17729 "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
17730 "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
17731 "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
17732 "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
17733 "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
17734 "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
17735 "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
17736 "vmovupd %%ymm8, (%0)\n"
17737 "vmovupd %%ymm9, (%1)\n"
17738 "vmovupd %%ymm10, (%2)\n"
17739 "vmovupd %%ymm11, (%3)\n"
17740 "vmovupd %%ymm12, (%4)\n"
17741 "vmovupd %%ymm13, (%5)\n"
17742 "vmovupd %%ymm14, (%6)\n"
17743 "vmovupd %%ymm15, (%7)\n"
17744 :: "r"(buf + j + k + 0), "r"(buf + j + k + 2097152), "r"(buf + j + k + 4194304), "r"(buf + j + k + 6291456), "r"(buf + j + k + 8388608), "r"(buf + j + k + 10485760), "r"(buf + j + k + 12582912), "r"(buf + j + k + 14680064) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
17745 );
17746 }
17747 }
17748 return;
17749 }
17750 if (depth == 27) {
17751 helper_double_27_recursive(buf + 0, 24);
17752 helper_double_27_recursive(buf + 16777216, 24);
17753 helper_double_27_recursive(buf + 33554432, 24);
17754 helper_double_27_recursive(buf + 50331648, 24);
17755 helper_double_27_recursive(buf + 67108864, 24);
17756 helper_double_27_recursive(buf + 83886080, 24);
17757 helper_double_27_recursive(buf + 100663296, 24);
17758 helper_double_27_recursive(buf + 117440512, 24);
17759 for (int j = 0; j < 134217728; j += 134217728) {
17760 for (int k = 0; k < 16777216; k += 4) {
17761 __asm__ volatile (
17762 "vmovupd (%0), %%ymm0\n"
17763 "vmovupd (%1), %%ymm1\n"
17764 "vmovupd (%2), %%ymm2\n"
17765 "vmovupd (%3), %%ymm3\n"
17766 "vmovupd (%4), %%ymm4\n"
17767 "vmovupd (%5), %%ymm5\n"
17768 "vmovupd (%6), %%ymm6\n"
17769 "vmovupd (%7), %%ymm7\n"
17770 "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
17771 "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
17772 "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
17773 "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
17774 "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
17775 "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
17776 "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
17777 "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
17778 "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
17779 "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
17780 "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
17781 "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
17782 "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
17783 "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
17784 "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
17785 "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
17786 "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
17787 "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
17788 "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
17789 "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
17790 "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
17791 "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
17792 "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
17793 "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
17794 "vmovupd %%ymm8, (%0)\n"
17795 "vmovupd %%ymm9, (%1)\n"
17796 "vmovupd %%ymm10, (%2)\n"
17797 "vmovupd %%ymm11, (%3)\n"
17798 "vmovupd %%ymm12, (%4)\n"
17799 "vmovupd %%ymm13, (%5)\n"
17800 "vmovupd %%ymm14, (%6)\n"
17801 "vmovupd %%ymm15, (%7)\n"
17802 :: "r"(buf + j + k + 0), "r"(buf + j + k + 16777216), "r"(buf + j + k + 33554432), "r"(buf + j + k + 50331648), "r"(buf + j + k + 67108864), "r"(buf + j + k + 83886080), "r"(buf + j + k + 100663296), "r"(buf + j + k + 117440512) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
17803 );
17804 }
17805 }
17806 return;
17807 }
17808 }
void helper_double_27(double *buf);
/*
 * Public entry point for the 2^27-element transform.
 *
 * buf: pointer to 134217728 (= 2^27) contiguous doubles, transformed
 *      in place by delegating to the recursive worker at full depth.
 */
void helper_double_27(double *buf) {
  helper_double_27_recursive(buf, 27);
}
void helper_double_28_recursive(double *buf, int depth);
/*
 * Recursive worker for the in-place 2^28-point Walsh-Hadamard-style
 * transform (unnormalized butterfly network: every stage replaces pairs
 * (u, v) by (u + v, u - v), as in the scalar helpers at the top of this
 * file).
 *
 * buf:   pointer to 2^depth contiguous doubles, transformed in place.
 * depth: log2 of the block size. Only the depths produced by the
 *        generator are handled (11, 14, 17, 20, 23, 26, 28); any other
 *        value falls through and leaves buf untouched.
 *
 * Structure: depth 11 is the cache-friendly base case (a full 2048-point
 * transform done with AVX in three radix-8 passes). Each larger depth
 * first transforms its eight (four, for the final depth-28 step)
 * sub-blocks recursively, then merges them with add/sub butterflies,
 * four doubles (one ymm register) per pointer operand per iteration.
 * All asm blocks read and write memory through the pointer operands,
 * hence the "memory" clobber.
 */
void helper_double_28_recursive(double *buf, int depth) {
  if (depth == 11) {
    /* Base case, pass 1: stages 1-5 on each group of 32 doubles.
       Per register: vpermilpd $0 / $15 split even/odd pair elements and
       vaddsubpd of the negated odds yields [a0+a1, a0-a1, a2+a3, a2-a3]
       (stage of size 2); then vperm2f128 $0 / $49 with a negated copy
       yields [b0+b2, b1+b3, b0-b2, b1-b3] (stage of size 4). The
       explicit vaddpd/vsubpd tree then does stages of size 8/16/32
       across the eight registers. */
    for (int j = 0; j < 2048; j += 32) {
      for (int k = 0; k < 4; k += 4) {
        __asm__ volatile (
            "vmovupd (%0), %%ymm0\n"
            "vmovupd (%1), %%ymm1\n"
            "vmovupd (%2), %%ymm2\n"
            "vmovupd (%3), %%ymm3\n"
            "vmovupd (%4), %%ymm4\n"
            "vmovupd (%5), %%ymm5\n"
            "vmovupd (%6), %%ymm6\n"
            "vmovupd (%7), %%ymm7\n"
            /* In-register size-2 butterfly for each of ymm0..ymm7. */
            "vpermilpd $0, %%ymm0, %%ymm8\n"
            "vpermilpd $15, %%ymm0, %%ymm9\n"
            "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
            "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
            "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n"
            "vpermilpd $0, %%ymm1, %%ymm8\n"
            "vpermilpd $15, %%ymm1, %%ymm9\n"
            "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
            "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
            "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n"
            "vpermilpd $0, %%ymm2, %%ymm8\n"
            "vpermilpd $15, %%ymm2, %%ymm9\n"
            "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
            "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
            "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n"
            "vpermilpd $0, %%ymm3, %%ymm8\n"
            "vpermilpd $15, %%ymm3, %%ymm9\n"
            "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
            "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
            "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n"
            "vpermilpd $0, %%ymm4, %%ymm8\n"
            "vpermilpd $15, %%ymm4, %%ymm9\n"
            "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
            "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
            "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n"
            "vpermilpd $0, %%ymm5, %%ymm8\n"
            "vpermilpd $15, %%ymm5, %%ymm9\n"
            "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
            "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
            "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n"
            "vpermilpd $0, %%ymm6, %%ymm8\n"
            "vpermilpd $15, %%ymm6, %%ymm9\n"
            "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
            "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
            "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n"
            "vpermilpd $0, %%ymm7, %%ymm8\n"
            "vpermilpd $15, %%ymm7, %%ymm9\n"
            "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
            "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
            "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n"
            /* In-register size-4 (cross-lane) butterfly for each register. */
            "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n"
            "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
            "vsubpd %%ymm0, %%ymm9, %%ymm10\n"
            "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n"
            "vaddpd %%ymm11, %%ymm8, %%ymm0\n"
            "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n"
            "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
            "vsubpd %%ymm1, %%ymm9, %%ymm10\n"
            "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n"
            "vaddpd %%ymm11, %%ymm8, %%ymm1\n"
            "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n"
            "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
            "vsubpd %%ymm2, %%ymm9, %%ymm10\n"
            "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n"
            "vaddpd %%ymm11, %%ymm8, %%ymm2\n"
            "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n"
            "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
            "vsubpd %%ymm3, %%ymm9, %%ymm10\n"
            "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n"
            "vaddpd %%ymm11, %%ymm8, %%ymm3\n"
            "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n"
            "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
            "vsubpd %%ymm4, %%ymm9, %%ymm10\n"
            "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n"
            "vaddpd %%ymm11, %%ymm8, %%ymm4\n"
            "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n"
            "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
            "vsubpd %%ymm5, %%ymm9, %%ymm10\n"
            "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n"
            "vaddpd %%ymm11, %%ymm8, %%ymm5\n"
            "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n"
            "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
            "vsubpd %%ymm6, %%ymm9, %%ymm10\n"
            "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n"
            "vaddpd %%ymm11, %%ymm8, %%ymm6\n"
            "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n"
            "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
            "vsubpd %%ymm7, %%ymm9, %%ymm10\n"
            "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n"
            "vaddpd %%ymm11, %%ymm8, %%ymm7\n"
            /* Three radix-2 stages across the eight registers (sizes 8/16/32). */
            "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
            "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
            "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
            "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
            "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
            "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
            "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
            "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
            "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
            "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
            "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
            "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
            "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
            "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
            "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
            "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
            "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
            "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
            "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
            "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
            "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
            "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
            "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
            "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
            "vmovupd %%ymm8, (%0)\n"
            "vmovupd %%ymm9, (%1)\n"
            "vmovupd %%ymm10, (%2)\n"
            "vmovupd %%ymm11, (%3)\n"
            "vmovupd %%ymm12, (%4)\n"
            "vmovupd %%ymm13, (%5)\n"
            "vmovupd %%ymm14, (%6)\n"
            "vmovupd %%ymm15, (%7)\n"
            :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    /* Pass 2: stages 6-8 -- merge eight 32-double blocks into each
       256-double block with the same radix-8 add/sub tree. */
    for (int j = 0; j < 2048; j += 256) {
      for (int k = 0; k < 32; k += 4) {
        __asm__ volatile (
            "vmovupd (%0), %%ymm0\n"
            "vmovupd (%1), %%ymm1\n"
            "vmovupd (%2), %%ymm2\n"
            "vmovupd (%3), %%ymm3\n"
            "vmovupd (%4), %%ymm4\n"
            "vmovupd (%5), %%ymm5\n"
            "vmovupd (%6), %%ymm6\n"
            "vmovupd (%7), %%ymm7\n"
            "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
            "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
            "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
            "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
            "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
            "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
            "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
            "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
            "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
            "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
            "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
            "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
            "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
            "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
            "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
            "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
            "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
            "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
            "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
            "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
            "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
            "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
            "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
            "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
            "vmovupd %%ymm8, (%0)\n"
            "vmovupd %%ymm9, (%1)\n"
            "vmovupd %%ymm10, (%2)\n"
            "vmovupd %%ymm11, (%3)\n"
            "vmovupd %%ymm12, (%4)\n"
            "vmovupd %%ymm13, (%5)\n"
            "vmovupd %%ymm14, (%6)\n"
            "vmovupd %%ymm15, (%7)\n"
            :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    /* Pass 3: stages 9-11 -- merge eight 256-double blocks into the
       full 2048-double block. */
    for (int j = 0; j < 2048; j += 2048) {
      for (int k = 0; k < 256; k += 4) {
        __asm__ volatile (
            "vmovupd (%0), %%ymm0\n"
            "vmovupd (%1), %%ymm1\n"
            "vmovupd (%2), %%ymm2\n"
            "vmovupd (%3), %%ymm3\n"
            "vmovupd (%4), %%ymm4\n"
            "vmovupd (%5), %%ymm5\n"
            "vmovupd (%6), %%ymm6\n"
            "vmovupd (%7), %%ymm7\n"
            "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
            "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
            "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
            "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
            "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
            "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
            "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
            "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
            "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
            "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
            "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
            "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
            "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
            "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
            "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
            "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
            "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
            "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
            "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
            "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
            "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
            "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
            "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
            "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
            "vmovupd %%ymm8, (%0)\n"
            "vmovupd %%ymm9, (%1)\n"
            "vmovupd %%ymm10, (%2)\n"
            "vmovupd %%ymm11, (%3)\n"
            "vmovupd %%ymm12, (%4)\n"
            "vmovupd %%ymm13, (%5)\n"
            "vmovupd %%ymm14, (%6)\n"
            "vmovupd %%ymm15, (%7)\n"
            :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 14) {
    /* Transform eight 2^11 sub-blocks, then three merge stages (radix-8). */
    helper_double_28_recursive(buf + 0, 11);
    helper_double_28_recursive(buf + 2048, 11);
    helper_double_28_recursive(buf + 4096, 11);
    helper_double_28_recursive(buf + 6144, 11);
    helper_double_28_recursive(buf + 8192, 11);
    helper_double_28_recursive(buf + 10240, 11);
    helper_double_28_recursive(buf + 12288, 11);
    helper_double_28_recursive(buf + 14336, 11);
    for (int j = 0; j < 16384; j += 16384) {
      for (int k = 0; k < 2048; k += 4) {
        __asm__ volatile (
            "vmovupd (%0), %%ymm0\n"
            "vmovupd (%1), %%ymm1\n"
            "vmovupd (%2), %%ymm2\n"
            "vmovupd (%3), %%ymm3\n"
            "vmovupd (%4), %%ymm4\n"
            "vmovupd (%5), %%ymm5\n"
            "vmovupd (%6), %%ymm6\n"
            "vmovupd (%7), %%ymm7\n"
            "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
            "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
            "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
            "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
            "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
            "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
            "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
            "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
            "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
            "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
            "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
            "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
            "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
            "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
            "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
            "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
            "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
            "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
            "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
            "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
            "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
            "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
            "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
            "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
            "vmovupd %%ymm8, (%0)\n"
            "vmovupd %%ymm9, (%1)\n"
            "vmovupd %%ymm10, (%2)\n"
            "vmovupd %%ymm11, (%3)\n"
            "vmovupd %%ymm12, (%4)\n"
            "vmovupd %%ymm13, (%5)\n"
            "vmovupd %%ymm14, (%6)\n"
            "vmovupd %%ymm15, (%7)\n"
            :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 17) {
    /* Transform eight 2^14 sub-blocks, then merge (radix-8). */
    helper_double_28_recursive(buf + 0, 14);
    helper_double_28_recursive(buf + 16384, 14);
    helper_double_28_recursive(buf + 32768, 14);
    helper_double_28_recursive(buf + 49152, 14);
    helper_double_28_recursive(buf + 65536, 14);
    helper_double_28_recursive(buf + 81920, 14);
    helper_double_28_recursive(buf + 98304, 14);
    helper_double_28_recursive(buf + 114688, 14);
    for (int j = 0; j < 131072; j += 131072) {
      for (int k = 0; k < 16384; k += 4) {
        __asm__ volatile (
            "vmovupd (%0), %%ymm0\n"
            "vmovupd (%1), %%ymm1\n"
            "vmovupd (%2), %%ymm2\n"
            "vmovupd (%3), %%ymm3\n"
            "vmovupd (%4), %%ymm4\n"
            "vmovupd (%5), %%ymm5\n"
            "vmovupd (%6), %%ymm6\n"
            "vmovupd (%7), %%ymm7\n"
            "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
            "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
            "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
            "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
            "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
            "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
            "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
            "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
            "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
            "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
            "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
            "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
            "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
            "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
            "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
            "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
            "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
            "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
            "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
            "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
            "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
            "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
            "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
            "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
            "vmovupd %%ymm8, (%0)\n"
            "vmovupd %%ymm9, (%1)\n"
            "vmovupd %%ymm10, (%2)\n"
            "vmovupd %%ymm11, (%3)\n"
            "vmovupd %%ymm12, (%4)\n"
            "vmovupd %%ymm13, (%5)\n"
            "vmovupd %%ymm14, (%6)\n"
            "vmovupd %%ymm15, (%7)\n"
            :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152), "r"(buf + j + k + 65536), "r"(buf + j + k + 81920), "r"(buf + j + k + 98304), "r"(buf + j + k + 114688) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 20) {
    /* Transform eight 2^17 sub-blocks, then merge (radix-8). */
    helper_double_28_recursive(buf + 0, 17);
    helper_double_28_recursive(buf + 131072, 17);
    helper_double_28_recursive(buf + 262144, 17);
    helper_double_28_recursive(buf + 393216, 17);
    helper_double_28_recursive(buf + 524288, 17);
    helper_double_28_recursive(buf + 655360, 17);
    helper_double_28_recursive(buf + 786432, 17);
    helper_double_28_recursive(buf + 917504, 17);
    for (int j = 0; j < 1048576; j += 1048576) {
      for (int k = 0; k < 131072; k += 4) {
        __asm__ volatile (
            "vmovupd (%0), %%ymm0\n"
            "vmovupd (%1), %%ymm1\n"
            "vmovupd (%2), %%ymm2\n"
            "vmovupd (%3), %%ymm3\n"
            "vmovupd (%4), %%ymm4\n"
            "vmovupd (%5), %%ymm5\n"
            "vmovupd (%6), %%ymm6\n"
            "vmovupd (%7), %%ymm7\n"
            "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
            "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
            "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
            "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
            "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
            "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
            "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
            "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
            "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
            "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
            "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
            "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
            "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
            "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
            "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
            "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
            "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
            "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
            "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
            "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
            "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
            "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
            "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
            "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
            "vmovupd %%ymm8, (%0)\n"
            "vmovupd %%ymm9, (%1)\n"
            "vmovupd %%ymm10, (%2)\n"
            "vmovupd %%ymm11, (%3)\n"
            "vmovupd %%ymm12, (%4)\n"
            "vmovupd %%ymm13, (%5)\n"
            "vmovupd %%ymm14, (%6)\n"
            "vmovupd %%ymm15, (%7)\n"
            :: "r"(buf + j + k + 0), "r"(buf + j + k + 131072), "r"(buf + j + k + 262144), "r"(buf + j + k + 393216), "r"(buf + j + k + 524288), "r"(buf + j + k + 655360), "r"(buf + j + k + 786432), "r"(buf + j + k + 917504) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 23) {
    /* Transform eight 2^20 sub-blocks, then merge (radix-8). */
    helper_double_28_recursive(buf + 0, 20);
    helper_double_28_recursive(buf + 1048576, 20);
    helper_double_28_recursive(buf + 2097152, 20);
    helper_double_28_recursive(buf + 3145728, 20);
    helper_double_28_recursive(buf + 4194304, 20);
    helper_double_28_recursive(buf + 5242880, 20);
    helper_double_28_recursive(buf + 6291456, 20);
    helper_double_28_recursive(buf + 7340032, 20);
    for (int j = 0; j < 8388608; j += 8388608) {
      for (int k = 0; k < 1048576; k += 4) {
        __asm__ volatile (
            "vmovupd (%0), %%ymm0\n"
            "vmovupd (%1), %%ymm1\n"
            "vmovupd (%2), %%ymm2\n"
            "vmovupd (%3), %%ymm3\n"
            "vmovupd (%4), %%ymm4\n"
            "vmovupd (%5), %%ymm5\n"
            "vmovupd (%6), %%ymm6\n"
            "vmovupd (%7), %%ymm7\n"
            "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
            "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
            "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
            "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
            "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
            "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
            "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
            "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
            "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
            "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
            "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
            "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
            "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
            "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
            "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
            "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
            "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
            "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
            "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
            "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
            "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
            "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
            "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
            "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
            "vmovupd %%ymm8, (%0)\n"
            "vmovupd %%ymm9, (%1)\n"
            "vmovupd %%ymm10, (%2)\n"
            "vmovupd %%ymm11, (%3)\n"
            "vmovupd %%ymm12, (%4)\n"
            "vmovupd %%ymm13, (%5)\n"
            "vmovupd %%ymm14, (%6)\n"
            "vmovupd %%ymm15, (%7)\n"
            :: "r"(buf + j + k + 0), "r"(buf + j + k + 1048576), "r"(buf + j + k + 2097152), "r"(buf + j + k + 3145728), "r"(buf + j + k + 4194304), "r"(buf + j + k + 5242880), "r"(buf + j + k + 6291456), "r"(buf + j + k + 7340032) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 26) {
    /* Transform eight 2^23 sub-blocks, then merge (radix-8). */
    helper_double_28_recursive(buf + 0, 23);
    helper_double_28_recursive(buf + 8388608, 23);
    helper_double_28_recursive(buf + 16777216, 23);
    helper_double_28_recursive(buf + 25165824, 23);
    helper_double_28_recursive(buf + 33554432, 23);
    helper_double_28_recursive(buf + 41943040, 23);
    helper_double_28_recursive(buf + 50331648, 23);
    helper_double_28_recursive(buf + 58720256, 23);
    for (int j = 0; j < 67108864; j += 67108864) {
      for (int k = 0; k < 8388608; k += 4) {
        __asm__ volatile (
            "vmovupd (%0), %%ymm0\n"
            "vmovupd (%1), %%ymm1\n"
            "vmovupd (%2), %%ymm2\n"
            "vmovupd (%3), %%ymm3\n"
            "vmovupd (%4), %%ymm4\n"
            "vmovupd (%5), %%ymm5\n"
            "vmovupd (%6), %%ymm6\n"
            "vmovupd (%7), %%ymm7\n"
            "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
            "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
            "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
            "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
            "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
            "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
            "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
            "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
            "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
            "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
            "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
            "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
            "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
            "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
            "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
            "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
            "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
            "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
            "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
            "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
            "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
            "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
            "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
            "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
            "vmovupd %%ymm8, (%0)\n"
            "vmovupd %%ymm9, (%1)\n"
            "vmovupd %%ymm10, (%2)\n"
            "vmovupd %%ymm11, (%3)\n"
            "vmovupd %%ymm12, (%4)\n"
            "vmovupd %%ymm13, (%5)\n"
            "vmovupd %%ymm14, (%6)\n"
            "vmovupd %%ymm15, (%7)\n"
            :: "r"(buf + j + k + 0), "r"(buf + j + k + 8388608), "r"(buf + j + k + 16777216), "r"(buf + j + k + 25165824), "r"(buf + j + k + 33554432), "r"(buf + j + k + 41943040), "r"(buf + j + k + 50331648), "r"(buf + j + k + 58720256) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
  if (depth == 28) {
    /* Top level: 28 is not a multiple of 3, so the final step is
       radix-4 -- four 2^26 sub-blocks merged with two butterfly stages. */
    helper_double_28_recursive(buf + 0, 26);
    helper_double_28_recursive(buf + 67108864, 26);
    helper_double_28_recursive(buf + 134217728, 26);
    helper_double_28_recursive(buf + 201326592, 26);
    for (int j = 0; j < 268435456; j += 268435456) {
      for (int k = 0; k < 67108864; k += 4) {
        __asm__ volatile (
            "vmovupd (%0), %%ymm0\n"
            "vmovupd (%1), %%ymm1\n"
            "vmovupd (%2), %%ymm2\n"
            "vmovupd (%3), %%ymm3\n"
            "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
            "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
            "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
            "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
            "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
            "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
            "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
            "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
            "vmovupd %%ymm0, (%0)\n"
            "vmovupd %%ymm1, (%1)\n"
            "vmovupd %%ymm2, (%2)\n"
            "vmovupd %%ymm3, (%3)\n"
            :: "r"(buf + j + k + 0), "r"(buf + j + k + 67108864), "r"(buf + j + k + 134217728), "r"(buf + j + k + 201326592) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
        );
      }
    }
    return;
  }
}
void helper_double_28(double *buf);
/*
 * Public entry point for the 2^28-element transform.
 *
 * buf: pointer to 268435456 (= 2^28) contiguous doubles, transformed
 *      in place by delegating to the recursive worker at full depth.
 */
void helper_double_28(double *buf) {
  helper_double_28_recursive(buf, 28);
}
18364 void helper_double_29_recursive(double *buf, int depth);
helper_double_29_recursive(double * buf,int depth)18365 void helper_double_29_recursive(double *buf, int depth) {
18366 if (depth == 11) {
18367 for (int j = 0; j < 2048; j += 32) {
18368 for (int k = 0; k < 4; k += 4) {
18369 __asm__ volatile (
18370 "vmovupd (%0), %%ymm0\n"
18371 "vmovupd (%1), %%ymm1\n"
18372 "vmovupd (%2), %%ymm2\n"
18373 "vmovupd (%3), %%ymm3\n"
18374 "vmovupd (%4), %%ymm4\n"
18375 "vmovupd (%5), %%ymm5\n"
18376 "vmovupd (%6), %%ymm6\n"
18377 "vmovupd (%7), %%ymm7\n"
18378 "vpermilpd $0, %%ymm0, %%ymm8\n"
18379 "vpermilpd $15, %%ymm0, %%ymm9\n"
18380 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
18381 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
18382 "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n"
18383 "vpermilpd $0, %%ymm1, %%ymm8\n"
18384 "vpermilpd $15, %%ymm1, %%ymm9\n"
18385 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
18386 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
18387 "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n"
18388 "vpermilpd $0, %%ymm2, %%ymm8\n"
18389 "vpermilpd $15, %%ymm2, %%ymm9\n"
18390 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
18391 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
18392 "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n"
18393 "vpermilpd $0, %%ymm3, %%ymm8\n"
18394 "vpermilpd $15, %%ymm3, %%ymm9\n"
18395 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
18396 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
18397 "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n"
18398 "vpermilpd $0, %%ymm4, %%ymm8\n"
18399 "vpermilpd $15, %%ymm4, %%ymm9\n"
18400 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
18401 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
18402 "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n"
18403 "vpermilpd $0, %%ymm5, %%ymm8\n"
18404 "vpermilpd $15, %%ymm5, %%ymm9\n"
18405 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
18406 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
18407 "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n"
18408 "vpermilpd $0, %%ymm6, %%ymm8\n"
18409 "vpermilpd $15, %%ymm6, %%ymm9\n"
18410 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
18411 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
18412 "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n"
18413 "vpermilpd $0, %%ymm7, %%ymm8\n"
18414 "vpermilpd $15, %%ymm7, %%ymm9\n"
18415 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
18416 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
18417 "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n"
18418 "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n"
18419 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
18420 "vsubpd %%ymm0, %%ymm9, %%ymm10\n"
18421 "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n"
18422 "vaddpd %%ymm11, %%ymm8, %%ymm0\n"
18423 "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n"
18424 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
18425 "vsubpd %%ymm1, %%ymm9, %%ymm10\n"
18426 "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n"
18427 "vaddpd %%ymm11, %%ymm8, %%ymm1\n"
18428 "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n"
18429 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
18430 "vsubpd %%ymm2, %%ymm9, %%ymm10\n"
18431 "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n"
18432 "vaddpd %%ymm11, %%ymm8, %%ymm2\n"
18433 "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n"
18434 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
18435 "vsubpd %%ymm3, %%ymm9, %%ymm10\n"
18436 "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n"
18437 "vaddpd %%ymm11, %%ymm8, %%ymm3\n"
18438 "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n"
18439 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
18440 "vsubpd %%ymm4, %%ymm9, %%ymm10\n"
18441 "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n"
18442 "vaddpd %%ymm11, %%ymm8, %%ymm4\n"
18443 "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n"
18444 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
18445 "vsubpd %%ymm5, %%ymm9, %%ymm10\n"
18446 "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n"
18447 "vaddpd %%ymm11, %%ymm8, %%ymm5\n"
18448 "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n"
18449 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
18450 "vsubpd %%ymm6, %%ymm9, %%ymm10\n"
18451 "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n"
18452 "vaddpd %%ymm11, %%ymm8, %%ymm6\n"
18453 "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n"
18454 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
18455 "vsubpd %%ymm7, %%ymm9, %%ymm10\n"
18456 "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n"
18457 "vaddpd %%ymm11, %%ymm8, %%ymm7\n"
18458 "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
18459 "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
18460 "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
18461 "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
18462 "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
18463 "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
18464 "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
18465 "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
18466 "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
18467 "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
18468 "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
18469 "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
18470 "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
18471 "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
18472 "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
18473 "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
18474 "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
18475 "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
18476 "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
18477 "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
18478 "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
18479 "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
18480 "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
18481 "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
18482 "vmovupd %%ymm8, (%0)\n"
18483 "vmovupd %%ymm9, (%1)\n"
18484 "vmovupd %%ymm10, (%2)\n"
18485 "vmovupd %%ymm11, (%3)\n"
18486 "vmovupd %%ymm12, (%4)\n"
18487 "vmovupd %%ymm13, (%5)\n"
18488 "vmovupd %%ymm14, (%6)\n"
18489 "vmovupd %%ymm15, (%7)\n"
18490 :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
18491 );
18492 }
18493 }
18494 for (int j = 0; j < 2048; j += 256) {
18495 for (int k = 0; k < 32; k += 4) {
18496 __asm__ volatile (
18497 "vmovupd (%0), %%ymm0\n"
18498 "vmovupd (%1), %%ymm1\n"
18499 "vmovupd (%2), %%ymm2\n"
18500 "vmovupd (%3), %%ymm3\n"
18501 "vmovupd (%4), %%ymm4\n"
18502 "vmovupd (%5), %%ymm5\n"
18503 "vmovupd (%6), %%ymm6\n"
18504 "vmovupd (%7), %%ymm7\n"
18505 "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
18506 "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
18507 "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
18508 "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
18509 "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
18510 "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
18511 "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
18512 "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
18513 "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
18514 "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
18515 "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
18516 "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
18517 "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
18518 "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
18519 "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
18520 "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
18521 "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
18522 "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
18523 "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
18524 "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
18525 "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
18526 "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
18527 "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
18528 "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
18529 "vmovupd %%ymm8, (%0)\n"
18530 "vmovupd %%ymm9, (%1)\n"
18531 "vmovupd %%ymm10, (%2)\n"
18532 "vmovupd %%ymm11, (%3)\n"
18533 "vmovupd %%ymm12, (%4)\n"
18534 "vmovupd %%ymm13, (%5)\n"
18535 "vmovupd %%ymm14, (%6)\n"
18536 "vmovupd %%ymm15, (%7)\n"
18537 :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
18538 );
18539 }
18540 }
18541 for (int j = 0; j < 2048; j += 2048) {
18542 for (int k = 0; k < 256; k += 4) {
18543 __asm__ volatile (
18544 "vmovupd (%0), %%ymm0\n"
18545 "vmovupd (%1), %%ymm1\n"
18546 "vmovupd (%2), %%ymm2\n"
18547 "vmovupd (%3), %%ymm3\n"
18548 "vmovupd (%4), %%ymm4\n"
18549 "vmovupd (%5), %%ymm5\n"
18550 "vmovupd (%6), %%ymm6\n"
18551 "vmovupd (%7), %%ymm7\n"
18552 "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
18553 "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
18554 "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
18555 "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
18556 "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
18557 "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
18558 "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
18559 "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
18560 "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
18561 "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
18562 "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
18563 "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
18564 "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
18565 "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
18566 "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
18567 "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
18568 "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
18569 "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
18570 "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
18571 "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
18572 "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
18573 "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
18574 "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
18575 "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
18576 "vmovupd %%ymm8, (%0)\n"
18577 "vmovupd %%ymm9, (%1)\n"
18578 "vmovupd %%ymm10, (%2)\n"
18579 "vmovupd %%ymm11, (%3)\n"
18580 "vmovupd %%ymm12, (%4)\n"
18581 "vmovupd %%ymm13, (%5)\n"
18582 "vmovupd %%ymm14, (%6)\n"
18583 "vmovupd %%ymm15, (%7)\n"
18584 :: "r"(buf + j + k + 0), "r"(buf + j + k + 256), "r"(buf + j + k + 512), "r"(buf + j + k + 768), "r"(buf + j + k + 1024), "r"(buf + j + k + 1280), "r"(buf + j + k + 1536), "r"(buf + j + k + 1792) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
18585 );
18586 }
18587 }
18588 return;
18589 }
18590 if (depth == 14) {
18591 helper_double_29_recursive(buf + 0, 11);
18592 helper_double_29_recursive(buf + 2048, 11);
18593 helper_double_29_recursive(buf + 4096, 11);
18594 helper_double_29_recursive(buf + 6144, 11);
18595 helper_double_29_recursive(buf + 8192, 11);
18596 helper_double_29_recursive(buf + 10240, 11);
18597 helper_double_29_recursive(buf + 12288, 11);
18598 helper_double_29_recursive(buf + 14336, 11);
18599 for (int j = 0; j < 16384; j += 16384) {
18600 for (int k = 0; k < 2048; k += 4) {
18601 __asm__ volatile (
18602 "vmovupd (%0), %%ymm0\n"
18603 "vmovupd (%1), %%ymm1\n"
18604 "vmovupd (%2), %%ymm2\n"
18605 "vmovupd (%3), %%ymm3\n"
18606 "vmovupd (%4), %%ymm4\n"
18607 "vmovupd (%5), %%ymm5\n"
18608 "vmovupd (%6), %%ymm6\n"
18609 "vmovupd (%7), %%ymm7\n"
18610 "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
18611 "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
18612 "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
18613 "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
18614 "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
18615 "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
18616 "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
18617 "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
18618 "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
18619 "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
18620 "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
18621 "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
18622 "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
18623 "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
18624 "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
18625 "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
18626 "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
18627 "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
18628 "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
18629 "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
18630 "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
18631 "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
18632 "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
18633 "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
18634 "vmovupd %%ymm8, (%0)\n"
18635 "vmovupd %%ymm9, (%1)\n"
18636 "vmovupd %%ymm10, (%2)\n"
18637 "vmovupd %%ymm11, (%3)\n"
18638 "vmovupd %%ymm12, (%4)\n"
18639 "vmovupd %%ymm13, (%5)\n"
18640 "vmovupd %%ymm14, (%6)\n"
18641 "vmovupd %%ymm15, (%7)\n"
18642 :: "r"(buf + j + k + 0), "r"(buf + j + k + 2048), "r"(buf + j + k + 4096), "r"(buf + j + k + 6144), "r"(buf + j + k + 8192), "r"(buf + j + k + 10240), "r"(buf + j + k + 12288), "r"(buf + j + k + 14336) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
18643 );
18644 }
18645 }
18646 return;
18647 }
18648 if (depth == 17) {
18649 helper_double_29_recursive(buf + 0, 14);
18650 helper_double_29_recursive(buf + 16384, 14);
18651 helper_double_29_recursive(buf + 32768, 14);
18652 helper_double_29_recursive(buf + 49152, 14);
18653 helper_double_29_recursive(buf + 65536, 14);
18654 helper_double_29_recursive(buf + 81920, 14);
18655 helper_double_29_recursive(buf + 98304, 14);
18656 helper_double_29_recursive(buf + 114688, 14);
18657 for (int j = 0; j < 131072; j += 131072) {
18658 for (int k = 0; k < 16384; k += 4) {
18659 __asm__ volatile (
18660 "vmovupd (%0), %%ymm0\n"
18661 "vmovupd (%1), %%ymm1\n"
18662 "vmovupd (%2), %%ymm2\n"
18663 "vmovupd (%3), %%ymm3\n"
18664 "vmovupd (%4), %%ymm4\n"
18665 "vmovupd (%5), %%ymm5\n"
18666 "vmovupd (%6), %%ymm6\n"
18667 "vmovupd (%7), %%ymm7\n"
18668 "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
18669 "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
18670 "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
18671 "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
18672 "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
18673 "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
18674 "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
18675 "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
18676 "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
18677 "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
18678 "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
18679 "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
18680 "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
18681 "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
18682 "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
18683 "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
18684 "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
18685 "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
18686 "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
18687 "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
18688 "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
18689 "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
18690 "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
18691 "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
18692 "vmovupd %%ymm8, (%0)\n"
18693 "vmovupd %%ymm9, (%1)\n"
18694 "vmovupd %%ymm10, (%2)\n"
18695 "vmovupd %%ymm11, (%3)\n"
18696 "vmovupd %%ymm12, (%4)\n"
18697 "vmovupd %%ymm13, (%5)\n"
18698 "vmovupd %%ymm14, (%6)\n"
18699 "vmovupd %%ymm15, (%7)\n"
18700 :: "r"(buf + j + k + 0), "r"(buf + j + k + 16384), "r"(buf + j + k + 32768), "r"(buf + j + k + 49152), "r"(buf + j + k + 65536), "r"(buf + j + k + 81920), "r"(buf + j + k + 98304), "r"(buf + j + k + 114688) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
18701 );
18702 }
18703 }
18704 return;
18705 }
18706 if (depth == 20) {
18707 helper_double_29_recursive(buf + 0, 17);
18708 helper_double_29_recursive(buf + 131072, 17);
18709 helper_double_29_recursive(buf + 262144, 17);
18710 helper_double_29_recursive(buf + 393216, 17);
18711 helper_double_29_recursive(buf + 524288, 17);
18712 helper_double_29_recursive(buf + 655360, 17);
18713 helper_double_29_recursive(buf + 786432, 17);
18714 helper_double_29_recursive(buf + 917504, 17);
18715 for (int j = 0; j < 1048576; j += 1048576) {
18716 for (int k = 0; k < 131072; k += 4) {
18717 __asm__ volatile (
18718 "vmovupd (%0), %%ymm0\n"
18719 "vmovupd (%1), %%ymm1\n"
18720 "vmovupd (%2), %%ymm2\n"
18721 "vmovupd (%3), %%ymm3\n"
18722 "vmovupd (%4), %%ymm4\n"
18723 "vmovupd (%5), %%ymm5\n"
18724 "vmovupd (%6), %%ymm6\n"
18725 "vmovupd (%7), %%ymm7\n"
18726 "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
18727 "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
18728 "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
18729 "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
18730 "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
18731 "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
18732 "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
18733 "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
18734 "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
18735 "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
18736 "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
18737 "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
18738 "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
18739 "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
18740 "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
18741 "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
18742 "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
18743 "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
18744 "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
18745 "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
18746 "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
18747 "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
18748 "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
18749 "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
18750 "vmovupd %%ymm8, (%0)\n"
18751 "vmovupd %%ymm9, (%1)\n"
18752 "vmovupd %%ymm10, (%2)\n"
18753 "vmovupd %%ymm11, (%3)\n"
18754 "vmovupd %%ymm12, (%4)\n"
18755 "vmovupd %%ymm13, (%5)\n"
18756 "vmovupd %%ymm14, (%6)\n"
18757 "vmovupd %%ymm15, (%7)\n"
18758 :: "r"(buf + j + k + 0), "r"(buf + j + k + 131072), "r"(buf + j + k + 262144), "r"(buf + j + k + 393216), "r"(buf + j + k + 524288), "r"(buf + j + k + 655360), "r"(buf + j + k + 786432), "r"(buf + j + k + 917504) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
18759 );
18760 }
18761 }
18762 return;
18763 }
18764 if (depth == 23) {
18765 helper_double_29_recursive(buf + 0, 20);
18766 helper_double_29_recursive(buf + 1048576, 20);
18767 helper_double_29_recursive(buf + 2097152, 20);
18768 helper_double_29_recursive(buf + 3145728, 20);
18769 helper_double_29_recursive(buf + 4194304, 20);
18770 helper_double_29_recursive(buf + 5242880, 20);
18771 helper_double_29_recursive(buf + 6291456, 20);
18772 helper_double_29_recursive(buf + 7340032, 20);
18773 for (int j = 0; j < 8388608; j += 8388608) {
18774 for (int k = 0; k < 1048576; k += 4) {
18775 __asm__ volatile (
18776 "vmovupd (%0), %%ymm0\n"
18777 "vmovupd (%1), %%ymm1\n"
18778 "vmovupd (%2), %%ymm2\n"
18779 "vmovupd (%3), %%ymm3\n"
18780 "vmovupd (%4), %%ymm4\n"
18781 "vmovupd (%5), %%ymm5\n"
18782 "vmovupd (%6), %%ymm6\n"
18783 "vmovupd (%7), %%ymm7\n"
18784 "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
18785 "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
18786 "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
18787 "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
18788 "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
18789 "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
18790 "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
18791 "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
18792 "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
18793 "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
18794 "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
18795 "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
18796 "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
18797 "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
18798 "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
18799 "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
18800 "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
18801 "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
18802 "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
18803 "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
18804 "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
18805 "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
18806 "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
18807 "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
18808 "vmovupd %%ymm8, (%0)\n"
18809 "vmovupd %%ymm9, (%1)\n"
18810 "vmovupd %%ymm10, (%2)\n"
18811 "vmovupd %%ymm11, (%3)\n"
18812 "vmovupd %%ymm12, (%4)\n"
18813 "vmovupd %%ymm13, (%5)\n"
18814 "vmovupd %%ymm14, (%6)\n"
18815 "vmovupd %%ymm15, (%7)\n"
18816 :: "r"(buf + j + k + 0), "r"(buf + j + k + 1048576), "r"(buf + j + k + 2097152), "r"(buf + j + k + 3145728), "r"(buf + j + k + 4194304), "r"(buf + j + k + 5242880), "r"(buf + j + k + 6291456), "r"(buf + j + k + 7340032) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
18817 );
18818 }
18819 }
18820 return;
18821 }
18822 if (depth == 26) {
18823 helper_double_29_recursive(buf + 0, 23);
18824 helper_double_29_recursive(buf + 8388608, 23);
18825 helper_double_29_recursive(buf + 16777216, 23);
18826 helper_double_29_recursive(buf + 25165824, 23);
18827 helper_double_29_recursive(buf + 33554432, 23);
18828 helper_double_29_recursive(buf + 41943040, 23);
18829 helper_double_29_recursive(buf + 50331648, 23);
18830 helper_double_29_recursive(buf + 58720256, 23);
18831 for (int j = 0; j < 67108864; j += 67108864) {
18832 for (int k = 0; k < 8388608; k += 4) {
18833 __asm__ volatile (
18834 "vmovupd (%0), %%ymm0\n"
18835 "vmovupd (%1), %%ymm1\n"
18836 "vmovupd (%2), %%ymm2\n"
18837 "vmovupd (%3), %%ymm3\n"
18838 "vmovupd (%4), %%ymm4\n"
18839 "vmovupd (%5), %%ymm5\n"
18840 "vmovupd (%6), %%ymm6\n"
18841 "vmovupd (%7), %%ymm7\n"
18842 "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
18843 "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
18844 "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
18845 "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
18846 "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
18847 "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
18848 "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
18849 "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
18850 "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
18851 "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
18852 "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
18853 "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
18854 "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
18855 "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
18856 "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
18857 "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
18858 "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
18859 "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
18860 "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
18861 "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
18862 "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
18863 "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
18864 "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
18865 "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
18866 "vmovupd %%ymm8, (%0)\n"
18867 "vmovupd %%ymm9, (%1)\n"
18868 "vmovupd %%ymm10, (%2)\n"
18869 "vmovupd %%ymm11, (%3)\n"
18870 "vmovupd %%ymm12, (%4)\n"
18871 "vmovupd %%ymm13, (%5)\n"
18872 "vmovupd %%ymm14, (%6)\n"
18873 "vmovupd %%ymm15, (%7)\n"
18874 :: "r"(buf + j + k + 0), "r"(buf + j + k + 8388608), "r"(buf + j + k + 16777216), "r"(buf + j + k + 25165824), "r"(buf + j + k + 33554432), "r"(buf + j + k + 41943040), "r"(buf + j + k + 50331648), "r"(buf + j + k + 58720256) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
18875 );
18876 }
18877 }
18878 return;
18879 }
18880 if (depth == 29) {
18881 helper_double_29_recursive(buf + 0, 26);
18882 helper_double_29_recursive(buf + 67108864, 26);
18883 helper_double_29_recursive(buf + 134217728, 26);
18884 helper_double_29_recursive(buf + 201326592, 26);
18885 helper_double_29_recursive(buf + 268435456, 26);
18886 helper_double_29_recursive(buf + 335544320, 26);
18887 helper_double_29_recursive(buf + 402653184, 26);
18888 helper_double_29_recursive(buf + 469762048, 26);
18889 for (int j = 0; j < 536870912; j += 536870912) {
18890 for (int k = 0; k < 67108864; k += 4) {
18891 __asm__ volatile (
18892 "vmovupd (%0), %%ymm0\n"
18893 "vmovupd (%1), %%ymm1\n"
18894 "vmovupd (%2), %%ymm2\n"
18895 "vmovupd (%3), %%ymm3\n"
18896 "vmovupd (%4), %%ymm4\n"
18897 "vmovupd (%5), %%ymm5\n"
18898 "vmovupd (%6), %%ymm6\n"
18899 "vmovupd (%7), %%ymm7\n"
18900 "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
18901 "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
18902 "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
18903 "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
18904 "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
18905 "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
18906 "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
18907 "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
18908 "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
18909 "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
18910 "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
18911 "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
18912 "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
18913 "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
18914 "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
18915 "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
18916 "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
18917 "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
18918 "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
18919 "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
18920 "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
18921 "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
18922 "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
18923 "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
18924 "vmovupd %%ymm8, (%0)\n"
18925 "vmovupd %%ymm9, (%1)\n"
18926 "vmovupd %%ymm10, (%2)\n"
18927 "vmovupd %%ymm11, (%3)\n"
18928 "vmovupd %%ymm12, (%4)\n"
18929 "vmovupd %%ymm13, (%5)\n"
18930 "vmovupd %%ymm14, (%6)\n"
18931 "vmovupd %%ymm15, (%7)\n"
18932 :: "r"(buf + j + k + 0), "r"(buf + j + k + 67108864), "r"(buf + j + k + 134217728), "r"(buf + j + k + 201326592), "r"(buf + j + k + 268435456), "r"(buf + j + k + 335544320), "r"(buf + j + k + 402653184), "r"(buf + j + k + 469762048) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
18933 );
18934 }
18935 }
18936 return;
18937 }
18938 }
void helper_double_29(double *buf);
/*
 * Public entry point for the size-2^29 double-precision Hadamard pass.
 * Simply kicks off the recursive driver at the full depth; buf must hold
 * 2^29 doubles and is transformed in place.
 */
void helper_double_29(double *buf) {
    const int full_depth = 29;
    helper_double_29_recursive(buf, full_depth);
}
18943 void helper_double_30_recursive(double *buf, int depth);
helper_double_30_recursive(double * buf,int depth)18944 void helper_double_30_recursive(double *buf, int depth) {
18945 if (depth == 9) {
18946 for (int j = 0; j < 512; j += 32) {
18947 for (int k = 0; k < 4; k += 4) {
18948 __asm__ volatile (
18949 "vmovupd (%0), %%ymm0\n"
18950 "vmovupd (%1), %%ymm1\n"
18951 "vmovupd (%2), %%ymm2\n"
18952 "vmovupd (%3), %%ymm3\n"
18953 "vmovupd (%4), %%ymm4\n"
18954 "vmovupd (%5), %%ymm5\n"
18955 "vmovupd (%6), %%ymm6\n"
18956 "vmovupd (%7), %%ymm7\n"
18957 "vpermilpd $0, %%ymm0, %%ymm8\n"
18958 "vpermilpd $15, %%ymm0, %%ymm9\n"
18959 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
18960 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
18961 "vaddsubpd %%ymm11, %%ymm8, %%ymm0\n"
18962 "vpermilpd $0, %%ymm1, %%ymm8\n"
18963 "vpermilpd $15, %%ymm1, %%ymm9\n"
18964 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
18965 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
18966 "vaddsubpd %%ymm11, %%ymm8, %%ymm1\n"
18967 "vpermilpd $0, %%ymm2, %%ymm8\n"
18968 "vpermilpd $15, %%ymm2, %%ymm9\n"
18969 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
18970 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
18971 "vaddsubpd %%ymm11, %%ymm8, %%ymm2\n"
18972 "vpermilpd $0, %%ymm3, %%ymm8\n"
18973 "vpermilpd $15, %%ymm3, %%ymm9\n"
18974 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
18975 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
18976 "vaddsubpd %%ymm11, %%ymm8, %%ymm3\n"
18977 "vpermilpd $0, %%ymm4, %%ymm8\n"
18978 "vpermilpd $15, %%ymm4, %%ymm9\n"
18979 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
18980 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
18981 "vaddsubpd %%ymm11, %%ymm8, %%ymm4\n"
18982 "vpermilpd $0, %%ymm5, %%ymm8\n"
18983 "vpermilpd $15, %%ymm5, %%ymm9\n"
18984 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
18985 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
18986 "vaddsubpd %%ymm11, %%ymm8, %%ymm5\n"
18987 "vpermilpd $0, %%ymm6, %%ymm8\n"
18988 "vpermilpd $15, %%ymm6, %%ymm9\n"
18989 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
18990 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
18991 "vaddsubpd %%ymm11, %%ymm8, %%ymm6\n"
18992 "vpermilpd $0, %%ymm7, %%ymm8\n"
18993 "vpermilpd $15, %%ymm7, %%ymm9\n"
18994 "vxorpd %%ymm10, %%ymm10, %%ymm10\n"
18995 "vsubpd %%ymm9, %%ymm10, %%ymm11\n"
18996 "vaddsubpd %%ymm11, %%ymm8, %%ymm7\n"
18997 "vperm2f128 $0, %%ymm0, %%ymm0, %%ymm8\n"
18998 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
18999 "vsubpd %%ymm0, %%ymm9, %%ymm10\n"
19000 "vperm2f128 $49, %%ymm10, %%ymm0, %%ymm11\n"
19001 "vaddpd %%ymm11, %%ymm8, %%ymm0\n"
19002 "vperm2f128 $0, %%ymm1, %%ymm1, %%ymm8\n"
19003 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
19004 "vsubpd %%ymm1, %%ymm9, %%ymm10\n"
19005 "vperm2f128 $49, %%ymm10, %%ymm1, %%ymm11\n"
19006 "vaddpd %%ymm11, %%ymm8, %%ymm1\n"
19007 "vperm2f128 $0, %%ymm2, %%ymm2, %%ymm8\n"
19008 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
19009 "vsubpd %%ymm2, %%ymm9, %%ymm10\n"
19010 "vperm2f128 $49, %%ymm10, %%ymm2, %%ymm11\n"
19011 "vaddpd %%ymm11, %%ymm8, %%ymm2\n"
19012 "vperm2f128 $0, %%ymm3, %%ymm3, %%ymm8\n"
19013 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
19014 "vsubpd %%ymm3, %%ymm9, %%ymm10\n"
19015 "vperm2f128 $49, %%ymm10, %%ymm3, %%ymm11\n"
19016 "vaddpd %%ymm11, %%ymm8, %%ymm3\n"
19017 "vperm2f128 $0, %%ymm4, %%ymm4, %%ymm8\n"
19018 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
19019 "vsubpd %%ymm4, %%ymm9, %%ymm10\n"
19020 "vperm2f128 $49, %%ymm10, %%ymm4, %%ymm11\n"
19021 "vaddpd %%ymm11, %%ymm8, %%ymm4\n"
19022 "vperm2f128 $0, %%ymm5, %%ymm5, %%ymm8\n"
19023 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
19024 "vsubpd %%ymm5, %%ymm9, %%ymm10\n"
19025 "vperm2f128 $49, %%ymm10, %%ymm5, %%ymm11\n"
19026 "vaddpd %%ymm11, %%ymm8, %%ymm5\n"
19027 "vperm2f128 $0, %%ymm6, %%ymm6, %%ymm8\n"
19028 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
19029 "vsubpd %%ymm6, %%ymm9, %%ymm10\n"
19030 "vperm2f128 $49, %%ymm10, %%ymm6, %%ymm11\n"
19031 "vaddpd %%ymm11, %%ymm8, %%ymm6\n"
19032 "vperm2f128 $0, %%ymm7, %%ymm7, %%ymm8\n"
19033 "vxorpd %%ymm9, %%ymm9, %%ymm9\n"
19034 "vsubpd %%ymm7, %%ymm9, %%ymm10\n"
19035 "vperm2f128 $49, %%ymm10, %%ymm7, %%ymm11\n"
19036 "vaddpd %%ymm11, %%ymm8, %%ymm7\n"
19037 "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
19038 "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
19039 "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
19040 "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
19041 "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
19042 "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
19043 "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
19044 "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
19045 "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
19046 "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
19047 "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
19048 "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
19049 "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
19050 "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
19051 "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
19052 "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
19053 "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
19054 "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
19055 "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
19056 "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
19057 "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
19058 "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
19059 "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
19060 "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
19061 "vmovupd %%ymm8, (%0)\n"
19062 "vmovupd %%ymm9, (%1)\n"
19063 "vmovupd %%ymm10, (%2)\n"
19064 "vmovupd %%ymm11, (%3)\n"
19065 "vmovupd %%ymm12, (%4)\n"
19066 "vmovupd %%ymm13, (%5)\n"
19067 "vmovupd %%ymm14, (%6)\n"
19068 "vmovupd %%ymm15, (%7)\n"
19069 :: "r"(buf + j + k + 0), "r"(buf + j + k + 4), "r"(buf + j + k + 8), "r"(buf + j + k + 12), "r"(buf + j + k + 16), "r"(buf + j + k + 20), "r"(buf + j + k + 24), "r"(buf + j + k + 28) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
19070 );
19071 }
19072 }
19073 for (int j = 0; j < 512; j += 256) {
19074 for (int k = 0; k < 32; k += 4) {
19075 __asm__ volatile (
19076 "vmovupd (%0), %%ymm0\n"
19077 "vmovupd (%1), %%ymm1\n"
19078 "vmovupd (%2), %%ymm2\n"
19079 "vmovupd (%3), %%ymm3\n"
19080 "vmovupd (%4), %%ymm4\n"
19081 "vmovupd (%5), %%ymm5\n"
19082 "vmovupd (%6), %%ymm6\n"
19083 "vmovupd (%7), %%ymm7\n"
19084 "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
19085 "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
19086 "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
19087 "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
19088 "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
19089 "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
19090 "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
19091 "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
19092 "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
19093 "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
19094 "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
19095 "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
19096 "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
19097 "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
19098 "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
19099 "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
19100 "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
19101 "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
19102 "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
19103 "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
19104 "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
19105 "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
19106 "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
19107 "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
19108 "vmovupd %%ymm8, (%0)\n"
19109 "vmovupd %%ymm9, (%1)\n"
19110 "vmovupd %%ymm10, (%2)\n"
19111 "vmovupd %%ymm11, (%3)\n"
19112 "vmovupd %%ymm12, (%4)\n"
19113 "vmovupd %%ymm13, (%5)\n"
19114 "vmovupd %%ymm14, (%6)\n"
19115 "vmovupd %%ymm15, (%7)\n"
19116 :: "r"(buf + j + k + 0), "r"(buf + j + k + 32), "r"(buf + j + k + 64), "r"(buf + j + k + 96), "r"(buf + j + k + 128), "r"(buf + j + k + 160), "r"(buf + j + k + 192), "r"(buf + j + k + 224) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
19117 );
19118 }
19119 }
19120 for (int j = 0; j < 512; j += 512) {
19121 for (int k = 0; k < 256; k += 4) {
19122 __asm__ volatile (
19123 "vmovupd (%0), %%ymm0\n"
19124 "vmovupd (%1), %%ymm1\n"
19125 "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
19126 "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
19127 "vmovupd %%ymm8, (%0)\n"
19128 "vmovupd %%ymm9, (%1)\n"
19129 :: "r"(buf + j + k + 0), "r"(buf + j + k + 256) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
19130 );
19131 }
19132 }
19133 return;
19134 }
19135 if (depth == 12) {
19136 helper_double_30_recursive(buf + 0, 9);
19137 helper_double_30_recursive(buf + 512, 9);
19138 helper_double_30_recursive(buf + 1024, 9);
19139 helper_double_30_recursive(buf + 1536, 9);
19140 helper_double_30_recursive(buf + 2048, 9);
19141 helper_double_30_recursive(buf + 2560, 9);
19142 helper_double_30_recursive(buf + 3072, 9);
19143 helper_double_30_recursive(buf + 3584, 9);
19144 for (int j = 0; j < 4096; j += 4096) {
19145 for (int k = 0; k < 512; k += 4) {
19146 __asm__ volatile (
19147 "vmovupd (%0), %%ymm0\n"
19148 "vmovupd (%1), %%ymm1\n"
19149 "vmovupd (%2), %%ymm2\n"
19150 "vmovupd (%3), %%ymm3\n"
19151 "vmovupd (%4), %%ymm4\n"
19152 "vmovupd (%5), %%ymm5\n"
19153 "vmovupd (%6), %%ymm6\n"
19154 "vmovupd (%7), %%ymm7\n"
19155 "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
19156 "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
19157 "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
19158 "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
19159 "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
19160 "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
19161 "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
19162 "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
19163 "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
19164 "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
19165 "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
19166 "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
19167 "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
19168 "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
19169 "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
19170 "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
19171 "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
19172 "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
19173 "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
19174 "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
19175 "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
19176 "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
19177 "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
19178 "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
19179 "vmovupd %%ymm8, (%0)\n"
19180 "vmovupd %%ymm9, (%1)\n"
19181 "vmovupd %%ymm10, (%2)\n"
19182 "vmovupd %%ymm11, (%3)\n"
19183 "vmovupd %%ymm12, (%4)\n"
19184 "vmovupd %%ymm13, (%5)\n"
19185 "vmovupd %%ymm14, (%6)\n"
19186 "vmovupd %%ymm15, (%7)\n"
19187 :: "r"(buf + j + k + 0), "r"(buf + j + k + 512), "r"(buf + j + k + 1024), "r"(buf + j + k + 1536), "r"(buf + j + k + 2048), "r"(buf + j + k + 2560), "r"(buf + j + k + 3072), "r"(buf + j + k + 3584) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
19188 );
19189 }
19190 }
19191 return;
19192 }
19193 if (depth == 15) {
19194 helper_double_30_recursive(buf + 0, 12);
19195 helper_double_30_recursive(buf + 4096, 12);
19196 helper_double_30_recursive(buf + 8192, 12);
19197 helper_double_30_recursive(buf + 12288, 12);
19198 helper_double_30_recursive(buf + 16384, 12);
19199 helper_double_30_recursive(buf + 20480, 12);
19200 helper_double_30_recursive(buf + 24576, 12);
19201 helper_double_30_recursive(buf + 28672, 12);
19202 for (int j = 0; j < 32768; j += 32768) {
19203 for (int k = 0; k < 4096; k += 4) {
19204 __asm__ volatile (
19205 "vmovupd (%0), %%ymm0\n"
19206 "vmovupd (%1), %%ymm1\n"
19207 "vmovupd (%2), %%ymm2\n"
19208 "vmovupd (%3), %%ymm3\n"
19209 "vmovupd (%4), %%ymm4\n"
19210 "vmovupd (%5), %%ymm5\n"
19211 "vmovupd (%6), %%ymm6\n"
19212 "vmovupd (%7), %%ymm7\n"
19213 "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
19214 "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
19215 "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
19216 "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
19217 "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
19218 "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
19219 "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
19220 "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
19221 "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
19222 "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
19223 "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
19224 "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
19225 "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
19226 "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
19227 "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
19228 "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
19229 "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
19230 "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
19231 "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
19232 "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
19233 "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
19234 "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
19235 "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
19236 "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
19237 "vmovupd %%ymm8, (%0)\n"
19238 "vmovupd %%ymm9, (%1)\n"
19239 "vmovupd %%ymm10, (%2)\n"
19240 "vmovupd %%ymm11, (%3)\n"
19241 "vmovupd %%ymm12, (%4)\n"
19242 "vmovupd %%ymm13, (%5)\n"
19243 "vmovupd %%ymm14, (%6)\n"
19244 "vmovupd %%ymm15, (%7)\n"
19245 :: "r"(buf + j + k + 0), "r"(buf + j + k + 4096), "r"(buf + j + k + 8192), "r"(buf + j + k + 12288), "r"(buf + j + k + 16384), "r"(buf + j + k + 20480), "r"(buf + j + k + 24576), "r"(buf + j + k + 28672) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
19246 );
19247 }
19248 }
19249 return;
19250 }
19251 if (depth == 18) {
19252 helper_double_30_recursive(buf + 0, 15);
19253 helper_double_30_recursive(buf + 32768, 15);
19254 helper_double_30_recursive(buf + 65536, 15);
19255 helper_double_30_recursive(buf + 98304, 15);
19256 helper_double_30_recursive(buf + 131072, 15);
19257 helper_double_30_recursive(buf + 163840, 15);
19258 helper_double_30_recursive(buf + 196608, 15);
19259 helper_double_30_recursive(buf + 229376, 15);
19260 for (int j = 0; j < 262144; j += 262144) {
19261 for (int k = 0; k < 32768; k += 4) {
19262 __asm__ volatile (
19263 "vmovupd (%0), %%ymm0\n"
19264 "vmovupd (%1), %%ymm1\n"
19265 "vmovupd (%2), %%ymm2\n"
19266 "vmovupd (%3), %%ymm3\n"
19267 "vmovupd (%4), %%ymm4\n"
19268 "vmovupd (%5), %%ymm5\n"
19269 "vmovupd (%6), %%ymm6\n"
19270 "vmovupd (%7), %%ymm7\n"
19271 "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
19272 "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
19273 "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
19274 "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
19275 "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
19276 "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
19277 "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
19278 "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
19279 "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
19280 "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
19281 "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
19282 "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
19283 "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
19284 "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
19285 "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
19286 "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
19287 "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
19288 "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
19289 "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
19290 "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
19291 "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
19292 "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
19293 "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
19294 "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
19295 "vmovupd %%ymm8, (%0)\n"
19296 "vmovupd %%ymm9, (%1)\n"
19297 "vmovupd %%ymm10, (%2)\n"
19298 "vmovupd %%ymm11, (%3)\n"
19299 "vmovupd %%ymm12, (%4)\n"
19300 "vmovupd %%ymm13, (%5)\n"
19301 "vmovupd %%ymm14, (%6)\n"
19302 "vmovupd %%ymm15, (%7)\n"
19303 :: "r"(buf + j + k + 0), "r"(buf + j + k + 32768), "r"(buf + j + k + 65536), "r"(buf + j + k + 98304), "r"(buf + j + k + 131072), "r"(buf + j + k + 163840), "r"(buf + j + k + 196608), "r"(buf + j + k + 229376) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
19304 );
19305 }
19306 }
19307 return;
19308 }
19309 if (depth == 21) {
19310 helper_double_30_recursive(buf + 0, 18);
19311 helper_double_30_recursive(buf + 262144, 18);
19312 helper_double_30_recursive(buf + 524288, 18);
19313 helper_double_30_recursive(buf + 786432, 18);
19314 helper_double_30_recursive(buf + 1048576, 18);
19315 helper_double_30_recursive(buf + 1310720, 18);
19316 helper_double_30_recursive(buf + 1572864, 18);
19317 helper_double_30_recursive(buf + 1835008, 18);
19318 for (int j = 0; j < 2097152; j += 2097152) {
19319 for (int k = 0; k < 262144; k += 4) {
19320 __asm__ volatile (
19321 "vmovupd (%0), %%ymm0\n"
19322 "vmovupd (%1), %%ymm1\n"
19323 "vmovupd (%2), %%ymm2\n"
19324 "vmovupd (%3), %%ymm3\n"
19325 "vmovupd (%4), %%ymm4\n"
19326 "vmovupd (%5), %%ymm5\n"
19327 "vmovupd (%6), %%ymm6\n"
19328 "vmovupd (%7), %%ymm7\n"
19329 "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
19330 "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
19331 "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
19332 "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
19333 "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
19334 "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
19335 "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
19336 "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
19337 "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
19338 "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
19339 "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
19340 "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
19341 "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
19342 "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
19343 "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
19344 "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
19345 "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
19346 "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
19347 "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
19348 "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
19349 "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
19350 "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
19351 "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
19352 "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
19353 "vmovupd %%ymm8, (%0)\n"
19354 "vmovupd %%ymm9, (%1)\n"
19355 "vmovupd %%ymm10, (%2)\n"
19356 "vmovupd %%ymm11, (%3)\n"
19357 "vmovupd %%ymm12, (%4)\n"
19358 "vmovupd %%ymm13, (%5)\n"
19359 "vmovupd %%ymm14, (%6)\n"
19360 "vmovupd %%ymm15, (%7)\n"
19361 :: "r"(buf + j + k + 0), "r"(buf + j + k + 262144), "r"(buf + j + k + 524288), "r"(buf + j + k + 786432), "r"(buf + j + k + 1048576), "r"(buf + j + k + 1310720), "r"(buf + j + k + 1572864), "r"(buf + j + k + 1835008) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
19362 );
19363 }
19364 }
19365 return;
19366 }
19367 if (depth == 24) {
19368 helper_double_30_recursive(buf + 0, 21);
19369 helper_double_30_recursive(buf + 2097152, 21);
19370 helper_double_30_recursive(buf + 4194304, 21);
19371 helper_double_30_recursive(buf + 6291456, 21);
19372 helper_double_30_recursive(buf + 8388608, 21);
19373 helper_double_30_recursive(buf + 10485760, 21);
19374 helper_double_30_recursive(buf + 12582912, 21);
19375 helper_double_30_recursive(buf + 14680064, 21);
19376 for (int j = 0; j < 16777216; j += 16777216) {
19377 for (int k = 0; k < 2097152; k += 4) {
19378 __asm__ volatile (
19379 "vmovupd (%0), %%ymm0\n"
19380 "vmovupd (%1), %%ymm1\n"
19381 "vmovupd (%2), %%ymm2\n"
19382 "vmovupd (%3), %%ymm3\n"
19383 "vmovupd (%4), %%ymm4\n"
19384 "vmovupd (%5), %%ymm5\n"
19385 "vmovupd (%6), %%ymm6\n"
19386 "vmovupd (%7), %%ymm7\n"
19387 "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
19388 "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
19389 "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
19390 "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
19391 "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
19392 "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
19393 "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
19394 "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
19395 "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
19396 "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
19397 "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
19398 "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
19399 "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
19400 "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
19401 "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
19402 "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
19403 "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
19404 "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
19405 "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
19406 "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
19407 "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
19408 "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
19409 "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
19410 "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
19411 "vmovupd %%ymm8, (%0)\n"
19412 "vmovupd %%ymm9, (%1)\n"
19413 "vmovupd %%ymm10, (%2)\n"
19414 "vmovupd %%ymm11, (%3)\n"
19415 "vmovupd %%ymm12, (%4)\n"
19416 "vmovupd %%ymm13, (%5)\n"
19417 "vmovupd %%ymm14, (%6)\n"
19418 "vmovupd %%ymm15, (%7)\n"
19419 :: "r"(buf + j + k + 0), "r"(buf + j + k + 2097152), "r"(buf + j + k + 4194304), "r"(buf + j + k + 6291456), "r"(buf + j + k + 8388608), "r"(buf + j + k + 10485760), "r"(buf + j + k + 12582912), "r"(buf + j + k + 14680064) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
19420 );
19421 }
19422 }
19423 return;
19424 }
19425 if (depth == 27) {
19426 helper_double_30_recursive(buf + 0, 24);
19427 helper_double_30_recursive(buf + 16777216, 24);
19428 helper_double_30_recursive(buf + 33554432, 24);
19429 helper_double_30_recursive(buf + 50331648, 24);
19430 helper_double_30_recursive(buf + 67108864, 24);
19431 helper_double_30_recursive(buf + 83886080, 24);
19432 helper_double_30_recursive(buf + 100663296, 24);
19433 helper_double_30_recursive(buf + 117440512, 24);
19434 for (int j = 0; j < 134217728; j += 134217728) {
19435 for (int k = 0; k < 16777216; k += 4) {
19436 __asm__ volatile (
19437 "vmovupd (%0), %%ymm0\n"
19438 "vmovupd (%1), %%ymm1\n"
19439 "vmovupd (%2), %%ymm2\n"
19440 "vmovupd (%3), %%ymm3\n"
19441 "vmovupd (%4), %%ymm4\n"
19442 "vmovupd (%5), %%ymm5\n"
19443 "vmovupd (%6), %%ymm6\n"
19444 "vmovupd (%7), %%ymm7\n"
19445 "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
19446 "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
19447 "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
19448 "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
19449 "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
19450 "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
19451 "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
19452 "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
19453 "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
19454 "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
19455 "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
19456 "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
19457 "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
19458 "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
19459 "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
19460 "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
19461 "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
19462 "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
19463 "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
19464 "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
19465 "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
19466 "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
19467 "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
19468 "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
19469 "vmovupd %%ymm8, (%0)\n"
19470 "vmovupd %%ymm9, (%1)\n"
19471 "vmovupd %%ymm10, (%2)\n"
19472 "vmovupd %%ymm11, (%3)\n"
19473 "vmovupd %%ymm12, (%4)\n"
19474 "vmovupd %%ymm13, (%5)\n"
19475 "vmovupd %%ymm14, (%6)\n"
19476 "vmovupd %%ymm15, (%7)\n"
19477 :: "r"(buf + j + k + 0), "r"(buf + j + k + 16777216), "r"(buf + j + k + 33554432), "r"(buf + j + k + 50331648), "r"(buf + j + k + 67108864), "r"(buf + j + k + 83886080), "r"(buf + j + k + 100663296), "r"(buf + j + k + 117440512) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
19478 );
19479 }
19480 }
19481 return;
19482 }
19483 if (depth == 30) {
19484 helper_double_30_recursive(buf + 0, 27);
19485 helper_double_30_recursive(buf + 134217728, 27);
19486 helper_double_30_recursive(buf + 268435456, 27);
19487 helper_double_30_recursive(buf + 402653184, 27);
19488 helper_double_30_recursive(buf + 536870912, 27);
19489 helper_double_30_recursive(buf + 671088640, 27);
19490 helper_double_30_recursive(buf + 805306368, 27);
19491 helper_double_30_recursive(buf + 939524096, 27);
19492 for (int j = 0; j < 1073741824; j += 1073741824) {
19493 for (int k = 0; k < 134217728; k += 4) {
19494 __asm__ volatile (
19495 "vmovupd (%0), %%ymm0\n"
19496 "vmovupd (%1), %%ymm1\n"
19497 "vmovupd (%2), %%ymm2\n"
19498 "vmovupd (%3), %%ymm3\n"
19499 "vmovupd (%4), %%ymm4\n"
19500 "vmovupd (%5), %%ymm5\n"
19501 "vmovupd (%6), %%ymm6\n"
19502 "vmovupd (%7), %%ymm7\n"
19503 "vaddpd %%ymm1, %%ymm0, %%ymm8\n"
19504 "vsubpd %%ymm1, %%ymm0, %%ymm9\n"
19505 "vaddpd %%ymm3, %%ymm2, %%ymm10\n"
19506 "vsubpd %%ymm3, %%ymm2, %%ymm11\n"
19507 "vaddpd %%ymm5, %%ymm4, %%ymm12\n"
19508 "vsubpd %%ymm5, %%ymm4, %%ymm13\n"
19509 "vaddpd %%ymm7, %%ymm6, %%ymm14\n"
19510 "vsubpd %%ymm7, %%ymm6, %%ymm15\n"
19511 "vaddpd %%ymm10, %%ymm8, %%ymm0\n"
19512 "vsubpd %%ymm10, %%ymm8, %%ymm2\n"
19513 "vaddpd %%ymm11, %%ymm9, %%ymm1\n"
19514 "vsubpd %%ymm11, %%ymm9, %%ymm3\n"
19515 "vaddpd %%ymm14, %%ymm12, %%ymm4\n"
19516 "vsubpd %%ymm14, %%ymm12, %%ymm6\n"
19517 "vaddpd %%ymm15, %%ymm13, %%ymm5\n"
19518 "vsubpd %%ymm15, %%ymm13, %%ymm7\n"
19519 "vaddpd %%ymm4, %%ymm0, %%ymm8\n"
19520 "vsubpd %%ymm4, %%ymm0, %%ymm12\n"
19521 "vaddpd %%ymm5, %%ymm1, %%ymm9\n"
19522 "vsubpd %%ymm5, %%ymm1, %%ymm13\n"
19523 "vaddpd %%ymm6, %%ymm2, %%ymm10\n"
19524 "vsubpd %%ymm6, %%ymm2, %%ymm14\n"
19525 "vaddpd %%ymm7, %%ymm3, %%ymm11\n"
19526 "vsubpd %%ymm7, %%ymm3, %%ymm15\n"
19527 "vmovupd %%ymm8, (%0)\n"
19528 "vmovupd %%ymm9, (%1)\n"
19529 "vmovupd %%ymm10, (%2)\n"
19530 "vmovupd %%ymm11, (%3)\n"
19531 "vmovupd %%ymm12, (%4)\n"
19532 "vmovupd %%ymm13, (%5)\n"
19533 "vmovupd %%ymm14, (%6)\n"
19534 "vmovupd %%ymm15, (%7)\n"
19535 :: "r"(buf + j + k + 0), "r"(buf + j + k + 134217728), "r"(buf + j + k + 268435456), "r"(buf + j + k + 402653184), "r"(buf + j + k + 536870912), "r"(buf + j + k + 671088640), "r"(buf + j + k + 805306368), "r"(buf + j + k + 939524096) : "%ymm0", "%ymm1", "%ymm2", "%ymm3", "%ymm4", "%ymm5", "%ymm6", "%ymm7", "%ymm8", "%ymm9", "%ymm10", "%ymm11", "%ymm12", "%ymm13", "%ymm14", "%ymm15", "memory"
19536 );
19537 }
19538 }
19539 return;
19540 }
19541 }
void helper_double_30(double *buf);
// Top-level entry for the size-2^30 double-precision transform: thin wrapper
// that runs the full-depth recursion over buf. buf must hold 2^30 doubles and
// is transformed in place (presumably a Fast Hadamard Transform, per fht.h —
// the recursion body applies butterfly add/sub passes).
void helper_double_30(double *buf) {
helper_double_30_recursive(buf, 30);
}
/*
 * In-place transform of 2^log_n doubles stored in buf, dispatching to the
 * generated size-specific kernel for that length.
 *
 * buf   : array of exactly 2^log_n doubles, overwritten with the result.
 * log_n : base-2 logarithm of the transform size; supported range is 0..30.
 *
 * Returns 0 on success, 1 when log_n is outside the supported range
 * (negative or greater than 30). log_n == 0 is the identity transform,
 * so the buffer is left untouched.
 */
int fht_double(double *buf, int log_n) {
    switch (log_n) {
    case 0:
        /* A single element transforms to itself; nothing to do. */
        return 0;
    case 1:  helper_double_1(buf);  return 0;
    case 2:  helper_double_2(buf);  return 0;
    case 3:  helper_double_3(buf);  return 0;
    case 4:  helper_double_4(buf);  return 0;
    case 5:  helper_double_5(buf);  return 0;
    case 6:  helper_double_6(buf);  return 0;
    case 7:  helper_double_7(buf);  return 0;
    case 8:  helper_double_8(buf);  return 0;
    case 9:  helper_double_9(buf);  return 0;
    case 10: helper_double_10(buf); return 0;
    case 11: helper_double_11(buf); return 0;
    case 12: helper_double_12(buf); return 0;
    case 13: helper_double_13(buf); return 0;
    case 14: helper_double_14(buf); return 0;
    case 15: helper_double_15(buf); return 0;
    case 16: helper_double_16(buf); return 0;
    case 17: helper_double_17(buf); return 0;
    case 18: helper_double_18(buf); return 0;
    case 19: helper_double_19(buf); return 0;
    case 20: helper_double_20(buf); return 0;
    case 21: helper_double_21(buf); return 0;
    case 22: helper_double_22(buf); return 0;
    case 23: helper_double_23(buf); return 0;
    case 24: helper_double_24(buf); return 0;
    case 25: helper_double_25(buf); return 0;
    case 26: helper_double_26(buf); return 0;
    case 27: helper_double_27(buf); return 0;
    case 28: helper_double_28(buf); return 0;
    case 29: helper_double_29(buf); return 0;
    case 30: helper_double_30(buf); return 0;
    default:
        /* Unsupported size (log_n < 0 or log_n > 30). */
        return 1;
    }
}
19672