1// Copyright 2009 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5//go:build !math_big_pure_go
6
7#include "textflag.h"
8
9// This file provides fast assembly versions for the elementary
10// arithmetic operations on vectors implemented in arith.go.
11
12// The carry bit is saved with SBBQ Rx, Rx: if the carry was set, Rx is -1, otherwise it is 0.
13// It is restored with ADDQ Rx, Rx: if Rx was -1 the carry is set, otherwise it is cleared.
14// This is faster than using rotate instructions.
15
// func addVV(z, x, y []Word) (c Word)
//
// Word-wise vector addition with carry propagation:
//	z[i] = x[i] + y[i] + carry, for i in [0, len(z))
// The carry out of the last word (0 or 1) is returned in c.
// Only len(z) is consulted; x and y are indexed over the same range
// and their lengths are not checked here.
//
// Register roles:
//	R8, R9, R10 = base addresses of x, y, z
//	AX          = i, the current word index
//	BX          = words still to process (biased by -4 inside the 4x loop)
//	DX          = carry between iterations, kept as 0 or -1 (see file header)
//	R11..R14    = scratch for the words being summed
TEXT ·addVV(SB),NOSPLIT,$0
	MOVQ z+0(FP), R10
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), R9
	MOVQ z_len+8(FP), BX

	XORQ AX, AX		// i = 0
	XORQ DX, DX		// carry = 0

	// Replace the JL below with JMP to bypass the 4x unrolled loop.
	SUBQ $4, BX		// bias the count: BX = n - 4
	JL tail			// fewer than 4 words in total

quad:	// at least 4 words remain
	ADDQ DX, DX		// CF = saved carry (-1 + -1 carries out, 0 + 0 does not)
	MOVQ 0(R8)(AX*8), R11
	MOVQ 8(R8)(AX*8), R12
	MOVQ 16(R8)(AX*8), R13
	MOVQ 24(R8)(AX*8), R14
	ADCQ 0(R9)(AX*8), R11
	ADCQ 8(R9)(AX*8), R12
	ADCQ 16(R9)(AX*8), R13
	ADCQ 24(R9)(AX*8), R14
	MOVQ R11, 0(R10)(AX*8)
	MOVQ R12, 8(R10)(AX*8)
	MOVQ R13, 16(R10)(AX*8)
	MOVQ R14, 24(R10)(AX*8)
	SBBQ DX, DX		// DX = -CF, stash the carry before the counters clobber flags

	ADDQ $4, AX		// i += 4
	SUBQ $4, BX		// four fewer words to go
	JGE quad		// another full group of 4 is available

tail:
	ADDQ $4, BX		// undo the bias: BX = words left (0..3)
	JLE done		// nothing left

single:	// one word per iteration
	ADDQ DX, DX		// CF = saved carry
	MOVQ 0(R8)(AX*8), R11
	ADCQ 0(R9)(AX*8), R11
	MOVQ R11, 0(R10)(AX*8)
	SBBQ DX, DX		// DX = -CF

	ADDQ $1, AX		// i++
	SUBQ $1, BX		// one fewer word to go
	JG single

done:
	NEGQ DX			// turn 0/-1 into 0/1
	MOVQ DX, c+72(FP)	// return carry
	RET
68
69
// func subVV(z, x, y []Word) (c Word)
//
// Word-wise vector subtraction with borrow propagation:
//	z[i] = x[i] - y[i] - borrow, for i in [0, len(z))
// The borrow out of the last word (0 or 1) is returned in c.
// Structurally identical to addVV, with SBBQ in place of ADCQ.
// Only len(z) is consulted; x and y are indexed over the same range
// and their lengths are not checked here.
//
// Register roles:
//	R8, R9, R10 = base addresses of x, y, z
//	AX          = i, the current word index
//	BX          = words still to process (biased by -4 inside the 4x loop)
//	DX          = borrow between iterations, kept as 0 or -1 (see file header)
//	R11..R14    = scratch for the words being subtracted
TEXT ·subVV(SB),NOSPLIT,$0
	MOVQ z+0(FP), R10
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), R9
	MOVQ z_len+8(FP), BX

	XORQ AX, AX		// i = 0
	XORQ DX, DX		// borrow = 0

	// Replace the JL below with JMP to bypass the 4x unrolled loop.
	SUBQ $4, BX		// bias the count: BX = n - 4
	JL tail			// fewer than 4 words in total

quad:	// at least 4 words remain
	ADDQ DX, DX		// CF = saved borrow (-1 + -1 carries out, 0 + 0 does not)
	MOVQ 0(R8)(AX*8), R11
	MOVQ 8(R8)(AX*8), R12
	MOVQ 16(R8)(AX*8), R13
	MOVQ 24(R8)(AX*8), R14
	SBBQ 0(R9)(AX*8), R11
	SBBQ 8(R9)(AX*8), R12
	SBBQ 16(R9)(AX*8), R13
	SBBQ 24(R9)(AX*8), R14
	MOVQ R11, 0(R10)(AX*8)
	MOVQ R12, 8(R10)(AX*8)
	MOVQ R13, 16(R10)(AX*8)
	MOVQ R14, 24(R10)(AX*8)
	SBBQ DX, DX		// DX = -CF, stash the borrow before the counters clobber flags

	ADDQ $4, AX		// i += 4
	SUBQ $4, BX		// four fewer words to go
	JGE quad		// another full group of 4 is available

tail:
	ADDQ $4, BX		// undo the bias: BX = words left (0..3)
	JLE done		// nothing left

single:	// one word per iteration
	ADDQ DX, DX		// CF = saved borrow
	MOVQ 0(R8)(AX*8), R11
	SBBQ 0(R9)(AX*8), R11
	MOVQ R11, 0(R10)(AX*8)
	SBBQ DX, DX		// DX = -CF

	ADDQ $1, AX		// i++
	SUBQ $1, BX		// one fewer word to go
	JG single

done:
	NEGQ DX			// turn 0/-1 into 0/1
	MOVQ DX, c+72(FP)	// return borrow
	RET
123
124
// func addVW(z, x []Word, y Word) (c Word)
//
// Adds the single word y to the vector x:
//	z = x + y, propagating the carry through all len(z) words
// and returns the carry out of the last word in c.
// Vectors longer than 32 words are handed to addVWlarge with a tail jump;
// that routine reads the same argument frame.
//
// Register roles (short path):
//	R8, R10  = base addresses of x, z
//	DI       = words still to process (biased by -4 inside the 4x loop)
//	BX       = i, the current word index
//	AX       = running carry; starts as y, afterwards 0 or 1
//	R11..R14 = scratch for the words being summed
TEXT ·addVW(SB),NOSPLIT,$0
	MOVQ z_len+8(FP), DI
	CMPQ DI, $32
	JG big			// more than 32 words: use the large-vector routine
	MOVQ z+0(FP), R10
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), AX	// carry-in = y

	XORQ BX, BX		// i = 0

	// Replace the JL below with JMP to bypass the 4x unrolled loop.
	SUBQ $4, DI		// bias the count: DI = n - 4
	JL tail			// fewer than 4 words in total

quad:	// at least 4 words remain
	MOVQ 0(R8)(BX*8), R11
	MOVQ 8(R8)(BX*8), R12
	MOVQ 16(R8)(BX*8), R13
	MOVQ 24(R8)(BX*8), R14
	ADDQ AX, R11		// add the incoming carry to the lowest word
	ADCQ $0, R12		// and ripple it upwards
	ADCQ $0, R13
	ADCQ $0, R14
	MOVQ R11, 0(R10)(BX*8)	// MOVQ leaves CF intact, so storing first is safe
	MOVQ R12, 8(R10)(BX*8)
	MOVQ R13, 16(R10)(BX*8)
	MOVQ R14, 24(R10)(BX*8)
	SBBQ AX, AX		// AX = -CF
	NEGQ AX			// AX = carry out of this group (0 or 1)

	ADDQ $4, BX		// i += 4
	SUBQ $4, DI		// four fewer words to go
	JGE quad		// another full group of 4 is available

tail:
	ADDQ $4, DI		// undo the bias: DI = words left (0..3)
	JLE done		// nothing left

single:	// one word per iteration
	ADDQ 0(R8)(BX*8), AX	// AX = x[i] + carry
	MOVQ AX, 0(R10)(BX*8)	// z[i] = AX
	SBBQ AX, AX		// AX = -CF
	NEGQ AX			// AX = carry (0 or 1)

	ADDQ $1, BX		// i++
	SUBQ $1, DI		// one fewer word to go
	JG single

done:
	MOVQ AX, c+56(FP)	// return carry
	RET

big:
	JMP ·addVWlarge(SB)
178
179
// func subVW(z, x []Word, y Word) (c Word)
//
// Subtracts the single word y from the vector x:
//	z = x - y, propagating the borrow through all len(z) words
// and returns the borrow out of the last word in c.
// Structurally identical to addVW, with SUBQ/SBBQ in place of ADDQ/ADCQ.
// Vectors longer than 32 words are handed to subVWlarge with a tail jump;
// that routine reads the same argument frame.
//
// Register roles (short path):
//	R8, R10  = base addresses of x, z
//	DI       = words still to process (biased by -4 inside the 4x loop)
//	BX       = i, the current word index
//	AX       = running borrow; starts as y, afterwards 0 or 1
//	R11..R14 = scratch for the words being processed
TEXT ·subVW(SB),NOSPLIT,$0
	MOVQ z_len+8(FP), DI
	CMPQ DI, $32
	JG big			// more than 32 words: use the large-vector routine
	MOVQ z+0(FP), R10
	MOVQ x+24(FP), R8
	MOVQ y+48(FP), AX	// borrow-in = y

	XORQ BX, BX		// i = 0

	// Replace the JL below with JMP to bypass the 4x unrolled loop.
	SUBQ $4, DI		// bias the count: DI = n - 4
	JL tail			// fewer than 4 words in total

quad:	// at least 4 words remain
	MOVQ 0(R8)(BX*8), R11
	MOVQ 8(R8)(BX*8), R12
	MOVQ 16(R8)(BX*8), R13
	MOVQ 24(R8)(BX*8), R14
	SUBQ AX, R11		// take the incoming borrow from the lowest word
	SBBQ $0, R12		// and ripple it upwards
	SBBQ $0, R13
	SBBQ $0, R14
	MOVQ R11, 0(R10)(BX*8)	// MOVQ leaves CF intact, so storing first is safe
	MOVQ R12, 8(R10)(BX*8)
	MOVQ R13, 16(R10)(BX*8)
	MOVQ R14, 24(R10)(BX*8)
	SBBQ AX, AX		// AX = -CF
	NEGQ AX			// AX = borrow out of this group (0 or 1)

	ADDQ $4, BX		// i += 4
	SUBQ $4, DI		// four fewer words to go
	JGE quad		// another full group of 4 is available

tail:
	ADDQ $4, DI		// undo the bias: DI = words left (0..3)
	JLE done		// nothing left

single:	// one word per iteration
	MOVQ 0(R8)(BX*8), R11
	SUBQ AX, R11		// R11 = x[i] - borrow
	MOVQ R11, 0(R10)(BX*8)	// z[i] = R11
	SBBQ AX, AX		// AX = -CF
	NEGQ AX			// AX = borrow (0 or 1)

	ADDQ $1, BX		// i++
	SUBQ $1, DI		// one fewer word to go
	JG single

done:
	MOVQ AX, c+56(FP)	// return borrow
	RET

big:
	JMP ·subVWlarge(SB)
235
236
// func shlVU(z, x []Word, s uint) (c Word)
//
// Shifts the vector x left by s bits into z, walking from the most
// significant word down to word 0:
//	c    = x[n-1] >> (64-s)
//	z[i] = x[i]<<s | x[i-1]>>(64-s)   for i = n-1 .. 1
//	z[0] = x[0]<<s
// where n = len(z). For n == 0 nothing is stored and c = 0.
//
// Register roles:
//	R9, R11 = base addresses of x, z
//	SI      = i, counting down from n-1
//	CX      = s (the variable shift count has to be in CX)
//	AX      = w1, the next lower source word
//	DX      = w, the word currently being assembled
TEXT ·shlVU(SB),NOSPLIT,$0
	MOVQ z_len+8(FP), SI	// i = len(z)
	SUBQ $1, SI		// i = n-1
	JL empty		// n <= 0

	// n > 0
	MOVQ s+48(FP), CX
	MOVQ x+24(FP), R9
	MOVQ z+0(FP), R11
	MOVQ (R9)(SI*8), AX	// w1 = x[n-1]
	XORQ DX, DX
	SHLQ CX, AX, DX		// DX = bits shifted out of the top word
	MOVQ DX, c+56(FP)

	TESTQ SI, SI
	JLE last		// n == 1: only z[0] remains

loop:	// i > 0
	MOVQ AX, DX		// w = w1
	MOVQ -8(R9)(SI*8), AX	// w1 = x[i-1]
	SHLQ CX, AX, DX		// w<<s | w1>>(64-s)
	MOVQ DX, (R11)(SI*8)	// z[i] = that value
	SUBQ $1, SI		// i--
	JG loop

last:	// i == 0
	SHLQ CX, AX		// w1<<s
	MOVQ AX, (R11)		// z[0] = w1<<s
	RET

empty:
	MOVQ $0, c+56(FP)
	RET
270
271
// func shrVU(z, x []Word, s uint) (c Word)
//
// Shifts the vector x right by s bits into z, walking from word 0 up to
// the most significant word:
//	c      = x[0] << (64-s)
//	z[i]   = x[i]>>s | x[i+1]<<(64-s)   for i = 0 .. n-2
//	z[n-1] = x[n-1]>>s
// where n = len(z). For n == 0 nothing is stored and c = 0.
//
// Register roles:
//	R9, R11 = base addresses of x, z
//	R12     = n-1, index of the last word
//	SI      = i, counting up from 0
//	CX      = s (the variable shift count has to be in CX)
//	AX      = w1, the next higher source word
//	DX      = w, the word currently being assembled
TEXT ·shrVU(SB),NOSPLIT,$0
	MOVQ z_len+8(FP), R12
	SUBQ $1, R12		// R12 = n-1
	JL empty		// n <= 0

	// n > 0
	MOVQ s+48(FP), CX
	MOVQ x+24(FP), R9
	MOVQ z+0(FP), R11
	MOVQ (R9), AX		// w1 = x[0]
	XORQ DX, DX
	SHRQ CX, AX, DX		// DX = bits shifted out of the bottom word
	MOVQ DX, c+56(FP)

	XORQ SI, SI		// i = 0
	CMPQ SI, R12
	JGE last		// n == 1: only z[n-1] remains

loop:	// i < n-1
	MOVQ AX, DX		// w = w1
	MOVQ 8(R9)(SI*8), AX	// w1 = x[i+1]
	SHRQ CX, AX, DX		// w>>s | w1<<(64-s)
	MOVQ DX, (R11)(SI*8)	// z[i] = that value
	ADDQ $1, SI		// i++
	CMPQ SI, R12
	JL loop

last:	// i == n-1
	SHRQ CX, AX		// w1>>s
	MOVQ AX, (R11)(R12*8)	// z[n-1] = w1>>s
	RET

empty:
	MOVQ $0, c+56(FP)
	RET
307
308
// func mulAddVWW(z, x []Word, y, r Word) (c Word)
//
// Multiplies the vector x by the word y and adds the word r:
//	z = x*y + r
// For each i in [0, len(z)) the 128-bit value x[i]*y + carry is formed;
// its low word is stored to z[i] and its high word becomes the next carry.
// The carry starts as r and its final value is returned in c.
//
// Register roles:
//	SI, DI = base addresses of x, z
//	R9     = y
//	R11    = n = len(z)
//	BX     = i, the current word index
//	CX     = carry word
//	DX:AX  = product produced by MULQ (DX is also scratch for the loop test)
TEXT ·mulAddVWW(SB),NOSPLIT,$0
	MOVQ z_len+8(FP), R11
	MOVQ z+0(FP), DI
	MOVQ x+24(FP), SI
	MOVQ y+48(FP), R9
	MOVQ r+56(FP), CX	// carry = r
	XORQ BX, BX		// i = 0

	CMPQ R11, $4
	JL check		// fewer than 4 words: per-word loop only

quad:	// i+4 <= n: four words per iteration
	MOVQ 0(SI)(BX*8), AX
	MULQ R9			// DX:AX = x[i]*y
	ADDQ CX, AX
	ADCQ $0, DX		// DX:AX += carry
	MOVQ DX, CX		// carry = high word
	MOVQ AX, 0(DI)(BX*8)	// z[i] = low word

	MOVQ 8(SI)(BX*8), AX
	MULQ R9
	ADDQ CX, AX
	ADCQ $0, DX
	MOVQ DX, CX
	MOVQ AX, 8(DI)(BX*8)

	MOVQ 16(SI)(BX*8), AX
	MULQ R9
	ADDQ CX, AX
	ADCQ $0, DX
	MOVQ DX, CX
	MOVQ AX, 16(DI)(BX*8)

	MOVQ 24(SI)(BX*8), AX
	MULQ R9
	ADDQ CX, AX
	ADCQ $0, DX
	MOVQ DX, CX
	MOVQ AX, 24(DI)(BX*8)

	ADDQ $4, BX		// i += 4

	LEAQ 4(BX), DX		// DX = i+4 (DX is free again here)
	CMPQ DX, R11
	JLE quad		// another full group of 4 fits

check:
	CMPQ BX, R11
	JGE done		// i >= n: no leftover words

single:	// one word per iteration
	MOVQ (SI)(BX*8), AX
	MULQ R9			// DX:AX = x[i]*y
	ADDQ CX, AX
	ADCQ $0, DX		// DX:AX += carry
	MOVQ DX, CX		// carry = high word
	MOVQ AX, (DI)(BX*8)	// z[i] = low word
	ADDQ $1, BX		// i++
	CMPQ BX, R11
	JL single		// i < n

done:
	MOVQ CX, c+64(FP)	// return carry
	RET
367
368
// func addMulVVW(z, x []Word, y Word) (c Word)
//
// Multiply-accumulate of a vector by a word:
//	z += x*y
// For each i in [0, len(z)) the value x[i]*y + z[i] + carry is formed;
// its low word is stored back to z[i] and its high word becomes the next
// carry. The final carry is returned in c.
//
// Two implementations are selected at run time by the byte ·support_adx:
//   - the generic path uses MULQ and processes two words per iteration,
//     then finishes any odd word;
//   - the ADX path uses MULXQ with the ADCXQ (CF) and ADOXQ (OF) carry
//     chains, processes eight words per iteration, then finishes the
//     remaining 0..7 words one at a time.
//
// Registers common to both paths:
//	R8, R10 = base addresses of x, z
//	R11     = n = len(z)
//	BX      = i, the current word index
//	CX      = carry word
TEXT ·addMulVVW(SB),NOSPLIT,$0
	MOVQ z+0(FP), R10
	MOVQ x+24(FP), R8
	MOVQ z_len+8(FP), R11
	XORQ BX, BX		// i = 0
	XORQ CX, CX		// carry = 0
	CMPB ·support_adx(SB), $1
	JEQ adx

	// Generic path: R9 = y, R12 = n rounded down to a multiple of 2.
	MOVQ y+48(FP), R9
	MOVQ R11, R12
	ANDQ $-2, R12
	CMPQ R11, $2
	JCS check		// n < 2 (unsigned): no pair to process

pair:	// two words per iteration
	MOVQ (R8)(BX*8), AX
	MULQ R9			// DX:AX = x[i]*y
	ADDQ (R10)(BX*8), AX
	ADCQ $0, DX		// DX:AX += z[i]
	ADDQ CX, AX
	ADCQ $0, DX		// DX:AX += carry
	MOVQ AX, (R10)(BX*8)	// z[i] = low word
	MOVQ DX, CX		// carry = high word

	MOVQ 8(R8)(BX*8), AX
	MULQ R9			// DX:AX = x[i+1]*y
	ADDQ 8(R10)(BX*8), AX
	ADCQ $0, DX
	ADDQ CX, AX
	ADCQ $0, DX
	MOVQ AX, 8(R10)(BX*8)	// z[i+1] = low word
	MOVQ DX, CX		// carry = high word

	ADDQ $2, BX		// i += 2
	CMPQ BX, R12
	JL pair

check:
	CMPQ BX, R11
	JGE done		// i >= n: no leftover word

single:	// one word per iteration
	MOVQ (R8)(BX*8), AX
	MULQ R9			// DX:AX = x[i]*y
	ADDQ CX, AX
	ADCQ $0, DX		// DX:AX += carry
	ADDQ AX, (R10)(BX*8)	// z[i] += low word
	ADCQ $0, DX		// fold that addition's carry into the high word
	MOVQ DX, CX		// carry = high word
	ADDQ $1, BX		// i++
	CMPQ BX, R11
	JL single		// i < n

done:
	MOVQ CX, c+56(FP)	// return carry
	RET

adx:
	// ADX path: DX = y (implicit MULXQ operand), R13 = n rounded down
	// to a multiple of 8. R8/R10 are advanced as running pointers inside
	// the 8x loop and reloaded from the frame afterwards.
	MOVQ y+48(FP), DX
	CMPQ R11, $8
	JCS adx_check		// n < 8 (unsigned): skip the 8x loop

	MOVQ R11, R13
	ANDQ $-8, R13

adx_loop:
	// SI/DI and AX/CX alternate as the (low, high) halves of successive
	// products. ADCXQ chains each previous high half in through CF while
	// ADOXQ chains the z[] additions through OF.
	XORQ R9, R9		// R9 = 0, and clears both CF and OF
	MULXQ (R8), SI, DI
	ADCXQ CX, SI
	ADOXQ (R10), SI
	MOVQ SI, (R10)

	MULXQ 8(R8), AX, CX
	ADCXQ DI, AX
	ADOXQ 8(R10), AX
	MOVQ AX, 8(R10)

	MULXQ 16(R8), SI, DI
	ADCXQ CX, SI
	ADOXQ 16(R10), SI
	MOVQ SI, 16(R10)

	MULXQ 24(R8), AX, CX
	ADCXQ DI, AX
	ADOXQ 24(R10), AX
	MOVQ AX, 24(R10)

	MULXQ 32(R8), SI, DI
	ADCXQ CX, SI
	ADOXQ 32(R10), SI
	MOVQ SI, 32(R10)

	MULXQ 40(R8), AX, CX
	ADCXQ DI, AX
	ADOXQ 40(R10), AX
	MOVQ AX, 40(R10)

	MULXQ 48(R8), SI, DI
	ADCXQ CX, SI
	ADOXQ 48(R10), SI
	MOVQ SI, 48(R10)

	MULXQ 56(R8), AX, CX
	ADCXQ DI, AX
	ADOXQ 56(R10), AX
	MOVQ AX, 56(R10)

	// Fold the two pending carry bits into the last high half (R9 is 0).
	ADCXQ R9, CX
	ADOXQ R9, CX

	ADDQ $64, R8		// advance x by 8 words
	ADDQ $64, R10		// advance z by 8 words
	ADDQ $8, BX		// i += 8

	CMPQ BX, R13
	JL adx_loop

	// The leftover loop indexes from the original bases with BX.
	MOVQ z+0(FP), R10
	MOVQ x+24(FP), R8

adx_check:
	CMPQ BX, R11
	JGE adx_done		// i >= n: no leftover words

adx_short:	// one word per iteration
	MULXQ (R8)(BX*8), SI, DI	// DI:SI = x[i]*y
	ADDQ CX, SI
	ADCQ $0, DI		// DI:SI += carry
	ADDQ SI, (R10)(BX*8)	// z[i] += low word
	ADCQ $0, DI		// fold that addition's carry into the high word
	MOVQ DI, CX		// carry = high word
	ADDQ $1, BX		// i++

	CMPQ BX, R11
	JL adx_short		// i < n

adx_done:
	MOVQ CX, c+56(FP)	// return carry
	RET
513
514
515
516