1// Copyright 2023 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5//go:build !purego
6
7#include "textflag.h"
8
// func addMulVVW1024(z, x *uint, y uint) (c uint)
//
// Entry point for 1024-bit operands. It only records the operand length in
// 64-bit words and hands over to addMulVVWx, which adds x*y into z and
// stores the final carry word in c.
TEXT ·addMulVVW1024(SB),$0-32
	MOV	$(1024/64), X30	// X30 = 16 words; addMulVVWx reads its count from X30
	JMP	addMulVVWx(SB)	// jump, not call: the worker reads z, x, y and writes c through FP
13
// func addMulVVW1536(z, x *uint, y uint) (c uint)
//
// Entry point for 1536-bit operands. It only records the operand length in
// 64-bit words and hands over to addMulVVWx, which adds x*y into z and
// stores the final carry word in c.
TEXT ·addMulVVW1536(SB),$0-32
	MOV	$(1536/64), X30	// X30 = 24 words; addMulVVWx reads its count from X30
	JMP	addMulVVWx(SB)	// jump, not call: the worker reads z, x, y and writes c through FP
18
// func addMulVVW2048(z, x *uint, y uint) (c uint)
//
// Entry point for 2048-bit operands. It only records the operand length in
// 64-bit words and hands over to addMulVVWx, which adds x*y into z and
// stores the final carry word in c.
TEXT ·addMulVVW2048(SB),$0-32
	MOV	$(2048/64), X30	// X30 = 32 words; addMulVVWx reads its count from X30
	JMP	addMulVVWx(SB)	// jump, not call: the worker reads z, x, y and writes c through FP
23
// addMulVVWx is the shared loop behind the fixed-size addMulVVW entry
// points. It is reached with a JMP rather than a CALL and is declared
// NOFRAME, so the FP-relative references below name the arguments of the
// Go-visible stub that jumped here: z+0(FP), x+8(FP), y+16(FP), and the
// result slot c+24(FP).
//
// On entry X30 holds the number of 64-bit words to process. Each pass of the
// loop consumes four words and the loop exits only when the count reaches
// exactly zero, so the count must be a multiple of four (a count of zero
// returns c = 0 immediately).
//
// Register roles:
//	X5		cursor into z, advanced 32 bytes per pass
//	X7		cursor into x, advanced 32 bytes per pass
//	X6		multiplier y
//	X29		carry word c carried from one word to the next
//	X30		words still to process
//	X21, X22	scratch (partial sum and carry bit)
//	X8-X19		per-word operands, see MULADDC
//
// MULADDC(xw, hi, zw) handles one word: given xw = x[i] and zw = z[i] it
// leaves the low 64 bits of x[i]*y + z[i] + c in zw and the high 64 bits in
// X29 as the carry for the next word. xw and hi are overwritten. Each of the
// two additions is followed by an unsigned "sum < addend" compare to recover
// its carry bit, which is folded into the high word.
#define MULADDC(xw, hi, zw) \
	MULHU	xw, X6, hi; \
	MUL	xw, X6, xw; \
	ADD	xw, zw, X21; \
	SLTU	xw, X21, X22; \
	ADD	hi, X22, hi; \
	ADD	X21, X29, zw; \
	SLTU	X21, zw, X22; \
	ADD	hi, X22, X29

TEXT addMulVVWx(SB),NOFRAME|NOSPLIT,$0
	MOV	z+0(FP), X5	// X5 = z
	MOV	x+8(FP), X7	// X7 = x
	MOV	y+16(FP), X6	// X6 = y
	MOV	$0, X29		// c = 0

	BEQZ	X30, finish	// nothing to do for a zero word count
group:
	// Fetch the next four words of z ...
	MOV	0(X5), X10
	MOV	8(X5), X13
	MOV	16(X5), X16
	MOV	24(X5), X19

	// ... and the matching four words of x.
	MOV	0(X7), X8
	MOV	8(X7), X11
	MOV	16(X7), X14
	MOV	24(X7), X17

	// Multiply-accumulate word by word; X29 threads the carry through.
	MULADDC(X8, X9, X10)	// word 0
	MULADDC(X11, X12, X13)	// word 1
	MULADDC(X14, X15, X16)	// word 2
	MULADDC(X17, X18, X19)	// word 3

	// Write the four updated words back to z.
	MOV	X10, 0(X5)
	MOV	X13, 8(X5)
	MOV	X16, 16(X5)
	MOV	X19, 24(X5)

	// Step both cursors past the four words just handled.
	ADD	$32, X5
	ADD	$32, X7

	SUB	$4, X30
	BNEZ	X30, group

finish:
	MOV	X29, c+24(FP)	// return the final carry word
	RET

#undef MULADDC
92