1// Copyright 2013 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5//go:build !math_big_pure_go && (ppc64 || ppc64le)
6
7#include "textflag.h"
8
9// This file provides fast assembly versions for the elementary
10// arithmetic operations on vectors implemented in arith.go.
11
// func addVV(z, x, y []Word) (c Word)
// z[i] = x[i] + y[i] for all i, carrying
//
// Adds x and y word by word into z, chaining the carry through the CA
// bit, and returns the final carry in c. Only z_len is consulted;
// x and y are read for z_len words each.
//
// Register use:
//   R7  = number of words still to process
//   R8  = x pointer, R9 = y pointer, R10 = z pointer
//   R4  = c (return value)
TEXT ·addVV(SB), NOSPLIT, $0
	MOVD  z_len+8(FP), R7   // R7 = z_len
	MOVD  x+24(FP), R8      // R8 = x[]
	MOVD  y+48(FP), R9      // R9 = y[]
	MOVD  z+0(FP), R10      // R10 = z[]

	// If z_len = 0, we are done
	CMP   R7, $0
	MOVD  R0, R4            // R4 = c = 0
	BEQ   done

	// Process the first iteration out of the loop so we can
	// use MOVDU and avoid 3 index registers updates.
	MOVD  0(R8), R11      // R11 = x[i]
	MOVD  0(R9), R12      // R12 = y[i]
	ADD   $-1, R7         // R7 = z_len - 1
	ADDC  R12, R11, R15   // R15 = x[i] + y[i], set CA
	CMP   R7, $0
	MOVD  R15, 0(R10)     // z[i]
	BEQ   final          // If z_len was 1, we are done

	SRD   $2, R7, R5      // R5 = (z_len-1)/4, the number of full 4-word groups left
	CMP   R5, $0
	MOVD  R5, CTR         // Set up loop counter
	BEQ   tail            // If R5 = 0, we can't use the loop

	// Process 4 elements per iteration. Unrolling this loop
	// means a performance trade-off: we will lose performance
	// for small values of z_len (0.90x in the worst case), but
	// gain significant performance as z_len increases (up to
	// 1.45x).

	PCALIGN $16
loop:
	// The pointers lag one word behind the element being processed,
	// hence the +8 bias on every offset; the MOVDU forms advance each
	// pointer by 32 bytes once per iteration.
	MOVD  8(R8), R11      // R11 = x[i]
	MOVD  16(R8), R12     // R12 = x[i+1]
	MOVD  24(R8), R14     // R14 = x[i+2]
	MOVDU 32(R8), R15     // R15 = x[i+3]
	MOVD  8(R9), R16      // R16 = y[i]
	MOVD  16(R9), R17     // R17 = y[i+1]
	MOVD  24(R9), R18     // R18 = y[i+2]
	MOVDU 32(R9), R19     // R19 = y[i+3]
	ADDE  R11, R16, R20   // R20 = x[i] + y[i] + CA
	ADDE  R12, R17, R21   // R21 = x[i+1] + y[i+1] + CA
	ADDE  R14, R18, R22   // R22 = x[i+2] + y[i+2] + CA
	ADDE  R15, R19, R23   // R23 = x[i+3] + y[i+3] + CA
	MOVD  R20, 8(R10)     // z[i]
	MOVD  R21, 16(R10)    // z[i+1]
	MOVD  R22, 24(R10)    // z[i+2]
	MOVDU R23, 32(R10)    // z[i+3]
	ADD   $-4, R7         // R7 = remaining words - 4
	BDNZ  loop

	// We may have more elements to read
	CMP   R7, $0
	BEQ   final

	// Process the remaining elements, one at a time.
	// At most three words remain here, so the third step needs
	// no counter update or exit test.
tail:
	MOVDU 8(R8), R11      // R11 = x[i]
	MOVDU 8(R9), R16      // R16 = y[i]
	ADD   $-1, R7         // R7 = remaining words - 1
	ADDE  R11, R16, R20   // R20 = x[i] + y[i] + CA
	CMP   R7, $0
	MOVDU R20, 8(R10)     // z[i]
	BEQ   final           // If R7 = 0, we are done

	MOVDU 8(R8), R11      // second leftover word
	MOVDU 8(R9), R16
	ADD   $-1, R7
	ADDE  R11, R16, R20
	CMP   R7, $0
	MOVDU R20, 8(R10)
	BEQ   final

	MOVD  8(R8), R11      // third (last possible) leftover word
	MOVD  8(R9), R16
	ADDE  R11, R16, R20
	MOVD  R20, 8(R10)

final:
	ADDZE R4              // Capture CA: R4 = 0 + CA

done:
	MOVD  R4, c+72(FP)
	RET
100
// func subVV(z, x, y []Word) (c Word)
// z[i] = x[i] - y[i] for all i, carrying
//
// Subtracts y from x word by word into z, chaining the borrow through
// the CA bit, and returns the final borrow in c. For the subtract
// instructions used here CA holds the inverse of the borrow
// (CA = 1 means "no borrow"), which is why the result is flipped with
// XOR $1 before being returned. Only z_len is consulted.
//
// Register use:
//   R7  = number of words still to process
//   R8  = x pointer, R9 = y pointer, R10 = z pointer
//   R4  = c (return value)
TEXT ·subVV(SB), NOSPLIT, $0
	MOVD  z_len+8(FP), R7 // R7 = z_len
	MOVD  x+24(FP), R8    // R8 = x[]
	MOVD  y+48(FP), R9    // R9 = y[]
	MOVD  z+0(FP), R10    // R10 = z[]

	// If z_len = 0, we are done
	CMP   R7, $0
	MOVD  R0, R4          // R4 = c = 0
	BEQ   done

	// Process the first iteration out of the loop so we can
	// use MOVDU and avoid 3 index registers updates.
	MOVD  0(R8), R11      // R11 = x[i]
	MOVD  0(R9), R12      // R12 = y[i]
	ADD   $-1, R7         // R7 = z_len - 1
	SUBC  R12, R11, R15   // R15 = x[i] - y[i], set CA
	CMP   R7, $0
	MOVD  R15, 0(R10)     // z[i]
	BEQ   final           // If z_len was 1, we are done

	SRD   $2, R7, R5      // R5 = (z_len-1)/4, the number of full 4-word groups left
	CMP   R5, $0
	MOVD  R5, CTR         // Set up loop counter
	BEQ   tail            // If R5 = 0, we can't use the loop

	// Process 4 elements per iteration. Unrolling this loop
	// means a performance trade-off: we will lose performance
	// for small values of z_len (0.92x in the worst case), but
	// gain significant performance as z_len increases (up to
	// 1.45x).

	PCALIGN $16
loop:
	// The pointers lag one word behind the element being processed,
	// hence the +8 bias on every offset; the MOVDU forms advance each
	// pointer by 32 bytes once per iteration.
	MOVD  8(R8), R11      // R11 = x[i]
	MOVD  16(R8), R12     // R12 = x[i+1]
	MOVD  24(R8), R14     // R14 = x[i+2]
	MOVDU 32(R8), R15     // R15 = x[i+3]
	MOVD  8(R9), R16      // R16 = y[i]
	MOVD  16(R9), R17     // R17 = y[i+1]
	MOVD  24(R9), R18     // R18 = y[i+2]
	MOVDU 32(R9), R19     // R19 = y[i+3]
	SUBE  R16, R11, R20   // R20 = x[i] - y[i] - borrow (CA = !borrow)
	SUBE  R17, R12, R21   // R21 = x[i+1] - y[i+1] - borrow
	SUBE  R18, R14, R22   // R22 = x[i+2] - y[i+2] - borrow
	SUBE  R19, R15, R23   // R23 = x[i+3] - y[i+3] - borrow
	MOVD  R20, 8(R10)     // z[i]
	MOVD  R21, 16(R10)    // z[i+1]
	MOVD  R22, 24(R10)    // z[i+2]
	MOVDU R23, 32(R10)    // z[i+3]
	ADD   $-4, R7         // R7 = remaining words - 4
	BDNZ  loop

	// We may have more elements to read
	CMP   R7, $0
	BEQ   final

	// Process the remaining elements, one at a time.
	// At most three words remain here, so the third step needs
	// no counter update or exit test.
tail:
	MOVDU 8(R8), R11      // R11 = x[i]
	MOVDU 8(R9), R16      // R16 = y[i]
	ADD   $-1, R7         // R7 = remaining words - 1
	SUBE  R16, R11, R20   // R20 = x[i] - y[i] - borrow
	CMP   R7, $0
	MOVDU R20, 8(R10)     // z[i]
	BEQ   final           // If R7 = 0, we are done

	MOVDU 8(R8), R11      // second leftover word
	MOVDU 8(R9), R16
	ADD   $-1, R7
	SUBE  R16, R11, R20
	CMP   R7, $0
	MOVDU R20, 8(R10)
	BEQ   final

	MOVD  8(R8), R11      // third (last possible) leftover word
	MOVD  8(R9), R16
	SUBE  R16, R11, R20
	MOVD  R20, 8(R10)

final:
	ADDZE R4              // R4 = 0 + CA
	XOR   $1, R4          // c = !CA, i.e. 1 when the subtraction borrowed

done:
	MOVD  R4, c+72(FP)
	RET
190
// func addVW(z, x []Word, y Word) (c Word)
//
// Adds the single word y to the vector x, writing the result to z and
// returning the final carry in c. When z_len is 0 the value of y is
// returned unchanged as c. Only z_len is consulted.
//
// Register use:
//   R10 = z pointer, R8 = x pointer
//   R11 = number of words still to process
//   R4  = y on entry, c on exit
TEXT ·addVW(SB), NOSPLIT, $0
	MOVD z+0(FP), R10	// R10 = z[]
	MOVD x+24(FP), R8	// R8 = x[]
	MOVD y+48(FP), R4	// R4 = y = c
	MOVD z_len+8(FP), R11	// R11 = z_len

	CMP   R11, $0		// If z_len is zero, return
	BEQ   done

	// We will process the first iteration out of the loop so we capture
	// the value of c. In the subsequent iterations, we will rely on the
	// value of CA set here.
	MOVD  0(R8), R20	// R20 = x[i]
	ADD   $-1, R11		// R11 = z_len - 1
	ADDC  R20, R4, R6	// R6 = x[i] + c
	CMP   R11, $0		// If z_len was 1, we are done
	MOVD  R6, 0(R10)	// z[i]
	BEQ   final

	// We will read 4 elements per iteration
	SRDCC $2, R11, R9	// R9 = (z_len-1)/4; the CC form also sets CR0 for the BEQ below
	DCBT  (R8)		// cache-touch hint for the x data
	MOVD  R9, CTR		// Set up the loop counter
	BEQ   tail		// If R9 = 0, we can't use the loop
	PCALIGN $16

loop:
	// The pointers lag one word behind the element being processed,
	// hence the +8 bias; MOVDU advances each pointer by 32 bytes.
	MOVD  8(R8), R20	// R20 = x[i]
	MOVD  16(R8), R21	// R21 = x[i+1]
	MOVD  24(R8), R22	// R22 = x[i+2]
	MOVDU 32(R8), R23	// R23 = x[i+3]
	ADDZE R20, R24		// R24 = x[i] + CA
	ADDZE R21, R25		// R25 = x[i+1] + CA
	ADDZE R22, R26		// R26 = x[i+2] + CA
	ADDZE R23, R27		// R27 = x[i+3] + CA
	MOVD  R24, 8(R10)	// z[i]
	MOVD  R25, 16(R10)	// z[i+1]
	MOVD  R26, 24(R10)	// z[i+2]
	MOVDU R27, 32(R10)	// z[i+3]
	ADD   $-4, R11		// R11 = remaining words - 4
	BDNZ  loop

	// We may have some elements to read
	CMP R11, $0
	BEQ final

	// At most three words remain; handle them one at a time. The third
	// step needs no counter update or exit test.
tail:
	MOVDU 8(R8), R20	// R20 = x[i]
	ADDZE R20, R24		// R24 = x[i] + CA
	ADD $-1, R11
	MOVDU R24, 8(R10)	// z[i]
	CMP R11, $0
	BEQ final

	MOVDU 8(R8), R20	// second leftover word
	ADDZE R20, R24
	ADD $-1, R11
	MOVDU R24, 8(R10)
	CMP R11, $0
	BEQ final

	MOVD 8(R8), R20		// third (last possible) leftover word
	ADDZE R20, R24
	MOVD R24, 8(R10)

final:
	ADDZE R0, R4		// c = CA
done:
	MOVD  R4, c+56(FP)
	RET
262
// func subVW(z, x []Word, y Word) (c Word)
//
// Subtracts the single word y from the vector x, writing the result to
// z and returning the final borrow in c. When z_len is 0 the value of
// y is returned unchanged as c. Only z_len is consulted.
//
// For the subtract instructions used here CA holds the inverse of the
// borrow (CA = 1 means "no borrow").
//
// Register use:
//   R10 = z pointer, R8 = x pointer
//   R11 = number of words still to process
//   R4  = y on entry, c on exit
TEXT ·subVW(SB), NOSPLIT, $0
	MOVD  z+0(FP), R10	// R10 = z[]
	MOVD  x+24(FP), R8	// R8 = x[]
	MOVD  y+48(FP), R4	// R4 = y = c
	MOVD  z_len+8(FP), R11	// R11 = z_len

	CMP   R11, $0		// If z_len is zero, return
	BEQ   done

	// We will process the first iteration out of the loop so we capture
	// the value of c. In the subsequent iterations, we will rely on the
	// value of CA set here.
	MOVD  0(R8), R20	// R20 = x[i]
	ADD   $-1, R11		// R11 = z_len - 1
	SUBC  R4, R20, R6	// R6 = x[i] - c
	CMP   R11, $0		// If z_len was 1, we are done
	MOVD  R6, 0(R10)	// z[i]
	BEQ   final

	// We will read 4 elements per iteration
	SRDCC $2, R11, R9	// R9 = (z_len-1)/4; the CC form also sets CR0 for the BEQ below
	DCBT  (R8)		// cache-touch hint for the x data
	MOVD  R9, CTR		// Set up the loop counter
	BEQ   tail		// If R9 = 0, we can't use the loop

	// The loop here is almost the same as the one used in s390x, but
	// we don't need to capture CA every iteration because we've already
	// done that above.

	PCALIGN $16
loop:
	// The pointers lag one word behind the element being processed,
	// hence the +8 bias; MOVDU advances each pointer by 32 bytes.
	MOVD  8(R8), R20	// R20 = x[i]
	MOVD  16(R8), R21	// R21 = x[i+1]
	MOVD  24(R8), R22	// R22 = x[i+2]
	MOVDU 32(R8), R23	// R23 = x[i+3]
	SUBE  R0, R20		// R20 = x[i] - 0 - borrow (CA = !borrow)
	SUBE  R0, R21		// R21 = x[i+1] - borrow
	SUBE  R0, R22		// R22 = x[i+2] - borrow
	SUBE  R0, R23		// R23 = x[i+3] - borrow
	MOVD  R20, 8(R10)	// z[i]
	MOVD  R21, 16(R10)	// z[i+1]
	MOVD  R22, 24(R10)	// z[i+2]
	MOVDU R23, 32(R10)	// z[i+3]
	ADD   $-4, R11		// R11 = remaining words - 4
	BDNZ  loop

	// We may have some elements to read
	CMP   R11, $0
	BEQ   final

	// At most three words remain; handle them one at a time. The third
	// step needs no counter update or exit test.
tail:
	MOVDU 8(R8), R20	// R20 = x[i]
	SUBE  R0, R20		// R20 = x[i] - borrow
	ADD   $-1, R11
	MOVDU R20, 8(R10)	// z[i]
	CMP   R11, $0
	BEQ   final

	MOVDU 8(R8), R20	// second leftover word
	SUBE  R0, R20
	ADD   $-1, R11
	MOVDU R20, 8(R10)
	CMP   R11, $0
	BEQ   final

	MOVD  8(R8), R20	// third (last possible) leftover word
	SUBE  R0, R20
	MOVD  R20, 8(R10)

final:
	// Capture CA
	SUBE  R4, R4		// R4 = R4 - R4 - borrow = -borrow (0 or -1)
	NEG   R4, R4		// c = borrow (0 or 1)

done:
	MOVD  R4, c+56(FP)
	RET
341
// func shlVU(z, x []Word, s uint) (c Word)
//
// Shifts the vector x left by s bits into z, processing len(z) words
// from the most significant word down, and returns in c the bits
// shifted out of the top word: c = x[len(z)-1] >> (64-s).
//
// When s == 0 the routine degenerates into copy(z, x): it copies
// min(len(x), len(z)) words (choosing the copy direction so that
// overlapping slices are handled) and returns c = 0.
//
// The caller must ensure s < 64.
TEXT ·shlVU(SB), NOSPLIT, $0
	MOVD    z+0(FP), R3
	MOVD    x+24(FP), R6
	MOVD    s+48(FP), R9
	MOVD    z_len+8(FP), R4
	MOVD    x_len+32(FP), R7
	CMP     R9, $0          // s==0 copy(z,x)
	BEQ     zeroshift
	CMP     R4, $0          // len(z)==0 return
	BEQ     done

	ADD     $-1, R4, R5     // len(z)-1
	SUBC    R9, $64, R4     // ŝ=_W-s, we skip & by _W-1 as the caller ensures s < _W(64)
	SLD     $3, R5, R7      // byte offset of the last word
	ADD     R6, R7, R15     // save starting address &x[len(z)-1]
	ADD     R3, R7, R16     // save starting address &z[len(z)-1]
	MOVD    (R6)(R7), R14
	SRD     R4, R14, R7     // compute x[len(z)-1]>>ŝ into R7
	CMP     R5, $0          // iterate from i=len(z)-1 to 0
	BEQ     loopexit        // Already at end?
	MOVD	0(R15),R10	// x[i]
	PCALIGN $16
shloop:
	SLD     R9, R10, R10    // x[i]<<s
	MOVDU   -8(R15), R14    // R14 = x[i-1], R15 now points at x[i-1]
	SRD     R4, R14, R11    // x[i-1]>>ŝ
	OR      R11, R10, R10
	MOVD    R10, 0(R16)     // z[i]=x[i]<<s | x[i-1]>>ŝ
	MOVD	R14, R10	// reuse x[i-1] for next iteration
	ADD     $-8, R16        // i--
	CMP     R15, R6         // &x[i-1]>&x[0]?
	BGT     shloop
loopexit:
	MOVD    0(R6), R4
	SLD     R9, R4, R4
	MOVD    R4, 0(R3)       // z[0]=x[0]<<s
	MOVD    R7, c+56(FP)    // store pre-computed x[len(z)-1]>>ŝ into c
	RET

zeroshift:
	CMP     R6, $0          // x is null, nothing to copy
	BEQ     done
	CMP     R6, R3          // if x is same as z, nothing to copy
	BEQ     done
	CMP     R7, R4
	ISEL    $0, R7, R4, R7  // Take the lower bound of lengths of x,z
	SLD     $3, R7, R7      // R7 = number of bytes to copy
	CMP     R7, $0          // Nothing to copy? The loops below test their
	BEQ     done            // bound only after the first word is moved.
	SUB     R6, R3, R11     // dest - src
	CMPU    R11, R7, CR2    // < len?
	BLT     CR2, backward   // there is overlap, copy backwards
	MOVD    $0, R14
	// shlVU processes backwards, but added a forward copy option
	// since its faster on POWER
repeat:
	MOVD    (R6)(R14), R15  // Copy 8 bytes at a time
	MOVD    R15, (R3)(R14)
	ADD     $8, R14
	CMP     R14, R7         // More 8 bytes left?
	BLT     repeat
	BR      done
backward:
	ADD     $-8,R7, R14     // start at the last word
repeatback:
	MOVD    (R6)(R14), R15  // copy x into z backwards
	MOVD    R15, (R3)(R14)  // copy 8 bytes at a time
	SUB     $8, R14
	CMP     R14, $-8        // More 8 bytes left?
	BGT     repeatback

done:
	MOVD    R0, c+56(FP)    // c=0
	RET
415
// func shrVU(z, x []Word, s uint) (c Word)
//
// Shifts the vector x right by s bits into z, processing len(z) words
// from the least significant word up, and returns in c the bits
// shifted out of the bottom word: c = x[0] << (64-s).
//
// When s == 0 the routine degenerates into a forward copy of
// min(len(x), len(z)) words from x to z and returns c = 0.
//
// The caller must ensure s < 64. For len(z) >= 3 the bulk of the work
// is done two words at a time with VSX/VMX instructions (V0-V7).
TEXT ·shrVU(SB), NOSPLIT, $0
	MOVD    z+0(FP), R3
	MOVD    x+24(FP), R6
	MOVD    s+48(FP), R9
	MOVD    z_len+8(FP), R4
	MOVD    x_len+32(FP), R7

	CMP     R9, $0          // s==0, copy(z,x)
	BEQ     zeroshift
	CMP     R4, $0          // len(z)==0 return
	BEQ     done
	SUBC    R9, $64, R5     // ŝ=_W-s, we skip & by _W-1 as the caller ensures s < _W(64)

	MOVD    0(R6), R7
	SLD     R5, R7, R7      // compute x[0]<<ŝ
	MOVD    $1, R8          // iterate from i=1 to i<len(z)
	CMP     R8, R4
	BGE     loopexit        // Already at end?

	// vectorize if len(z) is >=3, else jump to scalar loop
	CMP     R4, $3
	BLT     scalar
	MTVSRD  R9, VS38        // s
	VSPLTB  $7, V6, V4      // V4 = s replicated into every byte
	MTVSRD  R5, VS39        // ŝ
	VSPLTB  $7, V7, V2      // V2 = ŝ replicated into every byte
	ADD     $-2, R4, R16    // R16 = len(z)-2, last i with x[i+1] in range
	PCALIGN $16
loopback:
	ADD     $-1, R8, R10
	SLD     $3, R10         // R10 = byte offset of x[i-1]
	LXVD2X  (R6)(R10), VS32 // load x[i-1], x[i]
	SLD     $3, R8, R12     // R12 = byte offset of x[i]
	LXVD2X  (R6)(R12), VS33 // load x[i], x[i+1]

	VSRD    V0, V4, V3      // x[i-1]>>s, x[i]>>s
	VSLD    V1, V2, V5      // x[i]<<ŝ, x[i+1]<<ŝ
	VOR     V3, V5, V5      // Or(|) the two registers together
	STXVD2X VS37, (R3)(R10) // store into z[i-1] and z[i]
	ADD     $2, R8          // Done processing 2 entries, i and i+1
	CMP     R8, R16         // Are there at least a couple of more entries left?
	BLE     loopback
	CMP     R8, R4          // Are we at the last element?
	BEQ     loopexit
scalar:
	// Exactly one pair remains here: i = len(z)-1.
	ADD     $-1, R8, R10
	SLD     $3, R10
	MOVD    (R6)(R10),R11
	SRD     R9, R11, R11    // x[len(z)-2] >> s
	SLD     $3, R8, R12
	MOVD    (R6)(R12), R12
	SLD     R5, R12, R12    // x[len(z)-1]<<ŝ
	OR      R12, R11, R11   // x[len(z)-2]>>s | x[len(z)-1]<<ŝ
	MOVD    R11, (R3)(R10)  // z[len(z)-2]=x[len(z)-2]>>s | x[len(z)-1]<<ŝ
loopexit:
	ADD     $-1, R4
	SLD     $3, R4          // R4 = byte offset of the last word
	MOVD    (R6)(R4), R5
	SRD     R9, R5, R5      // x[len(z)-1]>>s
	MOVD    R5, (R3)(R4)    // z[len(z)-1]=x[len(z)-1]>>s
	MOVD    R7, c+56(FP)    // store pre-computed x[0]<<ŝ into c
	RET

zeroshift:
	CMP     R6, $0          // x is null, nothing to copy
	BEQ     done
	CMP     R6, R3          // if x is same as z, nothing to copy
	BEQ     done
	CMP     R7, R4
	ISEL    $0, R7, R4, R7  // Take the lower bounds of lengths of x, z
	SLD     $3, R7, R7      // R7 = number of bytes to copy
	CMP     R7, $0          // Nothing to copy? The loop below tests its
	BEQ     done            // bound only after the first word is moved.
	MOVD    $0, R14
repeat:
	MOVD    (R6)(R14), R15  // copy 8 bytes at a time
	MOVD    R15, (R3)(R14)  // shrVU processes bytes only forwards
	ADD     $8, R14
	CMP     R14, R7         // More 8 bytes left?
	BLT     repeat
done:
	MOVD    R0, c+56(FP)
	RET
498
// func mulAddVWW(z, x []Word, y, r Word) (c Word)
//
// Computes z = x*y + r, where x is a vector and y, r are single words,
// and returns the final carry word in c. When z_len is 0 the value of
// r is returned unchanged as c. Only z_len is consulted.
//
// Register use:
//   R10 = z pointer, R8 = x pointer
//   R9  = y
//   R4  = running carry word (r on entry, c on exit)
//   R11 = number of words still to process
TEXT ·mulAddVWW(SB), NOSPLIT, $0
	MOVD    z+0(FP), R10      // R10 = z[]
	MOVD    x+24(FP), R8      // R8 = x[]
	MOVD    y+48(FP), R9      // R9 = y
	MOVD    r+56(FP), R4      // R4 = r = c
	MOVD    z_len+8(FP), R11  // R11 = z_len

	CMP     R11, $0
	BEQ     done

	// Process the first element outside the loop so that the pointers
	// can be advanced with MOVDU afterwards.
	MOVD    0(R8), R20
	ADD     $-1, R11
	MULLD   R9, R20, R6       // R6 = z0 = Low-order(x[i]*y)
	MULHDU  R9, R20, R7       // R7 = z1 = High-order(x[i]*y)
	ADDC    R4, R6            // R6 = z0 + r
	ADDZE   R7, R4            // R4 = z1 + CA
	CMP     R11, $0
	MOVD    R6, 0(R10)        // z[i]
	BEQ     done

	// We will read 4 elements per iteration
	SRDCC   $2, R11, R14      // R14 = (z_len-1)/4; the CC form also sets CR0
	DCBT    (R8)              // cache-touch hint for the x data
	MOVD    R14, CTR          // Set up the loop counter
	BEQ     tail              // If R14 = 0, we can't use the loop
	PCALIGN $16

loop:
	// The multiplies do not touch CA, so they can be interleaved with
	// the ADDC/ADDE chain that threads each high word into the next
	// low word.
	MOVD    8(R8), R20        // R20 = x[i]
	MOVD    16(R8), R21       // R21 = x[i+1]
	MOVD    24(R8), R22       // R22 = x[i+2]
	MOVDU   32(R8), R23       // R23 = x[i+3]
	MULLD   R9, R20, R24      // R24 = z0[i]
	MULHDU  R9, R20, R20      // R20 = z1[i]
	ADDC    R4, R24           // R24 = z0[i] + c
	MULLD   R9, R21, R25      // R25 = z0[i+1]
	MULHDU  R9, R21, R21      // R21 = z1[i+1]
	ADDE    R20, R25          // R25 = z0[i+1] + z1[i] + CA
	MULLD   R9, R22, R26      // R26 = z0[i+2]
	MULHDU  R9, R22, R22      // R22 = z1[i+2]
	MULLD   R9, R23, R27      // R27 = z0[i+3]
	MULHDU  R9, R23, R23      // R23 = z1[i+3]
	ADDE    R21, R26          // R26 = z0[i+2] + z1[i+1] + CA
	MOVD    R24, 8(R10)       // z[i]
	MOVD    R25, 16(R10)      // z[i+1]
	ADDE    R22, R27          // R27 = z0[i+3] + z1[i+2] + CA
	ADDZE   R23, R4           // update carry: c = z1[i+3] + CA
	MOVD    R26, 24(R10)      // z[i+2]
	MOVDU   R27, 32(R10)      // z[i+3]
	ADD     $-4, R11          // R11 = remaining words - 4
	BDNZ    loop

	// We may have some elements to read
	CMP     R11, $0
	BEQ     done

	// Process the remaining elements, one at a time. At most three
	// words remain here, so the third step needs no counter update
	// or exit test.
tail:
	MOVDU   8(R8), R20        // R20 = x[i]
	MULLD   R9, R20, R24      // R24 = z0[i]
	MULHDU  R9, R20, R25      // R25 = z1[i]
	ADD     $-1, R11          // R11 = remaining words - 1
	ADDC    R4, R24           // R24 = z0[i] + c
	ADDZE   R25, R4           // c = z1[i] + CA
	MOVDU   R24, 8(R10)       // z[i]
	CMP     R11, $0
	BEQ     done              // If R11 = 0, we are done

	MOVDU   8(R8), R20        // second leftover word
	MULLD   R9, R20, R24
	MULHDU  R9, R20, R25
	ADD     $-1, R11
	ADDC    R4, R24
	ADDZE   R25, R4
	MOVDU   R24, 8(R10)
	CMP     R11, $0
	BEQ     done

	MOVD    8(R8), R20        // third (last possible) leftover word
	MULLD   R9, R20, R24
	MULHDU  R9, R20, R25
	ADDC    R4, R24
	ADDZE   R25, R4
	MOVD    R24, 8(R10)

done:
	MOVD    R4, c+64(FP)
	RET
589
// func addMulVVW(z, x []Word, y Word) (c Word)
//
// Computes z += x*y, where x is a vector and y is a single word, and
// returns the final carry word in c. Only z_len is consulted.
//
// Each element is handled as:
//   (hi, lo) = x[i]*y
//   lo += z[i];  hi += CA
//   lo += c;     c = hi + CA
//   z[i] = lo
//
// Register use:
//   R3 = z pointer, R4 = x pointer, R5 = y
//   R6 = z_len (reduced to z_len%4 after the unrolled loop)
//   R9 = running carry word c
TEXT ·addMulVVW(SB), NOSPLIT, $0
	MOVD	z+0(FP), R3	// R3 = z[]
	MOVD	x+24(FP), R4	// R4 = x[]
	MOVD	y+48(FP), R5	// R5 = y
	MOVD	z_len+8(FP), R6	// R6 = z_len

	CMP	R6, $4
	MOVD	R0, R9		// R9 = c = 0
	BLT	tail		// fewer than 4 words: skip the unrolled loop
	SRD	$2, R6, R7	// R7 = z_len/4
	MOVD	R7, CTR		// Initialize loop counter
	PCALIGN	$16

loop:
	MOVD	0(R4), R14	// x[i]
	MOVD	8(R4), R16	// x[i+1]
	MOVD	16(R4), R18	// x[i+2]
	MOVD	24(R4), R20	// x[i+3]
	MOVD	0(R3), R15	// z[i]
	MOVD	8(R3), R17	// z[i+1]
	MOVD	16(R3), R19	// z[i+2]
	MOVD	24(R3), R21	// z[i+3]
	MULLD	R5, R14, R10	// low x[i]*y
	MULHDU	R5, R14, R11	// high x[i]*y
	ADDC	R15, R10	// low += z[i]
	ADDZE	R11		// high += CA
	ADDC	R9, R10		// low += c
	ADDZE	R11, R9		// c = high + CA
	MULLD	R5, R16, R14	// low x[i+1]*y
	MULHDU	R5, R16, R15	// high x[i+1]*y
	ADDC	R17, R14	// low += z[i+1]
	ADDZE	R15		// high += CA
	ADDC	R9, R14		// low += c
	ADDZE	R15, R9		// c = high + CA
	MULLD	R5, R18, R16    // low x[i+2]*y
	MULHDU	R5, R18, R17    // high x[i+2]*y
	ADDC	R19, R16	// low += z[i+2]
	ADDZE	R17		// high += CA
	ADDC	R9, R16		// low += c
	ADDZE	R17, R9		// c = high + CA
	MULLD	R5, R20, R18    // low x[i+3]*y
	MULHDU	R5, R20, R19    // high x[i+3]*y
	ADDC	R21, R18	// low += z[i+3]
	ADDZE	R19		// high += CA
	ADDC	R9, R18		// low += c
	ADDZE	R19, R9		// c = high + CA
	MOVD	R10, 0(R3)	// z[i]
	MOVD	R14, 8(R3)	// z[i+1]
	MOVD	R16, 16(R3)	// z[i+2]
	MOVD	R18, 24(R3)	// z[i+3]
	ADD	$32, R3		// advance z by 4 words
	ADD	$32, R4		// advance x by 4 words
	BDNZ	loop

	ANDCC	$3, R6		// R6 = z_len%4, words left for the tail
tail:
	CMP	R6, $0
	BEQ	done
	MOVD	R6, CTR		// one tail iteration per remaining word
	PCALIGN $16
tailloop:
	MOVD	0(R4), R14	// x[i]
	MOVD	0(R3), R15	// z[i]
	MULLD	R5, R14, R10	// low x[i]*y
	MULHDU	R5, R14, R11	// high x[i]*y
	ADDC	R15, R10	// low += z[i]
	ADDZE	R11		// high += CA
	ADDC	R9, R10		// low += c
	ADDZE	R11, R9		// c = high + CA
	MOVD	R10, 0(R3)	// z[i]
	ADD	$8, R3
	ADD	$8, R4
	BDNZ	tailloop

done:
	MOVD	R9, c+56(FP)
	RET
668
669