1// Copyright 2014 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5//go:build ppc64 || ppc64le
6
7#include "textflag.h"
8
9// See memclrNoHeapPointers Go doc for important implementation constraints.
10
// func memclrNoHeapPointers(ptr unsafe.Pointer, n uintptr)
//
// Stores zero to the n bytes starting at ptr, picking a strategy by size:
//   - n < 8:    4/2/1-byte tail stores (a single STXVL on power10).
//   - n < 512:  64-byte vector-store loop, then 32/16/8-byte steps and the tail.
//   - n >= 512: advance ptr to a 128-byte boundary, clear with DCBZ (each DCBZ
//               is treated here as clearing 128 bytes), then finish the
//               leftovers through the smaller-size paths.
//
// R3 is advanced past the bytes already cleared and R4 holds the count still
// to clear (it is brought up to date only after the 512-byte DCBZ loop).
// R0 is used both as the zero value being stored and as a zero index
// register, so this code relies on R0 holding 0.
TEXT runtime·memclrNoHeapPointers<ABIInternal>(SB), NOSPLIT|NOFRAME, $0-16
	// R3 = ptr
	// R4 = n

	// Determine if there are doublewords to clear
check:
	ANDCC $7, R4, R5  // R5: leftover bytes to clear (recomputed at nozerolarge before it is read)
	SRD   $3, R4, R6  // R6: double words to clear
	CMP   R6, $0, CR1 // CR1[EQ] set if no double words

	BC    12, 6, nozerolarge // branch if CR1[EQ]: n < 8, only single bytes
	CMP   R4, $512
	BLT   under512           // special case for < 512
	ANDCC $127, R3, R8       // check for 128 alignment of address
	BEQ   zero512setup       // already 128 byte aligned, go straight to the dcbz setup

	ANDCC $7, R3, R15
	BEQ   zero512xsetup // at least 8 byte aligned

	// zero bytes up to 8 byte alignment
	// Each step tests one low bit of the address and, if set, stores that
	// many zero bytes so that the bit is clear afterwards.

	ANDCC $1, R3, R15 // check for byte alignment
	BEQ   byte2
	MOVB  R0, 0(R3)   // zero 1 byte
	ADD   $1, R3      // bump ptr by 1
	ADD   $-1, R4     // one byte fewer to clear

byte2:
	ANDCC $2, R3, R15 // check for 2 byte alignment
	BEQ   byte4
	MOVH  R0, 0(R3)   // zero 2 bytes
	ADD   $2, R3      // bump ptr by 2
	ADD   $-2, R4     // two bytes fewer to clear

byte4:
	ANDCC $4, R3, R15   // check for 4 byte alignment
	BEQ   zero512xsetup
	MOVW  R0, 0(R3)     // zero 4 bytes
	ADD   $4, R3        // bump ptr by 4
	ADD   $-4, R4       // four bytes fewer to clear
	BR    zero512xsetup // ptr should now be 8 byte aligned

	// 8 <= n < 512: no alignment work, clear with 16-byte vector stores.
under512:
	SRDCC $3, R6, R7  // R7 = n/64 (R6 is n/8): 64 byte chunks?
	XXLXOR VS32, VS32, VS32 // clear VS32 (V0)
	BEQ   lt64gt8     // condition comes from the SRDCC above: no 64 byte chunk

	// Prepare to clear 64 bytes at a time.
	// Also entered from "smaller" with R7 already holding the chunk count
	// and VS32 already cleared.

zero64setup:
	DCBTST (R3)             // prepare data cache
	MOVD   R7, CTR          // number of 64 byte chunks
	MOVD   $16, R8          // R8, R16, R17: byte offsets of the 2nd..4th 16 byte store
	MOVD   $32, R16
	MOVD   $48, R17

zero64:
	STXVD2X VS32, (R3+R0)   // store 16 bytes
	STXVD2X VS32, (R3+R8)
	STXVD2X VS32, (R3+R16)
	STXVD2X VS32, (R3+R17)
	ADD     $64, R3         // advance ptr past the chunk
	ADD     $-64, R4        // 64 bytes fewer to clear
	BDNZ    zero64          // dec ctr, br zero64 if ctr not 0
	SRDCC   $3, R4, R6	// remaining doublewords
	BEQ     nozerolarge     // fewer than 8 bytes left, only the tail remains

	// Fewer than 64 bytes remain in R4 here; peel off 32, then 16, then 8.
lt64gt8:
	CMP	R4, $32
	BLT	lt32gt8
	MOVD	$16, R8         // offset of the second 16 byte store
	STXVD2X	VS32, (R3+R0)
	STXVD2X	VS32, (R3+R8)
	ADD	$-32, R4
	ADD	$32, R3
lt32gt8:
	CMP	R4, $16
	BLT	lt16gt8
	STXVD2X	VS32, (R3+R0)   // clear 16 bytes
	ADD	$16, R3
	ADD	$-16, R4
lt16gt8:
#ifdef GOPPC64_power10
	// Fewer than 16 bytes remain: clear them all with one variable-length
	// store. STXVL takes its byte count from the top byte of R7, hence the
	// shift left by 56.
	SLD	$56, R4, R7
	STXVL   V0, R3, R7
	RET
#else
	CMP	R4, $8
	BLT	nozerolarge
	MOVD	R0, 0(R3)       // clear 8 bytes
	ADD	$8, R3
	ADD	$-8, R4
#endif
	// Tail: clear the final (R4 & 7) bytes, if any.
nozerolarge:
	ANDCC $7, R4, R5 // any remaining bytes
	BC    4, 1, LR   // ble lr: return if none remain
#ifdef GOPPC64_power10
	// VS32 is cleared again here because the n < 8 entry path arrives
	// without having cleared it.
	XXLXOR  VS32, VS32, VS32 // clear VS32 (V0)
	SLD	$56, R5, R7      // byte count into the top byte for STXVL
	STXVL   V0, R3, R7
	RET
#else
	// R5 is 1..7 here: store 4, then 2, then 1 byte as needed.
	CMP   R5, $4
	BLT   next2
	MOVW  R0, 0(R3)      // clear 4 bytes
	ADD   $4, R3
	ADD   $-4, R5
next2:
	CMP   R5, $2
	BLT   next1
	MOVH  R0, 0(R3)      // clear 2 bytes
	ADD   $2, R3
	ADD   $-2, R5
next1:
	CMP   R5, $0
	BC    12, 2, LR      // beqlr
	MOVB  R0, 0(R3)      // clear the last byte
	RET
#endif

	// n >= 512 and ptr is 8 byte aligned but not 128 byte aligned:
	// first bring ptr up to a 16 byte boundary.
zero512xsetup:  // 512 chunk with extra needed
	ANDCC $8, R3, R11    // 8 byte alignment?
	BEQ   zero512setup16 // bit 3 clear: already 16 byte aligned
	MOVD  R0, 0(R3)      // clear 8 bytes
	ADD   $8, R3         // update ptr to next 8
	ADD   $-8, R4        // dec count by 8

zero512setup16:
	ANDCC $127, R3, R14 // R14 = offset of ptr within its 128 byte block
	BEQ   zero512setup  // handle 128 byte alignment
	MOVD  $128, R15
	SUB   R14, R15, R14 // find increment to 128 alignment
	SRD   $4, R14, R15  // number of 16 byte chunks
	MOVD   R15, CTR         // loop counter of 16 bytes
	XXLXOR VS32, VS32, VS32 // clear VS32 (V0)

zero512preloop:  // clear up to 128 alignment
	STXVD2X VS32, (R3+R0)         // clear 16 bytes
	ADD     $16, R3               // update ptr
	ADD     $-16, R4              // dec count
	BDNZ    zero512preloop

	// ptr is 128 byte aligned from here on. The alignment steps above may
	// have reduced R4, so the size is checked again.
zero512setup:  // setup for dcbz loop
	CMP  R4, $512   // check if at least 512
	BLT  remain
	SRD  $9, R4, R8 // loop count for 512 chunks
	MOVD R8, CTR    // set up counter
	MOVD $128, R9   // index regs for 128 bytes
	MOVD $256, R10
	MOVD $384, R11
	PCALIGN $16
zero512:
	DCBZ (R3+R0)        // clear first chunk
	DCBZ (R3+R9)        // clear second chunk
	DCBZ (R3+R10)       // clear third chunk
	DCBZ (R3+R11)       // clear fourth chunk
	ADD  $512, R3       // advance ptr; R4 is not touched inside the loop
	BDNZ zero512
	ANDCC $511, R4      // R4 = bytes left after all the 512 byte chunks

remain:
	CMP  R4, $128  // check if 128 byte chunks left
	BLT  smaller
	DCBZ (R3+R0)   // clear 128
	ADD  $128, R3
	ADD  $-128, R4
	BR   remain

	// Fewer than 128 bytes remain; hand them to the vector-store paths.
smaller:
	ANDCC $127, R4, R7 // find leftovers
	BEQ   done
	CMP   R7, $64      // more than 64, do 64 at a time
	XXLXOR VS32, VS32, VS32 // clear VS32 (V0) for the vector stores that follow
	BLT   lt64gt8	   // less than 64 (condition from the CMP above)
	SRD   $6, R7, R7   // set up counter for 64
	BR    zero64setup

done:
	RET
191