// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build ppc64 || ppc64le

#include "go_asm.h"
#include "textflag.h"

// ·IndexByte is the register-ABI entry point for searching a byte slice.
// It only rearranges its arguments into the layout indexbytebody<> expects
// and then tail-branches to it; the result is whatever indexbytebody<>
// leaves in R3.
// NOTE(review): presumably declared in Go as
// func IndexByte(b []byte, c byte) int -- confirm against the Go stub.
TEXT ·IndexByte<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
	// R3 = byte array pointer
	// R4 = length
	// R6 = byte to find (R5 is not read here; presumably the slice capacity -- confirm)
	MOVD	R6, R5		// R5 = byte, where indexbytebody<> expects it
	BR	indexbytebody<>(SB)	// Tail branch: indexbytebody<> returns directly to our caller.

// ·IndexByteString is the register-ABI entry point for searching a string.
// Its arguments already sit in the registers indexbytebody<> expects, so it
// tail-branches straight to it; the result is whatever indexbytebody<>
// leaves in R3.
// NOTE(review): presumably declared in Go as
// func IndexByteString(s string, c byte) int -- confirm against the Go stub.
TEXT ·IndexByteString<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-32
	// R3 = string data pointer
	// R4 = length
	// R5 = byte
	BR	indexbytebody<>(SB)	// Tail branch: indexbytebody<> returns directly to our caller.

// indexbytevbperm<> is the 16-byte VBPERMQ control vector that indexbytebody<>
// loads into V0 when GOPPC64_power9 is not defined. Its bytes are the bit
// indices 0x00, 0x08, ..., 0x78, one per byte of the 16-byte compare result,
// so VBPERMQ condenses that result into a 16-bit mask. The two doublewords are
// spelled per GOARCH so the mask comes out in string order on both
// endiannesses.
#ifndef GOPPC64_power9
#ifdef GOARCH_ppc64le
DATA indexbytevbperm<>+0(SB)/8, $0x3830282018100800
DATA indexbytevbperm<>+8(SB)/8, $0x7870686058504840
#else
DATA indexbytevbperm<>+0(SB)/8, $0x0008101820283038
DATA indexbytevbperm<>+8(SB)/8, $0x4048505860687078
#endif
GLOBL indexbytevbperm<>+0(SB), RODATA, $16
#endif

// Some operations are endian specific; choose the correct opcode based on GOARCH.
//
// _LDBEX, _LWBEX and _LHBEX load 8, 4 and 2 bytes in big-endian order, so the
// byte at the lowest address becomes the most significant byte loaded. That is
// a byte-reversed load on ppc64le and a plain load on ppc64.
//
// _VCZBEBB counts the vector bytes that precede the first byte whose least
// significant bit is set, counting from the end that corresponds to the start
// of the string on each endianness.
// Note, _VCZBEBB is only available on power9 and newer.
#ifdef GOARCH_ppc64le
#define _LDBEX	MOVDBR
#define _LWBEX	MOVWBR
#define _LHBEX	MOVHBR
#define _VCZBEBB VCTZLSBB
#else
#define _LDBEX	MOVD
#define _LWBEX	MOVW
#define _LHBEX	MOVH
#define _VCZBEBB VCLZLSBB
#endif

// indexbytebody<> is the shared search routine that ·IndexByte and
// ·IndexByteString branch to.
//
// On entry:
// R3 = addr of string
// R4 = len of string
// R5 = byte to find
// On exit:
// R3 = return value: index of the first byte equal to R5, or -1 if there is none
//
// Dispatch by length:
//   len >= 64   loop64 scans 64 bytes per iteration, then handles a 0 - 63 byte tail
//   32 - 63     cmp32
//   16 - 31     cmp16
//   0 - 15      cmp8 and the labels after it
// Tails are checked by re-reading the last 16 (or 8, or 4) bytes of the string.
// That window may overlap bytes already examined; this is harmless because a
// match in the overlap would have been reported earlier.
//
// Condition registers are set well ahead of the branches that consume them
// (CR0 and CR1 for length tests, CR6 for vector compares). Do not reorder or
// insert CR-setting instructions without re-checking every consumer.
//
// Registers written: R3-R6, R8-R12, R16, R31, CR0, CR1, CR6, V0, V1, V2, V6.
TEXT indexbytebody<>(SB),NOSPLIT|NOFRAME,$0-0
	CMPU	R4,$32		// CR0 = len cmp 32. Used by "BLT cmp16" below and by the BEQ inside cmp32.

#ifndef GOPPC64_power9
	// Load VBPERMQ constant to reduce compare into an ordered bit mask.
	MOVD	$indexbytevbperm<>+00(SB),R16
	LXVD2X	(R16),V0	// V0 = VBPERMQ control vector, used at vfound
#endif

	MTVRD	R5,V1
	VSPLTB	$7,V1,V1	// Replicate byte across V1

	BLT	cmp16		// Jump to the small string case if it's <32 bytes.

	CMP	R4,$64,CR1
	MOVD	$16,R11		// R11 = 16, offset of the second vector in a chunk
	MOVD	R3,R8		// R8 = address of the chunk being examined
	BLT	CR1,cmp32	// Special case for length 32 - 63
	MOVD	$32,R12		// R12 = 32, offset of the third vector
	MOVD	$48,R6		// R6 = 48, offset of the fourth vector

	RLDICR	$0,R4,$63-6,R9	// R9 = len &^ 63
	ADD	R3,R9,R9	// R9 = &s[len &^ 63]
	ANDCC	$63,R4		// (len &= 63) cmp 0. CR0 is consumed after loop64.

	PCALIGN	$16
loop64:
	LXVD2X	(R0)(R8),V2	// Scan 64 bytes at a time, starting at &s[0]
	VCMPEQUBCC	V2,V1,V6
	BNE	CR6,foundat0	// Match found at R8, jump out

	LXVD2X	(R11)(R8),V2
	VCMPEQUBCC	V2,V1,V6
	BNE	CR6,foundat1	// Match found at R8+16 bytes, jump out

	LXVD2X	(R12)(R8),V2
	VCMPEQUBCC	V2,V1,V6
	BNE	CR6,foundat2	// Match found at R8+32 bytes, jump out

	LXVD2X	(R6)(R8),V2
	VCMPEQUBCC	V2,V1,V6
	BNE	CR6,foundat3	// Match found at R8+48 bytes, jump out

	ADD	$64,R8
	CMPU	R8,R9,CR1	// Uses CR1 so that CR0 from the ANDCC above survives the loop.
	BNE	CR1,loop64	// R8 != &s[len &^ 63]?

	PCALIGN	$32
	BEQ	notfound	// Is tail length 0? CR0 is set before entering loop64.

	// R4 = tail length (1 - 63), R8 = address of the first tail byte.
	CMP	R4,$32		// Tail length >= 32, use cmp32 path. cmp32 also reuses this CR0 result.
	CMP	R4,$16,CR1
	BGE	cmp32

	ADD	R8,R4,R9
	ADD	$-16,R9		// R9 = address of the final 16 bytes of the string
	BLE	CR1,cmp64_tail_gt0

cmp64_tail_gt16:	// Tail length 17 - 31
	LXVD2X	(R0)(R8),V2
	VCMPEQUBCC	V2,V1,V6
	BNE	CR6,foundat0

cmp64_tail_gt0:	// Tail length 1 - 16 (or the remainder of a 17 - 31 byte tail)
	MOVD	R9,R8		// foundat0 computes the index from R8.
	LXVD2X	(R0)(R9),V2	// Final 16 bytes; may overlap bytes already checked.
	VCMPEQUBCC	V2,V1,V6
	BNE	CR6,foundat0

	BR	notfound

cmp32:	// Length 32 - 63, or a 32 - 63 byte tail after loop64.
	// Expects R8 = start address, R4 = length, R11 = 16, and CR0 = length cmp 32.

	// Bytes 0 - 15
	LXVD2X	(R0)(R8),V2
	VCMPEQUBCC	V2,V1,V6
	BNE	CR6,foundat0

	// Bytes 16 - 31
	LXVD2X	(R8)(R11),V2
	VCMPEQUBCC	V2,V1,V6
	BNE	CR6,foundat1		// Match found at R8+16 bytes, jump out

	BEQ	notfound		// Is length == 32? (CR0 holds this comparison on entry to cmp32)
	CMP	R4,$48			// CR0 = length cmp 48, used by ISEL and BLE below.

	ADD	R4,R8,R9		// Compute &s[len(s)-16]
	ADD	$32,R8,R8
	ADD	$-16,R9,R9
	ISEL	CR0GT,R8,R9,R8		// R8 = len(s) <= 48 ? R9 : R8

	// Bytes 32 - 47, or the final 16 bytes when length <= 48.
	LXVD2X	(R0)(R8),V2
	VCMPEQUBCC	V2,V1,V6
	BNE	CR6,foundat0		// Match found in the 16 bytes at R8, jump out

	BLE	notfound		// Length <= 48: the load above already covered the end.

	// Final 16 bytes (covers bytes 48 and up; may overlap bytes 32 - 47).
	MOVD	R9,R8			// R9 holds the final check.
	LXVD2X	(R0)(R9),V2
	VCMPEQUBCC	V2,V1,V6
	BNE	CR6,foundat0		// Match found in the 16 bytes at R8, jump out

	BR	notfound

// If ISA 3.0 instructions are unavailable, we need to account for the extra 16 added by CNTLZW.
#ifndef GOPPC64_power9
#define ADJUST_FOR_CNTLZW -16
#else
#define ADJUST_FOR_CNTLZW 0
#endif

// Now, find the index of the 16B vector the match was discovered in. If CNTLZW is used
// to determine the offset into the 16B vector, it will overcount by 16. Account for it here.
// On entry to foundatN: R3 = start of the string, R8 = address of the current chunk,
// V6 = compare result for the vector at R8+16*N.
foundat3:
	SUB	R3,R8,R3		// R3 = R8 - R3, the chunk's offset into the string
	ADD	$48+ADJUST_FOR_CNTLZW,R3
	BR	vfound
foundat2:
	SUB	R3,R8,R3
	ADD	$32+ADJUST_FOR_CNTLZW,R3
	BR	vfound
foundat1:
	SUB	R3,R8,R3
	ADD	$16+ADJUST_FOR_CNTLZW,R3
	BR	vfound
foundat0:
	SUB	R3,R8,R3
	ADD	$0+ADJUST_FOR_CNTLZW,R3
vfound:
	// Map equal values into a 16 bit value with earlier matches setting higher bits.
#ifndef GOPPC64_power9
	VBPERMQ	V6,V0,V6
	MFVRD	V6,R4
	CNTLZW	R4,R4		// 16 + byte offset of the first match; ADJUST_FOR_CNTLZW removed the 16.
#else
#ifdef GOARCH_ppc64le
	// Put the value back into LE ordering by swapping doublewords.
	XXPERMDI	V6,V6,$2,V6
#endif
	_VCZBEBB	V6,R4		// R4 = byte offset of the first match within the vector
#endif
	ADD	R3,R4,R3		// R3 = vector offset + byte offset = index of the match
	RET

cmp16:	// Length 0 - 31 arrives here; 16 - 31 is handled below, shorter goes to cmp8.
	CMPU	R4,$16			// CR0 = len cmp 16, used by "BLT cmp8" and "BEQ notfound".
	ADD	R4,R3,R9		// R9 = &s[len(s)]; the cmp8 paths rely on this too.
	BLT	cmp8

	ADD	$-16,R9,R9		// &s[len(s)-16]

	// Bytes 0 - 15
	LXVD2X	(R0)(R3),V2
	VCMPEQUBCC	V2,V1,V6
	MOVD	R3,R8
	BNE	CR6,foundat0		// Match found in bytes 0 - 15, jump out

	BEQ	notfound		// Length is exactly 16: nothing left to check.

	// Final 16 bytes (overlaps bytes 0 - 15 when length < 32).
	MOVD	R9,R8			// R9 holds the final check.
	LXVD2X	(R0)(R9),V2
	VCMPEQUBCC	V2,V1,V6
	BNE	CR6,foundat0		// Match found in the final 16 bytes, jump out

	BR	notfound


cmp8:	// Length 0 - 15
#ifdef GOPPC64_power10
	// Load all the bytes into a single VSR in BE order.
	SLD	$56,R4,R5		// LXVLL takes the byte count from the top byte of R5.
	LXVLL	R3,R5,V2
	// Compare and count the number which don't match.
	VCMPEQUB	V2,V1,V6
	VCLZLSBB	V6,R3
	// If count is the number of bytes, or more. No matches are found.
	CMPU	R3,R4
	MOVD	$-1,R5
	// Otherwise, the count is the index of the first match.
	ISEL	CR0LT,R3,R5,R3
	RET
#else
	RLDIMI	$8,R5,$48,R5	// Replicating the byte across the register.
	RLDIMI	$16,R5,$32,R5
	RLDIMI	$32,R5,$0,R5
	CMPU	R4,$8
	BLT	cmp4

	// Length 8 - 15: check the first 8 and the last 8 bytes (they may overlap).
	MOVD	$-8,R11
	ADD	$-8,R4,R4		// R4 = len - 8, string offset of the last 8 bytes

	_LDBEX	(R0)(R3),R10		// R10 = first 8 bytes
	_LDBEX	(R11)(R9),R11		// R11 = last 8 bytes, loaded from &s[len] - 8
	CMPB	R10,R5,R10		// 0xFF in every byte position that matches
	CMPB	R11,R5,R11
	CMPU	R10,$0			// CR0: any match in the first 8 bytes?
	CMPU	R11,$0,CR1		// CR1: any match in the last 8 bytes?
	CNTLZD	R10,R10
	CNTLZD	R11,R11
	SRD	$3,R10,R3		// Leading zero bits / 8 = byte index of the first match
	SRD	$3,R11,R11
	BNE	found			// Match in the first 8 bytes; R3 already holds its index.

	ADD	R4,R11,R4		// Index of a match in the last 8 bytes
	MOVD	$-1,R3
	ISEL	CR1EQ,R3,R4,R3		// R3 = no match in the last 8 bytes ? -1 : R4
	RET

cmp4:	// Length 0 - 7 arrives here; 4 - 7 is handled below, shorter goes to cmp2.
	CMPU	R4,$4
	BLT	cmp2
	MOVD	$-4,R11
	ADD	$-4,R4,R4		// R4 = len - 4, string offset of the last 4 bytes

	_LWBEX	(R0)(R3),R10		// R10 = first 4 bytes
	_LWBEX	(R11)(R9),R11		// R11 = last 4 bytes, loaded from &s[len] - 4
	CMPB	R10,R5,R10
	CMPB	R11,R5,R11
	CNTLZW	R10,R10
	CNTLZW	R11,R11
	CMPU	R10,$32			// A count of 32 means no byte of the word matched.
	CMPU	R11,$32,CR1
	SRD	$3,R10,R3
	SRD	$3,R11,R11
	BNE	found			// Match in the first 4 bytes; R3 already holds its index.

	ADD	R4,R11,R4		// Index of a match in the last 4 bytes
	MOVD	$-1,R3
	ISEL	CR1EQ,R3,R4,R3		// R3 = no match in the last 4 bytes ? -1 : R4
	RET

cmp2:	// Length 0 - 3 arrives here; the first 2 bytes of a 2 - 3 byte string are checked below.
	CMPU	R4,$2
	BLT	cmp1

	_LHBEX	(R0)(R3),R10
	CMPB	R10,R5,R10
	SLDCC	$48,R10,R10		// Keep only the 2 loaded bytes; CR0 != 0 if either matched.
	CNTLZD	R10,R10
	SRD	$3,R10,R3
	BNE	found

cmp1:	// Length 0 or 1, or the last byte of a length 3 string; length 2 exits with -1.
	MOVD	$-1,R3
	ANDCC	$1,R4,R31		// Even length: no byte left to check. R31 is a scratch destination.
	BEQ	found

	MOVBZ	-1(R9),R10		// Last byte of the string (R9 = &s[len])
	CMPB	R10,R5,R10
	ANDCC	$1,R10			// CR0 EQ if the last byte does not match
	ADD	$-1,R4			// R4 = len - 1, index of the last byte
	ISEL	CR0EQ,R3,R4,R3		// R3 = no match ? -1 : len - 1

found:
	RET
#endif

notfound:
	MOVD	$-1,R3
	RET
