1// Copyright 2018 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5//go:build ppc64 || ppc64le
6
7#include "go_asm.h"
8#include "textflag.h"
9
// Helper names for x-form loads in BE ordering.
// On little-endian targets the byte-reversing loads (MOVDBR/MOVWBR/MOVHBR)
// are used so that loaded values compare as if read in big-endian (memory)
// order; on big-endian targets the plain loads already have that property.
#ifdef  GOARCH_ppc64le
#define _LDBEX	MOVDBR
#define _LWBEX	MOVWBR
#define _LHBEX	MOVHBR
#else
#define _LDBEX	MOVD
#define _LWBEX	MOVW
#define _LHBEX	MOVH
#endif
20
// SETB turns a CR field comparison into -1/0/1 (LT/EQ/GT) in a GPR.
// POWER9 has the instruction; on older targets it is emulated with ISEL
// using constants preloaded by SETB_INIT().
#ifdef GOPPC64_power9
#define SETB_CR0(rout) SETB CR0, rout
#define SETB_CR1(rout) SETB CR1, rout
#define SETB_INIT()
#define SETB_CR0_NE(rout) SETB_CR0(rout)
#else
// A helper macro to emulate SETB on P8. This assumes
// -1 is in R20, and 1 is in R21. crxlt and crxeq must
// also be the same CR field.
#define _SETB(crxlt, crxeq, rout) \
	ISEL	crxeq,R0,R21,rout \
	ISEL	crxlt,R20,rout,rout

// A special case when it is known the comparison
// will always be not equal. The result must be -1 or 1.
#define SETB_CR0_NE(rout) \
	ISEL	CR0LT,R20,R21,rout

#define SETB_CR0(rout) _SETB(CR0LT, CR0EQ, rout)
#define SETB_CR1(rout) _SETB(CR1LT, CR1EQ, rout)
#define SETB_INIT() \
	MOVD	$-1,R20 \
	MOVD	$1,R21
#endif
45
TEXT ·Compare<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-56
	// incoming:
	// R3 a addr
	// R4 a len
	// R6 b addr
	// R7 b len
	//
	// on entry to cmpbody:
	// R3 return value if len(a) == len(b)
	// R5 a addr
	// R6 b addr
	// R9 min(len(a),len(b))
	SETB_INIT()		// no-op on P9; loads -1/1 into R20/R21 on P8
	MOVD	R3,R5
	CMP	R4,R7,CR0	// CR0 = compare lengths
	CMP	R3,R6,CR7	// CR7 = compare addresses
	ISEL	CR0LT,R4,R7,R9	// R9 = min(len(a),len(b))
	SETB_CR0(R3)		// R3 = -1/0/1 from the length comparison
	BC	$12,30,LR	// beqlr cr7: same backing address, result
				// is decided by the lengths alone.
	BR	cmpbody<>(SB)
66
TEXT runtime·cmpstring<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
	// incoming:
	// R3 a addr -> R5
	// R4 a len  -> R3
	// R5 b addr -> R6
	// R6 b len  -> R4
	//
	// on entry to cmpbody:
	// R3 compare value if compared length is same.
	// R5 a addr
	// R6 b addr
	// R9 min(len(a),len(b))
	SETB_INIT()		// no-op on P9; loads -1/1 into R20/R21 on P8
	CMP	R4,R6,CR0	// CR0 = compare lengths
	CMP	R3,R5,CR7	// CR7 = compare addresses
	ISEL	CR0LT,R4,R6,R9	// R9 = min(len(a),len(b))
	MOVD	R5,R6		// shuffle registers into cmpbody's layout
	MOVD	R3,R5
	SETB_CR0(R3)		// R3 = -1/0/1 from the length comparison
	BC	$12,30,LR	// beqlr cr7: same backing address, result
				// is decided by the lengths alone.
	BR	cmpbody<>(SB)
88
// byteswap is a VPERM permute control vector used by cmpbody's "different"
// path on little-endian targets to reverse the 16 bytes of a vector, so
// the doublewords extracted afterwards compare in memory (big-endian) order.
#ifdef GOARCH_ppc64le
DATA byteswap<>+0(SB)/8, $0x0706050403020100
DATA byteswap<>+8(SB)/8, $0x0f0e0d0c0b0a0908
GLOBL byteswap<>+0(SB), RODATA, $16
#define SWAP V21
#endif
95
// cmpbody is the common tail of Compare and cmpstring.
//
// on entry:
// R3 value to return if all min(len(a),len(b)) bytes are equal
//    (precomputed from the length comparison by the callers)
// R5 a addr
// R6 b addr
// R9 min(len(a),len(b))
//
// Vector compares use VCMPEQUDCC, which records into CR6; the CR6 "all
// equal" state is tested with BGE CR6 to branch out on any mismatch.
// Note R0 reads as zero when used as an index register.
TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0-0
start:
	CMP	R9,$16,CR0
	CMP	R9,$32,CR1
	CMP	R9,$64,CR2
	MOVD	$16,R10
	BLT	cmp8
	BLT	CR1,cmp16
	BLT	CR2,cmp32

cmp64:	// >= 64B
	DCBT	(R5)		// optimize for size>=64
	DCBT	(R6)		// cache hint

	SRD	$6,R9,R14	// There is at least one iteration.
	MOVD	R14,CTR
	ANDCC   $63,R9,R9	// R9 = tail length (0-63)
	CMP	R9,$16,CR1	// Do setup for tail check early on.
	CMP	R9,$32,CR2
	CMP	R9,$48,CR3
	ADD	$-16,R9,R9	// offset of the final (overlapped) 16B load

	MOVD	$32,R11		// set offsets to load into vector
	MOVD	$48,R12		// set offsets to load into vector

	PCALIGN	$16
cmp64_loop:
	LXVD2X	(R5)(R0),V3	// load bytes of A at offset 0 into vector
	LXVD2X	(R6)(R0),V4	// load bytes of B at offset 0 into vector
	VCMPEQUDCC	V3,V4,V1
	BGE	CR6,different	// jump out if it's different

	LXVD2X	(R5)(R10),V3	// load bytes of A at offset 16 into vector
	LXVD2X	(R6)(R10),V4	// load bytes of B at offset 16 into vector
	VCMPEQUDCC	V3,V4,V1
	BGE	CR6,different

	LXVD2X	(R5)(R11),V3	// load bytes of A at offset 32 into vector
	LXVD2X	(R6)(R11),V4	// load bytes of B at offset 32 into vector
	VCMPEQUDCC	V3,V4,V1
	BGE	CR6,different

	LXVD2X	(R5)(R12),V3	// load bytes of A at offset 48 into vector
	LXVD2X	(R6)(R12),V4	// load bytes of B at offset 48 into vector
	VCMPEQUDCC	V3,V4,V1
	BGE	CR6,different

	ADD	$64,R5,R5	// increment to next 64 bytes of A
	ADD	$64,R6,R6	// increment to next 64 bytes of B
	BDNZ	cmp64_loop
	BC	$12,2,LR	// beqlr

	// Finish out tail with minimal overlapped checking.
	// Note, 0 tail is handled by beqlr above.
	BLE	CR1,cmp64_tail_gt0
	BLE	CR2,cmp64_tail_gt16
	BLE	CR3,cmp64_tail_gt32

cmp64_tail_gt48: // 49 - 63 B
	LXVD2X	(R0)(R5),V3
	LXVD2X	(R0)(R6),V4
	VCMPEQUDCC	V3,V4,V1
	BGE	CR6,different

	LXVD2X	(R5)(R10),V3
	LXVD2X	(R6)(R10),V4
	VCMPEQUDCC	V3,V4,V1
	BGE	CR6,different

	LXVD2X	(R5)(R11),V3
	LXVD2X	(R6)(R11),V4
	VCMPEQUDCC	V3,V4,V1
	BGE	CR6,different

	BR cmp64_tail_gt0

	PCALIGN $16
cmp64_tail_gt32: // 33 - 48B
	LXVD2X	(R0)(R5),V3
	LXVD2X	(R0)(R6),V4
	VCMPEQUDCC	V3,V4,V1
	BGE	CR6,different

	LXVD2X	(R5)(R10),V3
	LXVD2X	(R6)(R10),V4
	VCMPEQUDCC	V3,V4,V1
	BGE	CR6,different

	BR cmp64_tail_gt0

	PCALIGN $16
cmp64_tail_gt16: // 17 - 32B
	LXVD2X	(R0)(R5),V3
	LXVD2X	(R0)(R6),V4
	VCMPEQUDCC	V3,V4,V1
	BGE	CR6,different

	BR cmp64_tail_gt0

	PCALIGN $16
cmp64_tail_gt0: // 1 - 16B
	// Final 16B load at offset tail-16 (R9); may overlap bytes
	// already verified equal above, which is harmless.
	LXVD2X	(R5)(R9),V3
	LXVD2X	(R6)(R9),V4
	VCMPEQUDCC	V3,V4,V1
	BGE	CR6,different

	RET

	PCALIGN $16
cmp32:	// 32 - 63B
	ANDCC	$31,R9,R9	// R9 = remainder beyond the first 32B

	LXVD2X	(R0)(R5),V3
	LXVD2X	(R0)(R6),V4
	VCMPEQUDCC	V3,V4,V1
	BGE	CR6,different

	LXVD2X	(R10)(R5),V3
	LXVD2X	(R10)(R6),V4
	VCMPEQUDCC	V3,V4,V1
	BGE	CR6,different

	BC	$12,2,LR	// beqlr: no remainder, all 32B equal
	ADD	R9,R10,R10	// R10 = remainder + 16

	// Two overlapped loads cover the 1-31B remainder.
	LXVD2X	(R9)(R5),V3
	LXVD2X	(R9)(R6),V4
	VCMPEQUDCC	V3,V4,V1
	BGE	CR6,different

	LXVD2X	(R10)(R5),V3
	LXVD2X	(R10)(R6),V4
	VCMPEQUDCC	V3,V4,V1
	BGE	CR6,different
	RET

	PCALIGN $16
cmp16:	// 16 - 31B
	ANDCC	$15,R9,R9	// R9 = remainder beyond the first 16B
	LXVD2X	(R0)(R5),V3
	LXVD2X	(R0)(R6),V4
	VCMPEQUDCC	V3,V4,V1
	BGE	CR6,different
	BC	$12,2,LR	// beqlr: no remainder, all 16B equal

	// Overlapped load at offset R9 covers the 1-15B remainder.
	LXVD2X	(R9)(R5),V3
	LXVD2X	(R9)(R6),V4
	VCMPEQUDCC	V3,V4,V1
	BGE	CR6,different
	RET

	PCALIGN $16
different:
	// V3/V4 hold the first differing 16B chunks of A and B.
#ifdef	GOARCH_ppc64le
	MOVD	$byteswap<>+00(SB),R16
	LXVD2X	(R16)(R0),SWAP	// Set up swap string

	VPERM	V3,V3,SWAP,V3	// reverse bytes so doublewords compare
	VPERM	V4,V4,SWAP,V4	// in memory order
#endif

	MFVSRD	VS35,R16	// move upper doublewords of A and B into GPR for comparison
	MFVSRD	VS36,R10

	CMPU	R16,R10
	BEQ	lower
	SETB_CR0_NE(R3)		// result is known non-zero here
	RET

	PCALIGN $16
lower:
	VSLDOI	$8,V3,V3,V3	// move lower doublewords of A and B into GPR for comparison
	MFVSRD	VS35,R16
	VSLDOI	$8,V4,V4,V4
	MFVSRD	VS36,R10

	CMPU	R16,R10
	SETB_CR0_NE(R3)		// lower doublewords must differ to get here
	RET

	PCALIGN $16
cmp8:	// 8 - 15B (0 - 15B if GOPPC64_power10)
#ifdef GOPPC64_power10
	SLD	$56,R9,R9	// LXVLL takes the length in the high byte
	LXVLL	R5,R9,V3	// Load bytes starting from MSB to LSB, unused are zero filled.
	LXVLL	R6,R9,V4
	VCMPUQ	V3,V4,CR0	// Compare as a 128b integer.
	SETB_CR0(R6)
	ISEL	CR0EQ,R3,R6,R3	// If equal, length determines the return value.
	RET
#else
	CMP	R9,$8
	BLT	cmp4
	ANDCC	$7,R9,R9	// R9 = len - 8: offset of the last 8B
	// Compare the first 8B and the (overlapped) last 8B in BE order.
	_LDBEX	(R0)(R5),R10
	_LDBEX	(R0)(R6),R11
	_LDBEX	(R9)(R5),R12
	_LDBEX	(R9)(R6),R14
	CMPU	R10,R11,CR0
	SETB_CR0(R5)
	CMPU	R12,R14,CR1
	SETB_CR1(R6)
	CRAND   CR0EQ,CR1EQ,CR1EQ // If both equal, length determines return value.
	ISEL	CR0EQ,R6,R5,R4	// first-8B result wins over last-8B result
	ISEL	CR1EQ,R3,R4,R3
	RET

	PCALIGN	$16
cmp4:	// 4 - 7B
	CMP	R9,$4
	BLT	cmp2
	ANDCC	$3,R9,R9	// R9 = len - 4: offset of the last 4B
	// First and (overlapped) last 4B, packed into one 8B BE compare.
	_LWBEX	(R0)(R5),R10
	_LWBEX	(R0)(R6),R11
	_LWBEX	(R9)(R5),R12
	_LWBEX	(R9)(R6),R14
	RLDIMI	$32,R10,$0,R12	// R12 = first4(a):last4(a)
	RLDIMI	$32,R11,$0,R14	// R14 = first4(b):last4(b)
	CMPU	R12,R14
	BR	cmp0

	PCALIGN $16
cmp2:	// 2 - 3B
	CMP	R9,$2
	BLT	cmp1
	ANDCC	$1,R9,R9	// R9 = len - 2: offset of the last 2B
	_LHBEX	(R0)(R5),R10
	_LHBEX	(R0)(R6),R11
	_LHBEX	(R9)(R5),R12
	_LHBEX	(R9)(R6),R14
	RLDIMI	$32,R10,$0,R12	// R12 = first2(a):last2(a)
	RLDIMI	$32,R11,$0,R14	// R14 = first2(b):last2(b)
	CMPU	R12,R14
	BR	cmp0

	PCALIGN $16
cmp1:
	CMP	R9,$0
	BEQ	cmp0		// empty: fall through with CR0 EQ
	MOVBZ	(R5),R10
	MOVBZ	(R6),R11
	CMPU	R10,R11
cmp0:
	SETB_CR0(R6)
	ISEL	CR0EQ,R3,R6,R3	// If equal, length determines the return value.
	RET
#endif