1// Copyright 2018 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5#include "go_asm.h"
6#include "textflag.h"
7
// Index entry point for the slice form: it copies the haystack (a) and the
// separator (b) out of the argument frame into the registers that
// indexbody<> consumes and then tail-jumps there. indexbody<> stores the
// result through R11, so no value is returned from this stub directly.
// Frame slots used: a_base+0, a_len+8, b_base+24, b_len+32, ret+48.
TEXT ·Index(SB),NOSPLIT,$0-56
	MOVQ b_len+32(FP), AX	// AX  = length of the separator
	MOVQ b_base+24(FP), R8	// R8  = base pointer of the separator
	MOVQ a_len+8(FP), DX	// DX  = length of the haystack
	MOVQ a_base+0(FP), DI	// DI  = base pointer of the haystack (scan cursor)
	LEAQ ret+48(FP), R11	// R11 = address of the result slot
	MOVQ DI, R10		// R10 = untouched copy of the haystack base
	JMP  indexbody<>(SB)
16
// IndexString entry point for the string form: same register setup as the
// slice variant, but with the two-word (base, len) argument layout, after
// which control is handed to indexbody<> with a tail-jump. indexbody<>
// stores the result through R11.
// Frame slots used: a_base+0, a_len+8, b_base+16, b_len+24, ret+32.
TEXT ·IndexString(SB),NOSPLIT,$0-40
	MOVQ b_len+24(FP), AX	// AX  = length of the separator
	MOVQ b_base+16(FP), R8	// R8  = base pointer of the separator
	MOVQ a_len+8(FP), DX	// DX  = length of the haystack
	MOVQ a_base+0(FP), DI	// DI  = base pointer of the haystack (scan cursor)
	LEAQ ret+32(FP), R11	// R11 = address of the result slot
	MOVQ DI, R10		// R10 = untouched copy of the haystack base
	JMP  indexbody<>(SB)
25
// indexbody<> finds the first occurrence of the separator inside the
// haystack and stores its byte offset, or -1 when there is none, to (R11).
//
// Inputs:
//   AX:  length of string, that we are searching for (the separator)
//   DX:  length of string, in which we are searching (the haystack)
//   DI:  pointer to string, in which we are searching
//   R8:  pointer to string, that we are searching for
//   R10: pointer to the start of the haystack; it is subtracted from the
//        match address to produce the stored offset
//   R11: address, where to put return value
// Overwritten: DI, DX, R8, BX, CX, SI, R9, X0-X3 and, on the AVX2 paths, Y0-Y4.
// Note: We want len in DX and AX, because PCMPESTRI implicitly consumes them
//
// Every length class below follows the same scheme: load the first (and,
// for in-between lengths, the overlapping last) chunk of the separator,
// turn DX into the address one past the last valid start position, and
// slide DI forward one byte at a time until the chunks match.
TEXT indexbody<>(SB),NOSPLIT,$0
	// A separator longer than the haystack can never match.
	CMPQ AX, DX
	JA fail
	// PCMPESTRI reads 16 haystack bytes, so only try it when they exist.
	CMPQ DX, $16
	JAE sse42
no_sse42:
	// AX <= 2: compared as a 2-byte separator.
	// NOTE(review): presumably callers guarantee AX >= 2 - confirm in the Go wrapper.
	CMPQ AX, $2
	JA   _3_or_more
	MOVW (R8), R8
	LEAQ -1(DI)(DX*1), DX	// DX = end of haystack - 1 = one past last start
	PCALIGN $16
loop2:
	MOVW (DI), SI
	CMPW SI,R8
	JZ success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop2
	JMP fail
_3_or_more:
	// AX == 3: two overlapping 2-byte compares (bytes 0-1 and bytes 1-2).
	CMPQ AX, $3
	JA   _4_or_more
	MOVW 1(R8), BX
	MOVW (R8), R8
	LEAQ -2(DI)(DX*1), DX
loop3:
	MOVW (DI), SI
	CMPW SI,R8
	JZ   partial_success3
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop3
	JMP fail
partial_success3:
	// First two bytes matched; verify the trailing two.
	MOVW 1(DI), SI
	CMPW SI,BX
	JZ success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop3
	JMP fail
_4_or_more:
	// AX == 4: a single 4-byte compare per position.
	CMPQ AX, $4
	JA   _5_or_more
	MOVL (R8), R8
	LEAQ -3(DI)(DX*1), DX
loop4:
	MOVL (DI), SI
	CMPL SI,R8
	JZ   success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop4
	JMP fail
_5_or_more:
	// 5 <= AX <= 7: first 4 bytes in R8, last 4 bytes (overlapping) in BX.
	CMPQ AX, $7
	JA   _8_or_more
	LEAQ 1(DI)(DX*1), DX
	SUBQ AX, DX		// DX = haystack end + 1 - AX = one past last start
	MOVL -4(R8)(AX*1), BX
	MOVL (R8), R8
loop5to7:
	MOVL (DI), SI
	CMPL SI,R8
	JZ   partial_success5to7
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop5to7
	JMP fail
partial_success5to7:
	// Head matched; compare the last 4 bytes of the candidate window.
	MOVL -4(AX)(DI*1), SI
	CMPL SI,BX
	JZ success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop5to7
	JMP fail
_8_or_more:
	// AX == 8: a single 8-byte compare per position.
	CMPQ AX, $8
	JA   _9_or_more
	MOVQ (R8), R8
	LEAQ -7(DI)(DX*1), DX
loop8:
	MOVQ (DI), SI
	CMPQ SI,R8
	JZ   success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop8
	JMP fail
_9_or_more:
	// 9 <= AX <= 15: first 8 bytes in R8, last 8 bytes (overlapping) in BX.
	CMPQ AX, $15
	JA   _16_or_more
	LEAQ 1(DI)(DX*1), DX
	SUBQ AX, DX
	MOVQ -8(R8)(AX*1), BX
	MOVQ (R8), R8
loop9to15:
	MOVQ (DI), SI
	CMPQ SI,R8
	JZ   partial_success9to15
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop9to15
	JMP fail
partial_success9to15:
	// Head matched; compare the last 8 bytes of the candidate window.
	MOVQ -8(AX)(DI*1), SI
	CMPQ SI,BX
	JZ success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop9to15
	JMP fail
_16_or_more:
	// AX == 16: one 16-byte vector compare; a mask of 0xffff means all bytes equal.
	CMPQ AX, $16
	JA   _17_or_more
	MOVOU (R8), X1
	LEAQ -15(DI)(DX*1), DX
loop16:
	MOVOU (DI), X2
	PCMPEQB X1, X2
	PMOVMSKB X2, SI
	CMPQ  SI, $0xffff
	JE   success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop16
	JMP fail
_17_or_more:
	// 17 <= AX <= 31: first 16 bytes in X1, last 16 bytes (overlapping) in X0.
	CMPQ AX, $31
	JA   _32_or_more
	LEAQ 1(DI)(DX*1), DX
	SUBQ AX, DX
	MOVOU -16(R8)(AX*1), X0
	MOVOU (R8), X1
loop17to31:
	MOVOU (DI), X2
	PCMPEQB X1,X2
	PMOVMSKB X2, SI
	CMPQ  SI, $0xffff
	JE   partial_success17to31
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop17to31
	JMP fail
partial_success17to31:
	// Head matched; compare the last 16 bytes of the candidate window.
	MOVOU -16(AX)(DI*1), X3
	PCMPEQB X0, X3
	PMOVMSKB X3, SI
	CMPQ  SI, $0xffff
	JE success
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop17to31
	JMP fail
// We can get here only when AVX2 is enabled and cutoff for indexShortStr is set to 63
// So no need to check cpuid
_32_or_more:
	// AX == 32: one 32-byte vector compare; all 32 mask bits set means equal.
	CMPQ AX, $32
	JA   _33_to_63
	VMOVDQU (R8), Y1
	LEAQ -31(DI)(DX*1), DX
loop32:
	VMOVDQU (DI), Y2
	VPCMPEQB Y1, Y2, Y3
	VPMOVMSKB Y3, SI
	CMPL  SI, $0xffffffff
	JE   success_avx2
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop32
	JMP fail_avx2
_33_to_63:
	// AX > 32: first 32 bytes in Y1, last 32 bytes (overlapping) in Y0.
	LEAQ 1(DI)(DX*1), DX
	SUBQ AX, DX
	VMOVDQU -32(R8)(AX*1), Y0
	VMOVDQU (R8), Y1
loop33to63:
	VMOVDQU (DI), Y2
	VPCMPEQB Y1, Y2, Y3
	VPMOVMSKB Y3, SI
	CMPL  SI, $0xffffffff
	JE   partial_success33to63
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop33to63
	JMP fail_avx2
partial_success33to63:
	// Head matched; compare the last 32 bytes of the candidate window.
	VMOVDQU -32(AX)(DI*1), Y3
	VPCMPEQB Y0, Y3, Y4
	VPMOVMSKB Y4, SI
	CMPL  SI, $0xffffffff
	JE success_avx2
	ADDQ $1,DI
	CMPQ DI,DX
	JB loop33to63
	// Falls through to fail_avx2 when the haystack is exhausted.
fail_avx2:
	// Clear the upper YMM halves before leaving the AVX2 paths.
	VZEROUPPER
fail:
	MOVQ $-1, (R11)
	RET
success_avx2:
	VZEROUPPER
	JMP success
sse42:
#ifndef hasSSE42
	// Runtime feature check. The package path separator in a Go assembly
	// symbol is U+2215 (∕), so this refers to internal/cpu.X86.
	CMPB internal∕cpu·X86+const_offsetX86HasSSE42(SB), $1
	JNE no_sse42
#endif
	CMPQ AX, $12
	// PCMPESTRI is slower than normal compare,
	// so using it makes sense only if we advance 4+ bytes per compare
	// This value was determined experimentally and is the ~same
	// on Nehalem (first with SSE42) and Haswell.
	JAE _9_or_more
	// The separator is shorter than 12 bytes here, yet MOVOU below loads 16
	// bytes from R8. Bits 4-11 of R8+16 are all zero exactly when R8 lies in
	// the last 16 bytes of a 4KiB-aligned region, where that over-read could
	// run into the next page; use the scalar paths in that case.
	LEAQ 16(R8), SI
	TESTW $0xff0, SI
	JEQ no_sse42
	MOVOU (R8), X1
	LEAQ -15(DI)(DX*1), SI	// SI = one past the last address with 16 readable bytes
	MOVQ $16, R9
	SUBQ AX, R9 // We advance by 16-len(sep) each iteration, so precalculate it into R9
	PCALIGN $16
loop_sse42:
	// 0x0c means: unsigned byte compare (bits 0,1 are 00)
	// for equality (bits 2,3 are 11)
	// result is not masked or inverted (bits 4,5 are 00)
	// and corresponds to first matching byte (bit 6 is 0)
	PCMPESTRI $0x0c, (DI), X1
	// CX == 16 means no match,
	// CX > R9 means partial match at the end of the 16-byte window,
	// otherwise sep is at offset CX from DI
	CMPQ CX, R9
	JBE sse42_success
	ADDQ R9, DI
	CMPQ DI, SI
	JB loop_sse42
	// Final probe over the last 16 bytes of the haystack (-1(SI) = end - 16).
	PCMPESTRI $0x0c, -1(SI), X1
	CMPQ CX, R9
	JA fail
	LEAQ -1(SI), DI
sse42_success:
	ADDQ CX, DI		// DI = address of the match
success:
	// Convert the match address into an offset from the haystack start.
	SUBQ R10, DI
	MOVQ DI, (R11)
	RET
279