1// Copyright 2018 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5//go:build !plan9
6
7#include "go_asm.h"
8#include "textflag.h"
9
// ·IndexByte is the entry point for searching a byte slice.
// Frame layout (from the operand names/offsets below): b_base+0, b_len+8,
// c+24, ret+32 — presumably func IndexByte(b []byte, c byte) int.
//
// It loads the registers indexbytebody<> expects and tail-jumps to it;
// indexbytebody<> stores the result through R8 and returns to our caller.
TEXT	·IndexByte(SB), NOSPLIT, $0-40
	LEAQ	ret+32(FP), R8		// R8 = address of the result slot
	MOVB	c+24(FP), AL		// AL = byte sought
	MOVQ	b_len+8(FP), BX		// BX = number of bytes to search
	MOVQ	b_base+0(FP), SI	// SI = start of data
	JMP	indexbytebody<>(SB)
16
// ·IndexByteString is the entry point for searching a string.
// Frame layout (from the operand names/offsets below): s_base+0, s_len+8,
// c+16, ret+24 — presumably func IndexByteString(s string, c byte) int.
//
// It loads the registers indexbytebody<> expects and tail-jumps to it;
// indexbytebody<> stores the result through R8 and returns to our caller.
TEXT	·IndexByteString(SB), NOSPLIT, $0-32
	LEAQ	ret+24(FP), R8		// R8 = address of the result slot
	MOVB	c+16(FP), AL		// AL = byte sought
	MOVQ	s_len+8(FP), BX		// BX = number of bytes to search
	MOVQ	s_base+0(FP), SI	// SI = start of data
	JMP	indexbytebody<>(SB)
23
// indexbytebody<> finds the first occurrence of a byte in a buffer and
// stores its index (or -1 if absent) as a 64-bit value at (R8).
//
// input:
//   SI: data
//   BX: data len
//   AL: byte sought
//   R8: address to put result
//
// Registers written in this routine: AX, CX, DX, DI, R11, X0, X1, Y1-Y3
// and the flags.
//
// Strategy by length:
//   len <  16: single 16-byte load, arranged so it does not run onto the
//              next page (small / endofpage).
//   len <= 32: 16-byte SSE chunks, with a final chunk that overlaps (sse).
//   len >  32: 32-byte AVX2 chunks when available, otherwise the SSE loop.
TEXT	indexbytebody<>(SB), NOSPLIT, $0
	// Shuffle X0 around so that each byte contains
	// the character we're looking for.
	MOVD AX, X0
	PUNPCKLBW X0, X0
	PUNPCKLBW X0, X0
	PSHUFL $0, X0, X0

	CMPQ BX, $16
	JLT small

	// DI is the cursor (address of the chunk currently being examined).
	MOVQ SI, DI

	CMPQ BX, $32
	JA avx2
sse:
	LEAQ	-16(SI)(BX*1), AX	// AX = address of last 16 bytes
	JMP	sseloopentry

	PCALIGN $16
sseloop:
	// Move the next 16-byte chunk of the data into X1.
	MOVOU	(DI), X1
	// Compare bytes in X0 to X1.
	PCMPEQB	X0, X1
	// Take the top bit of each byte in X1 and put the result in DX.
	PMOVMSKB X1, DX
	// Find first set bit, if any.
	BSFL	DX, DX
	JNZ	ssesuccess
	// Advance to next block.
	ADDQ	$16, DI
sseloopentry:
	// Keep looping while a full chunk starting at DI ends before the
	// final chunk starts (unsigned address comparison).
	CMPQ	DI, AX
	JB	sseloop

	// Search the last 16-byte chunk. This chunk may overlap with the
	// chunks we've already searched, but that's ok.
	MOVQ	AX, DI
	MOVOU	(AX), X1
	PCMPEQB	X0, X1
	PMOVMSKB X1, DX
	BSFL	DX, DX
	JNZ	ssesuccess

// Byte not present: store -1 as the result.
failure:
	MOVQ $-1, (R8)
	RET

// We've found a chunk containing the byte.
// The chunk was loaded from DI.
// The index of the matching byte in the chunk is DX.
// The start of the data is SI.
ssesuccess:
	SUBQ SI, DI	// Compute offset of chunk within data.
	ADDQ DX, DI	// Add offset of byte within chunk.
	MOVQ DI, (R8)
	RET

// handle for lengths < 16
small:
	TESTQ	BX, BX
	JEQ	failure	// Empty input never matches.

	// Check if we'll load across a page boundary.
	// If bits 4-11 of SI+16 are all zero, SI lies in the last 16 bytes
	// of a 4 KiB page, so a 16-byte load starting at SI could touch the
	// following page. In that case load the 16 bytes that END at the end
	// of the data instead (endofpage).
	LEAQ	16(SI), AX
	TESTW	$0xff0, AX
	JEQ	endofpage

	MOVOU	(SI), X1 // Load data
	PCMPEQB	X0, X1	// Compare target byte with each byte in data.
	PMOVMSKB X1, DX	// Move result bits to integer register.
	BSFL	DX, DX	// Find first set bit.
	JZ	failure	// No set bit, failure.
	// The load read 16 bytes but only the first BX are data; a hit at
	// index >= BX is in the bytes past the end and must be ignored.
	CMPL	DX, BX
	JAE	failure	// Match is past end of data.
	MOVQ	DX, (R8)
	RET

endofpage:
	MOVOU	-16(SI)(BX*1), X1	// Load data into the high end of X1.
	PCMPEQB	X0, X1	// Compare target byte with each byte in data.
	PMOVMSKB X1, DX	// Move result bits to integer register.
	// The data occupies mask bits 16-BX .. 15. Shifting left by BX moves
	// them to bits 16 .. 16+BX-1 (pushing the bits that belong to bytes
	// before the data below bit 16); the right shift by 16 then drops
	// those and leaves data byte i at bit i.
	MOVL	BX, CX
	SHLL	CX, DX
	SHRL	$16, DX	// Shift desired bits down to bottom of register.
	BSFL	DX, DX	// Find first set bit.
	JZ	failure	// No set bit, failure.
	MOVQ	DX, (R8)
	RET

// len > 32: use 32-byte AVX2 chunks if the CPU supports them.
avx2:
	// When hasAVX2 is defined the runtime check is compiled out
	// (presumably the macro is set when the build target guarantees
	// AVX2 — it is not defined in this file). Otherwise consult the
	// internal/cpu feature flag and fall back to the SSE loop; X0, SI, DI
	// and BX are still exactly as the sse path expects at this point.
#ifndef hasAVX2
	CMPB   internal∕cpu·X86+const_offsetX86HasAVX2(SB), $1
	JNE sse
#endif
	// AL still holds the byte sought (AX has not been overwritten on this
	// path). VPBROADCASTB replicates the low byte of X0 into all 32 bytes
	// of Y1.
	MOVD AX, X0
	LEAQ -32(SI)(BX*1), R11	// R11 = address of last 32 bytes
	VPBROADCASTB  X0, Y1

	PCALIGN $32
avx2_loop:
	VMOVDQU (DI), Y2	// Load next 32-byte chunk.
	VPCMPEQB Y1, Y2, Y3	// Y3 byte = 0xff where the chunk byte matches.
	VPTEST Y3, Y3		// ZF is clear iff any byte matched.
	JNZ avx2success
	ADDQ $32, DI
	// Addresses are compared unsigned, matching the SSE loop above.
	CMPQ DI, R11
	JB avx2_loop
	// Search the last 32-byte chunk; it may overlap chunks already
	// searched, which is harmless.
	MOVQ R11, DI
	VMOVDQU (DI), Y2
	VPCMPEQB Y1, Y2, Y3
	VPTEST Y3, Y3
	JNZ avx2success
	VZEROUPPER	// Clear upper YMM halves before returning.
	MOVQ $-1, (R8)
	RET

// A 32-byte chunk loaded from DI contains the byte; Y3 holds the
// per-byte match mask.
avx2success:
	VPMOVMSKB Y3, DX	// One mask bit per byte of the chunk.
	BSFL DX, DX		// DX = index of first match within the chunk.
	SUBQ SI, DI		// DI = offset of the chunk within the data.
	ADDQ DI, DX		// DX = index of the match within the data.
	MOVQ DX, (R8)
	VZEROUPPER	// Clear upper YMM halves before returning.
	RET
155