1// Copyright 2018 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5#include "textflag.h"
6
7TEXT ·IndexByte(SB),NOSPLIT,$0-40
8	MOVD	b_base+0(FP), R0
9	MOVD	b_len+8(FP), R2
10	MOVBU	c+24(FP), R1
11	MOVD	$ret+32(FP), R8
12	B	indexbytebody<>(SB)
13
14TEXT ·IndexByteString(SB),NOSPLIT,$0-32
15	MOVD	s_base+0(FP), R0
16	MOVD	s_len+8(FP), R2
17	MOVBU	c+16(FP), R1
18	MOVD	$ret+24(FP), R8
19	B	indexbytebody<>(SB)
20
21// input:
22//   R0: data
23//   R1: byte to search
24//   R2: data len
25//   R8: address to put result
26TEXT indexbytebody<>(SB),NOSPLIT,$0
27	// Core algorithm:
28	// For each 32-byte chunk we calculate a 64-bit syndrome value,
29	// with two bits per byte. For each tuple, bit 0 is set if the
30	// relevant byte matched the requested character and bit 1 is
31	// not used (faster than using a 32bit syndrome). Since the bits
32	// in the syndrome reflect exactly the order in which things occur
33	// in the original string, counting trailing zeros allows to
34	// identify exactly which byte has matched.
35
36	CBZ	R2, fail
37	MOVD	R0, R11
38	// Magic constant 0x40100401 allows us to identify
39	// which lane matches the requested byte.
40	// 0x40100401 = ((1<<0) + (4<<8) + (16<<16) + (64<<24))
41	// Different bytes have different bit masks (i.e: 1, 4, 16, 64)
42	MOVD	$0x40100401, R5
43	VMOV	R1, V0.B16
44	// Work with aligned 32-byte chunks
45	BIC	$0x1f, R0, R3
46	VMOV	R5, V5.S4
47	ANDS	$0x1f, R0, R9
48	AND	$0x1f, R2, R10
49	BEQ	loop
50
51	// Input string is not 32-byte aligned. We calculate the
52	// syndrome value for the aligned 32 bytes block containing
53	// the first bytes and mask off the irrelevant part.
54	VLD1.P	(R3), [V1.B16, V2.B16]
55	SUB	$0x20, R9, R4
56	ADDS	R4, R2, R2
57	VCMEQ	V0.B16, V1.B16, V3.B16
58	VCMEQ	V0.B16, V2.B16, V4.B16
59	VAND	V5.B16, V3.B16, V3.B16
60	VAND	V5.B16, V4.B16, V4.B16
61	VADDP	V4.B16, V3.B16, V6.B16 // 256->128
62	VADDP	V6.B16, V6.B16, V6.B16 // 128->64
63	VMOV	V6.D[0], R6
64	// Clear the irrelevant lower bits
65	LSL	$1, R9, R4
66	LSR	R4, R6, R6
67	LSL	R4, R6, R6
68	// The first block can also be the last
69	BLS	masklast
70	// Have we found something already?
71	CBNZ	R6, tail
72
73loop:
74	VLD1.P	(R3), [V1.B16, V2.B16]
75	SUBS	$0x20, R2, R2
76	VCMEQ	V0.B16, V1.B16, V3.B16
77	VCMEQ	V0.B16, V2.B16, V4.B16
78	// If we're out of data we finish regardless of the result
79	BLS	end
80	// Use a fast check for the termination condition
81	VORR	V4.B16, V3.B16, V6.B16
82	VADDP	V6.D2, V6.D2, V6.D2
83	VMOV	V6.D[0], R6
84	// We're not out of data, loop if we haven't found the character
85	CBZ	R6, loop
86
87end:
88	// Termination condition found, let's calculate the syndrome value
89	VAND	V5.B16, V3.B16, V3.B16
90	VAND	V5.B16, V4.B16, V4.B16
91	VADDP	V4.B16, V3.B16, V6.B16
92	VADDP	V6.B16, V6.B16, V6.B16
93	VMOV	V6.D[0], R6
94	// Only do the clear for the last possible block with less than 32 bytes
95	// Condition flags come from SUBS in the loop
96	BHS	tail
97
98masklast:
99	// Clear the irrelevant upper bits
100	ADD	R9, R10, R4
101	AND	$0x1f, R4, R4
102	SUB	$0x20, R4, R4
103	NEG	R4<<1, R4
104	LSL	R4, R6, R6
105	LSR	R4, R6, R6
106
107tail:
108	// Check that we have found a character
109	CBZ	R6, fail
110	// Count the trailing zeros using bit reversing
111	RBIT	R6, R6
112	// Compensate the last post-increment
113	SUB	$0x20, R3, R3
114	// And count the leading zeros
115	CLZ	R6, R6
116	// R6 is twice the offset into the fragment
117	ADD	R6>>1, R3, R0
118	// Compute the offset result
119	SUB	R11, R0, R0
120	MOVD	R0, (R8)
121	RET
122
123fail:
124	MOVD	$-1, R0
125	MOVD	R0, (R8)
126	RET
127