// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

#include "go_asm.h"
#include "textflag.h"

// Count is the byte-slice entry point. It loads its arguments into the
// registers countbytebody<> expects and tail-branches to it.
//
// The Go declaration is not part of this file; the argument names, offsets
// and the 40-byte argument area used below imply (b []byte, c byte) int.
// NOTE(review): confirm against the Go stub.
//
// No frame is allocated ($0) and the routine is entered with a plain branch,
// so the RET in countbytebody<> returns to Count's caller.
TEXT ·Count(SB),NOSPLIT,$0-40
	MOVD	b_base+0(FP), R0	// R0 = pointer to the first byte of b
	MOVD	b_len+8(FP), R2		// R2 = number of bytes to scan
	MOVBU	c+24(FP), R1		// R1 = byte to look for, zero-extended
	MOVD	$ret+32(FP), R8		// R8 = address of the result slot (not its value)
	B	countbytebody<>(SB)

// CountString is the string entry point. It loads its arguments into the
// registers countbytebody<> expects and tail-branches to it.
//
// The Go declaration is not part of this file; the argument names, offsets
// and the 32-byte argument area used below imply (s string, c byte) int.
// NOTE(review): confirm against the Go stub.
//
// No frame is allocated ($0) and the routine is entered with a plain branch,
// so the RET in countbytebody<> returns to CountString's caller.
TEXT ·CountString(SB),NOSPLIT,$0-32
	MOVD	s_base+0(FP), R0	// R0 = pointer to the first byte of s
	MOVD	s_len+8(FP), R2		// R2 = number of bytes to scan
	MOVBU	c+16(FP), R1		// R1 = byte to look for, zero-extended
	MOVD	$ret+24(FP), R8		// R8 = address of the result slot (not its value)
	B	countbytebody<>(SB)

// countbytebody counts the bytes equal to R1 among the R2 bytes starting at
// R0 and stores the count as a 64-bit value at the address held in R8.
//
// input:
//   R0: data
//   R2: data len
//   R1: byte to find
//   R8: address to put result
//
// Strategy:
//   - len == 0: store 0 and return.
//   - len < 32: scalar byte loop only (tail).
//   - otherwise: scalar loop up to the next 32-byte boundary (head_loop),
//     vector loop over whole 32-byte chunks (chunk_loop), then the scalar
//     loop for whatever is left (tail).
//
// Registers written: R0, R2, R3, R5, R6, R9, R11, V0-V8 and the condition
// flags. R1 and R8 are only read.
TEXT countbytebody<>(SB),NOSPLIT,$0
	// R11 = count of byte to search
	MOVD	$0, R11
	// short path to handle 0-byte case
	CBZ	R2, done
	CMP	$0x20, R2
	// jump directly to tail if length < 32 (unsigned compare)
	BLO	tail
	// R9 = R0 & 31; only the flags are used. Zero means R0 is already
	// 32-byte aligned and the head loop can be skipped.
	ANDS	$0x1f, R0, R9
	BEQ	chunk
	// Work with not 32-byte aligned head
	// R3 = R0 rounded up to the next 32-byte boundary.
	BIC	$0x1f, R0, R3
	ADD	$0x20, R3
	PCALIGN $16
head_loop:
	// One byte per iteration until R0 reaches R3. At most 31 bytes are
	// consumed here and len >= 32 on this path, so R2 stays >= 1.
	MOVBU.P	1(R0), R5	// R5 = *R0; R0++
	CMP	R5, R1
	CINC	EQ, R11, R11	// R11++ when the byte matches
	SUB	$1, R2, R2
	CMP	R0, R3
	BNE	head_loop
	// Work with 32-byte aligned chunks
chunk:
	// R9 = remaining length rounded down to a multiple of 32.
	BIC	$0x1f, R2, R9
	// The first chunk can also be the last
	// (fewer than 32 bytes left: nothing for the vector loop to do).
	CBZ	R9, tail
	// R3 = end of 32-byte chunks
	ADD	R0, R9, R3
	// V5 = 0x01 in every byte lane; used to turn compare masks into 0/1.
	MOVD	$1, R5
	VMOV	R5, V5.B16
	// R2 = length of tail
	SUB	R9, R2, R2
	// Duplicate R1 (byte to search) to 16 1-byte elements of V0
	VMOV	R1, V0.B16
	// Clear the low 64-bit element of V7 and V8
	VEOR	V7.B8, V7.B8, V7.B8
	VEOR	V8.B8, V8.B8, V8.B8
	PCALIGN $16
	// Count the target byte in 32-byte chunk
chunk_loop:
	// Load 32 bytes into V1:V2 and advance R0 past them.
	VLD1.P	(R0), [V1.B16, V2.B16]
	// Set the flags for the BNE at the bottom of the loop. The vector
	// instructions in between do not modify the condition flags, so this
	// compare must not be separated from that branch by any flag-setting
	// instruction.
	CMP	R0, R3
	// Each lane becomes 0xFF where the data byte equals the target, else 0.
	VCMEQ	V0.B16, V1.B16, V3.B16
	VCMEQ	V0.B16, V2.B16, V4.B16
	// Clear the higher 7 bits
	// so every matching lane holds exactly 1.
	VAND	V5.B16, V3.B16, V3.B16
	VAND	V5.B16, V4.B16, V4.B16
	// Count lanes match the requested byte
	// Pairwise add folds the 32 0/1 lanes into 16 lanes (each 0..2), then
	// the across-lanes add leaves this chunk's match count (0..32) in V7.
	VADDP	V4.B16, V3.B16, V6.B16 // 32B->16B
	VUADDLV	V6.B16, V7
	// Accumulate the count in low 64-bit element of V8 when inside the loop
	VADD	V7, V8
	BNE	chunk_loop
	// Fold the vector total into the scalar counter.
	VMOV	V8.D[0], R6
	ADD	R6, R11, R11
	// Nothing left after the last whole chunk: skip the tail loop.
	CBZ	R2, done
tail:
	// Work with tail shorter than 32 bytes
	// The loop tests R2 only after decrementing it, so it must be entered
	// with R2 != 0; every path that reaches this label guarantees that.
	MOVBU.P	1(R0), R5	// R5 = *R0; R0++
	SUB	$1, R2, R2
	CMP	R5, R1
	CINC	EQ, R11, R11	// R11++ when the byte matches
	CBNZ	R2, tail
done:
	// Store the final count through the result pointer and return to the
	// caller of the entry point that branched here.
	MOVD	R11, (R8)
	RET
