// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build ppc64le || ppc64

#include "go_asm.h"
#include "textflag.h"

// ·Count is the register-ABI entry point for counting a byte in a byte
// array. It only prepares the byte to count in the two forms that
// countbytebody<> expects (scalar in R5, splatted in V1) and then branches
// there; countbytebody<> produces the result and returns to the caller.
// NOTE(review): the Go-side declaration is not in this file; the 40-byte
// argument area presumably corresponds to (slice, byte) -> int — confirm.
TEXT ·Count<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
	// R3 = byte array pointer
	// R4 = length
	// R6 = byte to count
	MTVRD	R6, V1		// move compare byte
	MOVD	R6, R5		// countbytebody<> takes the scalar copy of the byte in R5
	VSPLTB	$7, V1, V1	// replicate byte across V1
	BR	countbytebody<>(SB)

// ·CountString is the register-ABI entry point for counting a byte in a
// string. The byte to count already arrives in R5, which is where
// countbytebody<> wants the scalar copy, so only the splatted vector copy
// in V1 has to be built before branching to the shared body.
// NOTE(review): the Go-side declaration is not in this file; the 32-byte
// argument area presumably corresponds to (string, byte) -> int — confirm.
TEXT ·CountString<ABIInternal>(SB), NOSPLIT|NOFRAME, $0-32
	// R3 = string data pointer
	// R4 = length
	// R5 = byte to count
	MTVRD	R5, V1		// move compare byte
	VSPLTB	$7, V1, V1	// replicate byte across V1
	BR	countbytebody<>(SB)

// countbytebody<> counts how many bytes of a buffer equal a given byte.
// It is the shared body that ·Count and ·CountString branch to.
//
// R3: addr of string
// R4: len of string
// R5: byte to count
// V1: byte to count, splatted.
// On exit:
// R3: return value
//
// Registers written: R3, R4, R12, R14, R18, R20, R21, CTR, the condition
// register, V0, V2, V4, V5, plus R5 (non-POWER10 build) or R6 (POWER10
// build).
//
// Scaling convention: a matching byte is represented as 0xFF and matches
// are tallied with population counts, so the running total in R18 is 8x the
// real count. The POWER10 build converts R18 to a true count before its
// tail (its tail counts matches directly); the other build keeps the 8x
// scale through the tail and converts once at tail_0.
TEXT countbytebody<>(SB), NOSPLIT|NOFRAME, $0-0
	MOVD	$0, R18 // byte count

#ifndef GOPPC64_power10
	// The scalar tail compares 8 bytes at a time with CMPB, so it needs the
	// byte to count replicated into every byte of R5.
	RLDIMI	$8, R5, $48, R5
	RLDIMI	$16, R5, $32, R5
	RLDIMI	$32, R5, $0, R5	// fill reg with the byte to count
#endif

	CMPU	R4, $32		// Check if it's a small string (<32 bytes)
	BLT	tail		// Jump to the small string case
	SRD	$5, R4, R20	// R20 = number of whole 32-byte chunks
	MOVD	R20, CTR
	MOVD	$16, R21	// offset of the second 16-byte half of each chunk
	XXLXOR	V4, V4, V4	// zero both accumulators
	XXLXOR	V5, V5, V5

	PCALIGN	$16
cmploop:
	LXVD2X	(R0)(R3), V0	// Count 32B per loop with two vector accumulators.
	LXVD2X	(R21)(R3), V2
	VCMPEQUB V2, V1, V2
	VCMPEQUB V0, V1, V0
	VPOPCNTD V2, V2		// A match is 0xFF or 0. Count the bits into doubleword buckets.
	VPOPCNTD V0, V0
	VADDUDM	V0, V4, V4	// Accumulate the popcounts. They are 8x the count.
	VADDUDM	V2, V5, V5	// The count will be fixed up afterwards.
	ADD	$32, R3
	BDNZ	cmploop

	// Reduce the two accumulators (two doublewords each) to one scalar in R18.
	VADDUDM	V4, V5, V5
	MFVSRD	V5, R18
	VSLDOI	$8, V5, V5, V5	// bring the other doubleword into position for MFVSRD
	MFVSRD	V5, R21
	ADD	R21, R18, R18
	ANDCC	$31, R4, R4	// R4 = bytes left over after the 32-byte chunks
	// Skip the tail processing if no bytes remaining.
	BEQ	tail_0

#ifdef GOPPC64_power10
	SRD	$3, R18, R18	// Fix the vector loop count before counting the tail on P10.

tail:	// Count the last 0 - 31 bytes.
	CMP	R4, $16
	BLE	small_tail_p10	// 16 or fewer bytes: one length-controlled load is enough
	LXV	0(R3), V0
	VCMPEQUB V0, V1, V0
	VCNTMBB	V0, $1, R14	// Sum the value of bit 0 of each byte of the compare into R14.
	SRD	$56, R14, R14	// The result of VCNTMBB is shifted. Unshift it.
	ADD	R14, R18, R18
	ADD	$16, R3, R3
	ANDCC	$15, R4, R4	// 1 - 15 bytes remain after consuming a full 16

small_tail_p10:
	SLD	$56, R4, R6	// the length operand of LXVLL is held in the top byte of R6
	LXVLL	R3, R6, V0
	VCMPEQUB V0, V1, V0
	VCLRRB	V0, R4, V0	// If <16B being compared, clear matches of the 16-R4 bytes.
	VCNTMBB	V0, $1, R14	// Sum the value of bit 0 of each byte of the compare into R14.
	SRD	$56, R14, R14	// The result of VCNTMBB is shifted. Unshift it.
	ADD	R14, R18, R3	// R18 is already a true count here, so no 8x fixup
	RET

#else
tail:	// Count the last 0 - 31 bytes.
	CMP	R4, $16
	BLT	tail_8
	MOVD	(R3), R12
	MOVD	8(R3), R14
	CMPB	R12, R5, R12	// each matching byte becomes 0xFF, others 0
	CMPB	R14, R5, R14
	POPCNTD	R12, R12	// 8 bits per match, same 8x scale as the vector loop
	POPCNTD	R14, R14
	ADD	R12, R18, R18
	ADD	R14, R18, R18
	ADD	$16, R3, R3
	ADD	$-16, R4, R4

tail_8:	// Count the remaining 0 - 15 bytes.
	CMP	R4, $8
	BLT	tail_4
	MOVD	(R3), R12
	CMPB	R12, R5, R12
	POPCNTD	R12, R12
	ADD	R12, R18, R18
	ADD	$8, R3, R3
	ADD	$-8, R4, R4

tail_4:	// Count the remaining 0 - 7 bytes.
	CMP	R4, $4
	BLT	tail_2
	MOVWZ	(R3), R12
	CMPB	R12, R5, R12
	// The zero-extended upper 4 bytes would match when counting byte 0;
	// shift them out before counting.
	SLD	$32, R12, R12	// Remove non-participating matches.
	POPCNTD	R12, R12
	ADD	R12, R18, R18
	ADD	$4, R3, R3
	ADD	$-4, R4, R4

tail_2:	// Count the remaining 0 - 3 bytes.
	CMP	R4, $2
	BLT	tail_1
	MOVHZ	(R3), R12
	CMPB	R12, R5, R12
	// Same as above, for the zero-extended upper 6 bytes.
	SLD	$48, R12, R12	// Remove non-participating matches.
	POPCNTD	R12, R12
	ADD	R12, R18, R18
	ADD	$2, R3, R3
	ADD	$-2, R4, R4

tail_1:	// Count the remaining 0 - 1 bytes.
	CMP	R4, $1
	BLT	tail_0
	MOVBZ	(R3), R12
	CMPB	R12, R5, R12
	// The low byte is 0xFF on a match. Keeping only bit 3 yields 8 or 0,
	// i.e. one match in the 8x scale, and discards the padding bytes.
	ANDCC	$0x8, R12, R12
	ADD	R12, R18, R18
#endif

tail_0:	// No remaining tail to count.
	SRD	$3, R18, R3	// Fixup count, it is off by 8x.
	RET
155