/*
 * memchr - scan memory for a character
 *
 * Copyright (c) 2010-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/*
   Written by Dave Gilbert <[email protected]>

   This __memchr_arm routine is optimised on a Cortex-A9 and should work on
   all ARMv7 processors.  It has a fast path for short sizes and an
   optimised path for large data sets; the worst case is finding the
   match early in a large data set.

 */
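
@ A rough C equivalent of the strategy below (an illustrative sketch, not
@ the original implementation; assumes <stdint.h> for uintptr_t):
@
@   void *memchr_sketch (const unsigned char *s, int c, size_t n)
@   {
@     c &= 0xff;
@     while (n && ((uintptr_t) s & 7)) {          /* align to 8 bytes */
@       if (*s == c) return (void *) s;
@       s++; n--;
@     }
@     for (size_t i = n & ~(size_t) 7; i; i -= 8) /* 8 bytes per step */
@       for (int j = 0; j < 8; j++, s++)          /* = the SWAR test  */
@         if (*s == c) return (void *) s;
@     for (n &= 7; n; n--, s++)                   /* tail             */
@       if (*s == c) return (void *) s;
@     return 0;
@   }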

@ 2011-02-07 david.gilbert@linaro.org
@    Extracted from local git a5b438d861
@ 2011-07-14 david.gilbert@linaro.org
@    Import endianness fix from local git ea786f1b
@ 2011-12-07 david.gilbert@linaro.org
@    Removed unneeded cbz from align loop

	.syntax unified
#if __ARM_ARCH >= 8 && __ARM_ARCH_PROFILE == 'M'
    /* keep config inherited from -march= */
#else
	.arch armv7-a
#endif

@ this lets us check a flag in a 00/ff byte easily in either endianness
#ifdef __ARMEB__
#define CHARTSTMASK(c) 1<<(31-(c*8))
#else
#define CHARTSTMASK(c) 1<<(c*8)
#endif
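@ For example, on a little-endian build CHARTSTMASK(0) is 1<<0 and
@ CHARTSTMASK(2) is 1<<16: each mask tests only the low bit of one flag
@ byte, which is enough because the flag bytes below are always 00 or ff.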
	.thumb
#include "asmdefs.h"


@ ---------------------------------------------------------------------------
	.thumb_func
	.align 2
	.p2align 4,,15
	.global __memchr_arm
	.type __memchr_arm,%function
	.fnstart
	.cfi_startproc
__memchr_arm:
	@ r0 = start of memory to scan
	@ r1 = character to look for
	@ r2 = length
	@ returns r0 = pointer to character or NULL if not found
	prologue
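	@ Usage matches standard memchr; an illustrative call from C:
	@   void *p = __memchr_arm (buf, 0x2a, len);
	@ Per the AAPCS the three arguments arrive in r0, r1 and r2 as
	@ listed above, and the result is returned in r0.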
	and	r1,r1,#0xff	@ Don't think we can trust the caller to actually pass a char

	cmp	r2,#16		@ If it's short don't bother with anything clever
	blt	20f

	tst	r0, #7		@ If it's already aligned skip the next bit
	beq	10f

	@ Work up to an aligned point
5:
	ldrb	r3, [r0],#1
	subs	r2, r2, #1
	cmp	r3, r1
	beq	50f		@ If it matches exit found
	tst	r0, #7
	bne	5b		@ If not aligned yet then do next byte

10:
	@ At this point, we are aligned, we know we have at least 8 bytes to work with
	push	{r4,r5,r6,r7}
	.cfi_adjust_cfa_offset 16
	.cfi_rel_offset 4, 0
	.cfi_rel_offset 5, 4
	.cfi_rel_offset 6, 8
	.cfi_rel_offset 7, 12
	orr	r1, r1, r1, lsl #8	@ expand the match word across to all bytes
	orr	r1, r1, r1, lsl #16
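	@ Worked example (illustrative): for r1 = 0x41 ('A') the two orr
	@ instructions above produce 0x4141 and then 0x41414141,
	@ replicating the target character into all four bytes of r1.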
	bic	r4, r2, #7	@ Number of double words to work with
	mvns	r7, #0		@ all F's
	movs	r3, #0

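@ How the uadd8/sel pairs in the loop below work (an illustrative worked
@ example, shown in little-endian lane order): after the eor, a byte that
@ matched the target is 0x00. uadd8 adds 0xff to every byte lane and sets
@ the GE bit for each lane that carried, i.e. each non-zero (non-matching)
@ byte. sel then picks the r3 byte (0x00) where GE is set and the r7 byte
@ (0xff) where it is clear, so:
@   r5 after eor:  0x56 34 00 12   (one matching byte)
@   GE per lane :     1  1  0  1
@   r5 after sel:  0x00 00 ff 00   -> non-zero iff there was a match
@ The second sel chains through r5, so r6 ends up non-zero if either of
@ the two words contained a match.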
15:
	ldmia	r0!,{r5,r6}
	subs	r4, r4, #8
	eor	r5,r5, r1	@ Get it so that r5,r6 have 00's where the bytes match the target
	eor	r6,r6, r1
	uadd8	r5, r5, r7	@ Parallel add 0xff - sets the GE bits for anything that wasn't 0
	sel	r5, r3, r7	@ bytes are 00 for non-00 bytes, or ff for 00 bytes - NOTE INVERSION
	uadd8	r6, r6, r7	@ Parallel add 0xff - sets the GE bits for anything that wasn't 0
	sel	r6, r5, r7	@ chained....bytes are 00 for non-00 bytes, or ff for 00 bytes - NOTE INVERSION
	cbnz	r6, 60f
	bne	15b		@ (Flags from the subs above) If not run out of bytes then go around again

	pop	{r4,r5,r6,r7}
	.cfi_restore 7
	.cfi_restore 6
	.cfi_restore 5
	.cfi_restore 4
	.cfi_adjust_cfa_offset -16
	and	r1,r1,#0xff	@ Get r1 back to a single character from the expansion above
	and	r2,r2,#7	@ Leave the count remaining as the number after the double words have been done

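@ Worked example of the length split (illustrative): for r2 = 27, the
@ "bic r4,r2,#7" above gave r4 = 24 (three double words for the fast
@ loop), and "and r2,r2,#7" leaves r2 = 3 bytes for the tail loop below.
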
20:
	cbz	r2, 40f		@ 0 length or hit the end already then not found

21:  @ Post aligned section, or just a short call
	ldrb	r3,[r0],#1
	subs	r2,r2,#1
	eor	r3,r3,r1	@ r3 = 0 if match - doesn't break flags from sub
	cbz	r3, 50f
	bne	21b		@ on r2 flags

40:
	.cfi_remember_state
	movs	r0,#0		@ not found
	epilogue

50:
	.cfi_restore_state
	.cfi_remember_state
	subs	r0,r0,#1	@ found
	epilogue

60:  @ We're here because the fast path found a hit - now we have to track down exactly where it was
	@ r0 points to the start of the double word after the one that was tested
	@ r5 has the 00/ff pattern for the first word, r6 has the chained value
	.cfi_restore_state	@ Standard post-prologue state
	.cfi_adjust_cfa_offset 16
	.cfi_rel_offset 4, 0
	.cfi_rel_offset 5, 4
	.cfi_rel_offset 6, 8
	.cfi_rel_offset 7, 12
	cmp	r5, #0
	itte	eq
	moveq	r5, r6		@ the match is in the 2nd word
	subeq	r0,r0,#3	@ Points to 2nd byte of 2nd word
	subne	r0,r0,#7	@ or 2nd byte of 1st word

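	@ Worked example for the tests below (illustrative, little-endian):
	@ a hit in byte 2 gives r5 = 0x00ff0000. CHARTSTMASK(0) and
	@ CHARTSTMASK(1) are both clear, so r0 is advanced twice; the
	@ 3<<15 test sees bit 16 set, so the final addeq is skipped, and
	@ the "subs r0,r0,#1" at 61 leaves r0 pointing at byte 2 as required.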
	@ r0 currently points to the 2nd byte of the word containing the hit
	tst	r5, # CHARTSTMASK(0)	@ 1st character
	bne	61f
	adds	r0,r0,#1
	tst	r5, # CHARTSTMASK(1)	@ 2nd character
	ittt	eq
	addeq	r0,r0,#1
	tsteq	r5, # (3<<15)		@ 2nd & 3rd character
	@ If it's not the 3rd it must be the last one
	addeq	r0,r0,#1

61:
	pop	{r4,r5,r6,r7}
	.cfi_restore 7
	.cfi_restore 6
	.cfi_restore 5
	.cfi_restore 4
	.cfi_adjust_cfa_offset -16
	subs	r0,r0,#1
	epilogue
	.cfi_endproc
	.cantunwind
	.fnend

	.size	__memchr_arm, . - __memchr_arm