/*
 * strcmp for ARMv7
 *
 * Copyright (c) 2012-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

#if __ARM_ARCH >= 7 && __ARM_ARCH_ISA_ARM >= 1

/* Implementation of strcmp for ARMv7 when DSP instructions are
   available.  Use ldrd to support wider loads, provided the data
   is sufficiently aligned.  Use saturating arithmetic to optimize
   the compares.  */
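
/* In outline, the aligned fast path compares the strings a word (or a
   doubleword, via LDRD) at a time, roughly equivalent to this C sketch
   (illustrative only; no_byte_is_zero is shorthand, not a real helper):

     do
       {
	 w1 = *s1++;		// next aligned word of string 1
	 w2 = *s2++;		// next aligned word of string 2
       }
     while (w1 == w2 && no_byte_is_zero (w1));
     // then locate the first NUL or differing byte within w1/w2 and
     // return the difference of that byte pair.

   The two per-word tests are fused into a single "syndrome" word using
   UADD8 and SEL, and the final byte difference is recovered with CLZ
   and shifts; see the comments further down.  */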

#include "asmdefs.h"

/* Build Options:
   STRCMP_NO_PRECHECK: Don't run a quick pre-check of the first
   byte in the string.  If comparing completely random strings
   the pre-check will save time, since there is a very high
   probability of a mismatch in the first character: we save
   significant overhead if this is the common case.  However,
   if strings are likely to be identical (e.g. because we're
   verifying a hit in a hash table), then this check is largely
   redundant.  */

#define STRCMP_NO_PRECHECK	0
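
/* With the pre-check enabled, the prologue performs, in effect (an
   illustrative C sketch of the CMP #1 / IT CS / CMPCS sequence used
   below, with bytes compared as unsigned):

     if (s1[0] == 0 || s1[0] != s2[0])
       return s1[0] - s2[0];	// decided by the very first byte

   and only falls through to the word-wise code when the first bytes
   are equal and non-NUL.  */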

/* Ensure the .cantunwind directive is prepended to .fnend.
   Leaf functions cannot throw exceptions - EHABI only supports
   synchronous exceptions.  */
#define IS_LEAF

	/* This version uses Thumb-2 code.  */
	.thumb
	.syntax unified

#ifdef __ARM_BIG_ENDIAN
#define S2LO lsl
#define S2LOEQ lsleq
#define S2HI lsr
#define MSB 0x000000ff
#define LSB 0xff000000
#define BYTE0_OFFSET 24
#define BYTE1_OFFSET 16
#define BYTE2_OFFSET 8
#define BYTE3_OFFSET 0
#else /* not  __ARM_BIG_ENDIAN */
#define S2LO lsr
#define S2LOEQ lsreq
#define S2HI lsl
#define BYTE0_OFFSET 0
#define BYTE1_OFFSET 8
#define BYTE2_OFFSET 16
#define BYTE3_OFFSET 24
#define MSB 0xff000000
#define LSB 0x000000ff
#endif /* not  __ARM_BIG_ENDIAN */

/* Parameters and result.  */
#define src1		r0
#define src2		r1
#define result		r0	/* Overlaps src1.  */

/* Internal variables.  */
#define tmp1		r4
#define tmp2		r5
#define const_m1	r12

/* Additional internal variables for 64-bit aligned data.  */
#define data1a		r2
#define data1b		r3
#define data2a		r6
#define data2b		r7
#define syndrome_a	tmp1
#define syndrome_b	tmp2

/* Additional internal variables for 32-bit aligned data.  */
#define data1		r2
#define data2		r3
#define syndrome	tmp2


	/* Macro to compute and return the result value for word-aligned
	   cases.  */
	.macro strcmp_epilogue_aligned synd d1 d2 restore_r6
#ifdef __ARM_BIG_ENDIAN
	/* If data1 contains a zero byte, then syndrome will contain a 1 in
	   bit 7 of that byte.  Otherwise, the highest set bit in the
	   syndrome will highlight the first different bit.  It is therefore
	   sufficient to extract the eight bits starting with the syndrome
	   bit.  */
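	/* Worked example (big-endian view): for d1 = 0x61626364 ("abcd")
	   and d2 = 0x61626f64 ("abod") the syndrome is 0x00000c00, CLZ
	   returns 20, and after shifting both words left by 20 the top
	   bytes are 0x36 and 0xf6; 0x36 - 0xf6 < 0, matching 'c' < 'o'.
	   The 8-bit window need not be byte aligned: every bit above the
	   first difference is equal in d1 and d2 and cancels in the
	   subtraction.  */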
	clz	tmp1, \synd
	lsl	r1, \d2, tmp1
	.if \restore_r6
	ldrd	r6, r7, [sp, #8]
	.endif
	.cfi_restore 6
	.cfi_restore 7
	lsl	\d1, \d1, tmp1
	.cfi_remember_state
	lsr	result, \d1, #24
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	.cfi_adjust_cfa_offset -16
	sub	result, result, r1, lsr #24
	epilogue push_ip=HAVE_PAC_LEAF
#else
	/* To use the big-endian trick we'd have to reverse all three words.
	   That's slower than this approach.  */
	rev	\synd, \synd
	clz	tmp1, \synd
	bic	tmp1, tmp1, #7
	lsr	r1, \d2, tmp1
	.cfi_remember_state
	.if \restore_r6
	ldrd	r6, r7, [sp, #8]
	.endif
	.cfi_restore 6
	.cfi_restore 7
	lsr	\d1, \d1, tmp1
	and	result, \d1, #255
	and	r1, r1, #255
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	.cfi_adjust_cfa_offset -16
	sub	result, result, r1

	epilogue push_ip=HAVE_PAC_LEAF
#endif
	.endm

ENTRY(__strcmp_arm)
	prologue push_ip=HAVE_PAC_LEAF
#if STRCMP_NO_PRECHECK == 0
	ldrb	r2, [src1]
	ldrb	r3, [src2]
	cmp	r2, #1
	it	cs
	cmpcs	r2, r3
	bne	L(fastpath_exit)
#endif
	strd	r4, r5, [sp, #-16]!
	.cfi_adjust_cfa_offset 16
	.cfi_rel_offset 4, 0
	.cfi_rel_offset 5, 4
	orr	tmp1, src1, src2
	strd	r6, r7, [sp, #8]
	.cfi_rel_offset 6, 8
	.cfi_rel_offset 7, 12
	mvn	const_m1, #0
	lsl	r2, tmp1, #29
	cbz	r2, L(loop_aligned8)

L(not_aligned):
	eor	tmp1, src1, src2
	tst	tmp1, #7
	bne	L(misaligned8)

	/* Deal with mutual misalignment by aligning downwards and then
	   masking off the unwanted loaded data to prevent a difference.  */
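	/* Roughly, in C (little-endian view, where S2HI is LSL; the mask
	   name is shorthand for this sketch only):

	     ofs  = src1 & 7;
	     mask = ~(~0u << (8 * (ofs & 3)));	// ones over the leading junk bytes
	     data1a |= mask;  data2a |= mask;
	     if (ofs & 4)			// junk spills into the whole first word
	       {
		 data1b |= mask;  data2b |= mask;
		 data1a = data2a = ~0u;
	       }

	   Forcing the junk bytes to 0xff in both operands means they can
	   neither miscompare nor be mistaken for a NUL terminator.  */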
	and	tmp1, src1, #7
	bic	src1, src1, #7
	and	tmp2, tmp1, #3
	bic	src2, src2, #7
	lsl	tmp2, tmp2, #3	/* Bytes -> bits.  */
	ldrd	data1a, data1b, [src1], #16
	tst	tmp1, #4
	ldrd	data2a, data2b, [src2], #16
	/* In thumb code we can't use MVN with a register shift, but
	   we do have ORN.  */
	S2HI	tmp1, const_m1, tmp2
	orn	data1a, data1a, tmp1
	orn	data2a, data2a, tmp1
	beq	L(start_realigned8)
	orn	data1b, data1b, tmp1
	mov	data1a, const_m1
	orn	data2b, data2b, tmp1
	mov	data2a, const_m1
	b	L(start_realigned8)

	/* Unroll the inner loop by a factor of 2, giving 16 bytes per
	   pass.  */
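	/* Each word pair is reduced to a "syndrome" with UADD8/EOR/SEL:
	   UADD8 against 0xffffffff sets the per-byte GE flag exactly for
	   the non-zero bytes of data1, and SEL then picks, per byte,
	   either the EOR difference (byte of data1 non-zero) or 0xff
	   (byte of data1 is NUL).  Illustrative C, with byte(x, i)
	   standing for the i-th byte of x (not a real helper):

	     for (i = 0; i < 4; i++)
	       byte (synd, i) = byte (data1, i) == 0
				? 0xff
				: byte (data1, i) ^ byte (data2, i);

	   so the syndrome is zero iff the words are equal and data1 has
	   no NUL byte, and a non-zero syndrome locates the first
	   differing or NUL byte.  */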
	.p2align 5,,12  /* Don't start in the tail bytes of a cache line.  */
	.p2align 2	/* Always word aligned.  */
L(loop_aligned8):
	ldrd	data1a, data1b, [src1], #16
	ldrd	data2a, data2b, [src2], #16
L(start_realigned8):
	uadd8	syndrome_b, data1a, const_m1	/* Only want GE bits.  */
	eor	syndrome_a, data1a, data2a
	sel	syndrome_a, syndrome_a, const_m1
	cbnz	syndrome_a, L(diff_in_a)
	uadd8	syndrome_b, data1b, const_m1	/* Only want GE bits.  */
	eor	syndrome_b, data1b, data2b
	sel	syndrome_b, syndrome_b, const_m1
	cbnz	syndrome_b, L(diff_in_b)

	ldrd	data1a, data1b, [src1, #-8]
	ldrd	data2a, data2b, [src2, #-8]
	uadd8	syndrome_b, data1a, const_m1	/* Only want GE bits.  */
	eor	syndrome_a, data1a, data2a
	sel	syndrome_a, syndrome_a, const_m1
	uadd8	syndrome_b, data1b, const_m1	/* Only want GE bits.  */
	eor	syndrome_b, data1b, data2b
	sel	syndrome_b, syndrome_b, const_m1
	/* Can't use CBZ for backwards branch.  */
	orrs	syndrome_b, syndrome_b, syndrome_a /* Only need if s_a == 0 */
	beq	L(loop_aligned8)

L(diff_found):
	cbnz	syndrome_a, L(diff_in_a)

L(diff_in_b):
	strcmp_epilogue_aligned syndrome_b, data1b, data2b 1

L(diff_in_a):
	.cfi_restore_state
	strcmp_epilogue_aligned syndrome_a, data1a, data2a 1

	.cfi_restore_state
L(misaligned8):
	tst	tmp1, #3
	bne	L(misaligned4)
	ands	tmp1, src1, #3
	bne	L(mutual_align4)

	/* Unrolled by a factor of 2, to reduce the number of post-increment
	   operations.  */
L(loop_aligned4):
	ldr	data1, [src1], #8
	ldr	data2, [src2], #8
L(start_realigned4):
	uadd8	syndrome, data1, const_m1	/* Only need GE bits.  */
	eor	syndrome, data1, data2
	sel	syndrome, syndrome, const_m1
	cbnz	syndrome, L(aligned4_done)
	ldr	data1, [src1, #-4]
	ldr	data2, [src2, #-4]
	uadd8	syndrome, data1, const_m1
	eor	syndrome, data1, data2
	sel	syndrome, syndrome, const_m1
	cmp	syndrome, #0
	beq	L(loop_aligned4)

L(aligned4_done):
	strcmp_epilogue_aligned syndrome, data1, data2, 0

L(mutual_align4):
	.cfi_restore_state
	/* Deal with mutual misalignment by aligning downwards and then
	   masking off the unwanted loaded data to prevent a difference.  */
	lsl	tmp1, tmp1, #3	/* Bytes -> bits.  */
	bic	src1, src1, #3
	ldr	data1, [src1], #8
	bic	src2, src2, #3
	ldr	data2, [src2], #8

	/* In thumb code we can't use MVN with a register shift, but
	   we do have ORN.  */
	S2HI	tmp1, const_m1, tmp1
	orn	data1, data1, tmp1
	orn	data2, data2, tmp1
	b	L(start_realigned4)

L(misaligned4):
	ands	tmp1, src1, #3
	beq	L(src1_aligned)
	sub	src2, src2, tmp1
	bic	src1, src1, #3
	lsls	tmp1, tmp1, #31
	ldr	data1, [src1], #4
	beq	L(aligned_m2)
	bcs	L(aligned_m1)

#if STRCMP_NO_PRECHECK == 1
	ldrb	data2, [src2, #1]
	uxtb	tmp1, data1, ror #BYTE1_OFFSET
	subs	tmp1, tmp1, data2
	bne	L(misaligned_exit)
	cbz	data2, L(misaligned_exit)

L(aligned_m2):
	ldrb	data2, [src2, #2]
	uxtb	tmp1, data1, ror #BYTE2_OFFSET
	subs	tmp1, tmp1, data2
	bne	L(misaligned_exit)
	cbz	data2, L(misaligned_exit)

L(aligned_m1):
	ldrb	data2, [src2, #3]
	uxtb	tmp1, data1, ror #BYTE3_OFFSET
	subs	tmp1, tmp1, data2
	bne	L(misaligned_exit)
	add	src2, src2, #4
	cbnz	data2, L(src1_aligned)
#else  /* STRCMP_NO_PRECHECK */
	/* If we've done the pre-check, then we don't need to check the
	   first byte again here.  */
	ldrb	data2, [src2, #2]
	uxtb	tmp1, data1, ror #BYTE2_OFFSET
	subs	tmp1, tmp1, data2
	bne	L(misaligned_exit)
	cbz	data2, L(misaligned_exit)

L(aligned_m2):
	ldrb	data2, [src2, #3]
	uxtb	tmp1, data1, ror #BYTE3_OFFSET
	subs	tmp1, tmp1, data2
	bne	L(misaligned_exit)
	cbnz	data2, L(aligned_m1)
#endif

L(misaligned_exit):
	.cfi_remember_state
	mov	result, tmp1
	ldr	r4, [sp], #16
	.cfi_restore 4
	.cfi_adjust_cfa_offset -16
	epilogue push_ip=HAVE_PAC_LEAF

#if STRCMP_NO_PRECHECK == 0
L(fastpath_exit):
	.cfi_restore_state
	.cfi_remember_state
	sub	r0, r2, r3
	epilogue push_ip=HAVE_PAC_LEAF

L(aligned_m1):
	.cfi_restore_state
	.cfi_remember_state
	add	src2, src2, #4
#endif
L(src1_aligned):
	.cfi_restore_state
	/* src1 is word aligned, but src2 has no common alignment
	   with it.  */
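	/* src2 is offset within its word by n bytes (n = 1, 2 or 3, handled
	   by L(overlap3), L(overlap2) and L(overlap1) respectively), so each
	   aligned load from src2 supplies only 4 - n useful bytes and every
	   data1 word is checked in two steps, roughly (little-endian view,
	   where S2LO is LSR and S2HI is LSL; check() and TOPn are shorthand
	   for this sketch only):

	     check (data1 & ~TOPn, data2 >> (8 * n));	    // bytes left in old src2 word
	     data2 = *src2++;
	     check (data1 &  TOPn, data2 << (8 * (4 - n))); // bytes from new src2 word

	   where TOPn masks the top n bytes of a word.  */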
	ldr	data1, [src1], #4
	lsls	tmp1, src2, #31		/* C=src2[1], Z=src2[0].  */

	bic	src2, src2, #3
	ldr	data2, [src2], #4
	bhi	L(overlap1)		/* C=1, Z=0 => src2[1:0] = 0b11.  */
	bcs	L(overlap2)		/* C=1, Z=1 => src2[1:0] = 0b10.  */

	/* (overlap3) C=0, Z=0 => src2[1:0] = 0b01.  */
L(overlap3):
	bic	tmp1, data1, #MSB
	uadd8	syndrome, data1, const_m1
	eors	syndrome, tmp1, data2, S2LO #8
	sel	syndrome, syndrome, const_m1
	bne	4f
	cbnz	syndrome, 5f
	ldr	data2, [src2], #4
	eor	tmp1, tmp1, data1
	cmp	tmp1, data2, S2HI #24
	bne	6f
	ldr	data1, [src1], #4
	b	L(overlap3)
4:
	S2LO	data2, data2, #8
	b	L(strcmp_tail)

5:
	bics	syndrome, syndrome, #MSB
	bne	L(strcmp_done_equal)

	/* We can only get here if the MSB of data1 contains 0, so
	   fast-path the exit.  */
	ldrb	result, [src2]
	.cfi_remember_state
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	/* R6/7 Not used in this sequence.  */
	.cfi_restore 6
	.cfi_restore 7
	.cfi_adjust_cfa_offset -16
	neg	result, result
	epilogue push_ip=HAVE_PAC_LEAF
6:
	.cfi_restore_state
	S2LO	data1, data1, #24
	and	data2, data2, #LSB
	b	L(strcmp_tail)

	.p2align 5,,12	/* Ensure at least 3 instructions in cache line.  */
L(overlap2):
	and	tmp1, data1, const_m1, S2LO #16
	uadd8	syndrome, data1, const_m1
	eors	syndrome, tmp1, data2, S2LO #16
	sel	syndrome, syndrome, const_m1
	bne	4f
	cbnz	syndrome, 5f
	ldr	data2, [src2], #4
	eor	tmp1, tmp1, data1
	cmp	tmp1, data2, S2HI #16
	bne	6f
	ldr	data1, [src1], #4
	b	L(overlap2)
4:
	S2LO	data2, data2, #16
	b	L(strcmp_tail)
5:
	ands	syndrome, syndrome, const_m1, S2LO #16
	bne	L(strcmp_done_equal)

	ldrh	data2, [src2]
	S2LO	data1, data1, #16
#ifdef __ARM_BIG_ENDIAN
	lsl	data2, data2, #16
#endif
	b	L(strcmp_tail)

6:
	S2LO	data1, data1, #16
	and	data2, data2, const_m1, S2LO #16
	b	L(strcmp_tail)

	.p2align 5,,12	/* Ensure at least 3 instructions in cache line.  */
L(overlap1):
	and	tmp1, data1, #LSB
	uadd8	syndrome, data1, const_m1
	eors	syndrome, tmp1, data2, S2LO #24
	sel	syndrome, syndrome, const_m1
	bne	4f
	cbnz	syndrome, 5f
	ldr	data2, [src2], #4
	eor	tmp1, tmp1, data1
	cmp	tmp1, data2, S2HI #8
	bne	6f
	ldr	data1, [src1], #4
	b	L(overlap1)
4:
	S2LO	data2, data2, #24
	b	L(strcmp_tail)
5:
	tst	syndrome, #LSB
	bne	L(strcmp_done_equal)
	ldr	data2, [src2]
6:
	S2LO	data1, data1, #8
	bic	data2, data2, #MSB
	b	L(strcmp_tail)

L(strcmp_done_equal):
	mov	result, #0
	.cfi_remember_state
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	/* R6/7 not used in this sequence.  */
	.cfi_restore 6
	.cfi_restore 7
	.cfi_adjust_cfa_offset -16
	epilogue push_ip=HAVE_PAC_LEAF

L(strcmp_tail):
	.cfi_restore_state
#ifndef __ARM_BIG_ENDIAN
	rev	data1, data1
	rev	data2, data2
	/* Now everything looks big-endian...  */
#endif
	uadd8	tmp1, data1, const_m1
	eor	tmp1, data1, data2
	sel	syndrome, tmp1, const_m1
	clz	tmp1, syndrome
	lsl	data1, data1, tmp1
	lsl	data2, data2, tmp1
	lsr	result, data1, #24
	ldrd	r4, r5, [sp], #16
	.cfi_restore 4
	.cfi_restore 5
	/* R6/7 not used in this sequence.  */
	.cfi_restore 6
	.cfi_restore 7
	.cfi_adjust_cfa_offset -16
	sub	result, result, data2, lsr #24
	epilogue push_ip=HAVE_PAC_LEAF

END (__strcmp_arm)

#endif /* __ARM_ARCH >= 7 && __ARM_ARCH_ISA_ARM >= 1  */