/*
 * memchr - scan memory for a character
 *
 * Copyright (c) 2010-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/*
   Written by Dave Gilbert <[email protected]>

   This __memchr_arm routine is optimised on a Cortex-A9 and should work on
   all ARMv7 processors.  It has a fast path for short sizes, and has
   an optimised path for large data sets; the worst case is finding the
   match early in a large data set.
 */

@ 2011-02-07 david.gilbert@linaro.org
@    Extracted from local git a5b438d861
@ 2011-07-14 david.gilbert@linaro.org
@    Import endianness fix from local git ea786f1b
@ 2011-12-07 david.gilbert@linaro.org
@    Removed unneeded cbz from align loop

        .syntax unified
#if __ARM_ARCH >= 8 && __ARM_ARCH_PROFILE == 'M'
        /* keep config inherited from -march= */
#else
        .arch   armv7-a
#endif

@ this lets us check a flag in a 00/ff byte easily in either endianness
#ifdef __ARMEB__
#define CHARTSTMASK(c) 1<<(31-(c*8))
#else
#define CHARTSTMASK(c) 1<<(c*8)
#endif
        .thumb
#include "asmdefs.h"
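
/* For reference, a minimal C sketch of the word-at-a-time scan used in the
   main loop below.  This is illustrative only: the function name is
   hypothetical, the portable zero-byte test stands in for the UADD8/SEL
   sequence (which builds an equivalent 00/ff byte mask), alignment and tail
   handling are omitted, and the ctz step assumes little-endian.

        #include <stddef.h>
        #include <stdint.h>

        static const void *
        memchr_word_sketch (const uint32_t *p, size_t nwords, unsigned char c)
        {
            uint32_t rep = c * 0x01010101u;     // broadcast c to all 4 lanes
            for (size_t i = 0; i < nwords; i++)
            {
                uint32_t v = p[i] ^ rep;        // 00 byte in each matching lane
                uint32_t hit = (v - 0x01010101u) & ~v & 0x80808080u;
                if (hit != 0)                   // some byte of this word == c
                    // Lowest set bit marks the first matching lane.
                    return (const char *) (p + i) + (__builtin_ctz (hit) >> 3);
            }
            return 0;
        }
*/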

@ ---------------------------------------------------------------------------
        .thumb_func
        .align 2
        .p2align 4,,15
        .global __memchr_arm
        .type __memchr_arm,%function
        .fnstart
        .cfi_startproc
__memchr_arm:
        @ r0 = start of memory to scan
        @ r1 = character to look for
        @ r2 = length
        @ returns r0 = pointer to character or NULL if not found
        prologue
        and     r1,r1,#0xff     @ Don't think we can trust the caller to actually pass a char

        cmp     r2,#16          @ If it's short don't bother with anything clever
        blt     20f

        tst     r0, #7          @ If it's already aligned skip the next bit
        beq     10f

        @ Work up to an aligned point
5:
        ldrb    r3, [r0],#1
        subs    r2, r2, #1
        cmp     r3, r1
        beq     50f             @ If it matches exit found
        tst     r0, #7
        bne     5b              @ If not aligned yet then do next byte

10:
        @ At this point, we are aligned, we know we have at least 8 bytes to work with
        push    {r4,r5,r6,r7}
        .cfi_adjust_cfa_offset 16
        .cfi_rel_offset 4, 0
        .cfi_rel_offset 5, 4
        .cfi_rel_offset 6, 8
        .cfi_rel_offset 7, 12
        orr     r1, r1, r1, lsl #8      @ expand the match byte across the whole word
        orr     r1, r1, r1, lsl #16
        bic     r4, r2, #7      @ Number of double words to work with
        mvns    r7, #0          @ all F's
        movs    r3, #0

15:
        ldmia   r0!,{r5,r6}
        subs    r4, r4, #8
        eor     r5,r5, r1       @ Get it so that r5,r6 have 00's where the bytes match the target
        eor     r6,r6, r1
        uadd8   r5, r5, r7      @ Parallel add 0xff - sets the GE bits for anything that wasn't 0
        sel     r5, r3, r7      @ bytes are 00 for non-00 bytes, or ff for 00 bytes - NOTE INVERSION
        uadd8   r6, r6, r7      @ Parallel add 0xff - sets the GE bits for anything that wasn't 0
        sel     r6, r5, r7      @ chained... bytes are 00 for non-00 bytes, or ff for 00 bytes - NOTE INVERSION
        cbnz    r6, 60f
        bne     15b             @ (Flags from the subs above) If not run out of bytes then go around again

        pop     {r4,r5,r6,r7}
        .cfi_restore 7
        .cfi_restore 6
        .cfi_restore 5
        .cfi_restore 4
        .cfi_adjust_cfa_offset -16
        and     r1,r1,#0xff     @ Get r1 back to a single character from the expansion above
        and     r2,r2,#7        @ Leave the count remaining as the number after the double words have been done

20:
        cbz     r2, 40f         @ 0 length or hit the end already then not found

21:     @ Post aligned section, or just a short call
        ldrb    r3,[r0],#1
        subs    r2,r2,#1
        eor     r3,r3,r1        @ r3 = 0 if match - doesn't break flags from sub
        cbz     r3, 50f
        bne     21b             @ on r2 flags

40:
        .cfi_remember_state
        movs    r0,#0           @ not found
        epilogue

50:
        .cfi_restore_state
        .cfi_remember_state
        subs    r0,r0,#1        @ found
        epilogue
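
/* The code at label 60 below narrows the hit down from the pair of words just
   loaded to a byte address.  A rough little-endian C model of the same step
   (names hypothetical; note that in the assembly r6's mask is chained through
   r5, so the single CBNZ above covers both words):

        // mask1/mask2: the 00/ff per-byte match masks for the two words;
        // base: the address the pair of words was loaded from (r0 - 8).
        unsigned off = mask1 ? __builtin_ctz (mask1) >> 3
                             : 4 + (__builtin_ctz (mask2) >> 3);
        return base + off;
*/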

60:     @ We're here because the fast path found a hit - now we have to track down exactly which word it was
        @ r0 points to the start of the double word after the one that was tested
        @ r5 has the 00/ff pattern for the first word, r6 has the chained value
        .cfi_restore_state      @ Standard post-prologue state
        .cfi_adjust_cfa_offset 16
        .cfi_rel_offset 4, 0
        .cfi_rel_offset 5, 4
        .cfi_rel_offset 6, 8
        .cfi_rel_offset 7, 12
        cmp     r5, #0
        itte    eq
        moveq   r5, r6          @ the end is in the 2nd word
        subeq   r0,r0,#3        @ Points to 2nd byte of 2nd word
        subne   r0,r0,#7        @ or 2nd byte of 1st word

        @ r0 currently points to the 2nd byte of the word containing the hit
        tst     r5, # CHARTSTMASK(0)    @ 1st character
        bne     61f
        adds    r0,r0,#1
        tst     r5, # CHARTSTMASK(1)    @ 2nd character
        ittt    eq
        addeq   r0,r0,#1
        tsteq   r5, # (3<<15)           @ 2nd & 3rd character
        @ If not the 3rd must be the last one
        addeq   r0,r0,#1

61:
        pop     {r4,r5,r6,r7}
        .cfi_restore 7
        .cfi_restore 6
        .cfi_restore 5
        .cfi_restore 4
        .cfi_adjust_cfa_offset -16
        subs    r0,r0,#1        @ undo the post-increment past the matching byte
        epilogue
        .cfi_endproc
        .cantunwind
        .fnend

        .size   __memchr_arm, . - __memchr_arm
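
/* A possible sanity check against the libc memchr (hypothetical test harness,
   not part of this file; assumes the routine is linked as __memchr_arm with
   the standard memchr signature described by the register comments above):

        #include <string.h>
        #include <assert.h>

        extern void *__memchr_arm (const void *s, int c, size_t n);

        int main (void)
        {
            const char buf[] = "hello, world";
            assert (__memchr_arm (buf, 'w', sizeof buf)
                    == memchr (buf, 'w', sizeof buf));
            assert (__memchr_arm (buf, 'z', sizeof buf) == NULL);
            return 0;
        }
*/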