/*
 * memcpy - copy memory area
 *
 * Copyright (c) 2013-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/*
   This memcpy routine is optimised for Cortex-A15 cores and takes advantage
   of VFP or NEON when built with the appropriate flags.

   Assumptions:

    ARMv6 (ARMv7-a if using Neon)
    ARM state
    Unaligned accesses

 */

#include "asmdefs.h"

        .syntax unified
        /* This implementation requires ARM state. */
        .arm

#ifdef __ARM_NEON__

        .fpu    neon
        .arch   armv7-a
# define FRAME_SIZE 4
# define USE_VFP
# define USE_NEON

#elif !defined (__SOFTFP__)

        .arch   armv6
        .fpu    vfpv2
# define FRAME_SIZE 32
# define USE_VFP

#else
        .arch   armv6
# define FRAME_SIZE 32

#endif

/* Old versions of GAS incorrectly implement the NEON align semantics. */
#ifdef BROKEN_ASM_NEON_ALIGN
#define ALIGN(addr, align) addr,:align
#else
#define ALIGN(addr, align) addr:align
#endif

#define PC_OFFSET 8 /* PC pipeline compensation. */
#define INSN_SIZE 4

/* Call parameters. */
#define dstin   r0
#define src     r1
#define count   r2

/* Locals. */
#define tmp1    r3
#define dst     ip
#define tmp2    r10

#ifndef USE_NEON
/* For bulk copies using GP registers. */
#define A_l     r2  /* Call-clobbered. */
#define A_h     r3  /* Call-clobbered. */
#define B_l     r4
#define B_h     r5
#define C_l     r6
#define C_h     r7
#define D_l     r8
#define D_h     r9
#endif

/* Number of lines ahead to pre-fetch data. If you change this, the code
   below will need adjusting to compensate. */

#define prefetch_lines 5
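/* How the VFP line-copy macro below streams data: each cpy_line_vfp
   invocation copies one 64-byte line, alternating vstr/vldr through
   d0-d2 plus the per-line register \vreg so that stores always drain
   data loaded on an earlier pass.  The one odd load in the middle
   refills \vreg from (prefetch_lines * 64 - 32) bytes ahead of the
   current position; in effect the d registers double as a software
   prefetch buffer, so no PLD is needed on this path. */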
#ifdef USE_VFP
        .macro  cpy_line_vfp vreg, base
        vstr    \vreg, [dst, #\base]
        vldr    \vreg, [src, #\base]
        vstr    d0, [dst, #\base + 8]
        vldr    d0, [src, #\base + 8]
        vstr    d1, [dst, #\base + 16]
        vldr    d1, [src, #\base + 16]
        vstr    d2, [dst, #\base + 24]
        vldr    d2, [src, #\base + 24]
        vstr    \vreg, [dst, #\base + 32]
        vldr    \vreg, [src, #\base + prefetch_lines * 64 - 32]
        vstr    d0, [dst, #\base + 40]
        vldr    d0, [src, #\base + 40]
        vstr    d1, [dst, #\base + 48]
        vldr    d1, [src, #\base + 48]
        vstr    d2, [dst, #\base + 56]
        vldr    d2, [src, #\base + 56]
        .endm

        .macro  cpy_tail_vfp vreg, base
        vstr    \vreg, [dst, #\base]
        vldr    \vreg, [src, #\base]
        vstr    d0, [dst, #\base + 8]
        vldr    d0, [src, #\base + 8]
        vstr    d1, [dst, #\base + 16]
        vldr    d1, [src, #\base + 16]
        vstr    d2, [dst, #\base + 24]
        vldr    d2, [src, #\base + 24]
        vstr    \vreg, [dst, #\base + 32]
        vstr    d0, [dst, #\base + 40]
        vldr    d0, [src, #\base + 40]
        vstr    d1, [dst, #\base + 48]
        vldr    d1, [src, #\base + 48]
        vstr    d2, [dst, #\base + 56]
        vldr    d2, [src, #\base + 56]
        .endm
#endif

ENTRY (__memcpy_arm)

        mov     dst, dstin  /* Preserve dstin; we need to return it. */
        cmp     count, #64
        bhs     L(cpy_not_short)
        /* Deal with small copies quickly by dropping straight into the
           exit block. */

L(tail63unaligned):
#ifdef USE_NEON
        and     tmp1, count, #0x38
        rsb     tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
        add     pc, pc, tmp1
        vld1.8  {d0}, [src]!    /* 14 words to go. */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 12 words to go. */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 10 words to go. */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 8 words to go. */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 6 words to go. */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 4 words to go. */
        vst1.8  {d0}, [dst]!
        vld1.8  {d0}, [src]!    /* 2 words to go. */
        vst1.8  {d0}, [dst]!

        tst     count, #4
        ldrne   tmp1, [src], #4
        strne   tmp1, [dst], #4
#else
        /* Copy up to 15 full words of data. May not be aligned. */
        /* Cannot use VFP for unaligned data. */
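        /* The tail ladders in this file dispatch with a computed branch,
           Duff's-device style.  In ARM state, reading PC yields the
           address of the current instruction plus 8 (PC_OFFSET).  Here
           each word costs two 4-byte instructions, hence the LSL #1:
           the target works out to (address of the ADD) + 124 -
           2 * (count & 0x3c), so 60 residual bytes land on the first
           LDR and 0 skips all thirty instructions.  The doubleword
           ladders use the unshifted form, since there 8 bytes of data
           cost exactly 8 bytes of code. */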
        and     tmp1, count, #0x3c
        add     dst, dst, tmp1
        add     src, src, tmp1
        rsb     tmp1, tmp1, #(60 - PC_OFFSET/2 + INSN_SIZE/2)
        /* Jump directly into the sequence below at the correct offset. */
        add     pc, pc, tmp1, lsl #1

        ldr     tmp1, [src, #-60]   /* 15 words to go. */
        str     tmp1, [dst, #-60]

        ldr     tmp1, [src, #-56]   /* 14 words to go. */
        str     tmp1, [dst, #-56]
        ldr     tmp1, [src, #-52]
        str     tmp1, [dst, #-52]

        ldr     tmp1, [src, #-48]   /* 12 words to go. */
        str     tmp1, [dst, #-48]
        ldr     tmp1, [src, #-44]
        str     tmp1, [dst, #-44]

        ldr     tmp1, [src, #-40]   /* 10 words to go. */
        str     tmp1, [dst, #-40]
        ldr     tmp1, [src, #-36]
        str     tmp1, [dst, #-36]

        ldr     tmp1, [src, #-32]   /* 8 words to go. */
        str     tmp1, [dst, #-32]
        ldr     tmp1, [src, #-28]
        str     tmp1, [dst, #-28]

        ldr     tmp1, [src, #-24]   /* 6 words to go. */
        str     tmp1, [dst, #-24]
        ldr     tmp1, [src, #-20]
        str     tmp1, [dst, #-20]

        ldr     tmp1, [src, #-16]   /* 4 words to go. */
        str     tmp1, [dst, #-16]
        ldr     tmp1, [src, #-12]
        str     tmp1, [dst, #-12]

        ldr     tmp1, [src, #-8]    /* 2 words to go. */
        str     tmp1, [dst, #-8]
        ldr     tmp1, [src, #-4]
        str     tmp1, [dst, #-4]
#endif

        lsls    count, count, #31
        ldrhcs  tmp1, [src], #2
        ldrbne  src, [src]  /* Src is dead, use as a scratch. */
        strhcs  tmp1, [dst], #2
        strbne  src, [dst]
        bx      lr

L(cpy_not_short):
        /* At least 64 bytes to copy, but don't know the alignment yet. */
        str     tmp2, [sp, #-FRAME_SIZE]!
        and     tmp2, src, #7
        and     tmp1, dst, #7
        cmp     tmp1, tmp2
        bne     L(cpy_notaligned)

#ifdef USE_VFP
        /* Magic dust alert! Force VFP on Cortex-A9. Experiments show
           that the FP pipeline is much better at streaming loads and
           stores. This is outside the critical loop. */
        vmov.f32        s0, s0
#endif

        /* SRC and DST have the same mutual 64-bit alignment, but we may
           still need to pre-copy some bytes to get to natural alignment.
           We bring SRC and DST into full 64-bit alignment. */
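        /* The flag juggling below works like this: DST << 29 leaves
           dst[2:0] in bits 31:29, so EQ means the pointer is already
           8-byte aligned.  After negating, bit 31 (MI) flags a 4-byte
           step; shifting two more bits up moves the 2-byte need into
           C (CS) and leaves the 1-byte need in Z (NE).  E.g. dst & 7
           == 5 requires 3 bytes: no word, one halfword, one byte. */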
        lsls    tmp2, dst, #29
        beq     1f
        rsbs    tmp2, tmp2, #0
        sub     count, count, tmp2, lsr #29
        ldrmi   tmp1, [src], #4
        strmi   tmp1, [dst], #4
        lsls    tmp2, tmp2, #2
        ldrhcs  tmp1, [src], #2
        ldrbne  tmp2, [src], #1
        strhcs  tmp1, [dst], #2
        strbne  tmp2, [dst], #1

1:
        subs    tmp2, count, #64    /* Use tmp2 for count. */
        blo     L(tail63aligned)

        cmp     tmp2, #512
        bhs     L(cpy_body_long)

L(cpy_body_medium):         /* Count in tmp2. */
#ifdef USE_VFP
1:
        vldr    d0, [src, #0]
        subs    tmp2, tmp2, #64
        vldr    d1, [src, #8]
        vstr    d0, [dst, #0]
        vldr    d0, [src, #16]
        vstr    d1, [dst, #8]
        vldr    d1, [src, #24]
        vstr    d0, [dst, #16]
        vldr    d0, [src, #32]
        vstr    d1, [dst, #24]
        vldr    d1, [src, #40]
        vstr    d0, [dst, #32]
        vldr    d0, [src, #48]
        vstr    d1, [dst, #40]
        vldr    d1, [src, #56]
        vstr    d0, [dst, #48]
        add     src, src, #64
        vstr    d1, [dst, #56]
        add     dst, dst, #64
        bhs     1b
        tst     tmp2, #0x3f
        beq     L(done)

L(tail63aligned):           /* Count in tmp2. */
        and     tmp1, tmp2, #0x38
        add     dst, dst, tmp1
        add     src, src, tmp1
        rsb     tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
        add     pc, pc, tmp1

        vldr    d0, [src, #-56] /* 14 words to go. */
        vstr    d0, [dst, #-56]
        vldr    d0, [src, #-48] /* 12 words to go. */
        vstr    d0, [dst, #-48]
        vldr    d0, [src, #-40] /* 10 words to go. */
        vstr    d0, [dst, #-40]
        vldr    d0, [src, #-32] /* 8 words to go. */
        vstr    d0, [dst, #-32]
        vldr    d0, [src, #-24] /* 6 words to go. */
        vstr    d0, [dst, #-24]
        vldr    d0, [src, #-16] /* 4 words to go. */
        vstr    d0, [dst, #-16]
        vldr    d0, [src, #-8]  /* 2 words to go. */
        vstr    d0, [dst, #-8]
#else
        sub     src, src, #8
        sub     dst, dst, #8
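        /* The #-8 bias above lets every LDRD/STRD pair below share the
           same positive immediates (#8..#64), with the address
           writeback folded into the final pair so only one pointer
           update per 64 bytes is needed. */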
1:
        ldrd    A_l, A_h, [src, #8]
        strd    A_l, A_h, [dst, #8]
        ldrd    A_l, A_h, [src, #16]
        strd    A_l, A_h, [dst, #16]
        ldrd    A_l, A_h, [src, #24]
        strd    A_l, A_h, [dst, #24]
        ldrd    A_l, A_h, [src, #32]
        strd    A_l, A_h, [dst, #32]
        ldrd    A_l, A_h, [src, #40]
        strd    A_l, A_h, [dst, #40]
        ldrd    A_l, A_h, [src, #48]
        strd    A_l, A_h, [dst, #48]
        ldrd    A_l, A_h, [src, #56]
        strd    A_l, A_h, [dst, #56]
        ldrd    A_l, A_h, [src, #64]!
        strd    A_l, A_h, [dst, #64]!
        subs    tmp2, tmp2, #64
        bhs     1b
        tst     tmp2, #0x3f
        bne     1f
        ldr     tmp2, [sp], #FRAME_SIZE
        bx      lr
1:
        add     src, src, #8
        add     dst, dst, #8

L(tail63aligned):           /* Count in tmp2. */
        /* Copy up to 7 d-words of data. Similar to L(tail63unaligned), but
           we know that the src and dest are 64-bit aligned so we can use
           LDRD/STRD to improve efficiency. */
        /* TMP2 is now negative, but we don't care about that. The bottom
           six bits still tell us how many bytes are left to copy. */

        and     tmp1, tmp2, #0x38
        add     dst, dst, tmp1
        add     src, src, tmp1
        rsb     tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
        add     pc, pc, tmp1
        ldrd    A_l, A_h, [src, #-56]   /* 14 words to go. */
        strd    A_l, A_h, [dst, #-56]
        ldrd    A_l, A_h, [src, #-48]   /* 12 words to go. */
        strd    A_l, A_h, [dst, #-48]
        ldrd    A_l, A_h, [src, #-40]   /* 10 words to go. */
        strd    A_l, A_h, [dst, #-40]
        ldrd    A_l, A_h, [src, #-32]   /* 8 words to go. */
        strd    A_l, A_h, [dst, #-32]
        ldrd    A_l, A_h, [src, #-24]   /* 6 words to go. */
        strd    A_l, A_h, [dst, #-24]
        ldrd    A_l, A_h, [src, #-16]   /* 4 words to go. */
        strd    A_l, A_h, [dst, #-16]
        ldrd    A_l, A_h, [src, #-8]    /* 2 words to go. */
        strd    A_l, A_h, [dst, #-8]

#endif
        tst     tmp2, #4
        ldrne   tmp1, [src], #4
        strne   tmp1, [dst], #4
        lsls    tmp2, tmp2, #31     /* Count (tmp2) now dead. */
        ldrhcs  tmp1, [src], #2
        ldrbne  tmp2, [src]
        strhcs  tmp1, [dst], #2
        strbne  tmp2, [dst]

L(done):
        ldr     tmp2, [sp], #FRAME_SIZE
        bx      lr

L(cpy_body_long):           /* Count in tmp2. */

        /* Long copy. We know that there's at least (prefetch_lines * 64)
           bytes to go. */
#ifdef USE_VFP
        /* Don't use PLD. Instead, read some data in advance of the current
           copy position into a register. This should act like a PLD
           operation but we won't have to repeat the transfer. */
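        /* d3-d7 each hold the leading doubleword of one of the next
           five 64-byte lines, while d0-d2 carry the rest of the
           current line.  cpy_line_vfp drains one lead register per
           line and refills it from prefetch_lines * 64 bytes ahead,
           keeping the read stream a constant distance in front of the
           write stream throughout the loop. */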
        vldr    d3, [src, #0]
        vldr    d4, [src, #64]
        vldr    d5, [src, #128]
        vldr    d6, [src, #192]
        vldr    d7, [src, #256]

        vldr    d0, [src, #8]
        vldr    d1, [src, #16]
        vldr    d2, [src, #24]
        add     src, src, #32

        subs    tmp2, tmp2, #prefetch_lines * 64 * 2
        blo     2f
1:
        cpy_line_vfp    d3, 0
        cpy_line_vfp    d4, 64
        cpy_line_vfp    d5, 128
        add     dst, dst, #3 * 64
        add     src, src, #3 * 64
        cpy_line_vfp    d6, 0
        cpy_line_vfp    d7, 64
        add     dst, dst, #2 * 64
        add     src, src, #2 * 64
        subs    tmp2, tmp2, #prefetch_lines * 64
        bhs     1b

2:
        cpy_tail_vfp    d3, 0
        cpy_tail_vfp    d4, 64
        cpy_tail_vfp    d5, 128
        add     src, src, #3 * 64
        add     dst, dst, #3 * 64
        cpy_tail_vfp    d6, 0
        vstr    d7, [dst, #64]
        vldr    d7, [src, #64]
        vstr    d0, [dst, #64 + 8]
        vldr    d0, [src, #64 + 8]
        vstr    d1, [dst, #64 + 16]
        vldr    d1, [src, #64 + 16]
        vstr    d2, [dst, #64 + 24]
        vldr    d2, [src, #64 + 24]
        vstr    d7, [dst, #64 + 32]
        add     src, src, #96
        vstr    d0, [dst, #64 + 40]
        vstr    d1, [dst, #64 + 48]
        vstr    d2, [dst, #64 + 56]
        add     dst, dst, #128
        add     tmp2, tmp2, #prefetch_lines * 64
        b       L(cpy_body_medium)
#else
        /* Long copy. Use an SMS style loop to maximise the I/O
           bandwidth of the core. We don't have enough spare registers
           to synthesise prefetching, so use PLD operations. */
        /* Pre-bias src and dst. */
        sub     src, src, #8
        sub     dst, dst, #8
        pld     [src, #8]
        pld     [src, #72]
        subs    tmp2, tmp2, #64
        pld     [src, #136]
        ldrd    A_l, A_h, [src, #8]
        strd    B_l, B_h, [sp, #8]
        ldrd    B_l, B_h, [src, #16]
        strd    C_l, C_h, [sp, #16]
        ldrd    C_l, C_h, [src, #24]
        strd    D_l, D_h, [sp, #24]
        pld     [src, #200]
        ldrd    D_l, D_h, [src, #32]!
        b       1f
        .p2align 6
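        /* Software-pipelined steady state: each trip through 2:/1:
           stores the four register pairs loaded on the previous trip
           while loading the next four, so loads and stores are always
           in flight together.  B, C and D are callee-saved, hence the
           spills to [sp, #8..#24] in the FRAME_SIZE frame above and
           the reloads in the epilogue. */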
2:
        pld     [src, #232]
        strd    A_l, A_h, [dst, #40]
        ldrd    A_l, A_h, [src, #40]
        strd    B_l, B_h, [dst, #48]
        ldrd    B_l, B_h, [src, #48]
        strd    C_l, C_h, [dst, #56]
        ldrd    C_l, C_h, [src, #56]
        strd    D_l, D_h, [dst, #64]!
        ldrd    D_l, D_h, [src, #64]!
        subs    tmp2, tmp2, #64
1:
        strd    A_l, A_h, [dst, #8]
        ldrd    A_l, A_h, [src, #8]
        strd    B_l, B_h, [dst, #16]
        ldrd    B_l, B_h, [src, #16]
        strd    C_l, C_h, [dst, #24]
        ldrd    C_l, C_h, [src, #24]
        strd    D_l, D_h, [dst, #32]
        ldrd    D_l, D_h, [src, #32]
        bcs     2b
        /* Save the remaining bytes and restore the callee-saved regs. */
        strd    A_l, A_h, [dst, #40]
        add     src, src, #40
        strd    B_l, B_h, [dst, #48]
        ldrd    B_l, B_h, [sp, #8]
        strd    C_l, C_h, [dst, #56]
        ldrd    C_l, C_h, [sp, #16]
        strd    D_l, D_h, [dst, #64]
        ldrd    D_l, D_h, [sp, #24]
        add     dst, dst, #72
        tst     tmp2, #0x3f
        bne     L(tail63aligned)
        ldr     tmp2, [sp], #FRAME_SIZE
        bx      lr
#endif

L(cpy_notaligned):
        pld     [src]
        pld     [src, #64]
        /* There's at least 64 bytes to copy, but there is no mutual
           alignment. */
        /* Bring DST to 64-bit alignment. */
        lsls    tmp2, dst, #29
        pld     [src, #(2 * 64)]
        beq     1f
        rsbs    tmp2, tmp2, #0
        sub     count, count, tmp2, lsr #29
        ldrmi   tmp1, [src], #4
        strmi   tmp1, [dst], #4
        lsls    tmp2, tmp2, #2
        ldrbne  tmp1, [src], #1
        ldrhcs  tmp2, [src], #2
        strbne  tmp1, [dst], #1
        strhcs  tmp2, [dst], #2
1:
        pld     [src, #(3 * 64)]
        subs    count, count, #64
        ldrlo   tmp2, [sp], #FRAME_SIZE
        blo     L(tail63unaligned)
        pld     [src, #(4 * 64)]

#ifdef USE_NEON
        /* The NEON stores take an alignment hint on the destination:
           ALIGN() emits the :64 hint since DST is now 64-bit aligned,
           while the loads from the unaligned SRC omit it. */
        vld1.8  {d0-d3}, [src]!
        vld1.8  {d4-d7}, [src]!
        subs    count, count, #64
        blo     2f
1:
        pld     [src, #(4 * 64)]
        vst1.8  {d0-d3}, [ALIGN (dst, 64)]!
        vld1.8  {d0-d3}, [src]!
        vst1.8  {d4-d7}, [ALIGN (dst, 64)]!
        vld1.8  {d4-d7}, [src]!
        subs    count, count, #64
        bhs     1b
2:
        vst1.8  {d0-d3}, [ALIGN (dst, 64)]!
        vst1.8  {d4-d7}, [ALIGN (dst, 64)]!
        ands    count, count, #0x3f
#else
        /* Use an SMS style loop to maximise the I/O bandwidth. */
        sub     src, src, #4
        sub     dst, dst, #8
        subs    tmp2, count, #64    /* Use tmp2 for count. */
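        /* SRC is still misaligned while DST is now 8-byte aligned, so
           the loop below loads with plain LDR, relying on the
           unaligned-access support assumed in the header, and stores
           with STRD to the aligned destination.  SRC is pre-biased by
           4 rather than 8 so its odd word offsets feed the same
           #8..#64 store slots. */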
        ldr     A_l, [src, #4]
        ldr     A_h, [src, #8]
        strd    B_l, B_h, [sp, #8]
        ldr     B_l, [src, #12]
        ldr     B_h, [src, #16]
        strd    C_l, C_h, [sp, #16]
        ldr     C_l, [src, #20]
        ldr     C_h, [src, #24]
        strd    D_l, D_h, [sp, #24]
        ldr     D_l, [src, #28]
        ldr     D_h, [src, #32]!
        b       1f
        .p2align 6
2:
        pld     [src, #(5 * 64) - (32 - 4)]
        strd    A_l, A_h, [dst, #40]
        ldr     A_l, [src, #36]
        ldr     A_h, [src, #40]
        strd    B_l, B_h, [dst, #48]
        ldr     B_l, [src, #44]
        ldr     B_h, [src, #48]
        strd    C_l, C_h, [dst, #56]
        ldr     C_l, [src, #52]
        ldr     C_h, [src, #56]
        strd    D_l, D_h, [dst, #64]!
        ldr     D_l, [src, #60]
        ldr     D_h, [src, #64]!
        subs    tmp2, tmp2, #64
1:
        strd    A_l, A_h, [dst, #8]
        ldr     A_l, [src, #4]
        ldr     A_h, [src, #8]
        strd    B_l, B_h, [dst, #16]
        ldr     B_l, [src, #12]
        ldr     B_h, [src, #16]
        strd    C_l, C_h, [dst, #24]
        ldr     C_l, [src, #20]
        ldr     C_h, [src, #24]
        strd    D_l, D_h, [dst, #32]
        ldr     D_l, [src, #28]
        ldr     D_h, [src, #32]
        bcs     2b

        /* Save the remaining bytes and restore the callee-saved regs. */
        strd    A_l, A_h, [dst, #40]
        add     src, src, #36
        strd    B_l, B_h, [dst, #48]
        ldrd    B_l, B_h, [sp, #8]
        strd    C_l, C_h, [dst, #56]
        ldrd    C_l, C_h, [sp, #16]
        strd    D_l, D_h, [dst, #64]
        ldrd    D_l, D_h, [sp, #24]
        add     dst, dst, #72
        ands    count, tmp2, #0x3f
#endif
        ldr     tmp2, [sp], #FRAME_SIZE
        bne     L(tail63unaligned)
        bx      lr

END (__memcpy_arm)