/*
 * memcpy - copy memory area
 *
 * Copyright (c) 2013-2022, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

/*
   This memcpy routine is optimised for Cortex-A15 cores and takes advantage
   of VFP or NEON when built with the appropriate flags.

   Assumptions:

    ARMv6 (ARMv7-a if using Neon)
    ARM state
    Unaligned accesses

 */
19*412f47f9SXin Li
#include "asmdefs.h"

	.syntax unified
	/* This implementation requires ARM state.  */
	.arm

#ifdef __ARM_NEON__

	.fpu	neon
	.arch	armv7-a
/* NEON build: only tmp2 (r10) is spilled, so a 4-byte frame suffices.  */
# define FRAME_SIZE	4
# define USE_VFP
# define USE_NEON

#elif !defined (__SOFTFP__)

	.arch	armv6
	.fpu	vfpv2
/* VFP build: the frame also provides spill slots at sp+8/16/24 for the
   B/C/D register pairs used by the GP-register unaligned-copy loop.  */
# define FRAME_SIZE	32
# define USE_VFP

#else
	.arch	armv6
# define FRAME_SIZE    32

#endif

/* Old versions of GAS incorrectly implement the NEON align semantics.  */
#ifdef BROKEN_ASM_NEON_ALIGN
#define ALIGN(addr, align) addr,:align
#else
#define ALIGN(addr, align) addr:align
#endif

#define PC_OFFSET	8	/* PC pipeline compensation.  */
#define INSN_SIZE	4

/* Call parameters.  */
#define dstin	r0
#define src	r1
#define count	r2

/* Locals.  */
#define tmp1	r3
#define dst	ip
#define tmp2	r10	/* Spilled to / restored from the stack frame.  */

#ifndef USE_NEON
/* For bulk copies using GP registers.  */
#define	A_l	r2		/* Call-clobbered.  */
#define	A_h	r3		/* Call-clobbered.  */
#define	B_l	r4
#define	B_h	r5
#define	C_l	r6
#define	C_h	r7
#define	D_l	r8
#define	D_h	r9
#endif

/* Number of lines ahead to pre-fetch data.  If you change this the code
   below will need adjustment to compensate.  */

#define prefetch_lines	5
#ifdef USE_VFP
	/* Copy one 64-byte line from src+\base to dst+\base, cycling
	   through d0-d2 plus the caller-supplied \vreg so that each load
	   is issued right after the store that frees its register
	   (software pipelining).  The reload of \vreg fetches from
	   prefetch_lines*64 - 32 bytes ahead, so it doubles as a software
	   prefetch for a later line.  src/dst are not updated here; the
	   caller advances them.  */
	.macro	cpy_line_vfp vreg, base
	vstr	\vreg, [dst, #\base]
	vldr	\vreg, [src, #\base]
	vstr	d0, [dst, #\base + 8]
	vldr	d0, [src, #\base + 8]
	vstr	d1, [dst, #\base + 16]
	vldr	d1, [src, #\base + 16]
	vstr	d2, [dst, #\base + 24]
	vldr	d2, [src, #\base + 24]
	vstr	\vreg, [dst, #\base + 32]
	vldr	\vreg, [src, #\base + prefetch_lines * 64 - 32]
	vstr	d0, [dst, #\base + 40]
	vldr	d0, [src, #\base + 40]
	vstr	d1, [dst, #\base + 48]
	vldr	d1, [src, #\base + 48]
	vstr	d2, [dst, #\base + 56]
	vldr	d2, [src, #\base + 56]
	.endm

	/* As cpy_line_vfp, but for draining the pipeline at the end of
	   the long-copy loop: the far-ahead (prefetching) reload of
	   \vreg is omitted.  */
	.macro	cpy_tail_vfp vreg, base
	vstr	\vreg, [dst, #\base]
	vldr	\vreg, [src, #\base]
	vstr	d0, [dst, #\base + 8]
	vldr	d0, [src, #\base + 8]
	vstr	d1, [dst, #\base + 16]
	vldr	d1, [src, #\base + 16]
	vstr	d2, [dst, #\base + 24]
	vldr	d2, [src, #\base + 24]
	vstr	\vreg, [dst, #\base + 32]
	vstr	d0, [dst, #\base + 40]
	vldr	d0, [src, #\base + 40]
	vstr	d1, [dst, #\base + 48]
	vldr	d1, [src, #\base + 48]
	vstr	d2, [dst, #\base + 56]
	vldr	d2, [src, #\base + 56]
	.endm
#endif
122*412f47f9SXin Li
/* void *__memcpy_arm (void *dstin, const void *src, size_t count)

   Returns dstin (r0).  Relies on unaligned load/store support for the
   short/unaligned paths (see the assumptions in the file header).
   tmp2 (r10) is spilled to the stack in the bulk-copy paths and
   restored on every exit from them.  */
ENTRY (__memcpy_arm)

	mov	dst, dstin	/* Preserve dstin, we need to return it.  */
	cmp	count, #64
	bhs	L(cpy_not_short)
	/* Deal with small copies quickly by dropping straight into the
	   exit block.  */

L(tail63unaligned):
#ifdef USE_NEON
	/* Computed jump into the dword-copy ladder below so exactly
	   (count & 0x38) / 8 pairs execute.  In ARM state, reading PC
	   yields the address of the current instruction + 8
	   (PC_OFFSET); the INSN_SIZE term biases for the add itself.  */
	and	tmp1, count, #0x38
	rsb	tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
	add	pc, pc, tmp1
	vld1.8	{d0}, [src]!	/* 14 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 12 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 10 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 8 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 6 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 4 words to go.  */
	vst1.8	{d0}, [dst]!
	vld1.8	{d0}, [src]!	/* 2 words to go.  */
	vst1.8	{d0}, [dst]!

	tst	count, #4
	ldrne	tmp1, [src], #4
	strne	tmp1, [dst], #4
#else
	/* Copy up to 15 full words of data.  May not be aligned.  */
	/* Cannot use VFP for unaligned data.  */
	/* Each ladder entry is two 4-byte instructions, hence the /2
	   bias here and the lsl #1 on the jump below.  */
	and	tmp1, count, #0x3c
	add	dst, dst, tmp1
	add	src, src, tmp1
	rsb	tmp1, tmp1, #(60 - PC_OFFSET/2 + INSN_SIZE/2)
	/* Jump directly into the sequence below at the correct offset.  */
	add	pc, pc, tmp1, lsl #1

	ldr	tmp1, [src, #-60]	/* 15 words to go.  */
	str	tmp1, [dst, #-60]

	ldr	tmp1, [src, #-56]	/* 14 words to go.  */
	str	tmp1, [dst, #-56]
	ldr	tmp1, [src, #-52]
	str	tmp1, [dst, #-52]

	ldr	tmp1, [src, #-48]	/* 12 words to go.  */
	str	tmp1, [dst, #-48]
	ldr	tmp1, [src, #-44]
	str	tmp1, [dst, #-44]

	ldr	tmp1, [src, #-40]	/* 10 words to go.  */
	str	tmp1, [dst, #-40]
	ldr	tmp1, [src, #-36]
	str	tmp1, [dst, #-36]

	ldr	tmp1, [src, #-32]	/* 8 words to go.  */
	str	tmp1, [dst, #-32]
	ldr	tmp1, [src, #-28]
	str	tmp1, [dst, #-28]

	ldr	tmp1, [src, #-24]	/* 6 words to go.  */
	str	tmp1, [dst, #-24]
	ldr	tmp1, [src, #-20]
	str	tmp1, [dst, #-20]

	ldr	tmp1, [src, #-16]	/* 4 words to go.  */
	str	tmp1, [dst, #-16]
	ldr	tmp1, [src, #-12]
	str	tmp1, [dst, #-12]

	ldr	tmp1, [src, #-8]	/* 2 words to go.  */
	str	tmp1, [dst, #-8]
	ldr	tmp1, [src, #-4]
	str	tmp1, [dst, #-4]
#endif

	/* Shift bits 1:0 of count into C and N: copy a trailing
	   halfword if C set, a trailing byte if N set.  */
	lsls	count, count, #31
	ldrhcs	tmp1, [src], #2
	ldrbne	src, [src]		/* Src is dead, use as a scratch.  */
	strhcs	tmp1, [dst], #2
	strbne	src, [dst]
	bx	lr

L(cpy_not_short):
	/* At least 64 bytes to copy, but don't know the alignment yet.  */
	str	tmp2, [sp, #-FRAME_SIZE]!	/* Spill tmp2 and open the frame.  */
	and	tmp2, src, #7
	and	tmp1, dst, #7
	cmp	tmp1, tmp2
	bne	L(cpy_notaligned)

#ifdef USE_VFP
	/* Magic dust alert!  Force VFP on Cortex-A9.  Experiments show
	   that the FP pipeline is much better at streaming loads and
	   stores.  This is outside the critical loop.  */
	vmov.f32	s0, s0
#endif

	/* SRC and DST have the same mutual 64-bit alignment, but we may
	   still need to pre-copy some bytes to get to natural alignment.
	   We bring SRC and DST into full 64-bit alignment.  */
	lsls	tmp2, dst, #29		/* Low 3 bits of dst into bits 31:29.  */
	beq	1f			/* Already 8-byte aligned.  */
	rsbs	tmp2, tmp2, #0		/* Gap to alignment, in bits 31:29.  */
	sub	count, count, tmp2, lsr #29
	ldrmi	tmp1, [src], #4		/* MI: gap has bit 2 -> copy a word.  */
	strmi	tmp1, [dst], #4
	lsls	tmp2, tmp2, #2		/* Next gap bits into C (2) and N (1).  */
	ldrhcs	tmp1, [src], #2
	ldrbne	tmp2, [src], #1
	strhcs	tmp1, [dst], #2
	strbne	tmp2, [dst], #1

1:
	subs	tmp2, count, #64	/* Use tmp2 for count.  */
	blo	L(tail63aligned)

	cmp	tmp2, #512
	bhs	L(cpy_body_long)

L(cpy_body_medium):			/* Count in tmp2.  */
#ifdef USE_VFP
	/* 64 bytes per iteration, double-buffered through d0/d1 so each
	   store's data was loaded on the previous step.  */
1:
	vldr	d0, [src, #0]
	subs	tmp2, tmp2, #64
	vldr	d1, [src, #8]
	vstr	d0, [dst, #0]
	vldr	d0, [src, #16]
	vstr	d1, [dst, #8]
	vldr	d1, [src, #24]
	vstr	d0, [dst, #16]
	vldr	d0, [src, #32]
	vstr	d1, [dst, #24]
	vldr	d1, [src, #40]
	vstr	d0, [dst, #32]
	vldr	d0, [src, #48]
	vstr	d1, [dst, #40]
	vldr	d1, [src, #56]
	vstr	d0, [dst, #48]
	add	src, src, #64
	vstr	d1, [dst, #56]
	add	dst, dst, #64
	bhs	1b
	tst	tmp2, #0x3f
	beq	L(done)

L(tail63aligned):			/* Count in tmp2.  */
	/* Same computed-jump trick as in tail63unaligned; each ladder
	   entry is one vldr/vstr pair of 4-byte instructions.  */
	and	tmp1, tmp2, #0x38
	add	dst, dst, tmp1
	add	src, src, tmp1
	rsb	tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
	add	pc, pc, tmp1

	vldr	d0, [src, #-56]	/* 14 words to go.  */
	vstr	d0, [dst, #-56]
	vldr	d0, [src, #-48]	/* 12 words to go.  */
	vstr	d0, [dst, #-48]
	vldr	d0, [src, #-40]	/* 10 words to go.  */
	vstr	d0, [dst, #-40]
	vldr	d0, [src, #-32]	/* 8 words to go.  */
	vstr	d0, [dst, #-32]
	vldr	d0, [src, #-24]	/* 6 words to go.  */
	vstr	d0, [dst, #-24]
	vldr	d0, [src, #-16]	/* 4 words to go.  */
	vstr	d0, [dst, #-16]
	vldr	d0, [src, #-8]	/* 2 words to go.  */
	vstr	d0, [dst, #-8]
#else
	/* Pre-bias src/dst so the loop can use pre-indexed writeback.  */
	sub	src, src, #8
	sub	dst, dst, #8
1:
	ldrd	A_l, A_h, [src, #8]
	strd	A_l, A_h, [dst, #8]
	ldrd	A_l, A_h, [src, #16]
	strd	A_l, A_h, [dst, #16]
	ldrd	A_l, A_h, [src, #24]
	strd	A_l, A_h, [dst, #24]
	ldrd	A_l, A_h, [src, #32]
	strd	A_l, A_h, [dst, #32]
	ldrd	A_l, A_h, [src, #40]
	strd	A_l, A_h, [dst, #40]
	ldrd	A_l, A_h, [src, #48]
	strd	A_l, A_h, [dst, #48]
	ldrd	A_l, A_h, [src, #56]
	strd	A_l, A_h, [dst, #56]
	ldrd	A_l, A_h, [src, #64]!
	strd	A_l, A_h, [dst, #64]!
	subs	tmp2, tmp2, #64
	bhs	1b
	tst	tmp2, #0x3f
	bne	1f
	/* Nothing left: restore tmp2, pop the frame and return.  */
	ldr	tmp2,[sp], #FRAME_SIZE
	bx	lr
1:
	/* Undo the pre-bias before falling into the tail.  */
	add	src, src, #8
	add	dst, dst, #8

L(tail63aligned):			/* Count in tmp2.  */
	/* Copy up to 7 d-words of data.  Similar to Ltail63unaligned, but
	   we know that the src and dest are 64-bit aligned so we can use
	   LDRD/STRD to improve efficiency.  */
	/* TMP2 is now negative, but we don't care about that.  The bottom
	   six bits still tell us how many bytes are left to copy.  */

	and	tmp1, tmp2, #0x38
	add	dst, dst, tmp1
	add	src, src, tmp1
	rsb	tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
	add	pc, pc, tmp1
	ldrd	A_l, A_h, [src, #-56]	/* 14 words to go.  */
	strd	A_l, A_h, [dst, #-56]
	ldrd	A_l, A_h, [src, #-48]	/* 12 words to go.  */
	strd	A_l, A_h, [dst, #-48]
	ldrd	A_l, A_h, [src, #-40]	/* 10 words to go.  */
	strd	A_l, A_h, [dst, #-40]
	ldrd	A_l, A_h, [src, #-32]	/* 8 words to go.  */
	strd	A_l, A_h, [dst, #-32]
	ldrd	A_l, A_h, [src, #-24]	/* 6 words to go.  */
	strd	A_l, A_h, [dst, #-24]
	ldrd	A_l, A_h, [src, #-16]	/* 4 words to go.  */
	strd	A_l, A_h, [dst, #-16]
	ldrd	A_l, A_h, [src, #-8]	/* 2 words to go.  */
	strd	A_l, A_h, [dst, #-8]

#endif
	tst	tmp2, #4
	ldrne	tmp1, [src], #4
	strne	tmp1, [dst], #4
	/* Residual bits 1:0 into C (halfword) and N (byte).  */
	lsls	tmp2, tmp2, #31		/* Count (tmp2) now dead. */
	ldrhcs	tmp1, [src], #2
	ldrbne	tmp2, [src]
	strhcs	tmp1, [dst], #2
	strbne	tmp2, [dst]

L(done):
	ldr	tmp2, [sp], #FRAME_SIZE	/* Restore tmp2 and pop the frame.  */
	bx	lr

L(cpy_body_long):			/* Count in tmp2.  */

	/* Long copy.  We know that there's at least (prefetch_lines * 64)
	   bytes to go.  */
#ifdef USE_VFP
	/* Don't use PLD.  Instead, read some data in advance of the current
	   copy position into a register.  This should act like a PLD
	   operation but we won't have to repeat the transfer.  */

	/* d3-d7 each hold the first dword of one of the next five
	   64-byte lines; cpy_line_vfp refills them from further ahead.  */
	vldr	d3, [src, #0]
	vldr	d4, [src, #64]
	vldr	d5, [src, #128]
	vldr	d6, [src, #192]
	vldr	d7, [src, #256]

	vldr	d0, [src, #8]
	vldr	d1, [src, #16]
	vldr	d2, [src, #24]
	add	src, src, #32

	/* Reserve two rounds of prefetch distance so the drain code
	   below never reads past the requested range.  */
	subs	tmp2, tmp2, #prefetch_lines * 64 * 2
	blo	2f
1:
	cpy_line_vfp	d3, 0
	cpy_line_vfp	d4, 64
	cpy_line_vfp	d5, 128
	add	dst, dst, #3 * 64
	add	src, src, #3 * 64
	cpy_line_vfp	d6, 0
	cpy_line_vfp	d7, 64
	add	dst, dst, #2 * 64
	add	src, src, #2 * 64
	subs	tmp2, tmp2, #prefetch_lines * 64
	bhs	1b

2:
	/* Drain: write out the five lines already buffered in d3-d7
	   without prefetching any further ahead.  */
	cpy_tail_vfp	d3, 0
	cpy_tail_vfp	d4, 64
	cpy_tail_vfp	d5, 128
	add	src, src, #3 * 64
	add	dst, dst, #3 * 64
	cpy_tail_vfp	d6, 0
	vstr	d7, [dst, #64]
	vldr	d7, [src, #64]
	vstr	d0, [dst, #64 + 8]
	vldr	d0, [src, #64 + 8]
	vstr	d1, [dst, #64 + 16]
	vldr	d1, [src, #64 + 16]
	vstr	d2, [dst, #64 + 24]
	vldr	d2, [src, #64 + 24]
	vstr	d7, [dst, #64 + 32]
	add	src, src, #96
	vstr	d0, [dst, #64 + 40]
	vstr	d1, [dst, #64 + 48]
	vstr	d2, [dst, #64 + 56]
	add	dst, dst, #128
	/* Give back one round of prefetch distance; the medium loop
	   finishes whatever remains.  */
	add	tmp2, tmp2, #prefetch_lines * 64
	b	L(cpy_body_medium)
#else
	/* Long copy.  Use an SMS style loop to maximize the I/O
	   bandwidth of the core.  We don't have enough spare registers
	   to synthesise prefetching, so use PLD operations.  */
	/* Pre-bias src and dst.  */
	sub	src, src, #8
	sub	dst, dst, #8
	pld	[src, #8]
	pld	[src, #72]
	subs	tmp2, tmp2, #64
	pld	[src, #136]
	/* B/C/D (r4-r9) are live across the loop; park their old values
	   in the frame slots opened at cpy_not_short.  */
	ldrd	A_l, A_h, [src, #8]
	strd	B_l, B_h, [sp, #8]
	ldrd	B_l, B_h, [src, #16]
	strd	C_l, C_h, [sp, #16]
	ldrd	C_l, C_h, [src, #24]
	strd	D_l, D_h, [sp, #24]
	pld	[src, #200]
	ldrd	D_l, D_h, [src, #32]!
	b	1f
	.p2align	6
2:
	pld	[src, #232]
	strd	A_l, A_h, [dst, #40]
	ldrd	A_l, A_h, [src, #40]
	strd	B_l, B_h, [dst, #48]
	ldrd	B_l, B_h, [src, #48]
	strd	C_l, C_h, [dst, #56]
	ldrd	C_l, C_h, [src, #56]
	strd	D_l, D_h, [dst, #64]!
	ldrd	D_l, D_h, [src, #64]!
	subs	tmp2, tmp2, #64
1:
	strd	A_l, A_h, [dst, #8]
	ldrd	A_l, A_h, [src, #8]
	strd	B_l, B_h, [dst, #16]
	ldrd	B_l, B_h, [src, #16]
	strd	C_l, C_h, [dst, #24]
	ldrd	C_l, C_h, [src, #24]
	strd	D_l, D_h, [dst, #32]
	ldrd	D_l, D_h, [src, #32]
	bcs	2b		/* C from the subs above: >= 64 left.  */
	/* Save the remaining bytes and restore the callee-saved regs.  */
	strd	A_l, A_h, [dst, #40]
	add	src, src, #40
	strd	B_l, B_h, [dst, #48]
	ldrd	B_l, B_h, [sp, #8]
	strd	C_l, C_h, [dst, #56]
	ldrd	C_l, C_h, [sp, #16]
	strd	D_l, D_h, [dst, #64]
	ldrd	D_l, D_h, [sp, #24]
	add	dst, dst, #72
	tst	tmp2, #0x3f
	bne	L(tail63aligned)
	ldr	tmp2, [sp], #FRAME_SIZE
	bx	lr
#endif

L(cpy_notaligned):
	pld	[src]
	pld	[src, #64]
	/* There's at least 64 bytes to copy, but there is no mutual
	   alignment.  */
	/* Bring DST to 64-bit alignment.  */
	lsls	tmp2, dst, #29		/* Same flag trick as the aligned path.  */
	pld	[src, #(2 * 64)]
	beq	1f
	rsbs	tmp2, tmp2, #0
	sub	count, count, tmp2, lsr #29
	ldrmi	tmp1, [src], #4
	strmi	tmp1, [dst], #4
	lsls	tmp2, tmp2, #2
	ldrbne	tmp1, [src], #1
	ldrhcs	tmp2, [src], #2
	strbne	tmp1, [dst], #1
	strhcs	tmp2, [dst], #2
1:
	pld	[src, #(3 * 64)]
	subs	count, count, #64
	ldrlo	tmp2, [sp], #FRAME_SIZE	/* < 64 left: pop frame before tail.  */
	blo	L(tail63unaligned)
	pld	[src, #(4 * 64)]

#ifdef USE_NEON
	/* DST is now 64-bit aligned (hence the :64 alignment hints via
	   ALIGN on the stores); SRC may still be unaligned.  */
	vld1.8	{d0-d3}, [src]!
	vld1.8	{d4-d7}, [src]!
	subs	count, count, #64
	blo	2f
1:
	pld	[src, #(4 * 64)]
	vst1.8	{d0-d3}, [ALIGN (dst, 64)]!
	vld1.8	{d0-d3}, [src]!
	vst1.8	{d4-d7}, [ALIGN (dst, 64)]!
	vld1.8	{d4-d7}, [src]!
	subs	count, count, #64
	bhs	1b
2:
	vst1.8	{d0-d3}, [ALIGN (dst, 64)]!
	vst1.8	{d4-d7}, [ALIGN (dst, 64)]!
	ands	count, count, #0x3f
#else
	/* Use an SMS style loop to maximize the I/O bandwidth.  */
	/* SRC may be unaligned here, so use LDR word loads (unaligned
	   access assumed per the file header) and STRD to the now
	   8-byte-aligned DST.  */
	sub	src, src, #4
	sub	dst, dst, #8
	subs	tmp2, count, #64	/* Use tmp2 for count.  */
	ldr	A_l, [src, #4]
	ldr	A_h, [src, #8]
	strd	B_l, B_h, [sp, #8]
	ldr	B_l, [src, #12]
	ldr	B_h, [src, #16]
	strd	C_l, C_h, [sp, #16]
	ldr	C_l, [src, #20]
	ldr	C_h, [src, #24]
	strd	D_l, D_h, [sp, #24]
	ldr	D_l, [src, #28]
	ldr	D_h, [src, #32]!
	b	1f
	.p2align	6
2:
	pld	[src, #(5 * 64) - (32 - 4)]
	strd	A_l, A_h, [dst, #40]
	ldr	A_l, [src, #36]
	ldr	A_h, [src, #40]
	strd	B_l, B_h, [dst, #48]
	ldr	B_l, [src, #44]
	ldr	B_h, [src, #48]
	strd	C_l, C_h, [dst, #56]
	ldr	C_l, [src, #52]
	ldr	C_h, [src, #56]
	strd	D_l, D_h, [dst, #64]!
	ldr	D_l, [src, #60]
	ldr	D_h, [src, #64]!
	subs	tmp2, tmp2, #64
1:
	strd	A_l, A_h, [dst, #8]
	ldr	A_l, [src, #4]
	ldr	A_h, [src, #8]
	strd	B_l, B_h, [dst, #16]
	ldr	B_l, [src, #12]
	ldr	B_h, [src, #16]
	strd	C_l, C_h, [dst, #24]
	ldr	C_l, [src, #20]
	ldr	C_h, [src, #24]
	strd	D_l, D_h, [dst, #32]
	ldr	D_l, [src, #28]
	ldr	D_h, [src, #32]
	bcs	2b

	/* Save the remaining bytes and restore the callee-saved regs.  */
	strd	A_l, A_h, [dst, #40]
	add	src, src, #36
	strd	B_l, B_h, [dst, #48]
	ldrd	B_l, B_h, [sp, #8]
	strd	C_l, C_h, [dst, #56]
	ldrd	C_l, C_h, [sp, #16]
	strd	D_l, D_h, [dst, #64]
	ldrd	D_l, D_h, [sp, #24]
	add	dst, dst, #72
	ands	count, tmp2, #0x3f
#endif
	/* LDR does not touch flags, so Z from the ands above still
	   drives the bne.  */
	ldr	tmp2, [sp], #FRAME_SIZE
	bne	L(tail63unaligned)
	bx	lr

END (__memcpy_arm)
588