/* SPDX-License-Identifier: GPL-2.0-only */

/* This code originates from Linux 5.19 */

/*
 * Implement memmove(). This can handle overlap between src and dst.
 *
 * Input:
 * rdi: dest
 * rsi: src
 * rdx: count
 *
 * Output:
 * rax: dest
 */
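/*
 * For reference, this is the standard C prototype the routine implements;
 * with the System V AMD64 calling convention, dest, src and n arrive in
 * rdi, rsi and rdx, and the return value goes in rax:
 *
 *   void *memmove(void *dest, const void *src, size_t n);
 */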
.global memmove
memmove:

	mov %rdi, %rax

	/* Decide forward/backward copy mode */
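	/*
	 * A forward copy is safe when src >= dest, or when the buffers do
	 * not overlap (src + count <= dest); only when dest lies strictly
	 * inside [src, src + count) do we take the backward path at 2:.
	 */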
	cmp %rdi, %rsi
	jge .Lmemmove_begin_forward
	mov %rsi, %r8
	add %rdx, %r8
	cmp %rdi, %r8
	jg 2f

	/* Don't optimize for FSRM and ERMS like Linux */
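	/*
	 * FSRM (Fast Short REP MOVSB) and ERMS (Enhanced REP MOVSB/STOSB)
	 * are CPU features that Linux detects at runtime to favour
	 * rep-string copies; this port always takes the generic paths.
	 */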
.Lmemmove_begin_forward:
	cmp $0x20, %rdx
	jb 1f

	/*
	 * The movsq instruction has a high startup latency,
	 * so we handle small sizes with general-purpose registers.
	 */
	cmp  $680, %rdx
	jb	3f
	/*
	 * The movsq instruction is only good for the aligned case.
	 */

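	/*
	 * Comparing the low bytes of dest and src checks whether the two
	 * buffers share the same alignment; only then is the rep movsq
	 * path at 4: taken.
	 */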
	cmpb %dil, %sil
	je 4f
3:
	sub $0x20, %rdx
	/*
	 * We gobble 32 bytes forward in each loop.
	 */
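	/*
	 * rdx was already biased down by 0x20 at 3:, so the jae below keeps
	 * looping while at least 32 more bytes remain; the addq after the
	 * loop restores the true leftover count (< 32 bytes), which is
	 * finished at 1:.
	 */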
5:
	sub $0x20, %rdx
	movq 0*8(%rsi), %r11
	movq 1*8(%rsi), %r10
	movq 2*8(%rsi), %r9
	movq 3*8(%rsi), %r8
	leaq 4*8(%rsi), %rsi

	movq %r11, 0*8(%rdi)
	movq %r10, 1*8(%rdi)
	movq %r9, 2*8(%rdi)
	movq %r8, 3*8(%rdi)
	leaq 4*8(%rdi), %rdi
	jae 5b
	addq $0x20, %rdx
	jmp 1f
	/*
	 * Handle data forward by movsq.
	 */
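	/*
	 * rcx receives the qword count (rdx >> 3). The last qword of the
	 * source is loaded into r11 before rep movsq (the copy could
	 * clobber it when the buffers overlap) and stored through r10
	 * afterwards, so the trailing count % 8 bytes are covered by an
	 * 8-byte store that may overlap bytes already copied.
	 */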
	.p2align 4
4:
	movq %rdx, %rcx
	movq -8(%rsi, %rdx), %r11
	lea -8(%rdi, %rdx), %r10
	shrq $3, %rcx
	rep movsq
	movq %r11, (%r10)
	jmp 13f
.Lmemmove_end_forward:

	/*
	 * Handle data backward by movsq.
	 */
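	/*
	 * Mirror of the forward movsq path: the first qword of the source
	 * is saved in r11, rsi/rdi are pointed at the last qword of each
	 * buffer, and DF is set so rep movsq walks downward; the saved
	 * head qword is written last to cover the leading count % 8 bytes.
	 */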
	.p2align 4
7:
	movq %rdx, %rcx
	movq (%rsi), %r11
	movq %rdi, %r10
	leaq -8(%rsi, %rdx), %rsi
	leaq -8(%rdi, %rdx), %rdi
	shrq $3, %rcx
	std
	rep movsq
	cld
	movq %r11, (%r10)
	jmp 13f

	/*
	 * Start to prepare for backward copy.
	 */
	.p2align 4
2:
	cmp $0x20, %rdx
	jb 1f
	cmp $680, %rdx
	jb 6f
	cmp %dil, %sil
	je 7b
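	/*
	 * Same size and alignment heuristics as the forward path: copies
	 * below 32 bytes go straight to the tail handling at 1:, large
	 * same-alignment copies use the backward movsq at 7:, and
	 * everything else uses the 32-byte backward loop at 6:.
	 */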
6:
	/*
	 * Calculate copy position to tail.
	 */
	addq %rdx, %rsi
	addq %rdx, %rdi
	subq $0x20, %rdx
	/*
	 * We gobble 32 bytes backward in each loop.
	 */
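	/*
	 * Same counting trick as the forward loop at 5:, with rdx biased
	 * down by 0x20 above; jae loops while at least 32 bytes remain and
	 * the addq after the loop restores the leftover count.
	 */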
8:
	subq $0x20, %rdx
	movq -1*8(%rsi), %r11
	movq -2*8(%rsi), %r10
	movq -3*8(%rsi), %r9
	movq -4*8(%rsi), %r8
	leaq -4*8(%rsi), %rsi

	movq %r11, -1*8(%rdi)
	movq %r10, -2*8(%rdi)
	movq %r9, -3*8(%rdi)
	movq %r8, -4*8(%rdi)
	leaq -4*8(%rdi), %rdi
	jae 8b
	/*
	 * Calculate copy position to head.
	 */
	addq $0x20, %rdx
	subq %rdx, %rsi
	subq %rdx, %rdi
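	/*
	 * Tail handling for the final 0 to 31 bytes, also the entry point
	 * for copies shorter than 32 bytes. Each case loads both ends of
	 * the remaining span into registers before storing either end, so
	 * the stores may overlap each other and the source without losing
	 * data.
	 */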
1:
	cmpq $16, %rdx
	jb 9f
	/*
	 * Move data from 16 bytes to 31 bytes.
	 */
	movq 0*8(%rsi), %r11
	movq 1*8(%rsi), %r10
	movq -2*8(%rsi, %rdx), %r9
	movq -1*8(%rsi, %rdx), %r8
	movq %r11, 0*8(%rdi)
	movq %r10, 1*8(%rdi)
	movq %r9, -2*8(%rdi, %rdx)
	movq %r8, -1*8(%rdi, %rdx)
	jmp 13f
	.p2align 4
9:
	cmpq $8, %rdx
	jb 10f
	/*
	 * Move data from 8 bytes to 15 bytes.
	 */
	movq 0*8(%rsi), %r11
	movq -1*8(%rsi, %rdx), %r10
	movq %r11, 0*8(%rdi)
	movq %r10, -1*8(%rdi, %rdx)
	jmp 13f
10:
	cmpq $4, %rdx
	jb 11f
	/*
	 * Move data from 4 bytes to 7 bytes.
	 */
	movl (%rsi), %r11d
	movl -4(%rsi, %rdx), %r10d
	movl %r11d, (%rdi)
	movl %r10d, -4(%rdi, %rdx)
	jmp 13f
11:
	cmp $2, %rdx
	jb 12f
	/*
	 * Move data from 2 bytes to 3 bytes.
	 */
	movw (%rsi), %r11w
	movw -2(%rsi, %rdx), %r10w
	movw %r11w, (%rdi)
	movw %r10w, -2(%rdi, %rdx)
	jmp 13f
12:
	cmp $1, %rdx
	jb 13f
	/*
	 * Move data for 1 byte.
	 */
	movb (%rsi), %r11b
	movb %r11b, (%rdi)
13:
	RET