/*
Copyright (c) 2014, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#define FOR_SILVERMONT

#ifndef MEMMOVE
# define MEMMOVE	memmove
#endif

#ifndef L
# define L(label)	.L##label
#endif

#ifndef cfi_startproc
# define cfi_startproc	.cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc	.cfi_endproc
#endif

#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off)	.cfi_rel_offset reg, off
#endif

#ifndef cfi_restore
# define cfi_restore(reg)	.cfi_restore reg
#endif

#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off)	.cfi_adjust_cfa_offset off
#endif

#ifndef ENTRY
# define ENTRY(name)		\
	.type name,  @function;		\
	.globl name;		\
	.p2align 4;		\
name:		\
	cfi_startproc
#endif

#ifndef END
# define END(name)		\
	cfi_endproc;		\
	.size name, .-name
#endif

#define DEST		PARMS
#define SRC		DEST+4
#define LEN		SRC+4

#define CFI_PUSH(REG)		\
  cfi_adjust_cfa_offset (4);		\
  cfi_rel_offset (REG, 0)

#define CFI_POP(REG)		\
  cfi_adjust_cfa_offset (-4);		\
  cfi_restore (REG)

#define PUSH(REG)	pushl REG; CFI_PUSH (REG)
#define POP(REG)	popl REG; CFI_POP (REG)

#define PARMS		8		/* Preserve EBX.  */
#define ENTRANCE	PUSH (%ebx);
#define RETURN_END	POP (%ebx); ret
#define RETURN		RETURN_END; CFI_PUSH (%ebx)

#define SETUP_PIC_REG(x)	call	__x86.get_pc_thunk.x

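/* The __x86.get_pc_thunk.x helper called by SETUP_PIC_REG leaves the address
	of the following instruction in the named register; adding
	$_GLOBAL_OFFSET_TABLE_ to that then yields the GOT base, which the
	position-independent code below uses to read
	__x86_shared_cache_size_half.  */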
	.section .text.sse2,"ax",@progbits
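/* Overview of the routine. Roughly, in C (a simplified sketch, not the
	literal control flow; the helper names here are illustrative only):

	void *memmove(void *dst, const void *src, size_t n)
	{
		if (dst == src)
			return dst;
		if (dst < src) {
			// Forward copy never reads a byte this call has
			// already overwritten, so overlap is safe.
			if (n <= 128)
				small_copy_forward(dst, src, n);
			else
				aligned_block_copy_forward(dst, src, n);
		} else {
			// dst > src: mirror image, copying backward.
			if (n <= 128)
				small_copy_backward(dst, src, n);
			else
				aligned_block_copy_backward(dst, src, n);
		}
		return dst;
	}

	Block copies of at least half the shared cache size additionally use
	non-temporal stores; see the large-page loops at the end of the
	file.  */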
ENTRY (MEMMOVE)
	ENTRANCE
	movl	LEN(%esp), %ecx
	movl	SRC(%esp), %eax
	movl	DEST(%esp), %edx

/* Check whether we should copy backward or forward.  */
	cmp	%eax, %edx
	je	L(mm_return)
	jg	L(mm_len_0_or_more_backward)
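/* dst < src: a forward copy is safe even when the buffers overlap, because
	every load below happens before the store that could clobber it.  */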

/* Now check the length. Lengths in [0..16], (16..32], (32..64] and
	(64..128] bytes are handled separately.  */
	cmp	$16, %ecx
	jbe	L(mm_len_0_16_bytes_forward)

	cmpl	$32, %ecx
	ja	L(mm_len_32_or_more_forward)

/* Copy [0..32] and return.  */
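/* Two possibly overlapping 16-byte chunks, anchored at either end of the
	buffer, cover any length in (16..32]; all loads are issued before the
	stores, so overlap between the chunks is harmless. The larger tiers
	below use the same trick with more registers.  */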
	movdqu	(%eax), %xmm0
	movdqu	-16(%eax, %ecx), %xmm1
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, -16(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_32_or_more_forward):
	cmpl	$64, %ecx
	ja	L(mm_len_64_or_more_forward)

/* Copy [0..64] and return.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	-16(%eax, %ecx), %xmm2
	movdqu	-32(%eax, %ecx), %xmm3
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, -16(%edx, %ecx)
	movdqu	%xmm3, -32(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_64_or_more_forward):
	cmpl	$128, %ecx
	ja	L(mm_len_128_or_more_forward)

/* Copy [0..128] and return.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	32(%eax), %xmm2
	movdqu	48(%eax), %xmm3
	movdqu	-64(%eax, %ecx), %xmm4
	movdqu	-48(%eax, %ecx), %xmm5
	movdqu	-32(%eax, %ecx), %xmm6
	movdqu	-16(%eax, %ecx), %xmm7
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, 32(%edx)
	movdqu	%xmm3, 48(%edx)
	movdqu	%xmm4, -64(%edx, %ecx)
	movdqu	%xmm5, -48(%edx, %ecx)
	movdqu	%xmm6, -32(%edx, %ecx)
	movdqu	%xmm7, -16(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_128_or_more_forward):
	PUSH (%esi)
	PUSH (%edi)

/* Align the destination address. The first 64 bytes are copied with
	unaligned stores; the loop below then stores to 64-byte-aligned
	addresses only.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	32(%eax), %xmm2
	movdqu	48(%eax), %xmm3

	leal	64(%edx), %edi
	andl	$-64, %edi
	subl	%edx, %eax
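/* %edi is now the first 64-byte-aligned address above %edx, and %eax holds
	(src - dst), so (%eax, %edi) addresses the source byte that
	corresponds to destination position %edi from here on.  */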

	movdqu	(%eax, %edi), %xmm4
	movdqu	16(%eax, %edi), %xmm5
	movdqu	32(%eax, %edi), %xmm6
	movdqu	48(%eax, %edi), %xmm7

	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, 32(%edx)
	movdqu	%xmm3, 48(%edx)
	movdqa	%xmm4, (%edi)
	movaps	%xmm5, 16(%edi)
	movaps	%xmm6, 32(%edi)
	movaps	%xmm7, 48(%edi)
	addl	$64, %edi

	leal	(%edx, %ecx), %ebx
	andl	$-64, %ebx
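/* %ebx is now the 64-byte-aligned end of the destination; the block-copy
	loops below run until %edi reaches it.  */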
	cmp	%edi, %ebx
	jbe	L(mm_copy_remaining_forward)

	PUSH(%ebx)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
	/* Restore %ebx: the pop can sit before the jump because it does not
	   affect the flags.  */
	POP(%ebx)

	jae	L(mm_large_page_loop_forward)
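/* Lengths of at least half the shared cache size take the non-temporal loop
	at the end of the file to avoid polluting the cache; everything else
	falls through to the ordinary cached loop below.  */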

	.p2align 4
L(mm_main_loop_forward):

	prefetcht0 128(%eax, %edi)

	movdqu	(%eax, %edi), %xmm0
	movdqu	16(%eax, %edi), %xmm1
	movdqu	32(%eax, %edi), %xmm2
	movdqu	48(%eax, %edi), %xmm3
	movdqa	%xmm0, (%edi)
	movaps	%xmm1, 16(%edi)
	movaps	%xmm2, 32(%edi)
	movaps	%xmm3, 48(%edi)
	leal	64(%edi), %edi
	cmp	%edi, %ebx
	ja	L(mm_main_loop_forward)

L(mm_copy_remaining_forward):
	addl	%edx, %ecx
	subl	%edi, %ecx
/* Everything up to position %edi in dst has been copied; %ecx now holds the
	number of bytes still to copy. Point %esi at the matching position
	in src.  */
	leal	(%edi, %eax), %esi

L(mm_remaining_0_64_bytes_forward):
	cmp	$32, %ecx
	ja	L(mm_remaining_33_64_bytes_forward)
	cmp	$16, %ecx
	ja	L(mm_remaining_17_32_bytes_forward)
	testl	%ecx, %ecx
	.p2align 4,,2
	je	L(mm_return_pop_all)

	cmpb	$8, %cl
	ja	L(mm_remaining_9_16_bytes_forward)
	cmpb	$4, %cl
	.p2align 4,,5
	ja	L(mm_remaining_5_8_bytes_forward)
	cmpb	$2, %cl
	.p2align 4,,1
	ja	L(mm_remaining_3_4_bytes_forward)
	movzbl	-1(%esi,%ecx), %eax
	movzbl	(%esi), %ebx
	movb	%al, -1(%edi,%ecx)
	movb	%bl, (%edi)
	jmp	L(mm_return_pop_all)

L(mm_remaining_33_64_bytes_forward):
	movdqu	(%esi), %xmm0
	movdqu	16(%esi), %xmm1
	movdqu	-32(%esi, %ecx), %xmm2
	movdqu	-16(%esi, %ecx), %xmm3
	movdqu	%xmm0, (%edi)
	movdqu	%xmm1, 16(%edi)
	movdqu	%xmm2, -32(%edi, %ecx)
	movdqu	%xmm3, -16(%edi, %ecx)
	jmp	L(mm_return_pop_all)

L(mm_remaining_17_32_bytes_forward):
	movdqu	(%esi), %xmm0
	movdqu	-16(%esi, %ecx), %xmm1
	movdqu	%xmm0, (%edi)
	movdqu	%xmm1, -16(%edi, %ecx)
	jmp	L(mm_return_pop_all)

L(mm_remaining_9_16_bytes_forward):
	movq	(%esi), %xmm0
	movq	-8(%esi, %ecx), %xmm1
	movq	%xmm0, (%edi)
	movq	%xmm1, -8(%edi, %ecx)
	jmp	L(mm_return_pop_all)

L(mm_remaining_5_8_bytes_forward):
	movl	(%esi), %eax
	movl	-4(%esi,%ecx), %ebx
	movl	%eax, (%edi)
	movl	%ebx, -4(%edi,%ecx)
	jmp	L(mm_return_pop_all)

L(mm_remaining_3_4_bytes_forward):
	movzwl	-2(%esi,%ecx), %eax
	movzwl	(%esi), %ebx
	movw	%ax, -2(%edi,%ecx)
	movw	%bx, (%edi)
	jmp	L(mm_return_pop_all)

L(mm_len_0_16_bytes_forward):
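/* Here the length is at most 16. testb $24, %cl checks bits 3 and 4, i.e.
	whether the length is 8 or more; testb $4 then picks out lengths
	4..7, and so on down to single bytes.  */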
	testb	$24, %cl
	jne	L(mm_len_9_16_bytes_forward)
	testb	$4, %cl
	.p2align 4,,5
	jne	L(mm_len_5_8_bytes_forward)
	testl	%ecx, %ecx
	.p2align 4,,2
	je	L(mm_return)
	testb	$2, %cl
	.p2align 4,,1
	jne	L(mm_len_2_4_bytes_forward)
	movzbl	-1(%eax,%ecx), %ebx
	movzbl	(%eax), %eax
	movb	%bl, -1(%edx,%ecx)
	movb	%al, (%edx)
	jmp	L(mm_return)

L(mm_len_2_4_bytes_forward):
	movzwl	-2(%eax,%ecx), %ebx
	movzwl	(%eax), %eax
	movw	%bx, -2(%edx,%ecx)
	movw	%ax, (%edx)
	jmp	L(mm_return)

L(mm_len_5_8_bytes_forward):
	movl	(%eax), %ebx
	movl	-4(%eax,%ecx), %eax
	movl	%ebx, (%edx)
	movl	%eax, -4(%edx,%ecx)
	jmp	L(mm_return)

L(mm_len_9_16_bytes_forward):
	movq	(%eax), %xmm0
	movq	-8(%eax, %ecx), %xmm1
	movq	%xmm0, (%edx)
	movq	%xmm1, -8(%edx, %ecx)
	jmp	L(mm_return)

	CFI_POP (%edi)
	CFI_POP (%esi)

L(mm_recalc_len):
/* Compute in %ecx how many bytes are left to copy after
	the main loop stops.  */
	movl	%ebx, %ecx
	subl	%edx, %ecx
/* The code for copying backwards.  */
L(mm_len_0_or_more_backward):

/* Now check the length. Lengths in [0..16], (16..32], (32..64] and
	(64..128] bytes are handled separately.  */
	cmp	$16, %ecx
	jbe	L(mm_len_0_16_bytes_backward)

	cmpl	$32, %ecx
	jg	L(mm_len_32_or_more_backward)

/* Copy [0..32] and return.  */
	movdqu	(%eax), %xmm0
	movdqu	-16(%eax, %ecx), %xmm1
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, -16(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_32_or_more_backward):
	cmpl	$64, %ecx
	jg	L(mm_len_64_or_more_backward)

/* Copy [0..64] and return.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	-16(%eax, %ecx), %xmm2
	movdqu	-32(%eax, %ecx), %xmm3
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, -16(%edx, %ecx)
	movdqu	%xmm3, -32(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_64_or_more_backward):
	cmpl	$128, %ecx
	jg	L(mm_len_128_or_more_backward)

/* Copy [0..128] and return.  */
	movdqu	(%eax), %xmm0
	movdqu	16(%eax), %xmm1
	movdqu	32(%eax), %xmm2
	movdqu	48(%eax), %xmm3
	movdqu	-64(%eax, %ecx), %xmm4
	movdqu	-48(%eax, %ecx), %xmm5
	movdqu	-32(%eax, %ecx), %xmm6
	movdqu	-16(%eax, %ecx), %xmm7
	movdqu	%xmm0, (%edx)
	movdqu	%xmm1, 16(%edx)
	movdqu	%xmm2, 32(%edx)
	movdqu	%xmm3, 48(%edx)
	movdqu	%xmm4, -64(%edx, %ecx)
	movdqu	%xmm5, -48(%edx, %ecx)
	movdqu	%xmm6, -32(%edx, %ecx)
	movdqu	%xmm7, -16(%edx, %ecx)
	jmp	L(mm_return)

L(mm_len_128_or_more_backward):
	PUSH (%esi)
	PUSH (%edi)

/* Align the destination address. The last 64 bytes of the source are loaded
	up front so that writing the aligned tail of the destination cannot
	overwrite them.  */
	movdqu	-16(%eax, %ecx), %xmm0
	movdqu	-32(%eax, %ecx), %xmm1
	movdqu	-48(%eax, %ecx), %xmm2
	movdqu	-64(%eax, %ecx), %xmm3

	leal	(%edx, %ecx), %edi
	andl	$-64, %edi

	movl	%eax, %esi
	subl	%edx, %esi
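/* %edi is now the 64-byte-aligned end of the destination and %esi holds
	(src - dst), so (%edi, %esi) addresses the source byte that
	corresponds to destination position %edi.  */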

	movdqu	-16(%edi, %esi), %xmm4
	movdqu	-32(%edi, %esi), %xmm5
	movdqu	-48(%edi, %esi), %xmm6
	movdqu	-64(%edi, %esi), %xmm7

	movdqu	%xmm0, -16(%edx, %ecx)
	movdqu	%xmm1, -32(%edx, %ecx)
	movdqu	%xmm2, -48(%edx, %ecx)
	movdqu	%xmm3, -64(%edx, %ecx)
	movdqa	%xmm4, -16(%edi)
	movdqa	%xmm5, -32(%edi)
	movdqa	%xmm6, -48(%edi)
	movdqa	%xmm7, -64(%edi)
	leal	-64(%edi), %edi

	leal	64(%edx), %ebx
	andl	$-64, %ebx

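/* %ebx is now the first 64-byte-aligned address past the start of the
	destination; the backward block loop stops once %edi comes back down
	to it.  */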
	cmp	%edi, %ebx
	jae	L(mm_main_loop_backward_end)

	PUSH(%ebx)
	SETUP_PIC_REG(bx)
	add	$_GLOBAL_OFFSET_TABLE_, %ebx
	cmp	__x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
	/* Restore %ebx: the pop can sit before the jump because it does not
	   affect the flags.  */
	POP(%ebx)

	jae	L(mm_large_page_loop_backward)

	.p2align 4
L(mm_main_loop_backward):

	prefetcht0 -128(%edi, %esi)

	movdqu	-64(%edi, %esi), %xmm0
	movdqu	-48(%edi, %esi), %xmm1
	movdqu	-32(%edi, %esi), %xmm2
	movdqu	-16(%edi, %esi), %xmm3
	movdqa	%xmm0, -64(%edi)
	movdqa	%xmm1, -48(%edi)
	movdqa	%xmm2, -32(%edi)
	movdqa	%xmm3, -16(%edi)
	leal	-64(%edi), %edi
	cmp	%edi, %ebx
	jb	L(mm_main_loop_backward)
L(mm_main_loop_backward_end):
	POP (%edi)
	POP (%esi)
	jmp	L(mm_recalc_len)

/* Copy [0..16] and return.  */
L(mm_len_0_16_bytes_backward):
	testb	$24, %cl
	jnz	L(mm_len_9_16_bytes_backward)
	testb	$4, %cl
	.p2align 4,,5
	jnz	L(mm_len_5_8_bytes_backward)
	testl	%ecx, %ecx
	.p2align 4,,2
	je	L(mm_return)
	testb	$2, %cl
	.p2align 4,,1
	jne	L(mm_len_3_4_bytes_backward)
	movzbl	-1(%eax,%ecx), %ebx
	movzbl	(%eax), %eax
	movb	%bl, -1(%edx,%ecx)
	movb	%al, (%edx)
	jmp	L(mm_return)

L(mm_len_3_4_bytes_backward):
	movzwl	-2(%eax,%ecx), %ebx
	movzwl	(%eax), %eax
	movw	%bx, -2(%edx,%ecx)
	movw	%ax, (%edx)
	jmp	L(mm_return)

L(mm_len_9_16_bytes_backward):
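/* Copy the highest eight bytes, shrink the length by eight, and re-dispatch:
	the remaining bytes are handled by the short-copy code above.  */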
	PUSH (%esi)
	movl	-4(%eax,%ecx), %ebx
	movl	-8(%eax,%ecx), %esi
	movl	%ebx, -4(%edx,%ecx)
	movl	%esi, -8(%edx,%ecx)
	subl	$8, %ecx
	POP (%esi)
	jmp	L(mm_len_0_16_bytes_backward)

L(mm_len_5_8_bytes_backward):
	movl	(%eax), %ebx
	movl	-4(%eax,%ecx), %eax
	movl	%ebx, (%edx)
	movl	%eax, -4(%edx,%ecx)

L(mm_return):
	movl	%edx, %eax
	RETURN

L(mm_return_pop_all):
	movl	%edx, %eax
	POP (%edi)
	POP (%esi)
	RETURN

/* Big length copy forward part.  */
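/* movntdq stores write around the cache, so a very large copy does not evict
	the working set; the sfence after each loop orders the weakly-ordered
	non-temporal stores ahead of any later stores before returning.  */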

	.p2align 4
L(mm_large_page_loop_forward):
	movdqu	(%eax, %edi), %xmm0
	movdqu	16(%eax, %edi), %xmm1
	movdqu	32(%eax, %edi), %xmm2
	movdqu	48(%eax, %edi), %xmm3
	movntdq	%xmm0, (%edi)
	movntdq	%xmm1, 16(%edi)
	movntdq	%xmm2, 32(%edi)
	movntdq	%xmm3, 48(%edi)
	leal	64(%edi), %edi
	cmp	%edi, %ebx
	ja	L(mm_large_page_loop_forward)
	sfence
	jmp	L(mm_copy_remaining_forward)

/* Big length copy backward part.  */
	.p2align 4
L(mm_large_page_loop_backward):
	movdqu	-64(%edi, %esi), %xmm0
	movdqu	-48(%edi, %esi), %xmm1
	movdqu	-32(%edi, %esi), %xmm2
	movdqu	-16(%edi, %esi), %xmm3
	movntdq	%xmm0, -64(%edi)
	movntdq	%xmm1, -48(%edi)
	movntdq	%xmm2, -32(%edi)
	movntdq	%xmm3, -16(%edi)
	leal	-64(%edi), %edi
	cmp	%edi, %ebx
	jb	L(mm_large_page_loop_backward)
	sfence
	POP (%edi)
	POP (%esi)
	jmp	L(mm_recalc_len)

END (MEMMOVE)

// N.B., `private/bionic_asm.h` provides ALIAS_SYMBOL, but that file provides
// conflicting definitions for some macros in this file. Since ALIAS_SYMBOL is
// small, inline it here.
.globl memcpy;
.equ memcpy, MEMMOVE
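// Aliasing memcpy to this memmove is safe: memmove's copy is also correct
// for non-overlapping buffers, so memcpy callers lose nothing.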
560