/*
Copyright (c) 2014, Intel Corporation
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

    * Redistributions of source code must retain the above copyright notice,
    * this list of conditions and the following disclaimer.

    * Redistributions in binary form must reproduce the above copyright notice,
    * this list of conditions and the following disclaimer in the documentation
    * and/or other materials provided with the distribution.

    * Neither the name of Intel Corporation nor the names of its contributors
    * may be used to endorse or promote products derived from this software
    * without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#define FOR_SILVERMONT

#ifndef MEMMOVE
# define MEMMOVE memmove
#endif

#ifndef L
# define L(label) .L##label
#endif

#ifndef cfi_startproc
# define cfi_startproc .cfi_startproc
#endif

#ifndef cfi_endproc
# define cfi_endproc .cfi_endproc
#endif

#ifndef cfi_rel_offset
# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off
#endif

#ifndef cfi_restore
# define cfi_restore(reg) .cfi_restore reg
#endif

#ifndef cfi_adjust_cfa_offset
# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off
#endif

#ifndef ENTRY
# define ENTRY(name) \
        .type name, @function; \
        .globl name; \
        .p2align 4; \
name: \
        cfi_startproc
#endif

#ifndef END
# define END(name) \
        cfi_endproc; \
        .size name, .-name
#endif

#define DEST            PARMS
#define SRC             DEST+4
#define LEN             SRC+4

#define CFI_PUSH(REG) \
        cfi_adjust_cfa_offset (4); \
        cfi_rel_offset (REG, 0)

#define CFI_POP(REG) \
        cfi_adjust_cfa_offset (-4); \
        cfi_restore (REG)

#define PUSH(REG) pushl REG; CFI_PUSH (REG)
#define POP(REG) popl REG; CFI_POP (REG)

#define PARMS 8 /* Preserve EBX. */
#define ENTRANCE PUSH (%ebx);
#define RETURN_END POP (%ebx); ret
#define RETURN RETURN_END; CFI_PUSH (%ebx)

#define SETUP_PIC_REG(x) call __x86.get_pc_thunk.x

        .section .text.sse2,"ax",@progbits
ENTRY (MEMMOVE)
        ENTRANCE
        movl    LEN(%esp), %ecx
        movl    SRC(%esp), %eax
        movl    DEST(%esp), %edx

/* Check whether we should copy backward or forward. */
        cmp     %eax, %edx
        je      L(mm_return)
        jg      L(mm_len_0_or_more_backward)

/* Now do checks for lengths. We do [0..16], [16..32], [32..64], [64..128]
   separately. */
        cmp     $16, %ecx
        jbe     L(mm_len_0_16_bytes_forward)

        cmpl    $32, %ecx
        ja      L(mm_len_32_or_more_forward)

/* Copy [0..32] and return. */
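/* Lengths in (16..32] are covered by two possibly overlapping 16-byte
   chunks: one taken from the start and one from the end of the buffer.
   Both loads are issued before either store, so the copy is correct
   even when source and destination overlap. */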
        movdqu  (%eax), %xmm0
        movdqu  -16(%eax, %ecx), %xmm1
        movdqu  %xmm0, (%edx)
        movdqu  %xmm1, -16(%edx, %ecx)
        jmp     L(mm_return)

L(mm_len_32_or_more_forward):
        cmpl    $64, %ecx
        ja      L(mm_len_64_or_more_forward)

/* Copy [0..64] and return. */
        movdqu  (%eax), %xmm0
        movdqu  16(%eax), %xmm1
        movdqu  -16(%eax, %ecx), %xmm2
        movdqu  -32(%eax, %ecx), %xmm3
        movdqu  %xmm0, (%edx)
        movdqu  %xmm1, 16(%edx)
        movdqu  %xmm2, -16(%edx, %ecx)
        movdqu  %xmm3, -32(%edx, %ecx)
        jmp     L(mm_return)

L(mm_len_64_or_more_forward):
        cmpl    $128, %ecx
        ja      L(mm_len_128_or_more_forward)

/* Copy [0..128] and return. */
        movdqu  (%eax), %xmm0
        movdqu  16(%eax), %xmm1
        movdqu  32(%eax), %xmm2
        movdqu  48(%eax), %xmm3
        movdqu  -64(%eax, %ecx), %xmm4
        movdqu  -48(%eax, %ecx), %xmm5
        movdqu  -32(%eax, %ecx), %xmm6
        movdqu  -16(%eax, %ecx), %xmm7
        movdqu  %xmm0, (%edx)
        movdqu  %xmm1, 16(%edx)
        movdqu  %xmm2, 32(%edx)
        movdqu  %xmm3, 48(%edx)
        movdqu  %xmm4, -64(%edx, %ecx)
        movdqu  %xmm5, -48(%edx, %ecx)
        movdqu  %xmm6, -32(%edx, %ecx)
        movdqu  %xmm7, -16(%edx, %ecx)
        jmp     L(mm_return)

L(mm_len_128_or_more_forward):
        PUSH (%esi)
        PUSH (%edi)

/* Align the destination to 64 bytes: copy the first 64 bytes with
   unaligned stores, point %edi at the first 64-byte boundary past the
   start of the destination, and turn %eax into the (src - dst) offset. */
        movdqu  (%eax), %xmm0
        movdqu  16(%eax), %xmm1
        movdqu  32(%eax), %xmm2
        movdqu  48(%eax), %xmm3

        leal    64(%edx), %edi
        andl    $-64, %edi
        subl    %edx, %eax

        movdqu  (%eax, %edi), %xmm4
        movdqu  16(%eax, %edi), %xmm5
        movdqu  32(%eax, %edi), %xmm6
        movdqu  48(%eax, %edi), %xmm7

        movdqu  %xmm0, (%edx)
        movdqu  %xmm1, 16(%edx)
        movdqu  %xmm2, 32(%edx)
        movdqu  %xmm3, 48(%edx)
        movdqa  %xmm4, (%edi)
        movaps  %xmm5, 16(%edi)
        movaps  %xmm6, 32(%edi)
        movaps  %xmm7, 48(%edi)
        addl    $64, %edi

        leal    (%edx, %ecx), %ebx
        andl    $-64, %ebx
        cmp     %edi, %ebx
        jbe     L(mm_copy_remaining_forward)

        PUSH(%ebx)
        SETUP_PIC_REG(bx)
        add     $_GLOBAL_OFFSET_TABLE_, %ebx
        cmp     __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
/* Restore ebx. We can place a pop before the jump
   as it doesn't affect any flags. */
        POP(%ebx)

/* For copies of at least half the shared cache size, use the
   non-temporal-store loop so the copy does not sweep the cache. */
        jae     L(mm_large_page_loop_forward)

        .p2align 4
L(mm_main_loop_forward):

        prefetcht0 128(%eax, %edi)

        movdqu  (%eax, %edi), %xmm0
        movdqu  16(%eax, %edi), %xmm1
        movdqu  32(%eax, %edi), %xmm2
        movdqu  48(%eax, %edi), %xmm3
        movdqa  %xmm0, (%edi)
        movaps  %xmm1, 16(%edi)
        movaps  %xmm2, 32(%edi)
        movaps  %xmm3, 48(%edi)
        leal    64(%edi), %edi
        cmp     %edi, %ebx
        ja      L(mm_main_loop_forward)

L(mm_copy_remaining_forward):
        addl    %edx, %ecx
        subl    %edi, %ecx
/* We have copied everything up to %edi in the destination;
   %ecx now holds the number of bytes left to copy.
   Set %esi to the matching source position. */
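/* %eax still holds (src - dst) from the setup in
   L(mm_len_128_or_more_forward), so %edi + %eax is the source address
   corresponding to the destination position in %edi. */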
        leal    (%edi, %eax), %esi

L(mm_remaining_0_64_bytes_forward):
        cmp     $32, %ecx
        ja      L(mm_remaining_33_64_bytes_forward)
        cmp     $16, %ecx
        ja      L(mm_remaining_17_32_bytes_forward)
        testl   %ecx, %ecx
        .p2align 4,,2
        je      L(mm_return_pop_all)

        cmpb    $8, %cl
        ja      L(mm_remaining_9_16_bytes_forward)
        cmpb    $4, %cl
        .p2align 4,,5
        ja      L(mm_remaining_5_8_bytes_forward)
        cmpb    $2, %cl
        .p2align 4,,1
        ja      L(mm_remaining_3_4_bytes_forward)
        movzbl  -1(%esi,%ecx), %eax
        movzbl  (%esi), %ebx
        movb    %al, -1(%edi,%ecx)
        movb    %bl, (%edi)
        jmp     L(mm_return_pop_all)

L(mm_remaining_33_64_bytes_forward):
        movdqu  (%esi), %xmm0
        movdqu  16(%esi), %xmm1
        movdqu  -32(%esi, %ecx), %xmm2
        movdqu  -16(%esi, %ecx), %xmm3
        movdqu  %xmm0, (%edi)
        movdqu  %xmm1, 16(%edi)
        movdqu  %xmm2, -32(%edi, %ecx)
        movdqu  %xmm3, -16(%edi, %ecx)
        jmp     L(mm_return_pop_all)

L(mm_remaining_17_32_bytes_forward):
        movdqu  (%esi), %xmm0
        movdqu  -16(%esi, %ecx), %xmm1
        movdqu  %xmm0, (%edi)
        movdqu  %xmm1, -16(%edi, %ecx)
        jmp     L(mm_return_pop_all)

L(mm_remaining_9_16_bytes_forward):
        movq    (%esi), %xmm0
        movq    -8(%esi, %ecx), %xmm1
        movq    %xmm0, (%edi)
        movq    %xmm1, -8(%edi, %ecx)
        jmp     L(mm_return_pop_all)

L(mm_remaining_5_8_bytes_forward):
        movl    (%esi), %eax
        movl    -4(%esi,%ecx), %ebx
        movl    %eax, (%edi)
        movl    %ebx, -4(%edi,%ecx)
        jmp     L(mm_return_pop_all)

L(mm_remaining_3_4_bytes_forward):
        movzwl  -2(%esi,%ecx), %eax
        movzwl  (%esi), %ebx
        movw    %ax, -2(%edi,%ecx)
        movw    %bx, (%edi)
        jmp     L(mm_return_pop_all)

L(mm_len_0_16_bytes_forward):
        testb   $24, %cl
        jne     L(mm_len_9_16_bytes_forward)
        testb   $4, %cl
        .p2align 4,,5
        jne     L(mm_len_5_8_bytes_forward)
        testl   %ecx, %ecx
        .p2align 4,,2
        je      L(mm_return)
        testb   $2, %cl
        .p2align 4,,1
        jne     L(mm_len_2_4_bytes_forward)
        movzbl  -1(%eax,%ecx), %ebx
        movzbl  (%eax), %eax
        movb    %bl, -1(%edx,%ecx)
        movb    %al, (%edx)
        jmp     L(mm_return)

L(mm_len_2_4_bytes_forward):
        movzwl  -2(%eax,%ecx), %ebx
        movzwl  (%eax), %eax
        movw    %bx, -2(%edx,%ecx)
        movw    %ax, (%edx)
        jmp     L(mm_return)

L(mm_len_5_8_bytes_forward):
        movl    (%eax), %ebx
        movl    -4(%eax,%ecx), %eax
        movl    %ebx, (%edx)
        movl    %eax, -4(%edx,%ecx)
        jmp     L(mm_return)

L(mm_len_9_16_bytes_forward):
        movq    (%eax), %xmm0
        movq    -8(%eax, %ecx), %xmm1
        movq    %xmm0, (%edx)
        movq    %xmm1, -8(%edx, %ecx)
        jmp     L(mm_return)

        CFI_POP (%edi)
        CFI_POP (%esi)

L(mm_recalc_len):
/* Compute in %ecx how many bytes are left to copy after
   the main loop stops. */
        movl    %ebx, %ecx
        subl    %edx, %ecx
/* The code for copying backwards. */
L(mm_len_0_or_more_backward):

/* Now do checks for lengths. We do [0..16], [16..32], [32..64], [64..128]
   separately. */
        cmp     $16, %ecx
        jbe     L(mm_len_0_16_bytes_backward)

        cmpl    $32, %ecx
        jg      L(mm_len_32_or_more_backward)

/* Copy [0..32] and return. */
        movdqu  (%eax), %xmm0
        movdqu  -16(%eax, %ecx), %xmm1
        movdqu  %xmm0, (%edx)
        movdqu  %xmm1, -16(%edx, %ecx)
        jmp     L(mm_return)

L(mm_len_32_or_more_backward):
        cmpl    $64, %ecx
        jg      L(mm_len_64_or_more_backward)

/* Copy [0..64] and return. */
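/* Same head/tail technique as the forward case: two 16-byte chunks from
   the start and two from the end of the buffer, overlapping in the
   middle for lengths under 64. All loads complete before the first
   store, so overlapping buffers are handled correctly. */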
        movdqu  (%eax), %xmm0
        movdqu  16(%eax), %xmm1
        movdqu  -16(%eax, %ecx), %xmm2
        movdqu  -32(%eax, %ecx), %xmm3
        movdqu  %xmm0, (%edx)
        movdqu  %xmm1, 16(%edx)
        movdqu  %xmm2, -16(%edx, %ecx)
        movdqu  %xmm3, -32(%edx, %ecx)
        jmp     L(mm_return)

L(mm_len_64_or_more_backward):
        cmpl    $128, %ecx
        jg      L(mm_len_128_or_more_backward)

/* Copy [0..128] and return. */
        movdqu  (%eax), %xmm0
        movdqu  16(%eax), %xmm1
        movdqu  32(%eax), %xmm2
        movdqu  48(%eax), %xmm3
        movdqu  -64(%eax, %ecx), %xmm4
        movdqu  -48(%eax, %ecx), %xmm5
        movdqu  -32(%eax, %ecx), %xmm6
        movdqu  -16(%eax, %ecx), %xmm7
        movdqu  %xmm0, (%edx)
        movdqu  %xmm1, 16(%edx)
        movdqu  %xmm2, 32(%edx)
        movdqu  %xmm3, 48(%edx)
        movdqu  %xmm4, -64(%edx, %ecx)
        movdqu  %xmm5, -48(%edx, %ecx)
        movdqu  %xmm6, -32(%edx, %ecx)
        movdqu  %xmm7, -16(%edx, %ecx)
        jmp     L(mm_return)

L(mm_len_128_or_more_backward):
        PUSH (%esi)
        PUSH (%edi)

/* Align the destination address: %edi becomes the 64-byte boundary at
   or below the end of the destination and %esi becomes the (src - dst)
   offset. The last 64 bytes of the source are loaded first so that the
   stores cannot clobber them when the regions overlap. */
        movdqu  -16(%eax, %ecx), %xmm0
        movdqu  -32(%eax, %ecx), %xmm1
        movdqu  -48(%eax, %ecx), %xmm2
        movdqu  -64(%eax, %ecx), %xmm3

        leal    (%edx, %ecx), %edi
        andl    $-64, %edi

        movl    %eax, %esi
        subl    %edx, %esi

        movdqu  -16(%edi, %esi), %xmm4
        movdqu  -32(%edi, %esi), %xmm5
        movdqu  -48(%edi, %esi), %xmm6
        movdqu  -64(%edi, %esi), %xmm7

        movdqu  %xmm0, -16(%edx, %ecx)
        movdqu  %xmm1, -32(%edx, %ecx)
        movdqu  %xmm2, -48(%edx, %ecx)
        movdqu  %xmm3, -64(%edx, %ecx)
        movdqa  %xmm4, -16(%edi)
        movdqa  %xmm5, -32(%edi)
        movdqa  %xmm6, -48(%edi)
        movdqa  %xmm7, -64(%edi)
        leal    -64(%edi), %edi

        leal    64(%edx), %ebx
        andl    $-64, %ebx

        cmp     %edi, %ebx
        jae     L(mm_main_loop_backward_end)

        PUSH(%ebx)
        SETUP_PIC_REG(bx)
        add     $_GLOBAL_OFFSET_TABLE_, %ebx
        cmp     __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
/* Restore ebx. We can place a pop before the jump
   as it doesn't affect any flags. */
        POP(%ebx)

/* For copies of at least half the shared cache size, use the
   non-temporal-store loop so the copy does not sweep the cache. */
        jae     L(mm_large_page_loop_backward)

        .p2align 4
L(mm_main_loop_backward):

        prefetcht0 -128(%edi, %esi)

        movdqu  -64(%edi, %esi), %xmm0
        movdqu  -48(%edi, %esi), %xmm1
        movdqu  -32(%edi, %esi), %xmm2
        movdqu  -16(%edi, %esi), %xmm3
        movdqa  %xmm0, -64(%edi)
        movdqa  %xmm1, -48(%edi)
        movdqa  %xmm2, -32(%edi)
        movdqa  %xmm3, -16(%edi)
        leal    -64(%edi), %edi
        cmp     %edi, %ebx
        jb      L(mm_main_loop_backward)
L(mm_main_loop_backward_end):
        POP (%edi)
        POP (%esi)
        jmp     L(mm_recalc_len)

/* Copy [0..16] and return. */
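/* As in the forward path, dispatch on the length bits instead of a
   comparison chain: for lengths of at most 16 bytes, 'testb $24, %cl'
   is non-zero for lengths 8..16, 'testb $4, %cl' for 4..7 and
   'testb $2, %cl' for 2..3; a zero length returns and a single byte
   falls through to the byte copy. */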
L(mm_len_0_16_bytes_backward):
        testb   $24, %cl
        jnz     L(mm_len_9_16_bytes_backward)
        testb   $4, %cl
        .p2align 4,,5
        jnz     L(mm_len_5_8_bytes_backward)
        testl   %ecx, %ecx
        .p2align 4,,2
        je      L(mm_return)
        testb   $2, %cl
        .p2align 4,,1
        jne     L(mm_len_3_4_bytes_backward)
        movzbl  -1(%eax,%ecx), %ebx
        movzbl  (%eax), %eax
        movb    %bl, -1(%edx,%ecx)
        movb    %al, (%edx)
        jmp     L(mm_return)

L(mm_len_3_4_bytes_backward):
        movzwl  -2(%eax,%ecx), %ebx
        movzwl  (%eax), %eax
        movw    %bx, -2(%edx,%ecx)
        movw    %ax, (%edx)
        jmp     L(mm_return)

L(mm_len_9_16_bytes_backward):
        PUSH (%esi)
        movl    -4(%eax,%ecx), %ebx
        movl    -8(%eax,%ecx), %esi
        movl    %ebx, -4(%edx,%ecx)
        movl    %esi, -8(%edx,%ecx)
        subl    $8, %ecx
        POP (%esi)
        jmp     L(mm_len_0_16_bytes_backward)

L(mm_len_5_8_bytes_backward):
        movl    (%eax), %ebx
        movl    -4(%eax,%ecx), %eax
        movl    %ebx, (%edx)
        movl    %eax, -4(%edx,%ecx)

L(mm_return):
        movl    %edx, %eax
        RETURN

L(mm_return_pop_all):
        movl    %edx, %eax
        POP (%edi)
        POP (%esi)
        RETURN

/* Forward copy for big lengths (non-temporal stores). */

        .p2align 4
L(mm_large_page_loop_forward):
        movdqu  (%eax, %edi), %xmm0
        movdqu  16(%eax, %edi), %xmm1
        movdqu  32(%eax, %edi), %xmm2
        movdqu  48(%eax, %edi), %xmm3
        movntdq %xmm0, (%edi)
        movntdq %xmm1, 16(%edi)
        movntdq %xmm2, 32(%edi)
        movntdq %xmm3, 48(%edi)
        leal    64(%edi), %edi
        cmp     %edi, %ebx
        ja      L(mm_large_page_loop_forward)
        sfence
        jmp     L(mm_copy_remaining_forward)

/* Backward copy for big lengths (non-temporal stores). */
        .p2align 4
L(mm_large_page_loop_backward):
        movdqu  -64(%edi, %esi), %xmm0
        movdqu  -48(%edi, %esi), %xmm1
        movdqu  -32(%edi, %esi), %xmm2
        movdqu  -16(%edi, %esi), %xmm3
        movntdq %xmm0, -64(%edi)
        movntdq %xmm1, -48(%edi)
        movntdq %xmm2, -32(%edi)
        movntdq %xmm3, -16(%edi)
        leal    -64(%edi), %edi
        cmp     %edi, %ebx
        jb      L(mm_large_page_loop_backward)
        sfence
        POP (%edi)
        POP (%esi)
        jmp     L(mm_recalc_len)

END (MEMMOVE)

// N.B.: `private/bionic_asm.h` provides ALIAS_SYMBOL, but it also defines
// some macros that conflict with this file's. Since ALIAS_SYMBOL is small,
// inline it here.
.globl memcpy;
.equ memcpy, MEMMOVE
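// Exporting memmove under the memcpy name is safe: memmove's behaviour is a
// superset of memcpy's, since every valid memcpy call (non-overlapping
// buffers) is also a valid memmove call.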