// NOTE(review): this listing was damaged in extraction — the generator's
// original line numbers (e.g. "82 83 84") were fused into the text and the
// original newlines were lost.  The instruction bytes below are preserved
// unchanged; do not hand-edit, regenerate from the perlasm source instead.
//
// ChaCha20_ctr32(out, in, len, key, counter) — SysV AMD64:
//   %rdi = out, %rsi = in, %rdx = len (returns immediately when 0),
//   %rcx = 32-byte key (two movdqu loads), %r8 = 16-byte counter/nonce block.
// Dispatch: bit $512 of OPENSSL_ia32cap_P+4 selects .LChaCha20_ssse3;
// otherwise the scalar integer path below runs one 64-byte block per
// .Loop_outer iteration.  .rodata holds the "expand 32-byte k" sigma,
// pshufb rotate tables (.Lrot16/.Lrot24) and counter-increment vectors.
1// This file is generated from a similarly-named Perl script in the BoringSSL 2// source tree. Do not edit by hand. 3 4#include <ring-core/asm_base.h> 5 6#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) 7.text 8 9.extern OPENSSL_ia32cap_P 10.hidden OPENSSL_ia32cap_P 11 12.section .rodata 13.align 64 14.Lzero: 15.long 0,0,0,0 16.Lone: 17.long 1,0,0,0 18.Linc: 19.long 0,1,2,3 20.Lfour: 21.long 4,4,4,4 22.Lincy: 23.long 0,2,4,6,1,3,5,7 24.Leight: 25.long 8,8,8,8,8,8,8,8 26.Lrot16: 27.byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd 28.Lrot24: 29.byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe 30.Lsigma: 31.byte 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0 32.align 64 33.Lzeroz: 34.long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0 35.Lfourz: 36.long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0 37.Lincz: 38.long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 39.Lsixteen: 40.long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16 41.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 42.text 43.globl ChaCha20_ctr32 44.hidden ChaCha20_ctr32 45.type ChaCha20_ctr32,@function 46.align 64 47ChaCha20_ctr32: 48.cfi_startproc 49_CET_ENDBR 50 cmpq $0,%rdx 51 je .Lno_data 52 movq OPENSSL_ia32cap_P+4(%rip),%r10 53 testl $512,%r10d 54 jnz .LChaCha20_ssse3 55 56 pushq %rbx 57.cfi_adjust_cfa_offset 8 58.cfi_offset rbx,-16 59 pushq %rbp 60.cfi_adjust_cfa_offset 8 61.cfi_offset rbp,-24 62 pushq %r12 63.cfi_adjust_cfa_offset 8 64.cfi_offset r12,-32 65 pushq %r13 66.cfi_adjust_cfa_offset 8 67.cfi_offset r13,-40 68 pushq %r14 69.cfi_adjust_cfa_offset 8 70.cfi_offset r14,-48 71 pushq %r15 72.cfi_adjust_cfa_offset 8 73.cfi_offset r15,-56 74 subq $64+24,%rsp 75.cfi_adjust_cfa_offset 88 76.Lctr32_body: 77 78 79 movdqu (%rcx),%xmm1 80 movdqu 16(%rcx),%xmm2 81 movdqu (%r8),%xmm3 82 movdqa 
.Lone(%rip),%xmm4 83 84 85 movdqa %xmm1,16(%rsp) 86 movdqa %xmm2,32(%rsp) 87 movdqa %xmm3,48(%rsp) 88 movq %rdx,%rbp 89 jmp .Loop_outer 90 91.align 32 92.Loop_outer: 93 movl $0x61707865,%eax 94 movl $0x3320646e,%ebx 95 movl $0x79622d32,%ecx 96 movl $0x6b206574,%edx 97 movl 16(%rsp),%r8d 98 movl 20(%rsp),%r9d 99 movl 24(%rsp),%r10d 100 movl 28(%rsp),%r11d 101 movd %xmm3,%r12d 102 movl 52(%rsp),%r13d 103 movl 56(%rsp),%r14d 104 movl 60(%rsp),%r15d 105 106 movq %rbp,64+0(%rsp) 107 movl $10,%ebp 108 movq %rsi,64+8(%rsp) 109.byte 102,72,15,126,214 110 movq %rdi,64+16(%rsp) 111 movq %rsi,%rdi 112 shrq $32,%rdi 113 jmp .Loop 114 115.align 32 116.Loop: 117 addl %r8d,%eax 118 xorl %eax,%r12d 119 roll $16,%r12d 120 addl %r9d,%ebx 121 xorl %ebx,%r13d 122 roll $16,%r13d 123 addl %r12d,%esi 124 xorl %esi,%r8d 125 roll $12,%r8d 126 addl %r13d,%edi 127 xorl %edi,%r9d 128 roll $12,%r9d 129 addl %r8d,%eax 130 xorl %eax,%r12d 131 roll $8,%r12d 132 addl %r9d,%ebx 133 xorl %ebx,%r13d 134 roll $8,%r13d 135 addl %r12d,%esi 136 xorl %esi,%r8d 137 roll $7,%r8d 138 addl %r13d,%edi 139 xorl %edi,%r9d 140 roll $7,%r9d 141 movl %esi,32(%rsp) 142 movl %edi,36(%rsp) 143 movl 40(%rsp),%esi 144 movl 44(%rsp),%edi 145 addl %r10d,%ecx 146 xorl %ecx,%r14d 147 roll $16,%r14d 148 addl %r11d,%edx 149 xorl %edx,%r15d 150 roll $16,%r15d 151 addl %r14d,%esi 152 xorl %esi,%r10d 153 roll $12,%r10d 154 addl %r15d,%edi 155 xorl %edi,%r11d 156 roll $12,%r11d 157 addl %r10d,%ecx 158 xorl %ecx,%r14d 159 roll $8,%r14d 160 addl %r11d,%edx 161 xorl %edx,%r15d 162 roll $8,%r15d 163 addl %r14d,%esi 164 xorl %esi,%r10d 165 roll $7,%r10d 166 addl %r15d,%edi 167 xorl %edi,%r11d 168 roll $7,%r11d 169 addl %r9d,%eax 170 xorl %eax,%r15d 171 roll $16,%r15d 172 addl %r10d,%ebx 173 xorl %ebx,%r12d 174 roll $16,%r12d 175 addl %r15d,%esi 176 xorl %esi,%r9d 177 roll $12,%r9d 178 addl %r12d,%edi 179 xorl %edi,%r10d 180 roll $12,%r10d 181 addl %r9d,%eax 182 xorl %eax,%r15d 183 roll $8,%r15d 184 addl %r10d,%ebx 185 xorl %ebx,%r12d 
// (inside .Loop, continued: the add/xor/rol 16,12,8,7 sequences are the
// ChaCha quarter-rounds; ten passes of .Loop = 10 column+diagonal double
// rounds.  Two state words at a time are spilled to 32(%rsp)/36(%rsp) and
// swapped with 40(%rsp)/44(%rsp) because only 14 GPRs are available.)
186 roll $8,%r12d 187 addl %r15d,%esi 188 xorl %esi,%r9d 189 roll $7,%r9d 190 addl %r12d,%edi 191 xorl %edi,%r10d 192 roll $7,%r10d 193 movl %esi,40(%rsp) 194 movl %edi,44(%rsp) 195 movl 32(%rsp),%esi 196 movl 36(%rsp),%edi 197 addl %r11d,%ecx 198 xorl %ecx,%r13d 199 roll $16,%r13d 200 addl %r8d,%edx 201 xorl %edx,%r14d 202 roll $16,%r14d 203 addl %r13d,%esi 204 xorl %esi,%r11d 205 roll $12,%r11d 206 addl %r14d,%edi 207 xorl %edi,%r8d 208 roll $12,%r8d 209 addl %r11d,%ecx 210 xorl %ecx,%r13d 211 roll $8,%r13d 212 addl %r8d,%edx 213 xorl %edx,%r14d 214 roll $8,%r14d 215 addl %r13d,%esi 216 xorl %esi,%r11d 217 roll $7,%r11d 218 addl %r14d,%edi 219 xorl %edi,%r8d 220 roll $7,%r8d 221 decl %ebp 222 jnz .Loop 223 movl %edi,36(%rsp) 224 movl %esi,32(%rsp) 225 movq 64(%rsp),%rbp 226 movdqa %xmm2,%xmm1 227 movq 64+8(%rsp),%rsi 228 paddd %xmm4,%xmm3 229 movq 64+16(%rsp),%rdi 230 231 addl $0x61707865,%eax 232 addl $0x3320646e,%ebx 233 addl $0x79622d32,%ecx 234 addl $0x6b206574,%edx 235 addl 16(%rsp),%r8d 236 addl 20(%rsp),%r9d 237 addl 24(%rsp),%r10d 238 addl 28(%rsp),%r11d 239 addl 48(%rsp),%r12d 240 addl 52(%rsp),%r13d 241 addl 56(%rsp),%r14d 242 addl 60(%rsp),%r15d 243 paddd 32(%rsp),%xmm1 244 245 cmpq $64,%rbp 246 jb .Ltail 247 248 xorl 0(%rsi),%eax 249 xorl 4(%rsi),%ebx 250 xorl 8(%rsi),%ecx 251 xorl 12(%rsi),%edx 252 xorl 16(%rsi),%r8d 253 xorl 20(%rsi),%r9d 254 xorl 24(%rsi),%r10d 255 xorl 28(%rsi),%r11d 256 movdqu 32(%rsi),%xmm0 257 xorl 48(%rsi),%r12d 258 xorl 52(%rsi),%r13d 259 xorl 56(%rsi),%r14d 260 xorl 60(%rsi),%r15d 261 leaq 64(%rsi),%rsi 262 pxor %xmm1,%xmm0 263 264 movdqa %xmm2,32(%rsp) 265 movd %xmm3,48(%rsp) 266 267 movl %eax,0(%rdi) 268 movl %ebx,4(%rdi) 269 movl %ecx,8(%rdi) 270 movl %edx,12(%rdi) 271 movl %r8d,16(%rdi) 272 movl %r9d,20(%rdi) 273 movl %r10d,24(%rdi) 274 movl %r11d,28(%rdi) 275 movdqu %xmm0,32(%rdi) 276 movl %r12d,48(%rdi) 277 movl %r13d,52(%rdi) 278 movl %r14d,56(%rdi) 279 movl %r15d,60(%rdi) 280 leaq 64(%rdi),%rdi 281 282 subq $64,%rbp 
// Below: .Ltail spills the final keystream block to (%rsp) and xors it
// into the output byte-by-byte (.Loop_tail, %rbx as index); .Ldone pops
// the six callee-saved registers via %rsi and returns.  Then follows
// ChaCha20_ssse3 (.type at "333" in the fused numbering): the 128-bit
// single-block path — inputs ≤128 bytes stay here, larger go to
// .LChaCha20_4x; rotate-by-16/8 uses pshufb with .Lrot16/.Lrot24.
283 jnz .Loop_outer 284 285 jmp .Ldone 286 287.align 16 288.Ltail: 289 movl %eax,0(%rsp) 290 movl %ebx,4(%rsp) 291 xorq %rbx,%rbx 292 movl %ecx,8(%rsp) 293 movl %edx,12(%rsp) 294 movl %r8d,16(%rsp) 295 movl %r9d,20(%rsp) 296 movl %r10d,24(%rsp) 297 movl %r11d,28(%rsp) 298 movdqa %xmm1,32(%rsp) 299 movl %r12d,48(%rsp) 300 movl %r13d,52(%rsp) 301 movl %r14d,56(%rsp) 302 movl %r15d,60(%rsp) 303 304.Loop_tail: 305 movzbl (%rsi,%rbx,1),%eax 306 movzbl (%rsp,%rbx,1),%edx 307 leaq 1(%rbx),%rbx 308 xorl %edx,%eax 309 movb %al,-1(%rdi,%rbx,1) 310 decq %rbp 311 jnz .Loop_tail 312 313.Ldone: 314 leaq 64+24+48(%rsp),%rsi 315 movq -48(%rsi),%r15 316.cfi_restore r15 317 movq -40(%rsi),%r14 318.cfi_restore r14 319 movq -32(%rsi),%r13 320.cfi_restore r13 321 movq -24(%rsi),%r12 322.cfi_restore r12 323 movq -16(%rsi),%rbp 324.cfi_restore rbp 325 movq -8(%rsi),%rbx 326.cfi_restore rbx 327 leaq (%rsi),%rsp 328.cfi_adjust_cfa_offset -136 329.Lno_data: 330 ret 331.cfi_endproc 332.size ChaCha20_ctr32,.-ChaCha20_ctr32 333.type ChaCha20_ssse3,@function 334.align 32 335ChaCha20_ssse3: 336.LChaCha20_ssse3: 337.cfi_startproc 338 movq %rsp,%r9 339.cfi_def_cfa_register r9 340 cmpq $128,%rdx 341 ja .LChaCha20_4x 342 343.Ldo_sse3_after_all: 344 subq $64+8,%rsp 345 movdqa .Lsigma(%rip),%xmm0 346 movdqu (%rcx),%xmm1 347 movdqu 16(%rcx),%xmm2 348 movdqu (%r8),%xmm3 349 movdqa .Lrot16(%rip),%xmm6 350 movdqa .Lrot24(%rip),%xmm7 351 352 movdqa %xmm0,0(%rsp) 353 movdqa %xmm1,16(%rsp) 354 movdqa %xmm2,32(%rsp) 355 movdqa %xmm3,48(%rsp) 356 movq $10,%r8 357 jmp .Loop_ssse3 358 359.align 32 360.Loop_outer_ssse3: 361 movdqa .Lone(%rip),%xmm3 362 movdqa 0(%rsp),%xmm0 363 movdqa 16(%rsp),%xmm1 364 movdqa 32(%rsp),%xmm2 365 paddd 48(%rsp),%xmm3 366 movq $10,%r8 367 movdqa %xmm3,48(%rsp) 368 jmp .Loop_ssse3 369 370.align 32 371.Loop_ssse3: 372 paddd %xmm1,%xmm0 373 pxor %xmm0,%xmm3 374.byte 102,15,56,0,222 375 paddd %xmm3,%xmm2 376 pxor %xmm2,%xmm1 377 movdqa %xmm1,%xmm4 378 psrld $20,%xmm1 379 pslld $12,%xmm4 
// (inside .Loop_ssse3: the ".byte 102,15,56,0,N" sequences are hand-encoded
// pshufb (66 0F 38 00) against the .Lrot16/.Lrot24 tables in %xmm6/%xmm7;
// rot-12 and rot-7 are done as pslld/psrld + por; the pshufd $78/$57/$147
// lane rotations convert column rounds to diagonal rounds and back.)
380 por %xmm4,%xmm1 381 paddd %xmm1,%xmm0 382 pxor %xmm0,%xmm3 383.byte 102,15,56,0,223 384 paddd %xmm3,%xmm2 385 pxor %xmm2,%xmm1 386 movdqa %xmm1,%xmm4 387 psrld $25,%xmm1 388 pslld $7,%xmm4 389 por %xmm4,%xmm1 390 pshufd $78,%xmm2,%xmm2 391 pshufd $57,%xmm1,%xmm1 392 pshufd $147,%xmm3,%xmm3 393 nop 394 paddd %xmm1,%xmm0 395 pxor %xmm0,%xmm3 396.byte 102,15,56,0,222 397 paddd %xmm3,%xmm2 398 pxor %xmm2,%xmm1 399 movdqa %xmm1,%xmm4 400 psrld $20,%xmm1 401 pslld $12,%xmm4 402 por %xmm4,%xmm1 403 paddd %xmm1,%xmm0 404 pxor %xmm0,%xmm3 405.byte 102,15,56,0,223 406 paddd %xmm3,%xmm2 407 pxor %xmm2,%xmm1 408 movdqa %xmm1,%xmm4 409 psrld $25,%xmm1 410 pslld $7,%xmm4 411 por %xmm4,%xmm1 412 pshufd $78,%xmm2,%xmm2 413 pshufd $147,%xmm1,%xmm1 414 pshufd $57,%xmm3,%xmm3 415 decq %r8 416 jnz .Loop_ssse3 417 paddd 0(%rsp),%xmm0 418 paddd 16(%rsp),%xmm1 419 paddd 32(%rsp),%xmm2 420 paddd 48(%rsp),%xmm3 421 422 cmpq $64,%rdx 423 jb .Ltail_ssse3 424 425 movdqu 0(%rsi),%xmm4 426 movdqu 16(%rsi),%xmm5 427 pxor %xmm4,%xmm0 428 movdqu 32(%rsi),%xmm4 429 pxor %xmm5,%xmm1 430 movdqu 48(%rsi),%xmm5 431 leaq 64(%rsi),%rsi 432 pxor %xmm4,%xmm2 433 pxor %xmm5,%xmm3 434 435 movdqu %xmm0,0(%rdi) 436 movdqu %xmm1,16(%rdi) 437 movdqu %xmm2,32(%rdi) 438 movdqu %xmm3,48(%rdi) 439 leaq 64(%rdi),%rdi 440 441 subq $64,%rdx 442 jnz .Loop_outer_ssse3 443 444 jmp .Ldone_ssse3 445 446.align 16 447.Ltail_ssse3: 448 movdqa %xmm0,0(%rsp) 449 movdqa %xmm1,16(%rsp) 450 movdqa %xmm2,32(%rsp) 451 movdqa %xmm3,48(%rsp) 452 xorq %r8,%r8 453 454.Loop_tail_ssse3: 455 movzbl (%rsi,%r8,1),%eax 456 movzbl (%rsp,%r8,1),%ecx 457 leaq 1(%r8),%r8 458 xorl %ecx,%eax 459 movb %al,-1(%rdi,%r8,1) 460 decq %rdx 461 jnz .Loop_tail_ssse3 462 463.Ldone_ssse3: 464 leaq (%r9),%rsp 465.cfi_def_cfa_register rsp 466.Lssse3_epilogue: 467 ret 468.cfi_endproc 469.size ChaCha20_ssse3,.-ChaCha20_ssse3 470.type ChaCha20_4x,@function 471.align 32 472ChaCha20_4x: 473.LChaCha20_4x: 474.cfi_startproc 475 movq %rsp,%r9 
// ChaCha20_4x, continued from the previous line (prologue saved %rsp in %r9).
// 4-way interleaved SSSE3 path: each xmm register holds one state word
// across four consecutive blocks (lanes splatted with pshufd below), so one
// .Loop4x pass advances four 64-byte blocks (256 bytes) at once.
// %r10 carries the second OPENSSL_ia32cap_P word: bit $32 (after shrq $32)
// sends large inputs to .LChaCha20_8x (AVX2); inputs ≤192 bytes on CPUs
// matching the $71303168/$4194304 feature test fall back to the one-block
// path at .Ldo_sse3_after_all.  %rcx is repointed at 256(%rsp) as the base
// for the saved per-lane input state; %r10/%r11 point at .Lrot16/.Lrot24.
476.cfi_def_cfa_register r9 477 movq %r10,%r11 478 shrq $32,%r10 479 testq $32,%r10 480 jnz .LChaCha20_8x 481 cmpq $192,%rdx 482 ja .Lproceed4x 483 484 andq $71303168,%r11 485 cmpq $4194304,%r11 486 je .Ldo_sse3_after_all 487 488.Lproceed4x: 489 subq $0x140+8,%rsp 490 movdqa .Lsigma(%rip),%xmm11 491 movdqu (%rcx),%xmm15 492 movdqu 16(%rcx),%xmm7 493 movdqu (%r8),%xmm3 494 leaq 256(%rsp),%rcx 495 leaq .Lrot16(%rip),%r10 496 leaq .Lrot24(%rip),%r11 497 498 pshufd $0x00,%xmm11,%xmm8 499 pshufd $0x55,%xmm11,%xmm9 500 movdqa %xmm8,64(%rsp) 501 pshufd $0xaa,%xmm11,%xmm10 502 movdqa %xmm9,80(%rsp) 503 pshufd $0xff,%xmm11,%xmm11 504 movdqa %xmm10,96(%rsp) 505 movdqa %xmm11,112(%rsp) 506 507 pshufd $0x00,%xmm15,%xmm12 508 pshufd $0x55,%xmm15,%xmm13 509 movdqa %xmm12,128-256(%rcx) 510 pshufd $0xaa,%xmm15,%xmm14 511 movdqa %xmm13,144-256(%rcx) 512 pshufd $0xff,%xmm15,%xmm15 513 movdqa %xmm14,160-256(%rcx) 514 movdqa %xmm15,176-256(%rcx) 515 516 pshufd $0x00,%xmm7,%xmm4 517 pshufd $0x55,%xmm7,%xmm5 518 movdqa %xmm4,192-256(%rcx) 519 pshufd $0xaa,%xmm7,%xmm6 520 movdqa %xmm5,208-256(%rcx) 521 pshufd $0xff,%xmm7,%xmm7 522 movdqa %xmm6,224-256(%rcx) 523 movdqa %xmm7,240-256(%rcx) 524 525 pshufd $0x00,%xmm3,%xmm0 526 pshufd $0x55,%xmm3,%xmm1 527 paddd .Linc(%rip),%xmm0 528 pshufd $0xaa,%xmm3,%xmm2 529 movdqa %xmm1,272-256(%rcx) 530 pshufd $0xff,%xmm3,%xmm3 531 movdqa %xmm2,288-256(%rcx) 532 movdqa %xmm3,304-256(%rcx) 533 534 jmp .Loop_enter4x 535 536.align 32 537.Loop_outer4x: 538 movdqa 64(%rsp),%xmm8 539 movdqa 80(%rsp),%xmm9 540 movdqa 96(%rsp),%xmm10 541 movdqa 112(%rsp),%xmm11 542 movdqa 128-256(%rcx),%xmm12 543 movdqa 144-256(%rcx),%xmm13 544 movdqa 160-256(%rcx),%xmm14 545 movdqa 176-256(%rcx),%xmm15 546 movdqa 192-256(%rcx),%xmm4 547 movdqa 208-256(%rcx),%xmm5 548 movdqa 224-256(%rcx),%xmm6 549 movdqa 240-256(%rcx),%xmm7 550 movdqa 256-256(%rcx),%xmm0 551 movdqa 272-256(%rcx),%xmm1 552 movdqa 288-256(%rcx),%xmm2 553 movdqa 304-256(%rcx),%xmm3 554 paddd .Lfour(%rip),%xmm0 
// .Loop4x: ten double rounds, %eax as the round counter.  Two working
// registers at a time are staged through 0..48(%rsp) because the 4x state
// needs more than 16 xmm registers.  ".byte 102,15,56,0,N" is encoded
// pshufb with the rot16/rot24 tables reloaded from (%r10)/(%r11);
// rot-12/rot-7 use pslld/psrld + por pairs as in the 1x path.
555 556.Loop_enter4x: 557 movdqa %xmm6,32(%rsp) 558 movdqa %xmm7,48(%rsp) 559 movdqa (%r10),%xmm7 560 movl $10,%eax 561 movdqa %xmm0,256-256(%rcx) 562 jmp .Loop4x 563 564.align 32 565.Loop4x: 566 paddd %xmm12,%xmm8 567 paddd %xmm13,%xmm9 568 pxor %xmm8,%xmm0 569 pxor %xmm9,%xmm1 570.byte 102,15,56,0,199 571.byte 102,15,56,0,207 572 paddd %xmm0,%xmm4 573 paddd %xmm1,%xmm5 574 pxor %xmm4,%xmm12 575 pxor %xmm5,%xmm13 576 movdqa %xmm12,%xmm6 577 pslld $12,%xmm12 578 psrld $20,%xmm6 579 movdqa %xmm13,%xmm7 580 pslld $12,%xmm13 581 por %xmm6,%xmm12 582 psrld $20,%xmm7 583 movdqa (%r11),%xmm6 584 por %xmm7,%xmm13 585 paddd %xmm12,%xmm8 586 paddd %xmm13,%xmm9 587 pxor %xmm8,%xmm0 588 pxor %xmm9,%xmm1 589.byte 102,15,56,0,198 590.byte 102,15,56,0,206 591 paddd %xmm0,%xmm4 592 paddd %xmm1,%xmm5 593 pxor %xmm4,%xmm12 594 pxor %xmm5,%xmm13 595 movdqa %xmm12,%xmm7 596 pslld $7,%xmm12 597 psrld $25,%xmm7 598 movdqa %xmm13,%xmm6 599 pslld $7,%xmm13 600 por %xmm7,%xmm12 601 psrld $25,%xmm6 602 movdqa (%r10),%xmm7 603 por %xmm6,%xmm13 604 movdqa %xmm4,0(%rsp) 605 movdqa %xmm5,16(%rsp) 606 movdqa 32(%rsp),%xmm4 607 movdqa 48(%rsp),%xmm5 608 paddd %xmm14,%xmm10 609 paddd %xmm15,%xmm11 610 pxor %xmm10,%xmm2 611 pxor %xmm11,%xmm3 612.byte 102,15,56,0,215 613.byte 102,15,56,0,223 614 paddd %xmm2,%xmm4 615 paddd %xmm3,%xmm5 616 pxor %xmm4,%xmm14 617 pxor %xmm5,%xmm15 618 movdqa %xmm14,%xmm6 619 pslld $12,%xmm14 620 psrld $20,%xmm6 621 movdqa %xmm15,%xmm7 622 pslld $12,%xmm15 623 por %xmm6,%xmm14 624 psrld $20,%xmm7 625 movdqa (%r11),%xmm6 626 por %xmm7,%xmm15 627 paddd %xmm14,%xmm10 628 paddd %xmm15,%xmm11 629 pxor %xmm10,%xmm2 630 pxor %xmm11,%xmm3 631.byte 102,15,56,0,214 632.byte 102,15,56,0,222 633 paddd %xmm2,%xmm4 634 paddd %xmm3,%xmm5 635 pxor %xmm4,%xmm14 636 pxor %xmm5,%xmm15 637 movdqa %xmm14,%xmm7 638 pslld $7,%xmm14 639 psrld $25,%xmm7 640 movdqa %xmm15,%xmm6 641 pslld $7,%xmm15 642 por %xmm7,%xmm14 643 psrld $25,%xmm6 644 movdqa (%r10),%xmm7 645 por %xmm6,%xmm15 646 paddd 
%xmm13,%xmm8 647 paddd %xmm14,%xmm9 648 pxor %xmm8,%xmm3 649 pxor %xmm9,%xmm0 650.byte 102,15,56,0,223 651.byte 102,15,56,0,199 652 paddd %xmm3,%xmm4 653 paddd %xmm0,%xmm5 654 pxor %xmm4,%xmm13 655 pxor %xmm5,%xmm14 656 movdqa %xmm13,%xmm6 657 pslld $12,%xmm13 658 psrld $20,%xmm6 659 movdqa %xmm14,%xmm7 660 pslld $12,%xmm14 661 por %xmm6,%xmm13 662 psrld $20,%xmm7 663 movdqa (%r11),%xmm6 664 por %xmm7,%xmm14 665 paddd %xmm13,%xmm8 666 paddd %xmm14,%xmm9 667 pxor %xmm8,%xmm3 668 pxor %xmm9,%xmm0 669.byte 102,15,56,0,222 670.byte 102,15,56,0,198 671 paddd %xmm3,%xmm4 672 paddd %xmm0,%xmm5 673 pxor %xmm4,%xmm13 674 pxor %xmm5,%xmm14 675 movdqa %xmm13,%xmm7 676 pslld $7,%xmm13 677 psrld $25,%xmm7 678 movdqa %xmm14,%xmm6 679 pslld $7,%xmm14 680 por %xmm7,%xmm13 681 psrld $25,%xmm6 682 movdqa (%r10),%xmm7 683 por %xmm6,%xmm14 684 movdqa %xmm4,32(%rsp) 685 movdqa %xmm5,48(%rsp) 686 movdqa 0(%rsp),%xmm4 687 movdqa 16(%rsp),%xmm5 688 paddd %xmm15,%xmm10 689 paddd %xmm12,%xmm11 690 pxor %xmm10,%xmm1 691 pxor %xmm11,%xmm2 692.byte 102,15,56,0,207 693.byte 102,15,56,0,215 694 paddd %xmm1,%xmm4 695 paddd %xmm2,%xmm5 696 pxor %xmm4,%xmm15 697 pxor %xmm5,%xmm12 698 movdqa %xmm15,%xmm6 699 pslld $12,%xmm15 700 psrld $20,%xmm6 701 movdqa %xmm12,%xmm7 702 pslld $12,%xmm12 703 por %xmm6,%xmm15 704 psrld $20,%xmm7 705 movdqa (%r11),%xmm6 706 por %xmm7,%xmm12 707 paddd %xmm15,%xmm10 708 paddd %xmm12,%xmm11 709 pxor %xmm10,%xmm1 710 pxor %xmm11,%xmm2 711.byte 102,15,56,0,206 712.byte 102,15,56,0,214 713 paddd %xmm1,%xmm4 714 paddd %xmm2,%xmm5 715 pxor %xmm4,%xmm15 716 pxor %xmm5,%xmm12 717 movdqa %xmm15,%xmm7 718 pslld $7,%xmm15 719 psrld $25,%xmm7 720 movdqa %xmm12,%xmm6 721 pslld $7,%xmm12 722 por %xmm7,%xmm15 723 psrld $25,%xmm6 724 movdqa (%r10),%xmm7 725 por %xmm6,%xmm12 726 decl %eax 727 jnz .Loop4x 728 729 paddd 64(%rsp),%xmm8 730 paddd 80(%rsp),%xmm9 731 paddd 96(%rsp),%xmm10 732 paddd 112(%rsp),%xmm11 733 734 movdqa %xmm8,%xmm6 735 punpckldq %xmm9,%xmm8 736 movdqa %xmm10,%xmm7 
// After the rounds: add back the saved per-lane input state, then
// transpose each 4x4 dword group (punpckl/hdq + punpckl/hqdq) so the
// keystream comes out in per-block memory order before xoring with input.
737 punpckldq %xmm11,%xmm10 738 punpckhdq %xmm9,%xmm6 739 punpckhdq %xmm11,%xmm7 740 movdqa %xmm8,%xmm9 741 punpcklqdq %xmm10,%xmm8 742 movdqa %xmm6,%xmm11 743 punpcklqdq %xmm7,%xmm6 744 punpckhqdq %xmm10,%xmm9 745 punpckhqdq %xmm7,%xmm11 746 paddd 128-256(%rcx),%xmm12 747 paddd 144-256(%rcx),%xmm13 748 paddd 160-256(%rcx),%xmm14 749 paddd 176-256(%rcx),%xmm15 750 751 movdqa %xmm8,0(%rsp) 752 movdqa %xmm9,16(%rsp) 753 movdqa 32(%rsp),%xmm8 754 movdqa 48(%rsp),%xmm9 755 756 movdqa %xmm12,%xmm10 757 punpckldq %xmm13,%xmm12 758 movdqa %xmm14,%xmm7 759 punpckldq %xmm15,%xmm14 760 punpckhdq %xmm13,%xmm10 761 punpckhdq %xmm15,%xmm7 762 movdqa %xmm12,%xmm13 763 punpcklqdq %xmm14,%xmm12 764 movdqa %xmm10,%xmm15 765 punpcklqdq %xmm7,%xmm10 766 punpckhqdq %xmm14,%xmm13 767 punpckhqdq %xmm7,%xmm15 768 paddd 192-256(%rcx),%xmm4 769 paddd 208-256(%rcx),%xmm5 770 paddd 224-256(%rcx),%xmm8 771 paddd 240-256(%rcx),%xmm9 772 773 movdqa %xmm6,32(%rsp) 774 movdqa %xmm11,48(%rsp) 775 776 movdqa %xmm4,%xmm14 777 punpckldq %xmm5,%xmm4 778 movdqa %xmm8,%xmm7 779 punpckldq %xmm9,%xmm8 780 punpckhdq %xmm5,%xmm14 781 punpckhdq %xmm9,%xmm7 782 movdqa %xmm4,%xmm5 783 punpcklqdq %xmm8,%xmm4 784 movdqa %xmm14,%xmm9 785 punpcklqdq %xmm7,%xmm14 786 punpckhqdq %xmm8,%xmm5 787 punpckhqdq %xmm7,%xmm9 788 paddd 256-256(%rcx),%xmm0 789 paddd 272-256(%rcx),%xmm1 790 paddd 288-256(%rcx),%xmm2 791 paddd 304-256(%rcx),%xmm3 792 793 movdqa %xmm0,%xmm8 794 punpckldq %xmm1,%xmm0 795 movdqa %xmm2,%xmm7 796 punpckldq %xmm3,%xmm2 797 punpckhdq %xmm1,%xmm8 798 punpckhdq %xmm3,%xmm7 799 movdqa %xmm0,%xmm1 800 punpcklqdq %xmm2,%xmm0 801 movdqa %xmm8,%xmm3 802 punpcklqdq %xmm7,%xmm8 803 punpckhqdq %xmm2,%xmm1 804 punpckhqdq %xmm7,%xmm3 805 cmpq $256,%rdx 806 jb .Ltail4x 807 808 movdqu 0(%rsi),%xmm6 809 movdqu 16(%rsi),%xmm11 810 movdqu 32(%rsi),%xmm2 811 movdqu 48(%rsi),%xmm7 812 pxor 0(%rsp),%xmm6 813 pxor %xmm12,%xmm11 814 pxor %xmm4,%xmm2 815 pxor %xmm0,%xmm7 816 817 movdqu %xmm6,0(%rdi) 818 movdqu 
64(%rsi),%xmm6 819 movdqu %xmm11,16(%rdi) 820 movdqu 80(%rsi),%xmm11 821 movdqu %xmm2,32(%rdi) 822 movdqu 96(%rsi),%xmm2 823 movdqu %xmm7,48(%rdi) 824 movdqu 112(%rsi),%xmm7 825 leaq 128(%rsi),%rsi 826 pxor 16(%rsp),%xmm6 827 pxor %xmm13,%xmm11 828 pxor %xmm5,%xmm2 829 pxor %xmm1,%xmm7 830 831 movdqu %xmm6,64(%rdi) 832 movdqu 0(%rsi),%xmm6 833 movdqu %xmm11,80(%rdi) 834 movdqu 16(%rsi),%xmm11 835 movdqu %xmm2,96(%rdi) 836 movdqu 32(%rsi),%xmm2 837 movdqu %xmm7,112(%rdi) 838 leaq 128(%rdi),%rdi 839 movdqu 48(%rsi),%xmm7 840 pxor 32(%rsp),%xmm6 841 pxor %xmm10,%xmm11 842 pxor %xmm14,%xmm2 843 pxor %xmm8,%xmm7 844 845 movdqu %xmm6,0(%rdi) 846 movdqu 64(%rsi),%xmm6 847 movdqu %xmm11,16(%rdi) 848 movdqu 80(%rsi),%xmm11 849 movdqu %xmm2,32(%rdi) 850 movdqu 96(%rsi),%xmm2 851 movdqu %xmm7,48(%rdi) 852 movdqu 112(%rsi),%xmm7 853 leaq 128(%rsi),%rsi 854 pxor 48(%rsp),%xmm6 855 pxor %xmm15,%xmm11 856 pxor %xmm9,%xmm2 857 pxor %xmm3,%xmm7 858 movdqu %xmm6,64(%rdi) 859 movdqu %xmm11,80(%rdi) 860 movdqu %xmm2,96(%rdi) 861 movdqu %xmm7,112(%rdi) 862 leaq 128(%rdi),%rdi 863 864 subq $256,%rdx 865 jnz .Loop_outer4x 866 867 jmp .Ldone4x 868 869.Ltail4x: 870 cmpq $192,%rdx 871 jae .L192_or_more4x 872 cmpq $128,%rdx 873 jae .L128_or_more4x 874 cmpq $64,%rdx 875 jae .L64_or_more4x 876 877 878 xorq %r10,%r10 879 880 movdqa %xmm12,16(%rsp) 881 movdqa %xmm4,32(%rsp) 882 movdqa %xmm0,48(%rsp) 883 jmp .Loop_tail4x 884 885.align 32 886.L64_or_more4x: 887 movdqu 0(%rsi),%xmm6 888 movdqu 16(%rsi),%xmm11 889 movdqu 32(%rsi),%xmm2 890 movdqu 48(%rsi),%xmm7 891 pxor 0(%rsp),%xmm6 892 pxor %xmm12,%xmm11 893 pxor %xmm4,%xmm2 894 pxor %xmm0,%xmm7 895 movdqu %xmm6,0(%rdi) 896 movdqu %xmm11,16(%rdi) 897 movdqu %xmm2,32(%rdi) 898 movdqu %xmm7,48(%rdi) 899 je .Ldone4x 900 901 movdqa 16(%rsp),%xmm6 902 leaq 64(%rsi),%rsi 903 xorq %r10,%r10 904 movdqa %xmm6,0(%rsp) 905 movdqa %xmm13,16(%rsp) 906 leaq 64(%rdi),%rdi 907 movdqa %xmm5,32(%rsp) 908 subq $64,%rdx 909 movdqa %xmm1,48(%rsp) 910 jmp .Loop_tail4x 
// Tail paths: whole 64-byte sub-blocks are xored/stored directly; any
// remainder is staged into 0..48(%rsp) and finished byte-by-byte in
// .Loop_tail4x (%r10 as the byte index, %rdx as the remaining count).
911 912.align 32 913.L128_or_more4x: 914 movdqu 0(%rsi),%xmm6 915 movdqu 16(%rsi),%xmm11 916 movdqu 32(%rsi),%xmm2 917 movdqu 48(%rsi),%xmm7 918 pxor 0(%rsp),%xmm6 919 pxor %xmm12,%xmm11 920 pxor %xmm4,%xmm2 921 pxor %xmm0,%xmm7 922 923 movdqu %xmm6,0(%rdi) 924 movdqu 64(%rsi),%xmm6 925 movdqu %xmm11,16(%rdi) 926 movdqu 80(%rsi),%xmm11 927 movdqu %xmm2,32(%rdi) 928 movdqu 96(%rsi),%xmm2 929 movdqu %xmm7,48(%rdi) 930 movdqu 112(%rsi),%xmm7 931 pxor 16(%rsp),%xmm6 932 pxor %xmm13,%xmm11 933 pxor %xmm5,%xmm2 934 pxor %xmm1,%xmm7 935 movdqu %xmm6,64(%rdi) 936 movdqu %xmm11,80(%rdi) 937 movdqu %xmm2,96(%rdi) 938 movdqu %xmm7,112(%rdi) 939 je .Ldone4x 940 941 movdqa 32(%rsp),%xmm6 942 leaq 128(%rsi),%rsi 943 xorq %r10,%r10 944 movdqa %xmm6,0(%rsp) 945 movdqa %xmm10,16(%rsp) 946 leaq 128(%rdi),%rdi 947 movdqa %xmm14,32(%rsp) 948 subq $128,%rdx 949 movdqa %xmm8,48(%rsp) 950 jmp .Loop_tail4x 951 952.align 32 953.L192_or_more4x: 954 movdqu 0(%rsi),%xmm6 955 movdqu 16(%rsi),%xmm11 956 movdqu 32(%rsi),%xmm2 957 movdqu 48(%rsi),%xmm7 958 pxor 0(%rsp),%xmm6 959 pxor %xmm12,%xmm11 960 pxor %xmm4,%xmm2 961 pxor %xmm0,%xmm7 962 963 movdqu %xmm6,0(%rdi) 964 movdqu 64(%rsi),%xmm6 965 movdqu %xmm11,16(%rdi) 966 movdqu 80(%rsi),%xmm11 967 movdqu %xmm2,32(%rdi) 968 movdqu 96(%rsi),%xmm2 969 movdqu %xmm7,48(%rdi) 970 movdqu 112(%rsi),%xmm7 971 leaq 128(%rsi),%rsi 972 pxor 16(%rsp),%xmm6 973 pxor %xmm13,%xmm11 974 pxor %xmm5,%xmm2 975 pxor %xmm1,%xmm7 976 977 movdqu %xmm6,64(%rdi) 978 movdqu 0(%rsi),%xmm6 979 movdqu %xmm11,80(%rdi) 980 movdqu 16(%rsi),%xmm11 981 movdqu %xmm2,96(%rdi) 982 movdqu 32(%rsi),%xmm2 983 movdqu %xmm7,112(%rdi) 984 leaq 128(%rdi),%rdi 985 movdqu 48(%rsi),%xmm7 986 pxor 32(%rsp),%xmm6 987 pxor %xmm10,%xmm11 988 pxor %xmm14,%xmm2 989 pxor %xmm8,%xmm7 990 movdqu %xmm6,0(%rdi) 991 movdqu %xmm11,16(%rdi) 992 movdqu %xmm2,32(%rdi) 993 movdqu %xmm7,48(%rdi) 994 je .Ldone4x 995 996 movdqa 48(%rsp),%xmm6 997 leaq 64(%rsi),%rsi 998 xorq %r10,%r10 999 movdqa %xmm6,0(%rsp) 
1000 movdqa %xmm15,16(%rsp) 1001 leaq 64(%rdi),%rdi 1002 movdqa %xmm9,32(%rsp) 1003 subq $192,%rdx 1004 movdqa %xmm3,48(%rsp) 1005 1006.Loop_tail4x: 1007 movzbl (%rsi,%r10,1),%eax 1008 movzbl (%rsp,%r10,1),%ecx 1009 leaq 1(%r10),%r10 1010 xorl %ecx,%eax 1011 movb %al,-1(%rdi,%r10,1) 1012 decq %rdx 1013 jnz .Loop_tail4x 1014 1015.Ldone4x: 1016 leaq (%r9),%rsp 1017.cfi_def_cfa_register rsp 1018.L4x_epilogue: 1019 ret 1020.cfi_endproc 1021.size ChaCha20_4x,.-ChaCha20_4x 1022.type ChaCha20_8x,@function 1023.align 32 1024ChaCha20_8x: 1025.LChaCha20_8x: 1026.cfi_startproc 1027 movq %rsp,%r9 1028.cfi_def_cfa_register r9 1029 subq $0x280+8,%rsp 1030 andq $-32,%rsp 1031 vzeroupper 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 vbroadcasti128 .Lsigma(%rip),%ymm11 1043 vbroadcasti128 (%rcx),%ymm3 1044 vbroadcasti128 16(%rcx),%ymm15 1045 vbroadcasti128 (%r8),%ymm7 1046 leaq 256(%rsp),%rcx 1047 leaq 512(%rsp),%rax 1048 leaq .Lrot16(%rip),%r10 1049 leaq .Lrot24(%rip),%r11 1050 1051 vpshufd $0x00,%ymm11,%ymm8 1052 vpshufd $0x55,%ymm11,%ymm9 1053 vmovdqa %ymm8,128-256(%rcx) 1054 vpshufd $0xaa,%ymm11,%ymm10 1055 vmovdqa %ymm9,160-256(%rcx) 1056 vpshufd $0xff,%ymm11,%ymm11 1057 vmovdqa %ymm10,192-256(%rcx) 1058 vmovdqa %ymm11,224-256(%rcx) 1059 1060 vpshufd $0x00,%ymm3,%ymm0 1061 vpshufd $0x55,%ymm3,%ymm1 1062 vmovdqa %ymm0,256-256(%rcx) 1063 vpshufd $0xaa,%ymm3,%ymm2 1064 vmovdqa %ymm1,288-256(%rcx) 1065 vpshufd $0xff,%ymm3,%ymm3 1066 vmovdqa %ymm2,320-256(%rcx) 1067 vmovdqa %ymm3,352-256(%rcx) 1068 1069 vpshufd $0x00,%ymm15,%ymm12 1070 vpshufd $0x55,%ymm15,%ymm13 1071 vmovdqa %ymm12,384-512(%rax) 1072 vpshufd $0xaa,%ymm15,%ymm14 1073 vmovdqa %ymm13,416-512(%rax) 1074 vpshufd $0xff,%ymm15,%ymm15 1075 vmovdqa %ymm14,448-512(%rax) 1076 vmovdqa %ymm15,480-512(%rax) 1077 1078 vpshufd $0x00,%ymm7,%ymm4 1079 vpshufd $0x55,%ymm7,%ymm5 1080 vpaddd .Lincy(%rip),%ymm4,%ymm4 1081 vpshufd $0xaa,%ymm7,%ymm6 1082 vmovdqa %ymm5,544-512(%rax) 1083 vpshufd $0xff,%ymm7,%ymm7 1084 vmovdqa 
%ymm6,576-512(%rax) 1085 vmovdqa %ymm7,608-512(%rax) 1086 1087 jmp .Loop_enter8x 1088 1089.align 32 1090.Loop_outer8x: 1091 vmovdqa 128-256(%rcx),%ymm8 1092 vmovdqa 160-256(%rcx),%ymm9 1093 vmovdqa 192-256(%rcx),%ymm10 1094 vmovdqa 224-256(%rcx),%ymm11 1095 vmovdqa 256-256(%rcx),%ymm0 1096 vmovdqa 288-256(%rcx),%ymm1 1097 vmovdqa 320-256(%rcx),%ymm2 1098 vmovdqa 352-256(%rcx),%ymm3 1099 vmovdqa 384-512(%rax),%ymm12 1100 vmovdqa 416-512(%rax),%ymm13 1101 vmovdqa 448-512(%rax),%ymm14 1102 vmovdqa 480-512(%rax),%ymm15 1103 vmovdqa 512-512(%rax),%ymm4 1104 vmovdqa 544-512(%rax),%ymm5 1105 vmovdqa 576-512(%rax),%ymm6 1106 vmovdqa 608-512(%rax),%ymm7 1107 vpaddd .Leight(%rip),%ymm4,%ymm4 1108 1109.Loop_enter8x: 1110 vmovdqa %ymm14,64(%rsp) 1111 vmovdqa %ymm15,96(%rsp) 1112 vbroadcasti128 (%r10),%ymm15 1113 vmovdqa %ymm4,512-512(%rax) 1114 movl $10,%eax 1115 jmp .Loop8x 1116 1117.align 32 1118.Loop8x: 1119 vpaddd %ymm0,%ymm8,%ymm8 1120 vpxor %ymm4,%ymm8,%ymm4 1121 vpshufb %ymm15,%ymm4,%ymm4 1122 vpaddd %ymm1,%ymm9,%ymm9 1123 vpxor %ymm5,%ymm9,%ymm5 1124 vpshufb %ymm15,%ymm5,%ymm5 1125 vpaddd %ymm4,%ymm12,%ymm12 1126 vpxor %ymm0,%ymm12,%ymm0 1127 vpslld $12,%ymm0,%ymm14 1128 vpsrld $20,%ymm0,%ymm0 1129 vpor %ymm0,%ymm14,%ymm0 1130 vbroadcasti128 (%r11),%ymm14 1131 vpaddd %ymm5,%ymm13,%ymm13 1132 vpxor %ymm1,%ymm13,%ymm1 1133 vpslld $12,%ymm1,%ymm15 1134 vpsrld $20,%ymm1,%ymm1 1135 vpor %ymm1,%ymm15,%ymm1 1136 vpaddd %ymm0,%ymm8,%ymm8 1137 vpxor %ymm4,%ymm8,%ymm4 1138 vpshufb %ymm14,%ymm4,%ymm4 1139 vpaddd %ymm1,%ymm9,%ymm9 1140 vpxor %ymm5,%ymm9,%ymm5 1141 vpshufb %ymm14,%ymm5,%ymm5 1142 vpaddd %ymm4,%ymm12,%ymm12 1143 vpxor %ymm0,%ymm12,%ymm0 1144 vpslld $7,%ymm0,%ymm15 1145 vpsrld $25,%ymm0,%ymm0 1146 vpor %ymm0,%ymm15,%ymm0 1147 vbroadcasti128 (%r10),%ymm15 1148 vpaddd %ymm5,%ymm13,%ymm13 1149 vpxor %ymm1,%ymm13,%ymm1 1150 vpslld $7,%ymm1,%ymm14 1151 vpsrld $25,%ymm1,%ymm1 1152 vpor %ymm1,%ymm14,%ymm1 1153 vmovdqa %ymm12,0(%rsp) 1154 vmovdqa %ymm13,32(%rsp) 1155 vmovdqa 
64(%rsp),%ymm12 1156 vmovdqa 96(%rsp),%ymm13 1157 vpaddd %ymm2,%ymm10,%ymm10 1158 vpxor %ymm6,%ymm10,%ymm6 1159 vpshufb %ymm15,%ymm6,%ymm6 1160 vpaddd %ymm3,%ymm11,%ymm11 1161 vpxor %ymm7,%ymm11,%ymm7 1162 vpshufb %ymm15,%ymm7,%ymm7 1163 vpaddd %ymm6,%ymm12,%ymm12 1164 vpxor %ymm2,%ymm12,%ymm2 1165 vpslld $12,%ymm2,%ymm14 1166 vpsrld $20,%ymm2,%ymm2 1167 vpor %ymm2,%ymm14,%ymm2 1168 vbroadcasti128 (%r11),%ymm14 1169 vpaddd %ymm7,%ymm13,%ymm13 1170 vpxor %ymm3,%ymm13,%ymm3 1171 vpslld $12,%ymm3,%ymm15 1172 vpsrld $20,%ymm3,%ymm3 1173 vpor %ymm3,%ymm15,%ymm3 1174 vpaddd %ymm2,%ymm10,%ymm10 1175 vpxor %ymm6,%ymm10,%ymm6 1176 vpshufb %ymm14,%ymm6,%ymm6 1177 vpaddd %ymm3,%ymm11,%ymm11 1178 vpxor %ymm7,%ymm11,%ymm7 1179 vpshufb %ymm14,%ymm7,%ymm7 1180 vpaddd %ymm6,%ymm12,%ymm12 1181 vpxor %ymm2,%ymm12,%ymm2 1182 vpslld $7,%ymm2,%ymm15 1183 vpsrld $25,%ymm2,%ymm2 1184 vpor %ymm2,%ymm15,%ymm2 1185 vbroadcasti128 (%r10),%ymm15 1186 vpaddd %ymm7,%ymm13,%ymm13 1187 vpxor %ymm3,%ymm13,%ymm3 1188 vpslld $7,%ymm3,%ymm14 1189 vpsrld $25,%ymm3,%ymm3 1190 vpor %ymm3,%ymm14,%ymm3 1191 vpaddd %ymm1,%ymm8,%ymm8 1192 vpxor %ymm7,%ymm8,%ymm7 1193 vpshufb %ymm15,%ymm7,%ymm7 1194 vpaddd %ymm2,%ymm9,%ymm9 1195 vpxor %ymm4,%ymm9,%ymm4 1196 vpshufb %ymm15,%ymm4,%ymm4 1197 vpaddd %ymm7,%ymm12,%ymm12 1198 vpxor %ymm1,%ymm12,%ymm1 1199 vpslld $12,%ymm1,%ymm14 1200 vpsrld $20,%ymm1,%ymm1 1201 vpor %ymm1,%ymm14,%ymm1 1202 vbroadcasti128 (%r11),%ymm14 1203 vpaddd %ymm4,%ymm13,%ymm13 1204 vpxor %ymm2,%ymm13,%ymm2 1205 vpslld $12,%ymm2,%ymm15 1206 vpsrld $20,%ymm2,%ymm2 1207 vpor %ymm2,%ymm15,%ymm2 1208 vpaddd %ymm1,%ymm8,%ymm8 1209 vpxor %ymm7,%ymm8,%ymm7 1210 vpshufb %ymm14,%ymm7,%ymm7 1211 vpaddd %ymm2,%ymm9,%ymm9 1212 vpxor %ymm4,%ymm9,%ymm4 1213 vpshufb %ymm14,%ymm4,%ymm4 1214 vpaddd %ymm7,%ymm12,%ymm12 1215 vpxor %ymm1,%ymm12,%ymm1 1216 vpslld $7,%ymm1,%ymm15 1217 vpsrld $25,%ymm1,%ymm1 1218 vpor %ymm1,%ymm15,%ymm1 1219 vbroadcasti128 (%r10),%ymm15 1220 vpaddd %ymm4,%ymm13,%ymm13 1221 vpxor 
%ymm2,%ymm13,%ymm2 1222 vpslld $7,%ymm2,%ymm14 1223 vpsrld $25,%ymm2,%ymm2 1224 vpor %ymm2,%ymm14,%ymm2 1225 vmovdqa %ymm12,64(%rsp) 1226 vmovdqa %ymm13,96(%rsp) 1227 vmovdqa 0(%rsp),%ymm12 1228 vmovdqa 32(%rsp),%ymm13 1229 vpaddd %ymm3,%ymm10,%ymm10 1230 vpxor %ymm5,%ymm10,%ymm5 1231 vpshufb %ymm15,%ymm5,%ymm5 1232 vpaddd %ymm0,%ymm11,%ymm11 1233 vpxor %ymm6,%ymm11,%ymm6 1234 vpshufb %ymm15,%ymm6,%ymm6 1235 vpaddd %ymm5,%ymm12,%ymm12 1236 vpxor %ymm3,%ymm12,%ymm3 1237 vpslld $12,%ymm3,%ymm14 1238 vpsrld $20,%ymm3,%ymm3 1239 vpor %ymm3,%ymm14,%ymm3 1240 vbroadcasti128 (%r11),%ymm14 1241 vpaddd %ymm6,%ymm13,%ymm13 1242 vpxor %ymm0,%ymm13,%ymm0 1243 vpslld $12,%ymm0,%ymm15 1244 vpsrld $20,%ymm0,%ymm0 1245 vpor %ymm0,%ymm15,%ymm0 1246 vpaddd %ymm3,%ymm10,%ymm10 1247 vpxor %ymm5,%ymm10,%ymm5 1248 vpshufb %ymm14,%ymm5,%ymm5 1249 vpaddd %ymm0,%ymm11,%ymm11 1250 vpxor %ymm6,%ymm11,%ymm6 1251 vpshufb %ymm14,%ymm6,%ymm6 1252 vpaddd %ymm5,%ymm12,%ymm12 1253 vpxor %ymm3,%ymm12,%ymm3 1254 vpslld $7,%ymm3,%ymm15 1255 vpsrld $25,%ymm3,%ymm3 1256 vpor %ymm3,%ymm15,%ymm3 1257 vbroadcasti128 (%r10),%ymm15 1258 vpaddd %ymm6,%ymm13,%ymm13 1259 vpxor %ymm0,%ymm13,%ymm0 1260 vpslld $7,%ymm0,%ymm14 1261 vpsrld $25,%ymm0,%ymm0 1262 vpor %ymm0,%ymm14,%ymm0 1263 decl %eax 1264 jnz .Loop8x 1265 1266 leaq 512(%rsp),%rax 1267 vpaddd 128-256(%rcx),%ymm8,%ymm8 1268 vpaddd 160-256(%rcx),%ymm9,%ymm9 1269 vpaddd 192-256(%rcx),%ymm10,%ymm10 1270 vpaddd 224-256(%rcx),%ymm11,%ymm11 1271 1272 vpunpckldq %ymm9,%ymm8,%ymm14 1273 vpunpckldq %ymm11,%ymm10,%ymm15 1274 vpunpckhdq %ymm9,%ymm8,%ymm8 1275 vpunpckhdq %ymm11,%ymm10,%ymm10 1276 vpunpcklqdq %ymm15,%ymm14,%ymm9 1277 vpunpckhqdq %ymm15,%ymm14,%ymm14 1278 vpunpcklqdq %ymm10,%ymm8,%ymm11 1279 vpunpckhqdq %ymm10,%ymm8,%ymm8 1280 vpaddd 256-256(%rcx),%ymm0,%ymm0 1281 vpaddd 288-256(%rcx),%ymm1,%ymm1 1282 vpaddd 320-256(%rcx),%ymm2,%ymm2 1283 vpaddd 352-256(%rcx),%ymm3,%ymm3 1284 1285 vpunpckldq %ymm1,%ymm0,%ymm10 1286 vpunpckldq %ymm3,%ymm2,%ymm15 1287 
vpunpckhdq %ymm1,%ymm0,%ymm0 1288 vpunpckhdq %ymm3,%ymm2,%ymm2 1289 vpunpcklqdq %ymm15,%ymm10,%ymm1 1290 vpunpckhqdq %ymm15,%ymm10,%ymm10 1291 vpunpcklqdq %ymm2,%ymm0,%ymm3 1292 vpunpckhqdq %ymm2,%ymm0,%ymm0 1293 vperm2i128 $0x20,%ymm1,%ymm9,%ymm15 1294 vperm2i128 $0x31,%ymm1,%ymm9,%ymm1 1295 vperm2i128 $0x20,%ymm10,%ymm14,%ymm9 1296 vperm2i128 $0x31,%ymm10,%ymm14,%ymm10 1297 vperm2i128 $0x20,%ymm3,%ymm11,%ymm14 1298 vperm2i128 $0x31,%ymm3,%ymm11,%ymm3 1299 vperm2i128 $0x20,%ymm0,%ymm8,%ymm11 1300 vperm2i128 $0x31,%ymm0,%ymm8,%ymm0 1301 vmovdqa %ymm15,0(%rsp) 1302 vmovdqa %ymm9,32(%rsp) 1303 vmovdqa 64(%rsp),%ymm15 1304 vmovdqa 96(%rsp),%ymm9 1305 1306 vpaddd 384-512(%rax),%ymm12,%ymm12 1307 vpaddd 416-512(%rax),%ymm13,%ymm13 1308 vpaddd 448-512(%rax),%ymm15,%ymm15 1309 vpaddd 480-512(%rax),%ymm9,%ymm9 1310 1311 vpunpckldq %ymm13,%ymm12,%ymm2 1312 vpunpckldq %ymm9,%ymm15,%ymm8 1313 vpunpckhdq %ymm13,%ymm12,%ymm12 1314 vpunpckhdq %ymm9,%ymm15,%ymm15 1315 vpunpcklqdq %ymm8,%ymm2,%ymm13 1316 vpunpckhqdq %ymm8,%ymm2,%ymm2 1317 vpunpcklqdq %ymm15,%ymm12,%ymm9 1318 vpunpckhqdq %ymm15,%ymm12,%ymm12 1319 vpaddd 512-512(%rax),%ymm4,%ymm4 1320 vpaddd 544-512(%rax),%ymm5,%ymm5 1321 vpaddd 576-512(%rax),%ymm6,%ymm6 1322 vpaddd 608-512(%rax),%ymm7,%ymm7 1323 1324 vpunpckldq %ymm5,%ymm4,%ymm15 1325 vpunpckldq %ymm7,%ymm6,%ymm8 1326 vpunpckhdq %ymm5,%ymm4,%ymm4 1327 vpunpckhdq %ymm7,%ymm6,%ymm6 1328 vpunpcklqdq %ymm8,%ymm15,%ymm5 1329 vpunpckhqdq %ymm8,%ymm15,%ymm15 1330 vpunpcklqdq %ymm6,%ymm4,%ymm7 1331 vpunpckhqdq %ymm6,%ymm4,%ymm4 1332 vperm2i128 $0x20,%ymm5,%ymm13,%ymm8 1333 vperm2i128 $0x31,%ymm5,%ymm13,%ymm5 1334 vperm2i128 $0x20,%ymm15,%ymm2,%ymm13 1335 vperm2i128 $0x31,%ymm15,%ymm2,%ymm15 1336 vperm2i128 $0x20,%ymm7,%ymm9,%ymm2 1337 vperm2i128 $0x31,%ymm7,%ymm9,%ymm7 1338 vperm2i128 $0x20,%ymm4,%ymm12,%ymm9 1339 vperm2i128 $0x31,%ymm4,%ymm12,%ymm4 1340 vmovdqa 0(%rsp),%ymm6 1341 vmovdqa 32(%rsp),%ymm12 1342 1343 cmpq $512,%rdx 1344 jb .Ltail8x 1345 1346 vpxor 
0(%rsi),%ymm6,%ymm6 1347 vpxor 32(%rsi),%ymm8,%ymm8 1348 vpxor 64(%rsi),%ymm1,%ymm1 1349 vpxor 96(%rsi),%ymm5,%ymm5 1350 leaq 128(%rsi),%rsi 1351 vmovdqu %ymm6,0(%rdi) 1352 vmovdqu %ymm8,32(%rdi) 1353 vmovdqu %ymm1,64(%rdi) 1354 vmovdqu %ymm5,96(%rdi) 1355 leaq 128(%rdi),%rdi 1356 1357 vpxor 0(%rsi),%ymm12,%ymm12 1358 vpxor 32(%rsi),%ymm13,%ymm13 1359 vpxor 64(%rsi),%ymm10,%ymm10 1360 vpxor 96(%rsi),%ymm15,%ymm15 1361 leaq 128(%rsi),%rsi 1362 vmovdqu %ymm12,0(%rdi) 1363 vmovdqu %ymm13,32(%rdi) 1364 vmovdqu %ymm10,64(%rdi) 1365 vmovdqu %ymm15,96(%rdi) 1366 leaq 128(%rdi),%rdi 1367 1368 vpxor 0(%rsi),%ymm14,%ymm14 1369 vpxor 32(%rsi),%ymm2,%ymm2 1370 vpxor 64(%rsi),%ymm3,%ymm3 1371 vpxor 96(%rsi),%ymm7,%ymm7 1372 leaq 128(%rsi),%rsi 1373 vmovdqu %ymm14,0(%rdi) 1374 vmovdqu %ymm2,32(%rdi) 1375 vmovdqu %ymm3,64(%rdi) 1376 vmovdqu %ymm7,96(%rdi) 1377 leaq 128(%rdi),%rdi 1378 1379 vpxor 0(%rsi),%ymm11,%ymm11 1380 vpxor 32(%rsi),%ymm9,%ymm9 1381 vpxor 64(%rsi),%ymm0,%ymm0 1382 vpxor 96(%rsi),%ymm4,%ymm4 1383 leaq 128(%rsi),%rsi 1384 vmovdqu %ymm11,0(%rdi) 1385 vmovdqu %ymm9,32(%rdi) 1386 vmovdqu %ymm0,64(%rdi) 1387 vmovdqu %ymm4,96(%rdi) 1388 leaq 128(%rdi),%rdi 1389 1390 subq $512,%rdx 1391 jnz .Loop_outer8x 1392 1393 jmp .Ldone8x 1394 1395.Ltail8x: 1396 cmpq $448,%rdx 1397 jae .L448_or_more8x 1398 cmpq $384,%rdx 1399 jae .L384_or_more8x 1400 cmpq $320,%rdx 1401 jae .L320_or_more8x 1402 cmpq $256,%rdx 1403 jae .L256_or_more8x 1404 cmpq $192,%rdx 1405 jae .L192_or_more8x 1406 cmpq $128,%rdx 1407 jae .L128_or_more8x 1408 cmpq $64,%rdx 1409 jae .L64_or_more8x 1410 1411 xorq %r10,%r10 1412 vmovdqa %ymm6,0(%rsp) 1413 vmovdqa %ymm8,32(%rsp) 1414 jmp .Loop_tail8x 1415 1416.align 32 1417.L64_or_more8x: 1418 vpxor 0(%rsi),%ymm6,%ymm6 1419 vpxor 32(%rsi),%ymm8,%ymm8 1420 vmovdqu %ymm6,0(%rdi) 1421 vmovdqu %ymm8,32(%rdi) 1422 je .Ldone8x 1423 1424 leaq 64(%rsi),%rsi 1425 xorq %r10,%r10 1426 vmovdqa %ymm1,0(%rsp) 1427 leaq 64(%rdi),%rdi 1428 subq $64,%rdx 1429 vmovdqa %ymm5,32(%rsp) 
1430 jmp .Loop_tail8x 1431 1432.align 32 1433.L128_or_more8x: 1434 vpxor 0(%rsi),%ymm6,%ymm6 1435 vpxor 32(%rsi),%ymm8,%ymm8 1436 vpxor 64(%rsi),%ymm1,%ymm1 1437 vpxor 96(%rsi),%ymm5,%ymm5 1438 vmovdqu %ymm6,0(%rdi) 1439 vmovdqu %ymm8,32(%rdi) 1440 vmovdqu %ymm1,64(%rdi) 1441 vmovdqu %ymm5,96(%rdi) 1442 je .Ldone8x 1443 1444 leaq 128(%rsi),%rsi 1445 xorq %r10,%r10 1446 vmovdqa %ymm12,0(%rsp) 1447 leaq 128(%rdi),%rdi 1448 subq $128,%rdx 1449 vmovdqa %ymm13,32(%rsp) 1450 jmp .Loop_tail8x 1451 1452.align 32 1453.L192_or_more8x: 1454 vpxor 0(%rsi),%ymm6,%ymm6 1455 vpxor 32(%rsi),%ymm8,%ymm8 1456 vpxor 64(%rsi),%ymm1,%ymm1 1457 vpxor 96(%rsi),%ymm5,%ymm5 1458 vpxor 128(%rsi),%ymm12,%ymm12 1459 vpxor 160(%rsi),%ymm13,%ymm13 1460 vmovdqu %ymm6,0(%rdi) 1461 vmovdqu %ymm8,32(%rdi) 1462 vmovdqu %ymm1,64(%rdi) 1463 vmovdqu %ymm5,96(%rdi) 1464 vmovdqu %ymm12,128(%rdi) 1465 vmovdqu %ymm13,160(%rdi) 1466 je .Ldone8x 1467 1468 leaq 192(%rsi),%rsi 1469 xorq %r10,%r10 1470 vmovdqa %ymm10,0(%rsp) 1471 leaq 192(%rdi),%rdi 1472 subq $192,%rdx 1473 vmovdqa %ymm15,32(%rsp) 1474 jmp .Loop_tail8x 1475 1476.align 32 1477.L256_or_more8x: 1478 vpxor 0(%rsi),%ymm6,%ymm6 1479 vpxor 32(%rsi),%ymm8,%ymm8 1480 vpxor 64(%rsi),%ymm1,%ymm1 1481 vpxor 96(%rsi),%ymm5,%ymm5 1482 vpxor 128(%rsi),%ymm12,%ymm12 1483 vpxor 160(%rsi),%ymm13,%ymm13 1484 vpxor 192(%rsi),%ymm10,%ymm10 1485 vpxor 224(%rsi),%ymm15,%ymm15 1486 vmovdqu %ymm6,0(%rdi) 1487 vmovdqu %ymm8,32(%rdi) 1488 vmovdqu %ymm1,64(%rdi) 1489 vmovdqu %ymm5,96(%rdi) 1490 vmovdqu %ymm12,128(%rdi) 1491 vmovdqu %ymm13,160(%rdi) 1492 vmovdqu %ymm10,192(%rdi) 1493 vmovdqu %ymm15,224(%rdi) 1494 je .Ldone8x 1495 1496 leaq 256(%rsi),%rsi 1497 xorq %r10,%r10 1498 vmovdqa %ymm14,0(%rsp) 1499 leaq 256(%rdi),%rdi 1500 subq $256,%rdx 1501 vmovdqa %ymm2,32(%rsp) 1502 jmp .Loop_tail8x 1503 1504.align 32 1505.L320_or_more8x: 1506 vpxor 0(%rsi),%ymm6,%ymm6 1507 vpxor 32(%rsi),%ymm8,%ymm8 1508 vpxor 64(%rsi),%ymm1,%ymm1 1509 vpxor 96(%rsi),%ymm5,%ymm5 1510 vpxor 
128(%rsi),%ymm12,%ymm12 1511 vpxor 160(%rsi),%ymm13,%ymm13 1512 vpxor 192(%rsi),%ymm10,%ymm10 1513 vpxor 224(%rsi),%ymm15,%ymm15 1514 vpxor 256(%rsi),%ymm14,%ymm14 1515 vpxor 288(%rsi),%ymm2,%ymm2 1516 vmovdqu %ymm6,0(%rdi) 1517 vmovdqu %ymm8,32(%rdi) 1518 vmovdqu %ymm1,64(%rdi) 1519 vmovdqu %ymm5,96(%rdi) 1520 vmovdqu %ymm12,128(%rdi) 1521 vmovdqu %ymm13,160(%rdi) 1522 vmovdqu %ymm10,192(%rdi) 1523 vmovdqu %ymm15,224(%rdi) 1524 vmovdqu %ymm14,256(%rdi) 1525 vmovdqu %ymm2,288(%rdi) 1526 je .Ldone8x 1527 1528 leaq 320(%rsi),%rsi 1529 xorq %r10,%r10 1530 vmovdqa %ymm3,0(%rsp) 1531 leaq 320(%rdi),%rdi 1532 subq $320,%rdx 1533 vmovdqa %ymm7,32(%rsp) 1534 jmp .Loop_tail8x 1535 1536.align 32 1537.L384_or_more8x: 1538 vpxor 0(%rsi),%ymm6,%ymm6 1539 vpxor 32(%rsi),%ymm8,%ymm8 1540 vpxor 64(%rsi),%ymm1,%ymm1 1541 vpxor 96(%rsi),%ymm5,%ymm5 1542 vpxor 128(%rsi),%ymm12,%ymm12 1543 vpxor 160(%rsi),%ymm13,%ymm13 1544 vpxor 192(%rsi),%ymm10,%ymm10 1545 vpxor 224(%rsi),%ymm15,%ymm15 1546 vpxor 256(%rsi),%ymm14,%ymm14 1547 vpxor 288(%rsi),%ymm2,%ymm2 1548 vpxor 320(%rsi),%ymm3,%ymm3 1549 vpxor 352(%rsi),%ymm7,%ymm7 1550 vmovdqu %ymm6,0(%rdi) 1551 vmovdqu %ymm8,32(%rdi) 1552 vmovdqu %ymm1,64(%rdi) 1553 vmovdqu %ymm5,96(%rdi) 1554 vmovdqu %ymm12,128(%rdi) 1555 vmovdqu %ymm13,160(%rdi) 1556 vmovdqu %ymm10,192(%rdi) 1557 vmovdqu %ymm15,224(%rdi) 1558 vmovdqu %ymm14,256(%rdi) 1559 vmovdqu %ymm2,288(%rdi) 1560 vmovdqu %ymm3,320(%rdi) 1561 vmovdqu %ymm7,352(%rdi) 1562 je .Ldone8x 1563 1564 leaq 384(%rsi),%rsi 1565 xorq %r10,%r10 1566 vmovdqa %ymm11,0(%rsp) 1567 leaq 384(%rdi),%rdi 1568 subq $384,%rdx 1569 vmovdqa %ymm9,32(%rsp) 1570 jmp .Loop_tail8x 1571 1572.align 32 1573.L448_or_more8x: 1574 vpxor 0(%rsi),%ymm6,%ymm6 1575 vpxor 32(%rsi),%ymm8,%ymm8 1576 vpxor 64(%rsi),%ymm1,%ymm1 1577 vpxor 96(%rsi),%ymm5,%ymm5 1578 vpxor 128(%rsi),%ymm12,%ymm12 1579 vpxor 160(%rsi),%ymm13,%ymm13 1580 vpxor 192(%rsi),%ymm10,%ymm10 1581 vpxor 224(%rsi),%ymm15,%ymm15 1582 vpxor 256(%rsi),%ymm14,%ymm14 
1583 vpxor 288(%rsi),%ymm2,%ymm2 1584 vpxor 320(%rsi),%ymm3,%ymm3 1585 vpxor 352(%rsi),%ymm7,%ymm7 1586 vpxor 384(%rsi),%ymm11,%ymm11 1587 vpxor 416(%rsi),%ymm9,%ymm9 1588 vmovdqu %ymm6,0(%rdi) 1589 vmovdqu %ymm8,32(%rdi) 1590 vmovdqu %ymm1,64(%rdi) 1591 vmovdqu %ymm5,96(%rdi) 1592 vmovdqu %ymm12,128(%rdi) 1593 vmovdqu %ymm13,160(%rdi) 1594 vmovdqu %ymm10,192(%rdi) 1595 vmovdqu %ymm15,224(%rdi) 1596 vmovdqu %ymm14,256(%rdi) 1597 vmovdqu %ymm2,288(%rdi) 1598 vmovdqu %ymm3,320(%rdi) 1599 vmovdqu %ymm7,352(%rdi) 1600 vmovdqu %ymm11,384(%rdi) 1601 vmovdqu %ymm9,416(%rdi) 1602 je .Ldone8x 1603 1604 leaq 448(%rsi),%rsi 1605 xorq %r10,%r10 1606 vmovdqa %ymm0,0(%rsp) 1607 leaq 448(%rdi),%rdi 1608 subq $448,%rdx 1609 vmovdqa %ymm4,32(%rsp) 1610 1611.Loop_tail8x: 1612 movzbl (%rsi,%r10,1),%eax 1613 movzbl (%rsp,%r10,1),%ecx 1614 leaq 1(%r10),%r10 1615 xorl %ecx,%eax 1616 movb %al,-1(%rdi,%r10,1) 1617 decq %rdx 1618 jnz .Loop_tail8x 1619 1620.Ldone8x: 1621 vzeroall 1622 leaq (%r9),%rsp 1623.cfi_def_cfa_register rsp 1624.L8x_epilogue: 1625 ret 1626.cfi_endproc 1627.size ChaCha20_8x,.-ChaCha20_8x 1628#endif 1629