// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#include <openssl/asm_base.h>

#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__)
.text
.globl gcm_init_clmul
.hidden gcm_init_clmul
.type gcm_init_clmul,@function
.align 16
gcm_init_clmul:
.cfi_startproc

_CET_ENDBR
.L_init_clmul:
	movdqu (%rsi),%xmm2
	pshufd $78,%xmm2,%xmm2


	pshufd $255,%xmm2,%xmm4
	movdqa %xmm2,%xmm3
	psllq $1,%xmm2
	pxor %xmm5,%xmm5
	psrlq $63,%xmm3
	pcmpgtd %xmm4,%xmm5
	pslldq $8,%xmm3
	por %xmm3,%xmm2


	pand .L0x1c2_polynomial(%rip),%xmm5
	pxor %xmm5,%xmm2


	pshufd $78,%xmm2,%xmm6
	movdqa %xmm2,%xmm0
	pxor %xmm2,%xmm6
	movdqa %xmm0,%xmm1
	pshufd $78,%xmm0,%xmm3
	pxor %xmm0,%xmm3
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
.byte 102,15,58,68,222,0
	pxor %xmm0,%xmm3
	pxor %xmm1,%xmm3

	movdqa %xmm3,%xmm4
	psrldq $8,%xmm3
	pslldq $8,%xmm4
	pxor %xmm3,%xmm1
	pxor %xmm4,%xmm0

	movdqa %xmm0,%xmm4
	movdqa %xmm0,%xmm3
	psllq $5,%xmm0
	pxor %xmm0,%xmm3
	psllq $1,%xmm0
	pxor %xmm3,%xmm0
	psllq $57,%xmm0
	movdqa %xmm0,%xmm3
	pslldq $8,%xmm0
	psrldq $8,%xmm3
	pxor %xmm4,%xmm0
	pxor %xmm3,%xmm1


	movdqa %xmm0,%xmm4
	psrlq $1,%xmm0
	pxor %xmm4,%xmm1
	pxor %xmm0,%xmm4
	psrlq $5,%xmm0
	pxor %xmm4,%xmm0
	psrlq $1,%xmm0
	pxor %xmm1,%xmm0
	pshufd $78,%xmm2,%xmm3
	pshufd $78,%xmm0,%xmm4
	pxor %xmm2,%xmm3
	movdqu %xmm2,0(%rdi)
	pxor %xmm0,%xmm4
	movdqu %xmm0,16(%rdi)
.byte 102,15,58,15,227,8
	movdqu %xmm4,32(%rdi)
	movdqa %xmm0,%xmm1
	pshufd $78,%xmm0,%xmm3
	pxor %xmm0,%xmm3
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
.byte 102,15,58,68,222,0
	pxor %xmm0,%xmm3
	pxor %xmm1,%xmm3

	movdqa %xmm3,%xmm4
	psrldq $8,%xmm3
	pslldq $8,%xmm4
	pxor %xmm3,%xmm1
	pxor %xmm4,%xmm0

	movdqa %xmm0,%xmm4
	movdqa %xmm0,%xmm3
	psllq $5,%xmm0
	pxor %xmm0,%xmm3
	psllq $1,%xmm0
	pxor %xmm3,%xmm0
	psllq $57,%xmm0
	movdqa %xmm0,%xmm3
	pslldq $8,%xmm0
	psrldq $8,%xmm3
	pxor %xmm4,%xmm0
	pxor %xmm3,%xmm1


	movdqa %xmm0,%xmm4
	psrlq $1,%xmm0
	pxor %xmm4,%xmm1
	pxor %xmm0,%xmm4
	psrlq $5,%xmm0
	pxor %xmm4,%xmm0
	psrlq $1,%xmm0
	pxor %xmm1,%xmm0
	movdqa %xmm0,%xmm5
	movdqa %xmm0,%xmm1
	pshufd $78,%xmm0,%xmm3
	pxor %xmm0,%xmm3
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
.byte 102,15,58,68,222,0
	pxor %xmm0,%xmm3
	pxor %xmm1,%xmm3

	movdqa %xmm3,%xmm4
	psrldq $8,%xmm3
	pslldq $8,%xmm4
	pxor %xmm3,%xmm1
	pxor %xmm4,%xmm0

	movdqa %xmm0,%xmm4
	movdqa %xmm0,%xmm3
	psllq $5,%xmm0
	pxor %xmm0,%xmm3
	psllq $1,%xmm0
	pxor %xmm3,%xmm0
	psllq $57,%xmm0
	movdqa %xmm0,%xmm3
	pslldq $8,%xmm0
	psrldq $8,%xmm3
	pxor %xmm4,%xmm0
	pxor %xmm3,%xmm1


	movdqa %xmm0,%xmm4
	psrlq $1,%xmm0
	pxor %xmm4,%xmm1
	pxor %xmm0,%xmm4
	psrlq $5,%xmm0
	pxor %xmm4,%xmm0
	psrlq $1,%xmm0
	pxor %xmm1,%xmm0
	pshufd $78,%xmm5,%xmm3
	pshufd $78,%xmm0,%xmm4
	pxor %xmm5,%xmm3
	movdqu %xmm5,48(%rdi)
	pxor %xmm0,%xmm4
	movdqu %xmm0,64(%rdi)
.byte 102,15,58,15,227,8
	movdqu %xmm4,80(%rdi)
	ret
.cfi_endproc

.size gcm_init_clmul,.-gcm_init_clmul
.globl gcm_gmult_clmul
.hidden gcm_gmult_clmul
.type gcm_gmult_clmul,@function
.align 16
gcm_gmult_clmul:
.cfi_startproc
_CET_ENDBR
.L_gmult_clmul:
	movdqu (%rdi),%xmm0
	movdqa .Lbswap_mask(%rip),%xmm5
	movdqu (%rsi),%xmm2
	movdqu 32(%rsi),%xmm4
.byte 102,15,56,0,197
	movdqa %xmm0,%xmm1
	pshufd $78,%xmm0,%xmm3
	pxor %xmm0,%xmm3
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
.byte 102,15,58,68,220,0
	pxor %xmm0,%xmm3
	pxor %xmm1,%xmm3

	movdqa %xmm3,%xmm4
	psrldq $8,%xmm3
	pslldq $8,%xmm4
	pxor %xmm3,%xmm1
	pxor %xmm4,%xmm0

	movdqa %xmm0,%xmm4
	movdqa %xmm0,%xmm3
	psllq $5,%xmm0
	pxor %xmm0,%xmm3
	psllq $1,%xmm0
	pxor %xmm3,%xmm0
	psllq $57,%xmm0
	movdqa %xmm0,%xmm3
	pslldq $8,%xmm0
	psrldq $8,%xmm3
	pxor %xmm4,%xmm0
	pxor %xmm3,%xmm1


	movdqa %xmm0,%xmm4
	psrlq $1,%xmm0
	pxor %xmm4,%xmm1
	pxor %xmm0,%xmm4
	psrlq $5,%xmm0
	pxor %xmm4,%xmm0
	psrlq $1,%xmm0
	pxor %xmm1,%xmm0
.byte 102,15,56,0,197
	movdqu %xmm0,(%rdi)
	ret
.cfi_endproc
.size gcm_gmult_clmul,.-gcm_gmult_clmul
.globl gcm_ghash_clmul
.hidden gcm_ghash_clmul
.type gcm_ghash_clmul,@function
.align 32
gcm_ghash_clmul:
.cfi_startproc

_CET_ENDBR
.L_ghash_clmul:
	movdqa .Lbswap_mask(%rip),%xmm10

	movdqu (%rdi),%xmm0
	movdqu (%rsi),%xmm2
	movdqu 32(%rsi),%xmm7
.byte 102,65,15,56,0,194

	subq $0x10,%rcx
	jz .Lodd_tail

	movdqu 16(%rsi),%xmm6
	cmpq $0x30,%rcx
	jb .Lskip4x

	subq $0x30,%rcx
	movq $0xA040608020C0E000,%rax
	movdqu 48(%rsi),%xmm14
	movdqu 64(%rsi),%xmm15




	movdqu 48(%rdx),%xmm3
	movdqu 32(%rdx),%xmm11
.byte 102,65,15,56,0,218
.byte 102,69,15,56,0,218
	movdqa %xmm3,%xmm5
	pshufd $78,%xmm3,%xmm4
	pxor %xmm3,%xmm4
.byte 102,15,58,68,218,0
.byte 102,15,58,68,234,17
.byte 102,15,58,68,231,0

	movdqa %xmm11,%xmm13
	pshufd $78,%xmm11,%xmm12
	pxor %xmm11,%xmm12
.byte 102,68,15,58,68,222,0
.byte 102,68,15,58,68,238,17
.byte 102,68,15,58,68,231,16
	xorps %xmm11,%xmm3
	xorps %xmm13,%xmm5
	movups 80(%rsi),%xmm7
	xorps %xmm12,%xmm4

	movdqu 16(%rdx),%xmm11
	movdqu 0(%rdx),%xmm8
.byte 102,69,15,56,0,218
.byte 102,69,15,56,0,194
	movdqa %xmm11,%xmm13
	pshufd $78,%xmm11,%xmm12
	pxor %xmm8,%xmm0
	pxor %xmm11,%xmm12
.byte 102,69,15,58,68,222,0
	movdqa %xmm0,%xmm1
	pshufd $78,%xmm0,%xmm8
	pxor %xmm0,%xmm8
.byte 102,69,15,58,68,238,17
.byte 102,68,15,58,68,231,0
	xorps %xmm11,%xmm3
	xorps %xmm13,%xmm5

	leaq 64(%rdx),%rdx
	subq $0x40,%rcx
	jc .Ltail4x

	jmp .Lmod4_loop
.align 32
.Lmod4_loop:
.byte 102,65,15,58,68,199,0
	xorps %xmm12,%xmm4
	movdqu 48(%rdx),%xmm11
.byte 102,69,15,56,0,218
.byte 102,65,15,58,68,207,17
	xorps %xmm3,%xmm0
	movdqu 32(%rdx),%xmm3
	movdqa %xmm11,%xmm13
.byte 102,68,15,58,68,199,16
	pshufd $78,%xmm11,%xmm12
	xorps %xmm5,%xmm1
	pxor %xmm11,%xmm12
.byte 102,65,15,56,0,218
	movups 32(%rsi),%xmm7
	xorps %xmm4,%xmm8
.byte 102,68,15,58,68,218,0
	pshufd $78,%xmm3,%xmm4

	pxor %xmm0,%xmm8
	movdqa %xmm3,%xmm5
	pxor %xmm1,%xmm8
	pxor %xmm3,%xmm4
	movdqa %xmm8,%xmm9
.byte 102,68,15,58,68,234,17
	pslldq $8,%xmm8
	psrldq $8,%xmm9
	pxor %xmm8,%xmm0
	movdqa .L7_mask(%rip),%xmm8
	pxor %xmm9,%xmm1
.byte 102,76,15,110,200

	pand %xmm0,%xmm8
.byte 102,69,15,56,0,200
	pxor %xmm0,%xmm9
.byte 102,68,15,58,68,231,0
	psllq $57,%xmm9
	movdqa %xmm9,%xmm8
	pslldq $8,%xmm9
.byte 102,15,58,68,222,0
	psrldq $8,%xmm8
	pxor %xmm9,%xmm0
	pxor %xmm8,%xmm1
	movdqu 0(%rdx),%xmm8

	movdqa %xmm0,%xmm9
	psrlq $1,%xmm0
.byte 102,15,58,68,238,17
	xorps %xmm11,%xmm3
	movdqu 16(%rdx),%xmm11
.byte 102,69,15,56,0,218
.byte 102,15,58,68,231,16
	xorps %xmm13,%xmm5
	movups 80(%rsi),%xmm7
.byte 102,69,15,56,0,194
	pxor %xmm9,%xmm1
	pxor %xmm0,%xmm9
	psrlq $5,%xmm0

	movdqa %xmm11,%xmm13
	pxor %xmm12,%xmm4
	pshufd $78,%xmm11,%xmm12
	pxor %xmm9,%xmm0
	pxor %xmm8,%xmm1
	pxor %xmm11,%xmm12
.byte 102,69,15,58,68,222,0
	psrlq $1,%xmm0
	pxor %xmm1,%xmm0
	movdqa %xmm0,%xmm1
.byte 102,69,15,58,68,238,17
	xorps %xmm11,%xmm3
	pshufd $78,%xmm0,%xmm8
	pxor %xmm0,%xmm8

.byte 102,68,15,58,68,231,0
	xorps %xmm13,%xmm5

	leaq 64(%rdx),%rdx
	subq $0x40,%rcx
	jnc .Lmod4_loop

.Ltail4x:
.byte 102,65,15,58,68,199,0
.byte 102,65,15,58,68,207,17
.byte 102,68,15,58,68,199,16
	xorps %xmm12,%xmm4
	xorps %xmm3,%xmm0
	xorps %xmm5,%xmm1
	pxor %xmm0,%xmm1
	pxor %xmm4,%xmm8

	pxor %xmm1,%xmm8
	pxor %xmm0,%xmm1

	movdqa %xmm8,%xmm9
	psrldq $8,%xmm8
	pslldq $8,%xmm9
	pxor %xmm8,%xmm1
	pxor %xmm9,%xmm0

	movdqa %xmm0,%xmm4
	movdqa %xmm0,%xmm3
	psllq $5,%xmm0
	pxor %xmm0,%xmm3
	psllq $1,%xmm0
	pxor %xmm3,%xmm0
	psllq $57,%xmm0
	movdqa %xmm0,%xmm3
	pslldq $8,%xmm0
	psrldq $8,%xmm3
	pxor %xmm4,%xmm0
	pxor %xmm3,%xmm1


	movdqa %xmm0,%xmm4
	psrlq $1,%xmm0
	pxor %xmm4,%xmm1
	pxor %xmm0,%xmm4
	psrlq $5,%xmm0
	pxor %xmm4,%xmm0
	psrlq $1,%xmm0
	pxor %xmm1,%xmm0
	addq $0x40,%rcx
	jz .Ldone
	movdqu 32(%rsi),%xmm7
	subq $0x10,%rcx
	jz .Lodd_tail
.Lskip4x:





	movdqu (%rdx),%xmm8
	movdqu 16(%rdx),%xmm3
.byte 102,69,15,56,0,194
.byte 102,65,15,56,0,218
	pxor %xmm8,%xmm0

	movdqa %xmm3,%xmm5
	pshufd $78,%xmm3,%xmm4
	pxor %xmm3,%xmm4
.byte 102,15,58,68,218,0
.byte 102,15,58,68,234,17
.byte 102,15,58,68,231,0

	leaq 32(%rdx),%rdx
	nop
	subq $0x20,%rcx
	jbe .Leven_tail
	nop
	jmp .Lmod_loop

.align 32
.Lmod_loop:
	movdqa %xmm0,%xmm1
	movdqa %xmm4,%xmm8
	pshufd $78,%xmm0,%xmm4
	pxor %xmm0,%xmm4

.byte 102,15,58,68,198,0
.byte 102,15,58,68,206,17
.byte 102,15,58,68,231,16

	pxor %xmm3,%xmm0
	pxor %xmm5,%xmm1
	movdqu (%rdx),%xmm9
	pxor %xmm0,%xmm8
.byte 102,69,15,56,0,202
	movdqu 16(%rdx),%xmm3

	pxor %xmm1,%xmm8
	pxor %xmm9,%xmm1
	pxor %xmm8,%xmm4
.byte 102,65,15,56,0,218
	movdqa %xmm4,%xmm8
	psrldq $8,%xmm8
	pslldq $8,%xmm4
	pxor %xmm8,%xmm1
	pxor %xmm4,%xmm0

	movdqa %xmm3,%xmm5

	movdqa %xmm0,%xmm9
	movdqa %xmm0,%xmm8
	psllq $5,%xmm0
	pxor %xmm0,%xmm8
.byte 102,15,58,68,218,0
	psllq $1,%xmm0
	pxor %xmm8,%xmm0
	psllq $57,%xmm0
	movdqa %xmm0,%xmm8
	pslldq $8,%xmm0
	psrldq $8,%xmm8
	pxor %xmm9,%xmm0
	pshufd $78,%xmm5,%xmm4
	pxor %xmm8,%xmm1
	pxor %xmm5,%xmm4

	movdqa %xmm0,%xmm9
	psrlq $1,%xmm0
.byte 102,15,58,68,234,17
	pxor %xmm9,%xmm1
	pxor %xmm0,%xmm9
	psrlq $5,%xmm0
	pxor %xmm9,%xmm0
	leaq 32(%rdx),%rdx
	psrlq $1,%xmm0
.byte 102,15,58,68,231,0
	pxor %xmm1,%xmm0

	subq $0x20,%rcx
	ja .Lmod_loop

.Leven_tail:
	movdqa %xmm0,%xmm1
	movdqa %xmm4,%xmm8
	pshufd $78,%xmm0,%xmm4
	pxor %xmm0,%xmm4

.byte 102,15,58,68,198,0
.byte 102,15,58,68,206,17
.byte 102,15,58,68,231,16

	pxor %xmm3,%xmm0
	pxor %xmm5,%xmm1
	pxor %xmm0,%xmm8
	pxor %xmm1,%xmm8
	pxor %xmm8,%xmm4
	movdqa %xmm4,%xmm8
	psrldq $8,%xmm8
	pslldq $8,%xmm4
	pxor %xmm8,%xmm1
	pxor %xmm4,%xmm0

	movdqa %xmm0,%xmm4
	movdqa %xmm0,%xmm3
	psllq $5,%xmm0
	pxor %xmm0,%xmm3
	psllq $1,%xmm0
	pxor %xmm3,%xmm0
	psllq $57,%xmm0
	movdqa %xmm0,%xmm3
	pslldq $8,%xmm0
	psrldq $8,%xmm3
	pxor %xmm4,%xmm0
	pxor %xmm3,%xmm1


	movdqa %xmm0,%xmm4
	psrlq $1,%xmm0
	pxor %xmm4,%xmm1
	pxor %xmm0,%xmm4
	psrlq $5,%xmm0
	pxor %xmm4,%xmm0
	psrlq $1,%xmm0
	pxor %xmm1,%xmm0
	testq %rcx,%rcx
	jnz .Ldone

.Lodd_tail:
	movdqu (%rdx),%xmm8
.byte 102,69,15,56,0,194
	pxor %xmm8,%xmm0
	movdqa %xmm0,%xmm1
	pshufd $78,%xmm0,%xmm3
	pxor %xmm0,%xmm3
.byte 102,15,58,68,194,0
.byte 102,15,58,68,202,17
.byte 102,15,58,68,223,0
	pxor %xmm0,%xmm3
	pxor %xmm1,%xmm3

	movdqa %xmm3,%xmm4
	psrldq $8,%xmm3
	pslldq $8,%xmm4
	pxor %xmm3,%xmm1
	pxor %xmm4,%xmm0

	movdqa %xmm0,%xmm4
	movdqa %xmm0,%xmm3
	psllq $5,%xmm0
	pxor %xmm0,%xmm3
	psllq $1,%xmm0
	pxor %xmm3,%xmm0
	psllq $57,%xmm0
	movdqa %xmm0,%xmm3
	pslldq $8,%xmm0
	psrldq $8,%xmm3
	pxor %xmm4,%xmm0
	pxor %xmm3,%xmm1


	movdqa %xmm0,%xmm4
	psrlq $1,%xmm0
	pxor %xmm4,%xmm1
	pxor %xmm0,%xmm4
	psrlq $5,%xmm0
	pxor %xmm4,%xmm0
	psrlq $1,%xmm0
	pxor %xmm1,%xmm0
.Ldone:
.byte 102,65,15,56,0,194
	movdqu %xmm0,(%rdi)
	ret
.cfi_endproc

.size gcm_ghash_clmul,.-gcm_ghash_clmul
.globl gcm_init_avx
.hidden gcm_init_avx
.type gcm_init_avx,@function
.align 32
gcm_init_avx:
.cfi_startproc
_CET_ENDBR
	vzeroupper

	vmovdqu (%rsi),%xmm2
	vpshufd $78,%xmm2,%xmm2


	vpshufd $255,%xmm2,%xmm4
	vpsrlq $63,%xmm2,%xmm3
	vpsllq $1,%xmm2,%xmm2
	vpxor %xmm5,%xmm5,%xmm5
	vpcmpgtd %xmm4,%xmm5,%xmm5
	vpslldq $8,%xmm3,%xmm3
	vpor %xmm3,%xmm2,%xmm2


	vpand .L0x1c2_polynomial(%rip),%xmm5,%xmm5
	vpxor %xmm5,%xmm2,%xmm2

	vpunpckhqdq %xmm2,%xmm2,%xmm6
	vmovdqa %xmm2,%xmm0
	vpxor %xmm2,%xmm6,%xmm6
	movq $4,%r10
	jmp .Linit_start_avx
.align 32
.Linit_loop_avx:
	vpalignr $8,%xmm3,%xmm4,%xmm5
	vmovdqu %xmm5,-16(%rdi)
	vpunpckhqdq %xmm0,%xmm0,%xmm3
	vpxor %xmm0,%xmm3,%xmm3
	vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1
	vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0
	vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3
	vpxor %xmm0,%xmm1,%xmm4
	vpxor %xmm4,%xmm3,%xmm3

	vpslldq $8,%xmm3,%xmm4
	vpsrldq $8,%xmm3,%xmm3
	vpxor %xmm4,%xmm0,%xmm0
	vpxor %xmm3,%xmm1,%xmm1
	vpsllq $57,%xmm0,%xmm3
	vpsllq $62,%xmm0,%xmm4
	vpxor %xmm3,%xmm4,%xmm4
	vpsllq $63,%xmm0,%xmm3
	vpxor %xmm3,%xmm4,%xmm4
	vpslldq $8,%xmm4,%xmm3
	vpsrldq $8,%xmm4,%xmm4
	vpxor %xmm3,%xmm0,%xmm0
	vpxor %xmm4,%xmm1,%xmm1

	vpsrlq $1,%xmm0,%xmm4
	vpxor %xmm0,%xmm1,%xmm1
	vpxor %xmm4,%xmm0,%xmm0
	vpsrlq $5,%xmm4,%xmm4
	vpxor %xmm4,%xmm0,%xmm0
	vpsrlq $1,%xmm0,%xmm0
	vpxor %xmm1,%xmm0,%xmm0
.Linit_start_avx:
	vmovdqa %xmm0,%xmm5
	vpunpckhqdq %xmm0,%xmm0,%xmm3
	vpxor %xmm0,%xmm3,%xmm3
	vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1
	vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0
	vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3
	vpxor %xmm0,%xmm1,%xmm4
	vpxor %xmm4,%xmm3,%xmm3

	vpslldq $8,%xmm3,%xmm4
	vpsrldq $8,%xmm3,%xmm3
	vpxor %xmm4,%xmm0,%xmm0
	vpxor %xmm3,%xmm1,%xmm1
	vpsllq $57,%xmm0,%xmm3
	vpsllq $62,%xmm0,%xmm4
	vpxor %xmm3,%xmm4,%xmm4
	vpsllq $63,%xmm0,%xmm3
	vpxor %xmm3,%xmm4,%xmm4
	vpslldq $8,%xmm4,%xmm3
	vpsrldq $8,%xmm4,%xmm4
	vpxor %xmm3,%xmm0,%xmm0
	vpxor %xmm4,%xmm1,%xmm1

	vpsrlq $1,%xmm0,%xmm4
	vpxor %xmm0,%xmm1,%xmm1
	vpxor %xmm4,%xmm0,%xmm0
	vpsrlq $5,%xmm4,%xmm4
	vpxor %xmm4,%xmm0,%xmm0
	vpsrlq $1,%xmm0,%xmm0
	vpxor %xmm1,%xmm0,%xmm0
	vpshufd $78,%xmm5,%xmm3
	vpshufd $78,%xmm0,%xmm4
	vpxor %xmm5,%xmm3,%xmm3
	vmovdqu %xmm5,0(%rdi)
	vpxor %xmm0,%xmm4,%xmm4
	vmovdqu %xmm0,16(%rdi)
	leaq 48(%rdi),%rdi
	subq $1,%r10
	jnz .Linit_loop_avx

	vpalignr $8,%xmm4,%xmm3,%xmm5
	vmovdqu %xmm5,-16(%rdi)

	vzeroupper
	ret

.cfi_endproc
.size gcm_init_avx,.-gcm_init_avx
.globl gcm_gmult_avx
.hidden gcm_gmult_avx
.type gcm_gmult_avx,@function
.align 32
gcm_gmult_avx:
.cfi_startproc
_CET_ENDBR
	jmp .L_gmult_clmul
.cfi_endproc
.size gcm_gmult_avx,.-gcm_gmult_avx
.globl gcm_ghash_avx
.hidden gcm_ghash_avx
.type gcm_ghash_avx,@function
.align 32
gcm_ghash_avx:
.cfi_startproc
_CET_ENDBR
	vzeroupper

	vmovdqu (%rdi),%xmm10
	leaq .L0x1c2_polynomial(%rip),%r10
	leaq 64(%rsi),%rsi
	vmovdqu .Lbswap_mask(%rip),%xmm13
	vpshufb %xmm13,%xmm10,%xmm10
	cmpq $0x80,%rcx
	jb .Lshort_avx
	subq $0x80,%rcx

	vmovdqu 112(%rdx),%xmm14
	vmovdqu 0-64(%rsi),%xmm6
	vpshufb %xmm13,%xmm14,%xmm14
	vmovdqu 32-64(%rsi),%xmm7

	vpunpckhqdq %xmm14,%xmm14,%xmm9
	vmovdqu 96(%rdx),%xmm15
	vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
	vpxor %xmm14,%xmm9,%xmm9
	vpshufb %xmm13,%xmm15,%xmm15
	vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
	vmovdqu 16-64(%rsi),%xmm6
	vpunpckhqdq %xmm15,%xmm15,%xmm8
	vmovdqu 80(%rdx),%xmm14
	vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
	vpxor %xmm15,%xmm8,%xmm8

	vpshufb %xmm13,%xmm14,%xmm14
	vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
	vpunpckhqdq %xmm14,%xmm14,%xmm9
	vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
	vmovdqu 48-64(%rsi),%xmm6
	vpxor %xmm14,%xmm9,%xmm9
	vmovdqu 64(%rdx),%xmm15
	vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
	vmovdqu 80-64(%rsi),%xmm7

	vpshufb %xmm13,%xmm15,%xmm15
	vpxor %xmm0,%xmm3,%xmm3
	vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
	vpxor %xmm1,%xmm4,%xmm4
	vpunpckhqdq %xmm15,%xmm15,%xmm8
	vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
	vmovdqu 64-64(%rsi),%xmm6
	vpxor %xmm2,%xmm5,%xmm5
	vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
	vpxor %xmm15,%xmm8,%xmm8

	vmovdqu 48(%rdx),%xmm14
	vpxor %xmm3,%xmm0,%xmm0
	vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
	vpxor %xmm4,%xmm1,%xmm1
	vpshufb %xmm13,%xmm14,%xmm14
	vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
	vmovdqu 96-64(%rsi),%xmm6
	vpxor %xmm5,%xmm2,%xmm2
	vpunpckhqdq %xmm14,%xmm14,%xmm9
	vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
	vmovdqu 128-64(%rsi),%xmm7
	vpxor %xmm14,%xmm9,%xmm9

	vmovdqu 32(%rdx),%xmm15
	vpxor %xmm0,%xmm3,%xmm3
	vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
	vpxor %xmm1,%xmm4,%xmm4
	vpshufb %xmm13,%xmm15,%xmm15
	vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
	vmovdqu 112-64(%rsi),%xmm6
	vpxor %xmm2,%xmm5,%xmm5
	vpunpckhqdq %xmm15,%xmm15,%xmm8
	vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
	vpxor %xmm15,%xmm8,%xmm8

	vmovdqu 16(%rdx),%xmm14
	vpxor %xmm3,%xmm0,%xmm0
	vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
	vpxor %xmm4,%xmm1,%xmm1
	vpshufb %xmm13,%xmm14,%xmm14
	vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
	vmovdqu 144-64(%rsi),%xmm6
	vpxor %xmm5,%xmm2,%xmm2
	vpunpckhqdq %xmm14,%xmm14,%xmm9
	vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
	vmovdqu 176-64(%rsi),%xmm7
	vpxor %xmm14,%xmm9,%xmm9

	vmovdqu (%rdx),%xmm15
	vpxor %xmm0,%xmm3,%xmm3
	vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
	vpxor %xmm1,%xmm4,%xmm4
	vpshufb %xmm13,%xmm15,%xmm15
	vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
	vmovdqu 160-64(%rsi),%xmm6
	vpxor %xmm2,%xmm5,%xmm5
	vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2

	leaq 128(%rdx),%rdx
	cmpq $0x80,%rcx
	jb .Ltail_avx

	vpxor %xmm10,%xmm15,%xmm15
	subq $0x80,%rcx
	jmp .Loop8x_avx

.align 32
.Loop8x_avx:
	vpunpckhqdq %xmm15,%xmm15,%xmm8
	vmovdqu 112(%rdx),%xmm14
	vpxor %xmm0,%xmm3,%xmm3
	vpxor %xmm15,%xmm8,%xmm8
	vpclmulqdq $0x00,%xmm6,%xmm15,%xmm10
	vpshufb %xmm13,%xmm14,%xmm14
	vpxor %xmm1,%xmm4,%xmm4
	vpclmulqdq $0x11,%xmm6,%xmm15,%xmm11
	vmovdqu 0-64(%rsi),%xmm6
	vpunpckhqdq %xmm14,%xmm14,%xmm9
	vpxor %xmm2,%xmm5,%xmm5
	vpclmulqdq $0x00,%xmm7,%xmm8,%xmm12
	vmovdqu 32-64(%rsi),%xmm7
	vpxor %xmm14,%xmm9,%xmm9

	vmovdqu 96(%rdx),%xmm15
	vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
	vpxor %xmm3,%xmm10,%xmm10
	vpshufb %xmm13,%xmm15,%xmm15
	vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
	vxorps %xmm4,%xmm11,%xmm11
	vmovdqu 16-64(%rsi),%xmm6
	vpunpckhqdq %xmm15,%xmm15,%xmm8
	vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
	vpxor %xmm5,%xmm12,%xmm12
	vxorps %xmm15,%xmm8,%xmm8

	vmovdqu 80(%rdx),%xmm14
	vpxor %xmm10,%xmm12,%xmm12
	vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
	vpxor %xmm11,%xmm12,%xmm12
	vpslldq $8,%xmm12,%xmm9
	vpxor %xmm0,%xmm3,%xmm3
	vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
	vpsrldq $8,%xmm12,%xmm12
	vpxor %xmm9,%xmm10,%xmm10
	vmovdqu 48-64(%rsi),%xmm6
	vpshufb %xmm13,%xmm14,%xmm14
	vxorps %xmm12,%xmm11,%xmm11
	vpxor %xmm1,%xmm4,%xmm4
	vpunpckhqdq %xmm14,%xmm14,%xmm9
	vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
	vmovdqu 80-64(%rsi),%xmm7
	vpxor %xmm14,%xmm9,%xmm9
	vpxor %xmm2,%xmm5,%xmm5

	vmovdqu 64(%rdx),%xmm15
	vpalignr $8,%xmm10,%xmm10,%xmm12
	vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
	vpshufb %xmm13,%xmm15,%xmm15
	vpxor %xmm3,%xmm0,%xmm0
	vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
	vmovdqu 64-64(%rsi),%xmm6
	vpunpckhqdq %xmm15,%xmm15,%xmm8
	vpxor %xmm4,%xmm1,%xmm1
	vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
	vxorps %xmm15,%xmm8,%xmm8
	vpxor %xmm5,%xmm2,%xmm2

	vmovdqu 48(%rdx),%xmm14
	vpclmulqdq $0x10,(%r10),%xmm10,%xmm10
	vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
	vpshufb %xmm13,%xmm14,%xmm14
	vpxor %xmm0,%xmm3,%xmm3
	vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
	vmovdqu 96-64(%rsi),%xmm6
	vpunpckhqdq %xmm14,%xmm14,%xmm9
	vpxor %xmm1,%xmm4,%xmm4
	vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
	vmovdqu 128-64(%rsi),%xmm7
	vpxor %xmm14,%xmm9,%xmm9
	vpxor %xmm2,%xmm5,%xmm5

	vmovdqu 32(%rdx),%xmm15
	vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
	vpshufb %xmm13,%xmm15,%xmm15
	vpxor %xmm3,%xmm0,%xmm0
	vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
	vmovdqu 112-64(%rsi),%xmm6
	vpunpckhqdq %xmm15,%xmm15,%xmm8
	vpxor %xmm4,%xmm1,%xmm1
	vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2
	vpxor %xmm15,%xmm8,%xmm8
	vpxor %xmm5,%xmm2,%xmm2
	vxorps %xmm12,%xmm10,%xmm10

	vmovdqu 16(%rdx),%xmm14
	vpalignr $8,%xmm10,%xmm10,%xmm12
	vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3
	vpshufb %xmm13,%xmm14,%xmm14
	vpxor %xmm0,%xmm3,%xmm3
	vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4
	vmovdqu 144-64(%rsi),%xmm6
	vpclmulqdq $0x10,(%r10),%xmm10,%xmm10
	vxorps %xmm11,%xmm12,%xmm12
	vpunpckhqdq %xmm14,%xmm14,%xmm9
	vpxor %xmm1,%xmm4,%xmm4
	vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5
	vmovdqu 176-64(%rsi),%xmm7
	vpxor %xmm14,%xmm9,%xmm9
	vpxor %xmm2,%xmm5,%xmm5

	vmovdqu (%rdx),%xmm15
	vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0
	vpshufb %xmm13,%xmm15,%xmm15
	vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1
	vmovdqu 160-64(%rsi),%xmm6
	vpxor %xmm12,%xmm15,%xmm15
	vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2
	vpxor %xmm10,%xmm15,%xmm15

	leaq 128(%rdx),%rdx
	subq $0x80,%rcx
	jnc .Loop8x_avx

	addq $0x80,%rcx
	jmp .Ltail_no_xor_avx

.align 32
.Lshort_avx:
	vmovdqu -16(%rdx,%rcx,1),%xmm14
	leaq (%rdx,%rcx,1),%rdx
	vmovdqu 0-64(%rsi),%xmm6
	vmovdqu 32-64(%rsi),%xmm7
	vpshufb %xmm13,%xmm14,%xmm15

	vmovdqa %xmm0,%xmm3
	vmovdqa %xmm1,%xmm4
	vmovdqa %xmm2,%xmm5
	subq $0x10,%rcx
	jz .Ltail_avx

	vpunpckhqdq %xmm15,%xmm15,%xmm8
	vpxor %xmm0,%xmm3,%xmm3
	vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
	vpxor %xmm15,%xmm8,%xmm8
	vmovdqu -32(%rdx),%xmm14
	vpxor %xmm1,%xmm4,%xmm4
	vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
	vmovdqu 16-64(%rsi),%xmm6
	vpshufb %xmm13,%xmm14,%xmm15
	vpxor %xmm2,%xmm5,%xmm5
	vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
	vpsrldq $8,%xmm7,%xmm7
	subq $0x10,%rcx
	jz .Ltail_avx

	vpunpckhqdq %xmm15,%xmm15,%xmm8
	vpxor %xmm0,%xmm3,%xmm3
	vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
	vpxor %xmm15,%xmm8,%xmm8
	vmovdqu -48(%rdx),%xmm14
	vpxor %xmm1,%xmm4,%xmm4
	vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
	vmovdqu 48-64(%rsi),%xmm6
	vpshufb %xmm13,%xmm14,%xmm15
	vpxor %xmm2,%xmm5,%xmm5
	vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
	vmovdqu 80-64(%rsi),%xmm7
	subq $0x10,%rcx
	jz .Ltail_avx

	vpunpckhqdq %xmm15,%xmm15,%xmm8
	vpxor %xmm0,%xmm3,%xmm3
	vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
	vpxor %xmm15,%xmm8,%xmm8
	vmovdqu -64(%rdx),%xmm14
	vpxor %xmm1,%xmm4,%xmm4
	vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
	vmovdqu 64-64(%rsi),%xmm6
	vpshufb %xmm13,%xmm14,%xmm15
	vpxor %xmm2,%xmm5,%xmm5
	vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
	vpsrldq $8,%xmm7,%xmm7
	subq $0x10,%rcx
	jz .Ltail_avx

	vpunpckhqdq %xmm15,%xmm15,%xmm8
	vpxor %xmm0,%xmm3,%xmm3
	vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
	vpxor %xmm15,%xmm8,%xmm8
	vmovdqu -80(%rdx),%xmm14
	vpxor %xmm1,%xmm4,%xmm4
	vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
	vmovdqu 96-64(%rsi),%xmm6
	vpshufb %xmm13,%xmm14,%xmm15
	vpxor %xmm2,%xmm5,%xmm5
	vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
	vmovdqu 128-64(%rsi),%xmm7
	subq $0x10,%rcx
	jz .Ltail_avx

	vpunpckhqdq %xmm15,%xmm15,%xmm8
	vpxor %xmm0,%xmm3,%xmm3
	vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
	vpxor %xmm15,%xmm8,%xmm8
	vmovdqu -96(%rdx),%xmm14
	vpxor %xmm1,%xmm4,%xmm4
	vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
	vmovdqu 112-64(%rsi),%xmm6
	vpshufb %xmm13,%xmm14,%xmm15
	vpxor %xmm2,%xmm5,%xmm5
	vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
	vpsrldq $8,%xmm7,%xmm7
	subq $0x10,%rcx
	jz .Ltail_avx

	vpunpckhqdq %xmm15,%xmm15,%xmm8
	vpxor %xmm0,%xmm3,%xmm3
	vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
	vpxor %xmm15,%xmm8,%xmm8
	vmovdqu -112(%rdx),%xmm14
	vpxor %xmm1,%xmm4,%xmm4
	vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
	vmovdqu 144-64(%rsi),%xmm6
	vpshufb %xmm13,%xmm14,%xmm15
	vpxor %xmm2,%xmm5,%xmm5
	vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2
	vmovq 184-64(%rsi),%xmm7
	subq $0x10,%rcx
	jmp .Ltail_avx

.align 32
.Ltail_avx:
	vpxor %xmm10,%xmm15,%xmm15
.Ltail_no_xor_avx:
	vpunpckhqdq %xmm15,%xmm15,%xmm8
	vpxor %xmm0,%xmm3,%xmm3
	vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0
	vpxor %xmm15,%xmm8,%xmm8
	vpxor %xmm1,%xmm4,%xmm4
	vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1
	vpxor %xmm2,%xmm5,%xmm5
	vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2

	vmovdqu (%r10),%xmm12

	vpxor %xmm0,%xmm3,%xmm10
	vpxor %xmm1,%xmm4,%xmm11
	vpxor %xmm2,%xmm5,%xmm5

	vpxor %xmm10,%xmm5,%xmm5
	vpxor %xmm11,%xmm5,%xmm5
	vpslldq $8,%xmm5,%xmm9
	vpsrldq $8,%xmm5,%xmm5
	vpxor %xmm9,%xmm10,%xmm10
	vpxor %xmm5,%xmm11,%xmm11

	vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9
	vpalignr $8,%xmm10,%xmm10,%xmm10
	vpxor %xmm9,%xmm10,%xmm10

	vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9
	vpalignr $8,%xmm10,%xmm10,%xmm10
	vpxor %xmm11,%xmm10,%xmm10
	vpxor %xmm9,%xmm10,%xmm10

	cmpq $0,%rcx
	jne .Lshort_avx

	vpshufb %xmm13,%xmm10,%xmm10
	vmovdqu %xmm10,(%rdi)
	vzeroupper
	ret
.cfi_endproc

.size gcm_ghash_avx,.-gcm_ghash_avx
.section .rodata
.align 64
.Lbswap_mask:
.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.L0x1c2_polynomial:
.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
.L7_mask:
.long 7,0,7,0
.align 64

.byte 71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align 64
.text
#endif