// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#include <openssl/asm_base.h>

#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
#include <openssl/arm_arch.h>

.section	.rodata

.align	5
// Lsigma: the first four state words. Read as little-endian bytes the two
// quads spell the ASCII string "expand 32-byte k".
Lsigma:
.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
// Lone: vector {1,0,0,0}; added to the counter row so only lane 0 is bumped.
Lone:
.long	1,0,0,0
// NUL-terminated ASCII: "ChaCha20 for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2

.text

//----------------------------------------------------------------------
// ChaCha20_ctr32_nohw: ChaCha20 using general-purpose registers only.
//
// Register roles, as used by the code below:
//   x0  output pointer (advanced by 64 after each full block)
//   x1  input pointer  (advanced by 64 after each full block)
//   x2  number of bytes still to process
//   x3  pointer to the 32-byte key
//   x4  pointer to the 16-byte counter block; once that is loaded, x4 is
//       reused as the round-loop counter and as the tail keystream pointer
//
// x19-x28, x29 and x30 are saved in a 96-byte frame and restored before
// every `ret`. x30 doubles as a data register (upper half of the counter
// block), which is safe only because it is reloaded from the frame.
// A further 64 bytes of stack hold the keystream for a final partial block
// and are zeroed before returning.
//
// NOTE(review): x2 == 0 is not special-cased; Loop_tail would then run
// until x2 wraps around. Presumably the C caller never passes a zero
// length -- confirm against the caller.
//----------------------------------------------------------------------
.globl	ChaCha20_ctr32_nohw

.def ChaCha20_ctr32_nohw
   .type 32
.endef
.align	5
ChaCha20_ctr32_nohw:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adrp	x5,Lsigma
	add	x5,x5,:lo12:Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#64			// scratch for the tail keystream block

	// x22..x28,x30 keep the 16 input state words packed two per register.
	ldp	x22,x23,[x5]		// load sigma
	ldp	x24,x25,[x3]		// load key
	ldp	x26,x27,[x3,#16]
	ldp	x28,x30,[x4]		// load counter
#ifdef __AARCH64EB__
	// Big-endian: swap the two 32-bit halves of each loaded pair.
	ror	x24,x24,#32
	ror	x25,x25,#32
	ror	x26,x26,#32
	ror	x27,x27,#32
	ror	x28,x28,#32
	ror	x30,x30,#32
#endif

// One iteration per 64-byte block.
Loop_outer:
	// Working state: one 32-bit word per register in
	// w5-w17 and w19-w21 (x18 is not used).
	mov	w5,w22			// unpack key block
	lsr	x6,x22,#32
	mov	w7,w23
	lsr	x8,x23,#32
	mov	w9,w24
	lsr	x10,x24,#32
	mov	w11,w25
	lsr	x12,x25,#32
	mov	w13,w26
	lsr	x14,x26,#32
	mov	w15,w27
	lsr	x16,x27,#32
	mov	w17,w28
	lsr	x19,x28,#32
	mov	w20,w30
	lsr	x21,x30,#32

	mov	x4,#10				// 10 iterations of the two-round body below
	// Flags set here are consumed by `b.lo Ltail` and `b.hi Loop_outer`
	// further down; nothing in between writes the condition flags.
	subs	x2,x2,#64
Loop:
	sub	x4,x4,#1
	// First half: quarter-rounds on (w5,w9,w13,w17), (w6,w10,w14,w19),
	// (w7,w11,w15,w20), (w8,w12,w16,w21). The right-rotates by
	// 16/20/24/25 are left-rotates by 16/12/8/7 on 32-bit words.
	add	w5,w5,w9
	add	w6,w6,w10
	add	w7,w7,w11
	add	w8,w8,w12
	eor	w17,w17,w5
	eor	w19,w19,w6
	eor	w20,w20,w7
	eor	w21,w21,w8
	ror	w17,w17,#16
	ror	w19,w19,#16
	ror	w20,w20,#16
	ror	w21,w21,#16
	add	w13,w13,w17
	add	w14,w14,w19
	add	w15,w15,w20
	add	w16,w16,w21
	eor	w9,w9,w13
	eor	w10,w10,w14
	eor	w11,w11,w15
	eor	w12,w12,w16
	ror	w9,w9,#20
	ror	w10,w10,#20
	ror	w11,w11,#20
	ror	w12,w12,#20
	add	w5,w5,w9
	add	w6,w6,w10
	add	w7,w7,w11
	add	w8,w8,w12
	eor	w17,w17,w5
	eor	w19,w19,w6
	eor	w20,w20,w7
	eor	w21,w21,w8
	ror	w17,w17,#24
	ror	w19,w19,#24
	ror	w20,w20,#24
	ror	w21,w21,#24
	add	w13,w13,w17
	add	w14,w14,w19
	add	w15,w15,w20
	add	w16,w16,w21
	eor	w9,w9,w13
	eor	w10,w10,w14
	eor	w11,w11,w15
	eor	w12,w12,w16
	ror	w9,w9,#25
	ror	w10,w10,#25
	ror	w11,w11,#25
	ror	w12,w12,#25
	// Second half: same operations on the rotated grouping
	// (w5,w10,w15,w21), (w6,w11,w16,w17), (w7,w12,w13,w19), (w8,w9,w14,w20).
	add	w5,w5,w10
	add	w6,w6,w11
	add	w7,w7,w12
	add	w8,w8,w9
	eor	w21,w21,w5
	eor	w17,w17,w6
	eor	w19,w19,w7
	eor	w20,w20,w8
	ror	w21,w21,#16
	ror	w17,w17,#16
	ror	w19,w19,#16
	ror	w20,w20,#16
	add	w15,w15,w21
	add	w16,w16,w17
	add	w13,w13,w19
	add	w14,w14,w20
	eor	w10,w10,w15
	eor	w11,w11,w16
	eor	w12,w12,w13
	eor	w9,w9,w14
	ror	w10,w10,#20
	ror	w11,w11,#20
	ror	w12,w12,#20
	ror	w9,w9,#20
	add	w5,w5,w10
	add	w6,w6,w11
	add	w7,w7,w12
	add	w8,w8,w9
	eor	w21,w21,w5
	eor	w17,w17,w6
	eor	w19,w19,w7
	eor	w20,w20,w8
	ror	w21,w21,#24
	ror	w17,w17,#24
	ror	w19,w19,#24
	ror	w20,w20,#24
	add	w15,w15,w21
	add	w16,w16,w17
	add	w13,w13,w19
	add	w14,w14,w20
	eor	w10,w10,w15
	eor	w11,w11,w16
	eor	w12,w12,w13
	eor	w9,w9,w14
	ror	w10,w10,#25
	ror	w11,w11,#25
	ror	w12,w12,#25
	ror	w9,w9,#25
	cbnz	x4,Loop

	// Add the original input words back into the permuted state.
	add	w5,w5,w22		// accumulate key block
	add	x6,x6,x22,lsr#32
	add	w7,w7,w23
	add	x8,x8,x23,lsr#32
	add	w9,w9,w24
	add	x10,x10,x24,lsr#32
	add	w11,w11,w25
	add	x12,x12,x25,lsr#32
	add	w13,w13,w26
	add	x14,x14,x26,lsr#32
	add	w15,w15,w27
	add	x16,x16,x27,lsr#32
	add	w17,w17,w28
	add	x19,x19,x28,lsr#32
	add	w20,w20,w30
	add	x21,x21,x30,lsr#32

	b.lo	Ltail				// fewer than 64 bytes were left (flags from subs)

	// Full block: re-pack word pairs into 64-bit registers while loading
	// 64 input bytes into the registers that have just been freed.
	add	x5,x5,x6,lsl#32	// pack
	add	x7,x7,x8,lsl#32
	ldp	x6,x8,[x1,#0]		// load input
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	ldp	x10,x12,[x1,#16]
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	ldp	x14,x16,[x1,#32]
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
	ldp	x19,x21,[x1,#48]
	add	x1,x1,#64
#ifdef __AARCH64EB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	eor	x5,x5,x6
	eor	x7,x7,x8
	eor	x9,x9,x10
	eor	x11,x11,x12
	eor	x13,x13,x14
	eor	x15,x15,x16
	eor	x17,x17,x19
	eor	x20,x20,x21

	stp	x5,x7,[x0,#0]		// store output
	// NOTE(review): this is a 64-bit add on the packed word pair, so a carry
	// out of the low 32-bit word would spill into the adjacent word.
	// Presumably callers prevent the 32-bit counter from wrapping -- confirm.
	add	x28,x28,#1			// increment counter
	stp	x9,x11,[x0,#16]
	stp	x13,x15,[x0,#32]
	stp	x17,x20,[x0,#48]
	add	x0,x0,#64

	b.hi	Loop_outer			// more input remains (flags from subs)

	// Exactly consumed: restore callee-saved registers and return.
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret

.align	4
Ltail:
	add	x2,x2,#64			// undo the subs: x2 = bytes left (< 64)
// Also entered from Ltail_neon in ChaCha20_ctr32_neon, which uses the same
// frame layout and the same 64-byte stack scratch.
Less_than_64:
	// Bias the pointers so one negative index x2 (-len .. -1) walks all
	// three buffers. x0 gets an extra -1 because x2 is incremented before
	// the store in Loop_tail.
	sub	x0,x0,#1
	add	x1,x1,x2
	add	x0,x0,x2
	add	x4,sp,x2
	neg	x2,x2

	add	x5,x5,x6,lsl#32	// pack
	add	x7,x7,x8,lsl#32
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
#ifdef __AARCH64EB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	// Spill the keystream block to the stack scratch area.
	stp	x5,x7,[sp,#0]
	stp	x9,x11,[sp,#16]
	stp	x13,x15,[sp,#32]
	stp	x17,x20,[sp,#48]

// Byte-at-a-time XOR of input with the spilled keystream.
Loop_tail:
	ldrb	w10,[x1,x2]
	ldrb	w11,[x4,x2]
	add	x2,x2,#1
	eor	w10,w10,w11
	strb	w10,[x0,x2]
	cbnz	x2,Loop_tail

	// Wipe the keystream copy from the stack.
	stp	xzr,xzr,[sp,#0]
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret


//----------------------------------------------------------------------
// ChaCha20_ctr32_neon: ChaCha20 with NEON; x0-x4 have the same roles as in
// ChaCha20_ctr32_nohw.
//
// Lengths of 512 bytes or more branch to L512_or_more_neon, a label inside
// ChaCha20_512_neon. Shorter inputs are handled here, 256 bytes per outer
// iteration: one block in general-purpose registers (scheduled exactly as
// in the scalar routine) interleaved with three blocks in NEON registers:
//   block A: v0-v3     block B: v4-v7     block C: v16-v19
// v24-v26 hold sigma and the key, v27/v28/v29 the counter rows for blocks
// A/B/C, v31 the counter step, and v20-v23 are temporaries.
// Only v0-v7 and v16-v31 are touched on this path, so no vector registers
// are saved.
//----------------------------------------------------------------------
.globl	ChaCha20_ctr32_neon

.def ChaCha20_ctr32_neon
   .type 32
.endef
.align	5
ChaCha20_ctr32_neon:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adrp	x5,Lsigma
	add	x5,x5,:lo12:Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	cmp	x2,#512
	b.hs	L512_or_more_neon		// frame is already set up; x5 = Lsigma

	sub	sp,sp,#64			// scratch for the tail keystream block

	ldp	x22,x23,[x5]		// load sigma
	ld1	{v24.4s},[x5],#16		// post-increment leaves x5 pointing at Lone
	ldp	x24,x25,[x3]		// load key
	ldp	x26,x27,[x3,#16]
	ld1	{v25.4s,v26.4s},[x3]
	ldp	x28,x30,[x4]		// load counter
	ld1	{v27.4s},[x4]
	ld1	{v31.4s},[x5]			// v31 = {1,0,0,0}
#ifdef __AARCH64EB__
	rev64	v24.4s,v24.4s
	ror	x24,x24,#32
	ror	x25,x25,#32
	ror	x26,x26,#32
	ror	x27,x27,#32
	ror	x28,x28,#32
	ror	x30,x30,#32
#endif
	// The scalar block uses the counter as loaded (x28); the NEON blocks
	// use counter+1, +2 and +3.
	add	v27.4s,v27.4s,v31.4s		// += 1
	add	v28.4s,v27.4s,v31.4s
	add	v29.4s,v28.4s,v31.4s
	shl	v31.4s,v31.4s,#2			// 1 -> 4

// One iteration per 256 bytes (1 scalar + 3 NEON blocks).
Loop_outer_neon:
	mov	w5,w22			// unpack key block
	lsr	x6,x22,#32
	mov	v0.16b,v24.16b
	mov	w7,w23
	lsr	x8,x23,#32
	mov	v4.16b,v24.16b
	mov	w9,w24
	lsr	x10,x24,#32
	mov	v16.16b,v24.16b
	mov	w11,w25
	mov	v1.16b,v25.16b
	lsr	x12,x25,#32
	mov	v5.16b,v25.16b
	mov	w13,w26
	mov	v17.16b,v25.16b
	lsr	x14,x26,#32
	mov	v3.16b,v27.16b
	mov	w15,w27
	mov	v7.16b,v28.16b
	lsr	x16,x27,#32
	mov	v19.16b,v29.16b
	mov	w17,w28
	mov	v2.16b,v26.16b
	lsr	x19,x28,#32
	mov	v6.16b,v26.16b
	mov	w20,w30
	mov	v18.16b,v26.16b
	lsr	x21,x30,#32

	mov	x4,#10
	// Flags set here are consumed by `b.lo Ltail_neon` and
	// `b.hi Loop_outer_neon`; nothing in between writes the flags.
	subs	x2,x2,#256
Loop_neon:
	sub	x4,x4,#1
	// NEON rotations in this loop:
	//   rev32 on .8h lanes swaps the halfwords of each word (rotate by 16);
	//   ushr #(32-n) followed by sli #n builds a left-rotate by n.
	add	v0.4s,v0.4s,v1.4s
	add	w5,w5,w9
	add	v4.4s,v4.4s,v5.4s
	add	w6,w6,w10
	add	v16.4s,v16.4s,v17.4s
	add	w7,w7,w11
	eor	v3.16b,v3.16b,v0.16b
	add	w8,w8,w12
	eor	v7.16b,v7.16b,v4.16b
	eor	w17,w17,w5
	eor	v19.16b,v19.16b,v16.16b
	eor	w19,w19,w6
	rev32	v3.8h,v3.8h
	eor	w20,w20,w7
	rev32	v7.8h,v7.8h
	eor	w21,w21,w8
	rev32	v19.8h,v19.8h
	ror	w17,w17,#16
	add	v2.4s,v2.4s,v3.4s
	ror	w19,w19,#16
	add	v6.4s,v6.4s,v7.4s
	ror	w20,w20,#16
	add	v18.4s,v18.4s,v19.4s
	ror	w21,w21,#16
	eor	v20.16b,v1.16b,v2.16b
	add	w13,w13,w17
	eor	v21.16b,v5.16b,v6.16b
	add	w14,w14,w19
	eor	v22.16b,v17.16b,v18.16b
	add	w15,w15,w20
	ushr	v1.4s,v20.4s,#20
	add	w16,w16,w21
	ushr	v5.4s,v21.4s,#20
	eor	w9,w9,w13
	ushr	v17.4s,v22.4s,#20
	eor	w10,w10,w14
	sli	v1.4s,v20.4s,#12
	eor	w11,w11,w15
	sli	v5.4s,v21.4s,#12
	eor	w12,w12,w16
	sli	v17.4s,v22.4s,#12
	ror	w9,w9,#20
	add	v0.4s,v0.4s,v1.4s
	ror	w10,w10,#20
	add	v4.4s,v4.4s,v5.4s
	ror	w11,w11,#20
	add	v16.4s,v16.4s,v17.4s
	ror	w12,w12,#20
	eor	v20.16b,v3.16b,v0.16b
	add	w5,w5,w9
	eor	v21.16b,v7.16b,v4.16b
	add	w6,w6,w10
	eor	v22.16b,v19.16b,v16.16b
	add	w7,w7,w11
	ushr	v3.4s,v20.4s,#24
	add	w8,w8,w12
	ushr	v7.4s,v21.4s,#24
	eor	w17,w17,w5
	ushr	v19.4s,v22.4s,#24
	eor	w19,w19,w6
	sli	v3.4s,v20.4s,#8
	eor	w20,w20,w7
	sli	v7.4s,v21.4s,#8
	eor	w21,w21,w8
	sli	v19.4s,v22.4s,#8
	ror	w17,w17,#24
	add	v2.4s,v2.4s,v3.4s
	ror	w19,w19,#24
	add	v6.4s,v6.4s,v7.4s
	ror	w20,w20,#24
	add	v18.4s,v18.4s,v19.4s
	ror	w21,w21,#24
	eor	v20.16b,v1.16b,v2.16b
	add	w13,w13,w17
	eor	v21.16b,v5.16b,v6.16b
	add	w14,w14,w19
	eor	v22.16b,v17.16b,v18.16b
	add	w15,w15,w20
	ushr	v1.4s,v20.4s,#25
	add	w16,w16,w21
	ushr	v5.4s,v21.4s,#25
	eor	w9,w9,w13
	ushr	v17.4s,v22.4s,#25
	eor	w10,w10,w14
	sli	v1.4s,v20.4s,#7
	eor	w11,w11,w15
	sli	v5.4s,v21.4s,#7
	eor	w12,w12,w16
	sli	v17.4s,v22.4s,#7
	ror	w9,w9,#25
	// Rotate rows 2, 3 and 1 of each NEON block by 8, 12 and 4 bytes so the
	// second half operates on the rotated lane grouping.
	ext	v2.16b,v2.16b,v2.16b,#8
	ror	w10,w10,#25
	ext	v6.16b,v6.16b,v6.16b,#8
	ror	w11,w11,#25
	ext	v18.16b,v18.16b,v18.16b,#8
	ror	w12,w12,#25
	ext	v3.16b,v3.16b,v3.16b,#12
	ext	v7.16b,v7.16b,v7.16b,#12
	ext	v19.16b,v19.16b,v19.16b,#12
	ext	v1.16b,v1.16b,v1.16b,#4
	ext	v5.16b,v5.16b,v5.16b,#4
	ext	v17.16b,v17.16b,v17.16b,#4
	add	v0.4s,v0.4s,v1.4s
	add	w5,w5,w10
	add	v4.4s,v4.4s,v5.4s
	add	w6,w6,w11
	add	v16.4s,v16.4s,v17.4s
	add	w7,w7,w12
	eor	v3.16b,v3.16b,v0.16b
	add	w8,w8,w9
	eor	v7.16b,v7.16b,v4.16b
	eor	w21,w21,w5
	eor	v19.16b,v19.16b,v16.16b
	eor	w17,w17,w6
	rev32	v3.8h,v3.8h
	eor	w19,w19,w7
	rev32	v7.8h,v7.8h
	eor	w20,w20,w8
	rev32	v19.8h,v19.8h
	ror	w21,w21,#16
	add	v2.4s,v2.4s,v3.4s
	ror	w17,w17,#16
	add	v6.4s,v6.4s,v7.4s
	ror	w19,w19,#16
	add	v18.4s,v18.4s,v19.4s
	ror	w20,w20,#16
	eor	v20.16b,v1.16b,v2.16b
	add	w15,w15,w21
	eor	v21.16b,v5.16b,v6.16b
	add	w16,w16,w17
	eor	v22.16b,v17.16b,v18.16b
	add	w13,w13,w19
	ushr	v1.4s,v20.4s,#20
	add	w14,w14,w20
	ushr	v5.4s,v21.4s,#20
	eor	w10,w10,w15
	ushr	v17.4s,v22.4s,#20
	eor	w11,w11,w16
	sli	v1.4s,v20.4s,#12
	eor	w12,w12,w13
	sli	v5.4s,v21.4s,#12
	eor	w9,w9,w14
	sli	v17.4s,v22.4s,#12
	ror	w10,w10,#20
	add	v0.4s,v0.4s,v1.4s
	ror	w11,w11,#20
	add	v4.4s,v4.4s,v5.4s
	ror	w12,w12,#20
	add	v16.4s,v16.4s,v17.4s
	ror	w9,w9,#20
	eor	v20.16b,v3.16b,v0.16b
	add	w5,w5,w10
	eor	v21.16b,v7.16b,v4.16b
	add	w6,w6,w11
	eor	v22.16b,v19.16b,v16.16b
	add	w7,w7,w12
	ushr	v3.4s,v20.4s,#24
	add	w8,w8,w9
	ushr	v7.4s,v21.4s,#24
	eor	w21,w21,w5
	ushr	v19.4s,v22.4s,#24
	eor	w17,w17,w6
	sli	v3.4s,v20.4s,#8
	eor	w19,w19,w7
	sli	v7.4s,v21.4s,#8
	eor	w20,w20,w8
	sli	v19.4s,v22.4s,#8
	ror	w21,w21,#24
	add	v2.4s,v2.4s,v3.4s
	ror	w17,w17,#24
	add	v6.4s,v6.4s,v7.4s
	ror	w19,w19,#24
	add	v18.4s,v18.4s,v19.4s
	ror	w20,w20,#24
	eor	v20.16b,v1.16b,v2.16b
	add	w15,w15,w21
	eor	v21.16b,v5.16b,v6.16b
	add	w16,w16,w17
	eor	v22.16b,v17.16b,v18.16b
	add	w13,w13,w19
	ushr	v1.4s,v20.4s,#25
	add	w14,w14,w20
	ushr	v5.4s,v21.4s,#25
	eor	w10,w10,w15
	ushr	v17.4s,v22.4s,#25
	eor	w11,w11,w16
	sli	v1.4s,v20.4s,#7
	eor	w12,w12,w13
	sli	v5.4s,v21.4s,#7
	eor	w9,w9,w14
	sli	v17.4s,v22.4s,#7
	ror	w10,w10,#25
	// Undo the earlier lane rotation (8, 4 and 12 bytes this time).
	ext	v2.16b,v2.16b,v2.16b,#8
	ror	w11,w11,#25
	ext	v6.16b,v6.16b,v6.16b,#8
	ror	w12,w12,#25
	ext	v18.16b,v18.16b,v18.16b,#8
	ror	w9,w9,#25
	ext	v3.16b,v3.16b,v3.16b,#4
	ext	v7.16b,v7.16b,v7.16b,#4
	ext	v19.16b,v19.16b,v19.16b,#4
	ext	v1.16b,v1.16b,v1.16b,#12
	ext	v5.16b,v5.16b,v5.16b,#12
	ext	v17.16b,v17.16b,v17.16b,#12
	cbnz	x4,Loop_neon

	// Add the input state back in: sigma (v24), key (v25, v26) and each
	// block's own counter row (v27/v28/v29).
	add	w5,w5,w22		// accumulate key block
	add	v0.4s,v0.4s,v24.4s
	add	x6,x6,x22,lsr#32
	add	v4.4s,v4.4s,v24.4s
	add	w7,w7,w23
	add	v16.4s,v16.4s,v24.4s
	add	x8,x8,x23,lsr#32
	add	v2.4s,v2.4s,v26.4s
	add	w9,w9,w24
	add	v6.4s,v6.4s,v26.4s
	add	x10,x10,x24,lsr#32
	add	v18.4s,v18.4s,v26.4s
	add	w11,w11,w25
	add	v3.4s,v3.4s,v27.4s
	add	x12,x12,x25,lsr#32
	add	w13,w13,w26
	add	v7.4s,v7.4s,v28.4s
	add	x14,x14,x26,lsr#32
	add	w15,w15,w27
	add	v19.4s,v19.4s,v29.4s
	add	x16,x16,x27,lsr#32
	add	w17,w17,w28
	add	v1.4s,v1.4s,v25.4s
	add	x19,x19,x28,lsr#32
	add	w20,w20,w30
	add	v5.4s,v5.4s,v25.4s
	add	x21,x21,x30,lsr#32
	add	v17.4s,v17.4s,v25.4s

	b.lo	Ltail_neon			// fewer than 256 bytes were left

	add	x5,x5,x6,lsl#32	// pack
	add	x7,x7,x8,lsl#32
	ldp	x6,x8,[x1,#0]		// load input
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	ldp	x10,x12,[x1,#16]
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	ldp	x14,x16,[x1,#32]
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
	ldp	x19,x21,[x1,#48]
	add	x1,x1,#64
#ifdef __AARCH64EB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
	eor	x5,x5,x6
	eor	x7,x7,x8
	eor	x9,x9,x10
	eor	x11,x11,x12
	eor	x13,x13,x14
	eor	v0.16b,v0.16b,v20.16b
	eor	x15,x15,x16
	eor	v1.16b,v1.16b,v21.16b
	eor	x17,x17,x19
	eor	v2.16b,v2.16b,v22.16b
	eor	x20,x20,x21
	eor	v3.16b,v3.16b,v23.16b
	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64

	stp	x5,x7,[x0,#0]		// store output
	add	x28,x28,#4			// increment counter
	stp	x9,x11,[x0,#16]
	add	v27.4s,v27.4s,v31.4s		// += 4
	stp	x13,x15,[x0,#32]
	add	v28.4s,v28.4s,v31.4s
	stp	x17,x20,[x0,#48]
	add	v29.4s,v29.4s,v31.4s
	add	x0,x0,#64

	// v0-v3 are stored first, then reused as the input buffer for block C.
	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64

	eor	v4.16b,v4.16b,v20.16b
	eor	v5.16b,v5.16b,v21.16b
	eor	v6.16b,v6.16b,v22.16b
	eor	v7.16b,v7.16b,v23.16b
	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64

	eor	v16.16b,v16.16b,v0.16b
	eor	v17.16b,v17.16b,v1.16b
	eor	v18.16b,v18.16b,v2.16b
	eor	v19.16b,v19.16b,v3.16b
	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64

	b.hi	Loop_outer_neon		// more input remains (flags from subs)

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret

// Fewer than 256 bytes remain. Whole 64-byte blocks are emitted in the order
// scalar, A (v0-v3), B (v4-v7); a trailing partial block then takes the next
// unused keystream block. After each `cmp x2,#64`, the later `b.eq` reuses
// the same flags: equal means that block was the last one.
Ltail_neon:
	add	x2,x2,#256			// undo the subs: x2 = bytes left (< 256)
	cmp	x2,#64
	b.lo	Less_than_64			// shared byte-wise tail in ChaCha20_ctr32_nohw

	add	x5,x5,x6,lsl#32	// pack
	add	x7,x7,x8,lsl#32
	ldp	x6,x8,[x1,#0]		// load input
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	ldp	x10,x12,[x1,#16]
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	ldp	x14,x16,[x1,#32]
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
	ldp	x19,x21,[x1,#48]
	add	x1,x1,#64
#ifdef __AARCH64EB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	eor	x5,x5,x6
	eor	x7,x7,x8
	eor	x9,x9,x10
	eor	x11,x11,x12
	eor	x13,x13,x14
	eor	x15,x15,x16
	eor	x17,x17,x19
	eor	x20,x20,x21

	stp	x5,x7,[x0,#0]		// store output
	add	x28,x28,#4			// increment counter
	stp	x9,x11,[x0,#16]
	stp	x13,x15,[x0,#32]
	stp	x17,x20,[x0,#48]
	add	x0,x0,#64
	b.eq	Ldone_neon
	sub	x2,x2,#64
	cmp	x2,#64
	b.lo	Less_than_128

	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
	eor	v0.16b,v0.16b,v20.16b
	eor	v1.16b,v1.16b,v21.16b
	eor	v2.16b,v2.16b,v22.16b
	eor	v3.16b,v3.16b,v23.16b
	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
	b.eq	Ldone_neon
	sub	x2,x2,#64
	cmp	x2,#64
	b.lo	Less_than_192

	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
	eor	v4.16b,v4.16b,v20.16b
	eor	v5.16b,v5.16b,v21.16b
	eor	v6.16b,v6.16b,v22.16b
	eor	v7.16b,v7.16b,v23.16b
	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
	b.eq	Ldone_neon
	sub	x2,x2,#64

	// Partial fourth block: spill block C's keystream to the stack scratch.
	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[sp]
	b	Last_neon

Less_than_128:
	// Partial second block: spill block A's keystream.
	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[sp]
	b	Last_neon
Less_than_192:
	// Partial third block: spill block B's keystream.
	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[sp]
	b	Last_neon

.align	4
Last_neon:
	// Same negative-index byte loop set-up as Less_than_64.
	sub	x0,x0,#1
	add	x1,x1,x2
	add	x0,x0,x2
	add	x4,sp,x2
	neg	x2,x2

Loop_tail_neon:
	ldrb	w10,[x1,x2]
	ldrb	w11,[x4,x2]
	add	x2,x2,#1
	eor	w10,w10,w11
	strb	w10,[x0,x2]
	cbnz	x2,Loop_tail_neon

	// Wipe the keystream copy from the stack.
	stp	xzr,xzr,[sp,#0]
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

Ldone_neon:
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret

// ChaCha20_512_neon: not declared .globl here. It opens with the same
// 96-byte frame push as the two routines above.
.def ChaCha20_512_neon
   .type 32
.endef
.align	5
ChaCha20_512_neon:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-96]!
809 add x29,sp,#0 810 811 adrp x5,Lsigma 812 add x5,x5,:lo12:Lsigma 813 stp x19,x20,[sp,#16] 814 stp x21,x22,[sp,#32] 815 stp x23,x24,[sp,#48] 816 stp x25,x26,[sp,#64] 817 stp x27,x28,[sp,#80] 818 819L512_or_more_neon: 820 sub sp,sp,#128+64 821 822 ldp x22,x23,[x5] // load sigma 823 ld1 {v24.4s},[x5],#16 824 ldp x24,x25,[x3] // load key 825 ldp x26,x27,[x3,#16] 826 ld1 {v25.4s,v26.4s},[x3] 827 ldp x28,x30,[x4] // load counter 828 ld1 {v27.4s},[x4] 829 ld1 {v31.4s},[x5] 830#ifdef __AARCH64EB__ 831 rev64 v24.4s,v24.4s 832 ror x24,x24,#32 833 ror x25,x25,#32 834 ror x26,x26,#32 835 ror x27,x27,#32 836 ror x28,x28,#32 837 ror x30,x30,#32 838#endif 839 add v27.4s,v27.4s,v31.4s // += 1 840 stp q24,q25,[sp,#0] // off-load key block, invariant part 841 add v27.4s,v27.4s,v31.4s // not typo 842 str q26,[sp,#32] 843 add v28.4s,v27.4s,v31.4s 844 add v29.4s,v28.4s,v31.4s 845 add v30.4s,v29.4s,v31.4s 846 shl v31.4s,v31.4s,#2 // 1 -> 4 847 848 stp d8,d9,[sp,#128+0] // meet ABI requirements 849 stp d10,d11,[sp,#128+16] 850 stp d12,d13,[sp,#128+32] 851 stp d14,d15,[sp,#128+48] 852 853 sub x2,x2,#512 // not typo 854 855Loop_outer_512_neon: 856 mov v0.16b,v24.16b 857 mov v4.16b,v24.16b 858 mov v8.16b,v24.16b 859 mov v12.16b,v24.16b 860 mov v16.16b,v24.16b 861 mov v20.16b,v24.16b 862 mov v1.16b,v25.16b 863 mov w5,w22 // unpack key block 864 mov v5.16b,v25.16b 865 lsr x6,x22,#32 866 mov v9.16b,v25.16b 867 mov w7,w23 868 mov v13.16b,v25.16b 869 lsr x8,x23,#32 870 mov v17.16b,v25.16b 871 mov w9,w24 872 mov v21.16b,v25.16b 873 lsr x10,x24,#32 874 mov v3.16b,v27.16b 875 mov w11,w25 876 mov v7.16b,v28.16b 877 lsr x12,x25,#32 878 mov v11.16b,v29.16b 879 mov w13,w26 880 mov v15.16b,v30.16b 881 lsr x14,x26,#32 882 mov v2.16b,v26.16b 883 mov w15,w27 884 mov v6.16b,v26.16b 885 lsr x16,x27,#32 886 add v19.4s,v3.4s,v31.4s // +4 887 mov w17,w28 888 add v23.4s,v7.4s,v31.4s // +4 889 lsr x19,x28,#32 890 mov v10.16b,v26.16b 891 mov w20,w30 892 mov v14.16b,v26.16b 893 lsr x21,x30,#32 894 mov 
v18.16b,v26.16b 895 stp q27,q28,[sp,#48] // off-load key block, variable part 896 mov v22.16b,v26.16b 897 str q29,[sp,#80] 898 899 mov x4,#5 900 subs x2,x2,#512 901Loop_upper_neon: 902 sub x4,x4,#1 903 add v0.4s,v0.4s,v1.4s 904 add w5,w5,w9 905 add v4.4s,v4.4s,v5.4s 906 add w6,w6,w10 907 add v8.4s,v8.4s,v9.4s 908 add w7,w7,w11 909 add v12.4s,v12.4s,v13.4s 910 add w8,w8,w12 911 add v16.4s,v16.4s,v17.4s 912 eor w17,w17,w5 913 add v20.4s,v20.4s,v21.4s 914 eor w19,w19,w6 915 eor v3.16b,v3.16b,v0.16b 916 eor w20,w20,w7 917 eor v7.16b,v7.16b,v4.16b 918 eor w21,w21,w8 919 eor v11.16b,v11.16b,v8.16b 920 ror w17,w17,#16 921 eor v15.16b,v15.16b,v12.16b 922 ror w19,w19,#16 923 eor v19.16b,v19.16b,v16.16b 924 ror w20,w20,#16 925 eor v23.16b,v23.16b,v20.16b 926 ror w21,w21,#16 927 rev32 v3.8h,v3.8h 928 add w13,w13,w17 929 rev32 v7.8h,v7.8h 930 add w14,w14,w19 931 rev32 v11.8h,v11.8h 932 add w15,w15,w20 933 rev32 v15.8h,v15.8h 934 add w16,w16,w21 935 rev32 v19.8h,v19.8h 936 eor w9,w9,w13 937 rev32 v23.8h,v23.8h 938 eor w10,w10,w14 939 add v2.4s,v2.4s,v3.4s 940 eor w11,w11,w15 941 add v6.4s,v6.4s,v7.4s 942 eor w12,w12,w16 943 add v10.4s,v10.4s,v11.4s 944 ror w9,w9,#20 945 add v14.4s,v14.4s,v15.4s 946 ror w10,w10,#20 947 add v18.4s,v18.4s,v19.4s 948 ror w11,w11,#20 949 add v22.4s,v22.4s,v23.4s 950 ror w12,w12,#20 951 eor v24.16b,v1.16b,v2.16b 952 add w5,w5,w9 953 eor v25.16b,v5.16b,v6.16b 954 add w6,w6,w10 955 eor v26.16b,v9.16b,v10.16b 956 add w7,w7,w11 957 eor v27.16b,v13.16b,v14.16b 958 add w8,w8,w12 959 eor v28.16b,v17.16b,v18.16b 960 eor w17,w17,w5 961 eor v29.16b,v21.16b,v22.16b 962 eor w19,w19,w6 963 ushr v1.4s,v24.4s,#20 964 eor w20,w20,w7 965 ushr v5.4s,v25.4s,#20 966 eor w21,w21,w8 967 ushr v9.4s,v26.4s,#20 968 ror w17,w17,#24 969 ushr v13.4s,v27.4s,#20 970 ror w19,w19,#24 971 ushr v17.4s,v28.4s,#20 972 ror w20,w20,#24 973 ushr v21.4s,v29.4s,#20 974 ror w21,w21,#24 975 sli v1.4s,v24.4s,#12 976 add w13,w13,w17 977 sli v5.4s,v25.4s,#12 978 add w14,w14,w19 979 sli 
v9.4s,v26.4s,#12 980 add w15,w15,w20 981 sli v13.4s,v27.4s,#12 982 add w16,w16,w21 983 sli v17.4s,v28.4s,#12 984 eor w9,w9,w13 985 sli v21.4s,v29.4s,#12 986 eor w10,w10,w14 987 add v0.4s,v0.4s,v1.4s 988 eor w11,w11,w15 989 add v4.4s,v4.4s,v5.4s 990 eor w12,w12,w16 991 add v8.4s,v8.4s,v9.4s 992 ror w9,w9,#25 993 add v12.4s,v12.4s,v13.4s 994 ror w10,w10,#25 995 add v16.4s,v16.4s,v17.4s 996 ror w11,w11,#25 997 add v20.4s,v20.4s,v21.4s 998 ror w12,w12,#25 999 eor v24.16b,v3.16b,v0.16b 1000 add w5,w5,w10 1001 eor v25.16b,v7.16b,v4.16b 1002 add w6,w6,w11 1003 eor v26.16b,v11.16b,v8.16b 1004 add w7,w7,w12 1005 eor v27.16b,v15.16b,v12.16b 1006 add w8,w8,w9 1007 eor v28.16b,v19.16b,v16.16b 1008 eor w21,w21,w5 1009 eor v29.16b,v23.16b,v20.16b 1010 eor w17,w17,w6 1011 ushr v3.4s,v24.4s,#24 1012 eor w19,w19,w7 1013 ushr v7.4s,v25.4s,#24 1014 eor w20,w20,w8 1015 ushr v11.4s,v26.4s,#24 1016 ror w21,w21,#16 1017 ushr v15.4s,v27.4s,#24 1018 ror w17,w17,#16 1019 ushr v19.4s,v28.4s,#24 1020 ror w19,w19,#16 1021 ushr v23.4s,v29.4s,#24 1022 ror w20,w20,#16 1023 sli v3.4s,v24.4s,#8 1024 add w15,w15,w21 1025 sli v7.4s,v25.4s,#8 1026 add w16,w16,w17 1027 sli v11.4s,v26.4s,#8 1028 add w13,w13,w19 1029 sli v15.4s,v27.4s,#8 1030 add w14,w14,w20 1031 sli v19.4s,v28.4s,#8 1032 eor w10,w10,w15 1033 sli v23.4s,v29.4s,#8 1034 eor w11,w11,w16 1035 add v2.4s,v2.4s,v3.4s 1036 eor w12,w12,w13 1037 add v6.4s,v6.4s,v7.4s 1038 eor w9,w9,w14 1039 add v10.4s,v10.4s,v11.4s 1040 ror w10,w10,#20 1041 add v14.4s,v14.4s,v15.4s 1042 ror w11,w11,#20 1043 add v18.4s,v18.4s,v19.4s 1044 ror w12,w12,#20 1045 add v22.4s,v22.4s,v23.4s 1046 ror w9,w9,#20 1047 eor v24.16b,v1.16b,v2.16b 1048 add w5,w5,w10 1049 eor v25.16b,v5.16b,v6.16b 1050 add w6,w6,w11 1051 eor v26.16b,v9.16b,v10.16b 1052 add w7,w7,w12 1053 eor v27.16b,v13.16b,v14.16b 1054 add w8,w8,w9 1055 eor v28.16b,v17.16b,v18.16b 1056 eor w21,w21,w5 1057 eor v29.16b,v21.16b,v22.16b 1058 eor w17,w17,w6 1059 ushr v1.4s,v24.4s,#25 1060 eor w19,w19,w7 1061 ushr 
v5.4s,v25.4s,#25 1062 eor w20,w20,w8 1063 ushr v9.4s,v26.4s,#25 1064 ror w21,w21,#24 1065 ushr v13.4s,v27.4s,#25 1066 ror w17,w17,#24 1067 ushr v17.4s,v28.4s,#25 1068 ror w19,w19,#24 1069 ushr v21.4s,v29.4s,#25 1070 ror w20,w20,#24 1071 sli v1.4s,v24.4s,#7 1072 add w15,w15,w21 1073 sli v5.4s,v25.4s,#7 1074 add w16,w16,w17 1075 sli v9.4s,v26.4s,#7 1076 add w13,w13,w19 1077 sli v13.4s,v27.4s,#7 1078 add w14,w14,w20 1079 sli v17.4s,v28.4s,#7 1080 eor w10,w10,w15 1081 sli v21.4s,v29.4s,#7 1082 eor w11,w11,w16 1083 ext v2.16b,v2.16b,v2.16b,#8 1084 eor w12,w12,w13 1085 ext v6.16b,v6.16b,v6.16b,#8 1086 eor w9,w9,w14 1087 ext v10.16b,v10.16b,v10.16b,#8 1088 ror w10,w10,#25 1089 ext v14.16b,v14.16b,v14.16b,#8 1090 ror w11,w11,#25 1091 ext v18.16b,v18.16b,v18.16b,#8 1092 ror w12,w12,#25 1093 ext v22.16b,v22.16b,v22.16b,#8 1094 ror w9,w9,#25 1095 ext v3.16b,v3.16b,v3.16b,#12 1096 ext v7.16b,v7.16b,v7.16b,#12 1097 ext v11.16b,v11.16b,v11.16b,#12 1098 ext v15.16b,v15.16b,v15.16b,#12 1099 ext v19.16b,v19.16b,v19.16b,#12 1100 ext v23.16b,v23.16b,v23.16b,#12 1101 ext v1.16b,v1.16b,v1.16b,#4 1102 ext v5.16b,v5.16b,v5.16b,#4 1103 ext v9.16b,v9.16b,v9.16b,#4 1104 ext v13.16b,v13.16b,v13.16b,#4 1105 ext v17.16b,v17.16b,v17.16b,#4 1106 ext v21.16b,v21.16b,v21.16b,#4 1107 add v0.4s,v0.4s,v1.4s 1108 add w5,w5,w9 1109 add v4.4s,v4.4s,v5.4s 1110 add w6,w6,w10 1111 add v8.4s,v8.4s,v9.4s 1112 add w7,w7,w11 1113 add v12.4s,v12.4s,v13.4s 1114 add w8,w8,w12 1115 add v16.4s,v16.4s,v17.4s 1116 eor w17,w17,w5 1117 add v20.4s,v20.4s,v21.4s 1118 eor w19,w19,w6 1119 eor v3.16b,v3.16b,v0.16b 1120 eor w20,w20,w7 1121 eor v7.16b,v7.16b,v4.16b 1122 eor w21,w21,w8 1123 eor v11.16b,v11.16b,v8.16b 1124 ror w17,w17,#16 1125 eor v15.16b,v15.16b,v12.16b 1126 ror w19,w19,#16 1127 eor v19.16b,v19.16b,v16.16b 1128 ror w20,w20,#16 1129 eor v23.16b,v23.16b,v20.16b 1130 ror w21,w21,#16 1131 rev32 v3.8h,v3.8h 1132 add w13,w13,w17 1133 rev32 v7.8h,v7.8h 1134 add w14,w14,w19 1135 rev32 v11.8h,v11.8h 1136 add 
w15,w15,w20 1137 rev32 v15.8h,v15.8h 1138 add w16,w16,w21 1139 rev32 v19.8h,v19.8h 1140 eor w9,w9,w13 1141 rev32 v23.8h,v23.8h 1142 eor w10,w10,w14 1143 add v2.4s,v2.4s,v3.4s 1144 eor w11,w11,w15 1145 add v6.4s,v6.4s,v7.4s 1146 eor w12,w12,w16 1147 add v10.4s,v10.4s,v11.4s 1148 ror w9,w9,#20 1149 add v14.4s,v14.4s,v15.4s 1150 ror w10,w10,#20 1151 add v18.4s,v18.4s,v19.4s 1152 ror w11,w11,#20 1153 add v22.4s,v22.4s,v23.4s 1154 ror w12,w12,#20 1155 eor v24.16b,v1.16b,v2.16b 1156 add w5,w5,w9 1157 eor v25.16b,v5.16b,v6.16b 1158 add w6,w6,w10 1159 eor v26.16b,v9.16b,v10.16b 1160 add w7,w7,w11 1161 eor v27.16b,v13.16b,v14.16b 1162 add w8,w8,w12 1163 eor v28.16b,v17.16b,v18.16b 1164 eor w17,w17,w5 1165 eor v29.16b,v21.16b,v22.16b 1166 eor w19,w19,w6 1167 ushr v1.4s,v24.4s,#20 1168 eor w20,w20,w7 1169 ushr v5.4s,v25.4s,#20 1170 eor w21,w21,w8 1171 ushr v9.4s,v26.4s,#20 1172 ror w17,w17,#24 1173 ushr v13.4s,v27.4s,#20 1174 ror w19,w19,#24 1175 ushr v17.4s,v28.4s,#20 1176 ror w20,w20,#24 1177 ushr v21.4s,v29.4s,#20 1178 ror w21,w21,#24 1179 sli v1.4s,v24.4s,#12 1180 add w13,w13,w17 1181 sli v5.4s,v25.4s,#12 1182 add w14,w14,w19 1183 sli v9.4s,v26.4s,#12 1184 add w15,w15,w20 1185 sli v13.4s,v27.4s,#12 1186 add w16,w16,w21 1187 sli v17.4s,v28.4s,#12 1188 eor w9,w9,w13 1189 sli v21.4s,v29.4s,#12 1190 eor w10,w10,w14 1191 add v0.4s,v0.4s,v1.4s 1192 eor w11,w11,w15 1193 add v4.4s,v4.4s,v5.4s 1194 eor w12,w12,w16 1195 add v8.4s,v8.4s,v9.4s 1196 ror w9,w9,#25 1197 add v12.4s,v12.4s,v13.4s 1198 ror w10,w10,#25 1199 add v16.4s,v16.4s,v17.4s 1200 ror w11,w11,#25 1201 add v20.4s,v20.4s,v21.4s 1202 ror w12,w12,#25 1203 eor v24.16b,v3.16b,v0.16b 1204 add w5,w5,w10 1205 eor v25.16b,v7.16b,v4.16b 1206 add w6,w6,w11 1207 eor v26.16b,v11.16b,v8.16b 1208 add w7,w7,w12 1209 eor v27.16b,v15.16b,v12.16b 1210 add w8,w8,w9 1211 eor v28.16b,v19.16b,v16.16b 1212 eor w21,w21,w5 1213 eor v29.16b,v23.16b,v20.16b 1214 eor w17,w17,w6 1215 ushr v3.4s,v24.4s,#24 1216 eor w19,w19,w7 1217 ushr 
v7.4s,v25.4s,#24 1218 eor w20,w20,w8 1219 ushr v11.4s,v26.4s,#24 1220 ror w21,w21,#16 1221 ushr v15.4s,v27.4s,#24 1222 ror w17,w17,#16 1223 ushr v19.4s,v28.4s,#24 1224 ror w19,w19,#16 1225 ushr v23.4s,v29.4s,#24 1226 ror w20,w20,#16 1227 sli v3.4s,v24.4s,#8 1228 add w15,w15,w21 1229 sli v7.4s,v25.4s,#8 1230 add w16,w16,w17 1231 sli v11.4s,v26.4s,#8 1232 add w13,w13,w19 1233 sli v15.4s,v27.4s,#8 1234 add w14,w14,w20 1235 sli v19.4s,v28.4s,#8 1236 eor w10,w10,w15 1237 sli v23.4s,v29.4s,#8 1238 eor w11,w11,w16 1239 add v2.4s,v2.4s,v3.4s 1240 eor w12,w12,w13 1241 add v6.4s,v6.4s,v7.4s 1242 eor w9,w9,w14 1243 add v10.4s,v10.4s,v11.4s 1244 ror w10,w10,#20 1245 add v14.4s,v14.4s,v15.4s 1246 ror w11,w11,#20 1247 add v18.4s,v18.4s,v19.4s 1248 ror w12,w12,#20 1249 add v22.4s,v22.4s,v23.4s 1250 ror w9,w9,#20 1251 eor v24.16b,v1.16b,v2.16b 1252 add w5,w5,w10 1253 eor v25.16b,v5.16b,v6.16b 1254 add w6,w6,w11 1255 eor v26.16b,v9.16b,v10.16b 1256 add w7,w7,w12 1257 eor v27.16b,v13.16b,v14.16b 1258 add w8,w8,w9 1259 eor v28.16b,v17.16b,v18.16b 1260 eor w21,w21,w5 1261 eor v29.16b,v21.16b,v22.16b 1262 eor w17,w17,w6 1263 ushr v1.4s,v24.4s,#25 1264 eor w19,w19,w7 1265 ushr v5.4s,v25.4s,#25 1266 eor w20,w20,w8 1267 ushr v9.4s,v26.4s,#25 1268 ror w21,w21,#24 1269 ushr v13.4s,v27.4s,#25 1270 ror w17,w17,#24 1271 ushr v17.4s,v28.4s,#25 1272 ror w19,w19,#24 1273 ushr v21.4s,v29.4s,#25 1274 ror w20,w20,#24 1275 sli v1.4s,v24.4s,#7 1276 add w15,w15,w21 1277 sli v5.4s,v25.4s,#7 1278 add w16,w16,w17 1279 sli v9.4s,v26.4s,#7 1280 add w13,w13,w19 1281 sli v13.4s,v27.4s,#7 1282 add w14,w14,w20 1283 sli v17.4s,v28.4s,#7 1284 eor w10,w10,w15 1285 sli v21.4s,v29.4s,#7 1286 eor w11,w11,w16 1287 ext v2.16b,v2.16b,v2.16b,#8 1288 eor w12,w12,w13 1289 ext v6.16b,v6.16b,v6.16b,#8 1290 eor w9,w9,w14 1291 ext v10.16b,v10.16b,v10.16b,#8 1292 ror w10,w10,#25 1293 ext v14.16b,v14.16b,v14.16b,#8 1294 ror w11,w11,#25 1295 ext v18.16b,v18.16b,v18.16b,#8 1296 ror w12,w12,#25 1297 ext v22.16b,v22.16b,v22.16b,#8 
1298 ror w9,w9,#25 1299 ext v3.16b,v3.16b,v3.16b,#4 1300 ext v7.16b,v7.16b,v7.16b,#4 1301 ext v11.16b,v11.16b,v11.16b,#4 1302 ext v15.16b,v15.16b,v15.16b,#4 1303 ext v19.16b,v19.16b,v19.16b,#4 1304 ext v23.16b,v23.16b,v23.16b,#4 1305 ext v1.16b,v1.16b,v1.16b,#12 1306 ext v5.16b,v5.16b,v5.16b,#12 1307 ext v9.16b,v9.16b,v9.16b,#12 1308 ext v13.16b,v13.16b,v13.16b,#12 1309 ext v17.16b,v17.16b,v17.16b,#12 1310 ext v21.16b,v21.16b,v21.16b,#12 1311 cbnz x4,Loop_upper_neon 1312 1313 add w5,w5,w22 // accumulate key block 1314 add x6,x6,x22,lsr#32 1315 add w7,w7,w23 1316 add x8,x8,x23,lsr#32 1317 add w9,w9,w24 1318 add x10,x10,x24,lsr#32 1319 add w11,w11,w25 1320 add x12,x12,x25,lsr#32 1321 add w13,w13,w26 1322 add x14,x14,x26,lsr#32 1323 add w15,w15,w27 1324 add x16,x16,x27,lsr#32 1325 add w17,w17,w28 1326 add x19,x19,x28,lsr#32 1327 add w20,w20,w30 1328 add x21,x21,x30,lsr#32 1329 1330 add x5,x5,x6,lsl#32 // pack 1331 add x7,x7,x8,lsl#32 1332 ldp x6,x8,[x1,#0] // load input 1333 add x9,x9,x10,lsl#32 1334 add x11,x11,x12,lsl#32 1335 ldp x10,x12,[x1,#16] 1336 add x13,x13,x14,lsl#32 1337 add x15,x15,x16,lsl#32 1338 ldp x14,x16,[x1,#32] 1339 add x17,x17,x19,lsl#32 1340 add x20,x20,x21,lsl#32 1341 ldp x19,x21,[x1,#48] 1342 add x1,x1,#64 1343#ifdef __AARCH64EB__ 1344 rev x5,x5 1345 rev x7,x7 1346 rev x9,x9 1347 rev x11,x11 1348 rev x13,x13 1349 rev x15,x15 1350 rev x17,x17 1351 rev x20,x20 1352#endif 1353 eor x5,x5,x6 1354 eor x7,x7,x8 1355 eor x9,x9,x10 1356 eor x11,x11,x12 1357 eor x13,x13,x14 1358 eor x15,x15,x16 1359 eor x17,x17,x19 1360 eor x20,x20,x21 1361 1362 stp x5,x7,[x0,#0] // store output 1363 add x28,x28,#1 // increment counter 1364 mov w5,w22 // unpack key block 1365 lsr x6,x22,#32 1366 stp x9,x11,[x0,#16] 1367 mov w7,w23 1368 lsr x8,x23,#32 1369 stp x13,x15,[x0,#32] 1370 mov w9,w24 1371 lsr x10,x24,#32 1372 stp x17,x20,[x0,#48] 1373 add x0,x0,#64 1374 mov w11,w25 1375 lsr x12,x25,#32 1376 mov w13,w26 1377 lsr x14,x26,#32 1378 mov w15,w27 1379 lsr x16,x27,#32 1380 
mov w17,w28 1381 lsr x19,x28,#32 1382 mov w20,w30 1383 lsr x21,x30,#32 1384 1385 mov x4,#5 1386Loop_lower_neon: 1387 sub x4,x4,#1 1388 add v0.4s,v0.4s,v1.4s 1389 add w5,w5,w9 1390 add v4.4s,v4.4s,v5.4s 1391 add w6,w6,w10 1392 add v8.4s,v8.4s,v9.4s 1393 add w7,w7,w11 1394 add v12.4s,v12.4s,v13.4s 1395 add w8,w8,w12 1396 add v16.4s,v16.4s,v17.4s 1397 eor w17,w17,w5 1398 add v20.4s,v20.4s,v21.4s 1399 eor w19,w19,w6 1400 eor v3.16b,v3.16b,v0.16b 1401 eor w20,w20,w7 1402 eor v7.16b,v7.16b,v4.16b 1403 eor w21,w21,w8 1404 eor v11.16b,v11.16b,v8.16b 1405 ror w17,w17,#16 1406 eor v15.16b,v15.16b,v12.16b 1407 ror w19,w19,#16 1408 eor v19.16b,v19.16b,v16.16b 1409 ror w20,w20,#16 1410 eor v23.16b,v23.16b,v20.16b 1411 ror w21,w21,#16 1412 rev32 v3.8h,v3.8h 1413 add w13,w13,w17 1414 rev32 v7.8h,v7.8h 1415 add w14,w14,w19 1416 rev32 v11.8h,v11.8h 1417 add w15,w15,w20 1418 rev32 v15.8h,v15.8h 1419 add w16,w16,w21 1420 rev32 v19.8h,v19.8h 1421 eor w9,w9,w13 1422 rev32 v23.8h,v23.8h 1423 eor w10,w10,w14 1424 add v2.4s,v2.4s,v3.4s 1425 eor w11,w11,w15 1426 add v6.4s,v6.4s,v7.4s 1427 eor w12,w12,w16 1428 add v10.4s,v10.4s,v11.4s 1429 ror w9,w9,#20 1430 add v14.4s,v14.4s,v15.4s 1431 ror w10,w10,#20 1432 add v18.4s,v18.4s,v19.4s 1433 ror w11,w11,#20 1434 add v22.4s,v22.4s,v23.4s 1435 ror w12,w12,#20 1436 eor v24.16b,v1.16b,v2.16b 1437 add w5,w5,w9 1438 eor v25.16b,v5.16b,v6.16b 1439 add w6,w6,w10 1440 eor v26.16b,v9.16b,v10.16b 1441 add w7,w7,w11 1442 eor v27.16b,v13.16b,v14.16b 1443 add w8,w8,w12 1444 eor v28.16b,v17.16b,v18.16b 1445 eor w17,w17,w5 1446 eor v29.16b,v21.16b,v22.16b 1447 eor w19,w19,w6 1448 ushr v1.4s,v24.4s,#20 1449 eor w20,w20,w7 1450 ushr v5.4s,v25.4s,#20 1451 eor w21,w21,w8 1452 ushr v9.4s,v26.4s,#20 1453 ror w17,w17,#24 1454 ushr v13.4s,v27.4s,#20 1455 ror w19,w19,#24 1456 ushr v17.4s,v28.4s,#20 1457 ror w20,w20,#24 1458 ushr v21.4s,v29.4s,#20 1459 ror w21,w21,#24 1460 sli v1.4s,v24.4s,#12 1461 add w13,w13,w17 1462 sli v5.4s,v25.4s,#12 1463 add w14,w14,w19 1464 sli 
v9.4s,v26.4s,#12 1465 add w15,w15,w20 1466 sli v13.4s,v27.4s,#12 1467 add w16,w16,w21 1468 sli v17.4s,v28.4s,#12 1469 eor w9,w9,w13 1470 sli v21.4s,v29.4s,#12 1471 eor w10,w10,w14 1472 add v0.4s,v0.4s,v1.4s 1473 eor w11,w11,w15 1474 add v4.4s,v4.4s,v5.4s 1475 eor w12,w12,w16 1476 add v8.4s,v8.4s,v9.4s 1477 ror w9,w9,#25 1478 add v12.4s,v12.4s,v13.4s 1479 ror w10,w10,#25 1480 add v16.4s,v16.4s,v17.4s 1481 ror w11,w11,#25 1482 add v20.4s,v20.4s,v21.4s 1483 ror w12,w12,#25 1484 eor v24.16b,v3.16b,v0.16b 1485 add w5,w5,w10 1486 eor v25.16b,v7.16b,v4.16b 1487 add w6,w6,w11 1488 eor v26.16b,v11.16b,v8.16b 1489 add w7,w7,w12 1490 eor v27.16b,v15.16b,v12.16b 1491 add w8,w8,w9 1492 eor v28.16b,v19.16b,v16.16b 1493 eor w21,w21,w5 1494 eor v29.16b,v23.16b,v20.16b 1495 eor w17,w17,w6 1496 ushr v3.4s,v24.4s,#24 1497 eor w19,w19,w7 1498 ushr v7.4s,v25.4s,#24 1499 eor w20,w20,w8 1500 ushr v11.4s,v26.4s,#24 1501 ror w21,w21,#16 1502 ushr v15.4s,v27.4s,#24 1503 ror w17,w17,#16 1504 ushr v19.4s,v28.4s,#24 1505 ror w19,w19,#16 1506 ushr v23.4s,v29.4s,#24 1507 ror w20,w20,#16 1508 sli v3.4s,v24.4s,#8 1509 add w15,w15,w21 1510 sli v7.4s,v25.4s,#8 1511 add w16,w16,w17 1512 sli v11.4s,v26.4s,#8 1513 add w13,w13,w19 1514 sli v15.4s,v27.4s,#8 1515 add w14,w14,w20 1516 sli v19.4s,v28.4s,#8 1517 eor w10,w10,w15 1518 sli v23.4s,v29.4s,#8 1519 eor w11,w11,w16 1520 add v2.4s,v2.4s,v3.4s 1521 eor w12,w12,w13 1522 add v6.4s,v6.4s,v7.4s 1523 eor w9,w9,w14 1524 add v10.4s,v10.4s,v11.4s 1525 ror w10,w10,#20 1526 add v14.4s,v14.4s,v15.4s 1527 ror w11,w11,#20 1528 add v18.4s,v18.4s,v19.4s 1529 ror w12,w12,#20 1530 add v22.4s,v22.4s,v23.4s 1531 ror w9,w9,#20 1532 eor v24.16b,v1.16b,v2.16b 1533 add w5,w5,w10 1534 eor v25.16b,v5.16b,v6.16b 1535 add w6,w6,w11 1536 eor v26.16b,v9.16b,v10.16b 1537 add w7,w7,w12 1538 eor v27.16b,v13.16b,v14.16b 1539 add w8,w8,w9 1540 eor v28.16b,v17.16b,v18.16b 1541 eor w21,w21,w5 1542 eor v29.16b,v21.16b,v22.16b 1543 eor w17,w17,w6 1544 ushr v1.4s,v24.4s,#25 1545 eor 
w19,w19,w7 1546 ushr v5.4s,v25.4s,#25 1547 eor w20,w20,w8 1548 ushr v9.4s,v26.4s,#25 1549 ror w21,w21,#24 1550 ushr v13.4s,v27.4s,#25 1551 ror w17,w17,#24 1552 ushr v17.4s,v28.4s,#25 1553 ror w19,w19,#24 1554 ushr v21.4s,v29.4s,#25 1555 ror w20,w20,#24 1556 sli v1.4s,v24.4s,#7 1557 add w15,w15,w21 1558 sli v5.4s,v25.4s,#7 1559 add w16,w16,w17 1560 sli v9.4s,v26.4s,#7 1561 add w13,w13,w19 1562 sli v13.4s,v27.4s,#7 1563 add w14,w14,w20 1564 sli v17.4s,v28.4s,#7 1565 eor w10,w10,w15 1566 sli v21.4s,v29.4s,#7 1567 eor w11,w11,w16 1568 ext v2.16b,v2.16b,v2.16b,#8 1569 eor w12,w12,w13 1570 ext v6.16b,v6.16b,v6.16b,#8 1571 eor w9,w9,w14 1572 ext v10.16b,v10.16b,v10.16b,#8 1573 ror w10,w10,#25 1574 ext v14.16b,v14.16b,v14.16b,#8 1575 ror w11,w11,#25 1576 ext v18.16b,v18.16b,v18.16b,#8 1577 ror w12,w12,#25 1578 ext v22.16b,v22.16b,v22.16b,#8 1579 ror w9,w9,#25 1580 ext v3.16b,v3.16b,v3.16b,#12 1581 ext v7.16b,v7.16b,v7.16b,#12 1582 ext v11.16b,v11.16b,v11.16b,#12 1583 ext v15.16b,v15.16b,v15.16b,#12 1584 ext v19.16b,v19.16b,v19.16b,#12 1585 ext v23.16b,v23.16b,v23.16b,#12 1586 ext v1.16b,v1.16b,v1.16b,#4 1587 ext v5.16b,v5.16b,v5.16b,#4 1588 ext v9.16b,v9.16b,v9.16b,#4 1589 ext v13.16b,v13.16b,v13.16b,#4 1590 ext v17.16b,v17.16b,v17.16b,#4 1591 ext v21.16b,v21.16b,v21.16b,#4 1592 add v0.4s,v0.4s,v1.4s 1593 add w5,w5,w9 1594 add v4.4s,v4.4s,v5.4s 1595 add w6,w6,w10 1596 add v8.4s,v8.4s,v9.4s 1597 add w7,w7,w11 1598 add v12.4s,v12.4s,v13.4s 1599 add w8,w8,w12 1600 add v16.4s,v16.4s,v17.4s 1601 eor w17,w17,w5 1602 add v20.4s,v20.4s,v21.4s 1603 eor w19,w19,w6 1604 eor v3.16b,v3.16b,v0.16b 1605 eor w20,w20,w7 1606 eor v7.16b,v7.16b,v4.16b 1607 eor w21,w21,w8 1608 eor v11.16b,v11.16b,v8.16b 1609 ror w17,w17,#16 1610 eor v15.16b,v15.16b,v12.16b 1611 ror w19,w19,#16 1612 eor v19.16b,v19.16b,v16.16b 1613 ror w20,w20,#16 1614 eor v23.16b,v23.16b,v20.16b 1615 ror w21,w21,#16 1616 rev32 v3.8h,v3.8h 1617 add w13,w13,w17 1618 rev32 v7.8h,v7.8h 1619 add w14,w14,w19 1620 rev32 
v11.8h,v11.8h 1621 add w15,w15,w20 1622 rev32 v15.8h,v15.8h 1623 add w16,w16,w21 1624 rev32 v19.8h,v19.8h 1625 eor w9,w9,w13 1626 rev32 v23.8h,v23.8h 1627 eor w10,w10,w14 1628 add v2.4s,v2.4s,v3.4s 1629 eor w11,w11,w15 1630 add v6.4s,v6.4s,v7.4s 1631 eor w12,w12,w16 1632 add v10.4s,v10.4s,v11.4s 1633 ror w9,w9,#20 1634 add v14.4s,v14.4s,v15.4s 1635 ror w10,w10,#20 1636 add v18.4s,v18.4s,v19.4s 1637 ror w11,w11,#20 1638 add v22.4s,v22.4s,v23.4s 1639 ror w12,w12,#20 1640 eor v24.16b,v1.16b,v2.16b 1641 add w5,w5,w9 1642 eor v25.16b,v5.16b,v6.16b 1643 add w6,w6,w10 1644 eor v26.16b,v9.16b,v10.16b 1645 add w7,w7,w11 1646 eor v27.16b,v13.16b,v14.16b 1647 add w8,w8,w12 1648 eor v28.16b,v17.16b,v18.16b 1649 eor w17,w17,w5 1650 eor v29.16b,v21.16b,v22.16b 1651 eor w19,w19,w6 1652 ushr v1.4s,v24.4s,#20 1653 eor w20,w20,w7 1654 ushr v5.4s,v25.4s,#20 1655 eor w21,w21,w8 1656 ushr v9.4s,v26.4s,#20 1657 ror w17,w17,#24 1658 ushr v13.4s,v27.4s,#20 1659 ror w19,w19,#24 1660 ushr v17.4s,v28.4s,#20 1661 ror w20,w20,#24 1662 ushr v21.4s,v29.4s,#20 1663 ror w21,w21,#24 1664 sli v1.4s,v24.4s,#12 1665 add w13,w13,w17 1666 sli v5.4s,v25.4s,#12 1667 add w14,w14,w19 1668 sli v9.4s,v26.4s,#12 1669 add w15,w15,w20 1670 sli v13.4s,v27.4s,#12 1671 add w16,w16,w21 1672 sli v17.4s,v28.4s,#12 1673 eor w9,w9,w13 1674 sli v21.4s,v29.4s,#12 1675 eor w10,w10,w14 1676 add v0.4s,v0.4s,v1.4s 1677 eor w11,w11,w15 1678 add v4.4s,v4.4s,v5.4s 1679 eor w12,w12,w16 1680 add v8.4s,v8.4s,v9.4s 1681 ror w9,w9,#25 1682 add v12.4s,v12.4s,v13.4s 1683 ror w10,w10,#25 1684 add v16.4s,v16.4s,v17.4s 1685 ror w11,w11,#25 1686 add v20.4s,v20.4s,v21.4s 1687 ror w12,w12,#25 1688 eor v24.16b,v3.16b,v0.16b 1689 add w5,w5,w10 1690 eor v25.16b,v7.16b,v4.16b 1691 add w6,w6,w11 1692 eor v26.16b,v11.16b,v8.16b 1693 add w7,w7,w12 1694 eor v27.16b,v15.16b,v12.16b 1695 add w8,w8,w9 1696 eor v28.16b,v19.16b,v16.16b 1697 eor w21,w21,w5 1698 eor v29.16b,v23.16b,v20.16b 1699 eor w17,w17,w6 1700 ushr v3.4s,v24.4s,#24 1701 eor w19,w19,w7 
1702 ushr v7.4s,v25.4s,#24 1703 eor w20,w20,w8 1704 ushr v11.4s,v26.4s,#24 1705 ror w21,w21,#16 1706 ushr v15.4s,v27.4s,#24 1707 ror w17,w17,#16 1708 ushr v19.4s,v28.4s,#24 1709 ror w19,w19,#16 1710 ushr v23.4s,v29.4s,#24 1711 ror w20,w20,#16 1712 sli v3.4s,v24.4s,#8 1713 add w15,w15,w21 1714 sli v7.4s,v25.4s,#8 1715 add w16,w16,w17 1716 sli v11.4s,v26.4s,#8 1717 add w13,w13,w19 1718 sli v15.4s,v27.4s,#8 1719 add w14,w14,w20 1720 sli v19.4s,v28.4s,#8 1721 eor w10,w10,w15 1722 sli v23.4s,v29.4s,#8 1723 eor w11,w11,w16 1724 add v2.4s,v2.4s,v3.4s 1725 eor w12,w12,w13 1726 add v6.4s,v6.4s,v7.4s 1727 eor w9,w9,w14 1728 add v10.4s,v10.4s,v11.4s 1729 ror w10,w10,#20 1730 add v14.4s,v14.4s,v15.4s 1731 ror w11,w11,#20 1732 add v18.4s,v18.4s,v19.4s 1733 ror w12,w12,#20 1734 add v22.4s,v22.4s,v23.4s 1735 ror w9,w9,#20 1736 eor v24.16b,v1.16b,v2.16b 1737 add w5,w5,w10 1738 eor v25.16b,v5.16b,v6.16b 1739 add w6,w6,w11 1740 eor v26.16b,v9.16b,v10.16b 1741 add w7,w7,w12 1742 eor v27.16b,v13.16b,v14.16b 1743 add w8,w8,w9 1744 eor v28.16b,v17.16b,v18.16b 1745 eor w21,w21,w5 1746 eor v29.16b,v21.16b,v22.16b 1747 eor w17,w17,w6 1748 ushr v1.4s,v24.4s,#25 1749 eor w19,w19,w7 1750 ushr v5.4s,v25.4s,#25 1751 eor w20,w20,w8 1752 ushr v9.4s,v26.4s,#25 1753 ror w21,w21,#24 1754 ushr v13.4s,v27.4s,#25 1755 ror w17,w17,#24 1756 ushr v17.4s,v28.4s,#25 1757 ror w19,w19,#24 1758 ushr v21.4s,v29.4s,#25 1759 ror w20,w20,#24 1760 sli v1.4s,v24.4s,#7 1761 add w15,w15,w21 1762 sli v5.4s,v25.4s,#7 1763 add w16,w16,w17 1764 sli v9.4s,v26.4s,#7 1765 add w13,w13,w19 1766 sli v13.4s,v27.4s,#7 1767 add w14,w14,w20 1768 sli v17.4s,v28.4s,#7 1769 eor w10,w10,w15 1770 sli v21.4s,v29.4s,#7 1771 eor w11,w11,w16 1772 ext v2.16b,v2.16b,v2.16b,#8 1773 eor w12,w12,w13 1774 ext v6.16b,v6.16b,v6.16b,#8 1775 eor w9,w9,w14 1776 ext v10.16b,v10.16b,v10.16b,#8 1777 ror w10,w10,#25 1778 ext v14.16b,v14.16b,v14.16b,#8 1779 ror w11,w11,#25 1780 ext v18.16b,v18.16b,v18.16b,#8 1781 ror w12,w12,#25 1782 ext 
v22.16b,v22.16b,v22.16b,#8 1783 ror w9,w9,#25 1784 ext v3.16b,v3.16b,v3.16b,#4 1785 ext v7.16b,v7.16b,v7.16b,#4 1786 ext v11.16b,v11.16b,v11.16b,#4 1787 ext v15.16b,v15.16b,v15.16b,#4 1788 ext v19.16b,v19.16b,v19.16b,#4 1789 ext v23.16b,v23.16b,v23.16b,#4 1790 ext v1.16b,v1.16b,v1.16b,#12 1791 ext v5.16b,v5.16b,v5.16b,#12 1792 ext v9.16b,v9.16b,v9.16b,#12 1793 ext v13.16b,v13.16b,v13.16b,#12 1794 ext v17.16b,v17.16b,v17.16b,#12 1795 ext v21.16b,v21.16b,v21.16b,#12 1796 cbnz x4,Loop_lower_neon 1797 1798 add w5,w5,w22 // accumulate key block 1799 ldp q24,q25,[sp,#0] 1800 add x6,x6,x22,lsr#32 1801 ldp q26,q27,[sp,#32] 1802 add w7,w7,w23 1803 ldp q28,q29,[sp,#64] 1804 add x8,x8,x23,lsr#32 1805 add v0.4s,v0.4s,v24.4s 1806 add w9,w9,w24 1807 add v4.4s,v4.4s,v24.4s 1808 add x10,x10,x24,lsr#32 1809 add v8.4s,v8.4s,v24.4s 1810 add w11,w11,w25 1811 add v12.4s,v12.4s,v24.4s 1812 add x12,x12,x25,lsr#32 1813 add v16.4s,v16.4s,v24.4s 1814 add w13,w13,w26 1815 add v20.4s,v20.4s,v24.4s 1816 add x14,x14,x26,lsr#32 1817 add v2.4s,v2.4s,v26.4s 1818 add w15,w15,w27 1819 add v6.4s,v6.4s,v26.4s 1820 add x16,x16,x27,lsr#32 1821 add v10.4s,v10.4s,v26.4s 1822 add w17,w17,w28 1823 add v14.4s,v14.4s,v26.4s 1824 add x19,x19,x28,lsr#32 1825 add v18.4s,v18.4s,v26.4s 1826 add w20,w20,w30 1827 add v22.4s,v22.4s,v26.4s 1828 add x21,x21,x30,lsr#32 1829 add v19.4s,v19.4s,v31.4s // +4 1830 add x5,x5,x6,lsl#32 // pack 1831 add v23.4s,v23.4s,v31.4s // +4 1832 add x7,x7,x8,lsl#32 1833 add v3.4s,v3.4s,v27.4s 1834 ldp x6,x8,[x1,#0] // load input 1835 add v7.4s,v7.4s,v28.4s 1836 add x9,x9,x10,lsl#32 1837 add v11.4s,v11.4s,v29.4s 1838 add x11,x11,x12,lsl#32 1839 add v15.4s,v15.4s,v30.4s 1840 ldp x10,x12,[x1,#16] 1841 add v19.4s,v19.4s,v27.4s 1842 add x13,x13,x14,lsl#32 1843 add v23.4s,v23.4s,v28.4s 1844 add x15,x15,x16,lsl#32 1845 add v1.4s,v1.4s,v25.4s 1846 ldp x14,x16,[x1,#32] 1847 add v5.4s,v5.4s,v25.4s 1848 add x17,x17,x19,lsl#32 1849 add v9.4s,v9.4s,v25.4s 1850 add x20,x20,x21,lsl#32 1851 add 
v13.4s,v13.4s,v25.4s 1852 ldp x19,x21,[x1,#48] 1853 add v17.4s,v17.4s,v25.4s 1854 add x1,x1,#64 1855 add v21.4s,v21.4s,v25.4s 1856 1857#ifdef __AARCH64EB__ 1858 rev x5,x5 1859 rev x7,x7 1860 rev x9,x9 1861 rev x11,x11 1862 rev x13,x13 1863 rev x15,x15 1864 rev x17,x17 1865 rev x20,x20 1866#endif 1867 ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 1868 eor x5,x5,x6 1869 eor x7,x7,x8 1870 eor x9,x9,x10 1871 eor x11,x11,x12 1872 eor x13,x13,x14 1873 eor v0.16b,v0.16b,v24.16b 1874 eor x15,x15,x16 1875 eor v1.16b,v1.16b,v25.16b 1876 eor x17,x17,x19 1877 eor v2.16b,v2.16b,v26.16b 1878 eor x20,x20,x21 1879 eor v3.16b,v3.16b,v27.16b 1880 ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 1881 1882 stp x5,x7,[x0,#0] // store output 1883 add x28,x28,#7 // increment counter 1884 stp x9,x11,[x0,#16] 1885 stp x13,x15,[x0,#32] 1886 stp x17,x20,[x0,#48] 1887 add x0,x0,#64 1888 st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 1889 1890 ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 1891 eor v4.16b,v4.16b,v24.16b 1892 eor v5.16b,v5.16b,v25.16b 1893 eor v6.16b,v6.16b,v26.16b 1894 eor v7.16b,v7.16b,v27.16b 1895 st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 1896 1897 ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 1898 eor v8.16b,v8.16b,v0.16b 1899 ldp q24,q25,[sp,#0] 1900 eor v9.16b,v9.16b,v1.16b 1901 ldp q26,q27,[sp,#32] 1902 eor v10.16b,v10.16b,v2.16b 1903 eor v11.16b,v11.16b,v3.16b 1904 st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64 1905 1906 ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64 1907 eor v12.16b,v12.16b,v4.16b 1908 eor v13.16b,v13.16b,v5.16b 1909 eor v14.16b,v14.16b,v6.16b 1910 eor v15.16b,v15.16b,v7.16b 1911 st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64 1912 1913 ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64 1914 eor v16.16b,v16.16b,v8.16b 1915 eor v17.16b,v17.16b,v9.16b 1916 eor v18.16b,v18.16b,v10.16b 1917 eor v19.16b,v19.16b,v11.16b 1918 st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 1919 1920 shl v0.4s,v31.4s,#1 // 4 -> 8 1921 eor v20.16b,v20.16b,v12.16b 1922 eor v21.16b,v21.16b,v13.16b 
1923 eor v22.16b,v22.16b,v14.16b 1924 eor v23.16b,v23.16b,v15.16b 1925 st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64 1926 1927 add v27.4s,v27.4s,v0.4s // += 8 1928 add v28.4s,v28.4s,v0.4s 1929 add v29.4s,v29.4s,v0.4s 1930 add v30.4s,v30.4s,v0.4s 1931 1932 b.hs Loop_outer_512_neon 1933 1934 adds x2,x2,#512 1935 ushr v0.4s,v31.4s,#2 // 4 -> 1 1936 1937 ldp d8,d9,[sp,#128+0] // meet ABI requirements 1938 ldp d10,d11,[sp,#128+16] 1939 ldp d12,d13,[sp,#128+32] 1940 ldp d14,d15,[sp,#128+48] 1941 1942 stp q24,q31,[sp,#0] // wipe off-load area 1943 stp q24,q31,[sp,#32] 1944 stp q24,q31,[sp,#64] 1945 1946 b.eq Ldone_512_neon 1947 1948 cmp x2,#192 1949 sub v27.4s,v27.4s,v0.4s // -= 1 1950 sub v28.4s,v28.4s,v0.4s 1951 sub v29.4s,v29.4s,v0.4s 1952 add sp,sp,#128 1953 b.hs Loop_outer_neon 1954 1955 eor v25.16b,v25.16b,v25.16b 1956 eor v26.16b,v26.16b,v26.16b 1957 eor v27.16b,v27.16b,v27.16b 1958 eor v28.16b,v28.16b,v28.16b 1959 eor v29.16b,v29.16b,v29.16b 1960 eor v30.16b,v30.16b,v30.16b 1961 b Loop_outer 1962 1963Ldone_512_neon: 1964 ldp x19,x20,[x29,#16] 1965 add sp,sp,#128+64 1966 ldp x21,x22,[x29,#32] 1967 ldp x23,x24,[x29,#48] 1968 ldp x25,x26,[x29,#64] 1969 ldp x27,x28,[x29,#80] 1970 ldp x29,x30,[sp],#96 1971 AARCH64_VALIDATE_LINK_REGISTER 1972 ret 1973 1974#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) 1975