// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#include <openssl/asm_base.h>

#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__)
#include <openssl/arm_arch.h>

.section	.rodata

// Read-only constants shared by every routine in this file.
//   .Lsigma  the four 32-bit words 0x61707865, 0x3320646e, 0x79622d32,
//            0x6b206574 (ASCII "expand 32-byte k"), packed two per .quad.
//   .Lone    the vector {1,0,0,0}; the NEON code adds it to the counter
//            block so only the first 32-bit lane advances.
//   .byte    NUL-terminated ASCII credit string:
//            "ChaCha20 for ARMv8, CRYPTOGAMS by <appro@openssl.org>".
.align	5
.Lsigma:
.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
.Lone:
.long	1,0,0,0
.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2

.text

//----------------------------------------------------------------------
// ChaCha20_ctr32_nohw -- scalar (general-purpose register only) path.
//
// Register use on entry, as read by the code below:
//   x0  output pointer (written 64 bytes at a time, then bytewise)
//   x1  input pointer  (read the same way)
//   x2  number of bytes to process
//   x3  pointer to 32 bytes of key
//   x4  pointer to the 16-byte counter block
// NOTE(review): a length of zero is not checked for here; the tail loop
// assumes at least one byte remains. Presumably the caller guarantees
// x2 != 0 -- confirm against the C wrapper.
//
// Register roles inside the function:
//   x22,x23          sigma constants (two 32-bit words per register)
//   x24-x27          key
//   x28,x30          counter block; x30 is reused as data and is reloaded
//                    from the frame in the epilogue
//   w5-w17,w19-w21   the 16 working state words (x18 is never touched)
//   x4               round-loop counter once the counter block is loaded
//
// Stack: a 96-byte frame holding x29/x30 and x19-x28, plus 64 bytes of
// scratch below it that the tail code uses for one block of keystream.
//
// AARCH64_SIGN_LINK_REGISTER / AARCH64_VALIDATE_LINK_REGISTER are macros
// that are not defined in this file (presumably <openssl/arm_arch.h>).
//----------------------------------------------------------------------
.globl	ChaCha20_ctr32_nohw
.hidden	ChaCha20_ctr32_nohw
.type	ChaCha20_ctr32_nohw,%function
.align	5
ChaCha20_ctr32_nohw:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adrp	x5,.Lsigma
	add	x5,x5,:lo12:.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#64			// 64-byte keystream scratch for the tail

	ldp	x22,x23,[x5]		// load sigma
	ldp	x24,x25,[x3]		// load key
	ldp	x26,x27,[x3,#16]
	ldp	x28,x30,[x4]		// load counter
#ifdef __AARCH64EB__
	// Big-endian builds: swap the 32-bit halves of each 64-bit pair.
	ror	x24,x24,#32
	ror	x25,x25,#32
	ror	x26,x26,#32
	ror	x27,x27,#32
	ror	x28,x28,#32
	ror	x30,x30,#32
#endif

// One pass of .Loop_outer produces one 64-byte block.
.Loop_outer:
	mov	w5,w22			// unpack key block
	lsr	x6,x22,#32
	mov	w7,w23
	lsr	x8,x23,#32
	mov	w9,w24
	lsr	x10,x24,#32
	mov	w11,w25
	lsr	x12,x25,#32
	mov	w13,w26
	lsr	x14,x26,#32
	mov	w15,w27
	lsr	x16,x27,#32
	mov	w17,w28
	lsr	x19,x28,#32
	mov	w20,w30
	lsr	x21,x30,#32

	mov	x4,#10			// 10 loop iterations, two half-rounds each
	// The flags set here are consumed by b.lo/b.hi after the round loop;
	// nothing in between (add/sub/eor/ror/cbnz/ldp/stp/rev) modifies them.
	subs	x2,x2,#64
.Loop:
	sub	x4,x4,#1
	// First half: quarter-rounds on (w5,w9,w13,w17), (w6,w10,w14,w19),
	// (w7,w11,w15,w20), (w8,w12,w16,w21). A 32-bit ror by 16/20/24/25
	// equals a rotate-left by 16/12/8/7.
	add	w5,w5,w9
	add	w6,w6,w10
	add	w7,w7,w11
	add	w8,w8,w12
	eor	w17,w17,w5
	eor	w19,w19,w6
	eor	w20,w20,w7
	eor	w21,w21,w8
	ror	w17,w17,#16
	ror	w19,w19,#16
	ror	w20,w20,#16
	ror	w21,w21,#16
	add	w13,w13,w17
	add	w14,w14,w19
	add	w15,w15,w20
	add	w16,w16,w21
	eor	w9,w9,w13
	eor	w10,w10,w14
	eor	w11,w11,w15
	eor	w12,w12,w16
	ror	w9,w9,#20
	ror	w10,w10,#20
	ror	w11,w11,#20
	ror	w12,w12,#20
	add	w5,w5,w9
	add	w6,w6,w10
	add	w7,w7,w11
	add	w8,w8,w12
	eor	w17,w17,w5
	eor	w19,w19,w6
	eor	w20,w20,w7
	eor	w21,w21,w8
	ror	w17,w17,#24
	ror	w19,w19,#24
	ror	w20,w20,#24
	ror	w21,w21,#24
	add	w13,w13,w17
	add	w14,w14,w19
	add	w15,w15,w20
	add	w16,w16,w21
	eor	w9,w9,w13
	eor	w10,w10,w14
	eor	w11,w11,w15
	eor	w12,w12,w16
	ror	w9,w9,#25
	ror	w10,w10,#25
	ror	w11,w11,#25
	ror	w12,w12,#25
	// Second half: same operations with the register pairing shifted:
	// (w5,w10,w15,w21), (w6,w11,w16,w17), (w7,w12,w13,w19), (w8,w9,w14,w20).
	add	w5,w5,w10
	add	w6,w6,w11
	add	w7,w7,w12
	add	w8,w8,w9
	eor	w21,w21,w5
	eor	w17,w17,w6
	eor	w19,w19,w7
	eor	w20,w20,w8
	ror	w21,w21,#16
	ror	w17,w17,#16
	ror	w19,w19,#16
	ror	w20,w20,#16
	add	w15,w15,w21
	add	w16,w16,w17
	add	w13,w13,w19
	add	w14,w14,w20
	eor	w10,w10,w15
	eor	w11,w11,w16
	eor	w12,w12,w13
	eor	w9,w9,w14
	ror	w10,w10,#20
	ror	w11,w11,#20
	ror	w12,w12,#20
	ror	w9,w9,#20
	add	w5,w5,w10
	add	w6,w6,w11
	add	w7,w7,w12
	add	w8,w8,w9
	eor	w21,w21,w5
	eor	w17,w17,w6
	eor	w19,w19,w7
	eor	w20,w20,w8
	ror	w21,w21,#24
	ror	w17,w17,#24
	ror	w19,w19,#24
	ror	w20,w20,#24
	add	w15,w15,w21
	add	w16,w16,w17
	add	w13,w13,w19
	add	w14,w14,w20
	eor	w10,w10,w15
	eor	w11,w11,w16
	eor	w12,w12,w13
	eor	w9,w9,w14
	ror	w10,w10,#25
	ror	w11,w11,#25
	ror	w12,w12,#25
	ror	w9,w9,#25
	cbnz	x4,.Loop

	// Add the input state back in. The odd-numbered words use 64-bit adds
	// with lsr#32; any bits above bit 31 are discarded by the lsl#32 in
	// the pack step that follows.
	add	w5,w5,w22		// accumulate key block
	add	x6,x6,x22,lsr#32
	add	w7,w7,w23
	add	x8,x8,x23,lsr#32
	add	w9,w9,w24
	add	x10,x10,x24,lsr#32
	add	w11,w11,w25
	add	x12,x12,x25,lsr#32
	add	w13,w13,w26
	add	x14,x14,x26,lsr#32
	add	w15,w15,w27
	add	x16,x16,x27,lsr#32
	add	w17,w17,w28
	add	x19,x19,x28,lsr#32
	add	w20,w20,w30
	add	x21,x21,x30,lsr#32

	b.lo	.Ltail			// fewer than 64 bytes were left (flags from subs)

	add	x5,x5,x6,lsl#32	// pack
	add	x7,x7,x8,lsl#32
	ldp	x6,x8,[x1,#0]		// load input
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	ldp	x10,x12,[x1,#16]
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	ldp	x14,x16,[x1,#32]
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
	ldp	x19,x21,[x1,#48]
	add	x1,x1,#64
#ifdef __AARCH64EB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	eor	x5,x5,x6
	eor	x7,x7,x8
	eor	x9,x9,x10
	eor	x11,x11,x12
	eor	x13,x13,x14
	eor	x15,x15,x16
	eor	x17,x17,x19
	eor	x20,x20,x21

	stp	x5,x7,[x0,#0]		// store output
	add	x28,x28,#1			// increment counter
	stp	x9,x11,[x0,#16]
	stp	x13,x15,[x0,#32]
	stp	x17,x20,[x0,#48]
	add	x0,x0,#64

	b.hi	.Loop_outer		// more input remains (flags still from subs)

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret

// Partial final block. .Ltail undoes the subs so x2 is the remaining byte
// count again. .Less_than_64 is also entered from ChaCha20_ctr32_neon,
// which uses the same frame layout, with x2 already holding that count and
// the scalar block still unpacked in w5-w21.
.align	4
.Ltail:
	add	x2,x2,#64
.Less_than_64:
	// Point x1 (input), x0 (output) and x4 (scratch) just past the data and
	// index them with a negative x2 that counts up to zero. x0 is biased by
	// -1 because the store below uses x2 after it has been incremented.
	sub	x0,x0,#1
	add	x1,x1,x2
	add	x0,x0,x2
	add	x4,sp,x2
	neg	x2,x2

	add	x5,x5,x6,lsl#32	// pack
	add	x7,x7,x8,lsl#32
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
#ifdef __AARCH64EB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	// Spill the 64-byte keystream block to the stack scratch area.
	stp	x5,x7,[sp,#0]
	stp	x9,x11,[sp,#16]
	stp	x13,x15,[sp,#32]
	stp	x17,x20,[sp,#48]

.Loop_tail:
	ldrb	w10,[x1,x2]
	ldrb	w11,[x4,x2]
	add	x2,x2,#1
	eor	w10,w10,w11
	strb	w10,[x0,x2]
	cbnz	x2,.Loop_tail

	// Overwrite the keystream left in the scratch area with zeros.
	stp	xzr,xzr,[sp,#0]
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ChaCha20_ctr32_nohw,.-ChaCha20_ctr32_nohw

//----------------------------------------------------------------------
// ChaCha20_ctr32_neon -- mixed scalar + NEON path.
//
// Takes the same register arguments as ChaCha20_ctr32_nohw (x0 output,
// x1 input, x2 byte count, x3 key, x4 counter block).
//
// Lengths of 512 bytes or more branch to .L512_or_more_neon right after
// the shared prologue. Otherwise each pass of .Loop_outer_neon produces
// 256 bytes: one block in general-purpose registers (same register roles
// as the scalar routine) interleaved with three blocks in NEON registers.
//
// NEON register roles:
//   v24            sigma          v25,v26   key
//   v27,v28,v29    counter block advanced by 1, 2 and 3 (the scalar block
//                  uses the unmodified counter in x28/x30)
//   v31            {1,0,0,0} from .Lone, then shifted to {4,0,0,0}
//   v0-v3, v4-v7, v16-v19   working state of the three NEON blocks
//   v20-v23        temporaries (rotate intermediates, loaded input)
// v8-v15 are not used in this function.
//----------------------------------------------------------------------
.globl	ChaCha20_ctr32_neon
.hidden	ChaCha20_ctr32_neon
.type	ChaCha20_ctr32_neon,%function
.align	5
ChaCha20_ctr32_neon:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adrp	x5,.Lsigma
	add	x5,x5,:lo12:.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	cmp	x2,#512
	b.hs	.L512_or_more_neon	// unsigned >= 512: continue in the 512-byte path

	sub	sp,sp,#64			// 64-byte keystream scratch for the tail

	ldp	x22,x23,[x5]		// load sigma
	ld1	{v24.4s},[x5],#16		// post-increment leaves x5 at .Lone
	ldp	x24,x25,[x3]		// load key
	ldp	x26,x27,[x3,#16]
	ld1	{v25.4s,v26.4s},[x3]
	ldp	x28,x30,[x4]		// load counter
	ld1	{v27.4s},[x4]
	ld1	{v31.4s},[x5]
#ifdef __AARCH64EB__
	rev64	v24.4s,v24.4s
	ror	x24,x24,#32
	ror	x25,x25,#32
	ror	x26,x26,#32
	ror	x27,x27,#32
	ror	x28,x28,#32
	ror	x30,x30,#32
#endif
	add	v27.4s,v27.4s,v31.4s		// += 1
	add	v28.4s,v27.4s,v31.4s
	add	v29.4s,v28.4s,v31.4s
	shl	v31.4s,v31.4s,#2			// 1 -> 4

.Loop_outer_neon:
	mov	w5,w22			// unpack key block
	lsr	x6,x22,#32
	mov	v0.16b,v24.16b
	mov	w7,w23
	lsr	x8,x23,#32
	mov	v4.16b,v24.16b
	mov	w9,w24
	lsr	x10,x24,#32
	mov	v16.16b,v24.16b
	mov	w11,w25
	mov	v1.16b,v25.16b
	lsr	x12,x25,#32
	mov	v5.16b,v25.16b
	mov	w13,w26
	mov	v17.16b,v25.16b
	lsr	x14,x26,#32
	mov	v3.16b,v27.16b
	mov	w15,w27
	mov	v7.16b,v28.16b
	lsr	x16,x27,#32
	mov	v19.16b,v29.16b
	mov	w17,w28
	mov	v2.16b,v26.16b
	lsr	x19,x28,#32
	mov	v6.16b,v26.16b
	mov	w20,w30
	mov	v18.16b,v26.16b
	lsr	x21,x30,#32

	mov	x4,#10
	// Flags set here stay live until b.lo/b.hi after the round loop.
	subs	x2,x2,#256
	// In the loop below the NEON rotates are built from rev32 on halfwords
	// (rotate by 16) and ushr+sli pairs (#20/#12, #24/#8, #25/#7). The ext
	// instructions rotate whole vectors by 4/8/12 bytes between the two
	// half-rounds and rotate them back at the end of the iteration.
.Loop_neon:
	sub	x4,x4,#1
	add	v0.4s,v0.4s,v1.4s
	add	w5,w5,w9
	add	v4.4s,v4.4s,v5.4s
	add	w6,w6,w10
	add	v16.4s,v16.4s,v17.4s
	add	w7,w7,w11
	eor	v3.16b,v3.16b,v0.16b
	add	w8,w8,w12
	eor	v7.16b,v7.16b,v4.16b
	eor	w17,w17,w5
	eor	v19.16b,v19.16b,v16.16b
	eor	w19,w19,w6
	rev32	v3.8h,v3.8h
	eor	w20,w20,w7
	rev32	v7.8h,v7.8h
	eor	w21,w21,w8
	rev32	v19.8h,v19.8h
	ror	w17,w17,#16
	add	v2.4s,v2.4s,v3.4s
	ror	w19,w19,#16
	add	v6.4s,v6.4s,v7.4s
	ror	w20,w20,#16
	add	v18.4s,v18.4s,v19.4s
	ror	w21,w21,#16
	eor	v20.16b,v1.16b,v2.16b
	add	w13,w13,w17
	eor	v21.16b,v5.16b,v6.16b
	add	w14,w14,w19
	eor	v22.16b,v17.16b,v18.16b
	add	w15,w15,w20
	ushr	v1.4s,v20.4s,#20
	add	w16,w16,w21
	ushr	v5.4s,v21.4s,#20
	eor	w9,w9,w13
	ushr	v17.4s,v22.4s,#20
	eor	w10,w10,w14
	sli	v1.4s,v20.4s,#12
	eor	w11,w11,w15
	sli	v5.4s,v21.4s,#12
	eor	w12,w12,w16
	sli	v17.4s,v22.4s,#12
	ror	w9,w9,#20
	add	v0.4s,v0.4s,v1.4s
	ror	w10,w10,#20
	add	v4.4s,v4.4s,v5.4s
	ror	w11,w11,#20
	add	v16.4s,v16.4s,v17.4s
	ror	w12,w12,#20
	eor	v20.16b,v3.16b,v0.16b
	add	w5,w5,w9
	eor	v21.16b,v7.16b,v4.16b
	add	w6,w6,w10
	eor	v22.16b,v19.16b,v16.16b
	add	w7,w7,w11
	ushr	v3.4s,v20.4s,#24
	add	w8,w8,w12
	ushr	v7.4s,v21.4s,#24
	eor	w17,w17,w5
	ushr	v19.4s,v22.4s,#24
	eor	w19,w19,w6
	sli	v3.4s,v20.4s,#8
	eor	w20,w20,w7
	sli	v7.4s,v21.4s,#8
	eor	w21,w21,w8
	sli	v19.4s,v22.4s,#8
	ror	w17,w17,#24
	add	v2.4s,v2.4s,v3.4s
	ror	w19,w19,#24
	add	v6.4s,v6.4s,v7.4s
	ror	w20,w20,#24
	add	v18.4s,v18.4s,v19.4s
	ror	w21,w21,#24
	eor	v20.16b,v1.16b,v2.16b
	add	w13,w13,w17
	eor	v21.16b,v5.16b,v6.16b
	add	w14,w14,w19
	eor	v22.16b,v17.16b,v18.16b
	add	w15,w15,w20
	ushr	v1.4s,v20.4s,#25
	add	w16,w16,w21
	ushr	v5.4s,v21.4s,#25
	eor	w9,w9,w13
	ushr	v17.4s,v22.4s,#25
	eor	w10,w10,w14
	sli	v1.4s,v20.4s,#7
	eor	w11,w11,w15
	sli	v5.4s,v21.4s,#7
	eor	w12,w12,w16
	sli	v17.4s,v22.4s,#7
	ror	w9,w9,#25
	ext	v2.16b,v2.16b,v2.16b,#8
	ror	w10,w10,#25
	ext	v6.16b,v6.16b,v6.16b,#8
	ror	w11,w11,#25
	ext	v18.16b,v18.16b,v18.16b,#8
	ror	w12,w12,#25
	ext	v3.16b,v3.16b,v3.16b,#12
	ext	v7.16b,v7.16b,v7.16b,#12
	ext	v19.16b,v19.16b,v19.16b,#12
	ext	v1.16b,v1.16b,v1.16b,#4
	ext	v5.16b,v5.16b,v5.16b,#4
	ext	v17.16b,v17.16b,v17.16b,#4
	add	v0.4s,v0.4s,v1.4s
	add	w5,w5,w10
	add	v4.4s,v4.4s,v5.4s
	add	w6,w6,w11
	add	v16.4s,v16.4s,v17.4s
	add	w7,w7,w12
	eor	v3.16b,v3.16b,v0.16b
	add	w8,w8,w9
	eor	v7.16b,v7.16b,v4.16b
	eor	w21,w21,w5
	eor	v19.16b,v19.16b,v16.16b
	eor	w17,w17,w6
	rev32	v3.8h,v3.8h
	eor	w19,w19,w7
	rev32	v7.8h,v7.8h
	eor	w20,w20,w8
	rev32	v19.8h,v19.8h
	ror	w21,w21,#16
	add	v2.4s,v2.4s,v3.4s
	ror	w17,w17,#16
	add	v6.4s,v6.4s,v7.4s
	ror	w19,w19,#16
	add	v18.4s,v18.4s,v19.4s
	ror	w20,w20,#16
	eor	v20.16b,v1.16b,v2.16b
	add	w15,w15,w21
	eor	v21.16b,v5.16b,v6.16b
	add	w16,w16,w17
	eor	v22.16b,v17.16b,v18.16b
	add	w13,w13,w19
	ushr	v1.4s,v20.4s,#20
	add	w14,w14,w20
	ushr	v5.4s,v21.4s,#20
	eor	w10,w10,w15
	ushr	v17.4s,v22.4s,#20
	eor	w11,w11,w16
	sli	v1.4s,v20.4s,#12
	eor	w12,w12,w13
	sli	v5.4s,v21.4s,#12
	eor	w9,w9,w14
	sli	v17.4s,v22.4s,#12
	ror	w10,w10,#20
	add	v0.4s,v0.4s,v1.4s
	ror	w11,w11,#20
	add	v4.4s,v4.4s,v5.4s
	ror	w12,w12,#20
	add	v16.4s,v16.4s,v17.4s
	ror	w9,w9,#20
	eor	v20.16b,v3.16b,v0.16b
	add	w5,w5,w10
	eor	v21.16b,v7.16b,v4.16b
	add	w6,w6,w11
	eor	v22.16b,v19.16b,v16.16b
	add	w7,w7,w12
	ushr	v3.4s,v20.4s,#24
	add	w8,w8,w9
	ushr	v7.4s,v21.4s,#24
	eor	w21,w21,w5
	ushr	v19.4s,v22.4s,#24
	eor	w17,w17,w6
	sli	v3.4s,v20.4s,#8
	eor	w19,w19,w7
	sli	v7.4s,v21.4s,#8
	eor	w20,w20,w8
	sli	v19.4s,v22.4s,#8
	ror	w21,w21,#24
	add	v2.4s,v2.4s,v3.4s
	ror	w17,w17,#24
	add	v6.4s,v6.4s,v7.4s
	ror	w19,w19,#24
	add	v18.4s,v18.4s,v19.4s
	ror	w20,w20,#24
	eor	v20.16b,v1.16b,v2.16b
	add	w15,w15,w21
	eor	v21.16b,v5.16b,v6.16b
	add	w16,w16,w17
	eor	v22.16b,v17.16b,v18.16b
	add	w13,w13,w19
	ushr	v1.4s,v20.4s,#25
	add	w14,w14,w20
	ushr	v5.4s,v21.4s,#25
	eor	w10,w10,w15
	ushr	v17.4s,v22.4s,#25
	eor	w11,w11,w16
	sli	v1.4s,v20.4s,#7
	eor	w12,w12,w13
	sli	v5.4s,v21.4s,#7
	eor	w9,w9,w14
	sli	v17.4s,v22.4s,#7
	ror	w10,w10,#25
	ext	v2.16b,v2.16b,v2.16b,#8
	ror	w11,w11,#25
	ext	v6.16b,v6.16b,v6.16b,#8
	ror	w12,w12,#25
	ext	v18.16b,v18.16b,v18.16b,#8
	ror	w9,w9,#25
	ext	v3.16b,v3.16b,v3.16b,#4
	ext	v7.16b,v7.16b,v7.16b,#4
	ext	v19.16b,v19.16b,v19.16b,#4
	ext	v1.16b,v1.16b,v1.16b,#12
	ext	v5.16b,v5.16b,v5.16b,#12
	ext	v17.16b,v17.16b,v17.16b,#12
	cbnz	x4,.Loop_neon

	add	w5,w5,w22		// accumulate key block
	add	v0.4s,v0.4s,v24.4s
	add	x6,x6,x22,lsr#32
	add	v4.4s,v4.4s,v24.4s
	add	w7,w7,w23
	add	v16.4s,v16.4s,v24.4s
	add	x8,x8,x23,lsr#32
	add	v2.4s,v2.4s,v26.4s
	add	w9,w9,w24
	add	v6.4s,v6.4s,v26.4s
	add	x10,x10,x24,lsr#32
	add	v18.4s,v18.4s,v26.4s
	add	w11,w11,w25
	add	v3.4s,v3.4s,v27.4s
	add	x12,x12,x25,lsr#32
	add	w13,w13,w26
	add	v7.4s,v7.4s,v28.4s
	add	x14,x14,x26,lsr#32
	add	w15,w15,w27
	add	v19.4s,v19.4s,v29.4s
	add	x16,x16,x27,lsr#32
	add	w17,w17,w28
	add	v1.4s,v1.4s,v25.4s
	add	x19,x19,x28,lsr#32
	add	w20,w20,w30
	add	v5.4s,v5.4s,v25.4s
	add	x21,x21,x30,lsr#32
	add	v17.4s,v17.4s,v25.4s

	b.lo	.Ltail_neon		// fewer than 256 bytes were left (flags from subs)

	// Full 256 bytes: the scalar block covers the first 64 bytes, then
	// v0-v3, v4-v7 and v16-v19 cover the following three 64-byte blocks.
	add	x5,x5,x6,lsl#32	// pack
	add	x7,x7,x8,lsl#32
	ldp	x6,x8,[x1,#0]		// load input
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	ldp	x10,x12,[x1,#16]
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	ldp	x14,x16,[x1,#32]
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
	ldp	x19,x21,[x1,#48]
	add	x1,x1,#64
#ifdef __AARCH64EB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
	eor	x5,x5,x6
	eor	x7,x7,x8
	eor	x9,x9,x10
	eor	x11,x11,x12
	eor	x13,x13,x14
	eor	v0.16b,v0.16b,v20.16b
	eor	x15,x15,x16
	eor	v1.16b,v1.16b,v21.16b
	eor	x17,x17,x19
	eor	v2.16b,v2.16b,v22.16b
	eor	x20,x20,x21
	eor	v3.16b,v3.16b,v23.16b
	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64

	stp	x5,x7,[x0,#0]		// store output
	add	x28,x28,#4			// increment counter
	stp	x9,x11,[x0,#16]
	add	v27.4s,v27.4s,v31.4s		// += 4
	stp	x13,x15,[x0,#32]
	add	v28.4s,v28.4s,v31.4s
	stp	x17,x20,[x0,#48]
	add	v29.4s,v29.4s,v31.4s
	add	x0,x0,#64

	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64	// v0-v3 reused for input

	eor	v4.16b,v4.16b,v20.16b
	eor	v5.16b,v5.16b,v21.16b
	eor	v6.16b,v6.16b,v22.16b
	eor	v7.16b,v7.16b,v23.16b
	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64

	eor	v16.16b,v16.16b,v0.16b
	eor	v17.16b,v17.16b,v1.16b
	eor	v18.16b,v18.16b,v2.16b
	eor	v19.16b,v19.16b,v3.16b
	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64

	b.hi	.Loop_outer_neon	// more input remains (flags still from subs)

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret

// Fewer than 256 bytes remain. x2 is restored to the remaining count and
// the already-computed blocks are consumed 64 bytes at a time. Each
// "cmp x2,#64" feeds both the b.lo that follows it and the later b.eq;
// the instructions in between do not modify the flags.
.Ltail_neon:
	add	x2,x2,#256
	cmp	x2,#64
	b.lo	.Less_than_64		// shares the scalar routine's tail and epilogue

	add	x5,x5,x6,lsl#32	// pack
	add	x7,x7,x8,lsl#32
	ldp	x6,x8,[x1,#0]		// load input
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	ldp	x10,x12,[x1,#16]
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	ldp	x14,x16,[x1,#32]
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
	ldp	x19,x21,[x1,#48]
	add	x1,x1,#64
#ifdef __AARCH64EB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	eor	x5,x5,x6
	eor	x7,x7,x8
	eor	x9,x9,x10
	eor	x11,x11,x12
	eor	x13,x13,x14
	eor	x15,x15,x16
	eor	x17,x17,x19
	eor	x20,x20,x21

	stp	x5,x7,[x0,#0]		// store output
	add	x28,x28,#4			// increment counter
	stp	x9,x11,[x0,#16]
	stp	x13,x15,[x0,#32]
	stp	x17,x20,[x0,#48]
	add	x0,x0,#64
	b.eq	.Ldone_neon		// exactly 64 bytes were left
	sub	x2,x2,#64
	cmp	x2,#64
	b.lo	.Less_than_128

	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
	eor	v0.16b,v0.16b,v20.16b
	eor	v1.16b,v1.16b,v21.16b
	eor	v2.16b,v2.16b,v22.16b
	eor	v3.16b,v3.16b,v23.16b
	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
	b.eq	.Ldone_neon
	sub	x2,x2,#64
	cmp	x2,#64
	b.lo	.Less_than_192

	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
	eor	v4.16b,v4.16b,v20.16b
	eor	v5.16b,v5.16b,v21.16b
	eor	v6.16b,v6.16b,v22.16b
	eor	v7.16b,v7.16b,v23.16b
	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
	b.eq	.Ldone_neon
	sub	x2,x2,#64

	// Spill whichever NEON block covers the partial tail to the scratch
	// area so .Last_neon can XOR it one byte at a time.
	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[sp]
	b	.Last_neon

.Less_than_128:
	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[sp]
	b	.Last_neon
.Less_than_192:
	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[sp]
	b	.Last_neon

.align	4
.Last_neon:
	// Same negative-index scheme as the scalar tail: pointers are moved
	// past the end and x2 counts up to zero; x0 is biased by -1 because
	// the store uses x2 after the increment.
	sub	x0,x0,#1
	add	x1,x1,x2
	add	x0,x0,x2
	add	x4,sp,x2
	neg	x2,x2

.Loop_tail_neon:
	ldrb	w10,[x1,x2]
	ldrb	w11,[x4,x2]
	add	x2,x2,#1
	eor	w10,w10,w11
	strb	w10,[x0,x2]
	cbnz	x2,.Loop_tail_neon

	// Overwrite the keystream left in the scratch area with zeros.
	stp	xzr,xzr,[sp,#0]
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

.Ldone_neon:
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ChaCha20_ctr32_neon,.-ChaCha20_ctr32_neon
.type	ChaCha20_512_neon,%function
.align	5
ChaCha20_512_neon:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-96]!
803 add x29,sp,#0 804 805 adrp x5,.Lsigma 806 add x5,x5,:lo12:.Lsigma 807 stp x19,x20,[sp,#16] 808 stp x21,x22,[sp,#32] 809 stp x23,x24,[sp,#48] 810 stp x25,x26,[sp,#64] 811 stp x27,x28,[sp,#80] 812 813.L512_or_more_neon: 814 sub sp,sp,#128+64 815 816 ldp x22,x23,[x5] // load sigma 817 ld1 {v24.4s},[x5],#16 818 ldp x24,x25,[x3] // load key 819 ldp x26,x27,[x3,#16] 820 ld1 {v25.4s,v26.4s},[x3] 821 ldp x28,x30,[x4] // load counter 822 ld1 {v27.4s},[x4] 823 ld1 {v31.4s},[x5] 824#ifdef __AARCH64EB__ 825 rev64 v24.4s,v24.4s 826 ror x24,x24,#32 827 ror x25,x25,#32 828 ror x26,x26,#32 829 ror x27,x27,#32 830 ror x28,x28,#32 831 ror x30,x30,#32 832#endif 833 add v27.4s,v27.4s,v31.4s // += 1 834 stp q24,q25,[sp,#0] // off-load key block, invariant part 835 add v27.4s,v27.4s,v31.4s // not typo 836 str q26,[sp,#32] 837 add v28.4s,v27.4s,v31.4s 838 add v29.4s,v28.4s,v31.4s 839 add v30.4s,v29.4s,v31.4s 840 shl v31.4s,v31.4s,#2 // 1 -> 4 841 842 stp d8,d9,[sp,#128+0] // meet ABI requirements 843 stp d10,d11,[sp,#128+16] 844 stp d12,d13,[sp,#128+32] 845 stp d14,d15,[sp,#128+48] 846 847 sub x2,x2,#512 // not typo 848 849.Loop_outer_512_neon: 850 mov v0.16b,v24.16b 851 mov v4.16b,v24.16b 852 mov v8.16b,v24.16b 853 mov v12.16b,v24.16b 854 mov v16.16b,v24.16b 855 mov v20.16b,v24.16b 856 mov v1.16b,v25.16b 857 mov w5,w22 // unpack key block 858 mov v5.16b,v25.16b 859 lsr x6,x22,#32 860 mov v9.16b,v25.16b 861 mov w7,w23 862 mov v13.16b,v25.16b 863 lsr x8,x23,#32 864 mov v17.16b,v25.16b 865 mov w9,w24 866 mov v21.16b,v25.16b 867 lsr x10,x24,#32 868 mov v3.16b,v27.16b 869 mov w11,w25 870 mov v7.16b,v28.16b 871 lsr x12,x25,#32 872 mov v11.16b,v29.16b 873 mov w13,w26 874 mov v15.16b,v30.16b 875 lsr x14,x26,#32 876 mov v2.16b,v26.16b 877 mov w15,w27 878 mov v6.16b,v26.16b 879 lsr x16,x27,#32 880 add v19.4s,v3.4s,v31.4s // +4 881 mov w17,w28 882 add v23.4s,v7.4s,v31.4s // +4 883 lsr x19,x28,#32 884 mov v10.16b,v26.16b 885 mov w20,w30 886 mov v14.16b,v26.16b 887 lsr x21,x30,#32 888 mov 
v18.16b,v26.16b 889 stp q27,q28,[sp,#48] // off-load key block, variable part 890 mov v22.16b,v26.16b 891 str q29,[sp,#80] 892 893 mov x4,#5 894 subs x2,x2,#512 895.Loop_upper_neon: 896 sub x4,x4,#1 897 add v0.4s,v0.4s,v1.4s 898 add w5,w5,w9 899 add v4.4s,v4.4s,v5.4s 900 add w6,w6,w10 901 add v8.4s,v8.4s,v9.4s 902 add w7,w7,w11 903 add v12.4s,v12.4s,v13.4s 904 add w8,w8,w12 905 add v16.4s,v16.4s,v17.4s 906 eor w17,w17,w5 907 add v20.4s,v20.4s,v21.4s 908 eor w19,w19,w6 909 eor v3.16b,v3.16b,v0.16b 910 eor w20,w20,w7 911 eor v7.16b,v7.16b,v4.16b 912 eor w21,w21,w8 913 eor v11.16b,v11.16b,v8.16b 914 ror w17,w17,#16 915 eor v15.16b,v15.16b,v12.16b 916 ror w19,w19,#16 917 eor v19.16b,v19.16b,v16.16b 918 ror w20,w20,#16 919 eor v23.16b,v23.16b,v20.16b 920 ror w21,w21,#16 921 rev32 v3.8h,v3.8h 922 add w13,w13,w17 923 rev32 v7.8h,v7.8h 924 add w14,w14,w19 925 rev32 v11.8h,v11.8h 926 add w15,w15,w20 927 rev32 v15.8h,v15.8h 928 add w16,w16,w21 929 rev32 v19.8h,v19.8h 930 eor w9,w9,w13 931 rev32 v23.8h,v23.8h 932 eor w10,w10,w14 933 add v2.4s,v2.4s,v3.4s 934 eor w11,w11,w15 935 add v6.4s,v6.4s,v7.4s 936 eor w12,w12,w16 937 add v10.4s,v10.4s,v11.4s 938 ror w9,w9,#20 939 add v14.4s,v14.4s,v15.4s 940 ror w10,w10,#20 941 add v18.4s,v18.4s,v19.4s 942 ror w11,w11,#20 943 add v22.4s,v22.4s,v23.4s 944 ror w12,w12,#20 945 eor v24.16b,v1.16b,v2.16b 946 add w5,w5,w9 947 eor v25.16b,v5.16b,v6.16b 948 add w6,w6,w10 949 eor v26.16b,v9.16b,v10.16b 950 add w7,w7,w11 951 eor v27.16b,v13.16b,v14.16b 952 add w8,w8,w12 953 eor v28.16b,v17.16b,v18.16b 954 eor w17,w17,w5 955 eor v29.16b,v21.16b,v22.16b 956 eor w19,w19,w6 957 ushr v1.4s,v24.4s,#20 958 eor w20,w20,w7 959 ushr v5.4s,v25.4s,#20 960 eor w21,w21,w8 961 ushr v9.4s,v26.4s,#20 962 ror w17,w17,#24 963 ushr v13.4s,v27.4s,#20 964 ror w19,w19,#24 965 ushr v17.4s,v28.4s,#20 966 ror w20,w20,#24 967 ushr v21.4s,v29.4s,#20 968 ror w21,w21,#24 969 sli v1.4s,v24.4s,#12 970 add w13,w13,w17 971 sli v5.4s,v25.4s,#12 972 add w14,w14,w19 973 sli 
v9.4s,v26.4s,#12 974 add w15,w15,w20 975 sli v13.4s,v27.4s,#12 976 add w16,w16,w21 977 sli v17.4s,v28.4s,#12 978 eor w9,w9,w13 979 sli v21.4s,v29.4s,#12 980 eor w10,w10,w14 981 add v0.4s,v0.4s,v1.4s 982 eor w11,w11,w15 983 add v4.4s,v4.4s,v5.4s 984 eor w12,w12,w16 985 add v8.4s,v8.4s,v9.4s 986 ror w9,w9,#25 987 add v12.4s,v12.4s,v13.4s 988 ror w10,w10,#25 989 add v16.4s,v16.4s,v17.4s 990 ror w11,w11,#25 991 add v20.4s,v20.4s,v21.4s 992 ror w12,w12,#25 993 eor v24.16b,v3.16b,v0.16b 994 add w5,w5,w10 995 eor v25.16b,v7.16b,v4.16b 996 add w6,w6,w11 997 eor v26.16b,v11.16b,v8.16b 998 add w7,w7,w12 999 eor v27.16b,v15.16b,v12.16b 1000 add w8,w8,w9 1001 eor v28.16b,v19.16b,v16.16b 1002 eor w21,w21,w5 1003 eor v29.16b,v23.16b,v20.16b 1004 eor w17,w17,w6 1005 ushr v3.4s,v24.4s,#24 1006 eor w19,w19,w7 1007 ushr v7.4s,v25.4s,#24 1008 eor w20,w20,w8 1009 ushr v11.4s,v26.4s,#24 1010 ror w21,w21,#16 1011 ushr v15.4s,v27.4s,#24 1012 ror w17,w17,#16 1013 ushr v19.4s,v28.4s,#24 1014 ror w19,w19,#16 1015 ushr v23.4s,v29.4s,#24 1016 ror w20,w20,#16 1017 sli v3.4s,v24.4s,#8 1018 add w15,w15,w21 1019 sli v7.4s,v25.4s,#8 1020 add w16,w16,w17 1021 sli v11.4s,v26.4s,#8 1022 add w13,w13,w19 1023 sli v15.4s,v27.4s,#8 1024 add w14,w14,w20 1025 sli v19.4s,v28.4s,#8 1026 eor w10,w10,w15 1027 sli v23.4s,v29.4s,#8 1028 eor w11,w11,w16 1029 add v2.4s,v2.4s,v3.4s 1030 eor w12,w12,w13 1031 add v6.4s,v6.4s,v7.4s 1032 eor w9,w9,w14 1033 add v10.4s,v10.4s,v11.4s 1034 ror w10,w10,#20 1035 add v14.4s,v14.4s,v15.4s 1036 ror w11,w11,#20 1037 add v18.4s,v18.4s,v19.4s 1038 ror w12,w12,#20 1039 add v22.4s,v22.4s,v23.4s 1040 ror w9,w9,#20 1041 eor v24.16b,v1.16b,v2.16b 1042 add w5,w5,w10 1043 eor v25.16b,v5.16b,v6.16b 1044 add w6,w6,w11 1045 eor v26.16b,v9.16b,v10.16b 1046 add w7,w7,w12 1047 eor v27.16b,v13.16b,v14.16b 1048 add w8,w8,w9 1049 eor v28.16b,v17.16b,v18.16b 1050 eor w21,w21,w5 1051 eor v29.16b,v21.16b,v22.16b 1052 eor w17,w17,w6 1053 ushr v1.4s,v24.4s,#25 1054 eor w19,w19,w7 1055 ushr 
v5.4s,v25.4s,#25 1056 eor w20,w20,w8 1057 ushr v9.4s,v26.4s,#25 1058 ror w21,w21,#24 1059 ushr v13.4s,v27.4s,#25 1060 ror w17,w17,#24 1061 ushr v17.4s,v28.4s,#25 1062 ror w19,w19,#24 1063 ushr v21.4s,v29.4s,#25 1064 ror w20,w20,#24 1065 sli v1.4s,v24.4s,#7 1066 add w15,w15,w21 1067 sli v5.4s,v25.4s,#7 1068 add w16,w16,w17 1069 sli v9.4s,v26.4s,#7 1070 add w13,w13,w19 1071 sli v13.4s,v27.4s,#7 1072 add w14,w14,w20 1073 sli v17.4s,v28.4s,#7 1074 eor w10,w10,w15 1075 sli v21.4s,v29.4s,#7 1076 eor w11,w11,w16 1077 ext v2.16b,v2.16b,v2.16b,#8 1078 eor w12,w12,w13 1079 ext v6.16b,v6.16b,v6.16b,#8 1080 eor w9,w9,w14 1081 ext v10.16b,v10.16b,v10.16b,#8 1082 ror w10,w10,#25 1083 ext v14.16b,v14.16b,v14.16b,#8 1084 ror w11,w11,#25 1085 ext v18.16b,v18.16b,v18.16b,#8 1086 ror w12,w12,#25 1087 ext v22.16b,v22.16b,v22.16b,#8 1088 ror w9,w9,#25 1089 ext v3.16b,v3.16b,v3.16b,#12 1090 ext v7.16b,v7.16b,v7.16b,#12 1091 ext v11.16b,v11.16b,v11.16b,#12 1092 ext v15.16b,v15.16b,v15.16b,#12 1093 ext v19.16b,v19.16b,v19.16b,#12 1094 ext v23.16b,v23.16b,v23.16b,#12 1095 ext v1.16b,v1.16b,v1.16b,#4 1096 ext v5.16b,v5.16b,v5.16b,#4 1097 ext v9.16b,v9.16b,v9.16b,#4 1098 ext v13.16b,v13.16b,v13.16b,#4 1099 ext v17.16b,v17.16b,v17.16b,#4 1100 ext v21.16b,v21.16b,v21.16b,#4 1101 add v0.4s,v0.4s,v1.4s 1102 add w5,w5,w9 1103 add v4.4s,v4.4s,v5.4s 1104 add w6,w6,w10 1105 add v8.4s,v8.4s,v9.4s 1106 add w7,w7,w11 1107 add v12.4s,v12.4s,v13.4s 1108 add w8,w8,w12 1109 add v16.4s,v16.4s,v17.4s 1110 eor w17,w17,w5 1111 add v20.4s,v20.4s,v21.4s 1112 eor w19,w19,w6 1113 eor v3.16b,v3.16b,v0.16b 1114 eor w20,w20,w7 1115 eor v7.16b,v7.16b,v4.16b 1116 eor w21,w21,w8 1117 eor v11.16b,v11.16b,v8.16b 1118 ror w17,w17,#16 1119 eor v15.16b,v15.16b,v12.16b 1120 ror w19,w19,#16 1121 eor v19.16b,v19.16b,v16.16b 1122 ror w20,w20,#16 1123 eor v23.16b,v23.16b,v20.16b 1124 ror w21,w21,#16 1125 rev32 v3.8h,v3.8h 1126 add w13,w13,w17 1127 rev32 v7.8h,v7.8h 1128 add w14,w14,w19 1129 rev32 v11.8h,v11.8h 1130 add 
w15,w15,w20 1131 rev32 v15.8h,v15.8h 1132 add w16,w16,w21 1133 rev32 v19.8h,v19.8h 1134 eor w9,w9,w13 1135 rev32 v23.8h,v23.8h 1136 eor w10,w10,w14 1137 add v2.4s,v2.4s,v3.4s 1138 eor w11,w11,w15 1139 add v6.4s,v6.4s,v7.4s 1140 eor w12,w12,w16 1141 add v10.4s,v10.4s,v11.4s 1142 ror w9,w9,#20 1143 add v14.4s,v14.4s,v15.4s 1144 ror w10,w10,#20 1145 add v18.4s,v18.4s,v19.4s 1146 ror w11,w11,#20 1147 add v22.4s,v22.4s,v23.4s 1148 ror w12,w12,#20 1149 eor v24.16b,v1.16b,v2.16b 1150 add w5,w5,w9 1151 eor v25.16b,v5.16b,v6.16b 1152 add w6,w6,w10 1153 eor v26.16b,v9.16b,v10.16b 1154 add w7,w7,w11 1155 eor v27.16b,v13.16b,v14.16b 1156 add w8,w8,w12 1157 eor v28.16b,v17.16b,v18.16b 1158 eor w17,w17,w5 1159 eor v29.16b,v21.16b,v22.16b 1160 eor w19,w19,w6 1161 ushr v1.4s,v24.4s,#20 1162 eor w20,w20,w7 1163 ushr v5.4s,v25.4s,#20 1164 eor w21,w21,w8 1165 ushr v9.4s,v26.4s,#20 1166 ror w17,w17,#24 1167 ushr v13.4s,v27.4s,#20 1168 ror w19,w19,#24 1169 ushr v17.4s,v28.4s,#20 1170 ror w20,w20,#24 1171 ushr v21.4s,v29.4s,#20 1172 ror w21,w21,#24 1173 sli v1.4s,v24.4s,#12 1174 add w13,w13,w17 1175 sli v5.4s,v25.4s,#12 1176 add w14,w14,w19 1177 sli v9.4s,v26.4s,#12 1178 add w15,w15,w20 1179 sli v13.4s,v27.4s,#12 1180 add w16,w16,w21 1181 sli v17.4s,v28.4s,#12 1182 eor w9,w9,w13 1183 sli v21.4s,v29.4s,#12 1184 eor w10,w10,w14 1185 add v0.4s,v0.4s,v1.4s 1186 eor w11,w11,w15 1187 add v4.4s,v4.4s,v5.4s 1188 eor w12,w12,w16 1189 add v8.4s,v8.4s,v9.4s 1190 ror w9,w9,#25 1191 add v12.4s,v12.4s,v13.4s 1192 ror w10,w10,#25 1193 add v16.4s,v16.4s,v17.4s 1194 ror w11,w11,#25 1195 add v20.4s,v20.4s,v21.4s 1196 ror w12,w12,#25 1197 eor v24.16b,v3.16b,v0.16b 1198 add w5,w5,w10 1199 eor v25.16b,v7.16b,v4.16b 1200 add w6,w6,w11 1201 eor v26.16b,v11.16b,v8.16b 1202 add w7,w7,w12 1203 eor v27.16b,v15.16b,v12.16b 1204 add w8,w8,w9 1205 eor v28.16b,v19.16b,v16.16b 1206 eor w21,w21,w5 1207 eor v29.16b,v23.16b,v20.16b 1208 eor w17,w17,w6 1209 ushr v3.4s,v24.4s,#24 1210 eor w19,w19,w7 1211 ushr 
v7.4s,v25.4s,#24 1212 eor w20,w20,w8 1213 ushr v11.4s,v26.4s,#24 1214 ror w21,w21,#16 1215 ushr v15.4s,v27.4s,#24 1216 ror w17,w17,#16 1217 ushr v19.4s,v28.4s,#24 1218 ror w19,w19,#16 1219 ushr v23.4s,v29.4s,#24 1220 ror w20,w20,#16 1221 sli v3.4s,v24.4s,#8 1222 add w15,w15,w21 1223 sli v7.4s,v25.4s,#8 1224 add w16,w16,w17 1225 sli v11.4s,v26.4s,#8 1226 add w13,w13,w19 1227 sli v15.4s,v27.4s,#8 1228 add w14,w14,w20 1229 sli v19.4s,v28.4s,#8 1230 eor w10,w10,w15 1231 sli v23.4s,v29.4s,#8 1232 eor w11,w11,w16 1233 add v2.4s,v2.4s,v3.4s 1234 eor w12,w12,w13 1235 add v6.4s,v6.4s,v7.4s 1236 eor w9,w9,w14 1237 add v10.4s,v10.4s,v11.4s 1238 ror w10,w10,#20 1239 add v14.4s,v14.4s,v15.4s 1240 ror w11,w11,#20 1241 add v18.4s,v18.4s,v19.4s 1242 ror w12,w12,#20 1243 add v22.4s,v22.4s,v23.4s 1244 ror w9,w9,#20 1245 eor v24.16b,v1.16b,v2.16b 1246 add w5,w5,w10 1247 eor v25.16b,v5.16b,v6.16b 1248 add w6,w6,w11 1249 eor v26.16b,v9.16b,v10.16b 1250 add w7,w7,w12 1251 eor v27.16b,v13.16b,v14.16b 1252 add w8,w8,w9 1253 eor v28.16b,v17.16b,v18.16b 1254 eor w21,w21,w5 1255 eor v29.16b,v21.16b,v22.16b 1256 eor w17,w17,w6 1257 ushr v1.4s,v24.4s,#25 1258 eor w19,w19,w7 1259 ushr v5.4s,v25.4s,#25 1260 eor w20,w20,w8 1261 ushr v9.4s,v26.4s,#25 1262 ror w21,w21,#24 1263 ushr v13.4s,v27.4s,#25 1264 ror w17,w17,#24 1265 ushr v17.4s,v28.4s,#25 1266 ror w19,w19,#24 1267 ushr v21.4s,v29.4s,#25 1268 ror w20,w20,#24 1269 sli v1.4s,v24.4s,#7 1270 add w15,w15,w21 1271 sli v5.4s,v25.4s,#7 1272 add w16,w16,w17 1273 sli v9.4s,v26.4s,#7 1274 add w13,w13,w19 1275 sli v13.4s,v27.4s,#7 1276 add w14,w14,w20 1277 sli v17.4s,v28.4s,#7 1278 eor w10,w10,w15 1279 sli v21.4s,v29.4s,#7 1280 eor w11,w11,w16 1281 ext v2.16b,v2.16b,v2.16b,#8 1282 eor w12,w12,w13 1283 ext v6.16b,v6.16b,v6.16b,#8 1284 eor w9,w9,w14 1285 ext v10.16b,v10.16b,v10.16b,#8 1286 ror w10,w10,#25 1287 ext v14.16b,v14.16b,v14.16b,#8 1288 ror w11,w11,#25 1289 ext v18.16b,v18.16b,v18.16b,#8 1290 ror w12,w12,#25 1291 ext v22.16b,v22.16b,v22.16b,#8 
1292 ror w9,w9,#25 1293 ext v3.16b,v3.16b,v3.16b,#4 1294 ext v7.16b,v7.16b,v7.16b,#4 1295 ext v11.16b,v11.16b,v11.16b,#4 1296 ext v15.16b,v15.16b,v15.16b,#4 1297 ext v19.16b,v19.16b,v19.16b,#4 1298 ext v23.16b,v23.16b,v23.16b,#4 1299 ext v1.16b,v1.16b,v1.16b,#12 1300 ext v5.16b,v5.16b,v5.16b,#12 1301 ext v9.16b,v9.16b,v9.16b,#12 1302 ext v13.16b,v13.16b,v13.16b,#12 1303 ext v17.16b,v17.16b,v17.16b,#12 1304 ext v21.16b,v21.16b,v21.16b,#12 1305 cbnz x4,.Loop_upper_neon 1306 1307 add w5,w5,w22 // accumulate key block 1308 add x6,x6,x22,lsr#32 1309 add w7,w7,w23 1310 add x8,x8,x23,lsr#32 1311 add w9,w9,w24 1312 add x10,x10,x24,lsr#32 1313 add w11,w11,w25 1314 add x12,x12,x25,lsr#32 1315 add w13,w13,w26 1316 add x14,x14,x26,lsr#32 1317 add w15,w15,w27 1318 add x16,x16,x27,lsr#32 1319 add w17,w17,w28 1320 add x19,x19,x28,lsr#32 1321 add w20,w20,w30 1322 add x21,x21,x30,lsr#32 1323 1324 add x5,x5,x6,lsl#32 // pack 1325 add x7,x7,x8,lsl#32 1326 ldp x6,x8,[x1,#0] // load input 1327 add x9,x9,x10,lsl#32 1328 add x11,x11,x12,lsl#32 1329 ldp x10,x12,[x1,#16] 1330 add x13,x13,x14,lsl#32 1331 add x15,x15,x16,lsl#32 1332 ldp x14,x16,[x1,#32] 1333 add x17,x17,x19,lsl#32 1334 add x20,x20,x21,lsl#32 1335 ldp x19,x21,[x1,#48] 1336 add x1,x1,#64 1337#ifdef __AARCH64EB__ 1338 rev x5,x5 1339 rev x7,x7 1340 rev x9,x9 1341 rev x11,x11 1342 rev x13,x13 1343 rev x15,x15 1344 rev x17,x17 1345 rev x20,x20 1346#endif 1347 eor x5,x5,x6 1348 eor x7,x7,x8 1349 eor x9,x9,x10 1350 eor x11,x11,x12 1351 eor x13,x13,x14 1352 eor x15,x15,x16 1353 eor x17,x17,x19 1354 eor x20,x20,x21 1355 1356 stp x5,x7,[x0,#0] // store output 1357 add x28,x28,#1 // increment counter 1358 mov w5,w22 // unpack key block 1359 lsr x6,x22,#32 1360 stp x9,x11,[x0,#16] 1361 mov w7,w23 1362 lsr x8,x23,#32 1363 stp x13,x15,[x0,#32] 1364 mov w9,w24 1365 lsr x10,x24,#32 1366 stp x17,x20,[x0,#48] 1367 add x0,x0,#64 1368 mov w11,w25 1369 lsr x12,x25,#32 1370 mov w13,w26 1371 lsr x14,x26,#32 1372 mov w15,w27 1373 lsr x16,x27,#32 
1374 mov w17,w28 1375 lsr x19,x28,#32 1376 mov w20,w30 1377 lsr x21,x30,#32 1378 1379 mov x4,#5 1380.Loop_lower_neon: 1381 sub x4,x4,#1 1382 add v0.4s,v0.4s,v1.4s 1383 add w5,w5,w9 1384 add v4.4s,v4.4s,v5.4s 1385 add w6,w6,w10 1386 add v8.4s,v8.4s,v9.4s 1387 add w7,w7,w11 1388 add v12.4s,v12.4s,v13.4s 1389 add w8,w8,w12 1390 add v16.4s,v16.4s,v17.4s 1391 eor w17,w17,w5 1392 add v20.4s,v20.4s,v21.4s 1393 eor w19,w19,w6 1394 eor v3.16b,v3.16b,v0.16b 1395 eor w20,w20,w7 1396 eor v7.16b,v7.16b,v4.16b 1397 eor w21,w21,w8 1398 eor v11.16b,v11.16b,v8.16b 1399 ror w17,w17,#16 1400 eor v15.16b,v15.16b,v12.16b 1401 ror w19,w19,#16 1402 eor v19.16b,v19.16b,v16.16b 1403 ror w20,w20,#16 1404 eor v23.16b,v23.16b,v20.16b 1405 ror w21,w21,#16 1406 rev32 v3.8h,v3.8h 1407 add w13,w13,w17 1408 rev32 v7.8h,v7.8h 1409 add w14,w14,w19 1410 rev32 v11.8h,v11.8h 1411 add w15,w15,w20 1412 rev32 v15.8h,v15.8h 1413 add w16,w16,w21 1414 rev32 v19.8h,v19.8h 1415 eor w9,w9,w13 1416 rev32 v23.8h,v23.8h 1417 eor w10,w10,w14 1418 add v2.4s,v2.4s,v3.4s 1419 eor w11,w11,w15 1420 add v6.4s,v6.4s,v7.4s 1421 eor w12,w12,w16 1422 add v10.4s,v10.4s,v11.4s 1423 ror w9,w9,#20 1424 add v14.4s,v14.4s,v15.4s 1425 ror w10,w10,#20 1426 add v18.4s,v18.4s,v19.4s 1427 ror w11,w11,#20 1428 add v22.4s,v22.4s,v23.4s 1429 ror w12,w12,#20 1430 eor v24.16b,v1.16b,v2.16b 1431 add w5,w5,w9 1432 eor v25.16b,v5.16b,v6.16b 1433 add w6,w6,w10 1434 eor v26.16b,v9.16b,v10.16b 1435 add w7,w7,w11 1436 eor v27.16b,v13.16b,v14.16b 1437 add w8,w8,w12 1438 eor v28.16b,v17.16b,v18.16b 1439 eor w17,w17,w5 1440 eor v29.16b,v21.16b,v22.16b 1441 eor w19,w19,w6 1442 ushr v1.4s,v24.4s,#20 1443 eor w20,w20,w7 1444 ushr v5.4s,v25.4s,#20 1445 eor w21,w21,w8 1446 ushr v9.4s,v26.4s,#20 1447 ror w17,w17,#24 1448 ushr v13.4s,v27.4s,#20 1449 ror w19,w19,#24 1450 ushr v17.4s,v28.4s,#20 1451 ror w20,w20,#24 1452 ushr v21.4s,v29.4s,#20 1453 ror w21,w21,#24 1454 sli v1.4s,v24.4s,#12 1455 add w13,w13,w17 1456 sli v5.4s,v25.4s,#12 1457 add w14,w14,w19 
1458 sli v9.4s,v26.4s,#12 1459 add w15,w15,w20 1460 sli v13.4s,v27.4s,#12 1461 add w16,w16,w21 1462 sli v17.4s,v28.4s,#12 1463 eor w9,w9,w13 1464 sli v21.4s,v29.4s,#12 1465 eor w10,w10,w14 1466 add v0.4s,v0.4s,v1.4s 1467 eor w11,w11,w15 1468 add v4.4s,v4.4s,v5.4s 1469 eor w12,w12,w16 1470 add v8.4s,v8.4s,v9.4s 1471 ror w9,w9,#25 1472 add v12.4s,v12.4s,v13.4s 1473 ror w10,w10,#25 1474 add v16.4s,v16.4s,v17.4s 1475 ror w11,w11,#25 1476 add v20.4s,v20.4s,v21.4s 1477 ror w12,w12,#25 1478 eor v24.16b,v3.16b,v0.16b 1479 add w5,w5,w10 1480 eor v25.16b,v7.16b,v4.16b 1481 add w6,w6,w11 1482 eor v26.16b,v11.16b,v8.16b 1483 add w7,w7,w12 1484 eor v27.16b,v15.16b,v12.16b 1485 add w8,w8,w9 1486 eor v28.16b,v19.16b,v16.16b 1487 eor w21,w21,w5 1488 eor v29.16b,v23.16b,v20.16b 1489 eor w17,w17,w6 1490 ushr v3.4s,v24.4s,#24 1491 eor w19,w19,w7 1492 ushr v7.4s,v25.4s,#24 1493 eor w20,w20,w8 1494 ushr v11.4s,v26.4s,#24 1495 ror w21,w21,#16 1496 ushr v15.4s,v27.4s,#24 1497 ror w17,w17,#16 1498 ushr v19.4s,v28.4s,#24 1499 ror w19,w19,#16 1500 ushr v23.4s,v29.4s,#24 1501 ror w20,w20,#16 1502 sli v3.4s,v24.4s,#8 1503 add w15,w15,w21 1504 sli v7.4s,v25.4s,#8 1505 add w16,w16,w17 1506 sli v11.4s,v26.4s,#8 1507 add w13,w13,w19 1508 sli v15.4s,v27.4s,#8 1509 add w14,w14,w20 1510 sli v19.4s,v28.4s,#8 1511 eor w10,w10,w15 1512 sli v23.4s,v29.4s,#8 1513 eor w11,w11,w16 1514 add v2.4s,v2.4s,v3.4s 1515 eor w12,w12,w13 1516 add v6.4s,v6.4s,v7.4s 1517 eor w9,w9,w14 1518 add v10.4s,v10.4s,v11.4s 1519 ror w10,w10,#20 1520 add v14.4s,v14.4s,v15.4s 1521 ror w11,w11,#20 1522 add v18.4s,v18.4s,v19.4s 1523 ror w12,w12,#20 1524 add v22.4s,v22.4s,v23.4s 1525 ror w9,w9,#20 1526 eor v24.16b,v1.16b,v2.16b 1527 add w5,w5,w10 1528 eor v25.16b,v5.16b,v6.16b 1529 add w6,w6,w11 1530 eor v26.16b,v9.16b,v10.16b 1531 add w7,w7,w12 1532 eor v27.16b,v13.16b,v14.16b 1533 add w8,w8,w9 1534 eor v28.16b,v17.16b,v18.16b 1535 eor w21,w21,w5 1536 eor v29.16b,v21.16b,v22.16b 1537 eor w17,w17,w6 1538 ushr v1.4s,v24.4s,#25 1539 
eor w19,w19,w7 1540 ushr v5.4s,v25.4s,#25 1541 eor w20,w20,w8 1542 ushr v9.4s,v26.4s,#25 1543 ror w21,w21,#24 1544 ushr v13.4s,v27.4s,#25 1545 ror w17,w17,#24 1546 ushr v17.4s,v28.4s,#25 1547 ror w19,w19,#24 1548 ushr v21.4s,v29.4s,#25 1549 ror w20,w20,#24 1550 sli v1.4s,v24.4s,#7 1551 add w15,w15,w21 1552 sli v5.4s,v25.4s,#7 1553 add w16,w16,w17 1554 sli v9.4s,v26.4s,#7 1555 add w13,w13,w19 1556 sli v13.4s,v27.4s,#7 1557 add w14,w14,w20 1558 sli v17.4s,v28.4s,#7 1559 eor w10,w10,w15 1560 sli v21.4s,v29.4s,#7 1561 eor w11,w11,w16 1562 ext v2.16b,v2.16b,v2.16b,#8 1563 eor w12,w12,w13 1564 ext v6.16b,v6.16b,v6.16b,#8 1565 eor w9,w9,w14 1566 ext v10.16b,v10.16b,v10.16b,#8 1567 ror w10,w10,#25 1568 ext v14.16b,v14.16b,v14.16b,#8 1569 ror w11,w11,#25 1570 ext v18.16b,v18.16b,v18.16b,#8 1571 ror w12,w12,#25 1572 ext v22.16b,v22.16b,v22.16b,#8 1573 ror w9,w9,#25 1574 ext v3.16b,v3.16b,v3.16b,#12 1575 ext v7.16b,v7.16b,v7.16b,#12 1576 ext v11.16b,v11.16b,v11.16b,#12 1577 ext v15.16b,v15.16b,v15.16b,#12 1578 ext v19.16b,v19.16b,v19.16b,#12 1579 ext v23.16b,v23.16b,v23.16b,#12 1580 ext v1.16b,v1.16b,v1.16b,#4 1581 ext v5.16b,v5.16b,v5.16b,#4 1582 ext v9.16b,v9.16b,v9.16b,#4 1583 ext v13.16b,v13.16b,v13.16b,#4 1584 ext v17.16b,v17.16b,v17.16b,#4 1585 ext v21.16b,v21.16b,v21.16b,#4 1586 add v0.4s,v0.4s,v1.4s 1587 add w5,w5,w9 1588 add v4.4s,v4.4s,v5.4s 1589 add w6,w6,w10 1590 add v8.4s,v8.4s,v9.4s 1591 add w7,w7,w11 1592 add v12.4s,v12.4s,v13.4s 1593 add w8,w8,w12 1594 add v16.4s,v16.4s,v17.4s 1595 eor w17,w17,w5 1596 add v20.4s,v20.4s,v21.4s 1597 eor w19,w19,w6 1598 eor v3.16b,v3.16b,v0.16b 1599 eor w20,w20,w7 1600 eor v7.16b,v7.16b,v4.16b 1601 eor w21,w21,w8 1602 eor v11.16b,v11.16b,v8.16b 1603 ror w17,w17,#16 1604 eor v15.16b,v15.16b,v12.16b 1605 ror w19,w19,#16 1606 eor v19.16b,v19.16b,v16.16b 1607 ror w20,w20,#16 1608 eor v23.16b,v23.16b,v20.16b 1609 ror w21,w21,#16 1610 rev32 v3.8h,v3.8h 1611 add w13,w13,w17 1612 rev32 v7.8h,v7.8h 1613 add w14,w14,w19 1614 rev32 
v11.8h,v11.8h 1615 add w15,w15,w20 1616 rev32 v15.8h,v15.8h 1617 add w16,w16,w21 1618 rev32 v19.8h,v19.8h 1619 eor w9,w9,w13 1620 rev32 v23.8h,v23.8h 1621 eor w10,w10,w14 1622 add v2.4s,v2.4s,v3.4s 1623 eor w11,w11,w15 1624 add v6.4s,v6.4s,v7.4s 1625 eor w12,w12,w16 1626 add v10.4s,v10.4s,v11.4s 1627 ror w9,w9,#20 1628 add v14.4s,v14.4s,v15.4s 1629 ror w10,w10,#20 1630 add v18.4s,v18.4s,v19.4s 1631 ror w11,w11,#20 1632 add v22.4s,v22.4s,v23.4s 1633 ror w12,w12,#20 1634 eor v24.16b,v1.16b,v2.16b 1635 add w5,w5,w9 1636 eor v25.16b,v5.16b,v6.16b 1637 add w6,w6,w10 1638 eor v26.16b,v9.16b,v10.16b 1639 add w7,w7,w11 1640 eor v27.16b,v13.16b,v14.16b 1641 add w8,w8,w12 1642 eor v28.16b,v17.16b,v18.16b 1643 eor w17,w17,w5 1644 eor v29.16b,v21.16b,v22.16b 1645 eor w19,w19,w6 1646 ushr v1.4s,v24.4s,#20 1647 eor w20,w20,w7 1648 ushr v5.4s,v25.4s,#20 1649 eor w21,w21,w8 1650 ushr v9.4s,v26.4s,#20 1651 ror w17,w17,#24 1652 ushr v13.4s,v27.4s,#20 1653 ror w19,w19,#24 1654 ushr v17.4s,v28.4s,#20 1655 ror w20,w20,#24 1656 ushr v21.4s,v29.4s,#20 1657 ror w21,w21,#24 1658 sli v1.4s,v24.4s,#12 1659 add w13,w13,w17 1660 sli v5.4s,v25.4s,#12 1661 add w14,w14,w19 1662 sli v9.4s,v26.4s,#12 1663 add w15,w15,w20 1664 sli v13.4s,v27.4s,#12 1665 add w16,w16,w21 1666 sli v17.4s,v28.4s,#12 1667 eor w9,w9,w13 1668 sli v21.4s,v29.4s,#12 1669 eor w10,w10,w14 1670 add v0.4s,v0.4s,v1.4s 1671 eor w11,w11,w15 1672 add v4.4s,v4.4s,v5.4s 1673 eor w12,w12,w16 1674 add v8.4s,v8.4s,v9.4s 1675 ror w9,w9,#25 1676 add v12.4s,v12.4s,v13.4s 1677 ror w10,w10,#25 1678 add v16.4s,v16.4s,v17.4s 1679 ror w11,w11,#25 1680 add v20.4s,v20.4s,v21.4s 1681 ror w12,w12,#25 1682 eor v24.16b,v3.16b,v0.16b 1683 add w5,w5,w10 1684 eor v25.16b,v7.16b,v4.16b 1685 add w6,w6,w11 1686 eor v26.16b,v11.16b,v8.16b 1687 add w7,w7,w12 1688 eor v27.16b,v15.16b,v12.16b 1689 add w8,w8,w9 1690 eor v28.16b,v19.16b,v16.16b 1691 eor w21,w21,w5 1692 eor v29.16b,v23.16b,v20.16b 1693 eor w17,w17,w6 1694 ushr v3.4s,v24.4s,#24 1695 eor w19,w19,w7 
1696 ushr v7.4s,v25.4s,#24 1697 eor w20,w20,w8 1698 ushr v11.4s,v26.4s,#24 1699 ror w21,w21,#16 1700 ushr v15.4s,v27.4s,#24 1701 ror w17,w17,#16 1702 ushr v19.4s,v28.4s,#24 1703 ror w19,w19,#16 1704 ushr v23.4s,v29.4s,#24 1705 ror w20,w20,#16 1706 sli v3.4s,v24.4s,#8 1707 add w15,w15,w21 1708 sli v7.4s,v25.4s,#8 1709 add w16,w16,w17 1710 sli v11.4s,v26.4s,#8 1711 add w13,w13,w19 1712 sli v15.4s,v27.4s,#8 1713 add w14,w14,w20 1714 sli v19.4s,v28.4s,#8 1715 eor w10,w10,w15 1716 sli v23.4s,v29.4s,#8 1717 eor w11,w11,w16 1718 add v2.4s,v2.4s,v3.4s 1719 eor w12,w12,w13 1720 add v6.4s,v6.4s,v7.4s 1721 eor w9,w9,w14 1722 add v10.4s,v10.4s,v11.4s 1723 ror w10,w10,#20 1724 add v14.4s,v14.4s,v15.4s 1725 ror w11,w11,#20 1726 add v18.4s,v18.4s,v19.4s 1727 ror w12,w12,#20 1728 add v22.4s,v22.4s,v23.4s 1729 ror w9,w9,#20 1730 eor v24.16b,v1.16b,v2.16b 1731 add w5,w5,w10 1732 eor v25.16b,v5.16b,v6.16b 1733 add w6,w6,w11 1734 eor v26.16b,v9.16b,v10.16b 1735 add w7,w7,w12 1736 eor v27.16b,v13.16b,v14.16b 1737 add w8,w8,w9 1738 eor v28.16b,v17.16b,v18.16b 1739 eor w21,w21,w5 1740 eor v29.16b,v21.16b,v22.16b 1741 eor w17,w17,w6 1742 ushr v1.4s,v24.4s,#25 1743 eor w19,w19,w7 1744 ushr v5.4s,v25.4s,#25 1745 eor w20,w20,w8 1746 ushr v9.4s,v26.4s,#25 1747 ror w21,w21,#24 1748 ushr v13.4s,v27.4s,#25 1749 ror w17,w17,#24 1750 ushr v17.4s,v28.4s,#25 1751 ror w19,w19,#24 1752 ushr v21.4s,v29.4s,#25 1753 ror w20,w20,#24 1754 sli v1.4s,v24.4s,#7 1755 add w15,w15,w21 1756 sli v5.4s,v25.4s,#7 1757 add w16,w16,w17 1758 sli v9.4s,v26.4s,#7 1759 add w13,w13,w19 1760 sli v13.4s,v27.4s,#7 1761 add w14,w14,w20 1762 sli v17.4s,v28.4s,#7 1763 eor w10,w10,w15 1764 sli v21.4s,v29.4s,#7 1765 eor w11,w11,w16 1766 ext v2.16b,v2.16b,v2.16b,#8 1767 eor w12,w12,w13 1768 ext v6.16b,v6.16b,v6.16b,#8 1769 eor w9,w9,w14 1770 ext v10.16b,v10.16b,v10.16b,#8 1771 ror w10,w10,#25 1772 ext v14.16b,v14.16b,v14.16b,#8 1773 ror w11,w11,#25 1774 ext v18.16b,v18.16b,v18.16b,#8 1775 ror w12,w12,#25 1776 ext 
v22.16b,v22.16b,v22.16b,#8 1777 ror w9,w9,#25 1778 ext v3.16b,v3.16b,v3.16b,#4 1779 ext v7.16b,v7.16b,v7.16b,#4 1780 ext v11.16b,v11.16b,v11.16b,#4 1781 ext v15.16b,v15.16b,v15.16b,#4 1782 ext v19.16b,v19.16b,v19.16b,#4 1783 ext v23.16b,v23.16b,v23.16b,#4 1784 ext v1.16b,v1.16b,v1.16b,#12 1785 ext v5.16b,v5.16b,v5.16b,#12 1786 ext v9.16b,v9.16b,v9.16b,#12 1787 ext v13.16b,v13.16b,v13.16b,#12 1788 ext v17.16b,v17.16b,v17.16b,#12 1789 ext v21.16b,v21.16b,v21.16b,#12 1790 cbnz x4,.Loop_lower_neon 1791 1792 add w5,w5,w22 // accumulate key block 1793 ldp q24,q25,[sp,#0] 1794 add x6,x6,x22,lsr#32 1795 ldp q26,q27,[sp,#32] 1796 add w7,w7,w23 1797 ldp q28,q29,[sp,#64] 1798 add x8,x8,x23,lsr#32 1799 add v0.4s,v0.4s,v24.4s 1800 add w9,w9,w24 1801 add v4.4s,v4.4s,v24.4s 1802 add x10,x10,x24,lsr#32 1803 add v8.4s,v8.4s,v24.4s 1804 add w11,w11,w25 1805 add v12.4s,v12.4s,v24.4s 1806 add x12,x12,x25,lsr#32 1807 add v16.4s,v16.4s,v24.4s 1808 add w13,w13,w26 1809 add v20.4s,v20.4s,v24.4s 1810 add x14,x14,x26,lsr#32 1811 add v2.4s,v2.4s,v26.4s 1812 add w15,w15,w27 1813 add v6.4s,v6.4s,v26.4s 1814 add x16,x16,x27,lsr#32 1815 add v10.4s,v10.4s,v26.4s 1816 add w17,w17,w28 1817 add v14.4s,v14.4s,v26.4s 1818 add x19,x19,x28,lsr#32 1819 add v18.4s,v18.4s,v26.4s 1820 add w20,w20,w30 1821 add v22.4s,v22.4s,v26.4s 1822 add x21,x21,x30,lsr#32 1823 add v19.4s,v19.4s,v31.4s // +4 1824 add x5,x5,x6,lsl#32 // pack 1825 add v23.4s,v23.4s,v31.4s // +4 1826 add x7,x7,x8,lsl#32 1827 add v3.4s,v3.4s,v27.4s 1828 ldp x6,x8,[x1,#0] // load input 1829 add v7.4s,v7.4s,v28.4s 1830 add x9,x9,x10,lsl#32 1831 add v11.4s,v11.4s,v29.4s 1832 add x11,x11,x12,lsl#32 1833 add v15.4s,v15.4s,v30.4s 1834 ldp x10,x12,[x1,#16] 1835 add v19.4s,v19.4s,v27.4s 1836 add x13,x13,x14,lsl#32 1837 add v23.4s,v23.4s,v28.4s 1838 add x15,x15,x16,lsl#32 1839 add v1.4s,v1.4s,v25.4s 1840 ldp x14,x16,[x1,#32] 1841 add v5.4s,v5.4s,v25.4s 1842 add x17,x17,x19,lsl#32 1843 add v9.4s,v9.4s,v25.4s 1844 add x20,x20,x21,lsl#32 1845 add 
v13.4s,v13.4s,v25.4s 1846 ldp x19,x21,[x1,#48] 1847 add v17.4s,v17.4s,v25.4s 1848 add x1,x1,#64 1849 add v21.4s,v21.4s,v25.4s 1850 1851#ifdef __AARCH64EB__ 1852 rev x5,x5 1853 rev x7,x7 1854 rev x9,x9 1855 rev x11,x11 1856 rev x13,x13 1857 rev x15,x15 1858 rev x17,x17 1859 rev x20,x20 1860#endif 1861 ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 1862 eor x5,x5,x6 1863 eor x7,x7,x8 1864 eor x9,x9,x10 1865 eor x11,x11,x12 1866 eor x13,x13,x14 1867 eor v0.16b,v0.16b,v24.16b 1868 eor x15,x15,x16 1869 eor v1.16b,v1.16b,v25.16b 1870 eor x17,x17,x19 1871 eor v2.16b,v2.16b,v26.16b 1872 eor x20,x20,x21 1873 eor v3.16b,v3.16b,v27.16b 1874 ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 1875 1876 stp x5,x7,[x0,#0] // store output 1877 add x28,x28,#7 // increment counter 1878 stp x9,x11,[x0,#16] 1879 stp x13,x15,[x0,#32] 1880 stp x17,x20,[x0,#48] 1881 add x0,x0,#64 1882 st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 1883 1884 ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 1885 eor v4.16b,v4.16b,v24.16b 1886 eor v5.16b,v5.16b,v25.16b 1887 eor v6.16b,v6.16b,v26.16b 1888 eor v7.16b,v7.16b,v27.16b 1889 st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 1890 1891 ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 1892 eor v8.16b,v8.16b,v0.16b 1893 ldp q24,q25,[sp,#0] 1894 eor v9.16b,v9.16b,v1.16b 1895 ldp q26,q27,[sp,#32] 1896 eor v10.16b,v10.16b,v2.16b 1897 eor v11.16b,v11.16b,v3.16b 1898 st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64 1899 1900 ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64 1901 eor v12.16b,v12.16b,v4.16b 1902 eor v13.16b,v13.16b,v5.16b 1903 eor v14.16b,v14.16b,v6.16b 1904 eor v15.16b,v15.16b,v7.16b 1905 st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64 1906 1907 ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64 1908 eor v16.16b,v16.16b,v8.16b 1909 eor v17.16b,v17.16b,v9.16b 1910 eor v18.16b,v18.16b,v10.16b 1911 eor v19.16b,v19.16b,v11.16b 1912 st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 1913 1914 shl v0.4s,v31.4s,#1 // 4 -> 8 1915 eor v20.16b,v20.16b,v12.16b 1916 eor v21.16b,v21.16b,v13.16b 
1917 eor v22.16b,v22.16b,v14.16b 1918 eor v23.16b,v23.16b,v15.16b 1919 st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64 1920 1921 add v27.4s,v27.4s,v0.4s // += 8 1922 add v28.4s,v28.4s,v0.4s 1923 add v29.4s,v29.4s,v0.4s 1924 add v30.4s,v30.4s,v0.4s 1925 1926 b.hs .Loop_outer_512_neon 1927 1928 adds x2,x2,#512 1929 ushr v0.4s,v31.4s,#2 // 4 -> 1 1930 1931 ldp d8,d9,[sp,#128+0] // meet ABI requirements 1932 ldp d10,d11,[sp,#128+16] 1933 ldp d12,d13,[sp,#128+32] 1934 ldp d14,d15,[sp,#128+48] 1935 1936 stp q24,q31,[sp,#0] // wipe off-load area 1937 stp q24,q31,[sp,#32] 1938 stp q24,q31,[sp,#64] 1939 1940 b.eq .Ldone_512_neon 1941 1942 cmp x2,#192 1943 sub v27.4s,v27.4s,v0.4s // -= 1 1944 sub v28.4s,v28.4s,v0.4s 1945 sub v29.4s,v29.4s,v0.4s 1946 add sp,sp,#128 1947 b.hs .Loop_outer_neon 1948 1949 eor v25.16b,v25.16b,v25.16b 1950 eor v26.16b,v26.16b,v26.16b 1951 eor v27.16b,v27.16b,v27.16b 1952 eor v28.16b,v28.16b,v28.16b 1953 eor v29.16b,v29.16b,v29.16b 1954 eor v30.16b,v30.16b,v30.16b 1955 b .Loop_outer 1956 1957.Ldone_512_neon: 1958 ldp x19,x20,[x29,#16] 1959 add sp,sp,#128+64 1960 ldp x21,x22,[x29,#32] 1961 ldp x23,x24,[x29,#48] 1962 ldp x25,x26,[x29,#64] 1963 ldp x27,x28,[x29,#80] 1964 ldp x29,x30,[sp],#96 1965 AARCH64_VALIDATE_LINK_REGISTER 1966 ret 1967.size ChaCha20_512_neon,.-ChaCha20_512_neon 1968#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) 1969