// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#if !defined(__has_feature)
#define __has_feature(x) 0
#endif
#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM)
#define OPENSSL_NO_ASM
#endif

#if !defined(OPENSSL_NO_ASM)
#if defined(__aarch64__)
#if defined(BORINGSSL_PREFIX)
#include <boringssl_prefix_symbols_asm.h>
#endif
#include <openssl/arm_arch.h>


.hidden	OPENSSL_armcap_P

.section	.rodata

// Constant pool shared by every routine in this file.
//   .Lsigma  16 bytes that read "expand 32-byte k" in memory order on a
//            little-endian target; they form row 0 of each block state.
//   .Lone    the vector {1,0,0,0}. It sits directly after .Lsigma, which
//            the NEON setup relies on when it post-increments its pointer
//            by 16 and loads from the same register again.
.align	5
.Lsigma:
.quad	0x3320646e61707865,0x6b20657479622d32		// endian-neutral
.Lone:
.long	1,0,0,0
// NUL-terminated ASCII: "ChaCha20 for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
.byte	67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2

.text

// ---------------------------------------------------------------------
// ChaCha20_ctr32
//
// XORs the bytes at x1 with ChaCha20 key stream and writes the result
// through x0.
//
// Register use on entry, as read by the code below:
//   x0  output pointer (stp/strb destination, advanced 64 bytes per block)
//   x1  input pointer  (ldp/ldrb source, advanced 64 bytes per block)
//   x2  length in bytes; zero returns immediately via .Labort
//   x3  pointer to 32 bytes of key material
//   x4  pointer to 16 bytes of counter/nonce words
// NOTE(review): presumably this matches the C prototype
//   void ChaCha20_ctr32(uint8_t *out, const uint8_t *in, size_t in_len,
//                       const uint32_t key[8], const uint32_t counter[4]);
// confirm against the declaring header.
//
// Dispatch: lengths of 192 bytes or more are handed to ChaCha20_neon when
// the NEON bit is set in OPENSSL_armcap_P; everything else runs the
// scalar loop at .Lshort.
//
// Frame: 96 bytes holding x29, x30 and x19-x28, plus 64 bytes of scratch
// below it that the tail code uses to stage one key-stream block.
//
// Scalar state layout (x18 is never touched):
//   row 0 (sigma)    w5  w6  w7  w8
//   row 1 (key lo)   w9  w10 w11 w12
//   row 2 (key hi)   w13 w14 w15 w16
//   row 3 (counter)  w17 w19 w20 w21
// The pristine input block stays packed two words per register in
// x22-x28 and x30 for the whole call.
// ---------------------------------------------------------------------
.globl	ChaCha20_ctr32
.hidden	ChaCha20_ctr32
.type	ChaCha20_ctr32,%function
.align	5
ChaCha20_ctr32:
	AARCH64_VALID_CALL_TARGET
	cbz	x2,.Labort
#if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10
	adrp	x5,:pg_hi21_nc:OPENSSL_armcap_P
#else
	adrp	x5,OPENSSL_armcap_P
#endif
	cmp	x2,#192
	b.lo	.Lshort
	ldr	w17,[x5,:lo12:OPENSSL_armcap_P]
	tst	w17,#ARMV7_NEON
	b.ne	ChaCha20_neon

.Lshort:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adrp	x5,.Lsigma
	add	x5,x5,:lo12:.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#64

	// x30 is used as a plain data register from here on; the return
	// address is reloaded from the frame in the epilogue.
	ldp	x22,x23,[x5]		// load sigma
	ldp	x24,x25,[x3]		// load key
	ldp	x26,x27,[x3,#16]
	ldp	x28,x30,[x4]		// load counter
#ifdef	__ARMEB__
	// Big-endian build: swap the two 32-bit halves of each loaded pair.
	ror	x24,x24,#32
	ror	x25,x25,#32
	ror	x26,x26,#32
	ror	x27,x27,#32
	ror	x28,x28,#32
	ror	x30,x30,#32
#endif

	// One pass of .Loop_outer produces one 64-byte block.
.Loop_outer:
	mov	w5,w22			// unpack key block
	lsr	x6,x22,#32
	mov	w7,w23
	lsr	x8,x23,#32
	mov	w9,w24
	lsr	x10,x24,#32
	mov	w11,w25
	lsr	x12,x25,#32
	mov	w13,w26
	lsr	x14,x26,#32
	mov	w15,w27
	lsr	x16,x27,#32
	mov	w17,w28
	lsr	x19,x28,#32
	mov	w20,w30
	lsr	x21,x30,#32

	// x4 is reused as the round counter: 10 iterations, each a column
	// round followed by a diagonal round. The flags set by the subs are
	// consumed only after the loop (b.lo .Ltail, b.hi .Loop_outer);
	// nothing in between writes the condition flags.
	mov	x4,#10
	subs	x2,x2,#64
.Loop:
	sub	x4,x4,#1
	// Column round on (w5,w9,w13,w17) (w6,w10,w14,w19)
	// (w7,w11,w15,w20) (w8,w12,w16,w21).
	// Left rotations by 16/12/8/7 are written as ror by 16/20/24/25.
	add	w5,w5,w9
	add	w6,w6,w10
	add	w7,w7,w11
	add	w8,w8,w12
	eor	w17,w17,w5
	eor	w19,w19,w6
	eor	w20,w20,w7
	eor	w21,w21,w8
	ror	w17,w17,#16
	ror	w19,w19,#16
	ror	w20,w20,#16
	ror	w21,w21,#16
	add	w13,w13,w17
	add	w14,w14,w19
	add	w15,w15,w20
	add	w16,w16,w21
	eor	w9,w9,w13
	eor	w10,w10,w14
	eor	w11,w11,w15
	eor	w12,w12,w16
	ror	w9,w9,#20
	ror	w10,w10,#20
	ror	w11,w11,#20
	ror	w12,w12,#20
	add	w5,w5,w9
	add	w6,w6,w10
	add	w7,w7,w11
	add	w8,w8,w12
	eor	w17,w17,w5
	eor	w19,w19,w6
	eor	w20,w20,w7
	eor	w21,w21,w8
	ror	w17,w17,#24
	ror	w19,w19,#24
	ror	w20,w20,#24
	ror	w21,w21,#24
	add	w13,w13,w17
	add	w14,w14,w19
	add	w15,w15,w20
	add	w16,w16,w21
	eor	w9,w9,w13
	eor	w10,w10,w14
	eor	w11,w11,w15
	eor	w12,w12,w16
	ror	w9,w9,#25
	ror	w10,w10,#25
	ror	w11,w11,#25
	ror	w12,w12,#25
	// Diagonal round on (w5,w10,w15,w21) (w6,w11,w16,w17)
	// (w7,w12,w13,w19) (w8,w9,w14,w20).
	add	w5,w5,w10
	add	w6,w6,w11
	add	w7,w7,w12
	add	w8,w8,w9
	eor	w21,w21,w5
	eor	w17,w17,w6
	eor	w19,w19,w7
	eor	w20,w20,w8
	ror	w21,w21,#16
	ror	w17,w17,#16
	ror	w19,w19,#16
	ror	w20,w20,#16
	add	w15,w15,w21
	add	w16,w16,w17
	add	w13,w13,w19
	add	w14,w14,w20
	eor	w10,w10,w15
	eor	w11,w11,w16
	eor	w12,w12,w13
	eor	w9,w9,w14
	ror	w10,w10,#20
	ror	w11,w11,#20
	ror	w12,w12,#20
	ror	w9,w9,#20
	add	w5,w5,w10
	add	w6,w6,w11
	add	w7,w7,w12
	add	w8,w8,w9
	eor	w21,w21,w5
	eor	w17,w17,w6
	eor	w19,w19,w7
	eor	w20,w20,w8
	ror	w21,w21,#24
	ror	w17,w17,#24
	ror	w19,w19,#24
	ror	w20,w20,#24
	add	w15,w15,w21
	add	w16,w16,w17
	add	w13,w13,w19
	add	w14,w14,w20
	eor	w10,w10,w15
	eor	w11,w11,w16
	eor	w12,w12,w13
	eor	w9,w9,w14
	ror	w10,w10,#25
	ror	w11,w11,#25
	ror	w12,w12,#25
	ror	w9,w9,#25
	cbnz	x4,.Loop

	// Add the input block back in. The odd-numbered words use a 64-bit
	// add with the packed register shifted right by 32; any carry above
	// bit 31 is discarded when the value is shifted left by 32 during
	// packing.
	add	w5,w5,w22		// accumulate key block
	add	x6,x6,x22,lsr#32
	add	w7,w7,w23
	add	x8,x8,x23,lsr#32
	add	w9,w9,w24
	add	x10,x10,x24,lsr#32
	add	w11,w11,w25
	add	x12,x12,x25,lsr#32
	add	w13,w13,w26
	add	x14,x14,x26,lsr#32
	add	w15,w15,w27
	add	x16,x16,x27,lsr#32
	add	w17,w17,w28
	add	x19,x19,x28,lsr#32
	add	w20,w20,w30
	add	x21,x21,x30,lsr#32

	// Flags are still those of "subs x2,x2,#64": fewer than 64 bytes
	// were left, so only part of this block is consumed.
	b.lo	.Ltail

	add	x5,x5,x6,lsl#32	// pack
	add	x7,x7,x8,lsl#32
	ldp	x6,x8,[x1,#0]		// load input
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	ldp	x10,x12,[x1,#16]
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	ldp	x14,x16,[x1,#32]
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
	ldp	x19,x21,[x1,#48]
	add	x1,x1,#64
#ifdef	__ARMEB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	eor	x5,x5,x6
	eor	x7,x7,x8
	eor	x9,x9,x10
	eor	x11,x11,x12
	eor	x13,x13,x14
	eor	x15,x15,x16
	eor	x17,x17,x19
	eor	x20,x20,x21

	stp	x5,x7,[x0,#0]		// store output
	add	x28,x28,#1			// increment counter
	stp	x9,x11,[x0,#16]
	stp	x13,x15,[x0,#32]
	stp	x17,x20,[x0,#48]
	add	x0,x0,#64

	// Still the flags from the subs: loop while bytes remain.
	b.hi	.Loop_outer

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	// The zero-length early exit lands here; on that path no frame was
	// created, so a bare ret is all that is needed.
.Labort:
	ret

.align	4
.Ltail:
	add	x2,x2,#64		// undo the subs: x2 = bytes remaining (1..63)
	// Also entered from ChaCha20_neon, which uses an identical frame,
	// with x2 = bytes remaining and the scalar block in w5-w21.
.Less_than_64:
	// Bias the pointers by the remaining length so one negative index
	// counting up to zero walks input, key stream and output:
	//   x1 = in + len, x4 = sp + len, x0 = out + len - 1, x2 = -len.
	// The store in the loop runs after x2 is incremented, which is why
	// x0 carries the extra -1.
	sub	x0,x0,#1
	add	x1,x1,x2
	add	x0,x0,x2
	add	x4,sp,x2
	neg	x2,x2

	add	x5,x5,x6,lsl#32	// pack
	add	x7,x7,x8,lsl#32
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
#ifdef	__ARMEB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	// Stage the whole key-stream block in the 64-byte stack scratch.
	stp	x5,x7,[sp,#0]
	stp	x9,x11,[sp,#16]
	stp	x13,x15,[sp,#32]
	stp	x17,x20,[sp,#48]

.Loop_tail:
	ldrb	w10,[x1,x2]
	ldrb	w11,[x4,x2]
	add	x2,x2,#1
	eor	w10,w10,w11
	strb	w10,[x0,x2]
	cbnz	x2,.Loop_tail

	// Overwrite the staged key stream with zeros before returning.
	stp	xzr,xzr,[sp,#0]
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ChaCha20_ctr32,.-ChaCha20_ctr32

// ---------------------------------------------------------------------
// ChaCha20_neon
//
// File-local routine (no .globl) reached by a branch from ChaCha20_ctr32
// with x0-x4 unchanged from that entry. Lengths of 512 bytes or more are
// forwarded to .L512_or_more_neon; shorter inputs are handled here.
//
// Each pass of .Loop_outer_neon yields 256 bytes of key stream: one block
// in general-purpose registers (same layout as ChaCha20_ctr32) interleaved
// with three blocks in NEON registers, one state row per vector:
//   block A  v0  v1  v2  v3     counter row from v27 (base + 1)
//   block B  v4  v5  v6  v7     counter row from v28 (base + 2)
//   block C  v16 v17 v18 v19    counter row from v29 (base + 3)
// v24 = sigma row, v25/v26 = key rows, v31 = per-iteration increment,
// v20-v22 are rotate temporaries, v20-v23 also buffer input.
// Output order per iteration: scalar block, then A, B, C.
// ---------------------------------------------------------------------
.type	ChaCha20_neon,%function
.align	5
ChaCha20_neon:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-96]!
	add	x29,sp,#0

	adrp	x5,.Lsigma
	add	x5,x5,:lo12:.Lsigma
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	cmp	x2,#512
	b.hs	.L512_or_more_neon

	sub	sp,sp,#64

	ldp	x22,x23,[x5]		// load sigma
	ld1	{v24.4s},[x5],#16
	ldp	x24,x25,[x3]		// load key
	ldp	x26,x27,[x3,#16]
	ld1	{v25.4s,v26.4s},[x3]
	ldp	x28,x30,[x4]		// load counter
	ld1	{v27.4s},[x4]
	ld1	{v31.4s},[x5]		// x5 was advanced past sigma: loads .Lone
#ifdef	__ARMEB__
	rev64	v24.4s,v24.4s
	ror	x24,x24,#32
	ror	x25,x25,#32
	ror	x26,x26,#32
	ror	x27,x27,#32
	ror	x28,x28,#32
	ror	x30,x30,#32
#endif
	add	v27.4s,v27.4s,v31.4s		// += 1
	add	v28.4s,v27.4s,v31.4s
	add	v29.4s,v28.4s,v31.4s
	shl	v31.4s,v31.4s,#2			// 1 -> 4

.Loop_outer_neon:
	mov	w5,w22			// unpack key block
	lsr	x6,x22,#32
	mov	v0.16b,v24.16b
	mov	w7,w23
	lsr	x8,x23,#32
	mov	v4.16b,v24.16b
	mov	w9,w24
	lsr	x10,x24,#32
	mov	v16.16b,v24.16b
	mov	w11,w25
	mov	v1.16b,v25.16b
	lsr	x12,x25,#32
	mov	v5.16b,v25.16b
	mov	w13,w26
	mov	v17.16b,v25.16b
	lsr	x14,x26,#32
	mov	v3.16b,v27.16b
	mov	w15,w27
	mov	v7.16b,v28.16b
	lsr	x16,x27,#32
	mov	v19.16b,v29.16b
	mov	w17,w28
	mov	v2.16b,v26.16b
	lsr	x19,x28,#32
	mov	v6.16b,v26.16b
	mov	w20,w30
	mov	v18.16b,v26.16b
	lsr	x21,x30,#32

	// 10 double rounds. The flags from the subs are read only after the
	// loop (b.lo .Ltail_neon, b.hi .Loop_outer_neon); the loop body and
	// the load/XOR/store code in between do not write the flags.
	mov	x4,#10
	subs	x2,x2,#256
.Loop_neon:
	sub	x4,x4,#1
	// First half: column round. Scalar and vector instructions are
	// interleaved. Vector left rotations: by 16 via rev32 on halfwords,
	// by 12/8/7 via ushr into the destination followed by sli.
	add	v0.4s,v0.4s,v1.4s
	add	w5,w5,w9
	add	v4.4s,v4.4s,v5.4s
	add	w6,w6,w10
	add	v16.4s,v16.4s,v17.4s
	add	w7,w7,w11
	eor	v3.16b,v3.16b,v0.16b
	add	w8,w8,w12
	eor	v7.16b,v7.16b,v4.16b
	eor	w17,w17,w5
	eor	v19.16b,v19.16b,v16.16b
	eor	w19,w19,w6
	rev32	v3.8h,v3.8h
	eor	w20,w20,w7
	rev32	v7.8h,v7.8h
	eor	w21,w21,w8
	rev32	v19.8h,v19.8h
	ror	w17,w17,#16
	add	v2.4s,v2.4s,v3.4s
	ror	w19,w19,#16
	add	v6.4s,v6.4s,v7.4s
	ror	w20,w20,#16
	add	v18.4s,v18.4s,v19.4s
	ror	w21,w21,#16
	eor	v20.16b,v1.16b,v2.16b
	add	w13,w13,w17
	eor	v21.16b,v5.16b,v6.16b
	add	w14,w14,w19
	eor	v22.16b,v17.16b,v18.16b
	add	w15,w15,w20
	ushr	v1.4s,v20.4s,#20
	add	w16,w16,w21
	ushr	v5.4s,v21.4s,#20
	eor	w9,w9,w13
	ushr	v17.4s,v22.4s,#20
	eor	w10,w10,w14
	sli	v1.4s,v20.4s,#12
	eor	w11,w11,w15
	sli	v5.4s,v21.4s,#12
	eor	w12,w12,w16
	sli	v17.4s,v22.4s,#12
	ror	w9,w9,#20
	add	v0.4s,v0.4s,v1.4s
	ror	w10,w10,#20
	add	v4.4s,v4.4s,v5.4s
	ror	w11,w11,#20
	add	v16.4s,v16.4s,v17.4s
	ror	w12,w12,#20
	eor	v20.16b,v3.16b,v0.16b
	add	w5,w5,w9
	eor	v21.16b,v7.16b,v4.16b
	add	w6,w6,w10
	eor	v22.16b,v19.16b,v16.16b
	add	w7,w7,w11
	ushr	v3.4s,v20.4s,#24
	add	w8,w8,w12
	ushr	v7.4s,v21.4s,#24
	eor	w17,w17,w5
	ushr	v19.4s,v22.4s,#24
	eor	w19,w19,w6
	sli	v3.4s,v20.4s,#8
	eor	w20,w20,w7
	sli	v7.4s,v21.4s,#8
	eor	w21,w21,w8
	sli	v19.4s,v22.4s,#8
	ror	w17,w17,#24
	add	v2.4s,v2.4s,v3.4s
	ror	w19,w19,#24
	add	v6.4s,v6.4s,v7.4s
	ror	w20,w20,#24
	add	v18.4s,v18.4s,v19.4s
	ror	w21,w21,#24
	eor	v20.16b,v1.16b,v2.16b
	add	w13,w13,w17
	eor	v21.16b,v5.16b,v6.16b
	add	w14,w14,w19
	eor	v22.16b,v17.16b,v18.16b
	add	w15,w15,w20
	ushr	v1.4s,v20.4s,#25
	add	w16,w16,w21
	ushr	v5.4s,v21.4s,#25
	eor	w9,w9,w13
	ushr	v17.4s,v22.4s,#25
	eor	w10,w10,w14
	sli	v1.4s,v20.4s,#7
	eor	w11,w11,w15
	sli	v5.4s,v21.4s,#7
	eor	w12,w12,w16
	sli	v17.4s,v22.4s,#7
	ror	w9,w9,#25
	// Rotate vector rows 1/2/3 by 4/8/12 bytes so the diagonals line up
	// as columns for the second half.
	ext	v2.16b,v2.16b,v2.16b,#8
	ror	w10,w10,#25
	ext	v6.16b,v6.16b,v6.16b,#8
	ror	w11,w11,#25
	ext	v18.16b,v18.16b,v18.16b,#8
	ror	w12,w12,#25
	ext	v3.16b,v3.16b,v3.16b,#12
	ext	v7.16b,v7.16b,v7.16b,#12
	ext	v19.16b,v19.16b,v19.16b,#12
	ext	v1.16b,v1.16b,v1.16b,#4
	ext	v5.16b,v5.16b,v5.16b,#4
	ext	v17.16b,v17.16b,v17.16b,#4
	// Second half: diagonal round.
	add	v0.4s,v0.4s,v1.4s
	add	w5,w5,w10
	add	v4.4s,v4.4s,v5.4s
	add	w6,w6,w11
	add	v16.4s,v16.4s,v17.4s
	add	w7,w7,w12
	eor	v3.16b,v3.16b,v0.16b
	add	w8,w8,w9
	eor	v7.16b,v7.16b,v4.16b
	eor	w21,w21,w5
	eor	v19.16b,v19.16b,v16.16b
	eor	w17,w17,w6
	rev32	v3.8h,v3.8h
	eor	w19,w19,w7
	rev32	v7.8h,v7.8h
	eor	w20,w20,w8
	rev32	v19.8h,v19.8h
	ror	w21,w21,#16
	add	v2.4s,v2.4s,v3.4s
	ror	w17,w17,#16
	add	v6.4s,v6.4s,v7.4s
	ror	w19,w19,#16
	add	v18.4s,v18.4s,v19.4s
	ror	w20,w20,#16
	eor	v20.16b,v1.16b,v2.16b
	add	w15,w15,w21
	eor	v21.16b,v5.16b,v6.16b
	add	w16,w16,w17
	eor	v22.16b,v17.16b,v18.16b
	add	w13,w13,w19
	ushr	v1.4s,v20.4s,#20
	add	w14,w14,w20
	ushr	v5.4s,v21.4s,#20
	eor	w10,w10,w15
	ushr	v17.4s,v22.4s,#20
	eor	w11,w11,w16
	sli	v1.4s,v20.4s,#12
	eor	w12,w12,w13
	sli	v5.4s,v21.4s,#12
	eor	w9,w9,w14
	sli	v17.4s,v22.4s,#12
	ror	w10,w10,#20
	add	v0.4s,v0.4s,v1.4s
	ror	w11,w11,#20
	add	v4.4s,v4.4s,v5.4s
	ror	w12,w12,#20
	add	v16.4s,v16.4s,v17.4s
	ror	w9,w9,#20
	eor	v20.16b,v3.16b,v0.16b
	add	w5,w5,w10
	eor	v21.16b,v7.16b,v4.16b
	add	w6,w6,w11
	eor	v22.16b,v19.16b,v16.16b
	add	w7,w7,w12
	ushr	v3.4s,v20.4s,#24
	add	w8,w8,w9
	ushr	v7.4s,v21.4s,#24
	eor	w21,w21,w5
	ushr	v19.4s,v22.4s,#24
	eor	w17,w17,w6
	sli	v3.4s,v20.4s,#8
	eor	w19,w19,w7
	sli	v7.4s,v21.4s,#8
	eor	w20,w20,w8
	sli	v19.4s,v22.4s,#8
	ror	w21,w21,#24
	add	v2.4s,v2.4s,v3.4s
	ror	w17,w17,#24
	add	v6.4s,v6.4s,v7.4s
	ror	w19,w19,#24
	add	v18.4s,v18.4s,v19.4s
	ror	w20,w20,#24
	eor	v20.16b,v1.16b,v2.16b
	add	w15,w15,w21
	eor	v21.16b,v5.16b,v6.16b
	add	w16,w16,w17
	eor	v22.16b,v17.16b,v18.16b
	add	w13,w13,w19
	ushr	v1.4s,v20.4s,#25
	add	w14,w14,w20
	ushr	v5.4s,v21.4s,#25
	eor	w10,w10,w15
	ushr	v17.4s,v22.4s,#25
	eor	w11,w11,w16
	sli	v1.4s,v20.4s,#7
	eor	w12,w12,w13
	sli	v5.4s,v21.4s,#7
	eor	w9,w9,w14
	sli	v17.4s,v22.4s,#7
	ror	w10,w10,#25
	// Rotate the vector rows back to column order.
	ext	v2.16b,v2.16b,v2.16b,#8
	ror	w11,w11,#25
	ext	v6.16b,v6.16b,v6.16b,#8
	ror	w12,w12,#25
	ext	v18.16b,v18.16b,v18.16b,#8
	ror	w9,w9,#25
	ext	v3.16b,v3.16b,v3.16b,#4
	ext	v7.16b,v7.16b,v7.16b,#4
	ext	v19.16b,v19.16b,v19.16b,#4
	ext	v1.16b,v1.16b,v1.16b,#12
	ext	v5.16b,v5.16b,v5.16b,#12
	ext	v17.16b,v17.16b,v17.16b,#12
	cbnz	x4,.Loop_neon

	add	w5,w5,w22		// accumulate key block
	add	v0.4s,v0.4s,v24.4s
	add	x6,x6,x22,lsr#32
	add	v4.4s,v4.4s,v24.4s
	add	w7,w7,w23
	add	v16.4s,v16.4s,v24.4s
	add	x8,x8,x23,lsr#32
	add	v2.4s,v2.4s,v26.4s
	add	w9,w9,w24
	add	v6.4s,v6.4s,v26.4s
	add	x10,x10,x24,lsr#32
	add	v18.4s,v18.4s,v26.4s
	add	w11,w11,w25
	add	v3.4s,v3.4s,v27.4s
	add	x12,x12,x25,lsr#32
	add	w13,w13,w26
	add	v7.4s,v7.4s,v28.4s
	add	x14,x14,x26,lsr#32
	add	w15,w15,w27
	add	v19.4s,v19.4s,v29.4s
	add	x16,x16,x27,lsr#32
	add	w17,w17,w28
	add	v1.4s,v1.4s,v25.4s
	add	x19,x19,x28,lsr#32
	add	w20,w20,w30
	add	v5.4s,v5.4s,v25.4s
	add	x21,x21,x30,lsr#32
	add	v17.4s,v17.4s,v25.4s

	// Flags from "subs x2,x2,#256": fewer than 256 bytes were left.
	b.lo	.Ltail_neon

	add	x5,x5,x6,lsl#32	// pack
	add	x7,x7,x8,lsl#32
	ldp	x6,x8,[x1,#0]		// load input
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	ldp	x10,x12,[x1,#16]
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	ldp	x14,x16,[x1,#32]
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
	ldp	x19,x21,[x1,#48]
	add	x1,x1,#64
#ifdef	__ARMEB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
	eor	x5,x5,x6
	eor	x7,x7,x8
	eor	x9,x9,x10
	eor	x11,x11,x12
	eor	x13,x13,x14
	eor	v0.16b,v0.16b,v20.16b
	eor	x15,x15,x16
	eor	v1.16b,v1.16b,v21.16b
	eor	x17,x17,x19
	eor	v2.16b,v2.16b,v22.16b
	eor	x20,x20,x21
	eor	v3.16b,v3.16b,v23.16b
	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64

	stp	x5,x7,[x0,#0]		// store output
	add	x28,x28,#4			// increment counter
	stp	x9,x11,[x0,#16]
	add	v27.4s,v27.4s,v31.4s		// += 4
	stp	x13,x15,[x0,#32]
	add	v28.4s,v28.4s,v31.4s
	stp	x17,x20,[x0,#48]
	add	v29.4s,v29.4s,v31.4s
	add	x0,x0,#64

	// v0-v3 are free once stored, so they are reused to hold the input
	// for block C.
	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
	ld1	{v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64

	eor	v4.16b,v4.16b,v20.16b
	eor	v5.16b,v5.16b,v21.16b
	eor	v6.16b,v6.16b,v22.16b
	eor	v7.16b,v7.16b,v23.16b
	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64

	eor	v16.16b,v16.16b,v0.16b
	eor	v17.16b,v17.16b,v1.16b
	eor	v18.16b,v18.16b,v2.16b
	eor	v19.16b,v19.16b,v3.16b
	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64

	// Still the flags from the subs: loop while bytes remain.
	b.hi	.Loop_outer_neon

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret

	// Fewer than 256 bytes remain and four key-stream blocks are ready
	// (scalar, A, B, C). Whole blocks are consumed in that order. Each
	// "cmp x2,#64" feeds two later branches: b.lo right after it (less
	// than a block left) and b.eq after the block is stored (exactly
	// one block was left, so the work is finished).
.Ltail_neon:
	add	x2,x2,#256		// undo the subs: x2 = bytes remaining
	cmp	x2,#64
	b.lo	.Less_than_64		// finish with the scalar block in ChaCha20_ctr32

	add	x5,x5,x6,lsl#32	// pack
	add	x7,x7,x8,lsl#32
	ldp	x6,x8,[x1,#0]		// load input
	add	x9,x9,x10,lsl#32
	add	x11,x11,x12,lsl#32
	ldp	x10,x12,[x1,#16]
	add	x13,x13,x14,lsl#32
	add	x15,x15,x16,lsl#32
	ldp	x14,x16,[x1,#32]
	add	x17,x17,x19,lsl#32
	add	x20,x20,x21,lsl#32
	ldp	x19,x21,[x1,#48]
	add	x1,x1,#64
#ifdef	__ARMEB__
	rev	x5,x5
	rev	x7,x7
	rev	x9,x9
	rev	x11,x11
	rev	x13,x13
	rev	x15,x15
	rev	x17,x17
	rev	x20,x20
#endif
	eor	x5,x5,x6
	eor	x7,x7,x8
	eor	x9,x9,x10
	eor	x11,x11,x12
	eor	x13,x13,x14
	eor	x15,x15,x16
	eor	x17,x17,x19
	eor	x20,x20,x21

	stp	x5,x7,[x0,#0]		// store output
	add	x28,x28,#4			// increment counter
	stp	x9,x11,[x0,#16]
	stp	x13,x15,[x0,#32]
	stp	x17,x20,[x0,#48]
	add	x0,x0,#64
	b.eq	.Ldone_neon
	sub	x2,x2,#64
	cmp	x2,#64
	b.lo	.Less_than_128

	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
	eor	v0.16b,v0.16b,v20.16b
	eor	v1.16b,v1.16b,v21.16b
	eor	v2.16b,v2.16b,v22.16b
	eor	v3.16b,v3.16b,v23.16b
	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64
	b.eq	.Ldone_neon
	sub	x2,x2,#64
	cmp	x2,#64
	b.lo	.Less_than_192

	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
	eor	v4.16b,v4.16b,v20.16b
	eor	v5.16b,v5.16b,v21.16b
	eor	v6.16b,v6.16b,v22.16b
	eor	v7.16b,v7.16b,v23.16b
	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
	b.eq	.Ldone_neon
	sub	x2,x2,#64

	// A partial block remains: spill the next unused key-stream block to
	// the stack scratch and XOR it byte by byte in .Last_neon.
	st1	{v16.16b,v17.16b,v18.16b,v19.16b},[sp]
	b	.Last_neon

.Less_than_128:
	st1	{v0.16b,v1.16b,v2.16b,v3.16b},[sp]
	b	.Last_neon
.Less_than_192:
	st1	{v4.16b,v5.16b,v6.16b,v7.16b},[sp]
	b	.Last_neon

.align	4
.Last_neon:
	// Same biased-pointer scheme as .Less_than_64:
	//   x1 = in + len, x4 = sp + len, x0 = out + len - 1, x2 = -len.
	sub	x0,x0,#1
	add	x1,x1,x2
	add	x0,x0,x2
	add	x4,sp,x2
	neg	x2,x2

.Loop_tail_neon:
	ldrb	w10,[x1,x2]
	ldrb	w11,[x4,x2]
	add	x2,x2,#1
	eor	w10,w10,w11
	strb	w10,[x0,x2]
	cbnz	x2,.Loop_tail_neon

	// Overwrite the spilled key stream with zeros before returning.
	stp	xzr,xzr,[sp,#0]
	stp	xzr,xzr,[sp,#16]
	stp	xzr,xzr,[sp,#32]
	stp	xzr,xzr,[sp,#48]

.Ldone_neon:
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#96
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	ChaCha20_neon,.-ChaCha20_neon
.type	ChaCha20_512_neon,%function
.align	5
ChaCha20_512_neon:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-96]!
828 add x29,sp,#0 829 830 adrp x5,.Lsigma 831 add x5,x5,:lo12:.Lsigma 832 stp x19,x20,[sp,#16] 833 stp x21,x22,[sp,#32] 834 stp x23,x24,[sp,#48] 835 stp x25,x26,[sp,#64] 836 stp x27,x28,[sp,#80] 837 838.L512_or_more_neon: 839 sub sp,sp,#128+64 840 841 ldp x22,x23,[x5] // load sigma 842 ld1 {v24.4s},[x5],#16 843 ldp x24,x25,[x3] // load key 844 ldp x26,x27,[x3,#16] 845 ld1 {v25.4s,v26.4s},[x3] 846 ldp x28,x30,[x4] // load counter 847 ld1 {v27.4s},[x4] 848 ld1 {v31.4s},[x5] 849#ifdef __ARMEB__ 850 rev64 v24.4s,v24.4s 851 ror x24,x24,#32 852 ror x25,x25,#32 853 ror x26,x26,#32 854 ror x27,x27,#32 855 ror x28,x28,#32 856 ror x30,x30,#32 857#endif 858 add v27.4s,v27.4s,v31.4s // += 1 859 stp q24,q25,[sp,#0] // off-load key block, invariant part 860 add v27.4s,v27.4s,v31.4s // not typo 861 str q26,[sp,#32] 862 add v28.4s,v27.4s,v31.4s 863 add v29.4s,v28.4s,v31.4s 864 add v30.4s,v29.4s,v31.4s 865 shl v31.4s,v31.4s,#2 // 1 -> 4 866 867 stp d8,d9,[sp,#128+0] // meet ABI requirements 868 stp d10,d11,[sp,#128+16] 869 stp d12,d13,[sp,#128+32] 870 stp d14,d15,[sp,#128+48] 871 872 sub x2,x2,#512 // not typo 873 874.Loop_outer_512_neon: 875 mov v0.16b,v24.16b 876 mov v4.16b,v24.16b 877 mov v8.16b,v24.16b 878 mov v12.16b,v24.16b 879 mov v16.16b,v24.16b 880 mov v20.16b,v24.16b 881 mov v1.16b,v25.16b 882 mov w5,w22 // unpack key block 883 mov v5.16b,v25.16b 884 lsr x6,x22,#32 885 mov v9.16b,v25.16b 886 mov w7,w23 887 mov v13.16b,v25.16b 888 lsr x8,x23,#32 889 mov v17.16b,v25.16b 890 mov w9,w24 891 mov v21.16b,v25.16b 892 lsr x10,x24,#32 893 mov v3.16b,v27.16b 894 mov w11,w25 895 mov v7.16b,v28.16b 896 lsr x12,x25,#32 897 mov v11.16b,v29.16b 898 mov w13,w26 899 mov v15.16b,v30.16b 900 lsr x14,x26,#32 901 mov v2.16b,v26.16b 902 mov w15,w27 903 mov v6.16b,v26.16b 904 lsr x16,x27,#32 905 add v19.4s,v3.4s,v31.4s // +4 906 mov w17,w28 907 add v23.4s,v7.4s,v31.4s // +4 908 lsr x19,x28,#32 909 mov v10.16b,v26.16b 910 mov w20,w30 911 mov v14.16b,v26.16b 912 lsr x21,x30,#32 913 mov 
v18.16b,v26.16b 914 stp q27,q28,[sp,#48] // off-load key block, variable part 915 mov v22.16b,v26.16b 916 str q29,[sp,#80] 917 918 mov x4,#5 919 subs x2,x2,#512 920.Loop_upper_neon: 921 sub x4,x4,#1 922 add v0.4s,v0.4s,v1.4s 923 add w5,w5,w9 924 add v4.4s,v4.4s,v5.4s 925 add w6,w6,w10 926 add v8.4s,v8.4s,v9.4s 927 add w7,w7,w11 928 add v12.4s,v12.4s,v13.4s 929 add w8,w8,w12 930 add v16.4s,v16.4s,v17.4s 931 eor w17,w17,w5 932 add v20.4s,v20.4s,v21.4s 933 eor w19,w19,w6 934 eor v3.16b,v3.16b,v0.16b 935 eor w20,w20,w7 936 eor v7.16b,v7.16b,v4.16b 937 eor w21,w21,w8 938 eor v11.16b,v11.16b,v8.16b 939 ror w17,w17,#16 940 eor v15.16b,v15.16b,v12.16b 941 ror w19,w19,#16 942 eor v19.16b,v19.16b,v16.16b 943 ror w20,w20,#16 944 eor v23.16b,v23.16b,v20.16b 945 ror w21,w21,#16 946 rev32 v3.8h,v3.8h 947 add w13,w13,w17 948 rev32 v7.8h,v7.8h 949 add w14,w14,w19 950 rev32 v11.8h,v11.8h 951 add w15,w15,w20 952 rev32 v15.8h,v15.8h 953 add w16,w16,w21 954 rev32 v19.8h,v19.8h 955 eor w9,w9,w13 956 rev32 v23.8h,v23.8h 957 eor w10,w10,w14 958 add v2.4s,v2.4s,v3.4s 959 eor w11,w11,w15 960 add v6.4s,v6.4s,v7.4s 961 eor w12,w12,w16 962 add v10.4s,v10.4s,v11.4s 963 ror w9,w9,#20 964 add v14.4s,v14.4s,v15.4s 965 ror w10,w10,#20 966 add v18.4s,v18.4s,v19.4s 967 ror w11,w11,#20 968 add v22.4s,v22.4s,v23.4s 969 ror w12,w12,#20 970 eor v24.16b,v1.16b,v2.16b 971 add w5,w5,w9 972 eor v25.16b,v5.16b,v6.16b 973 add w6,w6,w10 974 eor v26.16b,v9.16b,v10.16b 975 add w7,w7,w11 976 eor v27.16b,v13.16b,v14.16b 977 add w8,w8,w12 978 eor v28.16b,v17.16b,v18.16b 979 eor w17,w17,w5 980 eor v29.16b,v21.16b,v22.16b 981 eor w19,w19,w6 982 ushr v1.4s,v24.4s,#20 983 eor w20,w20,w7 984 ushr v5.4s,v25.4s,#20 985 eor w21,w21,w8 986 ushr v9.4s,v26.4s,#20 987 ror w17,w17,#24 988 ushr v13.4s,v27.4s,#20 989 ror w19,w19,#24 990 ushr v17.4s,v28.4s,#20 991 ror w20,w20,#24 992 ushr v21.4s,v29.4s,#20 993 ror w21,w21,#24 994 sli v1.4s,v24.4s,#12 995 add w13,w13,w17 996 sli v5.4s,v25.4s,#12 997 add w14,w14,w19 998 sli 
v9.4s,v26.4s,#12 999 add w15,w15,w20 1000 sli v13.4s,v27.4s,#12 1001 add w16,w16,w21 1002 sli v17.4s,v28.4s,#12 1003 eor w9,w9,w13 1004 sli v21.4s,v29.4s,#12 1005 eor w10,w10,w14 1006 add v0.4s,v0.4s,v1.4s 1007 eor w11,w11,w15 1008 add v4.4s,v4.4s,v5.4s 1009 eor w12,w12,w16 1010 add v8.4s,v8.4s,v9.4s 1011 ror w9,w9,#25 1012 add v12.4s,v12.4s,v13.4s 1013 ror w10,w10,#25 1014 add v16.4s,v16.4s,v17.4s 1015 ror w11,w11,#25 1016 add v20.4s,v20.4s,v21.4s 1017 ror w12,w12,#25 1018 eor v24.16b,v3.16b,v0.16b 1019 add w5,w5,w10 1020 eor v25.16b,v7.16b,v4.16b 1021 add w6,w6,w11 1022 eor v26.16b,v11.16b,v8.16b 1023 add w7,w7,w12 1024 eor v27.16b,v15.16b,v12.16b 1025 add w8,w8,w9 1026 eor v28.16b,v19.16b,v16.16b 1027 eor w21,w21,w5 1028 eor v29.16b,v23.16b,v20.16b 1029 eor w17,w17,w6 1030 ushr v3.4s,v24.4s,#24 1031 eor w19,w19,w7 1032 ushr v7.4s,v25.4s,#24 1033 eor w20,w20,w8 1034 ushr v11.4s,v26.4s,#24 1035 ror w21,w21,#16 1036 ushr v15.4s,v27.4s,#24 1037 ror w17,w17,#16 1038 ushr v19.4s,v28.4s,#24 1039 ror w19,w19,#16 1040 ushr v23.4s,v29.4s,#24 1041 ror w20,w20,#16 1042 sli v3.4s,v24.4s,#8 1043 add w15,w15,w21 1044 sli v7.4s,v25.4s,#8 1045 add w16,w16,w17 1046 sli v11.4s,v26.4s,#8 1047 add w13,w13,w19 1048 sli v15.4s,v27.4s,#8 1049 add w14,w14,w20 1050 sli v19.4s,v28.4s,#8 1051 eor w10,w10,w15 1052 sli v23.4s,v29.4s,#8 1053 eor w11,w11,w16 1054 add v2.4s,v2.4s,v3.4s 1055 eor w12,w12,w13 1056 add v6.4s,v6.4s,v7.4s 1057 eor w9,w9,w14 1058 add v10.4s,v10.4s,v11.4s 1059 ror w10,w10,#20 1060 add v14.4s,v14.4s,v15.4s 1061 ror w11,w11,#20 1062 add v18.4s,v18.4s,v19.4s 1063 ror w12,w12,#20 1064 add v22.4s,v22.4s,v23.4s 1065 ror w9,w9,#20 1066 eor v24.16b,v1.16b,v2.16b 1067 add w5,w5,w10 1068 eor v25.16b,v5.16b,v6.16b 1069 add w6,w6,w11 1070 eor v26.16b,v9.16b,v10.16b 1071 add w7,w7,w12 1072 eor v27.16b,v13.16b,v14.16b 1073 add w8,w8,w9 1074 eor v28.16b,v17.16b,v18.16b 1075 eor w21,w21,w5 1076 eor v29.16b,v21.16b,v22.16b 1077 eor w17,w17,w6 1078 ushr v1.4s,v24.4s,#25 1079 eor 
w19,w19,w7 1080 ushr v5.4s,v25.4s,#25 1081 eor w20,w20,w8 1082 ushr v9.4s,v26.4s,#25 1083 ror w21,w21,#24 1084 ushr v13.4s,v27.4s,#25 1085 ror w17,w17,#24 1086 ushr v17.4s,v28.4s,#25 1087 ror w19,w19,#24 1088 ushr v21.4s,v29.4s,#25 1089 ror w20,w20,#24 1090 sli v1.4s,v24.4s,#7 1091 add w15,w15,w21 1092 sli v5.4s,v25.4s,#7 1093 add w16,w16,w17 1094 sli v9.4s,v26.4s,#7 1095 add w13,w13,w19 1096 sli v13.4s,v27.4s,#7 1097 add w14,w14,w20 1098 sli v17.4s,v28.4s,#7 1099 eor w10,w10,w15 1100 sli v21.4s,v29.4s,#7 1101 eor w11,w11,w16 1102 ext v2.16b,v2.16b,v2.16b,#8 1103 eor w12,w12,w13 1104 ext v6.16b,v6.16b,v6.16b,#8 1105 eor w9,w9,w14 1106 ext v10.16b,v10.16b,v10.16b,#8 1107 ror w10,w10,#25 1108 ext v14.16b,v14.16b,v14.16b,#8 1109 ror w11,w11,#25 1110 ext v18.16b,v18.16b,v18.16b,#8 1111 ror w12,w12,#25 1112 ext v22.16b,v22.16b,v22.16b,#8 1113 ror w9,w9,#25 1114 ext v3.16b,v3.16b,v3.16b,#12 1115 ext v7.16b,v7.16b,v7.16b,#12 1116 ext v11.16b,v11.16b,v11.16b,#12 1117 ext v15.16b,v15.16b,v15.16b,#12 1118 ext v19.16b,v19.16b,v19.16b,#12 1119 ext v23.16b,v23.16b,v23.16b,#12 1120 ext v1.16b,v1.16b,v1.16b,#4 1121 ext v5.16b,v5.16b,v5.16b,#4 1122 ext v9.16b,v9.16b,v9.16b,#4 1123 ext v13.16b,v13.16b,v13.16b,#4 1124 ext v17.16b,v17.16b,v17.16b,#4 1125 ext v21.16b,v21.16b,v21.16b,#4 1126 add v0.4s,v0.4s,v1.4s 1127 add w5,w5,w9 1128 add v4.4s,v4.4s,v5.4s 1129 add w6,w6,w10 1130 add v8.4s,v8.4s,v9.4s 1131 add w7,w7,w11 1132 add v12.4s,v12.4s,v13.4s 1133 add w8,w8,w12 1134 add v16.4s,v16.4s,v17.4s 1135 eor w17,w17,w5 1136 add v20.4s,v20.4s,v21.4s 1137 eor w19,w19,w6 1138 eor v3.16b,v3.16b,v0.16b 1139 eor w20,w20,w7 1140 eor v7.16b,v7.16b,v4.16b 1141 eor w21,w21,w8 1142 eor v11.16b,v11.16b,v8.16b 1143 ror w17,w17,#16 1144 eor v15.16b,v15.16b,v12.16b 1145 ror w19,w19,#16 1146 eor v19.16b,v19.16b,v16.16b 1147 ror w20,w20,#16 1148 eor v23.16b,v23.16b,v20.16b 1149 ror w21,w21,#16 1150 rev32 v3.8h,v3.8h 1151 add w13,w13,w17 1152 rev32 v7.8h,v7.8h 1153 add w14,w14,w19 1154 rev32 
v11.8h,v11.8h 1155 add w15,w15,w20 1156 rev32 v15.8h,v15.8h 1157 add w16,w16,w21 1158 rev32 v19.8h,v19.8h 1159 eor w9,w9,w13 1160 rev32 v23.8h,v23.8h 1161 eor w10,w10,w14 1162 add v2.4s,v2.4s,v3.4s 1163 eor w11,w11,w15 1164 add v6.4s,v6.4s,v7.4s 1165 eor w12,w12,w16 1166 add v10.4s,v10.4s,v11.4s 1167 ror w9,w9,#20 1168 add v14.4s,v14.4s,v15.4s 1169 ror w10,w10,#20 1170 add v18.4s,v18.4s,v19.4s 1171 ror w11,w11,#20 1172 add v22.4s,v22.4s,v23.4s 1173 ror w12,w12,#20 1174 eor v24.16b,v1.16b,v2.16b 1175 add w5,w5,w9 1176 eor v25.16b,v5.16b,v6.16b 1177 add w6,w6,w10 1178 eor v26.16b,v9.16b,v10.16b 1179 add w7,w7,w11 1180 eor v27.16b,v13.16b,v14.16b 1181 add w8,w8,w12 1182 eor v28.16b,v17.16b,v18.16b 1183 eor w17,w17,w5 1184 eor v29.16b,v21.16b,v22.16b 1185 eor w19,w19,w6 1186 ushr v1.4s,v24.4s,#20 1187 eor w20,w20,w7 1188 ushr v5.4s,v25.4s,#20 1189 eor w21,w21,w8 1190 ushr v9.4s,v26.4s,#20 1191 ror w17,w17,#24 1192 ushr v13.4s,v27.4s,#20 1193 ror w19,w19,#24 1194 ushr v17.4s,v28.4s,#20 1195 ror w20,w20,#24 1196 ushr v21.4s,v29.4s,#20 1197 ror w21,w21,#24 1198 sli v1.4s,v24.4s,#12 1199 add w13,w13,w17 1200 sli v5.4s,v25.4s,#12 1201 add w14,w14,w19 1202 sli v9.4s,v26.4s,#12 1203 add w15,w15,w20 1204 sli v13.4s,v27.4s,#12 1205 add w16,w16,w21 1206 sli v17.4s,v28.4s,#12 1207 eor w9,w9,w13 1208 sli v21.4s,v29.4s,#12 1209 eor w10,w10,w14 1210 add v0.4s,v0.4s,v1.4s 1211 eor w11,w11,w15 1212 add v4.4s,v4.4s,v5.4s 1213 eor w12,w12,w16 1214 add v8.4s,v8.4s,v9.4s 1215 ror w9,w9,#25 1216 add v12.4s,v12.4s,v13.4s 1217 ror w10,w10,#25 1218 add v16.4s,v16.4s,v17.4s 1219 ror w11,w11,#25 1220 add v20.4s,v20.4s,v21.4s 1221 ror w12,w12,#25 1222 eor v24.16b,v3.16b,v0.16b 1223 add w5,w5,w10 1224 eor v25.16b,v7.16b,v4.16b 1225 add w6,w6,w11 1226 eor v26.16b,v11.16b,v8.16b 1227 add w7,w7,w12 1228 eor v27.16b,v15.16b,v12.16b 1229 add w8,w8,w9 1230 eor v28.16b,v19.16b,v16.16b 1231 eor w21,w21,w5 1232 eor v29.16b,v23.16b,v20.16b 1233 eor w17,w17,w6 1234 ushr v3.4s,v24.4s,#24 1235 eor w19,w19,w7 
1236 ushr v7.4s,v25.4s,#24 1237 eor w20,w20,w8 1238 ushr v11.4s,v26.4s,#24 1239 ror w21,w21,#16 1240 ushr v15.4s,v27.4s,#24 1241 ror w17,w17,#16 1242 ushr v19.4s,v28.4s,#24 1243 ror w19,w19,#16 1244 ushr v23.4s,v29.4s,#24 1245 ror w20,w20,#16 1246 sli v3.4s,v24.4s,#8 1247 add w15,w15,w21 1248 sli v7.4s,v25.4s,#8 1249 add w16,w16,w17 1250 sli v11.4s,v26.4s,#8 1251 add w13,w13,w19 1252 sli v15.4s,v27.4s,#8 1253 add w14,w14,w20 1254 sli v19.4s,v28.4s,#8 1255 eor w10,w10,w15 1256 sli v23.4s,v29.4s,#8 1257 eor w11,w11,w16 1258 add v2.4s,v2.4s,v3.4s 1259 eor w12,w12,w13 1260 add v6.4s,v6.4s,v7.4s 1261 eor w9,w9,w14 1262 add v10.4s,v10.4s,v11.4s 1263 ror w10,w10,#20 1264 add v14.4s,v14.4s,v15.4s 1265 ror w11,w11,#20 1266 add v18.4s,v18.4s,v19.4s 1267 ror w12,w12,#20 1268 add v22.4s,v22.4s,v23.4s 1269 ror w9,w9,#20 1270 eor v24.16b,v1.16b,v2.16b 1271 add w5,w5,w10 1272 eor v25.16b,v5.16b,v6.16b 1273 add w6,w6,w11 1274 eor v26.16b,v9.16b,v10.16b 1275 add w7,w7,w12 1276 eor v27.16b,v13.16b,v14.16b 1277 add w8,w8,w9 1278 eor v28.16b,v17.16b,v18.16b 1279 eor w21,w21,w5 1280 eor v29.16b,v21.16b,v22.16b 1281 eor w17,w17,w6 1282 ushr v1.4s,v24.4s,#25 1283 eor w19,w19,w7 1284 ushr v5.4s,v25.4s,#25 1285 eor w20,w20,w8 1286 ushr v9.4s,v26.4s,#25 1287 ror w21,w21,#24 1288 ushr v13.4s,v27.4s,#25 1289 ror w17,w17,#24 1290 ushr v17.4s,v28.4s,#25 1291 ror w19,w19,#24 1292 ushr v21.4s,v29.4s,#25 1293 ror w20,w20,#24 1294 sli v1.4s,v24.4s,#7 1295 add w15,w15,w21 1296 sli v5.4s,v25.4s,#7 1297 add w16,w16,w17 1298 sli v9.4s,v26.4s,#7 1299 add w13,w13,w19 1300 sli v13.4s,v27.4s,#7 1301 add w14,w14,w20 1302 sli v17.4s,v28.4s,#7 1303 eor w10,w10,w15 1304 sli v21.4s,v29.4s,#7 1305 eor w11,w11,w16 1306 ext v2.16b,v2.16b,v2.16b,#8 1307 eor w12,w12,w13 1308 ext v6.16b,v6.16b,v6.16b,#8 1309 eor w9,w9,w14 1310 ext v10.16b,v10.16b,v10.16b,#8 1311 ror w10,w10,#25 1312 ext v14.16b,v14.16b,v14.16b,#8 1313 ror w11,w11,#25 1314 ext v18.16b,v18.16b,v18.16b,#8 1315 ror w12,w12,#25 1316 ext 
v22.16b,v22.16b,v22.16b,#8 1317 ror w9,w9,#25 1318 ext v3.16b,v3.16b,v3.16b,#4 1319 ext v7.16b,v7.16b,v7.16b,#4 1320 ext v11.16b,v11.16b,v11.16b,#4 1321 ext v15.16b,v15.16b,v15.16b,#4 1322 ext v19.16b,v19.16b,v19.16b,#4 1323 ext v23.16b,v23.16b,v23.16b,#4 1324 ext v1.16b,v1.16b,v1.16b,#12 1325 ext v5.16b,v5.16b,v5.16b,#12 1326 ext v9.16b,v9.16b,v9.16b,#12 1327 ext v13.16b,v13.16b,v13.16b,#12 1328 ext v17.16b,v17.16b,v17.16b,#12 1329 ext v21.16b,v21.16b,v21.16b,#12 1330 cbnz x4,.Loop_upper_neon 1331 1332 add w5,w5,w22 // accumulate key block 1333 add x6,x6,x22,lsr#32 1334 add w7,w7,w23 1335 add x8,x8,x23,lsr#32 1336 add w9,w9,w24 1337 add x10,x10,x24,lsr#32 1338 add w11,w11,w25 1339 add x12,x12,x25,lsr#32 1340 add w13,w13,w26 1341 add x14,x14,x26,lsr#32 1342 add w15,w15,w27 1343 add x16,x16,x27,lsr#32 1344 add w17,w17,w28 1345 add x19,x19,x28,lsr#32 1346 add w20,w20,w30 1347 add x21,x21,x30,lsr#32 1348 1349 add x5,x5,x6,lsl#32 // pack 1350 add x7,x7,x8,lsl#32 1351 ldp x6,x8,[x1,#0] // load input 1352 add x9,x9,x10,lsl#32 1353 add x11,x11,x12,lsl#32 1354 ldp x10,x12,[x1,#16] 1355 add x13,x13,x14,lsl#32 1356 add x15,x15,x16,lsl#32 1357 ldp x14,x16,[x1,#32] 1358 add x17,x17,x19,lsl#32 1359 add x20,x20,x21,lsl#32 1360 ldp x19,x21,[x1,#48] 1361 add x1,x1,#64 1362#ifdef __ARMEB__ 1363 rev x5,x5 1364 rev x7,x7 1365 rev x9,x9 1366 rev x11,x11 1367 rev x13,x13 1368 rev x15,x15 1369 rev x17,x17 1370 rev x20,x20 1371#endif 1372 eor x5,x5,x6 1373 eor x7,x7,x8 1374 eor x9,x9,x10 1375 eor x11,x11,x12 1376 eor x13,x13,x14 1377 eor x15,x15,x16 1378 eor x17,x17,x19 1379 eor x20,x20,x21 1380 1381 stp x5,x7,[x0,#0] // store output 1382 add x28,x28,#1 // increment counter 1383 mov w5,w22 // unpack key block 1384 lsr x6,x22,#32 1385 stp x9,x11,[x0,#16] 1386 mov w7,w23 1387 lsr x8,x23,#32 1388 stp x13,x15,[x0,#32] 1389 mov w9,w24 1390 lsr x10,x24,#32 1391 stp x17,x20,[x0,#48] 1392 add x0,x0,#64 1393 mov w11,w25 1394 lsr x12,x25,#32 1395 mov w13,w26 1396 lsr x14,x26,#32 1397 mov w15,w27 
1398 lsr x16,x27,#32 1399 mov w17,w28 1400 lsr x19,x28,#32 1401 mov w20,w30 1402 lsr x21,x30,#32 1403 1404 mov x4,#5 1405.Loop_lower_neon: 1406 sub x4,x4,#1 1407 add v0.4s,v0.4s,v1.4s 1408 add w5,w5,w9 1409 add v4.4s,v4.4s,v5.4s 1410 add w6,w6,w10 1411 add v8.4s,v8.4s,v9.4s 1412 add w7,w7,w11 1413 add v12.4s,v12.4s,v13.4s 1414 add w8,w8,w12 1415 add v16.4s,v16.4s,v17.4s 1416 eor w17,w17,w5 1417 add v20.4s,v20.4s,v21.4s 1418 eor w19,w19,w6 1419 eor v3.16b,v3.16b,v0.16b 1420 eor w20,w20,w7 1421 eor v7.16b,v7.16b,v4.16b 1422 eor w21,w21,w8 1423 eor v11.16b,v11.16b,v8.16b 1424 ror w17,w17,#16 1425 eor v15.16b,v15.16b,v12.16b 1426 ror w19,w19,#16 1427 eor v19.16b,v19.16b,v16.16b 1428 ror w20,w20,#16 1429 eor v23.16b,v23.16b,v20.16b 1430 ror w21,w21,#16 1431 rev32 v3.8h,v3.8h 1432 add w13,w13,w17 1433 rev32 v7.8h,v7.8h 1434 add w14,w14,w19 1435 rev32 v11.8h,v11.8h 1436 add w15,w15,w20 1437 rev32 v15.8h,v15.8h 1438 add w16,w16,w21 1439 rev32 v19.8h,v19.8h 1440 eor w9,w9,w13 1441 rev32 v23.8h,v23.8h 1442 eor w10,w10,w14 1443 add v2.4s,v2.4s,v3.4s 1444 eor w11,w11,w15 1445 add v6.4s,v6.4s,v7.4s 1446 eor w12,w12,w16 1447 add v10.4s,v10.4s,v11.4s 1448 ror w9,w9,#20 1449 add v14.4s,v14.4s,v15.4s 1450 ror w10,w10,#20 1451 add v18.4s,v18.4s,v19.4s 1452 ror w11,w11,#20 1453 add v22.4s,v22.4s,v23.4s 1454 ror w12,w12,#20 1455 eor v24.16b,v1.16b,v2.16b 1456 add w5,w5,w9 1457 eor v25.16b,v5.16b,v6.16b 1458 add w6,w6,w10 1459 eor v26.16b,v9.16b,v10.16b 1460 add w7,w7,w11 1461 eor v27.16b,v13.16b,v14.16b 1462 add w8,w8,w12 1463 eor v28.16b,v17.16b,v18.16b 1464 eor w17,w17,w5 1465 eor v29.16b,v21.16b,v22.16b 1466 eor w19,w19,w6 1467 ushr v1.4s,v24.4s,#20 1468 eor w20,w20,w7 1469 ushr v5.4s,v25.4s,#20 1470 eor w21,w21,w8 1471 ushr v9.4s,v26.4s,#20 1472 ror w17,w17,#24 1473 ushr v13.4s,v27.4s,#20 1474 ror w19,w19,#24 1475 ushr v17.4s,v28.4s,#20 1476 ror w20,w20,#24 1477 ushr v21.4s,v29.4s,#20 1478 ror w21,w21,#24 1479 sli v1.4s,v24.4s,#12 1480 add w13,w13,w17 1481 sli v5.4s,v25.4s,#12 
1482 add w14,w14,w19 1483 sli v9.4s,v26.4s,#12 1484 add w15,w15,w20 1485 sli v13.4s,v27.4s,#12 1486 add w16,w16,w21 1487 sli v17.4s,v28.4s,#12 1488 eor w9,w9,w13 1489 sli v21.4s,v29.4s,#12 1490 eor w10,w10,w14 1491 add v0.4s,v0.4s,v1.4s 1492 eor w11,w11,w15 1493 add v4.4s,v4.4s,v5.4s 1494 eor w12,w12,w16 1495 add v8.4s,v8.4s,v9.4s 1496 ror w9,w9,#25 1497 add v12.4s,v12.4s,v13.4s 1498 ror w10,w10,#25 1499 add v16.4s,v16.4s,v17.4s 1500 ror w11,w11,#25 1501 add v20.4s,v20.4s,v21.4s 1502 ror w12,w12,#25 1503 eor v24.16b,v3.16b,v0.16b 1504 add w5,w5,w10 1505 eor v25.16b,v7.16b,v4.16b 1506 add w6,w6,w11 1507 eor v26.16b,v11.16b,v8.16b 1508 add w7,w7,w12 1509 eor v27.16b,v15.16b,v12.16b 1510 add w8,w8,w9 1511 eor v28.16b,v19.16b,v16.16b 1512 eor w21,w21,w5 1513 eor v29.16b,v23.16b,v20.16b 1514 eor w17,w17,w6 1515 ushr v3.4s,v24.4s,#24 1516 eor w19,w19,w7 1517 ushr v7.4s,v25.4s,#24 1518 eor w20,w20,w8 1519 ushr v11.4s,v26.4s,#24 1520 ror w21,w21,#16 1521 ushr v15.4s,v27.4s,#24 1522 ror w17,w17,#16 1523 ushr v19.4s,v28.4s,#24 1524 ror w19,w19,#16 1525 ushr v23.4s,v29.4s,#24 1526 ror w20,w20,#16 1527 sli v3.4s,v24.4s,#8 1528 add w15,w15,w21 1529 sli v7.4s,v25.4s,#8 1530 add w16,w16,w17 1531 sli v11.4s,v26.4s,#8 1532 add w13,w13,w19 1533 sli v15.4s,v27.4s,#8 1534 add w14,w14,w20 1535 sli v19.4s,v28.4s,#8 1536 eor w10,w10,w15 1537 sli v23.4s,v29.4s,#8 1538 eor w11,w11,w16 1539 add v2.4s,v2.4s,v3.4s 1540 eor w12,w12,w13 1541 add v6.4s,v6.4s,v7.4s 1542 eor w9,w9,w14 1543 add v10.4s,v10.4s,v11.4s 1544 ror w10,w10,#20 1545 add v14.4s,v14.4s,v15.4s 1546 ror w11,w11,#20 1547 add v18.4s,v18.4s,v19.4s 1548 ror w12,w12,#20 1549 add v22.4s,v22.4s,v23.4s 1550 ror w9,w9,#20 1551 eor v24.16b,v1.16b,v2.16b 1552 add w5,w5,w10 1553 eor v25.16b,v5.16b,v6.16b 1554 add w6,w6,w11 1555 eor v26.16b,v9.16b,v10.16b 1556 add w7,w7,w12 1557 eor v27.16b,v13.16b,v14.16b 1558 add w8,w8,w9 1559 eor v28.16b,v17.16b,v18.16b 1560 eor w21,w21,w5 1561 eor v29.16b,v21.16b,v22.16b 1562 eor w17,w17,w6 1563 ushr 
v1.4s,v24.4s,#25 1564 eor w19,w19,w7 1565 ushr v5.4s,v25.4s,#25 1566 eor w20,w20,w8 1567 ushr v9.4s,v26.4s,#25 1568 ror w21,w21,#24 1569 ushr v13.4s,v27.4s,#25 1570 ror w17,w17,#24 1571 ushr v17.4s,v28.4s,#25 1572 ror w19,w19,#24 1573 ushr v21.4s,v29.4s,#25 1574 ror w20,w20,#24 1575 sli v1.4s,v24.4s,#7 1576 add w15,w15,w21 1577 sli v5.4s,v25.4s,#7 1578 add w16,w16,w17 1579 sli v9.4s,v26.4s,#7 1580 add w13,w13,w19 1581 sli v13.4s,v27.4s,#7 1582 add w14,w14,w20 1583 sli v17.4s,v28.4s,#7 1584 eor w10,w10,w15 1585 sli v21.4s,v29.4s,#7 1586 eor w11,w11,w16 1587 ext v2.16b,v2.16b,v2.16b,#8 1588 eor w12,w12,w13 1589 ext v6.16b,v6.16b,v6.16b,#8 1590 eor w9,w9,w14 1591 ext v10.16b,v10.16b,v10.16b,#8 1592 ror w10,w10,#25 1593 ext v14.16b,v14.16b,v14.16b,#8 1594 ror w11,w11,#25 1595 ext v18.16b,v18.16b,v18.16b,#8 1596 ror w12,w12,#25 1597 ext v22.16b,v22.16b,v22.16b,#8 1598 ror w9,w9,#25 1599 ext v3.16b,v3.16b,v3.16b,#12 1600 ext v7.16b,v7.16b,v7.16b,#12 1601 ext v11.16b,v11.16b,v11.16b,#12 1602 ext v15.16b,v15.16b,v15.16b,#12 1603 ext v19.16b,v19.16b,v19.16b,#12 1604 ext v23.16b,v23.16b,v23.16b,#12 1605 ext v1.16b,v1.16b,v1.16b,#4 1606 ext v5.16b,v5.16b,v5.16b,#4 1607 ext v9.16b,v9.16b,v9.16b,#4 1608 ext v13.16b,v13.16b,v13.16b,#4 1609 ext v17.16b,v17.16b,v17.16b,#4 1610 ext v21.16b,v21.16b,v21.16b,#4 1611 add v0.4s,v0.4s,v1.4s 1612 add w5,w5,w9 1613 add v4.4s,v4.4s,v5.4s 1614 add w6,w6,w10 1615 add v8.4s,v8.4s,v9.4s 1616 add w7,w7,w11 1617 add v12.4s,v12.4s,v13.4s 1618 add w8,w8,w12 1619 add v16.4s,v16.4s,v17.4s 1620 eor w17,w17,w5 1621 add v20.4s,v20.4s,v21.4s 1622 eor w19,w19,w6 1623 eor v3.16b,v3.16b,v0.16b 1624 eor w20,w20,w7 1625 eor v7.16b,v7.16b,v4.16b 1626 eor w21,w21,w8 1627 eor v11.16b,v11.16b,v8.16b 1628 ror w17,w17,#16 1629 eor v15.16b,v15.16b,v12.16b 1630 ror w19,w19,#16 1631 eor v19.16b,v19.16b,v16.16b 1632 ror w20,w20,#16 1633 eor v23.16b,v23.16b,v20.16b 1634 ror w21,w21,#16 1635 rev32 v3.8h,v3.8h 1636 add w13,w13,w17 1637 rev32 v7.8h,v7.8h 1638 add 
w14,w14,w19 1639 rev32 v11.8h,v11.8h 1640 add w15,w15,w20 1641 rev32 v15.8h,v15.8h 1642 add w16,w16,w21 1643 rev32 v19.8h,v19.8h 1644 eor w9,w9,w13 1645 rev32 v23.8h,v23.8h 1646 eor w10,w10,w14 1647 add v2.4s,v2.4s,v3.4s 1648 eor w11,w11,w15 1649 add v6.4s,v6.4s,v7.4s 1650 eor w12,w12,w16 1651 add v10.4s,v10.4s,v11.4s 1652 ror w9,w9,#20 1653 add v14.4s,v14.4s,v15.4s 1654 ror w10,w10,#20 1655 add v18.4s,v18.4s,v19.4s 1656 ror w11,w11,#20 1657 add v22.4s,v22.4s,v23.4s 1658 ror w12,w12,#20 1659 eor v24.16b,v1.16b,v2.16b 1660 add w5,w5,w9 1661 eor v25.16b,v5.16b,v6.16b 1662 add w6,w6,w10 1663 eor v26.16b,v9.16b,v10.16b 1664 add w7,w7,w11 1665 eor v27.16b,v13.16b,v14.16b 1666 add w8,w8,w12 1667 eor v28.16b,v17.16b,v18.16b 1668 eor w17,w17,w5 1669 eor v29.16b,v21.16b,v22.16b 1670 eor w19,w19,w6 1671 ushr v1.4s,v24.4s,#20 1672 eor w20,w20,w7 1673 ushr v5.4s,v25.4s,#20 1674 eor w21,w21,w8 1675 ushr v9.4s,v26.4s,#20 1676 ror w17,w17,#24 1677 ushr v13.4s,v27.4s,#20 1678 ror w19,w19,#24 1679 ushr v17.4s,v28.4s,#20 1680 ror w20,w20,#24 1681 ushr v21.4s,v29.4s,#20 1682 ror w21,w21,#24 1683 sli v1.4s,v24.4s,#12 1684 add w13,w13,w17 1685 sli v5.4s,v25.4s,#12 1686 add w14,w14,w19 1687 sli v9.4s,v26.4s,#12 1688 add w15,w15,w20 1689 sli v13.4s,v27.4s,#12 1690 add w16,w16,w21 1691 sli v17.4s,v28.4s,#12 1692 eor w9,w9,w13 1693 sli v21.4s,v29.4s,#12 1694 eor w10,w10,w14 1695 add v0.4s,v0.4s,v1.4s 1696 eor w11,w11,w15 1697 add v4.4s,v4.4s,v5.4s 1698 eor w12,w12,w16 1699 add v8.4s,v8.4s,v9.4s 1700 ror w9,w9,#25 1701 add v12.4s,v12.4s,v13.4s 1702 ror w10,w10,#25 1703 add v16.4s,v16.4s,v17.4s 1704 ror w11,w11,#25 1705 add v20.4s,v20.4s,v21.4s 1706 ror w12,w12,#25 1707 eor v24.16b,v3.16b,v0.16b 1708 add w5,w5,w10 1709 eor v25.16b,v7.16b,v4.16b 1710 add w6,w6,w11 1711 eor v26.16b,v11.16b,v8.16b 1712 add w7,w7,w12 1713 eor v27.16b,v15.16b,v12.16b 1714 add w8,w8,w9 1715 eor v28.16b,v19.16b,v16.16b 1716 eor w21,w21,w5 1717 eor v29.16b,v23.16b,v20.16b 1718 eor w17,w17,w6 1719 ushr 
v3.4s,v24.4s,#24 1720 eor w19,w19,w7 1721 ushr v7.4s,v25.4s,#24 1722 eor w20,w20,w8 1723 ushr v11.4s,v26.4s,#24 1724 ror w21,w21,#16 1725 ushr v15.4s,v27.4s,#24 1726 ror w17,w17,#16 1727 ushr v19.4s,v28.4s,#24 1728 ror w19,w19,#16 1729 ushr v23.4s,v29.4s,#24 1730 ror w20,w20,#16 1731 sli v3.4s,v24.4s,#8 1732 add w15,w15,w21 1733 sli v7.4s,v25.4s,#8 1734 add w16,w16,w17 1735 sli v11.4s,v26.4s,#8 1736 add w13,w13,w19 1737 sli v15.4s,v27.4s,#8 1738 add w14,w14,w20 1739 sli v19.4s,v28.4s,#8 1740 eor w10,w10,w15 1741 sli v23.4s,v29.4s,#8 1742 eor w11,w11,w16 1743 add v2.4s,v2.4s,v3.4s 1744 eor w12,w12,w13 1745 add v6.4s,v6.4s,v7.4s 1746 eor w9,w9,w14 1747 add v10.4s,v10.4s,v11.4s 1748 ror w10,w10,#20 1749 add v14.4s,v14.4s,v15.4s 1750 ror w11,w11,#20 1751 add v18.4s,v18.4s,v19.4s 1752 ror w12,w12,#20 1753 add v22.4s,v22.4s,v23.4s 1754 ror w9,w9,#20 1755 eor v24.16b,v1.16b,v2.16b 1756 add w5,w5,w10 1757 eor v25.16b,v5.16b,v6.16b 1758 add w6,w6,w11 1759 eor v26.16b,v9.16b,v10.16b 1760 add w7,w7,w12 1761 eor v27.16b,v13.16b,v14.16b 1762 add w8,w8,w9 1763 eor v28.16b,v17.16b,v18.16b 1764 eor w21,w21,w5 1765 eor v29.16b,v21.16b,v22.16b 1766 eor w17,w17,w6 1767 ushr v1.4s,v24.4s,#25 1768 eor w19,w19,w7 1769 ushr v5.4s,v25.4s,#25 1770 eor w20,w20,w8 1771 ushr v9.4s,v26.4s,#25 1772 ror w21,w21,#24 1773 ushr v13.4s,v27.4s,#25 1774 ror w17,w17,#24 1775 ushr v17.4s,v28.4s,#25 1776 ror w19,w19,#24 1777 ushr v21.4s,v29.4s,#25 1778 ror w20,w20,#24 1779 sli v1.4s,v24.4s,#7 1780 add w15,w15,w21 1781 sli v5.4s,v25.4s,#7 1782 add w16,w16,w17 1783 sli v9.4s,v26.4s,#7 1784 add w13,w13,w19 1785 sli v13.4s,v27.4s,#7 1786 add w14,w14,w20 1787 sli v17.4s,v28.4s,#7 1788 eor w10,w10,w15 1789 sli v21.4s,v29.4s,#7 1790 eor w11,w11,w16 1791 ext v2.16b,v2.16b,v2.16b,#8 1792 eor w12,w12,w13 1793 ext v6.16b,v6.16b,v6.16b,#8 1794 eor w9,w9,w14 1795 ext v10.16b,v10.16b,v10.16b,#8 1796 ror w10,w10,#25 1797 ext v14.16b,v14.16b,v14.16b,#8 1798 ror w11,w11,#25 1799 ext v18.16b,v18.16b,v18.16b,#8 1800 ror 
w12,w12,#25 1801 ext v22.16b,v22.16b,v22.16b,#8 1802 ror w9,w9,#25 1803 ext v3.16b,v3.16b,v3.16b,#4 1804 ext v7.16b,v7.16b,v7.16b,#4 1805 ext v11.16b,v11.16b,v11.16b,#4 1806 ext v15.16b,v15.16b,v15.16b,#4 1807 ext v19.16b,v19.16b,v19.16b,#4 1808 ext v23.16b,v23.16b,v23.16b,#4 1809 ext v1.16b,v1.16b,v1.16b,#12 1810 ext v5.16b,v5.16b,v5.16b,#12 1811 ext v9.16b,v9.16b,v9.16b,#12 1812 ext v13.16b,v13.16b,v13.16b,#12 1813 ext v17.16b,v17.16b,v17.16b,#12 1814 ext v21.16b,v21.16b,v21.16b,#12 1815 cbnz x4,.Loop_lower_neon 1816 1817 add w5,w5,w22 // accumulate key block 1818 ldp q24,q25,[sp,#0] 1819 add x6,x6,x22,lsr#32 1820 ldp q26,q27,[sp,#32] 1821 add w7,w7,w23 1822 ldp q28,q29,[sp,#64] 1823 add x8,x8,x23,lsr#32 1824 add v0.4s,v0.4s,v24.4s 1825 add w9,w9,w24 1826 add v4.4s,v4.4s,v24.4s 1827 add x10,x10,x24,lsr#32 1828 add v8.4s,v8.4s,v24.4s 1829 add w11,w11,w25 1830 add v12.4s,v12.4s,v24.4s 1831 add x12,x12,x25,lsr#32 1832 add v16.4s,v16.4s,v24.4s 1833 add w13,w13,w26 1834 add v20.4s,v20.4s,v24.4s 1835 add x14,x14,x26,lsr#32 1836 add v2.4s,v2.4s,v26.4s 1837 add w15,w15,w27 1838 add v6.4s,v6.4s,v26.4s 1839 add x16,x16,x27,lsr#32 1840 add v10.4s,v10.4s,v26.4s 1841 add w17,w17,w28 1842 add v14.4s,v14.4s,v26.4s 1843 add x19,x19,x28,lsr#32 1844 add v18.4s,v18.4s,v26.4s 1845 add w20,w20,w30 1846 add v22.4s,v22.4s,v26.4s 1847 add x21,x21,x30,lsr#32 1848 add v19.4s,v19.4s,v31.4s // +4 1849 add x5,x5,x6,lsl#32 // pack 1850 add v23.4s,v23.4s,v31.4s // +4 1851 add x7,x7,x8,lsl#32 1852 add v3.4s,v3.4s,v27.4s 1853 ldp x6,x8,[x1,#0] // load input 1854 add v7.4s,v7.4s,v28.4s 1855 add x9,x9,x10,lsl#32 1856 add v11.4s,v11.4s,v29.4s 1857 add x11,x11,x12,lsl#32 1858 add v15.4s,v15.4s,v30.4s 1859 ldp x10,x12,[x1,#16] 1860 add v19.4s,v19.4s,v27.4s 1861 add x13,x13,x14,lsl#32 1862 add v23.4s,v23.4s,v28.4s 1863 add x15,x15,x16,lsl#32 1864 add v1.4s,v1.4s,v25.4s 1865 ldp x14,x16,[x1,#32] 1866 add v5.4s,v5.4s,v25.4s 1867 add x17,x17,x19,lsl#32 1868 add v9.4s,v9.4s,v25.4s 1869 add 
x20,x20,x21,lsl#32 1870 add v13.4s,v13.4s,v25.4s 1871 ldp x19,x21,[x1,#48] 1872 add v17.4s,v17.4s,v25.4s 1873 add x1,x1,#64 1874 add v21.4s,v21.4s,v25.4s 1875 1876#ifdef __ARMEB__ 1877 rev x5,x5 1878 rev x7,x7 1879 rev x9,x9 1880 rev x11,x11 1881 rev x13,x13 1882 rev x15,x15 1883 rev x17,x17 1884 rev x20,x20 1885#endif 1886 ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 1887 eor x5,x5,x6 1888 eor x7,x7,x8 1889 eor x9,x9,x10 1890 eor x11,x11,x12 1891 eor x13,x13,x14 1892 eor v0.16b,v0.16b,v24.16b 1893 eor x15,x15,x16 1894 eor v1.16b,v1.16b,v25.16b 1895 eor x17,x17,x19 1896 eor v2.16b,v2.16b,v26.16b 1897 eor x20,x20,x21 1898 eor v3.16b,v3.16b,v27.16b 1899 ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 1900 1901 stp x5,x7,[x0,#0] // store output 1902 add x28,x28,#7 // increment counter 1903 stp x9,x11,[x0,#16] 1904 stp x13,x15,[x0,#32] 1905 stp x17,x20,[x0,#48] 1906 add x0,x0,#64 1907 st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 1908 1909 ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 1910 eor v4.16b,v4.16b,v24.16b 1911 eor v5.16b,v5.16b,v25.16b 1912 eor v6.16b,v6.16b,v26.16b 1913 eor v7.16b,v7.16b,v27.16b 1914 st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 1915 1916 ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 1917 eor v8.16b,v8.16b,v0.16b 1918 ldp q24,q25,[sp,#0] 1919 eor v9.16b,v9.16b,v1.16b 1920 ldp q26,q27,[sp,#32] 1921 eor v10.16b,v10.16b,v2.16b 1922 eor v11.16b,v11.16b,v3.16b 1923 st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64 1924 1925 ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64 1926 eor v12.16b,v12.16b,v4.16b 1927 eor v13.16b,v13.16b,v5.16b 1928 eor v14.16b,v14.16b,v6.16b 1929 eor v15.16b,v15.16b,v7.16b 1930 st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64 1931 1932 ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64 1933 eor v16.16b,v16.16b,v8.16b 1934 eor v17.16b,v17.16b,v9.16b 1935 eor v18.16b,v18.16b,v10.16b 1936 eor v19.16b,v19.16b,v11.16b 1937 st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 1938 1939 shl v0.4s,v31.4s,#1 // 4 -> 8 1940 eor v20.16b,v20.16b,v12.16b 1941 eor 
v21.16b,v21.16b,v13.16b 1942 eor v22.16b,v22.16b,v14.16b 1943 eor v23.16b,v23.16b,v15.16b 1944 st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64 1945 1946 add v27.4s,v27.4s,v0.4s // += 8 1947 add v28.4s,v28.4s,v0.4s 1948 add v29.4s,v29.4s,v0.4s 1949 add v30.4s,v30.4s,v0.4s 1950 1951 b.hs .Loop_outer_512_neon 1952 1953 adds x2,x2,#512 1954 ushr v0.4s,v31.4s,#2 // 4 -> 1 1955 1956 ldp d8,d9,[sp,#128+0] // meet ABI requirements 1957 ldp d10,d11,[sp,#128+16] 1958 ldp d12,d13,[sp,#128+32] 1959 ldp d14,d15,[sp,#128+48] 1960 1961 stp q24,q31,[sp,#0] // wipe off-load area 1962 stp q24,q31,[sp,#32] 1963 stp q24,q31,[sp,#64] 1964 1965 b.eq .Ldone_512_neon 1966 1967 cmp x2,#192 1968 sub v27.4s,v27.4s,v0.4s // -= 1 1969 sub v28.4s,v28.4s,v0.4s 1970 sub v29.4s,v29.4s,v0.4s 1971 add sp,sp,#128 1972 b.hs .Loop_outer_neon 1973 1974 eor v25.16b,v25.16b,v25.16b 1975 eor v26.16b,v26.16b,v26.16b 1976 eor v27.16b,v27.16b,v27.16b 1977 eor v28.16b,v28.16b,v28.16b 1978 eor v29.16b,v29.16b,v29.16b 1979 eor v30.16b,v30.16b,v30.16b 1980 b .Loop_outer 1981 1982.Ldone_512_neon: 1983 ldp x19,x20,[x29,#16] 1984 add sp,sp,#128+64 1985 ldp x21,x22,[x29,#32] 1986 ldp x23,x24,[x29,#48] 1987 ldp x25,x26,[x29,#64] 1988 ldp x27,x28,[x29,#80] 1989 ldp x29,x30,[sp],#96 1990 AARCH64_VALIDATE_LINK_REGISTER 1991 ret 1992.size ChaCha20_512_neon,.-ChaCha20_512_neon 1993#endif 1994#endif // !OPENSSL_NO_ASM 1995.section .note.GNU-stack,"",%progbits 1996