// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#include <openssl/asm_base.h>

#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
#include <openssl/arm_arch.h>

.text

// bn_mul_mont(rp, ap, bp, np, &n0, num): Montgomery multiplication,
// rp = ap*bp*2^(-64*num) mod np. Register assignment, as inferred from
// the loads below: x0=rp, x1=ap, x2=bp, x3=np, x4=&n0, x5=num.
.globl bn_mul_mont

.def bn_mul_mont
   .type 32
.endef
.align  4
bn_mul_mont:
        AARCH64_SIGN_LINK_REGISTER
        tst     x5,#7
        b.eq    __bn_sqr8x_mont
        tst     x5,#3
        b.eq    __bn_mul4x_mont
Lmul_mont:
        stp     x29,x30,[sp,#-64]!
        add     x29,sp,#0
        stp     x19,x20,[sp,#16]
        stp     x21,x22,[sp,#32]
        stp     x23,x24,[sp,#48]

        ldr     x9,[x2],#8              // bp[0]
        sub     x22,sp,x5,lsl#3
        ldp     x7,x8,[x1],#16          // ap[0..1]
        lsl     x5,x5,#3
        ldr     x4,[x4]                 // *n0
        and     x22,x22,#-16            // ABI says so
        ldp     x13,x14,[x3],#16        // np[0..1]

        mul     x6,x7,x9                // ap[0]*bp[0]
        sub     x21,x5,#16              // j=num-2
        umulh   x7,x7,x9
        mul     x10,x8,x9               // ap[1]*bp[0]
        umulh   x11,x8,x9

        mul     x15,x6,x4               // "tp[0]"*n0
        mov     sp,x22                  // alloca

        // (*)  mul     x12,x13,x15     // np[0]*m1
        umulh   x13,x13,x15
        mul     x16,x14,x15             // np[1]*m1
        // (*)  adds    x12,x12,x6      // discarded
        // (*)  On the removal of the first multiplication and addition:
        //      the outcome of the first addition is guaranteed to be
        //      zero, which leaves two computationally significant
        //      outcomes: it either carries or it doesn't. So when does
        //      it carry? If you follow the operations, the condition
        //      for a carry turns out to be quite simple: x6 being
        //      non-zero. The carry can therefore be calculated by
        //      adding -1 to x6, which is what the next instruction does.
        subs    xzr,x6,#1               // (*)
        umulh   x17,x14,x15
        adc     x13,x13,xzr
        cbz     x21,L1st_skip

L1st:
        ldr     x8,[x1],#8
        adds    x6,x10,x7
        sub     x21,x21,#8              // j--
        adc     x7,x11,xzr

        ldr     x14,[x3],#8
        adds    x12,x16,x13
        mul     x10,x8,x9               // ap[j]*bp[0]
        adc     x13,x17,xzr
        umulh   x11,x8,x9

        adds    x12,x12,x6
        mul     x16,x14,x15             // np[j]*m1
        adc     x13,x13,xzr
        umulh   x17,x14,x15
        str     x12,[x22],#8            // tp[j-1]
        cbnz    x21,L1st

L1st_skip:
        adds    x6,x10,x7
        sub     x1,x1,x5                // rewind x1
        adc     x7,x11,xzr

        adds    x12,x16,x13
        sub     x3,x3,x5                // rewind x3
        adc     x13,x17,xzr

        adds    x12,x12,x6
        sub     x20,x5,#8               // i=num-1
        adcs    x13,x13,x7

        adc     x19,xzr,xzr             // topmost overflow bit
        stp     x12,x13,[x22]

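        // Outer loop: one pass per word of bp[]. Each pass accumulates
        // ap[]*bp[i] into the temporary tp[], then folds in one
        // Montgomery reduction step with m1 = tp[0]*n0 mod 2^64 so that
        // the low word cancels and tp[] effectively shifts down a word.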
Louter:
        ldr     x9,[x2],#8              // bp[i]
        ldp     x7,x8,[x1],#16
        ldr     x23,[sp]                // tp[0]
        add     x22,sp,#8

        mul     x6,x7,x9                // ap[0]*bp[i]
        sub     x21,x5,#16              // j=num-2
        umulh   x7,x7,x9
        ldp     x13,x14,[x3],#16
        mul     x10,x8,x9               // ap[1]*bp[i]
        adds    x6,x6,x23
        umulh   x11,x8,x9
        adc     x7,x7,xzr

        mul     x15,x6,x4
        sub     x20,x20,#8              // i--

        // (*)  mul     x12,x13,x15     // np[0]*m1
        umulh   x13,x13,x15
        mul     x16,x14,x15             // np[1]*m1
        // (*)  adds    x12,x12,x6
        subs    xzr,x6,#1               // (*)
        umulh   x17,x14,x15
        cbz     x21,Linner_skip

Linner:
        ldr     x8,[x1],#8
        adc     x13,x13,xzr
        ldr     x23,[x22],#8            // tp[j]
        adds    x6,x10,x7
        sub     x21,x21,#8              // j--
        adc     x7,x11,xzr

        adds    x12,x16,x13
        ldr     x14,[x3],#8
        adc     x13,x17,xzr

        mul     x10,x8,x9               // ap[j]*bp[i]
        adds    x6,x6,x23
        umulh   x11,x8,x9
        adc     x7,x7,xzr

        mul     x16,x14,x15             // np[j]*m1
        adds    x12,x12,x6
        umulh   x17,x14,x15
        str     x12,[x22,#-16]          // tp[j-1]
        cbnz    x21,Linner

Linner_skip:
        ldr     x23,[x22],#8            // tp[j]
        adc     x13,x13,xzr
        adds    x6,x10,x7
        sub     x1,x1,x5                // rewind x1
        adc     x7,x11,xzr

        adds    x12,x16,x13
        sub     x3,x3,x5                // rewind x3
        adcs    x13,x17,x19
        adc     x19,xzr,xzr

        adds    x6,x6,x23
        adc     x7,x7,xzr

        adds    x12,x12,x6
        adcs    x13,x13,x7
        adc     x19,x19,xzr             // topmost overflow bit
        stp     x12,x13,[x22,#-16]

        cbnz    x20,Louter

        // Final step. We see if result is larger than modulus, and
        // if it is, subtract the modulus. But comparison implies
        // subtraction. So we subtract modulus, see if it borrowed,
        // and conditionally copy original value.
        ldr     x23,[sp]                // tp[0]
        add     x22,sp,#8
        ldr     x14,[x3],#8             // np[0]
        subs    x21,x5,#8               // j=num-1 and clear borrow
        mov     x1,x0
Lsub:
        sbcs    x8,x23,x14              // tp[j]-np[j]
        ldr     x23,[x22],#8
        sub     x21,x21,#8              // j--
        ldr     x14,[x3],#8
        str     x8,[x1],#8              // rp[j]=tp[j]-np[j]
        cbnz    x21,Lsub

        sbcs    x8,x23,x14
        sbcs    x19,x19,xzr             // did it borrow?
        str     x8,[x1],#8              // rp[num-1]

        ldr     x23,[sp]                // tp[0]
        add     x22,sp,#8
        ldr     x8,[x0],#8              // rp[0]
        sub     x5,x5,#8                // num--
        nop
Lcond_copy:
        sub     x5,x5,#8                // num--
        csel    x14,x23,x8,lo           // did it borrow?
        ldr     x23,[x22],#8
        ldr     x8,[x0],#8
        str     xzr,[x22,#-16]          // wipe tp
        str     x14,[x0,#-16]
        cbnz    x5,Lcond_copy

        csel    x14,x23,x8,lo
        str     xzr,[x22,#-8]           // wipe tp
        str     x14,[x0,#-8]

        ldp     x19,x20,[x29,#16]
        mov     sp,x29
        ldp     x21,x22,[x29,#32]
        mov     x0,#1
        ldp     x23,x24,[x29,#48]
        ldr     x29,[sp],#64
        AARCH64_VALIDATE_LINK_REGISTER
        ret

.def __bn_sqr8x_mont
   .type 32
.endef
.align  5
__bn_sqr8x_mont:
        // Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to
        // only from bn_mul_mont which has already signed the return address.
        cmp     x1,x2
        b.ne    __bn_mul4x_mont
Lsqr8x_mont:
        stp     x29,x30,[sp,#-128]!
        add     x29,sp,#0
        stp     x19,x20,[sp,#16]
        stp     x21,x22,[sp,#32]
        stp     x23,x24,[sp,#48]
        stp     x25,x26,[sp,#64]
        stp     x27,x28,[sp,#80]
        stp     x0,x3,[sp,#96]          // offload rp and np

        ldp     x6,x7,[x1,#8*0]
        ldp     x8,x9,[x1,#8*2]
        ldp     x10,x11,[x1,#8*4]
        ldp     x12,x13,[x1,#8*6]

        sub     x2,sp,x5,lsl#4
        lsl     x5,x5,#3
        ldr     x4,[x4]                 // *n0
        mov     sp,x2                   // alloca
        sub     x27,x5,#8*8
        b       Lsqr8x_zero_start

Lsqr8x_zero:
        sub     x27,x27,#8*8
        stp     xzr,xzr,[x2,#8*0]
        stp     xzr,xzr,[x2,#8*2]
        stp     xzr,xzr,[x2,#8*4]
        stp     xzr,xzr,[x2,#8*6]
Lsqr8x_zero_start:
        stp     xzr,xzr,[x2,#8*8]
        stp     xzr,xzr,[x2,#8*10]
        stp     xzr,xzr,[x2,#8*12]
        stp     xzr,xzr,[x2,#8*14]
        add     x2,x2,#8*16
        cbnz    x27,Lsqr8x_zero

        add     x3,x1,x5
        add     x1,x1,#8*8
        mov     x19,xzr
        mov     x20,xzr
        mov     x21,xzr
        mov     x22,xzr
        mov     x23,xzr
        mov     x24,xzr
        mov     x25,xzr
        mov     x26,xzr
        mov     x2,sp
        str     x4,[x29,#112]           // offload n0

        // Multiply everything but a[i]*a[i]
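        // The cross products a[i]*a[j], i<j, are accumulated first;
        // they are doubled and the squares a[i]*a[i] added afterwards,
        // in Lsqr8x_outer_break.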
.align  4
Lsqr8x_outer_loop:
        // a[1]a[0]     (i)
        // a[2]a[0]
        // a[3]a[0]
        // a[4]a[0]
        // a[5]a[0]
        // a[6]a[0]
        // a[7]a[0]
        // a[2]a[1]     (ii)
        // a[3]a[1]
        // a[4]a[1]
        // a[5]a[1]
        // a[6]a[1]
        // a[7]a[1]
        // a[3]a[2]     (iii)
        // a[4]a[2]
        // a[5]a[2]
        // a[6]a[2]
        // a[7]a[2]
        // a[4]a[3]     (iv)
        // a[5]a[3]
        // a[6]a[3]
        // a[7]a[3]
        // a[5]a[4]     (v)
        // a[6]a[4]
        // a[7]a[4]
        // a[6]a[5]     (vi)
        // a[7]a[5]
        // a[7]a[6]     (vii)

        mul     x14,x7,x6               // lo(a[1..7]*a[0])     (i)
        mul     x15,x8,x6
        mul     x16,x9,x6
        mul     x17,x10,x6
        adds    x20,x20,x14             // t[1]+lo(a[1]*a[0])
        mul     x14,x11,x6
        adcs    x21,x21,x15
        mul     x15,x12,x6
        adcs    x22,x22,x16
        mul     x16,x13,x6
        adcs    x23,x23,x17
        umulh   x17,x7,x6               // hi(a[1..7]*a[0])
        adcs    x24,x24,x14
        umulh   x14,x8,x6
        adcs    x25,x25,x15
        umulh   x15,x9,x6
        adcs    x26,x26,x16
        umulh   x16,x10,x6
        stp     x19,x20,[x2],#8*2       // t[0..1]
        adc     x19,xzr,xzr             // t[8]
        adds    x21,x21,x17             // t[2]+hi(a[1]*a[0])
        umulh   x17,x11,x6
        adcs    x22,x22,x14
        umulh   x14,x12,x6
        adcs    x23,x23,x15
        umulh   x15,x13,x6
        adcs    x24,x24,x16
        mul     x16,x8,x7               // lo(a[2..7]*a[1])     (ii)
        adcs    x25,x25,x17
        mul     x17,x9,x7
        adcs    x26,x26,x14
        mul     x14,x10,x7
        adc     x19,x19,x15

        mul     x15,x11,x7
        adds    x22,x22,x16
        mul     x16,x12,x7
        adcs    x23,x23,x17
        mul     x17,x13,x7
        adcs    x24,x24,x14
        umulh   x14,x8,x7               // hi(a[2..7]*a[1])
        adcs    x25,x25,x15
        umulh   x15,x9,x7
        adcs    x26,x26,x16
        umulh   x16,x10,x7
        adcs    x19,x19,x17
        umulh   x17,x11,x7
        stp     x21,x22,[x2],#8*2       // t[2..3]
        adc     x20,xzr,xzr             // t[9]
        adds    x23,x23,x14
        umulh   x14,x12,x7
        adcs    x24,x24,x15
        umulh   x15,x13,x7
        adcs    x25,x25,x16
        mul     x16,x9,x8               // lo(a[3..7]*a[2])     (iii)
        adcs    x26,x26,x17
        mul     x17,x10,x8
        adcs    x19,x19,x14
        mul     x14,x11,x8
        adc     x20,x20,x15

        mul     x15,x12,x8
        adds    x24,x24,x16
        mul     x16,x13,x8
        adcs    x25,x25,x17
        umulh   x17,x9,x8               // hi(a[3..7]*a[2])
        adcs    x26,x26,x14
        umulh   x14,x10,x8
        adcs    x19,x19,x15
        umulh   x15,x11,x8
        adcs    x20,x20,x16
        umulh   x16,x12,x8
        stp     x23,x24,[x2],#8*2       // t[4..5]
        adc     x21,xzr,xzr             // t[10]
        adds    x25,x25,x17
        umulh   x17,x13,x8
        adcs    x26,x26,x14
        mul     x14,x10,x9              // lo(a[4..7]*a[3])     (iv)
        adcs    x19,x19,x15
        mul     x15,x11,x9
        adcs    x20,x20,x16
        mul     x16,x12,x9
        adc     x21,x21,x17

        mul     x17,x13,x9
        adds    x26,x26,x14
        umulh   x14,x10,x9              // hi(a[4..7]*a[3])
        adcs    x19,x19,x15
        umulh   x15,x11,x9
        adcs    x20,x20,x16
        umulh   x16,x12,x9
        adcs    x21,x21,x17
        umulh   x17,x13,x9
        stp     x25,x26,[x2],#8*2       // t[6..7]
        adc     x22,xzr,xzr             // t[11]
        adds    x19,x19,x14
        mul     x14,x11,x10             // lo(a[5..7]*a[4])     (v)
        adcs    x20,x20,x15
        mul     x15,x12,x10
        adcs    x21,x21,x16
        mul     x16,x13,x10
        adc     x22,x22,x17

        umulh   x17,x11,x10             // hi(a[5..7]*a[4])
        adds    x20,x20,x14
        umulh   x14,x12,x10
        adcs    x21,x21,x15
        umulh   x15,x13,x10
        adcs    x22,x22,x16
        mul     x16,x12,x11             // lo(a[6..7]*a[5])     (vi)
        adc     x23,xzr,xzr             // t[12]
        adds    x21,x21,x17
        mul     x17,x13,x11
        adcs    x22,x22,x14
        umulh   x14,x12,x11             // hi(a[6..7]*a[5])
        adc     x23,x23,x15

        umulh   x15,x13,x11
        adds    x22,x22,x16
        mul     x16,x13,x12             // lo(a[7]*a[6])        (vii)
        adcs    x23,x23,x17
        umulh   x17,x13,x12             // hi(a[7]*a[6])
        adc     x24,xzr,xzr             // t[13]
        adds    x23,x23,x14
        sub     x27,x3,x1               // done yet?
        adc     x24,x24,x15

        adds    x24,x24,x16
        sub     x14,x3,x5               // rewound ap
        adc     x25,xzr,xzr             // t[14]
        add     x25,x25,x17

        cbz     x27,Lsqr8x_outer_break

        mov     x4,x6
        ldp     x6,x7,[x2,#8*0]
        ldp     x8,x9,[x2,#8*2]
        ldp     x10,x11,[x2,#8*4]
        ldp     x12,x13,[x2,#8*6]
        adds    x19,x19,x6
        adcs    x20,x20,x7
        ldp     x6,x7,[x1,#8*0]
        adcs    x21,x21,x8
        adcs    x22,x22,x9
        ldp     x8,x9,[x1,#8*2]
        adcs    x23,x23,x10
        adcs    x24,x24,x11
        ldp     x10,x11,[x1,#8*4]
        adcs    x25,x25,x12
        mov     x0,x1
        adcs    x26,xzr,x13
        ldp     x12,x13,[x1,#8*6]
        add     x1,x1,#8*8
        //adc   x28,xzr,xzr             // moved below
        mov     x27,#-8*8

        // a[8]a[0]
        // a[9]a[0]
        // a[a]a[0]
        // a[b]a[0]
        // a[c]a[0]
        // a[d]a[0]
        // a[e]a[0]
        // a[f]a[0]
        // a[8]a[1]
        // a[f]a[1]........................
        // a[8]a[2]
        // a[f]a[2]........................
        // a[8]a[3]
        // a[f]a[3]........................
        // a[8]a[4]
        // a[f]a[4]........................
        // a[8]a[5]
        // a[f]a[5]........................
        // a[8]a[6]
        // a[f]a[6]........................
        // a[8]a[7]
        // a[f]a[7]........................
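        // Lsqr8x_mul: multiply the current eight a[] words (x6-x13) by
        // successive earlier words a[i] (loaded into x4) and accumulate
        // into the t[] window x19-x26, one a[i] per pass.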
Lsqr8x_mul:
        mul     x14,x6,x4
        adc     x28,xzr,xzr             // carry bit, modulo-scheduled
        mul     x15,x7,x4
        add     x27,x27,#8
        mul     x16,x8,x4
        mul     x17,x9,x4
        adds    x19,x19,x14
        mul     x14,x10,x4
        adcs    x20,x20,x15
        mul     x15,x11,x4
        adcs    x21,x21,x16
        mul     x16,x12,x4
        adcs    x22,x22,x17
        mul     x17,x13,x4
        adcs    x23,x23,x14
        umulh   x14,x6,x4
        adcs    x24,x24,x15
        umulh   x15,x7,x4
        adcs    x25,x25,x16
        umulh   x16,x8,x4
        adcs    x26,x26,x17
        umulh   x17,x9,x4
        adc     x28,x28,xzr
        str     x19,[x2],#8
        adds    x19,x20,x14
        umulh   x14,x10,x4
        adcs    x20,x21,x15
        umulh   x15,x11,x4
        adcs    x21,x22,x16
        umulh   x16,x12,x4
        adcs    x22,x23,x17
        umulh   x17,x13,x4
        ldr     x4,[x0,x27]
        adcs    x23,x24,x14
        adcs    x24,x25,x15
        adcs    x25,x26,x16
        adcs    x26,x28,x17
        //adc   x28,xzr,xzr             // moved above
        cbnz    x27,Lsqr8x_mul
                                        // note that carry flag is guaranteed
                                        // to be zero at this point
        cmp     x1,x3                   // done yet?
        b.eq    Lsqr8x_break

        ldp     x6,x7,[x2,#8*0]
        ldp     x8,x9,[x2,#8*2]
        ldp     x10,x11,[x2,#8*4]
        ldp     x12,x13,[x2,#8*6]
        adds    x19,x19,x6
        ldr     x4,[x0,#-8*8]
        adcs    x20,x20,x7
        ldp     x6,x7,[x1,#8*0]
        adcs    x21,x21,x8
        adcs    x22,x22,x9
        ldp     x8,x9,[x1,#8*2]
        adcs    x23,x23,x10
        adcs    x24,x24,x11
        ldp     x10,x11,[x1,#8*4]
        adcs    x25,x25,x12
        mov     x27,#-8*8
        adcs    x26,x26,x13
        ldp     x12,x13,[x1,#8*6]
        add     x1,x1,#8*8
        //adc   x28,xzr,xzr             // moved above
        b       Lsqr8x_mul

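        // Lsqr8x_break: reload the next eight a[] words and re-enter
        // the outer loop, first sliding the t[] window back unless
        // this was the final pass over a[].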
.align  4
Lsqr8x_break:
        ldp     x6,x7,[x0,#8*0]
        add     x1,x0,#8*8
        ldp     x8,x9,[x0,#8*2]
        sub     x14,x3,x1               // is it last iteration?
        ldp     x10,x11,[x0,#8*4]
        sub     x15,x2,x14
        ldp     x12,x13,[x0,#8*6]
        cbz     x14,Lsqr8x_outer_loop

        stp     x19,x20,[x2,#8*0]
        ldp     x19,x20,[x15,#8*0]
        stp     x21,x22,[x2,#8*2]
        ldp     x21,x22,[x15,#8*2]
        stp     x23,x24,[x2,#8*4]
        ldp     x23,x24,[x15,#8*4]
        stp     x25,x26,[x2,#8*6]
        mov     x2,x15
        ldp     x25,x26,[x15,#8*6]
        b       Lsqr8x_outer_loop

.align  4
Lsqr8x_outer_break:
        // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0]
        // The doubling is done on the fly: extr xd,xhi,xlo,#63 yields
        // (xhi<<1)|(xlo>>63), i.e. the next word shifted left by one
        // with the carried-in top bit, and the squares a[i]*a[i] are
        // added in the same pass.
        ldp     x7,x9,[x14,#8*0]        // recall that x14 is &a[0]
        ldp     x15,x16,[sp,#8*1]
        ldp     x11,x13,[x14,#8*2]
        add     x1,x14,#8*4
        ldp     x17,x14,[sp,#8*3]

        stp     x19,x20,[x2,#8*0]
        mul     x19,x7,x7
        stp     x21,x22,[x2,#8*2]
        umulh   x7,x7,x7
        stp     x23,x24,[x2,#8*4]
        mul     x8,x9,x9
        stp     x25,x26,[x2,#8*6]
        mov     x2,sp
        umulh   x9,x9,x9
        adds    x20,x7,x15,lsl#1
        extr    x15,x16,x15,#63
        sub     x27,x5,#8*4

Lsqr4x_shift_n_add:
        adcs    x21,x8,x15
        extr    x16,x17,x16,#63
        sub     x27,x27,#8*4
        adcs    x22,x9,x16
        ldp     x15,x16,[x2,#8*5]
        mul     x10,x11,x11
        ldp     x7,x9,[x1],#8*2
        umulh   x11,x11,x11
        mul     x12,x13,x13
        umulh   x13,x13,x13
        extr    x17,x14,x17,#63
        stp     x19,x20,[x2,#8*0]
        adcs    x23,x10,x17
        extr    x14,x15,x14,#63
        stp     x21,x22,[x2,#8*2]
        adcs    x24,x11,x14
        ldp     x17,x14,[x2,#8*7]
        extr    x15,x16,x15,#63
        adcs    x25,x12,x15
        extr    x16,x17,x16,#63
        adcs    x26,x13,x16
        ldp     x15,x16,[x2,#8*9]
        mul     x6,x7,x7
        ldp     x11,x13,[x1],#8*2
        umulh   x7,x7,x7
        mul     x8,x9,x9
        umulh   x9,x9,x9
        stp     x23,x24,[x2,#8*4]
        extr    x17,x14,x17,#63
        stp     x25,x26,[x2,#8*6]
        add     x2,x2,#8*8
        adcs    x19,x6,x17
        extr    x14,x15,x14,#63
        adcs    x20,x7,x14
        ldp     x17,x14,[x2,#8*3]
        extr    x15,x16,x15,#63
        cbnz    x27,Lsqr4x_shift_n_add
        ldp     x1,x4,[x29,#104]        // pull np and n0

        adcs    x21,x8,x15
        extr    x16,x17,x16,#63
        adcs    x22,x9,x16
        ldp     x15,x16,[x2,#8*5]
        mul     x10,x11,x11
        umulh   x11,x11,x11
        stp     x19,x20,[x2,#8*0]
        mul     x12,x13,x13
        umulh   x13,x13,x13
        stp     x21,x22,[x2,#8*2]
        extr    x17,x14,x17,#63
        adcs    x23,x10,x17
        extr    x14,x15,x14,#63
        ldp     x19,x20,[sp,#8*0]
        adcs    x24,x11,x14
        extr    x15,x16,x15,#63
        ldp     x6,x7,[x1,#8*0]
        adcs    x25,x12,x15
        extr    x16,xzr,x16,#63
        ldp     x8,x9,[x1,#8*2]
        adc     x26,x13,x16
        ldp     x10,x11,[x1,#8*4]

        // Reduce by 512 bits per iteration
        mul     x28,x4,x19              // t[0]*n0
        ldp     x12,x13,[x1,#8*6]
        add     x3,x1,x5
        ldp     x21,x22,[sp,#8*2]
        stp     x23,x24,[x2,#8*4]
        ldp     x23,x24,[sp,#8*4]
        stp     x25,x26,[x2,#8*6]
        ldp     x25,x26,[sp,#8*6]
        add     x1,x1,#8*8
        mov     x30,xzr                 // initial top-most carry
        mov     x2,sp
        mov     x27,#8

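        // Lsqr8x_reduction performs eight Montgomery steps per pass:
        // each step computes x28 = t[0]*n0 mod 2^64, adds x28*n[0..7]
        // so that the low word cancels, and shifts the window down one
        // word. The x28 values are put aside and replayed against the
        // upper n[] words in Lsqr8x_tail.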
Lsqr8x_reduction:
        // (*)  mul     x14,x6,x28      // lo(n[0-7])*lo(t[0]*n0)
        mul     x15,x7,x28
        sub     x27,x27,#1
        mul     x16,x8,x28
        str     x28,[x2],#8             // put aside t[0]*n0 for tail processing
        mul     x17,x9,x28
        // (*)  adds    xzr,x19,x14
        subs    xzr,x19,#1              // (*)
        mul     x14,x10,x28
        adcs    x19,x20,x15
        mul     x15,x11,x28
        adcs    x20,x21,x16
        mul     x16,x12,x28
        adcs    x21,x22,x17
        mul     x17,x13,x28
        adcs    x22,x23,x14
        umulh   x14,x6,x28              // hi(n[0-7])*lo(t[0]*n0)
        adcs    x23,x24,x15
        umulh   x15,x7,x28
        adcs    x24,x25,x16
        umulh   x16,x8,x28
        adcs    x25,x26,x17
        umulh   x17,x9,x28
        adc     x26,xzr,xzr
        adds    x19,x19,x14
        umulh   x14,x10,x28
        adcs    x20,x20,x15
        umulh   x15,x11,x28
        adcs    x21,x21,x16
        umulh   x16,x12,x28
        adcs    x22,x22,x17
        umulh   x17,x13,x28
        mul     x28,x4,x19              // next t[0]*n0
        adcs    x23,x23,x14
        adcs    x24,x24,x15
        adcs    x25,x25,x16
        adc     x26,x26,x17
        cbnz    x27,Lsqr8x_reduction

        ldp     x14,x15,[x2,#8*0]
        ldp     x16,x17,[x2,#8*2]
        mov     x0,x2
        sub     x27,x3,x1               // done yet?
        adds    x19,x19,x14
        adcs    x20,x20,x15
        ldp     x14,x15,[x2,#8*4]
        adcs    x21,x21,x16
        adcs    x22,x22,x17
        ldp     x16,x17,[x2,#8*6]
        adcs    x23,x23,x14
        adcs    x24,x24,x15
        adcs    x25,x25,x16
        adcs    x26,x26,x17
        //adc   x28,xzr,xzr             // moved below
        cbz     x27,Lsqr8x8_post_condition

        ldr     x4,[x2,#-8*8]
        ldp     x6,x7,[x1,#8*0]
        ldp     x8,x9,[x1,#8*2]
        ldp     x10,x11,[x1,#8*4]
        mov     x27,#-8*8
        ldp     x12,x13,[x1,#8*6]
        add     x1,x1,#8*8

Lsqr8x_tail:
        mul     x14,x6,x4
        adc     x28,xzr,xzr             // carry bit, modulo-scheduled
        mul     x15,x7,x4
        add     x27,x27,#8
        mul     x16,x8,x4
        mul     x17,x9,x4
        adds    x19,x19,x14
        mul     x14,x10,x4
        adcs    x20,x20,x15
        mul     x15,x11,x4
        adcs    x21,x21,x16
        mul     x16,x12,x4
        adcs    x22,x22,x17
        mul     x17,x13,x4
        adcs    x23,x23,x14
        umulh   x14,x6,x4
        adcs    x24,x24,x15
        umulh   x15,x7,x4
        adcs    x25,x25,x16
        umulh   x16,x8,x4
        adcs    x26,x26,x17
        umulh   x17,x9,x4
        adc     x28,x28,xzr
        str     x19,[x2],#8
        adds    x19,x20,x14
        umulh   x14,x10,x4
        adcs    x20,x21,x15
        umulh   x15,x11,x4
        adcs    x21,x22,x16
        umulh   x16,x12,x4
        adcs    x22,x23,x17
        umulh   x17,x13,x4
        ldr     x4,[x0,x27]
        adcs    x23,x24,x14
        adcs    x24,x25,x15
        adcs    x25,x26,x16
        adcs    x26,x28,x17
        //adc   x28,xzr,xzr             // moved above
        cbnz    x27,Lsqr8x_tail
                                        // note that carry flag is guaranteed
                                        // to be zero at this point
        ldp     x6,x7,[x2,#8*0]
        sub     x27,x3,x1               // done yet?
        sub     x16,x3,x5               // rewound np
        ldp     x8,x9,[x2,#8*2]
        ldp     x10,x11,[x2,#8*4]
        ldp     x12,x13,[x2,#8*6]
        cbz     x27,Lsqr8x_tail_break

        ldr     x4,[x0,#-8*8]
        adds    x19,x19,x6
        adcs    x20,x20,x7
        ldp     x6,x7,[x1,#8*0]
        adcs    x21,x21,x8
        adcs    x22,x22,x9
        ldp     x8,x9,[x1,#8*2]
        adcs    x23,x23,x10
        adcs    x24,x24,x11
        ldp     x10,x11,[x1,#8*4]
        adcs    x25,x25,x12
        mov     x27,#-8*8
        adcs    x26,x26,x13
        ldp     x12,x13,[x1,#8*6]
        add     x1,x1,#8*8
        //adc   x28,xzr,xzr             // moved above
        b       Lsqr8x_tail

.align  4
Lsqr8x_tail_break:
        ldr     x4,[x29,#112]           // pull n0
        add     x27,x2,#8*8             // end of current t[num] window

        subs    xzr,x30,#1              // "move" top-most carry to carry bit
        adcs    x14,x19,x6
        adcs    x15,x20,x7
        ldp     x19,x20,[x0,#8*0]
        adcs    x21,x21,x8
        ldp     x6,x7,[x16,#8*0]        // recall that x16 is &n[0]
        adcs    x22,x22,x9
        ldp     x8,x9,[x16,#8*2]
        adcs    x23,x23,x10
        adcs    x24,x24,x11
        ldp     x10,x11,[x16,#8*4]
        adcs    x25,x25,x12
        adcs    x26,x26,x13
        ldp     x12,x13,[x16,#8*6]
        add     x1,x16,#8*8
        adc     x30,xzr,xzr             // top-most carry
        mul     x28,x4,x19
        stp     x14,x15,[x2,#8*0]
        stp     x21,x22,[x2,#8*2]
        ldp     x21,x22,[x0,#8*2]
        stp     x23,x24,[x2,#8*4]
        ldp     x23,x24,[x0,#8*4]
        cmp     x27,x29                 // did we hit the bottom?
        stp     x25,x26,[x2,#8*6]
        mov     x2,x0                   // slide the window
        ldp     x25,x26,[x0,#8*6]
        mov     x27,#8
        b.ne    Lsqr8x_reduction

        // Final step. We see if result is larger than modulus, and
        // if it is, subtract the modulus. But comparison implies
        // subtraction. So we subtract modulus, see if it borrowed,
        // and conditionally copy original value.
        ldr     x0,[x29,#96]            // pull rp
        add     x2,x2,#8*8
        subs    x14,x19,x6
        sbcs    x15,x20,x7
        sub     x27,x5,#8*8
        mov     x3,x0                   // x0 copy

Lsqr8x_sub:
        sbcs    x16,x21,x8
        ldp     x6,x7,[x1,#8*0]
        sbcs    x17,x22,x9
        stp     x14,x15,[x0,#8*0]
        sbcs    x14,x23,x10
        ldp     x8,x9,[x1,#8*2]
        sbcs    x15,x24,x11
        stp     x16,x17,[x0,#8*2]
        sbcs    x16,x25,x12
        ldp     x10,x11,[x1,#8*4]
        sbcs    x17,x26,x13
        ldp     x12,x13,[x1,#8*6]
        add     x1,x1,#8*8
        ldp     x19,x20,[x2,#8*0]
        sub     x27,x27,#8*8
        ldp     x21,x22,[x2,#8*2]
        ldp     x23,x24,[x2,#8*4]
        ldp     x25,x26,[x2,#8*6]
        add     x2,x2,#8*8
        stp     x14,x15,[x0,#8*4]
        sbcs    x14,x19,x6
        stp     x16,x17,[x0,#8*6]
        add     x0,x0,#8*8
        sbcs    x15,x20,x7
        cbnz    x27,Lsqr8x_sub

        sbcs    x16,x21,x8
        mov     x2,sp
        add     x1,sp,x5
        ldp     x6,x7,[x3,#8*0]
        sbcs    x17,x22,x9
        stp     x14,x15,[x0,#8*0]
        sbcs    x14,x23,x10
        ldp     x8,x9,[x3,#8*2]
        sbcs    x15,x24,x11
        stp     x16,x17,[x0,#8*2]
        sbcs    x16,x25,x12
        ldp     x19,x20,[x1,#8*0]
        sbcs    x17,x26,x13
        ldp     x21,x22,[x1,#8*2]
        sbcs    xzr,x30,xzr             // did it borrow?
        ldr     x30,[x29,#8]            // pull return address
        stp     x14,x15,[x0,#8*4]
        stp     x16,x17,[x0,#8*6]

        sub     x27,x5,#8*4
Lsqr4x_cond_copy:
        sub     x27,x27,#8*4
        csel    x14,x19,x6,lo
        stp     xzr,xzr,[x2,#8*0]
        csel    x15,x20,x7,lo
        ldp     x6,x7,[x3,#8*4]
        ldp     x19,x20,[x1,#8*4]
        csel    x16,x21,x8,lo
        stp     xzr,xzr,[x2,#8*2]
        add     x2,x2,#8*4
        csel    x17,x22,x9,lo
        ldp     x8,x9,[x3,#8*6]
        ldp     x21,x22,[x1,#8*6]
        add     x1,x1,#8*4
        stp     x14,x15,[x3,#8*0]
        stp     x16,x17,[x3,#8*2]
        add     x3,x3,#8*4
        stp     xzr,xzr,[x1,#8*0]
        stp     xzr,xzr,[x1,#8*2]
        cbnz    x27,Lsqr4x_cond_copy

        csel    x14,x19,x6,lo
        stp     xzr,xzr,[x2,#8*0]
        csel    x15,x20,x7,lo
        stp     xzr,xzr,[x2,#8*2]
        csel    x16,x21,x8,lo
        csel    x17,x22,x9,lo
        stp     x14,x15,[x3,#8*0]
        stp     x16,x17,[x3,#8*2]

        b       Lsqr8x_done

.align  4
Lsqr8x8_post_condition:
        adc     x28,xzr,xzr
        ldr     x30,[x29,#8]            // pull return address
        // x19-x26,x28 hold result, x6-x13 hold modulus
        subs    x6,x19,x6
        ldr     x1,[x29,#96]            // pull rp
        sbcs    x7,x20,x7
        stp     xzr,xzr,[sp,#8*0]
        sbcs    x8,x21,x8
        stp     xzr,xzr,[sp,#8*2]
        sbcs    x9,x22,x9
        stp     xzr,xzr,[sp,#8*4]
        sbcs    x10,x23,x10
        stp     xzr,xzr,[sp,#8*6]
        sbcs    x11,x24,x11
        stp     xzr,xzr,[sp,#8*8]
        sbcs    x12,x25,x12
        stp     xzr,xzr,[sp,#8*10]
        sbcs    x13,x26,x13
        stp     xzr,xzr,[sp,#8*12]
        sbcs    x28,x28,xzr             // did it borrow?
        stp     xzr,xzr,[sp,#8*14]

        // x6-x13 hold result-modulus
        csel    x6,x19,x6,lo
        csel    x7,x20,x7,lo
        csel    x8,x21,x8,lo
        csel    x9,x22,x9,lo
        stp     x6,x7,[x1,#8*0]
        csel    x10,x23,x10,lo
        csel    x11,x24,x11,lo
        stp     x8,x9,[x1,#8*2]
        csel    x12,x25,x12,lo
        csel    x13,x26,x13,lo
        stp     x10,x11,[x1,#8*4]
        stp     x12,x13,[x1,#8*6]

Lsqr8x_done:
        ldp     x19,x20,[x29,#16]
        mov     sp,x29
        ldp     x21,x22,[x29,#32]
        mov     x0,#1
        ldp     x23,x24,[x29,#48]
        ldp     x25,x26,[x29,#64]
        ldp     x27,x28,[x29,#80]
        ldr     x29,[sp],#128
        // x30 is popped earlier
        AARCH64_VALIDATE_LINK_REGISTER
        ret

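        // __bn_mul4x_mont processes a[] and n[] four words at a time;
        // within a pass, x28 cycles through the byte offsets of the
        // current four b[] words ("and x28,x28,#31").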
.def __bn_mul4x_mont
   .type 32
.endef
.align  5
__bn_mul4x_mont:
        // Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to
        // only from bn_mul_mont or __bn_sqr8x_mont, which have already signed the
        // return address.
        stp     x29,x30,[sp,#-128]!
        add     x29,sp,#0
        stp     x19,x20,[sp,#16]
        stp     x21,x22,[sp,#32]
        stp     x23,x24,[sp,#48]
        stp     x25,x26,[sp,#64]
        stp     x27,x28,[sp,#80]

        sub     x26,sp,x5,lsl#3
        lsl     x5,x5,#3
        ldr     x4,[x4]                 // *n0
        sub     sp,x26,#8*4             // alloca

        add     x10,x2,x5
        add     x27,x1,x5
        stp     x0,x10,[x29,#96]        // offload rp and &b[num]

        ldr     x24,[x2,#8*0]           // b[0]
        ldp     x6,x7,[x1,#8*0]         // a[0..3]
        ldp     x8,x9,[x1,#8*2]
        add     x1,x1,#8*4
        mov     x19,xzr
        mov     x20,xzr
        mov     x21,xzr
        mov     x22,xzr
        ldp     x14,x15,[x3,#8*0]       // n[0..3]
        ldp     x16,x17,[x3,#8*2]
        adds    x3,x3,#8*4              // clear carry bit
        mov     x0,xzr
        mov     x28,#0
        mov     x26,sp

Loop_mul4x_1st_reduction:
        mul     x10,x6,x24              // lo(a[0..3]*b[0])
        adc     x0,x0,xzr               // modulo-scheduled
        mul     x11,x7,x24
        add     x28,x28,#8
        mul     x12,x8,x24
        and     x28,x28,#31
        mul     x13,x9,x24
        adds    x19,x19,x10
        umulh   x10,x6,x24              // hi(a[0..3]*b[0])
        adcs    x20,x20,x11
        mul     x25,x19,x4              // t[0]*n0
        adcs    x21,x21,x12
        umulh   x11,x7,x24
        adcs    x22,x22,x13
        umulh   x12,x8,x24
        adc     x23,xzr,xzr
        umulh   x13,x9,x24
        ldr     x24,[x2,x28]            // next b[i] (or b[0])
        adds    x20,x20,x10
        // (*)  mul     x10,x14,x25     // lo(n[0..3]*t[0]*n0)
        str     x25,[x26],#8            // put aside t[0]*n0 for tail processing
        adcs    x21,x21,x11
        mul     x11,x15,x25
        adcs    x22,x22,x12
        mul     x12,x16,x25
        adc     x23,x23,x13             // can't overflow
        mul     x13,x17,x25
        // (*)  adds    xzr,x19,x10
        subs    xzr,x19,#1              // (*)
        umulh   x10,x14,x25             // hi(n[0..3]*t[0]*n0)
        adcs    x19,x20,x11
        umulh   x11,x15,x25
        adcs    x20,x21,x12
        umulh   x12,x16,x25
        adcs    x21,x22,x13
        umulh   x13,x17,x25
        adcs    x22,x23,x0
        adc     x0,xzr,xzr
        adds    x19,x19,x10
        sub     x10,x27,x1
        adcs    x20,x20,x11
        adcs    x21,x21,x12
        adcs    x22,x22,x13
        //adc   x0,x0,xzr
        cbnz    x28,Loop_mul4x_1st_reduction

        cbz     x10,Lmul4x4_post_condition

        ldp     x6,x7,[x1,#8*0]         // a[4..7]
        ldp     x8,x9,[x1,#8*2]
        add     x1,x1,#8*4
        ldr     x25,[sp]                // a[0]*n0
        ldp     x14,x15,[x3,#8*0]       // n[4..7]
        ldp     x16,x17,[x3,#8*2]
        add     x3,x3,#8*4

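        // The tail loops walk the remaining a[] and n[] windows for the
        // same four b[] words, replaying the t[0]*n0 values that the
        // reduction loop put aside on the stack.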
Loop_mul4x_1st_tail:
        mul     x10,x6,x24              // lo(a[4..7]*b[i])
        adc     x0,x0,xzr               // modulo-scheduled
        mul     x11,x7,x24
        add     x28,x28,#8
        mul     x12,x8,x24
        and     x28,x28,#31
        mul     x13,x9,x24
        adds    x19,x19,x10
        umulh   x10,x6,x24              // hi(a[4..7]*b[i])
        adcs    x20,x20,x11
        umulh   x11,x7,x24
        adcs    x21,x21,x12
        umulh   x12,x8,x24
        adcs    x22,x22,x13
        umulh   x13,x9,x24
        adc     x23,xzr,xzr
        ldr     x24,[x2,x28]            // next b[i] (or b[0])
        adds    x20,x20,x10
        mul     x10,x14,x25             // lo(n[4..7]*a[0]*n0)
        adcs    x21,x21,x11
        mul     x11,x15,x25
        adcs    x22,x22,x12
        mul     x12,x16,x25
        adc     x23,x23,x13             // can't overflow
        mul     x13,x17,x25
        adds    x19,x19,x10
        umulh   x10,x14,x25             // hi(n[4..7]*a[0]*n0)
        adcs    x20,x20,x11
        umulh   x11,x15,x25
        adcs    x21,x21,x12
        umulh   x12,x16,x25
        adcs    x22,x22,x13
        adcs    x23,x23,x0
        umulh   x13,x17,x25
        adc     x0,xzr,xzr
        ldr     x25,[sp,x28]            // next t[0]*n0
        str     x19,[x26],#8            // result!!!
        adds    x19,x20,x10
        sub     x10,x27,x1              // done yet?
        adcs    x20,x21,x11
        adcs    x21,x22,x12
        adcs    x22,x23,x13
        //adc   x0,x0,xzr
        cbnz    x28,Loop_mul4x_1st_tail

        sub     x11,x27,x5              // rewound x1
        cbz     x10,Lmul4x_proceed

        ldp     x6,x7,[x1,#8*0]
        ldp     x8,x9,[x1,#8*2]
        add     x1,x1,#8*4
        ldp     x14,x15,[x3,#8*0]
        ldp     x16,x17,[x3,#8*2]
        add     x3,x3,#8*4
        b       Loop_mul4x_1st_tail

.align  5
Lmul4x_proceed:
        ldr     x24,[x2,#8*4]!          // *++b
        adc     x30,x0,xzr
        ldp     x6,x7,[x11,#8*0]        // a[0..3]
        sub     x3,x3,x5                // rewind np
        ldp     x8,x9,[x11,#8*2]
        add     x1,x11,#8*4

        stp     x19,x20,[x26,#8*0]      // result!!!
        ldp     x19,x20,[sp,#8*4]       // t[0..3]
        stp     x21,x22,[x26,#8*2]      // result!!!
        ldp     x21,x22,[sp,#8*6]

        ldp     x14,x15,[x3,#8*0]       // n[0..3]
        mov     x26,sp
        ldp     x16,x17,[x3,#8*2]
        adds    x3,x3,#8*4              // clear carry bit
        mov     x0,xzr

.align  4
Loop_mul4x_reduction:
        mul     x10,x6,x24              // lo(a[0..3]*b[4])
        adc     x0,x0,xzr               // modulo-scheduled
        mul     x11,x7,x24
        add     x28,x28,#8
        mul     x12,x8,x24
        and     x28,x28,#31
        mul     x13,x9,x24
        adds    x19,x19,x10
        umulh   x10,x6,x24              // hi(a[0..3]*b[4])
        adcs    x20,x20,x11
        mul     x25,x19,x4              // t[0]*n0
        adcs    x21,x21,x12
        umulh   x11,x7,x24
        adcs    x22,x22,x13
        umulh   x12,x8,x24
        adc     x23,xzr,xzr
        umulh   x13,x9,x24
        ldr     x24,[x2,x28]            // next b[i]
        adds    x20,x20,x10
        // (*)  mul     x10,x14,x25
        str     x25,[x26],#8            // put aside t[0]*n0 for tail processing
        adcs    x21,x21,x11
        mul     x11,x15,x25             // lo(n[0..3]*t[0]*n0)
        adcs    x22,x22,x12
        mul     x12,x16,x25
        adc     x23,x23,x13             // can't overflow
        mul     x13,x17,x25
        // (*)  adds    xzr,x19,x10
        subs    xzr,x19,#1              // (*)
        umulh   x10,x14,x25             // hi(n[0..3]*t[0]*n0)
        adcs    x19,x20,x11
        umulh   x11,x15,x25
        adcs    x20,x21,x12
        umulh   x12,x16,x25
        adcs    x21,x22,x13
        umulh   x13,x17,x25
        adcs    x22,x23,x0
        adc     x0,xzr,xzr
        adds    x19,x19,x10
        adcs    x20,x20,x11
        adcs    x21,x21,x12
        adcs    x22,x22,x13
        //adc   x0,x0,xzr
        cbnz    x28,Loop_mul4x_reduction

        adc     x0,x0,xzr
        ldp     x10,x11,[x26,#8*4]      // t[4..7]
        ldp     x12,x13,[x26,#8*6]
        ldp     x6,x7,[x1,#8*0]         // a[4..7]
        ldp     x8,x9,[x1,#8*2]
        add     x1,x1,#8*4
        adds    x19,x19,x10
        adcs    x20,x20,x11
        adcs    x21,x21,x12
        adcs    x22,x22,x13
        //adc   x0,x0,xzr

        ldr     x25,[sp]                // t[0]*n0
        ldp     x14,x15,[x3,#8*0]       // n[4..7]
        ldp     x16,x17,[x3,#8*2]
        add     x3,x3,#8*4

.align  4
Loop_mul4x_tail:
        mul     x10,x6,x24              // lo(a[4..7]*b[4])
        adc     x0,x0,xzr               // modulo-scheduled
        mul     x11,x7,x24
        add     x28,x28,#8
        mul     x12,x8,x24
        and     x28,x28,#31
        mul     x13,x9,x24
        adds    x19,x19,x10
        umulh   x10,x6,x24              // hi(a[4..7]*b[4])
        adcs    x20,x20,x11
        umulh   x11,x7,x24
        adcs    x21,x21,x12
        umulh   x12,x8,x24
        adcs    x22,x22,x13
        umulh   x13,x9,x24
        adc     x23,xzr,xzr
        ldr     x24,[x2,x28]            // next b[i]
        adds    x20,x20,x10
        mul     x10,x14,x25             // lo(n[4..7]*t[0]*n0)
        adcs    x21,x21,x11
        mul     x11,x15,x25
        adcs    x22,x22,x12
        mul     x12,x16,x25
        adc     x23,x23,x13             // can't overflow
        mul     x13,x17,x25
        adds    x19,x19,x10
        umulh   x10,x14,x25             // hi(n[4..7]*t[0]*n0)
        adcs    x20,x20,x11
        umulh   x11,x15,x25
        adcs    x21,x21,x12
        umulh   x12,x16,x25
        adcs    x22,x22,x13
        umulh   x13,x17,x25
        adcs    x23,x23,x0
        ldr     x25,[sp,x28]            // next a[0]*n0
        adc     x0,xzr,xzr
        str     x19,[x26],#8            // result!!!
        adds    x19,x20,x10
        sub     x10,x27,x1              // done yet?
        adcs    x20,x21,x11
        adcs    x21,x22,x12
        adcs    x22,x23,x13
        //adc   x0,x0,xzr
        cbnz    x28,Loop_mul4x_tail

        sub     x11,x3,x5               // rewound np?
        adc     x0,x0,xzr
        cbz     x10,Loop_mul4x_break

        ldp     x10,x11,[x26,#8*4]
        ldp     x12,x13,[x26,#8*6]
        ldp     x6,x7,[x1,#8*0]
        ldp     x8,x9,[x1,#8*2]
        add     x1,x1,#8*4
        adds    x19,x19,x10
        adcs    x20,x20,x11
        adcs    x21,x21,x12
        adcs    x22,x22,x13
        //adc   x0,x0,xzr
        ldp     x14,x15,[x3,#8*0]
        ldp     x16,x17,[x3,#8*2]
        add     x3,x3,#8*4
        b       Loop_mul4x_tail

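        // Loop_mul4x_break: all four b[] words of this window have
        // been consumed. Fold in the previous topmost carry (x30),
        // advance bp, and either restart the reduction with the next
        // b[] window or proceed to the final subtraction.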
.align  4
Loop_mul4x_break:
        ldp     x12,x13,[x29,#96]       // pull rp and &b[num]
        adds    x19,x19,x30
        add     x2,x2,#8*4              // bp++
        adcs    x20,x20,xzr
        sub     x1,x1,x5                // rewind ap
        adcs    x21,x21,xzr
        stp     x19,x20,[x26,#8*0]      // result!!!
        adcs    x22,x22,xzr
        ldp     x19,x20,[sp,#8*4]       // t[0..3]
        adc     x30,x0,xzr
        stp     x21,x22,[x26,#8*2]      // result!!!
        cmp     x2,x13                  // done yet?
        ldp     x21,x22,[sp,#8*6]
        ldp     x14,x15,[x11,#8*0]      // n[0..3]
        ldp     x16,x17,[x11,#8*2]
        add     x3,x11,#8*4
        b.eq    Lmul4x_post

        ldr     x24,[x2]
        ldp     x6,x7,[x1,#8*0]         // a[0..3]
        ldp     x8,x9,[x1,#8*2]
        adds    x1,x1,#8*4              // clear carry bit
        mov     x0,xzr
        mov     x26,sp
        b       Loop_mul4x_reduction

.align  4
Lmul4x_post:
        // Final step. We see if result is larger than modulus, and
        // if it is, subtract the modulus. But comparison implies
        // subtraction. So we subtract modulus, see if it borrowed,
        // and conditionally copy original value.
        mov     x0,x12
        mov     x27,x12                 // x0 copy
        subs    x10,x19,x14
        add     x26,sp,#8*8
        sbcs    x11,x20,x15
        sub     x28,x5,#8*4

Lmul4x_sub:
        sbcs    x12,x21,x16
        ldp     x14,x15,[x3,#8*0]
        sub     x28,x28,#8*4
        ldp     x19,x20,[x26,#8*0]
        sbcs    x13,x22,x17
        ldp     x16,x17,[x3,#8*2]
        add     x3,x3,#8*4
        ldp     x21,x22,[x26,#8*2]
        add     x26,x26,#8*4
        stp     x10,x11,[x0,#8*0]
        sbcs    x10,x19,x14
        stp     x12,x13,[x0,#8*2]
        add     x0,x0,#8*4
        sbcs    x11,x20,x15
        cbnz    x28,Lmul4x_sub

        sbcs    x12,x21,x16
        mov     x26,sp
        add     x1,sp,#8*4
        ldp     x6,x7,[x27,#8*0]
        sbcs    x13,x22,x17
        stp     x10,x11,[x0,#8*0]
        ldp     x8,x9,[x27,#8*2]
        stp     x12,x13,[x0,#8*2]
        ldp     x19,x20,[x1,#8*0]
        ldp     x21,x22,[x1,#8*2]
        sbcs    xzr,x30,xzr             // did it borrow?
        ldr     x30,[x29,#8]            // pull return address

        sub     x28,x5,#8*4
Lmul4x_cond_copy:
        sub     x28,x28,#8*4
        csel    x10,x19,x6,lo
        stp     xzr,xzr,[x26,#8*0]
        csel    x11,x20,x7,lo
        ldp     x6,x7,[x27,#8*4]
        ldp     x19,x20,[x1,#8*4]
        csel    x12,x21,x8,lo
        stp     xzr,xzr,[x26,#8*2]
        add     x26,x26,#8*4
        csel    x13,x22,x9,lo
        ldp     x8,x9,[x27,#8*6]
        ldp     x21,x22,[x1,#8*6]
        add     x1,x1,#8*4
        stp     x10,x11,[x27,#8*0]
        stp     x12,x13,[x27,#8*2]
        add     x27,x27,#8*4
        cbnz    x28,Lmul4x_cond_copy

        csel    x10,x19,x6,lo
        stp     xzr,xzr,[x26,#8*0]
        csel    x11,x20,x7,lo
        stp     xzr,xzr,[x26,#8*2]
        csel    x12,x21,x8,lo
        stp     xzr,xzr,[x26,#8*3]
        csel    x13,x22,x9,lo
        stp     xzr,xzr,[x26,#8*4]
        stp     x10,x11,[x27,#8*0]
        stp     x12,x13,[x27,#8*2]

        b       Lmul4x_done

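        // Lmul4x4_post_condition handles num==4: the entire result is
        // still live in x19-x22 plus the carry in x0, so the final
        // subtract-and-select runs directly on registers.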
.align  4
Lmul4x4_post_condition:
        adc     x0,x0,xzr
        ldr     x1,[x29,#96]            // pull rp
        // x19-x22,x0 hold result, x14-x17 hold modulus
        subs    x6,x19,x14
        ldr     x30,[x29,#8]            // pull return address
        sbcs    x7,x20,x15
        stp     xzr,xzr,[sp,#8*0]
        sbcs    x8,x21,x16
        stp     xzr,xzr,[sp,#8*2]
        sbcs    x9,x22,x17
        stp     xzr,xzr,[sp,#8*4]
        sbcs    xzr,x0,xzr              // did it borrow?
        stp     xzr,xzr,[sp,#8*6]

        // x6-x9 hold result-modulus
        csel    x6,x19,x6,lo
        csel    x7,x20,x7,lo
        csel    x8,x21,x8,lo
        csel    x9,x22,x9,lo
        stp     x6,x7,[x1,#8*0]
        stp     x8,x9,[x1,#8*2]

Lmul4x_done:
        ldp     x19,x20,[x29,#16]
        mov     sp,x29
        ldp     x21,x22,[x29,#32]
        mov     x0,#1
        ldp     x23,x24,[x29,#48]
        ldp     x25,x26,[x29,#64]
        ldp     x27,x28,[x29,#80]
        ldr     x29,[sp],#128
        // x30 is popped earlier
        AARCH64_VALIDATE_LINK_REGISTER
        ret

// "Montgomery Multiplication for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
.byte   77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align  2
.align  4
#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32)