// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build !purego

// This file contains constant-time, 64-bit assembly implementation of
// P256. The optimizations performed here are described in detail in:
// S.Gueron and V.Krasnov, "Fast prime field elliptic-curve cryptography with
//                          256-bit primes"
// http://link.springer.com/article/10.1007%2Fs13389-014-0090-x
// https://eprint.iacr.org/2013/816.pdf

#include "textflag.h"

// Register roles used throughout this file.
//
//   res_ptr, a_ptr, b_ptr  pointer arguments (b_ptr doubles as the loop
//                          counter n in p256Sqr / p256OrdSqr)
//   acc0..acc7             512-bit product accumulator, least significant
//                          limb first
//   t0..t3                 scratch
//   const0, const1         limbs 1 and 3 of the field prime in the field
//                          routines; limbs 0 and 1 of p256ord in the
//                          p256Ord* routines
//   hlp0, hlp1             helpers
//   x0..x3, y0..y3         the two 256-bit operands of the *Internal routines
//
// Aliasing to keep in mind:
//   hlp1 is res_ptr (R0). Routines that use hlp1 (p256OrdSqr, p256OrdMul,
//   p256PointAddAffineAsm, p256PointAddAsm) therefore cannot keep the result
//   pointer in res_ptr and reload it from the frame when they need it.
//   const2/const3 are t2/t3, so t2/t3 are not free scratch in the p256Ord*
//   routines until the final conditional subtraction.
#define res_ptr R0
#define a_ptr R1
#define b_ptr R2

#define acc0 R3
#define acc1 R4
#define acc2 R5
#define acc3 R6

#define acc4 R7
#define acc5 R8
#define acc6 R9
#define acc7 R10
#define t0 R11
#define t1 R12
#define t2 R13
#define t3 R14
#define const0 R15
#define const1 R16

#define hlp0 R17
#define hlp1 res_ptr

#define x0 R19
#define x1 R20
#define x2 R21
#define x3 R22
#define y0 R23
#define y1 R24
#define y2 R25
#define y3 R26

#define const2 t2
#define const3 t3

// p256const0 / p256const1 are limbs 1 and 3 of the field prime ("poly" in
// p256NegCond). Limbs 0 and 2 are -1 and 0 and are always written as
// immediates, so the full prime, least significant limb first, is
// {-1, p256const0, 0, p256const1}.
DATA p256const0<>+0x00(SB)/8, $0x00000000ffffffff
DATA p256const1<>+0x00(SB)/8, $0xffffffff00000001
// p256ordK0 is the per-limb multiplier used by the reduction steps of
// p256OrdSqr / p256OrdMul.
// NOTE(review): presumably -p256ord^-1 mod 2^64 - confirm.
DATA p256ordK0<>+0x00(SB)/8, $0xccd1c8aaee00bc4f
// p256ord is the 256-bit modulus used by p256OrdSqr / p256OrdMul, least
// significant limb first.
DATA p256ord<>+0x00(SB)/8, $0xf3b9cac2fc632551
DATA p256ord<>+0x08(SB)/8, $0xbce6faada7179e84
DATA p256ord<>+0x10(SB)/8, $0xffffffffffffffff
DATA p256ord<>+0x18(SB)/8, $0xffffffff00000000
// p256one is the value stored as z3 by p256PointAddAffineAsm when select
// bit 1 is clear ("z3 = 1").
// NOTE(review): presumably 1 in the Montgomery domain - confirm.
DATA p256one<>+0x00(SB)/8, $0x0000000000000001
DATA p256one<>+0x08(SB)/8, $0xffffffff00000000
DATA p256one<>+0x10(SB)/8, $0xffffffffffffffff
DATA p256one<>+0x18(SB)/8, $0x00000000fffffffe
GLOBL p256const0<>(SB), 8, $8
GLOBL p256const1<>(SB), 8, $8
GLOBL p256ordK0<>(SB), 8, $8
GLOBL p256ord<>(SB), 8, $32
GLOBL p256one<>(SB), 8, $32

/* ---------------------------------------*/
// The three entry points below have the same frame layout as
// p256BigToLittle (res at +0, in at +8) and tail-jump to it. A full 32-byte
// reversal is its own inverse, so one routine serves both directions.

// func p256OrdLittleToBig(res *[32]byte, in *p256OrdElement)
TEXT ·p256OrdLittleToBig(SB),NOSPLIT,$0
	JMP	·p256BigToLittle(SB)
/* ---------------------------------------*/
// func p256OrdBigToLittle(res *p256OrdElement, in *[32]byte)
TEXT ·p256OrdBigToLittle(SB),NOSPLIT,$0
	JMP	·p256BigToLittle(SB)
/* ---------------------------------------*/
// func p256LittleToBig(res *[32]byte, in *p256Element)
TEXT ·p256LittleToBig(SB),NOSPLIT,$0
	JMP	·p256BigToLittle(SB)
/* ---------------------------------------*/
// func p256BigToLittle(res *p256Element, in *[32]byte)
// Reverses the 32 bytes at in into res: each 64-bit word is byte-swapped
// (REV) and the four words are stored in reverse order.
TEXT ·p256BigToLittle(SB),NOSPLIT,$0
	MOVD	res+0(FP), res_ptr
	MOVD	in+8(FP), a_ptr

	LDP	0*16(a_ptr), (acc0, acc1)
	LDP	1*16(a_ptr), (acc2, acc3)

	REV	acc0, acc0
	REV	acc1, acc1
	REV	acc2, acc2
	REV	acc3, acc3

	// Word order is swapped as well: in[3] -> res[0], ..., in[0] -> res[3].
	STP	(acc3, acc2), 0*16(res_ptr)
	STP	(acc1, acc0), 1*16(res_ptr)
	RET
/* ---------------------------------------*/
// func p256MovCond(res, a, b *P256Point, cond int)
// If cond == 0 res=b, else res=a
// Copies 96 bytes (six 16-byte pairs) in two halves of 48 bytes. Both a and
// b are always read in full; the choice is made per limb with CSEL.
TEXT ·p256MovCond(SB),NOSPLIT,$0
	MOVD	res+0(FP), res_ptr
	MOVD	a+8(FP), a_ptr
	MOVD	b+16(FP), b_ptr
	MOVD	cond+24(FP), R3

	// This is the only flag-setting instruction in the routine; every CSEL
	// below consumes its result.
	CMP	$0, R3
	// Two remarks:
	// 1) Will want to revisit NEON, when support is better
	// 2) CSEL might not be constant time on all ARM processors
	LDP	0*16(a_ptr), (R4, R5)
	LDP	1*16(a_ptr), (R6, R7)
	LDP	2*16(a_ptr), (R8, R9)
	LDP	0*16(b_ptr), (R16, R17)
	LDP	1*16(b_ptr), (R19, R20)
	LDP	2*16(b_ptr), (R21, R22)
	// EQ (cond == 0) picks the limb of b, otherwise the limb of a is kept.
	CSEL	EQ, R16, R4, R4
	CSEL	EQ, R17, R5, R5
	CSEL	EQ, R19, R6, R6
	CSEL	EQ, R20, R7, R7
	CSEL	EQ, R21, R8, R8
	CSEL	EQ, R22, R9, R9
	STP	(R4, R5), 0*16(res_ptr)
	STP	(R6, R7), 1*16(res_ptr)
	STP	(R8, R9), 2*16(res_ptr)

	// Second half: bytes 48..95.
	LDP	3*16(a_ptr), (R4, R5)
	LDP	4*16(a_ptr), (R6, R7)
	LDP	5*16(a_ptr), (R8, R9)
	LDP	3*16(b_ptr), (R16, R17)
	LDP	4*16(b_ptr), (R19, R20)
	LDP	5*16(b_ptr), (R21, R22)
	CSEL	EQ, R16, R4, R4
	CSEL	EQ, R17, R5, R5
	CSEL	EQ, R19, R6, R6
	CSEL	EQ, R20, R7, R7
	CSEL	EQ, R21, R8, R8
	CSEL	EQ, R22, R9, R9
	STP	(R4, R5), 3*16(res_ptr)
	STP	(R6, R7), 4*16(res_ptr)
	STP	(R8, R9), 5*16(res_ptr)

	RET
/* ---------------------------------------*/
// func p256NegCond(val *p256Element, cond int)
// In place: if cond != 0, val = poly - val; if cond == 0, val is rewritten
// unchanged. The subtraction is always performed and the result is chosen
// with CSEL afterwards.
TEXT ·p256NegCond(SB),NOSPLIT,$0
	MOVD	val+0(FP), a_ptr
	MOVD	cond+8(FP), hlp0
	MOVD	a_ptr, res_ptr
	// acc = poly
	MOVD	$-1, acc0
	MOVD	p256const0<>(SB), acc1
	MOVD	$0, acc2
	MOVD	p256const1<>(SB), acc3
	// Load the original value
	LDP	0*16(a_ptr), (t0, t1)
	LDP	1*16(a_ptr), (t2, t3)
	// Speculatively subtract
	SUBS	t0, acc0
	SBCS	t1, acc1
	SBCS	t2, acc2
	SBC	t3, acc3
	// If condition is 0, keep original value
	CMP	$0, hlp0
	CSEL	EQ, t0, acc0, acc0
	CSEL	EQ, t1, acc1, acc1
	CSEL	EQ, t2, acc2, acc2
	CSEL	EQ, t3, acc3, acc3
	// Store result
	STP	(acc0, acc1), 0*16(res_ptr)
	STP	(acc2, acc3), 1*16(res_ptr)

	RET
/* ---------------------------------------*/
// func p256Sqr(res, in *p256Element, n int)
// Squares in n times in a row with p256SqrInternal and stores the final
// value to res.
// The counter is decremented before it is tested, so the loop body always
// runs at least once; n == 0 would wrap around instead of returning in
// unchanged.
TEXT ·p256Sqr(SB),NOSPLIT,$0
	MOVD	res+0(FP), res_ptr
	MOVD	in+8(FP), a_ptr
	MOVD	n+16(FP), b_ptr

	MOVD	p256const0<>(SB), const0
	MOVD	p256const1<>(SB), const1

	LDP	0*16(a_ptr), (x0, x1)
	LDP	1*16(a_ptr), (x2, x3)

sqrLoop:
	SUB	$1, b_ptr
	CALL	p256SqrInternal<>(SB)
	// The result comes back in y; feed it back in as the next input.
	MOVD	y0, x0
	MOVD	y1, x1
	MOVD	y2, x2
	MOVD	y3, x3
	CBNZ	b_ptr, sqrLoop

	STP	(y0, y1), 0*16(res_ptr)
	STP	(y2, y3), 1*16(res_ptr)
	RET
/* ---------------------------------------*/
// func p256Mul(res, in1, in2 *p256Element)
// res = p256MulInternal(in1, in2): loads both operands, calls the internal
// multiply-and-reduce routine and stores its y output.
TEXT ·p256Mul(SB),NOSPLIT,$0
	MOVD	res+0(FP), res_ptr
	MOVD	in1+8(FP), a_ptr
	MOVD	in2+16(FP), b_ptr

	MOVD	p256const0<>(SB), const0
	MOVD	p256const1<>(SB), const1

	LDP	0*16(a_ptr), (x0, x1)
	LDP	1*16(a_ptr), (x2, x3)

	LDP	0*16(b_ptr), (y0, y1)
	LDP	1*16(b_ptr), (y2, y3)

	CALL	p256MulInternal<>(SB)

	STP	(y0, y1), 0*16(res_ptr)
	STP	(y2, y3), 1*16(res_ptr)
	RET
/* ---------------------------------------*/
// func p256FromMont(res, in *p256Element)
// Runs the same four reduction steps as p256MulInternal on in, without any
// multiplication by a second operand, then subtracts the prime once if that
// does not borrow.
TEXT ·p256FromMont(SB),NOSPLIT,$0
	MOVD	res+0(FP), res_ptr
	MOVD	in+8(FP), a_ptr

	MOVD	p256const0<>(SB), const0
	MOVD	p256const1<>(SB), const1

	LDP	0*16(a_ptr), (acc0, acc1)
	LDP	1*16(a_ptr), (acc2, acc3)
	// Only reduce, no multiplications are needed
	// Each step retires the current lowest limb: it adds limb<<32 at the
	// next position, limb>>32 one position above that, and limb*const1
	// above those; the retired register is reused for the new top limb.
	// First reduction step
	ADDS	acc0<<32, acc1, acc1
	LSR	$32, acc0, t0
	MUL	acc0, const1, t1
	UMULH	acc0, const1, acc0
	ADCS	t0, acc2
	ADCS	t1, acc3
	ADC	$0, acc0
	// Second reduction step
	ADDS	acc1<<32, acc2, acc2
	LSR	$32, acc1, t0
	MUL	acc1, const1, t1
	UMULH	acc1, const1, acc1
	ADCS	t0, acc3
	ADCS	t1, acc0
	ADC	$0, acc1
	// Third reduction step
	ADDS	acc2<<32, acc3, acc3
	LSR	$32, acc2, t0
	MUL	acc2, const1, t1
	UMULH	acc2, const1, acc2
	ADCS	t0, acc0
	ADCS	t1, acc1
	ADC	$0, acc2
	// Last reduction step
	ADDS	acc3<<32, acc0, acc0
	LSR	$32, acc3, t0
	MUL	acc3, const1, t1
	UMULH	acc3, const1, acc3
	ADCS	t0, acc1
	ADCS	t1, acc2
	ADC	$0, acc3

	// t = acc - p, with p = {-1, const0, 0, const1}.
	SUBS	$-1, acc0, t0
	SBCS	const0, acc1, t1
	SBCS	$0, acc2, t2
	SBCS	const1, acc3, t3

	// Carry set means the subtraction did not borrow: keep t, else keep acc.
	CSEL	CS, t0, acc0, acc0
	CSEL	CS, t1, acc1, acc1
	CSEL	CS, t2, acc2, acc2
	CSEL	CS, t3, acc3, acc3

	STP	(acc0, acc1), 0*16(res_ptr)
	STP	(acc2, acc3), 1*16(res_ptr)

	RET
/* ---------------------------------------*/
// func p256Select(res *P256Point, table *p256Table, idx int)
// Copies the idx-th 96-byte entry of table into res, using 1-based
// indexing over 16 entries. Every entry is loaded on every call and the
// match is kept with CSEL, so the sequence of memory accesses does not
// depend on idx. The outputs start at zero and the counter is compared
// after being incremented, so idx == 0 (or any idx outside 1..16) leaves
// res all zero.
TEXT ·p256Select(SB),NOSPLIT,$0
	MOVD	idx+16(FP), const0
	MOVD	table+8(FP), b_ptr
	MOVD	res+0(FP), res_ptr

	EOR	x0, x0, x0
	EOR	x1, x1, x1
	EOR	x2, x2, x2
	EOR	x3, x3, x3
	EOR	y0, y0, y0
	EOR	y1, y1, y1
	EOR	y2, y2, y2
	EOR	y3, y3, y3
	EOR	t0, t0, t0
	EOR	t1, t1, t1
	EOR	t2, t2, t2
	EOR	t3, t3, t3

	MOVD	$0, const1

loop_select:
		ADD	$1, const1
		// Flags from this compare are consumed by all twelve CSELs below.
		CMP	const0, const1
		LDP.P	16(b_ptr), (acc0, acc1)
		CSEL	EQ, acc0, x0, x0
		CSEL	EQ, acc1, x1, x1
		LDP.P	16(b_ptr), (acc2, acc3)
		CSEL	EQ, acc2, x2, x2
		CSEL	EQ, acc3, x3, x3
		LDP.P	16(b_ptr), (acc4, acc5)
		CSEL	EQ, acc4, y0, y0
		CSEL	EQ, acc5, y1, y1
		LDP.P	16(b_ptr), (acc6, acc7)
		CSEL	EQ, acc6, y2, y2
		CSEL	EQ, acc7, y3, y3
		LDP.P	16(b_ptr), (acc0, acc1)
		CSEL	EQ, acc0, t0, t0
		CSEL	EQ, acc1, t1, t1
		LDP.P	16(b_ptr), (acc2, acc3)
		CSEL	EQ, acc2, t2, t2
		CSEL	EQ, acc3, t3, t3

		CMP	$16, const1
		BNE	loop_select

	STP	(x0, x1), 0*16(res_ptr)
	STP	(x2, x3), 1*16(res_ptr)
	STP	(y0, y1), 2*16(res_ptr)
	STP	(y2, y3), 3*16(res_ptr)
	STP	(t0, t1), 4*16(res_ptr)
	STP	(t2, t3), 5*16(res_ptr)
	RET
/* ---------------------------------------*/
// func p256SelectAffine(res *p256AffinePoint, table *p256AffineTable, idx int)
// Same scheme as p256Select, for 32 entries of 64 bytes each: 1-based idx,
// every entry is read, and an idx outside 1..32 leaves res all zero.
// t0 holds idx, t1 walks the table and t2 is the entry counter.
TEXT ·p256SelectAffine(SB),NOSPLIT,$0
	MOVD	idx+16(FP), t0
	MOVD	table+8(FP), t1
	MOVD	res+0(FP), res_ptr

	EOR	x0, x0, x0
	EOR	x1, x1, x1
	EOR	x2, x2, x2
	EOR	x3, x3, x3
	EOR	y0, y0, y0
	EOR	y1, y1, y1
	EOR	y2, y2, y2
	EOR	y3, y3, y3

	MOVD	$0, t2

loop_select:
		ADD	$1, t2
		CMP	t0, t2
		LDP.P	16(t1), (acc0, acc1)
		CSEL	EQ, acc0, x0, x0
		CSEL	EQ, acc1, x1, x1
		LDP.P	16(t1), (acc2, acc3)
		CSEL	EQ, acc2, x2, x2
		CSEL	EQ, acc3, x3, x3
		LDP.P	16(t1), (acc4, acc5)
		CSEL	EQ, acc4, y0, y0
		CSEL	EQ, acc5, y1, y1
		LDP.P	16(t1), (acc6, acc7)
		CSEL	EQ, acc6, y2, y2
		CSEL	EQ, acc7, y3, y3

		CMP	$32, t2
		BNE	loop_select

	STP	(x0, x1), 0*16(res_ptr)
	STP	(x2, x3), 1*16(res_ptr)
	STP	(y0, y1), 2*16(res_ptr)
	STP	(y2, y3), 3*16(res_ptr)
	RET
/* ---------------------------------------*/
// func p256OrdSqr(res, in *p256OrdElement, n int)
// Squares in n times in a row modulo p256ord, with four reduction steps
// per squaring and one conditional subtraction of p256ord at the end of
// each iteration. The running value lives in x0..x3.
//
// hlp1 (= res_ptr, R0) holds p256ordK0 for the whole loop, so the result
// pointer is only loaded from the frame after the loop.
// As in p256Sqr, the counter is decremented before it is tested, so n == 0
// would wrap around.
TEXT ·p256OrdSqr(SB),NOSPLIT,$0
	MOVD	in+8(FP), a_ptr
	MOVD	n+16(FP), b_ptr

	MOVD	p256ordK0<>(SB), hlp1
	LDP	p256ord<>+0x00(SB), (const0, const1)
	LDP	p256ord<>+0x10(SB), (const2, const3)

	LDP	0*16(a_ptr), (x0, x1)
	LDP	1*16(a_ptr), (x2, x3)

ordSqrLoop:
	SUB	$1, b_ptr

	// Off-diagonal products first; they are doubled below.
	// x[1:] * x[0]
	MUL	x0, x1, acc1
	UMULH	x0, x1, acc2

	MUL	x0, x2, t0
	ADDS	t0, acc2, acc2
	UMULH	x0, x2, acc3

	MUL	x0, x3, t0
	ADCS	t0, acc3, acc3
	UMULH	x0, x3, acc4
	ADC	$0, acc4, acc4
	// x[2:] * x[1]
	MUL	x1, x2, t0
	ADDS	t0, acc3
	UMULH	x1, x2, t1
	ADCS	t1, acc4
	ADC	$0, ZR, acc5

	MUL	x1, x3, t0
	ADDS	t0, acc4
	UMULH	x1, x3, t1
	ADC	t1, acc5
	// x[3] * x[2]
	MUL	x2, x3, t0
	ADDS	t0, acc5
	UMULH	x2, x3, acc6
	ADC	$0, acc6

	MOVD	$0, acc7
	// *2
	ADDS	acc1, acc1
	ADCS	acc2, acc2
	ADCS	acc3, acc3
	ADCS	acc4, acc4
	ADCS	acc5, acc5
	ADCS	acc6, acc6
	ADC	$0, acc7
	// Missing products
	// (the diagonal terms x[i]*x[i])
	MUL	x0, x0, acc0
	UMULH	x0, x0, t0
	ADDS	t0, acc1, acc1

	MUL	x1, x1, t0
	ADCS	t0, acc2, acc2
	UMULH	x1, x1, t1
	ADCS	t1, acc3, acc3

	MUL	x2, x2, t0
	ADCS	t0, acc4, acc4
	UMULH	x2, x2, t1
	ADCS	t1, acc5, acc5

	MUL	x3, x3, t0
	ADCS	t0, acc6, acc6
	UMULH	x3, x3, t1
	ADC	t1, acc7, acc7
	// First reduction step
	// hlp0 = lowest limb * K0 is the factor for this step; hlp0 * p256ord
	// is then added so that the lowest limb is retired.
	MUL	acc0, hlp1, hlp0

	// NOTE(review): t0 is const0*hlp1, a loop-invariant value, not
	// const0*hlp0. Only the carry out of the ADDS is consumed; acc0 itself
	// is overwritten by the UMULH a few lines down. This presumably relies
	// on const0*K0 == -1 mod 2^64, which gives the same carry as adding
	// the low half of const0*hlp0 - confirm. The same pattern is used in
	// every reduction step of p256OrdSqr and p256OrdMul.
	MUL	const0, hlp1, t0
	ADDS	t0, acc0, acc0
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc1, acc1
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc2, acc2
	UMULH	const2, hlp0, acc0

	MUL	const3, hlp0, t0
	ADCS	t0, acc3, acc3

	UMULH	const3, hlp0, hlp0
	ADC	$0, hlp0

	// Second pass adds the high halves one limb up; the retired register
	// (acc0 here) receives the new top limb.
	ADDS	t1, acc1, acc1
	ADCS	y0, acc2, acc2
	ADCS	acc0, acc3, acc3
	ADC	$0, hlp0, acc0
	// Second reduction step
	MUL	acc1, hlp1, hlp0

	MUL	const0, hlp1, t0
	ADDS	t0, acc1, acc1
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc2, acc2
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc3, acc3
	UMULH	const2, hlp0, acc1

	MUL	const3, hlp0, t0
	ADCS	t0, acc0, acc0

	UMULH	const3, hlp0, hlp0
	ADC	$0, hlp0

	ADDS	t1, acc2, acc2
	ADCS	y0, acc3, acc3
	ADCS	acc1, acc0, acc0
	ADC	$0, hlp0, acc1
	// Third reduction step
	MUL	acc2, hlp1, hlp0

	MUL	const0, hlp1, t0
	ADDS	t0, acc2, acc2
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc3, acc3
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc0, acc0
	UMULH	const2, hlp0, acc2

	MUL	const3, hlp0, t0
	ADCS	t0, acc1, acc1

	UMULH	const3, hlp0, hlp0
	ADC	$0, hlp0

	ADDS	t1, acc3, acc3
	ADCS	y0, acc0, acc0
	ADCS	acc2, acc1, acc1
	ADC	$0, hlp0, acc2

	// Last reduction step
	MUL	acc3, hlp1, hlp0

	MUL	const0, hlp1, t0
	ADDS	t0, acc3, acc3
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc0, acc0
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc1, acc1
	UMULH	const2, hlp0, acc3

	MUL	const3, hlp0, t0
	ADCS	t0, acc2, acc2

	UMULH	const3, hlp0, hlp0
	// Unlike the first three steps, the carry is folded into acc7 rather
	// than hlp0. acc7 is added to the limb derived from hlp0 (acc3) a few
	// lines below, so the carry lands in the same position.
	ADC	$0, acc7

	ADDS	t1, acc0, acc0
	ADCS	y0, acc1, acc1
	ADCS	acc3, acc2, acc2
	ADC	$0, hlp0, acc3

	// Add the upper half of the square to the reduced lower half.
	ADDS	acc4, acc0, acc0
	ADCS	acc5, acc1, acc1
	ADCS	acc6, acc2, acc2
	ADCS	acc7, acc3, acc3
	ADC	$0, ZR, acc4

	// y = acc - p256ord, including the carry limb acc4.
	SUBS	const0, acc0, y0
	SBCS	const1, acc1, y1
	SBCS	const2, acc2, y2
	SBCS	const3, acc3, y3
	SBCS	$0, acc4, acc4

	// No borrow: take the subtracted value. The result goes to x, ready
	// for the next iteration.
	CSEL	CS, y0, acc0, x0
	CSEL	CS, y1, acc1, x1
	CSEL	CS, y2, acc2, x2
	CSEL	CS, y3, acc3, x3

	CBNZ	b_ptr, ordSqrLoop

	// hlp1/res_ptr held K0 until now; fetch the real result pointer.
	MOVD	res+0(FP), res_ptr
	STP	(x0, x1), 0*16(res_ptr)
	STP	(x2, x3), 1*16(res_ptr)

	RET
/* ---------------------------------------*/
// func p256OrdMul(res, in1, in2 *p256OrdElement)
// res = in1 * in2 modulo p256ord, with one reduction step interleaved
// after each row y[i] * x and a final conditional subtraction of p256ord.
//
// hlp1 (= res_ptr, R0) holds p256ordK0, so the result pointer is loaded
// from the frame only at the end. y0 and y1 are reused as scratch once the
// rows that need them have been consumed. See the NOTE in p256OrdSqr on
// the MUL const0, hlp1 instruction that opens every reduction step.
TEXT ·p256OrdMul(SB),NOSPLIT,$0
	MOVD	in1+8(FP), a_ptr
	MOVD	in2+16(FP), b_ptr

	MOVD	p256ordK0<>(SB), hlp1
	LDP	p256ord<>+0x00(SB), (const0, const1)
	LDP	p256ord<>+0x10(SB), (const2, const3)

	LDP	0*16(a_ptr), (x0, x1)
	LDP	1*16(a_ptr), (x2, x3)
	LDP	0*16(b_ptr), (y0, y1)
	LDP	1*16(b_ptr), (y2, y3)

	// y[0] * x
	MUL	y0, x0, acc0
	UMULH	y0, x0, acc1

	MUL	y0, x1, t0
	ADDS	t0, acc1
	UMULH	y0, x1, acc2

	MUL	y0, x2, t0
	ADCS	t0, acc2
	UMULH	y0, x2, acc3

	MUL	y0, x3, t0
	ADCS	t0, acc3
	UMULH	y0, x3, acc4
	ADC	$0, acc4
	// First reduction step
	MUL	acc0, hlp1, hlp0

	MUL	const0, hlp1, t0
	ADDS	t0, acc0, acc0
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc1, acc1
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc2, acc2
	UMULH	const2, hlp0, acc0

	MUL	const3, hlp0, t0
	ADCS	t0, acc3, acc3

	UMULH	const3, hlp0, hlp0
	ADC	$0, acc4

	ADDS	t1, acc1, acc1
	ADCS	y0, acc2, acc2
	ADCS	acc0, acc3, acc3
	ADC	$0, hlp0, acc0
	// y[1] * x
	MUL	y1, x0, t0
	ADDS	t0, acc1
	UMULH	y1, x0, t1

	MUL	y1, x1, t0
	ADCS	t0, acc2
	UMULH	y1, x1, hlp0

	MUL	y1, x2, t0
	ADCS	t0, acc3
	UMULH	y1, x2, y0

	MUL	y1, x3, t0
	ADCS	t0, acc4
	UMULH	y1, x3, y1
	ADC	$0, ZR, acc5

	ADDS	t1, acc2
	ADCS	hlp0, acc3
	ADCS	y0, acc4
	ADC	y1, acc5
	// Second reduction step
	MUL	acc1, hlp1, hlp0

	MUL	const0, hlp1, t0
	ADDS	t0, acc1, acc1
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc2, acc2
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc3, acc3
	UMULH	const2, hlp0, acc1

	MUL	const3, hlp0, t0
	ADCS	t0, acc0, acc0

	UMULH	const3, hlp0, hlp0
	ADC	$0, acc5

	ADDS	t1, acc2, acc2
	ADCS	y0, acc3, acc3
	ADCS	acc1, acc0, acc0
	ADC	$0, hlp0, acc1
	// y[2] * x
	MUL	y2, x0, t0
	ADDS	t0, acc2
	UMULH	y2, x0, t1

	MUL	y2, x1, t0
	ADCS	t0, acc3
	UMULH	y2, x1, hlp0

	MUL	y2, x2, t0
	ADCS	t0, acc4
	UMULH	y2, x2, y0

	MUL	y2, x3, t0
	ADCS	t0, acc5
	UMULH	y2, x3, y1
	ADC	$0, ZR, acc6

	ADDS	t1, acc3
	ADCS	hlp0, acc4
	ADCS	y0, acc5
	ADC	y1, acc6
	// Third reduction step
	MUL	acc2, hlp1, hlp0

	MUL	const0, hlp1, t0
	ADDS	t0, acc2, acc2
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc3, acc3
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc0, acc0
	UMULH	const2, hlp0, acc2

	MUL	const3, hlp0, t0
	ADCS	t0, acc1, acc1

	UMULH	const3, hlp0, hlp0
	ADC	$0, acc6

	ADDS	t1, acc3, acc3
	ADCS	y0, acc0, acc0
	ADCS	acc2, acc1, acc1
	ADC	$0, hlp0, acc2
	// y[3] * x
	MUL	y3, x0, t0
	ADDS	t0, acc3
	UMULH	y3, x0, t1

	MUL	y3, x1, t0
	ADCS	t0, acc4
	UMULH	y3, x1, hlp0

	MUL	y3, x2, t0
	ADCS	t0, acc5
	UMULH	y3, x2, y0

	MUL	y3, x3, t0
	ADCS	t0, acc6
	UMULH	y3, x3, y1
	ADC	$0, ZR, acc7

	ADDS	t1, acc4
	ADCS	hlp0, acc5
	ADCS	y0, acc6
	ADC	y1, acc7
	// Last reduction step
	MUL	acc3, hlp1, hlp0

	MUL	const0, hlp1, t0
	ADDS	t0, acc3, acc3
	UMULH	const0, hlp0, t1

	MUL	const1, hlp0, t0
	ADCS	t0, acc0, acc0
	UMULH	const1, hlp0, y0

	MUL	const2, hlp0, t0
	ADCS	t0, acc1, acc1
	UMULH	const2, hlp0, acc3

	MUL	const3, hlp0, t0
	ADCS	t0, acc2, acc2

	UMULH	const3, hlp0, hlp0
	ADC	$0, acc7

	ADDS	t1, acc0, acc0
	ADCS	y0, acc1, acc1
	ADCS	acc3, acc2, acc2
	ADC	$0, hlp0, acc3

	// Add the upper half of the product to the reduced lower half.
	ADDS	acc4, acc0, acc0
	ADCS	acc5, acc1, acc1
	ADCS	acc6, acc2, acc2
	ADCS	acc7, acc3, acc3
	ADC	$0, ZR, acc4

	// t = acc - p256ord. const2/const3 alias t2/t3; each is read as the
	// subtrahend by the same instruction that overwrites it.
	SUBS	const0, acc0, t0
	SBCS	const1, acc1, t1
	SBCS	const2, acc2, t2
	SBCS	const3, acc3, t3
	SBCS	$0, acc4, acc4

	CSEL	CS, t0, acc0, acc0
	CSEL	CS, t1, acc1, acc1
	CSEL	CS, t2, acc2, acc2
	CSEL	CS, t3, acc3, acc3

	// hlp1/res_ptr held K0 until now; fetch the real result pointer.
	MOVD	res+0(FP), res_ptr
	STP	(acc0, acc1), 0*16(res_ptr)
	STP	(acc2, acc3), 1*16(res_ptr)

	RET
/* ---------------------------------------*/
// p256SubInternal: x = y - x modulo the field prime.
//   In:       x0..x3, y0..y3, const0, const1
//   Out:      x0..x3
//   Written:  acc0..acc7, t0, flags. y0..y3 are not modified.
// Both y - x and y - x + p are computed; the borrow of the subtraction
// selects which one is returned.
TEXT p256SubInternal<>(SB),NOSPLIT,$0
	SUBS	x0, y0, acc0
	SBCS	x1, y1, acc1
	SBCS	x2, y2, acc2
	SBCS	x3, y3, acc3
	// t0 = 0 if the subtraction did not borrow, all ones if it did.
	SBC	$0, ZR, t0

	ADDS	$-1, acc0, acc4
	ADCS	const0, acc1, acc5
	ADCS	$0, acc2, acc6
	ADC	const1, acc3, acc7

	// Bit 0 of t0 clear (no borrow): keep y - x. Otherwise take y - x + p.
	ANDS	$1, t0
	CSEL	EQ, acc0, acc4, x0
	CSEL	EQ, acc1, acc5, x1
	CSEL	EQ, acc2, acc6, x2
	CSEL	EQ, acc3, acc7, x3

	RET
/* ---------------------------------------*/
// p256SqrInternal: y = x * x followed by four reduction steps and one
// conditional subtraction of the field prime.
//   In:       x0..x3, const0, const1
//   Out:      y0..y3
//   Written:  acc0..acc7, t0..t3, flags. x0..x3 are not modified.
TEXT p256SqrInternal<>(SB),NOSPLIT,$0
	// x[1:] * x[0]
	MUL	x0, x1, acc1
	UMULH	x0, x1, acc2

	MUL	x0, x2, t0
	ADDS	t0, acc2, acc2
	UMULH	x0, x2, acc3

	MUL	x0, x3, t0
	ADCS	t0, acc3, acc3
	UMULH	x0, x3, acc4
	ADC	$0, acc4, acc4
	// x[2:] * x[1]
	MUL	x1, x2, t0
	ADDS	t0, acc3
	UMULH	x1, x2, t1
	ADCS	t1, acc4
	ADC	$0, ZR, acc5

	MUL	x1, x3, t0
	ADDS	t0, acc4
	UMULH	x1, x3, t1
	ADC	t1, acc5
	// x[3] * x[2]
	MUL	x2, x3, t0
	ADDS	t0, acc5
	UMULH	x2, x3, acc6
	ADC	$0, acc6

	MOVD	$0, acc7
	// *2
	ADDS	acc1, acc1
	ADCS	acc2, acc2
	ADCS	acc3, acc3
	ADCS	acc4, acc4
	ADCS	acc5, acc5
	ADCS	acc6, acc6
	ADC	$0, acc7
	// Missing products
	// (the diagonal terms x[i]*x[i])
	MUL	x0, x0, acc0
	UMULH	x0, x0, t0
	ADDS	t0, acc1, acc1

	MUL	x1, x1, t0
	ADCS	t0, acc2, acc2
	UMULH	x1, x1, t1
	ADCS	t1, acc3, acc3

	MUL	x2, x2, t0
	ADCS	t0, acc4, acc4
	UMULH	x2, x2, t1
	ADCS	t1, acc5, acc5

	MUL	x3, x3, t0
	ADCS	t0, acc6, acc6
	UMULH	x3, x3, t1
	ADCS	t1, acc7, acc7
	// First reduction step
	ADDS	acc0<<32, acc1, acc1
	LSR	$32, acc0, t0
	MUL	acc0, const1, t1
	UMULH	acc0, const1, acc0
	ADCS	t0, acc2, acc2
	ADCS	t1, acc3, acc3
	ADC	$0, acc0, acc0
	// Second reduction step
	ADDS	acc1<<32, acc2, acc2
	LSR	$32, acc1, t0
	MUL	acc1, const1, t1
	UMULH	acc1, const1, acc1
	ADCS	t0, acc3, acc3
	ADCS	t1, acc0, acc0
	ADC	$0, acc1, acc1
	// Third reduction step
	ADDS	acc2<<32, acc3, acc3
	LSR	$32, acc2, t0
	MUL	acc2, const1, t1
	UMULH	acc2, const1, acc2
	ADCS	t0, acc0, acc0
	ADCS	t1, acc1, acc1
	ADC	$0, acc2, acc2
	// Last reduction step
	ADDS	acc3<<32, acc0, acc0
	LSR	$32, acc3, t0
	MUL	acc3, const1, t1
	UMULH	acc3, const1, acc3
	ADCS	t0, acc1, acc1
	ADCS	t1, acc2, acc2
	ADC	$0, acc3, acc3
	// Add bits [511:256] of the sqr result
	ADDS	acc4, acc0, acc0
	ADCS	acc5, acc1, acc1
	ADCS	acc6, acc2, acc2
	ADCS	acc7, acc3, acc3
	ADC	$0, ZR, acc4

	// t = acc - p, including the carry limb acc4.
	SUBS	$-1, acc0, t0
	SBCS	const0, acc1, t1
	SBCS	$0, acc2, t2
	SBCS	const1, acc3, t3
	SBCS	$0, acc4, acc4

	// No borrow: take the subtracted value.
	CSEL	CS, t0, acc0, y0
	CSEL	CS, t1, acc1, y1
	CSEL	CS, t2, acc2, y2
	CSEL	CS, t3, acc3, y3
	RET
/* ---------------------------------------*/
// p256MulInternal: y = x * y, with one reduction step interleaved after
// each row y[i] * x and one conditional subtraction of the field prime.
//   In:       x0..x3, y0..y3, const0, const1
//   Out:      y0..y3
//   Written:  acc0..acc7, t0..t3, hlp0, flags. x0..x3 are not modified.
TEXT p256MulInternal<>(SB),NOSPLIT,$0
	// y[0] * x
	MUL	y0, x0, acc0
	UMULH	y0, x0, acc1

	MUL	y0, x1, t0
	ADDS	t0, acc1
	UMULH	y0, x1, acc2

	MUL	y0, x2, t0
	ADCS	t0, acc2
	UMULH	y0, x2, acc3

	MUL	y0, x3, t0
	ADCS	t0, acc3
	UMULH	y0, x3, acc4
	ADC	$0, acc4
	// First reduction step
	ADDS	acc0<<32, acc1, acc1
	LSR	$32, acc0, t0
	MUL	acc0, const1, t1
	UMULH	acc0, const1, acc0
	ADCS	t0, acc2
	ADCS	t1, acc3
	ADC	$0, acc0
	// y[1] * x
	MUL	y1, x0, t0
	ADDS	t0, acc1
	UMULH	y1, x0, t1

	MUL	y1, x1, t0
	ADCS	t0, acc2
	UMULH	y1, x1, t2

	MUL	y1, x2, t0
	ADCS	t0, acc3
	UMULH	y1, x2, t3

	MUL	y1, x3, t0
	ADCS	t0, acc4
	UMULH	y1, x3, hlp0
	ADC	$0, ZR, acc5

	ADDS	t1, acc2
	ADCS	t2, acc3
	ADCS	t3, acc4
	ADC	hlp0, acc5
	// Second reduction step
	ADDS	acc1<<32, acc2, acc2
	LSR	$32, acc1, t0
	MUL	acc1, const1, t1
	UMULH	acc1, const1, acc1
	ADCS	t0, acc3
	ADCS	t1, acc0
	ADC	$0, acc1
	// y[2] * x
	MUL	y2, x0, t0
	ADDS	t0, acc2
	UMULH	y2, x0, t1

	MUL	y2, x1, t0
	ADCS	t0, acc3
	UMULH	y2, x1, t2

	MUL	y2, x2, t0
	ADCS	t0, acc4
	UMULH	y2, x2, t3

	MUL	y2, x3, t0
	ADCS	t0, acc5
	UMULH	y2, x3, hlp0
	ADC	$0, ZR, acc6

	ADDS	t1, acc3
	ADCS	t2, acc4
	ADCS	t3, acc5
	ADC	hlp0, acc6
	// Third reduction step
	ADDS	acc2<<32, acc3, acc3
	LSR	$32, acc2, t0
	MUL	acc2, const1, t1
	UMULH	acc2, const1, acc2
	ADCS	t0, acc0
	ADCS	t1, acc1
	ADC	$0, acc2
	// y[3] * x
	MUL	y3, x0, t0
	ADDS	t0, acc3
	UMULH	y3, x0, t1

	MUL	y3, x1, t0
	ADCS	t0, acc4
	UMULH	y3, x1, t2

	MUL	y3, x2, t0
	ADCS	t0, acc5
	UMULH	y3, x2, t3

	MUL	y3, x3, t0
	ADCS	t0, acc6
	UMULH	y3, x3, hlp0
	ADC	$0, ZR, acc7

	ADDS	t1, acc4
	ADCS	t2, acc5
	ADCS	t3, acc6
	ADC	hlp0, acc7
	// Last reduction step
	ADDS	acc3<<32, acc0, acc0
	LSR	$32, acc3, t0
	MUL	acc3, const1, t1
	UMULH	acc3, const1, acc3
	ADCS	t0, acc1
	ADCS	t1, acc2
	ADC	$0, acc3
	// Add bits [511:256] of the mul result
	ADDS	acc4, acc0, acc0
	ADCS	acc5, acc1, acc1
	ADCS	acc6, acc2, acc2
	ADCS	acc7, acc3, acc3
	ADC	$0, ZR, acc4

	// t = acc - p, including the carry limb acc4.
	SUBS	$-1, acc0, t0
	SBCS	const0, acc1, t1
	SBCS	$0, acc2, t2
	SBCS	const1, acc3, t3
	SBCS	$0, acc4, acc4

	// No borrow: take the subtracted value.
	CSEL	CS, t0, acc0, y0
	CSEL	CS, t1, acc1, y1
	CSEL	CS, t2, acc2, y2
	CSEL	CS, t3, acc3, y3
	RET
/* ---------------------------------------*/
// p256MulBy2Inline: x = 2 * y modulo the field prime.
// Writes x0..x3, t0..t3, hlp0 and the flags; y0..y3 are not modified.
// hlp0 catches the carry out of the doubling; p is then subtracted and the
// un-subtracted value is kept only if that subtraction borrows (CC).
#define p256MulBy2Inline       \
	ADDS	y0, y0, x0;    \
	ADCS	y1, y1, x1;    \
	ADCS	y2, y2, x2;    \
	ADCS	y3, y3, x3;    \
	ADC	$0, ZR, hlp0;  \
	SUBS	$-1, x0, t0;   \
	SBCS	const0, x1, t1;\
	SBCS	$0, x2, t2;    \
	SBCS	const1, x3, t3;\
	SBCS	$0, hlp0, hlp0;\
	CSEL	CC, x0, t0, x0;\
	CSEL	CC, x1, t1, x1;\
	CSEL	CC, x2, t2, x2;\
	CSEL	CC, x3, t3, x3;
/* ---------------------------------------*/
// Operand addressing. A point is laid out as x, y, z at byte offsets 0, 32
// and 64 from its base pointer: point 1 is at a_ptr, point 2 at b_ptr and
// the result at res_ptr.
#define x1in(off) (off)(a_ptr)
#define y1in(off) (off + 32)(a_ptr)
#define z1in(off) (off + 64)(a_ptr)
#define x2in(off) (off)(b_ptr)
#define z2in(off) (off + 64)(b_ptr)
#define x3out(off) (off)(res_ptr)
#define y3out(off) (off + 32)(res_ptr)
#define z3out(off) (off + 64)(res_ptr)
// LDx/LDy load a 256-bit value into x0..x3 / y0..y3; STx/STy store one.
// The argument is one of the addressing macros, passed without its
// argument list.
#define LDx(src) LDP src(0), (x0, x1); LDP src(16), (x2, x3)
#define LDy(src) LDP src(0), (y0, y1); LDP src(16), (y2, y3)
#define STx(src) STP (x0, x1), src(0); STP (x2, x3), src(16)
#define STy(src) STP (y0, y1), src(0); STP (y2, y3), src(16)
/* ---------------------------------------*/
// Stack temporaries: 32-byte slots starting at 8(RSP).
// Slots 0..7 are used by p256PointAddAffineAsm (frame 264 = 8 + 8*32).
#define y2in(off)  (32*0 + 8 + off)(RSP)
#define s2(off)    (32*1 + 8 + off)(RSP)
#define z1sqr(off) (32*2 + 8 + off)(RSP)
#define h(off)	   (32*3 + 8 + off)(RSP)
#define r(off)	   (32*4 + 8 + off)(RSP)
#define hsqr(off)  (32*5 + 8 + off)(RSP)
#define rsqr(off)  (32*6 + 8 + off)(RSP)
#define hcub(off)  (32*7 + 8 + off)(RSP)

// Slots 8..11 are used in addition by p256PointAddAsm (frame 392 = 8 + 12*32).
#define z2sqr(off) (32*8 + 8 + off)(RSP)
#define s1(off) (32*9 + 8 + off)(RSP)
#define u1(off) (32*10 + 8 + off)(RSP)
#define u2(off) (32*11 + 8 + off)(RSP)

// func p256PointAddAffineAsm(res, in1 *P256Point, in2 *p256AffinePoint, sign, sel, zero int)
// Adds the affine point in2 to the point in1 and writes the sum to res.
//   sign != 0  y2 is replaced by its negation before the addition.
//   sel  == 0  the result is in1 instead of the computed sum.
//   zero == 0  the result is (x2, y2, p256one) instead.
// The second override is applied after the first, so it wins if both
// conditions hold. The full addition is always computed; the overrides are
// applied with CSEL when each output coordinate is stored.
//
// hlp1 (= res_ptr, R0) carries the two select bits for the whole routine,
// so the result pointer is reloaded from the frame into t0 before each
// store.
TEXT ·p256PointAddAffineAsm(SB),0,$264-48
	MOVD	in1+8(FP), a_ptr
	MOVD	in2+16(FP), b_ptr
	MOVD	sign+24(FP), hlp0
	MOVD	sel+32(FP), hlp1
	MOVD	zero+40(FP), t2

	// Normalise sel and zero to 0/1 and pack them:
	// hlp1 bit 0 = (sel != 0), hlp1 bit 1 = (zero != 0).
	MOVD	$1, t0
	CMP	$0, t2
	CSEL	EQ, ZR, t0, t2
	CMP	$0, hlp1
	CSEL	EQ, ZR, t0, hlp1

	MOVD	p256const0<>(SB), const0
	MOVD	p256const1<>(SB), const1
	EOR	t2<<1, hlp1

	// Negate y2in based on sign
	LDP	2*16(b_ptr), (y0, y1)
	LDP	3*16(b_ptr), (y2, y3)
	MOVD	$-1, acc0

	// acc0..acc3 = p - y2, with t0 recording the borrow.
	SUBS	y0, acc0, acc0
	SBCS	y1, const0, acc1
	SBCS	y2, ZR, acc2
	SBCS	y3, const1, acc3
	SBC	$0, ZR, t0

	// acc4..acc7 = (p - y2) + p; the carry out is folded into t0.
	ADDS	$-1, acc0, acc4
	ADCS	const0, acc1, acc5
	ADCS	$0, acc2, acc6
	ADCS	const1, acc3, acc7
	ADC	$0, t0, t0

	// t0 == 0 selects (p - y2) + p, otherwise p - y2 is kept.
	CMP	$0, t0
	CSEL	EQ, acc4, acc0, acc0
	CSEL	EQ, acc5, acc1, acc1
	CSEL	EQ, acc6, acc2, acc2
	CSEL	EQ, acc7, acc3, acc3
	// If condition is 0, keep original value
	CMP	$0, hlp0
	CSEL	EQ, y0, acc0, y0
	CSEL	EQ, y1, acc1, y1
	CSEL	EQ, y2, acc2, y2
	CSEL	EQ, y3, acc3, y3
	// Store result
	STy(y2in)
	// Begin point add
	LDx(z1in)
	CALL	p256SqrInternal<>(SB)    // z1^2
	STy(z1sqr)

	LDx(x2in)
	CALL	p256MulInternal<>(SB)    // x2 * z1^2

	LDx(x1in)
	CALL	p256SubInternal<>(SB)    // h = u2 - u1
	STx(h)

	LDy(z1in)
	CALL	p256MulInternal<>(SB)    // z3 = h * z1

	LDP	4*16(a_ptr), (acc0, acc1)// iff select[0] == 0, z3 = z1
	LDP	5*16(a_ptr), (acc2, acc3)
	ANDS	$1, hlp1, ZR
	CSEL	EQ, acc0, y0, y0
	CSEL	EQ, acc1, y1, y1
	CSEL	EQ, acc2, y2, y2
	CSEL	EQ, acc3, y3, y3
	LDP	p256one<>+0x00(SB), (acc0, acc1)
	LDP	p256one<>+0x10(SB), (acc2, acc3)
	ANDS	$2, hlp1, ZR            // iff select[1] == 0, z3 = 1
	CSEL	EQ, acc0, y0, y0
	CSEL	EQ, acc1, y1, y1
	CSEL	EQ, acc2, y2, y2
	CSEL	EQ, acc3, y3, y3
	// z1 is loaded into x before z3 is stored through the result pointer.
	LDx(z1in)
	MOVD	res+0(FP), t0
	STP	(y0, y1), 4*16(t0)
	STP	(y2, y3), 5*16(t0)

	LDy(z1sqr)
	CALL	p256MulInternal<>(SB)    // z1 ^ 3

	LDx(y2in)
	CALL	p256MulInternal<>(SB)    // s2 = y2 * z1^3
	STy(s2)

	LDx(y1in)
	CALL	p256SubInternal<>(SB)    // r = s2 - s1
	STx(r)

	CALL	p256SqrInternal<>(SB)    // rsqr = r^2
	STy	(rsqr)

	LDx(h)
	CALL	p256SqrInternal<>(SB)    // hsqr = h^2
	STy(hsqr)

	// x still holds h and y holds h^2 from the square above.
	CALL	p256MulInternal<>(SB)    // hcub = h^3
	STy(hcub)

	LDx(y1in)
	CALL	p256MulInternal<>(SB)    // y1 * h^3
	// The s2 slot is reused for y1 * h^3.
	STy(s2)

	LDP	hsqr(0*8), (x0, x1)
	LDP	hsqr(2*8), (x2, x3)
	LDP	0*16(a_ptr), (y0, y1)
	LDP	1*16(a_ptr), (y2, y3)
	CALL	p256MulInternal<>(SB)    // u1 * h^2
	// The h slot is reused for u1 * h^2.
	STP	(y0, y1), h(0*8)
	STP	(y2, y3), h(2*8)

	p256MulBy2Inline               // u1 * h^2 * 2, inline

	LDy(rsqr)
	CALL	p256SubInternal<>(SB)    // r^2 - u1 * h^2 * 2

	MOVD	x0, y0
	MOVD	x1, y1
	MOVD	x2, y2
	MOVD	x3, y3
	LDx(hcub)
	CALL	p256SubInternal<>(SB)    // x3 = r^2 - 2 * u1 * h^2 - h^3

	LDP	0*16(a_ptr), (acc0, acc1)
	LDP	1*16(a_ptr), (acc2, acc3)
	ANDS	$1, hlp1, ZR           // iff select[0] == 0, x3 = x1
	CSEL	EQ, acc0, x0, x0
	CSEL	EQ, acc1, x1, x1
	CSEL	EQ, acc2, x2, x2
	CSEL	EQ, acc3, x3, x3
	LDP	0*16(b_ptr), (acc0, acc1)
	LDP	1*16(b_ptr), (acc2, acc3)
	ANDS	$2, hlp1, ZR           // iff select[1] == 0, x3 = x2
	CSEL	EQ, acc0, x0, x0
	CSEL	EQ, acc1, x1, x1
	CSEL	EQ, acc2, x2, x2
	CSEL	EQ, acc3, x3, x3
	MOVD	res+0(FP), t0
	STP	(x0, x1), 0*16(t0)
	STP	(x2, x3), 1*16(t0)

	// x = (h slot) - x, where x is the value just stored as x3 (after the
	// selects above) and the h slot holds u1 * h^2.
	LDP	h(0*8), (y0, y1)
	LDP	h(2*8), (y2, y3)
	CALL	p256SubInternal<>(SB)

	// y = r * x
	LDP	r(0*8), (y0, y1)
	LDP	r(2*8), (y2, y3)
	CALL	p256MulInternal<>(SB)

	// x = y - (s2 slot), where the s2 slot holds y1 * h^3.
	LDP	s2(0*8), (x0, x1)
	LDP	s2(2*8), (x2, x3)
	CALL	p256SubInternal<>(SB)
	LDP	2*16(a_ptr), (acc0, acc1)
	LDP	3*16(a_ptr), (acc2, acc3)
	ANDS	$1, hlp1, ZR           // iff select[0] == 0, y3 = y1
	CSEL	EQ, acc0, x0, x0
	CSEL	EQ, acc1, x1, x1
	CSEL	EQ, acc2, x2, x2
	CSEL	EQ, acc3, x3, x3
	LDP	y2in(0*8), (acc0, acc1)
	LDP	y2in(2*8), (acc2, acc3)
	ANDS	$2, hlp1, ZR           // iff select[1] == 0, y3 = y2
	CSEL	EQ, acc0, x0, x0
	CSEL	EQ, acc1, x1, x1
	CSEL	EQ, acc2, x2, x2
	CSEL	EQ, acc3, x3, x3
	MOVD	res+0(FP), t0
	STP	(x0, x1), 2*16(t0)
	STP	(x2, x3), 3*16(t0)

	RET

// p256AddInline: x = x + y modulo the field prime.
// Writes x0..x3, t0..t3, hlp0 and the flags; y0..y3 are not modified.
// Same structure as p256MulBy2Inline, with x + y in place of y + y.
#define p256AddInline          \
	ADDS	y0, x0, x0;    \
	ADCS	y1, x1, x1;    \
	ADCS	y2, x2, x2;    \
	ADCS	y3, x3, x3;    \
	ADC	$0, ZR, hlp0;  \
	SUBS	$-1, x0, t0;   \
	SBCS	const0, x1, t1;\
	SBCS	$0, x2, t2;    \
	SBCS	const1, x3, t3;\
	SBCS	$0, hlp0, hlp0;\
	CSEL	CC, x0, t0, x0;\
	CSEL	CC, x1, t1, x1;\
	CSEL	CC, x2, t2, x2;\
	CSEL	CC, x3, t3, x3;

// Stack temporaries of p256PointDoubleAsm: four 32-byte slots starting at
// 8(RSP) (frame 136 = 8 + 4*32).
#define s(off)	(32*0 + 8 + off)(RSP)
#define m(off)	(32*1 + 8 + off)(RSP)
#define zsqr(off) (32*2 + 8 + off)(RSP)
#define tmp(off)  (32*3 + 8 + off)(RSP)

//func p256PointDoubleAsm(res, in *P256Point)
// res = 2 * in. With (x, y, z) the coordinates of in, the code computes
//   m  = 3 * (x - z^2) * (x + z^2)
//   s  = 4 * x * y^2
//   x3 = m^2 - 2 * s
//   y3 = m * (s - x3) - 8 * y^4
//   z3 = 2 * y * z
// The y3 slot of res is used as a temporary for 8 * y^4 until the last step.
// Unlike the point-add routines this one does not use hlp1, so res_ptr
// stays valid throughout.
TEXT ·p256PointDoubleAsm(SB),NOSPLIT,$136-16
	MOVD	res+0(FP), res_ptr
	MOVD	in+8(FP), a_ptr

	MOVD	p256const0<>(SB), const0
	MOVD	p256const1<>(SB), const1

	// Begin point double
	LDP	4*16(a_ptr), (x0, x1)
	LDP	5*16(a_ptr), (x2, x3)
	CALL	p256SqrInternal<>(SB)    // zsqr = z^2
	STP	(y0, y1), zsqr(0*8)
	STP	(y2, y3), zsqr(2*8)

	LDP	0*16(a_ptr), (x0, x1)
	LDP	1*16(a_ptr), (x2, x3)
	p256AddInline                  // x + z^2
	STx(m)

	LDx(z1in)
	LDy(y1in)
	CALL	p256MulInternal<>(SB)    // y * z
	p256MulBy2Inline               // z3 = 2 * y * z
	STx(z3out)

	LDy(x1in)
	LDx(zsqr)
	CALL	p256SubInternal<>(SB)    // x - z^2
	LDy(m)
	CALL	p256MulInternal<>(SB)    // (x - z^2) * (x + z^2)

	// Multiply by 3
	p256MulBy2Inline
	p256AddInline
	STx(m)

	LDy(y1in)
	p256MulBy2Inline               // 2 * y
	CALL	p256SqrInternal<>(SB)    // 4 * y^2
	STy(s)
	MOVD	y0, x0
	MOVD	y1, x1
	MOVD	y2, x2
	MOVD	y3, x3
	CALL	p256SqrInternal<>(SB)    // 16 * y^4

	// Divide by 2
	// t = y + p, with the carry out kept in hlp0.
	ADDS	$-1, y0, t0
	ADCS	const0, y1, t1
	ADCS	$0, y2, t2
	ADCS	const1, y3, t3
	ADC	$0, ZR, hlp0

	// If y is even use y itself, otherwise y + p; hlp0 is masked with y0
	// so that its bit 0 keeps the carry only when y + p was selected.
	ANDS	$1, y0, ZR
	CSEL	EQ, y0, t0, t0
	CSEL	EQ, y1, t1, t1
	CSEL	EQ, y2, t2, t2
	CSEL	EQ, y3, t3, t3
	AND	y0, hlp0, hlp0

	// Shift the 257-bit value hlp0:t3:t2:t1:t0 right by one bit.
	EXTR	$1, t0, t1, y0
	EXTR	$1, t1, t2, y1
	EXTR	$1, t2, t3, y2
	EXTR	$1, t3, hlp0, y3
	STy(y3out)                     // 8 * y^4, parked in res.y

	LDx(x1in)
	LDy(s)
	CALL	p256MulInternal<>(SB)    // s = 4 * x * y^2
	STy(s)
	p256MulBy2Inline               // 2 * s
	STx(tmp)

	LDx(m)
	CALL	p256SqrInternal<>(SB)    // m^2
	LDx(tmp)
	CALL	p256SubInternal<>(SB)    // x3 = m^2 - 2 * s

	STx(x3out)

	LDy(s)
	CALL	p256SubInternal<>(SB)    // s - x3

	LDy(m)
	CALL	p256MulInternal<>(SB)    // m * (s - x3)

	LDx(y3out)
	CALL	p256SubInternal<>(SB)    // y3 = m * (s - x3) - 8 * y^4
	STx(y3out)
	RET
/*
---------------------------------------*/
// Re-bind the operand macros for p256PointAddAsm: y2in now addresses the y
// coordinate of the second input point through b_ptr instead of a stack
// slot, and the output macros go through b_ptr instead of res_ptr. The
// routine below uses hlp1 (= res_ptr, R0) as a flag and loads the result
// pointer into b_ptr once in2 is no longer needed.
#undef y2in
#undef x3out
#undef y3out
#undef z3out
#define y2in(off) (off + 32)(b_ptr)
#define x3out(off) (off)(b_ptr)
#define y3out(off) (off + 32)(b_ptr)
#define z3out(off) (off + 64)(b_ptr)
// func p256PointAddAsm(res, in1, in2 *P256Point) int
// res = in1 + in2, computed with the generic addition formulas.
// The return value is 1 when both r = s2 - s1 and h = u2 - u1 are zero
// modulo the field prime, and 0 otherwise. Each of r and h is compared
// against both 0 and p, since either representation can come out of
// p256SubInternal. The sum is computed and stored regardless of the flag.
// NOTE(review): r == 0 and h == 0 presumably means the inputs are the same
// point and the caller must double instead - confirm against the caller.
TEXT ·p256PointAddAsm(SB),0,$392-32
	// See https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl
	// Load the input pointers; intermediate values are kept in stack slots.
	MOVD	in1+8(FP), a_ptr
	MOVD	in2+16(FP), b_ptr

	MOVD	p256const0<>(SB), const0
	MOVD	p256const1<>(SB), const1

	// Begin point add
	LDx(z2in)
	CALL	p256SqrInternal<>(SB)    // z2^2
	STy(z2sqr)

	// x still holds z2 and y holds z2^2.
	CALL	p256MulInternal<>(SB)    // z2^3

	LDx(y1in)
	CALL	p256MulInternal<>(SB)    // s1 = z2^3*y1
	STy(s1)

	LDx(z1in)
	CALL	p256SqrInternal<>(SB)    // z1^2
	STy(z1sqr)

	CALL	p256MulInternal<>(SB)    // z1^3

	LDx(y2in)
	CALL	p256MulInternal<>(SB)    // s2 = z1^3*y2

	LDx(s1)
	CALL	p256SubInternal<>(SB)    // r = s2 - s1
	STx(r)

	// hlp1 = 1 if r is all zero ...
	MOVD	$1, t2
	ORR	x0, x1, t0             // Check if zero mod p256
	ORR	x2, x3, t1
	ORR	t1, t0, t0
	CMP	$0, t0
	CSEL	EQ, t2, ZR, hlp1

	// ... or if r equals p = {-1, const0, 0, const1}: XOR each limb with
	// the matching limb of p (limb 2 of p is zero, so x2 is used as is)
	// and test that the OR of the results is zero.
	EOR	$-1, x0, t0
	EOR	const0, x1, t1
	EOR	const1, x3, t3

	ORR	t0, t1, t0
	ORR	x2, t3, t1
	ORR	t1, t0, t0
	CMP	$0, t0
	CSEL	EQ, t2, hlp1, hlp1

	LDx(z2sqr)
	LDy(x1in)
	CALL	p256MulInternal<>(SB)    // u1 = x1 * z2^2
	STy(u1)

	LDx(z1sqr)
	LDy(x2in)
	CALL	p256MulInternal<>(SB)    // u2 = x2 * z1^2
	STy(u2)

	LDx(u1)
	CALL	p256SubInternal<>(SB)    // h = u2 - u1
	STx(h)

	// hlp0 = 1 if h is zero mod p, same test as for r above.
	MOVD	$1, t2
	ORR	x0, x1, t0             // Check if zero mod p256
	ORR	x2, x3, t1
	ORR	t1, t0, t0
	CMP	$0, t0
	CSEL	EQ, t2, ZR, hlp0

	EOR	$-1, x0, t0
	EOR	const0, x1, t1
	EOR	const1, x3, t3

	ORR	t0, t1, t0
	ORR	x2, t3, t1
	ORR	t1, t0, t0
	CMP	$0, t0
	CSEL	EQ, t2, hlp0, hlp0

	// Return flag: r == 0 and h == 0. hlp1 is kept until the end of the
	// routine; hlp0 is written again by the calls below.
	AND	hlp0, hlp1, hlp1

	LDx(r)
	CALL	p256SqrInternal<>(SB)    // rsqr = r^2
	STy(rsqr)

	LDx(h)
	CALL	p256SqrInternal<>(SB)    // hsqr = h^2
	STy(hsqr)

	LDx(h)
	CALL	p256MulInternal<>(SB)    // hcub = h^3
	STy(hcub)

	LDx(s1)
	CALL	p256MulInternal<>(SB)    // s1 * h^3
	// The s2 slot is reused for s1 * h^3.
	STy(s2)

	LDx(z1in)
	LDy(z2in)
	CALL	p256MulInternal<>(SB)    // z1 * z2
	LDx(h)
	CALL	p256MulInternal<>(SB)    // z1 * z2 * h
	// in2 is not read past this point; b_ptr becomes the result pointer
	// that the x3out/y3out/z3out macros address.
	MOVD	res+0(FP), b_ptr
	STy(z3out)

	LDx(hsqr)
	LDy(u1)
	CALL	p256MulInternal<>(SB)    // h^2 * u1
	// The u2 slot is reused for u1 * h^2.
	STy(u2)

	p256MulBy2Inline               // u1 * h^2 * 2, inline
	LDy(rsqr)
	CALL	p256SubInternal<>(SB)    // r^2 - u1 * h^2 * 2

	MOVD	x0, y0
	MOVD	x1, y1
	MOVD	x2, y2
	MOVD	x3, y3
	LDx(hcub)
	CALL	p256SubInternal<>(SB)    // x3 = r^2 - 2 * u1 * h^2 - h^3
	STx(x3out)

	LDy(u2)
	CALL	p256SubInternal<>(SB)    // u1 * h^2 - x3

	LDy(r)
	CALL	p256MulInternal<>(SB)    // r * (u1 * h^2 - x3)

	LDx(s2)
	CALL	p256SubInternal<>(SB)    // y3 = r * (u1 * h^2 - x3) - s1 * h^3
	STx(y3out)

	MOVD	hlp1, R0
	MOVD	R0, ret+24(FP)

	RET