// Copyright 2019 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build (ppc64 || ppc64le) && !purego

// Portions based on CRYPTOGAMS code with the following comment:
// # ====================================================================
// # Written by Andy Polyakov <[email protected]> for the OpenSSL
// # project. The module is, however, dual licensed under OpenSSL and
// # CRYPTOGAMS licenses depending on where you obtain it. For further
// # details see http://www.openssl.org/~appro/cryptogams/.
// # ====================================================================

// The implementations of gcmHash, gcmInit and gcmMul are based on the asm
// generated by the script https://github.com/dot-asm/cryptogams/blob/master/ppc/ghashp8-ppc.pl
// at commit d47afb3c.

// Changes were made due to differences in the ABI and some register usage.
// Some arguments were changed due to the way the Go code passes them.

// Portions that use the stitched AES-GCM approach in counterCryptASM
// are based on code found in
// https://github.com/IBM/ipcri/blob/main/aes/p10_aes_gcm.s

#include "textflag.h"

#define XIP R3
#define HTBL R4
#define INP R5
#define LEN R6

#define XL V0
#define XM V1
#define XH V2
#define IN V3
#define ZERO V4
#define T0 V5
#define T1 V6
#define T2 V7
#define XC2 V8
#define H V9
#define HH V10
#define HL V11
#define LEMASK V12
#define XL1 V13
#define XM1 V14
#define XH1 V15
#define IN1 V16
#define H2 V17
#define H2H V18
#define H2L V19
#define XL3 V20
#define XM2 V21
#define IN2 V22
#define H3L V23
#define H3 V24
#define H3H V25
#define XH3 V26
#define XM3 V27
#define IN3 V28
#define H4L V29
#define H4 V30
#define H4H V31

#define IN0 IN
#define H21L HL
#define H21H HH
#define LOPERM H2L
#define HIPERM H2H

#define VXL VS32
#define VIN VS35
#define VXC2 VS40
#define VH VS41
#define VHH VS42
#define VHL VS43
#define VIN1 VS48
#define VH2 VS49
#define VH2H VS50
#define VH2L VS51

#define VIN2 VS54
#define VH3L VS55
#define VH3 VS56
#define VH3H VS57
#define VIN3 VS60
#define VH4L VS61
#define VH4 VS62
#define VH4H VS63

#define VIN0 VIN

#define ESPERM V10
#define TMP2 V11

// The following macros provide appropriate
// implementations for the target endianness,
// as well as for the POWER8 and POWER9 ISAs.
#ifdef GOARCH_ppc64le
# ifdef GOPPC64_power9
#define P8_LXVB16X(RA,RB,VT)  LXVB16X (RA)(RB), VT
#define P8_STXVB16X(VS,RA,RB) STXVB16X VS, (RA)(RB)
# else
#define NEEDS_ESPERM
#define P8_LXVB16X(RA,RB,VT) \
	LXVD2X (RA+RB), VT; \
	VPERM VT, VT, ESPERM, VT

#define P8_STXVB16X(VS,RA,RB) \
	VPERM VS, VS, ESPERM, TMP2; \
	STXVD2X TMP2, (RA+RB)

# endif
#else
#define P8_LXVB16X(RA,RB,VT) \
	LXVD2X (RA+RB), VT

#define P8_STXVB16X(VS,RA,RB) \
	STXVD2X VS, (RA+RB)

#endif
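// For illustration (not part of the build): on little endian targets
// before POWER9, P8_LXVB16X emulates a big endian 16-byte vector load
// by pairing LXVD2X with a VPERM byte reversal through ESPERM, so
//
//	P8_LXVB16X(INP, R0, V1)
//
// expands to roughly
//
//	LXVD2X (INP+R0), V1        // load in host element order
//	VPERM  V1, V1, ESPERM, V1  // reverse to big endian byte order
//
// POWER9 has LXVB16X, and big endian targets need no fixup.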
#define MASK_PTR R8

#define MASKV V0
#define INV V1

// The following macros are used by
// the stitched implementation within
// counterCryptASM.

// Load the initial GCM counter value
// into V30 and set up the counter increment
// in V31.
#define SETUP_COUNTER \
	P8_LXVB16X(COUNTER, R0, V30); \
	VSPLTISB $1, V28; \
	VXOR V31, V31, V31; \
	VSLDOI $1, V31, V28, V31

// These macros set up the initial value
// for a single encryption, or for 4 or 8
// stitched encryptions implemented
// with interleaved vciphers.
//
// The input value for each encryption
// is generated by XORing the counter
// from V30 with the first key in VS0
// and incrementing the counter.
//
// Single encryption in V15
#define GEN_VCIPHER_INPUT \
	XXLOR VS0, VS0, V29; \
	VXOR V30, V29, V15; \
	VADDUWM V30, V31, V30

// 4 encryptions in V15 - V18
#define GEN_VCIPHER_4_INPUTS \
	XXLOR VS0, VS0, V29; \
	VXOR V30, V29, V15; \
	VADDUWM V30, V31, V30; \
	VXOR V30, V29, V16; \
	VADDUWM V30, V31, V30; \
	VXOR V30, V29, V17; \
	VADDUWM V30, V31, V30; \
	VXOR V30, V29, V18; \
	VADDUWM V30, V31, V30

// 8 encryptions in V15 - V22
#define GEN_VCIPHER_8_INPUTS \
	XXLOR VS0, VS0, V29; \
	VXOR V30, V29, V15; \
	VADDUWM V30, V31, V30; \
	VXOR V30, V29, V16; \
	VADDUWM V30, V31, V30; \
	VXOR V30, V29, V17; \
	VADDUWM V30, V31, V30; \
	VXOR V30, V29, V18; \
	VADDUWM V30, V31, V30; \
	VXOR V30, V29, V19; \
	VADDUWM V30, V31, V30; \
	VXOR V30, V29, V20; \
	VADDUWM V30, V31, V30; \
	VXOR V30, V29, V21; \
	VADDUWM V30, V31, V30; \
	VXOR V30, V29, V22; \
	VADDUWM V30, V31, V30

// Load the keys to be used for
// encryption based on key_len.
// Keys are in VS0 - VS14,
// depending on key_len.
// Valid key sizes are verified here;
// an invalid key_len forces a crash
// via a store to address 0. CR2 is set
// and used throughout to check key_len.
#define LOAD_KEYS(blk_key, key_len) \
	MOVD $16, R16; \
	MOVD $32, R17; \
	MOVD $48, R18; \
	MOVD $64, R19; \
	LXVD2X (blk_key)(R0), VS0; \
	LXVD2X (blk_key)(R16), VS1; \
	LXVD2X (blk_key)(R17), VS2; \
	LXVD2X (blk_key)(R18), VS3; \
	LXVD2X (blk_key)(R19), VS4; \
	ADD $64, R16; \
	ADD $64, R17; \
	ADD $64, R18; \
	ADD $64, R19; \
	LXVD2X (blk_key)(R16), VS5; \
	LXVD2X (blk_key)(R17), VS6; \
	LXVD2X (blk_key)(R18), VS7; \
	LXVD2X (blk_key)(R19), VS8; \
	ADD $64, R16; \
	ADD $64, R17; \
	ADD $64, R18; \
	ADD $64, R19; \
	LXVD2X (blk_key)(R16), VS9; \
	LXVD2X (blk_key)(R17), VS10; \
	CMP key_len, $12, CR2; \
	CMP key_len, $10; \
	BEQ keysLoaded; \
	LXVD2X (blk_key)(R18), VS11; \
	LXVD2X (blk_key)(R19), VS12; \
	BEQ CR2, keysLoaded; \
	ADD $64, R16; \
	ADD $64, R17; \
	LXVD2X (blk_key)(R16), VS13; \
	LXVD2X (blk_key)(R17), VS14; \
	CMP key_len, $14; \
	BEQ keysLoaded; \
	MOVD R0, 0(R0); \
keysLoaded:
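// For reference, LOAD_KEYS dispatches on the AES round count roughly
// like this Go-style sketch (illustration only):
//
//	switch keyLen {
//	case 10: // AES-128: round keys in VS0 - VS10
//	case 12: // AES-192: round keys in VS0 - VS12
//	case 14: // AES-256: round keys in VS0 - VS14
//	default: // crash on the MOVD R0, 0(R0) store to address 0
//	}
//
// CR2 keeps the key_len comparison result so the blocks below can
// branch on the key size without repeating the CMP.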
// Encrypt 1 value (vin) with the first 9
// keys from VS1 - VS9.
#define VCIPHER_1X9_KEYS(vin) \
	XXLOR VS1, VS1, V23; \
	XXLOR VS2, VS2, V24; \
	XXLOR VS3, VS3, V25; \
	XXLOR VS4, VS4, V26; \
	XXLOR VS5, VS5, V27; \
	VCIPHER vin, V23, vin; \
	VCIPHER vin, V24, vin; \
	VCIPHER vin, V25, vin; \
	VCIPHER vin, V26, vin; \
	VCIPHER vin, V27, vin; \
	XXLOR VS6, VS6, V23; \
	XXLOR VS7, VS7, V24; \
	XXLOR VS8, VS8, V25; \
	XXLOR VS9, VS9, V26; \
	VCIPHER vin, V23, vin; \
	VCIPHER vin, V24, vin; \
	VCIPHER vin, V25, vin; \
	VCIPHER vin, V26, vin

// Encrypt 1 value (vin) with
// the 2 specified keys.
#define VCIPHER_1X2_KEYS(vin, key1, key2) \
	XXLOR key1, key1, V25; \
	XXLOR key2, key2, V26; \
	VCIPHER vin, V25, vin; \
	VCIPHER vin, V26, vin

// Encrypt the 4 values in V15 - V18
// with the specified key from
// VS1 - VS9.
#define VCIPHER_4X1_KEY(key) \
	XXLOR key, key, V23; \
	VCIPHER V15, V23, V15; \
	VCIPHER V16, V23, V16; \
	VCIPHER V17, V23, V17; \
	VCIPHER V18, V23, V18

// Encrypt the 8 values in V15 - V22
// with the specified key,
// assuming it is a VSR.
#define VCIPHER_8X1_KEY(key) \
	XXLOR key, key, V23; \
	VCIPHER V15, V23, V15; \
	VCIPHER V16, V23, V16; \
	VCIPHER V17, V23, V17; \
	VCIPHER V18, V23, V18; \
	VCIPHER V19, V23, V19; \
	VCIPHER V20, V23, V20; \
	VCIPHER V21, V23, V21; \
	VCIPHER V22, V23, V22

// Load an input block into V1 - V4
// in big endian order and
// update blk_inp by 64.
#define LOAD_INPUT_BLOCK64(blk_inp) \
	MOVD $16, R16; \
	MOVD $32, R17; \
	MOVD $48, R18; \
	P8_LXVB16X(blk_inp,R0,V1); \
	P8_LXVB16X(blk_inp,R16,V2); \
	P8_LXVB16X(blk_inp,R17,V3); \
	P8_LXVB16X(blk_inp,R18,V4); \
	ADD $64, blk_inp

// Load an input block into V1 - V8
// in big endian order and
// update blk_inp by 128.
#define LOAD_INPUT_BLOCK128(blk_inp) \
	MOVD $16, R16; \
	MOVD $32, R17; \
	MOVD $48, R18; \
	MOVD $64, R19; \
	MOVD $80, R20; \
	MOVD $96, R21; \
	MOVD $112, R22; \
	P8_LXVB16X(blk_inp,R0,V1); \
	P8_LXVB16X(blk_inp,R16,V2); \
	P8_LXVB16X(blk_inp,R17,V3); \
	P8_LXVB16X(blk_inp,R18,V4); \
	P8_LXVB16X(blk_inp,R19,V5); \
	P8_LXVB16X(blk_inp,R20,V6); \
	P8_LXVB16X(blk_inp,R21,V7); \
	P8_LXVB16X(blk_inp,R22,V8); \
	ADD $128, blk_inp

// Finish encryption on 8 streams and
// XOR with the input block.
#define VCIPHERLAST8_XOR_INPUT \
	VCIPHERLAST V15, V23, V15; \
	VCIPHERLAST V16, V23, V16; \
	VCIPHERLAST V17, V23, V17; \
	VCIPHERLAST V18, V23, V18; \
	VCIPHERLAST V19, V23, V19; \
	VCIPHERLAST V20, V23, V20; \
	VCIPHERLAST V21, V23, V21; \
	VCIPHERLAST V22, V23, V22; \
	XXLXOR V1, V15, V1; \
	XXLXOR V2, V16, V2; \
	XXLXOR V3, V17, V3; \
	XXLXOR V4, V18, V4; \
	XXLXOR V5, V19, V5; \
	XXLXOR V6, V20, V6; \
	XXLXOR V7, V21, V7; \
	XXLXOR V8, V22, V8

// Finish encryption on 4 streams and
// XOR with the input block.
#define VCIPHERLAST4_XOR_INPUT \
	VCIPHERLAST V15, V23, V15; \
	VCIPHERLAST V16, V23, V16; \
	VCIPHERLAST V17, V23, V17; \
	VCIPHERLAST V18, V23, V18; \
	XXLXOR V1, V15, V1; \
	XXLXOR V2, V16, V2; \
	XXLXOR V3, V17, V3; \
	XXLXOR V4, V18, V4
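// The VCIPHERLAST*_XOR_INPUT macros perform the CTR combining step:
// each completed keystream block is XORed with the matching input
// block, i.e. roughly
//
//	out[i] = AES-Encrypt(counter+i) ^ in[i]
//
// which is the same operation whether encrypting or decrypting.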
// Store the output block from V1 - V8
// in big endian order and
// update blk_out by 128.
#define STORE_OUTPUT_BLOCK128(blk_out) \
	P8_STXVB16X(V1,blk_out,R0); \
	P8_STXVB16X(V2,blk_out,R16); \
	P8_STXVB16X(V3,blk_out,R17); \
	P8_STXVB16X(V4,blk_out,R18); \
	P8_STXVB16X(V5,blk_out,R19); \
	P8_STXVB16X(V6,blk_out,R20); \
	P8_STXVB16X(V7,blk_out,R21); \
	P8_STXVB16X(V8,blk_out,R22); \
	ADD $128, blk_out

// Store the output block from V1 - V4
// in big endian order and
// update blk_out by 64.
#define STORE_OUTPUT_BLOCK64(blk_out) \
	P8_STXVB16X(V1,blk_out,R0); \
	P8_STXVB16X(V2,blk_out,R16); \
	P8_STXVB16X(V3,blk_out,R17); \
	P8_STXVB16X(V4,blk_out,R18); \
	ADD $64, blk_out

// func gcmInit(productTable *[256]byte, h []byte)
TEXT ·gcmInit(SB), NOSPLIT, $0-32
	MOVD productTable+0(FP), XIP
	MOVD h+8(FP), HTBL

	MOVD $0x10, R8
	MOVD $0x20, R9
	MOVD $0x30, R10
	LXVD2X (HTBL)(R0), VH // Load H

	VSPLTISB $-16, XC2         // 0xf0
	VSPLTISB $1, T0            // one
	VADDUBM XC2, XC2, XC2      // 0xe0
	VXOR ZERO, ZERO, ZERO
	VOR XC2, T0, XC2           // 0xe1
	VSLDOI $15, XC2, ZERO, XC2 // 0xe1...
	VSLDOI $1, ZERO, T0, T1    // ...1
	VADDUBM XC2, XC2, XC2      // 0xc2...
	VSPLTISB $7, T2
	VOR XC2, T1, XC2           // 0xc2....01
	VSPLTB $0, H, T1           // most significant byte
	VSL H, T0, H               // H<<=1
	VSRAB T1, T2, T1           // broadcast carry bit
	VAND T1, XC2, T1
	VXOR H, T1, IN             // twisted H

	VSLDOI $8, IN, IN, H      // twist even more ...
	VSLDOI $8, ZERO, XC2, XC2 // 0xc2.0
	VSLDOI $8, ZERO, H, HL    // ... and split
	VSLDOI $8, H, ZERO, HH

	STXVD2X VXC2, (XIP+R0) // save pre-computed table
	STXVD2X VHL, (XIP+R8)
	MOVD $0x40, R8
	STXVD2X VH, (XIP+R9)
	MOVD $0x50, R9
	STXVD2X VHH, (XIP+R10)
	MOVD $0x60, R10

	VPMSUMD IN, HL, XL // H.lo·H.lo
	VPMSUMD IN, H, XM  // H.hi·H.lo+H.lo·H.hi
	VPMSUMD IN, HH, XH // H.hi·H.hi

	VPMSUMD XL, XC2, T2 // 1st reduction phase

	VSLDOI $8, XM, ZERO, T0
	VSLDOI $8, ZERO, XM, T1
	VXOR XL, T0, XL
	VXOR XH, T1, XH

	VSLDOI $8, XL, XL, XL
	VXOR XL, T2, XL

	VSLDOI $8, XL, XL, T1 // 2nd reduction phase
	VPMSUMD XL, XC2, XL
	VXOR T1, XH, T1
	VXOR XL, T1, IN1

	VSLDOI $8, IN1, IN1, H2
	VSLDOI $8, ZERO, H2, H2L
	VSLDOI $8, H2, ZERO, H2H

	STXVD2X VH2L, (XIP+R8) // save H^2
	MOVD $0x70, R8
	STXVD2X VH2, (XIP+R9)
	MOVD $0x80, R9
	STXVD2X VH2H, (XIP+R10)
	MOVD $0x90, R10

	VPMSUMD IN, H2L, XL   // H.lo·H^2.lo
	VPMSUMD IN1, H2L, XL1 // H^2.lo·H^2.lo
	VPMSUMD IN, H2, XM    // H.hi·H^2.lo+H.lo·H^2.hi
	VPMSUMD IN1, H2, XM1  // H^2.hi·H^2.lo+H^2.lo·H^2.hi
	VPMSUMD IN, H2H, XH   // H.hi·H^2.hi
	VPMSUMD IN1, H2H, XH1 // H^2.hi·H^2.hi

	VPMSUMD XL, XC2, T2  // 1st reduction phase
	VPMSUMD XL1, XC2, HH // 1st reduction phase

	VSLDOI $8, XM, ZERO, T0
	VSLDOI $8, ZERO, XM, T1
	VSLDOI $8, XM1, ZERO, HL
	VSLDOI $8, ZERO, XM1, H
	VXOR XL, T0, XL
	VXOR XH, T1, XH
	VXOR XL1, HL, XL1
	VXOR XH1, H, XH1

	VSLDOI $8, XL, XL, XL
	VSLDOI $8, XL1, XL1, XL1
	VXOR XL, T2, XL
	VXOR XL1, HH, XL1

	VSLDOI $8, XL, XL, T1  // 2nd reduction phase
	VSLDOI $8, XL1, XL1, H // 2nd reduction phase
	VPMSUMD XL, XC2, XL
	VPMSUMD XL1, XC2, XL1
	VXOR T1, XH, T1
	VXOR H, XH1, H
	VXOR XL, T1, XL
	VXOR XL1, H, XL1

	VSLDOI $8, XL, XL, H
	VSLDOI $8, XL1, XL1, H2
	VSLDOI $8, ZERO, H, HL
	VSLDOI $8, H, ZERO, HH
	VSLDOI $8, ZERO, H2, H2L
	VSLDOI $8, H2, ZERO, H2H

	STXVD2X VHL, (XIP+R8) // save H^3
	MOVD $0xa0, R8
	STXVD2X VH, (XIP+R9)
	MOVD $0xb0, R9
	STXVD2X VHH, (XIP+R10)
	MOVD $0xc0, R10
	STXVD2X VH2L, (XIP+R8) // save H^4
	STXVD2X VH2, (XIP+R9)
	STXVD2X VH2H, (XIP+R10)

	RET
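// For reference, gcmInit fills productTable with the following layout
// (byte offsets), which gcmHash and gcmMul reload below:
//
//	0x00:        0xc2... reduction constant (XC2)
//	0x10 - 0x30: H.lo,   H,   H.hi
//	0x40 - 0x60: H^2.lo, H^2, H^2.hi
//	0x70 - 0x90: H^3.lo, H^3, H^3.hi
//	0xa0 - 0xc0: H^4.lo, H^4, H^4.hi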
// func gcmHash(output []byte, productTable *[256]byte, inp []byte, len int)
TEXT ·gcmHash(SB), NOSPLIT, $0-64
	MOVD output+0(FP), XIP
	MOVD productTable+24(FP), HTBL
	MOVD inp+32(FP), INP
	MOVD len+56(FP), LEN

	MOVD $0x10, R8
	MOVD $0x20, R9
	MOVD $0x30, R10
	LXVD2X (XIP)(R0), VXL // load Xi

	LXVD2X (HTBL)(R8), VHL // load pre-computed table
	MOVD $0x40, R8
	LXVD2X (HTBL)(R9), VH
	MOVD $0x50, R9
	LXVD2X (HTBL)(R10), VHH
	MOVD $0x60, R10
	LXVD2X (HTBL)(R0), VXC2
#ifdef GOARCH_ppc64le
	LVSL (R0)(R0), LEMASK
	VSPLTISB $0x07, T0
	VXOR LEMASK, T0, LEMASK
	VPERM XL, XL, LEMASK, XL
#endif
	VXOR ZERO, ZERO, ZERO

	CMPU LEN, $64
	BGE gcm_ghash_p8_4x

	LXVD2X (INP)(R0), VIN
	ADD $16, INP, INP
	SUBCCC $16, LEN, LEN
#ifdef GOARCH_ppc64le
	VPERM IN, IN, LEMASK, IN
#endif
	VXOR IN, XL, IN
	BEQ short

	LXVD2X (HTBL)(R8), VH2L // load H^2
	MOVD $16, R8
	LXVD2X (HTBL)(R9), VH2
	ADD LEN, INP, R9 // end of input
	LXVD2X (HTBL)(R10), VH2H

loop_2x:
	LXVD2X (INP)(R0), VIN1
#ifdef GOARCH_ppc64le
	VPERM IN1, IN1, LEMASK, IN1
#endif

	SUBC $32, LEN, LEN
	VPMSUMD IN, H2L, XL  // H^2.lo·Xi.lo
	VPMSUMD IN1, HL, XL1 // H.lo·Xi+1.lo
	SUBE R11, R11, R11   // borrow?-1:0
	VPMSUMD IN, H2, XM   // H^2.hi·Xi.lo+H^2.lo·Xi.hi
	VPMSUMD IN1, H, XM1  // H.hi·Xi+1.lo+H.lo·Xi+1.hi
	AND LEN, R11, R11
	VPMSUMD IN, H2H, XH  // H^2.hi·Xi.hi
	VPMSUMD IN1, HH, XH1 // H.hi·Xi+1.hi
	ADD R11, INP, INP

	VXOR XL, XL1, XL
	VXOR XM, XM1, XM

	VPMSUMD XL, XC2, T2 // 1st reduction phase

	VSLDOI $8, XM, ZERO, T0
	VSLDOI $8, ZERO, XM, T1
	VXOR XH, XH1, XH
	VXOR XL, T0, XL
	VXOR XH, T1, XH

	VSLDOI $8, XL, XL, XL
	VXOR XL, T2, XL
	LXVD2X (INP)(R8), VIN
	ADD $32, INP, INP

	VSLDOI $8, XL, XL, T1 // 2nd reduction phase
	VPMSUMD XL, XC2, XL
#ifdef GOARCH_ppc64le
	VPERM IN, IN, LEMASK, IN
#endif
	VXOR T1, XH, T1
	VXOR IN, T1, IN
	VXOR IN, XL, IN
	CMP R9, INP
	BGT loop_2x // done yet?
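// Each path below (single block, loop_2x above, and the 4x code that
// follows) reduces the 256-bit VPMSUMD products modulo the GHASH
// polynomial x^128 + x^7 + x^2 + x + 1; the two "reduction phase"
// VPMSUMD steps against the 0xc2... constant fold the high half of
// each product back into 128 bits.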
	CMPWU LEN, $0
	BNE even

short:
	VPMSUMD IN, HL, XL // H.lo·Xi.lo
	VPMSUMD IN, H, XM  // H.hi·Xi.lo+H.lo·Xi.hi
	VPMSUMD IN, HH, XH // H.hi·Xi.hi

	VPMSUMD XL, XC2, T2 // 1st reduction phase

	VSLDOI $8, XM, ZERO, T0
	VSLDOI $8, ZERO, XM, T1
	VXOR XL, T0, XL
	VXOR XH, T1, XH

	VSLDOI $8, XL, XL, XL
	VXOR XL, T2, XL

	VSLDOI $8, XL, XL, T1 // 2nd reduction phase
	VPMSUMD XL, XC2, XL
	VXOR T1, XH, T1

even:
	VXOR XL, T1, XL
#ifdef GOARCH_ppc64le
	VPERM XL, XL, LEMASK, XL
#endif
	STXVD2X VXL, (XIP+R0)

	OR R12, R12, R12 // write out Xi
	RET

gcm_ghash_p8_4x:
	LVSL (R8)(R0), T0 // 0x0001..0e0f
	MOVD $0x70, R8
	LXVD2X (HTBL)(R9), VH2
	MOVD $0x80, R9
	VSPLTISB $8, T1 // 0x0808..0808
	MOVD $0x90, R10
	LXVD2X (HTBL)(R8), VH3L // load H^3
	MOVD $0xa0, R8
	LXVD2X (HTBL)(R9), VH3
	MOVD $0xb0, R9
	LXVD2X (HTBL)(R10), VH3H
	MOVD $0xc0, R10
	LXVD2X (HTBL)(R8), VH4L // load H^4
	MOVD $0x10, R8
	LXVD2X (HTBL)(R9), VH4
	MOVD $0x20, R9
	LXVD2X (HTBL)(R10), VH4H
	MOVD $0x30, R10

	VSLDOI $8, ZERO, T1, T2    // 0x0000..0808
	VADDUBM T0, T2, HIPERM     // 0x0001..1617
	VADDUBM T1, HIPERM, LOPERM // 0x0809..1e1f

	SRD $4, LEN, LEN // count 16-byte blocks; this allows using the sign bit as carry

	LXVD2X (INP)(R0), VIN0 // load input
	LXVD2X (INP)(R8), VIN1
	SUBCCC $8, LEN, LEN
	LXVD2X (INP)(R9), VIN2
	LXVD2X (INP)(R10), VIN3
	ADD $0x40, INP, INP
#ifdef GOARCH_ppc64le
	VPERM IN0, IN0, LEMASK, IN0
	VPERM IN1, IN1, LEMASK, IN1
	VPERM IN2, IN2, LEMASK, IN2
	VPERM IN3, IN3, LEMASK, IN3
#endif

	VXOR IN0, XL, XH

	VPMSUMD IN1, H3L, XL1
	VPMSUMD IN1, H3, XM1
	VPMSUMD IN1, H3H, XH1

	VPERM H2, H, HIPERM, H21L
	VPERM IN2, IN3, LOPERM, T0
	VPERM H2, H, LOPERM, H21H
	VPERM IN2, IN3, HIPERM, T1
	VPMSUMD IN2, H2, XM2  // H^2.lo·Xi+2.hi+H^2.hi·Xi+2.lo
	VPMSUMD T0, H21L, XL3 // H^2.lo·Xi+2.lo+H.lo·Xi+3.lo
	VPMSUMD IN3, H, XM3   // H.hi·Xi+3.lo+H.lo·Xi+3.hi
	VPMSUMD T1, H21H, XH3 // H^2.hi·Xi+2.hi+H.hi·Xi+3.hi

	VXOR XM2, XM1, XM2
	VXOR XL3, XL1, XL3
	VXOR XM3, XM2, XM3
	VXOR XH3, XH1, XH3

	BLT tail_4x

loop_4x:
	LXVD2X (INP)(R0), VIN0
	LXVD2X (INP)(R8), VIN1
	SUBCCC $4, LEN, LEN
	LXVD2X (INP)(R9), VIN2
	LXVD2X (INP)(R10), VIN3
	ADD $0x40, INP, INP
#ifdef GOARCH_ppc64le
	VPERM IN1, IN1, LEMASK, IN1
	VPERM IN2, IN2, LEMASK, IN2
	VPERM IN3, IN3, LEMASK, IN3
	VPERM IN0, IN0, LEMASK, IN0
#endif

	VPMSUMD XH, H4L, XL // H^4.lo·Xi.lo
	VPMSUMD XH, H4, XM  // H^4.hi·Xi.lo+H^4.lo·Xi.hi
	VPMSUMD XH, H4H, XH // H^4.hi·Xi.hi
	VPMSUMD IN1, H3L, XL1
	VPMSUMD IN1, H3, XM1
	VPMSUMD IN1, H3H, XH1

	VXOR XL, XL3, XL
	VXOR XM, XM3, XM
	VXOR XH, XH3, XH
	VPERM IN2, IN3, LOPERM, T0
	VPERM IN2, IN3, HIPERM, T1

	VPMSUMD XL, XC2, T2   // 1st reduction phase
	VPMSUMD T0, H21L, XL3 // H.lo·Xi+3.lo+H^2.lo·Xi+2.lo
	VPMSUMD T1, H21H, XH3 // H.hi·Xi+3.hi+H^2.hi·Xi+2.hi

	VSLDOI $8, XM, ZERO, T0
	VSLDOI $8, ZERO, XM, T1
	VXOR XL, T0, XL
	VXOR XH, T1, XH

	VSLDOI $8, XL, XL, XL
	VXOR XL, T2, XL

	VSLDOI $8, XL, XL, T1 // 2nd reduction phase
	VPMSUMD IN2, H2, XM2  // H^2.hi·Xi+2.lo+H^2.lo·Xi+2.hi
	VPMSUMD IN3, H, XM3   // H.hi·Xi+3.lo+H.lo·Xi+3.hi
	VPMSUMD XL, XC2, XL

	VXOR XL3, XL1, XL3
	VXOR XH3, XH1, XH3
	VXOR XH, IN0, XH
	VXOR XM2, XM1, XM2
	VXOR XH, T1, XH
	VXOR XM3, XM2, XM3
	VXOR XH, XL, XH
	BGE loop_4x
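// loop_4x folds four blocks per iteration using the precomputed
// powers of H, evaluating roughly
//
//	Xi = ((Xi^X0)·H^4 ^ X1·H^3 ^ X2·H^2 ^ X3·H) mod P(x)
//
// so only one reduction is needed per four input blocks.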
tail_4x:
	VPMSUMD XH, H4L, XL // H^4.lo·Xi.lo
	VPMSUMD XH, H4, XM  // H^4.hi·Xi.lo+H^4.lo·Xi.hi
	VPMSUMD XH, H4H, XH // H^4.hi·Xi.hi

	VXOR XL, XL3, XL
	VXOR XM, XM3, XM

	VPMSUMD XL, XC2, T2 // 1st reduction phase

	VSLDOI $8, XM, ZERO, T0
	VSLDOI $8, ZERO, XM, T1
	VXOR XH, XH3, XH
	VXOR XL, T0, XL
	VXOR XH, T1, XH

	VSLDOI $8, XL, XL, XL
	VXOR XL, T2, XL

	VSLDOI $8, XL, XL, T1 // 2nd reduction phase
	VPMSUMD XL, XC2, XL
	VXOR T1, XH, T1
	VXOR XL, T1, XL

	ADDCCC $4, LEN, LEN
	BEQ done_4x

	LXVD2X (INP)(R0), VIN0
	CMPU LEN, $2
	MOVD $-4, LEN
	BLT one
	LXVD2X (INP)(R8), VIN1
	BEQ two

three:
	LXVD2X (INP)(R9), VIN2
#ifdef GOARCH_ppc64le
	VPERM IN0, IN0, LEMASK, IN0
	VPERM IN1, IN1, LEMASK, IN1
	VPERM IN2, IN2, LEMASK, IN2
#endif

	VXOR IN0, XL, XH
	VOR H3L, H3L, H4L
	VOR H3, H3, H4
	VOR H3H, H3H, H4H

	VPERM IN1, IN2, LOPERM, T0
	VPERM IN1, IN2, HIPERM, T1
	VPMSUMD IN1, H2, XM2  // H^2.lo·Xi+1.hi+H^2.hi·Xi+1.lo
	VPMSUMD IN2, H, XM3   // H.hi·Xi+2.lo+H.lo·Xi+2.hi
	VPMSUMD T0, H21L, XL3 // H^2.lo·Xi+1.lo+H.lo·Xi+2.lo
	VPMSUMD T1, H21H, XH3 // H^2.hi·Xi+1.hi+H.hi·Xi+2.hi

	VXOR XM3, XM2, XM3
	JMP tail_4x

two:
#ifdef GOARCH_ppc64le
	VPERM IN0, IN0, LEMASK, IN0
	VPERM IN1, IN1, LEMASK, IN1
#endif

	VXOR IN0, XL, XH
	VPERM ZERO, IN1, LOPERM, T0
	VPERM ZERO, IN1, HIPERM, T1

	VSLDOI $8, ZERO, H2, H4L
	VOR H2, H2, H4
	VSLDOI $8, H2, ZERO, H4H

	VPMSUMD T0, H21L, XL3 // H.lo·Xi+1.lo
	VPMSUMD IN1, H, XM3   // H.hi·Xi+1.lo+H.lo·Xi+1.hi
	VPMSUMD T1, H21H, XH3 // H.hi·Xi+1.hi

	JMP tail_4x

one:
#ifdef GOARCH_ppc64le
	VPERM IN0, IN0, LEMASK, IN0
#endif

	VSLDOI $8, ZERO, H, H4L
	VOR H, H, H4
	VSLDOI $8, H, ZERO, H4H

	VXOR IN0, XL, XH
	VXOR XL3, XL3, XL3
	VXOR XM3, XM3, XM3
	VXOR XH3, XH3, XH3

	JMP tail_4x

done_4x:
#ifdef GOARCH_ppc64le
	VPERM XL, XL, LEMASK, XL
#endif
	STXVD2X VXL, (XIP+R0) // write out Xi
	RET

// func gcmMul(output []byte, productTable *[256]byte)
TEXT ·gcmMul(SB), NOSPLIT, $0-32
	MOVD output+0(FP), XIP
	MOVD productTable+24(FP), HTBL

	MOVD $0x10, R8
	MOVD $0x20, R9
	MOVD $0x30, R10
	LXVD2X (XIP)(R0), VIN // load Xi

	LXVD2X (HTBL)(R8), VHL // load pre-computed table
	LXVD2X (HTBL)(R9), VH
	LXVD2X (HTBL)(R10), VHH
	LXVD2X (HTBL)(R0), VXC2
#ifdef GOARCH_ppc64le
	LVSL (R0)(R0), LEMASK
	VSPLTISB $0x07, T0
	VXOR LEMASK, T0, LEMASK
	VPERM IN, IN, LEMASK, IN
#endif
	VXOR ZERO, ZERO, ZERO

	VPMSUMD IN, HL, XL // H.lo·Xi.lo
	VPMSUMD IN, H, XM  // H.hi·Xi.lo+H.lo·Xi.hi
	VPMSUMD IN, HH, XH // H.hi·Xi.hi

	VPMSUMD XL, XC2, T2 // 1st reduction phase

	VSLDOI $8, XM, ZERO, T0
	VSLDOI $8, ZERO, XM, T1
	VXOR XL, T0, XL
	VXOR XH, T1, XH

	VSLDOI $8, XL, XL, XL
	VXOR XL, T2, XL

	VSLDOI $8, XL, XL, T1 // 2nd reduction phase
	VPMSUMD XL, XC2, XL
	VXOR T1, XH, T1
	VXOR XL, T1, XL

#ifdef GOARCH_ppc64le
	VPERM XL, XL, LEMASK, XL
#endif
	STXVD2X VXL, (XIP+R0) // write out Xi
	RET
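// A Go-style sketch of how a caller is expected to combine these entry
// points (illustration only; see the prototypes above for the real
// signatures):
//
//	gcmInit(&productTable, hashKey)                // precompute H^1..H^4
//	gcmHash(xi[:], &productTable, data, len(data)) // fold data into Xi
//	gcmMul(xi[:], &productTable)                   // a single Xi·H step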
#define BLK_INP R3
#define BLK_OUT R4
#define BLK_KEY R5
#define KEY_LEN R6
#define BLK_IDX R7
#define IDX R8
#define IN_LEN R9
#define COUNTER R10
#define CONPTR R14
#define MASK V5

// Implementation of the counterCrypt function in assembler.
// The original loop is unrolled to allow multiple encryption
// streams to be done in parallel, which is achieved by interleaving
// vcipher instructions from each stream. This is also referred to as
// stitching, and provides significant performance improvements.
// Some macros are defined which enable execution for big or little
// endian as well as different ISA targets.
//func (g *gcmAsm) counterCrypt(out, in []byte, counter *[gcmBlockSize]byte, key[gcmBlockSize]uint32)
//func counterCryptASM(xr, out, in, counter, key)
TEXT ·counterCryptASM(SB), NOSPLIT, $16-72
	MOVD xr(FP), KEY_LEN
	MOVD out+8(FP), BLK_OUT
	MOVD out_len+16(FP), R8
	MOVD in+32(FP), BLK_INP
	MOVD in_len+40(FP), IN_LEN
	MOVD counter+56(FP), COUNTER
	MOVD key+64(FP), BLK_KEY

// Set up the permute string when needed.
#ifdef NEEDS_ESPERM
	MOVD $·rcon(SB), R14
	LVX (R14), ESPERM // Permute value for P8_ macros.
#endif
	SETUP_COUNTER               // V30 counter, V31 BE {0, 0, 0, 1}
	LOAD_KEYS(BLK_KEY, KEY_LEN) // VS1 - VS10/12/14 based on key size
	CMP IN_LEN, $128
	BLT block64
block128_loop:
	// Do 8 encryptions in parallel by setting
	// input values in V15-V22 and executing
	// vcipher on the updated value and the keys.
	GEN_VCIPHER_8_INPUTS
	VCIPHER_8X1_KEY(VS1)
	VCIPHER_8X1_KEY(VS2)
	VCIPHER_8X1_KEY(VS3)
	VCIPHER_8X1_KEY(VS4)
	VCIPHER_8X1_KEY(VS5)
	VCIPHER_8X1_KEY(VS6)
	VCIPHER_8X1_KEY(VS7)
	VCIPHER_8X1_KEY(VS8)
	VCIPHER_8X1_KEY(VS9)
	// Additional encryptions are done based on
	// the key length, with the last key moved
	// to V23 for use with VCIPHERLAST.
	// CR2 = CMP key_len, $12
	XXLOR VS10, VS10, V23
	BLT CR2, block128_last // key_len = 10
	VCIPHER_8X1_KEY(VS10)
	VCIPHER_8X1_KEY(VS11)
	XXLOR VS12, VS12, V23
	BEQ CR2, block128_last // key_len = 12
	VCIPHER_8X1_KEY(VS12)
	VCIPHER_8X1_KEY(VS13)
	XXLOR VS14, VS14, V23 // key_len = 14
block128_last:
	// vcipher encryptions are in V15-V22 at this
	// point, with vcipherlast remaining to be done.
	// Load the input block into V1-V8, setting index offsets
	// in R16-R22 to use with the STORE.
	LOAD_INPUT_BLOCK128(BLK_INP)
	// Do VCIPHERLAST with the last key for each encryption
	// stream and XOR the result with the corresponding
	// value from the input block.
	VCIPHERLAST8_XOR_INPUT
	// Store the results (8*16) and update BLK_OUT by 128.
	STORE_OUTPUT_BLOCK128(BLK_OUT)
	ADD $-128, IN_LEN // decrement input size
	CMP IN_LEN, $128  // check if >= blocksize
	BGE block128_loop // next input block
	CMP IN_LEN, $0
	BEQ done
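// Input shorter than 128 bytes drains through progressively narrower
// paths below: one 4-way (64-byte) pass, then single 16-byte blocks,
// then a final partial block.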
block64:
	CMP IN_LEN, $64 // Check if >= 64
	BLT block16_loop
	// Do 4 encryptions in parallel by setting
	// input values in V15-V18 and executing
	// vcipher on the updated value and the keys.
	GEN_VCIPHER_4_INPUTS
	VCIPHER_4X1_KEY(VS1)
	VCIPHER_4X1_KEY(VS2)
	VCIPHER_4X1_KEY(VS3)
	VCIPHER_4X1_KEY(VS4)
	VCIPHER_4X1_KEY(VS5)
	VCIPHER_4X1_KEY(VS6)
	VCIPHER_4X1_KEY(VS7)
	VCIPHER_4X1_KEY(VS8)
	VCIPHER_4X1_KEY(VS9)
	// Check the key length based on CR2.
	// Move the last key to V23 for use with the later vcipherlast.
	XXLOR VS10, VS10, V23
	BLT CR2, block64_last // key_len = 10
	VCIPHER_4X1_KEY(VS10) // Encrypt with the next 2 keys
	VCIPHER_4X1_KEY(VS11)
	XXLOR VS12, VS12, V23
	BEQ CR2, block64_last // key_len = 12
	VCIPHER_4X1_KEY(VS12) // Encrypt with the last 2 keys
	VCIPHER_4X1_KEY(VS13)
	XXLOR VS14, VS14, V23 // key_len = 14
block64_last:
	LOAD_INPUT_BLOCK64(BLK_INP) // Load 64 bytes of input
	// Do VCIPHERLAST with the last key for each encryption
	// stream and XOR the result with the corresponding
	// value from the input block.
	VCIPHERLAST4_XOR_INPUT
	// Store the results (4*16) and update BLK_OUT by 64.
	STORE_OUTPUT_BLOCK64(BLK_OUT)
	ADD $-64, IN_LEN // decrement input block length
	CMP IN_LEN, $0   // check for remaining length
	BEQ done
block16_loop:
	CMP IN_LEN, $16 // More input?
	BLT final_block // If not, then handle the partial block
	// Single encryption, no stitching
	GEN_VCIPHER_INPUT     // Generate input value for a single encryption
	VCIPHER_1X9_KEYS(V15) // Encrypt V15 value with 9 keys
	XXLOR VS10, VS10, V23 // Last key -> V23 for the later vcipherlast
	// Key length based on CR2. (LT=10, EQ=12, GT=14)
	BLT CR2, block16_last             // Finish for key size 10
	VCIPHER_1X2_KEYS(V15, VS10, VS11) // Encrypt V15 with 2 more keys
	XXLOR VS12, VS12, V23             // Last key -> V23 for the later vcipherlast
	BEQ CR2, block16_last             // Finish for key size 12
	VCIPHER_1X2_KEYS(V15, VS12, VS13) // Encrypt V15 with the last 2 keys
	XXLOR VS14, VS14, V23             // Last key -> V23 for vcipherlast with key size 14
block16_last:
	P8_LXVB16X(BLK_INP, R0, V1) // Load input
	VCIPHERLAST V15, V23, V15   // Final round with the last key in V23
	XXLXOR V15, V1, V1          // XOR with input
	P8_STXVB16X(V1,R0,BLK_OUT)  // Store final encryption value to output
	ADD $16, BLK_INP            // Increment input pointer
	ADD $16, BLK_OUT            // Increment output pointer
	ADD $-16, IN_LEN            // Decrement input length
	BR block16_loop             // Check for next
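// The partial (< 16 byte) tail handled below has two variants: POWER10
// uses the length-controlled LXVLL/STXVLL instructions, while earlier
// ISAs stage the keystream block on the stack at 32(R1) and XOR it
// into the output 8, 4, 2 and 1 bytes at a time.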
final_block:
	CMP IN_LEN, $0
	BEQ done
	GEN_VCIPHER_INPUT     // Generate input value for the partial encryption
	VCIPHER_1X9_KEYS(V15) // Encrypt V15 with 9 keys
	XXLOR VS10, VS10, V23 // Save the possible last key
	BLT CR2, final_block_last
	VCIPHER_1X2_KEYS(V15, VS10, VS11) // Encrypt V15 with the next 2 keys
	XXLOR VS12, VS12, V23             // Save the possible last key
	BEQ CR2, final_block_last
	VCIPHER_1X2_KEYS(V15, VS12, VS13) // Encrypt V15 with the last 2 keys
	XXLOR VS14, VS14, V23             // Save the last key
final_block_last:
	VCIPHERLAST V15, V23, V15 // Finish the encryption
#ifdef GOPPC64_power10
	// Set up the length.
	SLD $56, IN_LEN, R17
	LXVLL BLK_INP, R17, V25
	VXOR V25, V15, V25
	STXVLL V25, BLK_OUT, R17
#else
	ADD $32, R1, MASK_PTR
	MOVD $0, R16
	P8_STXVB16X(V15, MASK_PTR, R0)
	CMP IN_LEN, $8
	BLT next4
	MOVD 0(MASK_PTR), R14
	MOVD 0(BLK_INP), R15
	XOR R14, R15, R14
	MOVD R14, 0(BLK_OUT)
	ADD $8, R16
	ADD $-8, IN_LEN
next4:
	CMP IN_LEN, $4
	BLT next2
	MOVWZ (BLK_INP)(R16), R15
	MOVWZ (MASK_PTR)(R16), R14
	XOR R14, R15, R14
	MOVW R14, (R16)(BLK_OUT)
	ADD $4, R16
	ADD $-4, IN_LEN
next2:
	CMP IN_LEN, $2
	BLT next1
	MOVHZ (BLK_INP)(R16), R15
	MOVHZ (MASK_PTR)(R16), R14
	XOR R14, R15, R14
	MOVH R14, (R16)(BLK_OUT)
	ADD $2, R16
	ADD $-2, IN_LEN
next1:
	CMP IN_LEN, $1
	BLT done
	MOVBZ (MASK_PTR)(R16), R14
	MOVBZ (BLK_INP)(R16), R15
	XOR R14, R15, R14
	MOVB R14, (R16)(BLK_OUT)
#endif
done:
	// Save the updated counter value.
	P8_STXVB16X(V30, COUNTER, R0)
	// Clear the keys.
	XXLXOR VS0, VS0, VS0
	XXLXOR VS1, VS1, VS1
	XXLXOR VS2, VS2, VS2
	XXLXOR VS3, VS3, VS3
	XXLXOR VS4, VS4, VS4
	XXLXOR VS5, VS5, VS5
	XXLXOR VS6, VS6, VS6
	XXLXOR VS7, VS7, VS7
	XXLXOR VS8, VS8, VS8
	XXLXOR VS9, VS9, VS9
	XXLXOR VS10, VS10, VS10
	XXLXOR VS11, VS11, VS11
	XXLXOR VS12, VS12, VS12
	XXLXOR VS13, VS13, VS13
	XXLXOR VS14, VS14, VS14
	RET