1// Copyright 2019 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5//go:build (ppc64 || ppc64le) && !purego
6
7// Portions based on CRYPTOGAMS code with the following comment:
8// # ====================================================================
9// # Written by Andy Polyakov <[email protected]> for the OpenSSL
10// # project. The module is, however, dual licensed under OpenSSL and
11// # CRYPTOGAMS licenses depending on where you obtain it. For further
12// # details see http://www.openssl.org/~appro/cryptogams/.
13// # ====================================================================
14
15// The implementations for gcmHash, gcmInit and gcmMul are based on the generated asm
16// from the script https://github.com/dot-asm/cryptogams/blob/master/ppc/ghashp8-ppc.pl
17// from commit d47afb3c.
18
19// Changes were made due to differences in the ABI and some register usage.
20// Some arguments were changed due to the way the Go code passes them.
21
22// Portions that use the stitched AES-GCM approach in counterCryptASM
23// are based on code found in
24// https://github.com/IBM/ipcri/blob/main/aes/p10_aes_gcm.s
25
26#include "textflag.h"
27
28#define XIP    R3
29#define HTBL   R4
30#define INP    R5
31#define LEN    R6
32
33#define XL     V0
34#define XM     V1
35#define XH     V2
36#define IN     V3
37#define ZERO   V4
38#define T0     V5
39#define T1     V6
40#define T2     V7
41#define XC2    V8
42#define H      V9
43#define HH     V10
44#define HL     V11
45#define LEMASK V12
46#define XL1    V13
47#define XM1    V14
48#define XH1    V15
49#define IN1    V16
50#define H2     V17
51#define H2H    V18
52#define H2L    V19
53#define XL3    V20
54#define XM2    V21
55#define IN2    V22
56#define H3L    V23
57#define H3     V24
58#define H3H    V25
59#define XH3    V26
60#define XM3    V27
61#define IN3    V28
62#define H4L    V29
63#define H4     V30
64#define H4H    V31
65
66#define IN0    IN
67#define H21L   HL
68#define H21H   HH
69#define LOPERM H2L
70#define HIPERM H2H
71
72#define VXL    VS32
73#define VIN    VS35
74#define VXC2   VS40
75#define VH     VS41
76#define VHH    VS42
77#define VHL    VS43
78#define VIN1   VS48
79#define VH2    VS49
80#define VH2H   VS50
81#define VH2L   VS51
82
83#define VIN2   VS54
84#define VH3L   VS55
85#define VH3    VS56
86#define VH3H   VS57
87#define VIN3   VS60
88#define VH4L   VS61
89#define VH4    VS62
90#define VH4H   VS63
91
92#define VIN0   VIN
93
94#define ESPERM V10
95#define TMP2 V11
96
// The following macros provide load and store
// implementations appropriate for the target
// endianness and ISA (POWER8 vs POWER9).
100#ifdef GOARCH_ppc64le
101#  ifdef GOPPC64_power9
102#define P8_LXVB16X(RA,RB,VT)   LXVB16X (RA)(RB), VT
103#define P8_STXVB16X(VS,RA,RB)  STXVB16X VS, (RA)(RB)
104#  else
105#define NEEDS_ESPERM
106#define P8_LXVB16X(RA,RB,VT) \
	LXVD2X  (RA+RB), VT; \
108	VPERM	VT, VT, ESPERM, VT
109
110#define P8_STXVB16X(VS,RA,RB) \
111	VPERM	VS, VS, ESPERM, TMP2; \
112	STXVD2X TMP2, (RA+RB)
113
114#  endif
115#else
116#define P8_LXVB16X(RA,RB,VT) \
117	LXVD2X  (RA+RB), VT
118
119#define P8_STXVB16X(VS,RA,RB) \
120	STXVD2X VS, (RA+RB)
121
122#endif
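
// For reference, P8_LXVB16X(RA,RB,VT) loads the 16 bytes at RA+RB so that
// the first byte in memory becomes the most significant byte of VT,
// independent of host endianness (P8_STXVB16X is the inverse). A minimal
// Go sketch of the same view of memory, using encoding/binary:
//
//	func loadBE128(p []byte) (hi, lo uint64) {
//		hi = binary.BigEndian.Uint64(p[0:8])
//		lo = binary.BigEndian.Uint64(p[8:16])
//		return hi, lo
//	}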
123
124#define MASK_PTR   R8
125
126#define MASKV   V0
127#define INV     V1
128
129// The following macros are used for
130// the stitched implementation within
131// counterCryptASM.
132
133// Load the initial GCM counter value
134// in V30 and set up the counter increment
135// in V31
136#define SETUP_COUNTER \
137	P8_LXVB16X(COUNTER, R0, V30); \
138	VSPLTISB $1, V28; \
139	VXOR V31, V31, V31; \
140	VSLDOI $1, V31, V28, V31
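
// For reference, the VADDUWM with V31 = {0,...,0,1} performs GCM's 32-bit
// counter increment: only the last (big-endian) word of the 16-byte
// counter block is incremented, modulo 2^32. A minimal Go sketch:
//
//	func gcmInc32(counter *[16]byte) {
//		n := binary.BigEndian.Uint32(counter[12:])
//		binary.BigEndian.PutUint32(counter[12:], n+1)
//	}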
141
142// These macros set up the initial value
143// for a single encryption, or 4 or 8
144// stitched encryptions implemented
145// with interleaving vciphers.
146//
147// The input value for each encryption
148// is generated by XORing the counter
149// from V30 with the first key in VS0
150// and incrementing the counter.
151//
152// Single encryption in V15
153#define GEN_VCIPHER_INPUT \
	XXLOR VS0, VS0, V29; \
155	VXOR V30, V29, V15; \
156	VADDUWM V30, V31, V30
157
158// 4 encryptions in V15 - V18
159#define GEN_VCIPHER_4_INPUTS \
160	XXLOR VS0, VS0, V29; \
161	VXOR V30, V29, V15; \
162	VADDUWM V30, V31, V30; \
163	VXOR V30, V29, V16; \
164	VADDUWM V30, V31, V30; \
165	VXOR V30, V29, V17; \
166	VADDUWM V30, V31, V30; \
167	VXOR V30, V29, V18; \
168	VADDUWM V30, V31, V30
169
170// 8 encryptions in V15 - V22
171#define GEN_VCIPHER_8_INPUTS \
172	XXLOR VS0, VS0, V29; \
173	VXOR V30, V29, V15; \
174	VADDUWM V30, V31, V30; \
175	VXOR V30, V29, V16; \
176	VADDUWM V30, V31, V30; \
177	VXOR V30, V29, V17; \
178	VADDUWM V30, V31, V30; \
179	VXOR V30, V29, V18; \
180	VADDUWM V30, V31, V30; \
181	VXOR V30, V29, V19; \
182	VADDUWM V30, V31, V30; \
183	VXOR V30, V29, V20; \
184	VADDUWM V30, V31, V30; \
185	VXOR V30, V29, V21; \
186	VADDUWM V30, V31, V30; \
187	VXOR V30, V29, V22; \
188	VADDUWM V30, V31, V30
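
// In Go terms, each of the macros above prepares n counter blocks with the
// round 0 key already applied, roughly (a sketch; xor16, roundKey0 and
// gcmInc32 are illustrative helpers, not symbols defined in this package):
//
//	for i := 0; i < n; i++ {
//		inputs[i] = xor16(counter, roundKey0) // AddRoundKey for round 0
//		gcmInc32(&counter)
//	}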
189
// Load the round keys to be used for
// encryption based on key_len.
// Keys end up in VS0 - VS14
// depending on key_len.
// Valid key sizes are verified here;
// an unsupported key_len deliberately
// crashes (store to address 0). CR2 is
// set and used throughout to check key_len.
197#define LOAD_KEYS(blk_key, key_len) \
198	MOVD	$16, R16; \
199	MOVD	$32, R17; \
200	MOVD	$48, R18; \
201	MOVD	$64, R19; \
202	LXVD2X (blk_key)(R0), VS0; \
203	LXVD2X (blk_key)(R16), VS1; \
204	LXVD2X (blk_key)(R17), VS2; \
205	LXVD2X (blk_key)(R18), VS3; \
206	LXVD2X (blk_key)(R19), VS4; \
207	ADD $64, R16; \
208	ADD $64, R17; \
209	ADD $64, R18; \
210	ADD $64, R19; \
211	LXVD2X (blk_key)(R16), VS5; \
212	LXVD2X (blk_key)(R17), VS6; \
213	LXVD2X (blk_key)(R18), VS7; \
214	LXVD2X (blk_key)(R19), VS8; \
215	ADD $64, R16; \
216	ADD $64, R17; \
217	ADD $64, R18; \
218	ADD $64, R19; \
219	LXVD2X (blk_key)(R16), VS9; \
220	LXVD2X (blk_key)(R17), VS10; \
221	CMP key_len, $12, CR2; \
222	CMP key_len, $10; \
223	BEQ keysLoaded; \
224	LXVD2X (blk_key)(R18), VS11; \
225	LXVD2X (blk_key)(R19), VS12; \
226	BEQ CR2, keysLoaded; \
227	ADD $64, R16; \
228	ADD $64, R17; \
229	LXVD2X (blk_key)(R16), VS13; \
230	LXVD2X (blk_key)(R17), VS14; \
231	CMP key_len, $14; \
232	BEQ keysLoaded; \
233	MOVD R0,0(R0); \
234keysLoaded:
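
// For reference, key_len above is the AES round count, so rounds+1
// 16-byte round keys are loaded. A sketch of the standard mapping:
//
//	rounds := map[int]int{16: 10, 24: 12, 32: 14}[len(key)] // AES-128/192/256
//	// rounds == 10 -> VS0..VS10, 12 -> VS0..VS12, 14 -> VS0..VS14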
235
236// Encrypt 1 (vin) with first 9
237// keys from VS1 - VS9.
238#define VCIPHER_1X9_KEYS(vin) \
239	XXLOR VS1, VS1, V23; \
240	XXLOR VS2, VS2, V24; \
241	XXLOR VS3, VS3, V25; \
242	XXLOR VS4, VS4, V26; \
243	XXLOR VS5, VS5, V27; \
244	VCIPHER vin, V23, vin; \
245	VCIPHER vin, V24, vin; \
246	VCIPHER vin, V25, vin; \
247	VCIPHER vin, V26, vin; \
248	VCIPHER vin, V27, vin; \
249	XXLOR VS6, VS6, V23; \
250	XXLOR VS7, VS7, V24; \
251	XXLOR VS8, VS8, V25; \
252	XXLOR VS9, VS9, V26; \
253	VCIPHER vin, V23, vin; \
254	VCIPHER vin, V24, vin; \
255	VCIPHER vin, V25, vin; \
256	VCIPHER	vin, V26, vin
257
258// Encrypt 1 value (vin) with
259// 2 specified keys
260#define VCIPHER_1X2_KEYS(vin, key1, key2) \
261	XXLOR key1, key1, V25; \
262	XXLOR key2, key2, V26; \
263	VCIPHER vin, V25, vin; \
264	VCIPHER vin, V26, vin
265
266// Encrypt 4 values in V15 - V18
267// with the specified key from
268// VS1 - VS9.
269#define VCIPHER_4X1_KEY(key) \
270	XXLOR key, key, V23; \
271	VCIPHER V15, V23, V15; \
272	VCIPHER V16, V23, V16; \
273	VCIPHER V17, V23, V17; \
274	VCIPHER V18, V23, V18
275
276// Encrypt 8 values in V15 - V22
277// with the specified key,
278// assuming it is a VSreg
279#define VCIPHER_8X1_KEY(key) \
280	XXLOR key, key, V23; \
281	VCIPHER V15, V23, V15; \
282	VCIPHER V16, V23, V16; \
283	VCIPHER V17, V23, V17; \
284	VCIPHER V18, V23, V18; \
285	VCIPHER V19, V23, V19; \
286	VCIPHER V20, V23, V20; \
287	VCIPHER V21, V23, V21; \
288	VCIPHER V22, V23, V22
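
// For reference, the "stitching" consists of advancing all 8 (or 4)
// streams by one round per round key before moving on to the next key,
// keeping independent vcipher chains in flight. A rough Go sketch
// (aesRound is an illustrative helper, not a symbol in this package):
//
//	for r := 1; r < rounds; r++ {
//		for i := range state { // 8 independent blocks
//			state[i] = aesRound(state[i], roundKeys[r])
//		}
//	}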
289
290// Load input block into V1-V4
291// in big endian order and
292// update blk_inp by 64.
293#define LOAD_INPUT_BLOCK64(blk_inp) \
294	MOVD $16, R16; \
295	MOVD $32, R17; \
296	MOVD $48, R18; \
297	P8_LXVB16X(blk_inp,R0,V1); \
298	P8_LXVB16X(blk_inp,R16,V2); \
299	P8_LXVB16X(blk_inp,R17,V3); \
300	P8_LXVB16X(blk_inp,R18,V4); \
301	ADD $64, blk_inp
302
// Load input block into V1-V8
// in big endian order and
// update blk_inp by 128.
306#define LOAD_INPUT_BLOCK128(blk_inp) \
307	MOVD $16, R16; \
308	MOVD $32, R17; \
309	MOVD $48, R18; \
310	MOVD $64, R19; \
311	MOVD $80, R20; \
312	MOVD $96, R21; \
313	MOVD $112, R22; \
314	P8_LXVB16X(blk_inp,R0,V1); \
315	P8_LXVB16X(blk_inp,R16,V2); \
316	P8_LXVB16X(blk_inp,R17,V3); \
317	P8_LXVB16X(blk_inp,R18,V4); \
318	P8_LXVB16X(blk_inp,R19,V5); \
319	P8_LXVB16X(blk_inp,R20,V6); \
320	P8_LXVB16X(blk_inp,R21,V7); \
321	P8_LXVB16X(blk_inp,R22,V8); \
322	ADD $128, blk_inp
323
324// Finish encryption on 8 streams and
325// XOR with input block
326#define VCIPHERLAST8_XOR_INPUT \
327	VCIPHERLAST     V15, V23, V15; \
328	VCIPHERLAST     V16, V23, V16; \
329	VCIPHERLAST     V17, V23, V17; \
330	VCIPHERLAST     V18, V23, V18; \
331	VCIPHERLAST     V19, V23, V19; \
332	VCIPHERLAST     V20, V23, V20; \
333	VCIPHERLAST     V21, V23, V21; \
334	VCIPHERLAST     V22, V23, V22; \
335	XXLXOR          V1, V15, V1; \
336	XXLXOR          V2, V16, V2; \
337	XXLXOR          V3, V17, V3; \
338	XXLXOR          V4, V18, V4; \
339	XXLXOR          V5, V19, V5; \
340	XXLXOR          V6, V20, V6; \
341	XXLXOR          V7, V21, V7; \
342	XXLXOR          V8, V22, V8
343
344// Finish encryption on 4 streams and
345// XOR with input block
346#define VCIPHERLAST4_XOR_INPUT \
347	VCIPHERLAST     V15, V23, V15; \
348	VCIPHERLAST     V16, V23, V16; \
349	VCIPHERLAST     V17, V23, V17; \
350	VCIPHERLAST     V18, V23, V18; \
351	XXLXOR          V1, V15, V1; \
352	XXLXOR          V2, V16, V2; \
353	XXLXOR          V3, V17, V3; \
354	XXLXOR          V4, V18, V4
355
// Store output block from V1-V8
// in big endian order and
// update blk_out by 128.
359#define STORE_OUTPUT_BLOCK128(blk_out) \
360	P8_STXVB16X(V1,blk_out,R0); \
361	P8_STXVB16X(V2,blk_out,R16); \
362	P8_STXVB16X(V3,blk_out,R17); \
363	P8_STXVB16X(V4,blk_out,R18); \
364	P8_STXVB16X(V5,blk_out,R19); \
365	P8_STXVB16X(V6,blk_out,R20); \
366	P8_STXVB16X(V7,blk_out,R21); \
367	P8_STXVB16X(V8,blk_out,R22); \
368	ADD $128, blk_out
369
// Store output block from V1-V4
// in big endian order and
// update blk_out by 64.
373#define STORE_OUTPUT_BLOCK64(blk_out) \
374	P8_STXVB16X(V1,blk_out,R0); \
375	P8_STXVB16X(V2,blk_out,R16); \
376	P8_STXVB16X(V3,blk_out,R17); \
377	P8_STXVB16X(V4,blk_out,R18); \
378	ADD $64, blk_out
379
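// gcmInit precomputes the reduction constant and the powers H, H^2, H^3
// and H^4 of the hash key (each stored together with its lo/hi halves for
// VPMSUMD), laid out in productTable by the stores below at:
//
//	0x00: 0xc2.. constant   0x40: H^2.lo   0x70: H^3.lo   0xa0: H^4.lo
//	0x10: H.lo              0x50: H^2      0x80: H^3      0xb0: H^4
//	0x20: H (twisted)       0x60: H^2.hi   0x90: H^3.hi   0xc0: H^4.hi
//	0x30: H.hi
//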
380// func gcmInit(productTable *[256]byte, h []byte)
381TEXT ·gcmInit(SB), NOSPLIT, $0-32
382	MOVD productTable+0(FP), XIP
383	MOVD h+8(FP), HTBL
384
385	MOVD   $0x10, R8
386	MOVD   $0x20, R9
387	MOVD   $0x30, R10
388	LXVD2X (HTBL)(R0), VH // Load H
389
390	VSPLTISB $-16, XC2           // 0xf0
391	VSPLTISB $1, T0              // one
392	VADDUBM  XC2, XC2, XC2       // 0xe0
393	VXOR     ZERO, ZERO, ZERO
394	VOR      XC2, T0, XC2        // 0xe1
395	VSLDOI   $15, XC2, ZERO, XC2 // 0xe1...
396	VSLDOI   $1, ZERO, T0, T1    // ...1
397	VADDUBM  XC2, XC2, XC2       // 0xc2...
398	VSPLTISB $7, T2
399	VOR      XC2, T1, XC2        // 0xc2....01
400	VSPLTB   $0, H, T1           // most significant byte
401	VSL      H, T0, H            // H<<=1
402	VSRAB    T1, T2, T1          // broadcast carry bit
403	VAND     T1, XC2, T1
404	VXOR     H, T1, IN           // twisted H
405
406	VSLDOI $8, IN, IN, H      // twist even more ...
407	VSLDOI $8, ZERO, XC2, XC2 // 0xc2.0
408	VSLDOI $8, ZERO, H, HL    // ... and split
409	VSLDOI $8, H, ZERO, HH
410
411	STXVD2X VXC2, (XIP+R0) // save pre-computed table
412	STXVD2X VHL, (XIP+R8)
413	MOVD    $0x40, R8
414	STXVD2X VH, (XIP+R9)
415	MOVD    $0x50, R9
416	STXVD2X VHH, (XIP+R10)
417	MOVD    $0x60, R10
418
419	VPMSUMD IN, HL, XL // H.lo·H.lo
420	VPMSUMD IN, H, XM  // H.hi·H.lo+H.lo·H.hi
	VPMSUMD IN, HH, XH // H.hi·H.hi
422
423	VPMSUMD XL, XC2, T2 // 1st reduction phase
424
425	VSLDOI $8, XM, ZERO, T0
426	VSLDOI $8, ZERO, XM, T1
427	VXOR   XL, T0, XL
428	VXOR   XH, T1, XH
429
430	VSLDOI $8, XL, XL, XL
431	VXOR   XL, T2, XL
432
433	VSLDOI  $8, XL, XL, T1 // 2nd reduction phase
434	VPMSUMD XL, XC2, XL
435	VXOR    T1, XH, T1
436	VXOR    XL, T1, IN1
437
438	VSLDOI $8, IN1, IN1, H2
439	VSLDOI $8, ZERO, H2, H2L
440	VSLDOI $8, H2, ZERO, H2H
441
442	STXVD2X VH2L, (XIP+R8)  // save H^2
443	MOVD    $0x70, R8
444	STXVD2X VH2, (XIP+R9)
445	MOVD    $0x80, R9
446	STXVD2X VH2H, (XIP+R10)
447	MOVD    $0x90, R10
448
449	VPMSUMD IN, H2L, XL   // H.lo·H^2.lo
450	VPMSUMD IN1, H2L, XL1 // H^2.lo·H^2.lo
451	VPMSUMD IN, H2, XM    // H.hi·H^2.lo+H.lo·H^2.hi
452	VPMSUMD IN1, H2, XM1  // H^2.hi·H^2.lo+H^2.lo·H^2.hi
453	VPMSUMD IN, H2H, XH   // H.hi·H^2.hi
454	VPMSUMD IN1, H2H, XH1 // H^2.hi·H^2.hi
455
456	VPMSUMD XL, XC2, T2  // 1st reduction phase
457	VPMSUMD XL1, XC2, HH // 1st reduction phase
458
459	VSLDOI $8, XM, ZERO, T0
460	VSLDOI $8, ZERO, XM, T1
461	VSLDOI $8, XM1, ZERO, HL
462	VSLDOI $8, ZERO, XM1, H
463	VXOR   XL, T0, XL
464	VXOR   XH, T1, XH
465	VXOR   XL1, HL, XL1
466	VXOR   XH1, H, XH1
467
468	VSLDOI $8, XL, XL, XL
469	VSLDOI $8, XL1, XL1, XL1
470	VXOR   XL, T2, XL
471	VXOR   XL1, HH, XL1
472
473	VSLDOI  $8, XL, XL, T1  // 2nd reduction phase
474	VSLDOI  $8, XL1, XL1, H // 2nd reduction phase
475	VPMSUMD XL, XC2, XL
476	VPMSUMD XL1, XC2, XL1
477	VXOR    T1, XH, T1
478	VXOR    H, XH1, H
479	VXOR    XL, T1, XL
480	VXOR    XL1, H, XL1
481
482	VSLDOI $8, XL, XL, H
483	VSLDOI $8, XL1, XL1, H2
484	VSLDOI $8, ZERO, H, HL
485	VSLDOI $8, H, ZERO, HH
486	VSLDOI $8, ZERO, H2, H2L
487	VSLDOI $8, H2, ZERO, H2H
488
489	STXVD2X VHL, (XIP+R8)   // save H^3
490	MOVD    $0xa0, R8
491	STXVD2X VH, (XIP+R9)
492	MOVD    $0xb0, R9
493	STXVD2X VHH, (XIP+R10)
494	MOVD    $0xc0, R10
495	STXVD2X VH2L, (XIP+R8)  // save H^4
496	STXVD2X VH2, (XIP+R9)
497	STXVD2X VH2H, (XIP+R10)
498
499	RET
500
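// gcmHash folds len bytes of input into the running GHASH state Xi:
// for each 16-byte block B, Xi = (Xi XOR B) * H in GF(2^128). Ignoring
// the 2-way and 4-way unrolling below, a minimal Go sketch of the same
// folding (gf128Mul and xor16 are illustrative helpers, not symbols
// defined in this package):
//
//	for len(inp) >= 16 {
//		xi = gf128Mul(xor16(xi, inp[:16]), h)
//		inp = inp[16:]
//	}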
501// func gcmHash(output []byte, productTable *[256]byte, inp []byte, len int)
502TEXT ·gcmHash(SB), NOSPLIT, $0-64
503	MOVD output+0(FP), XIP
504	MOVD productTable+24(FP), HTBL
505	MOVD inp+32(FP), INP
506	MOVD len+56(FP), LEN
507
508	MOVD   $0x10, R8
509	MOVD   $0x20, R9
510	MOVD   $0x30, R10
511	LXVD2X (XIP)(R0), VXL // load Xi
512
513	LXVD2X   (HTBL)(R8), VHL    // load pre-computed table
514	MOVD     $0x40, R8
515	LXVD2X   (HTBL)(R9), VH
516	MOVD     $0x50, R9
517	LXVD2X   (HTBL)(R10), VHH
518	MOVD     $0x60, R10
519	LXVD2X   (HTBL)(R0), VXC2
520#ifdef GOARCH_ppc64le
521	LVSL     (R0)(R0), LEMASK
522	VSPLTISB $0x07, T0
523	VXOR     LEMASK, T0, LEMASK
524	VPERM    XL, XL, LEMASK, XL
525#endif
526	VXOR     ZERO, ZERO, ZERO
527
528	CMPU LEN, $64
529	BGE  gcm_ghash_p8_4x
530
531	LXVD2X (INP)(R0), VIN
532	ADD    $16, INP, INP
533	SUBCCC $16, LEN, LEN
534#ifdef GOARCH_ppc64le
535	VPERM  IN, IN, LEMASK, IN
536#endif
537	VXOR   IN, XL, IN
538	BEQ    short
539
540	LXVD2X (HTBL)(R8), VH2L  // load H^2
541	MOVD   $16, R8
542	LXVD2X (HTBL)(R9), VH2
543	ADD    LEN, INP, R9      // end of input
544	LXVD2X (HTBL)(R10), VH2H
545
546loop_2x:
547	LXVD2X (INP)(R0), VIN1
548#ifdef GOARCH_ppc64le
549	VPERM  IN1, IN1, LEMASK, IN1
550#endif
551
552	SUBC    $32, LEN, LEN
553	VPMSUMD IN, H2L, XL   // H^2.lo·Xi.lo
554	VPMSUMD IN1, HL, XL1  // H.lo·Xi+1.lo
555	SUBE    R11, R11, R11 // borrow?-1:0
556	VPMSUMD IN, H2, XM    // H^2.hi·Xi.lo+H^2.lo·Xi.hi
557	VPMSUMD IN1, H, XM1   // H.hi·Xi+1.lo+H.lo·Xi+1.hi
558	AND     LEN, R11, R11
559	VPMSUMD IN, H2H, XH   // H^2.hi·Xi.hi
560	VPMSUMD IN1, HH, XH1  // H.hi·Xi+1.hi
561	ADD     R11, INP, INP
562
563	VXOR XL, XL1, XL
564	VXOR XM, XM1, XM
565
566	VPMSUMD XL, XC2, T2 // 1st reduction phase
567
568	VSLDOI $8, XM, ZERO, T0
569	VSLDOI $8, ZERO, XM, T1
570	VXOR   XH, XH1, XH
571	VXOR   XL, T0, XL
572	VXOR   XH, T1, XH
573
574	VSLDOI $8, XL, XL, XL
575	VXOR   XL, T2, XL
576	LXVD2X (INP)(R8), VIN
577	ADD    $32, INP, INP
578
579	VSLDOI  $8, XL, XL, T1     // 2nd reduction phase
580	VPMSUMD XL, XC2, XL
581#ifdef GOARCH_ppc64le
582	VPERM   IN, IN, LEMASK, IN
583#endif
584	VXOR    T1, XH, T1
585	VXOR    IN, T1, IN
586	VXOR    IN, XL, IN
587	CMP     R9, INP
588	BGT     loop_2x            // done yet?
589
590	CMPWU LEN, $0
591	BNE   even
592
593short:
594	VPMSUMD IN, HL, XL // H.lo·Xi.lo
595	VPMSUMD IN, H, XM  // H.hi·Xi.lo+H.lo·Xi.hi
	VPMSUMD IN, HH, XH // H.hi·Xi.hi
597
598	VPMSUMD XL, XC2, T2 // 1st reduction phase
599
600	VSLDOI $8, XM, ZERO, T0
601	VSLDOI $8, ZERO, XM, T1
602	VXOR   XL, T0, XL
603	VXOR   XH, T1, XH
604
605	VSLDOI $8, XL, XL, XL
606	VXOR   XL, T2, XL
607
608	VSLDOI  $8, XL, XL, T1 // 2nd reduction phase
609	VPMSUMD XL, XC2, XL
610	VXOR    T1, XH, T1
611
612even:
613	VXOR    XL, T1, XL
614#ifdef GOARCH_ppc64le
615	VPERM   XL, XL, LEMASK, XL
616#endif
617	STXVD2X VXL, (XIP+R0)
618
619	OR R12, R12, R12 // write out Xi
620	RET
621
622gcm_ghash_p8_4x:
623	LVSL     (R8)(R0), T0      // 0x0001..0e0f
624	MOVD     $0x70, R8
625	LXVD2X   (HTBL)(R9), VH2
626	MOVD     $0x80, R9
627	VSPLTISB $8, T1            // 0x0808..0808
628	MOVD     $0x90, R10
629	LXVD2X   (HTBL)(R8), VH3L  // load H^3
630	MOVD     $0xa0, R8
631	LXVD2X   (HTBL)(R9), VH3
632	MOVD     $0xb0, R9
633	LXVD2X   (HTBL)(R10), VH3H
634	MOVD     $0xc0, R10
635	LXVD2X   (HTBL)(R8), VH4L  // load H^4
636	MOVD     $0x10, R8
637	LXVD2X   (HTBL)(R9), VH4
638	MOVD     $0x20, R9
639	LXVD2X   (HTBL)(R10), VH4H
640	MOVD     $0x30, R10
641
642	VSLDOI  $8, ZERO, T1, T2   // 0x0000..0808
643	VADDUBM T0, T2, HIPERM     // 0x0001..1617
644	VADDUBM T1, HIPERM, LOPERM // 0x0809..1e1f
645
	SRD $4, LEN, LEN // this allows the sign bit to be used as carry
647
648	LXVD2X (INP)(R0), VIN0       // load input
649	LXVD2X (INP)(R8), VIN1
650	SUBCCC $8, LEN, LEN
651	LXVD2X (INP)(R9), VIN2
652	LXVD2X (INP)(R10), VIN3
653	ADD    $0x40, INP, INP
654#ifdef GOARCH_ppc64le
655	VPERM  IN0, IN0, LEMASK, IN0
656	VPERM  IN1, IN1, LEMASK, IN1
657	VPERM  IN2, IN2, LEMASK, IN2
658	VPERM  IN3, IN3, LEMASK, IN3
659#endif
660
661	VXOR IN0, XL, XH
662
663	VPMSUMD IN1, H3L, XL1
664	VPMSUMD IN1, H3, XM1
665	VPMSUMD IN1, H3H, XH1
666
667	VPERM   H2, H, HIPERM, H21L
668	VPERM   IN2, IN3, LOPERM, T0
669	VPERM   H2, H, LOPERM, H21H
670	VPERM   IN2, IN3, HIPERM, T1
671	VPMSUMD IN2, H2, XM2         // H^2.lo·Xi+2.hi+H^2.hi·Xi+2.lo
672	VPMSUMD T0, H21L, XL3        // H^2.lo·Xi+2.lo+H.lo·Xi+3.lo
673	VPMSUMD IN3, H, XM3          // H.hi·Xi+3.lo  +H.lo·Xi+3.hi
674	VPMSUMD T1, H21H, XH3        // H^2.hi·Xi+2.hi+H.hi·Xi+3.hi
675
676	VXOR XM2, XM1, XM2
677	VXOR XL3, XL1, XL3
678	VXOR XM3, XM2, XM3
679	VXOR XH3, XH1, XH3
680
681	BLT tail_4x
682
683loop_4x:
684	LXVD2X (INP)(R0), VIN0
685	LXVD2X (INP)(R8), VIN1
686	SUBCCC $4, LEN, LEN
687	LXVD2X (INP)(R9), VIN2
688	LXVD2X (INP)(R10), VIN3
689	ADD    $0x40, INP, INP
690#ifdef GOARCH_ppc64le
691	VPERM  IN1, IN1, LEMASK, IN1
692	VPERM  IN2, IN2, LEMASK, IN2
693	VPERM  IN3, IN3, LEMASK, IN3
694	VPERM  IN0, IN0, LEMASK, IN0
695#endif
696
697	VPMSUMD XH, H4L, XL   // H^4.lo·Xi.lo
698	VPMSUMD XH, H4, XM    // H^4.hi·Xi.lo+H^4.lo·Xi.hi
699	VPMSUMD XH, H4H, XH   // H^4.hi·Xi.hi
700	VPMSUMD IN1, H3L, XL1
701	VPMSUMD IN1, H3, XM1
702	VPMSUMD IN1, H3H, XH1
703
704	VXOR  XL, XL3, XL
705	VXOR  XM, XM3, XM
706	VXOR  XH, XH3, XH
707	VPERM IN2, IN3, LOPERM, T0
708	VPERM IN2, IN3, HIPERM, T1
709
710	VPMSUMD XL, XC2, T2   // 1st reduction phase
711	VPMSUMD T0, H21L, XL3 // H.lo·Xi+3.lo  +H^2.lo·Xi+2.lo
712	VPMSUMD T1, H21H, XH3 // H.hi·Xi+3.hi  +H^2.hi·Xi+2.hi
713
714	VSLDOI $8, XM, ZERO, T0
715	VSLDOI $8, ZERO, XM, T1
716	VXOR   XL, T0, XL
717	VXOR   XH, T1, XH
718
719	VSLDOI $8, XL, XL, XL
720	VXOR   XL, T2, XL
721
722	VSLDOI  $8, XL, XL, T1 // 2nd reduction phase
723	VPMSUMD IN2, H2, XM2   // H^2.hi·Xi+2.lo+H^2.lo·Xi+2.hi
724	VPMSUMD IN3, H, XM3    // H.hi·Xi+3.lo  +H.lo·Xi+3.hi
725	VPMSUMD XL, XC2, XL
726
727	VXOR XL3, XL1, XL3
728	VXOR XH3, XH1, XH3
729	VXOR XH, IN0, XH
730	VXOR XM2, XM1, XM2
731	VXOR XH, T1, XH
732	VXOR XM3, XM2, XM3
733	VXOR XH, XL, XH
734	BGE  loop_4x
735
736tail_4x:
737	VPMSUMD XH, H4L, XL // H^4.lo·Xi.lo
738	VPMSUMD XH, H4, XM  // H^4.hi·Xi.lo+H^4.lo·Xi.hi
739	VPMSUMD XH, H4H, XH // H^4.hi·Xi.hi
740
741	VXOR XL, XL3, XL
742	VXOR XM, XM3, XM
743
744	VPMSUMD XL, XC2, T2 // 1st reduction phase
745
746	VSLDOI $8, XM, ZERO, T0
747	VSLDOI $8, ZERO, XM, T1
748	VXOR   XH, XH3, XH
749	VXOR   XL, T0, XL
750	VXOR   XH, T1, XH
751
752	VSLDOI $8, XL, XL, XL
753	VXOR   XL, T2, XL
754
755	VSLDOI  $8, XL, XL, T1 // 2nd reduction phase
756	VPMSUMD XL, XC2, XL
757	VXOR    T1, XH, T1
758	VXOR    XL, T1, XL
759
760	ADDCCC $4, LEN, LEN
761	BEQ    done_4x
762
763	LXVD2X (INP)(R0), VIN0
764	CMPU   LEN, $2
765	MOVD   $-4, LEN
766	BLT    one
767	LXVD2X (INP)(R8), VIN1
768	BEQ    two
769
770three:
771	LXVD2X (INP)(R9), VIN2
772#ifdef GOARCH_ppc64le
773	VPERM  IN0, IN0, LEMASK, IN0
774	VPERM  IN1, IN1, LEMASK, IN1
775	VPERM  IN2, IN2, LEMASK, IN2
776#endif
777
778	VXOR IN0, XL, XH
779	VOR  H3L, H3L, H4L
780	VOR  H3, H3, H4
781	VOR  H3H, H3H, H4H
782
783	VPERM   IN1, IN2, LOPERM, T0
784	VPERM   IN1, IN2, HIPERM, T1
785	VPMSUMD IN1, H2, XM2         // H^2.lo·Xi+1.hi+H^2.hi·Xi+1.lo
786	VPMSUMD IN2, H, XM3          // H.hi·Xi+2.lo  +H.lo·Xi+2.hi
787	VPMSUMD T0, H21L, XL3        // H^2.lo·Xi+1.lo+H.lo·Xi+2.lo
788	VPMSUMD T1, H21H, XH3        // H^2.hi·Xi+1.hi+H.hi·Xi+2.hi
789
790	VXOR XM3, XM2, XM3
791	JMP  tail_4x
792
793two:
794#ifdef GOARCH_ppc64le
795	VPERM IN0, IN0, LEMASK, IN0
796	VPERM IN1, IN1, LEMASK, IN1
797#endif
798
799	VXOR  IN, XL, XH
800	VPERM ZERO, IN1, LOPERM, T0
801	VPERM ZERO, IN1, HIPERM, T1
802
803	VSLDOI $8, ZERO, H2, H4L
804	VOR    H2, H2, H4
805	VSLDOI $8, H2, ZERO, H4H
806
807	VPMSUMD T0, H21L, XL3 // H.lo·Xi+1.lo
808	VPMSUMD IN1, H, XM3   // H.hi·Xi+1.lo+H.lo·Xi+2.hi
809	VPMSUMD T1, H21H, XH3 // H.hi·Xi+1.hi
810
811	JMP tail_4x
812
813one:
814#ifdef GOARCH_ppc64le
815	VPERM IN0, IN0, LEMASK, IN0
816#endif
817
818	VSLDOI $8, ZERO, H, H4L
819	VOR    H, H, H4
820	VSLDOI $8, H, ZERO, H4H
821
822	VXOR IN0, XL, XH
823	VXOR XL3, XL3, XL3
824	VXOR XM3, XM3, XM3
825	VXOR XH3, XH3, XH3
826
827	JMP tail_4x
828
829done_4x:
830#ifdef GOARCH_ppc64le
831	VPERM   XL, XL, LEMASK, XL
832#endif
833	STXVD2X VXL, (XIP+R0)      // write out Xi
834	RET
835
836// func gcmMul(output []byte, productTable *[256]byte)
837TEXT ·gcmMul(SB), NOSPLIT, $0-32
838	MOVD output+0(FP), XIP
839	MOVD productTable+24(FP), HTBL
840
841	MOVD   $0x10, R8
842	MOVD   $0x20, R9
843	MOVD   $0x30, R10
844	LXVD2X (XIP)(R0), VIN // load Xi
845
846	LXVD2X   (HTBL)(R8), VHL    // Load pre-computed table
847	LXVD2X   (HTBL)(R9), VH
848	LXVD2X   (HTBL)(R10), VHH
849	LXVD2X   (HTBL)(R0), VXC2
850#ifdef GOARCH_ppc64le
851	VSPLTISB $0x07, T0
852	VXOR     LEMASK, T0, LEMASK
853	VPERM    IN, IN, LEMASK, IN
854#endif
855	VXOR     ZERO, ZERO, ZERO
856
857	VPMSUMD IN, HL, XL // H.lo·Xi.lo
858	VPMSUMD IN, H, XM  // H.hi·Xi.lo+H.lo·Xi.hi
	VPMSUMD IN, HH, XH // H.hi·Xi.hi
860
861	VPMSUMD XL, XC2, T2 // 1st reduction phase
862
863	VSLDOI $8, XM, ZERO, T0
864	VSLDOI $8, ZERO, XM, T1
865	VXOR   XL, T0, XL
866	VXOR   XH, T1, XH
867
868	VSLDOI $8, XL, XL, XL
869	VXOR   XL, T2, XL
870
871	VSLDOI  $8, XL, XL, T1 // 2nd reduction phase
872	VPMSUMD XL, XC2, XL
873	VXOR    T1, XH, T1
874	VXOR    XL, T1, XL
875
876#ifdef GOARCH_ppc64le
877	VPERM   XL, XL, LEMASK, XL
878#endif
879	STXVD2X VXL, (XIP+R0)      // write out Xi
880	RET
881
882#define BLK_INP    R3
883#define BLK_OUT    R4
884#define BLK_KEY    R5
885#define KEY_LEN    R6
886#define BLK_IDX    R7
887#define IDX        R8
888#define IN_LEN     R9
889#define COUNTER    R10
890#define CONPTR     R14
891#define MASK       V5
892
// Implementation of the counterCrypt function in assembler.
// The original loop is unrolled so that multiple encryption
// streams are processed in parallel, achieved by interleaving
// vcipher instructions from each stream. This is also referred to as
// stitching, and provides significant performance improvements.
// Some macros are defined which enable execution for big or little
// endian as well as different ISA targets.
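//
// In Go terms the function is plain CTR mode. A minimal sketch of the
// reference behaviour (encryptBlock and xorBytes are illustrative
// helpers standing in for the vcipher/vcipherlast sequences below):
//
//	for len(in) > 0 {
//		ks := encryptBlock(key, counter) // E_K(counter)
//		n := len(in)
//		if n > 16 {
//			n = 16
//		}
//		xorBytes(out[:n], in[:n], ks[:n])
//		gcmInc32(&counter)
//		in, out = in[n:], out[n:]
//	}
//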
900//func (g *gcmAsm) counterCrypt(out, in []byte, counter *[gcmBlockSize]byte, key[gcmBlockSize]uint32)
901//func counterCryptASM(xr, out, in, counter, key)
902TEXT ·counterCryptASM(SB), NOSPLIT, $16-72
	MOVD	xr+0(FP), KEY_LEN
904	MOVD    out+8(FP), BLK_OUT
905	MOVD    out_len+16(FP), R8
906	MOVD    in+32(FP), BLK_INP
907	MOVD    in_len+40(FP), IN_LEN
908	MOVD    counter+56(FP), COUNTER
909	MOVD    key+64(FP), BLK_KEY
910
911// Set up permute string when needed.
912#ifdef NEEDS_ESPERM
	MOVD	$·rcon(SB), R14
914	LVX     (R14), ESPERM   // Permute value for P8_ macros.
915#endif
916	SETUP_COUNTER		// V30 Counter V31 BE {0, 0, 0, 1}
917	LOAD_KEYS(BLK_KEY, KEY_LEN)	// VS1 - VS10/12/14 based on keysize
918	CMP     IN_LEN, $128
919	BLT	block64
920block128_loop:
921	// Do 8 encryptions in parallel by setting
922	// input values in V15-V22 and executing
923	// vcipher on the updated value and the keys.
924	GEN_VCIPHER_8_INPUTS
925	VCIPHER_8X1_KEY(VS1)
926	VCIPHER_8X1_KEY(VS2)
927	VCIPHER_8X1_KEY(VS3)
928	VCIPHER_8X1_KEY(VS4)
929	VCIPHER_8X1_KEY(VS5)
930	VCIPHER_8X1_KEY(VS6)
931	VCIPHER_8X1_KEY(VS7)
932	VCIPHER_8X1_KEY(VS8)
933	VCIPHER_8X1_KEY(VS9)
934	// Additional encryptions are done based on
935	// the key length, with the last key moved
936	// to V23 for use with VCIPHERLAST.
937	// CR2 = CMP key_len, $12
938	XXLOR VS10, VS10, V23
939	BLT	CR2, block128_last // key_len = 10
940	VCIPHER_8X1_KEY(VS10)
941	VCIPHER_8X1_KEY(VS11)
942	XXLOR VS12,VS12,V23
	BEQ	CR2, block128_last // key_len = 12
944	VCIPHER_8X1_KEY(VS12)
945	VCIPHER_8X1_KEY(VS13)
946	XXLOR VS14,VS14,V23	// key_len = 14
947block128_last:
948	// vcipher encryptions are in V15-V22 at this
949	// point with vcipherlast remaining to be done.
950	// Load input block into V1-V8, setting index offsets
951	// in R16-R22 to use with the STORE.
952	LOAD_INPUT_BLOCK128(BLK_INP)
953	// Do VCIPHERLAST on the last key for each encryption
954	// stream and XOR the result with the corresponding
955	// value from the input block.
956	VCIPHERLAST8_XOR_INPUT
957	// Store the results (8*16) and update BLK_OUT by 128.
958	STORE_OUTPUT_BLOCK128(BLK_OUT)
959	ADD	$-128, IN_LEN	// input size
960	CMP     IN_LEN, $128	// check if >= blocksize
961	BGE	block128_loop	// next input block
962	CMP	IN_LEN, $0
963	BEQ	done
964block64:
965	CMP	IN_LEN, $64	// Check if >= 64
966	BLT	block16_loop
967	// Do 4 encryptions in parallel by setting
968	// input values in V15-V18 and executing
969	// vcipher on the updated value and the keys.
970	GEN_VCIPHER_4_INPUTS
971	VCIPHER_4X1_KEY(VS1)
972	VCIPHER_4X1_KEY(VS2)
973	VCIPHER_4X1_KEY(VS3)
974	VCIPHER_4X1_KEY(VS4)
975	VCIPHER_4X1_KEY(VS5)
976	VCIPHER_4X1_KEY(VS6)
977	VCIPHER_4X1_KEY(VS7)
978	VCIPHER_4X1_KEY(VS8)
979	VCIPHER_4X1_KEY(VS9)
980	// Check key length based on CR2
981	// Move last key to V23 for use with later vcipherlast
982	XXLOR	VS10, VS10, V23
983	BLT	CR2, block64_last	// size = 10
984	VCIPHER_4X1_KEY(VS10)		// Encrypt next 2 keys
985	VCIPHER_4X1_KEY(VS11)
986	XXLOR	VS12, VS12, V23
987	BEQ	CR2, block64_last	// size = 12
988	VCIPHER_4X1_KEY(VS12)		// Encrypt last 2 keys
989	VCIPHER_4X1_KEY(VS13)
990	XXLOR	VS14, VS14, V23		// size = 14
991block64_last:
992	LOAD_INPUT_BLOCK64(BLK_INP)	// Load 64 bytes of input
	// Do VCIPHERLAST with the last key for each encryption
	// stream and XOR the result with the corresponding
	// value from the input block.
996	VCIPHERLAST4_XOR_INPUT
997	// Store the results (4*16) and update BLK_OUT by 64.
998	STORE_OUTPUT_BLOCK64(BLK_OUT)
999	ADD	$-64, IN_LEN		// decrement input block length
1000	CMP	IN_LEN, $0		// check for remaining length
1001	BEQ	done
1002block16_loop:
1003	CMP	IN_LEN, $16		// More input
1004	BLT	final_block		// If not, then handle partial block
1005	// Single encryption, no stitching
1006	GEN_VCIPHER_INPUT		// Generate input value for single encryption
1007	VCIPHER_1X9_KEYS(V15)		// Encrypt V15 value with 9 keys
	XXLOR	VS10, VS10, V23		// Last key -> V23 for later vcipherlast
1009	// Key length based on CR2. (LT=10, EQ=12, GT=14)
1010	BLT	CR2, block16_last	// Finish for key size 10
1011	VCIPHER_1X2_KEYS(V15, VS10, VS11) // Encrypt V15 with 2 more keys
1012	XXLOR	VS12, VS12, V23		// Last key -> V23 for later vcipherlast
1013	BEQ	CR2, block16_last	// Finish for key size 12
1014	VCIPHER_1X2_KEYS(V15, VS12, VS13) // Encrypt V15 with last 2 keys
1015	XXLOR	VS14, VS14, V23		// Last key -> V23 for vcipherlast with key size 14
1016block16_last:
1017	P8_LXVB16X(BLK_INP, R0, V1)	// Load input
	VCIPHERLAST V15, V23, V15	// Encrypt with the last round key in V23
1019	XXLXOR	V15, V1, V1		// XOR with input
1020	P8_STXVB16X(V1,R0,BLK_OUT)	// Store final encryption value to output
1021	ADD	$16, BLK_INP		// Increment input pointer
1022	ADD	$16, BLK_OUT		// Increment output pointer
1023	ADD	$-16, IN_LEN		// Decrement input length
1024	BR	block16_loop		// Check for next
1025final_block:
1026	CMP	IN_LEN, $0
1027	BEQ	done
1028	GEN_VCIPHER_INPUT		// Generate input value for partial encryption
1029	VCIPHER_1X9_KEYS(V15)		// Encrypt V15 with 9 keys
1030	XXLOR	VS10, VS10, V23		// Save possible last key
1031	BLT	CR2, final_block_last
1032	VCIPHER_1X2_KEYS(V15, VS10, VS11)	// Encrypt V15 with next 2 keys
1033	XXLOR	VS12, VS12, V23		// Save possible last key
1034	BEQ	CR2, final_block_last
1035	VCIPHER_1X2_KEYS(V15, VS12, VS13) // Encrypt V15 with last 2 keys
1036	XXLOR	VS14, VS14, V23		// Save last key
1037final_block_last:
1038	VCIPHERLAST V15, V23, V15	// Finish encryption
1039#ifdef GOPPC64_power10
	// LXVLL/STXVLL take the byte count in bits 0:7, so shift the length up.
	SLD	$56, IN_LEN, R17
1042	LXVLL	BLK_INP, R17, V25
1043	VXOR	V25, V15, V25
1044	STXVLL	V25, BLK_OUT, R17
1045#else
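	// Without LXVLL/STXVLL, spill the keystream block to the stack
	// and XOR the remaining input in 8-, 4-, 2- and 1-byte steps.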
1046	ADD	$32, R1, MASK_PTR
1047	MOVD	$0, R16
1048	P8_STXVB16X(V15, MASK_PTR, R0)
1049	CMP	IN_LEN, $8
1050	BLT	next4
1051	MOVD	0(MASK_PTR), R14
1052	MOVD	0(BLK_INP), R15
1053	XOR	R14, R15, R14
1054	MOVD	R14, 0(BLK_OUT)
1055	ADD	$8, R16
1056	ADD	$-8, IN_LEN
1057next4:
1058	CMP	IN_LEN, $4
1059	BLT	next2
1060	MOVWZ	(BLK_INP)(R16), R15
1061	MOVWZ	(MASK_PTR)(R16), R14
1062	XOR	R14, R15, R14
1063	MOVW	R14, (R16)(BLK_OUT)
1064	ADD	$4, R16
1065	ADD	$-4, IN_LEN
1066next2:
1067	CMP	IN_LEN, $2
1068	BLT	next1
1069	MOVHZ	(BLK_INP)(R16), R15
1070	MOVHZ	(MASK_PTR)(R16), R14
1071	XOR	R14, R15, R14
1072	MOVH	R14, (R16)(BLK_OUT)
1073	ADD	$2, R16
1074	ADD	$-2, IN_LEN
1075next1:
1076	CMP	IN_LEN, $1
1077	BLT	done
1078	MOVBZ	(MASK_PTR)(R16), R14
1079	MOVBZ	(BLK_INP)(R16), R15
1080	XOR	R14, R15, R14
1081	MOVB	R14, (R16)(BLK_OUT)
1082#endif
1083done:
1084	// Save the updated counter value
1085	P8_STXVB16X(V30, COUNTER, R0)
1086	// Clear the keys
1087	XXLXOR	VS0, VS0, VS0
1088	XXLXOR	VS1, VS1, VS1
1089	XXLXOR	VS2, VS2, VS2
1090	XXLXOR	VS3, VS3, VS3
1091	XXLXOR	VS4, VS4, VS4
1092	XXLXOR	VS5, VS5, VS5
1093	XXLXOR	VS6, VS6, VS6
1094	XXLXOR	VS7, VS7, VS7
1095	XXLXOR	VS8, VS8, VS8
1096	XXLXOR	VS9, VS9, VS9
1097	XXLXOR	VS10, VS10, VS10
1098	XXLXOR	VS11, VS11, VS11
1099	XXLXOR	VS12, VS12, VS12
1100	XXLXOR	VS13, VS13, VS13
1101	XXLXOR	VS14, VS14, VS14
1102	RET
1103
1104