1// Copyright 2023 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// ChaCha8 is ChaCha with 8 rounds.
6// See https://cr.yp.to/chacha/chacha-20080128.pdf.
7//
8// ChaCha8 operates on a 4x4 matrix of uint32 values, initially set to:
9//
10//	const1 const2 const3 const4
11//	seed   seed   seed   seed
12//	seed   seed   seed   seed
13//	counter64     0      0
14//
15// We use the same constants as ChaCha20 does, a random seed,
16// and a counter. Running ChaCha8 on this input produces
17// a 4x4 matrix of pseudo-random values with as much entropy
18// as the seed.
19//
20// Given SIMD registers that can hold N uint32s, it is possible
21// to run N ChaCha8 block transformations in parallel by filling
22// the first register with the N copies of const1, the second
23// with N copies of const2, and so on, and then running the operations.
24//
25// Each iteration of ChaCha8Rand operates over 32 bytes of input and
26// produces 992 bytes of RNG output, plus 32 bytes of input for the next
27// iteration.
28//
29// The 32 bytes of input are used as a ChaCha8 key, with a zero nonce, to
30// produce 1024 bytes of output (16 blocks, with counters 0 to 15).
31// First, for each block, the values 0x61707865, 0x3320646e, 0x79622d32,
32// 0x6b206574 are subtracted from the 32-bit little-endian words at
33// position 0, 1, 2, and 3 respectively, and an increasing counter
34// starting at zero is subtracted from each word at position 12. Then,
35// this stream is permuted such that for each sequence of four blocks,
36// first we output the first four bytes of each block, then the next four
37// bytes of each block, and so on. Finally, the last 32 bytes of output
38// are used as the input of the next iteration, and the remaining 992
39// bytes are the RNG output.
40//
41// See https://c2sp.org/chacha8rand for additional details.
42//
43// Normal ChaCha20 implementations for encryption use this same
44// parallelism but then have to deinterlace the results so that
45// it appears the blocks were generated separately. For the purposes
46// of generating random numbers, the interlacing is fine.
47// We are simply locked in to preserving the 4-way interlacing
48// in any future optimizations.
49package chacha8rand
50
51import (
52	"internal/goarch"
53	"unsafe"
54)
55
56// setup sets up 4 ChaCha8 blocks in b32 with the counter and seed.
57// Note that b32 is [16][4]uint32 not [4][16]uint32: the blocks are interlaced
58// the same way they would be in a 4-way SIMD implementations.
59func setup(seed *[4]uint64, b32 *[16][4]uint32, counter uint32) {
60	// Convert to uint64 to do half as many stores to memory.
61	b := (*[16][2]uint64)(unsafe.Pointer(b32))
62
63	// Constants; same as in ChaCha20: "expand 32-byte k"
64	b[0][0] = 0x61707865_61707865
65	b[0][1] = 0x61707865_61707865
66
67	b[1][0] = 0x3320646e_3320646e
68	b[1][1] = 0x3320646e_3320646e
69
70	b[2][0] = 0x79622d32_79622d32
71	b[2][1] = 0x79622d32_79622d32
72
73	b[3][0] = 0x6b206574_6b206574
74	b[3][1] = 0x6b206574_6b206574
75
76	// Seed values.
77	var x64 uint64
78	var x uint32
79
80	x = uint32(seed[0])
81	x64 = uint64(x)<<32 | uint64(x)
82	b[4][0] = x64
83	b[4][1] = x64
84
85	x = uint32(seed[0] >> 32)
86	x64 = uint64(x)<<32 | uint64(x)
87	b[5][0] = x64
88	b[5][1] = x64
89
90	x = uint32(seed[1])
91	x64 = uint64(x)<<32 | uint64(x)
92	b[6][0] = x64
93	b[6][1] = x64
94
95	x = uint32(seed[1] >> 32)
96	x64 = uint64(x)<<32 | uint64(x)
97	b[7][0] = x64
98	b[7][1] = x64
99
100	x = uint32(seed[2])
101	x64 = uint64(x)<<32 | uint64(x)
102	b[8][0] = x64
103	b[8][1] = x64
104
105	x = uint32(seed[2] >> 32)
106	x64 = uint64(x)<<32 | uint64(x)
107	b[9][0] = x64
108	b[9][1] = x64
109
110	x = uint32(seed[3])
111	x64 = uint64(x)<<32 | uint64(x)
112	b[10][0] = x64
113	b[10][1] = x64
114
115	x = uint32(seed[3] >> 32)
116	x64 = uint64(x)<<32 | uint64(x)
117	b[11][0] = x64
118	b[11][1] = x64
119
120	// Counters.
121	if goarch.BigEndian {
122		b[12][0] = uint64(counter+0)<<32 | uint64(counter+1)
123		b[12][1] = uint64(counter+2)<<32 | uint64(counter+3)
124	} else {
125		b[12][0] = uint64(counter+0) | uint64(counter+1)<<32
126		b[12][1] = uint64(counter+2) | uint64(counter+3)<<32
127	}
128
129	// Zeros.
130	b[13][0] = 0
131	b[13][1] = 0
132	b[14][0] = 0
133	b[14][1] = 0
134
135	b[15][0] = 0
136	b[15][1] = 0
137}
138
139func _() {
140	// block and block_generic must have same type
141	x := block
142	x = block_generic
143	_ = x
144}
145
146// block_generic is the non-assembly block implementation,
147// for use on systems without special assembly.
148// Even on such systems, it is quite fast: on GOOS=386,
149// ChaCha8 using this code generates random values faster than PCG-DXSM.
150func block_generic(seed *[4]uint64, buf *[32]uint64, counter uint32) {
151	b := (*[16][4]uint32)(unsafe.Pointer(buf))
152
153	setup(seed, b, counter)
154
155	for i := range b[0] {
156		// Load block i from b[*][i] into local variables.
157		b0 := b[0][i]
158		b1 := b[1][i]
159		b2 := b[2][i]
160		b3 := b[3][i]
161		b4 := b[4][i]
162		b5 := b[5][i]
163		b6 := b[6][i]
164		b7 := b[7][i]
165		b8 := b[8][i]
166		b9 := b[9][i]
167		b10 := b[10][i]
168		b11 := b[11][i]
169		b12 := b[12][i]
170		b13 := b[13][i]
171		b14 := b[14][i]
172		b15 := b[15][i]
173
174		// 4 iterations of eight quarter-rounds each is 8 rounds
175		for round := 0; round < 4; round++ {
176			b0, b4, b8, b12 = qr(b0, b4, b8, b12)
177			b1, b5, b9, b13 = qr(b1, b5, b9, b13)
178			b2, b6, b10, b14 = qr(b2, b6, b10, b14)
179			b3, b7, b11, b15 = qr(b3, b7, b11, b15)
180
181			b0, b5, b10, b15 = qr(b0, b5, b10, b15)
182			b1, b6, b11, b12 = qr(b1, b6, b11, b12)
183			b2, b7, b8, b13 = qr(b2, b7, b8, b13)
184			b3, b4, b9, b14 = qr(b3, b4, b9, b14)
185		}
186
187		// Store block i back into b[*][i].
188		// Add b4..b11 back to the original key material,
189		// like in ChaCha20, to avoid trivial invertibility.
190		// There is no entropy in b0..b3 and b12..b15
191		// so we can skip the additions and save some time.
192		b[0][i] = b0
193		b[1][i] = b1
194		b[2][i] = b2
195		b[3][i] = b3
196		b[4][i] += b4
197		b[5][i] += b5
198		b[6][i] += b6
199		b[7][i] += b7
200		b[8][i] += b8
201		b[9][i] += b9
202		b[10][i] += b10
203		b[11][i] += b11
204		b[12][i] = b12
205		b[13][i] = b13
206		b[14][i] = b14
207		b[15][i] = b15
208	}
209
210	if goarch.BigEndian {
211		// On a big-endian system, reading the uint32 pairs as uint64s
212		// will word-swap them compared to little-endian, so we word-swap
213		// them here first to make the next swap get the right answer.
214		for i, x := range buf {
215			buf[i] = x>>32 | x<<32
216		}
217	}
218}
219
220// qr is the (inlinable) ChaCha8 quarter round.
221func qr(a, b, c, d uint32) (_a, _b, _c, _d uint32) {
222	a += b
223	d ^= a
224	d = d<<16 | d>>16
225	c += d
226	b ^= c
227	b = b<<12 | b>>20
228	a += b
229	d ^= a
230	d = d<<8 | d>>24
231	c += d
232	b ^= c
233	b = b<<7 | b>>25
234	return a, b, c, d
235}
236