/*
 * version 20110505
 * D. J. Bernstein
 * Public domain.
 *
 * Based on crypto_core/salsa208/armneon/core.c from SUPERCOP 20130419
 */

#define ROUNDS 8

/*
 * Apply the ROUNDS-round Salsa20 core, in place, to the 64-byte block at
 * `input`.
 *
 * The block is read as a 4x4 matrix of 32-bit words x0..x15 (four 16-byte
 * loads), shuffled into diagonal order, run through ROUNDS rounds (two per
 * loop iteration), added word-wise to the shuffled input, shuffled back to
 * linear order and written over the same 64 bytes.
 *
 * input: pointer to 64 readable and writable bytes. Byte-wise loads and
 *        stores (vld1q_u8 / vst1q_u8) are used for all memory accesses.
 */
static void
salsa20_8_intrinsic(void * input)
{
	uint8_t *bytes = (uint8_t *) input;

	/* Lane mask for vbslq_u32: lanes 0 and 2 from the first operand,
	 * lanes 1 and 3 from the second. */
	const uint32x4_t abab = {0xffffffffU, 0, 0xffffffffU, 0};

	/*
	 * This is modified since we only have one argument. Usually you'd
	 * rearrange the constant, key, and input bytes, but we just have one
	 * linear array to rearrange which is a bit easier.
	 */

	/* Load the four rows; reinterpret explicitly so no implicit vector
	 * conversion between uint8x16_t and uint32x4_t is required. */
	uint32x4_t x0x1x2x3 = vreinterpretq_u32_u8(vld1q_u8(bytes));
	uint32x4_t x4x5x6x7 = vreinterpretq_u32_u8(vld1q_u8(bytes + 16));
	uint32x4_t x8x9x10x11 = vreinterpretq_u32_u8(vld1q_u8(bytes + 32));
	uint32x4_t x12x13x14x15 = vreinterpretq_u32_u8(vld1q_u8(bytes + 48));

	/*
	 * Change the input to be diagonals as if it's a 4x4 matrix of 32-bit
	 * values: first swap 64-bit halves between rows, then blend
	 * alternating lanes.
	 */
	uint32x4_t x0x1x10x11 = vcombine_u32(vget_low_u32(x0x1x2x3), vget_high_u32(x8x9x10x11));
	uint32x4_t x4x5x14x15 = vcombine_u32(vget_low_u32(x4x5x6x7), vget_high_u32(x12x13x14x15));
	uint32x4_t x8x9x2x3 = vcombine_u32(vget_low_u32(x8x9x10x11), vget_high_u32(x0x1x2x3));
	uint32x4_t x12x13x6x7 = vcombine_u32(vget_low_u32(x12x13x14x15), vget_high_u32(x4x5x6x7));

	uint32x4_t x0x5x10x15 = vbslq_u32(abab, x0x1x10x11, x4x5x14x15);
	uint32x4_t x8x13x2x7 = vbslq_u32(abab, x8x9x2x3, x12x13x6x7);
	uint32x4_t x4x9x14x3 = vbslq_u32(abab, x4x5x14x15, x8x9x2x3);
	uint32x4_t x12x1x6x11 = vbslq_u32(abab, x12x13x6x7, x0x1x10x11);

	/* Keep the shuffled input for the final feed-forward addition. */
	const uint32x4_t start0 = x0x5x10x15;
	const uint32x4_t start1 = x12x1x6x11;
	const uint32x4_t start2 = x8x13x2x7;
	const uint32x4_t start3 = x4x9x14x3;

	/* From here on this should be the same as the SUPERCOP version. */

	uint32x4_t diag0 = start0;
	uint32x4_t diag1 = start1;
	uint32x4_t diag2 = start2;
	uint32x4_t diag3 = start3;

	uint32x4_t sum;

	for (int i = ROUNDS; i > 0; i -= 2) {
		/*
		 * vsriq_n_u32(vshlq_n_u32(v, n), v, 32 - n) is a 32-bit
		 * rotate-left of every lane of v by n bits.
		 */

		/* First round of the pair. */
		sum = vaddq_u32(diag1, diag0);
		diag3 = veorq_u32(diag3, vsriq_n_u32(vshlq_n_u32(sum, 7), sum, 25));
		sum = vaddq_u32(diag0, diag3);
		diag2 = veorq_u32(diag2, vsriq_n_u32(vshlq_n_u32(sum, 9), sum, 23));
		sum = vaddq_u32(diag3, diag2);
		diag1 = veorq_u32(diag1, vsriq_n_u32(vshlq_n_u32(sum, 13), sum, 19));
		sum = vaddq_u32(diag2, diag1);
		diag0 = veorq_u32(diag0, vsriq_n_u32(vshlq_n_u32(sum, 18), sum, 14));

		/* Rotate lanes so the second round can reuse the same
		 * lane-wise pattern. */
		diag3 = vextq_u32(diag3, diag3, 3);
		diag2 = vextq_u32(diag2, diag2, 2);
		diag1 = vextq_u32(diag1, diag1, 1);

		/* Second round of the pair (diag1 and diag3 swap roles). */
		sum = vaddq_u32(diag3, diag0);
		diag1 = veorq_u32(diag1, vsriq_n_u32(vshlq_n_u32(sum, 7), sum, 25));
		sum = vaddq_u32(diag0, diag1);
		diag2 = veorq_u32(diag2, vsriq_n_u32(vshlq_n_u32(sum, 9), sum, 23));
		sum = vaddq_u32(diag1, diag2);
		diag3 = veorq_u32(diag3, vsriq_n_u32(vshlq_n_u32(sum, 13), sum, 19));
		sum = vaddq_u32(diag2, diag3);
		diag0 = veorq_u32(diag0, vsriq_n_u32(vshlq_n_u32(sum, 18), sum, 14));

		/* Undo the lane rotation. */
		diag1 = vextq_u32(diag1, diag1, 3);
		diag2 = vextq_u32(diag2, diag2, 2);
		diag3 = vextq_u32(diag3, diag3, 1);
	}

	/* Feed-forward: add the shuffled input back in. */
	x0x5x10x15 = vaddq_u32(diag0, start0);
	x12x1x6x11 = vaddq_u32(diag1, start1);
	x8x13x2x7 = vaddq_u32(diag2, start2);
	x4x9x14x3 = vaddq_u32(diag3, start3);

	/* Reverse the diagonal shuffle: blend lanes, then swap halves. */
	x0x1x10x11 = vbslq_u32(abab, x0x5x10x15, x12x1x6x11);
	x12x13x6x7 = vbslq_u32(abab, x12x1x6x11, x8x13x2x7);
	x8x9x2x3 = vbslq_u32(abab, x8x13x2x7, x4x9x14x3);
	x4x5x14x15 = vbslq_u32(abab, x4x9x14x3, x0x5x10x15);

	x0x1x2x3 = vcombine_u32(vget_low_u32(x0x1x10x11), vget_high_u32(x8x9x2x3));
	x4x5x6x7 = vcombine_u32(vget_low_u32(x4x5x14x15), vget_high_u32(x12x13x6x7));
	x8x9x10x11 = vcombine_u32(vget_low_u32(x8x9x2x3), vget_high_u32(x0x1x10x11));
	x12x13x14x15 = vcombine_u32(vget_low_u32(x12x13x6x7), vget_high_u32(x4x5x14x15));

	/* Write the result back over the input block. */
	vst1q_u8(bytes, vreinterpretq_u8_u32(x0x1x2x3));
	vst1q_u8(bytes + 16, vreinterpretq_u8_u32(x4x5x6x7));
	vst1q_u8(bytes + 32, vreinterpretq_u8_u32(x8x9x10x11));
	vst1q_u8(bytes + 48, vreinterpretq_u8_u32(x12x13x14x15));
}