// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.

#include <ring-core/asm_base.h>

#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
#include <ring-core/arm_arch.h>

.section .rodata


.align	7	// totally strategic alignment
_vpaes_consts:
Lk_mc_forward:	// mc_forward
.quad	0x0407060500030201, 0x0C0F0E0D080B0A09
.quad	0x080B0A0904070605, 0x000302010C0F0E0D
.quad	0x0C0F0E0D080B0A09, 0x0407060500030201
.quad	0x000302010C0F0E0D, 0x080B0A0904070605
Lk_mc_backward:	// mc_backward
.quad	0x0605040702010003, 0x0E0D0C0F0A09080B
.quad	0x020100030E0D0C0F, 0x0A09080B06050407
.quad	0x0E0D0C0F0A09080B, 0x0605040702010003
.quad	0x0A09080B06050407, 0x020100030E0D0C0F
Lk_sr:	// sr
.quad	0x0706050403020100, 0x0F0E0D0C0B0A0908
.quad	0x030E09040F0A0500, 0x0B06010C07020D08
.quad	0x0F060D040B020900, 0x070E050C030A0108
.quad	0x0B0E0104070A0D00, 0x0306090C0F020508

//
// "Hot" constants
//
Lk_inv:	// inv, inva
.quad	0x0E05060F0D080180, 0x040703090A0B0C02
.quad	0x01040A060F0B0780, 0x030D0E0C02050809
Lk_ipt:	// input transform (lo, hi)
.quad	0xC2B2E8985A2A7000, 0xCABAE09052227808
.quad	0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
Lk_sbo:	// sbou, sbot
.quad	0xD0D26D176FBDC700, 0x15AABF7AC502A878
.quad	0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
Lk_sb1:	// sb1u, sb1t
.quad	0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
.quad	0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
Lk_sb2:	// sb2u, sb2t
.quad	0x69EB88400AE12900, 0xC2A163C8AB82234A
.quad	0xE27A93C60B712400, 0x5EB7E955BC982FCD

//
// Key schedule constants
//
Lk_dksd:	// decryption key schedule: invskew x*D
.quad	0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
.quad	0x41C277F4B5368300, 0x5FDC69EAAB289D1E
Lk_dksb:	// decryption key schedule: invskew x*B
.quad	0x9A4FCA1F8550D500, 0x03D653861CC94C99
.quad	0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
Lk_dkse:	// decryption key schedule: invskew x*E + 0x63
.quad	0xD5031CCA1FC9D600, 0x53859A4C994F5086
.quad	0xA23196054FDC7BE8, 0xCD5EF96A20B31487
Lk_dks9:	// decryption key schedule: invskew x*9
.quad	0xB6116FC87ED9A700, 0x4AED933482255BFC
.quad	0x4576516227143300, 0x8BB89FACE9DAFDCE

Lk_rcon:	// rcon
.quad	0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81

Lk_opt:	// output transform
.quad	0xFF9F4929D6B66000, 0xF7974121DEBE6808
.quad	0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
Lk_deskew:	// deskew tables: inverts the sbox's "skew"
.quad	0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
.quad	0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77

.byte	86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,56,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0
.align	2

.align	6

.text
##
##  _aes_preheat
##
##  Fills register %r10 -> .aes_consts (so you can -fPIC)
##  and %xmm9-%xmm15 as specified below.
##
.def _vpaes_encrypt_preheat
   .type 32
.endef
.align	4
_vpaes_encrypt_preheat:
	adrp	x10, Lk_inv
	add	x10, x10, :lo12:Lk_inv
	movi	v17.16b, #0x0f
	ld1	{v18.2d,v19.2d}, [x10],#32	// Lk_inv
	ld1	{v20.2d,v21.2d,v22.2d,v23.2d}, [x10],#64	// Lk_ipt, Lk_sbo
	ld1	{v24.2d,v25.2d,v26.2d,v27.2d}, [x10]	// Lk_sb1, Lk_sb2
	ret

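// For reference (derived from the loads above, not used by the assembler):
// after _vpaes_encrypt_preheat the encryption paths below expect
//   v17     = 0x0F nibble mask
//   v18-v19 = Lk_inv (inv, inva)
//   v20-v21 = Lk_ipt (input transform lo, hi)
//   v22-v23 = Lk_sbo (sbou, sbot)
//   v24-v25 = Lk_sb1 (sb1u, sb1t)
//   v26-v27 = Lk_sb2 (sb2u, sb2t)
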
##
##  _aes_encrypt_core
##
##  AES-encrypt %xmm0.
##
##  Inputs:
##     %xmm0 = input
##     %xmm9-%xmm15 as in _vpaes_preheat
##    (%rdx) = scheduled keys
##
##  Output in %xmm0
##  Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax
##  Preserves %xmm6 - %xmm8 so you get some local vectors
##
##
.def _vpaes_encrypt_core
   .type 32
.endef
.align	4
_vpaes_encrypt_core:
	mov	x9, x2
	ldr	w8, [x2,#240]			// pull rounds
	adrp	x11, Lk_mc_forward+16
	add	x11, x11, :lo12:Lk_mc_forward+16
	// vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo
	ld1	{v16.2d}, [x9], #16		// vmovdqu (%r9), %xmm5 # round0 key
	and	v1.16b, v7.16b, v17.16b		// vpand %xmm9, %xmm0, %xmm1
	ushr	v0.16b, v7.16b, #4		// vpsrlb $4, %xmm0, %xmm0
	tbl	v1.16b, {v20.16b}, v1.16b	// vpshufb %xmm1, %xmm2, %xmm1
	// vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi
	tbl	v2.16b, {v21.16b}, v0.16b	// vpshufb %xmm0, %xmm3, %xmm2
	eor	v0.16b, v1.16b, v16.16b		// vpxor %xmm5, %xmm1, %xmm0
	eor	v0.16b, v0.16b, v2.16b		// vpxor %xmm2, %xmm0, %xmm0
	b	Lenc_entry

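// A note on the structure below (per the original vpaes design): Lenc_entry
// performs the S-box inversion with 4-bit tbl lookups into the Lk_inv tables
// (v18/v19), and Lenc_loop combines the sb1/sb2 outputs with the
// Lk_mc_forward/Lk_mc_backward permutes to form the 2A+3B+C+D terms noted in
// the comments (the MixColumns mixing). x11 cycles through the four 16-byte
// rows of Lk_mc_forward; the `and x11, x11, #~(1<<6)` at the bottom of the
// loop wraps it, and x10 = x11 + 0x40 picks the matching Lk_mc_backward row.
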
.align	4
Lenc_loop:
	// middle of middle round
	add	x10, x11, #0x40
	tbl	v4.16b, {v25.16b}, v2.16b	// vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
	ld1	{v1.2d}, [x11], #16		// vmovdqa -0x40(%r11,%r10), %xmm1 # Lk_mc_forward[]
	tbl	v0.16b, {v24.16b}, v3.16b	// vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
	eor	v4.16b, v4.16b, v16.16b		// vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
	tbl	v5.16b, {v27.16b}, v2.16b	// vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
	eor	v0.16b, v0.16b, v4.16b		// vpxor %xmm4, %xmm0, %xmm0 # 0 = A
	tbl	v2.16b, {v26.16b}, v3.16b	// vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
	ld1	{v4.2d}, [x10]			// vmovdqa (%r11,%r10), %xmm4 # Lk_mc_backward[]
	tbl	v3.16b, {v0.16b}, v1.16b	// vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
	eor	v2.16b, v2.16b, v5.16b		// vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
	tbl	v0.16b, {v0.16b}, v4.16b	// vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
	eor	v3.16b, v3.16b, v2.16b		// vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
	tbl	v4.16b, {v3.16b}, v1.16b	// vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
	eor	v0.16b, v0.16b, v3.16b		// vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
	and	x11, x11, #~(1<<6)		// and $0x30, %r11 # ... mod 4
	eor	v0.16b, v0.16b, v4.16b		// vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
	sub	w8, w8, #1			// nr--

Lenc_entry:
	// top of round
	and	v1.16b, v0.16b, v17.16b		// vpand %xmm0, %xmm9, %xmm1 # 0 = k
	ushr	v0.16b, v0.16b, #4		// vpsrlb $4, %xmm0, %xmm0 # 1 = i
	tbl	v5.16b, {v19.16b}, v1.16b	// vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
	eor	v1.16b, v1.16b, v0.16b		// vpxor %xmm0, %xmm1, %xmm1 # 0 = j
	tbl	v3.16b, {v18.16b}, v0.16b	// vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
	tbl	v4.16b, {v18.16b}, v1.16b	// vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
	eor	v3.16b, v3.16b, v5.16b		// vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
	eor	v4.16b, v4.16b, v5.16b		// vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
	tbl	v2.16b, {v18.16b}, v3.16b	// vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
	tbl	v3.16b, {v18.16b}, v4.16b	// vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
	eor	v2.16b, v2.16b, v1.16b		// vpxor %xmm1, %xmm2, %xmm2 # 2 = io
	eor	v3.16b, v3.16b, v0.16b		// vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
	ld1	{v16.2d}, [x9],#16		// vmovdqu (%r9), %xmm5
	cbnz	w8, Lenc_loop

	// middle of last round
	add	x10, x11, #0x80
	// vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
	// vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
	tbl	v4.16b, {v22.16b}, v2.16b	// vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
	ld1	{v1.2d}, [x10]			// vmovdqa 0x40(%r11,%r10), %xmm1 # Lk_sr[]
	tbl	v0.16b, {v23.16b}, v3.16b	// vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
	eor	v4.16b, v4.16b, v16.16b		// vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
	eor	v0.16b, v0.16b, v4.16b		// vpxor %xmm4, %xmm0, %xmm0 # 0 = A
	tbl	v0.16b, {v0.16b}, v1.16b	// vpshufb %xmm1, %xmm0, %xmm0
	ret

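// Assumed C prototype (not declared in this file; it mirrors the usual
// BoringSSL/ring declaration and the register use below: x0 = in, x1 = out,
// x2 = key schedule):
//
//   void vpaes_encrypt(const uint8_t in[16], uint8_t out[16],
//                      const AES_KEY *key);
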
.globl vpaes_encrypt

.def vpaes_encrypt
   .type 32
.endef
.align	4
vpaes_encrypt:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ld1	{v7.16b}, [x0]
	bl	_vpaes_encrypt_preheat
	bl	_vpaes_encrypt_core
	st1	{v0.16b}, [x1]

	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret

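// _vpaes_encrypt_2x is the two-block variant of _vpaes_encrypt_core: it
// encrypts v14 and v15 in parallel, mirroring each step of the single-block
// code with a second register set (v8-v13), and returns the results in v0
// and v1. Only the CTR loop at the bottom of this file calls it.
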
.def _vpaes_encrypt_2x
   .type 32
.endef
.align	4
_vpaes_encrypt_2x:
	mov	x9, x2
	ldr	w8, [x2,#240]			// pull rounds
	adrp	x11, Lk_mc_forward+16
	add	x11, x11, :lo12:Lk_mc_forward+16
	// vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo
	ld1	{v16.2d}, [x9], #16		// vmovdqu (%r9), %xmm5 # round0 key
	and	v1.16b, v14.16b, v17.16b	// vpand %xmm9, %xmm0, %xmm1
	ushr	v0.16b, v14.16b, #4		// vpsrlb $4, %xmm0, %xmm0
	and	v9.16b, v15.16b, v17.16b
	ushr	v8.16b, v15.16b, #4
	tbl	v1.16b, {v20.16b}, v1.16b	// vpshufb %xmm1, %xmm2, %xmm1
	tbl	v9.16b, {v20.16b}, v9.16b
	// vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi
	tbl	v2.16b, {v21.16b}, v0.16b	// vpshufb %xmm0, %xmm3, %xmm2
	tbl	v10.16b, {v21.16b}, v8.16b
	eor	v0.16b, v1.16b, v16.16b		// vpxor %xmm5, %xmm1, %xmm0
	eor	v8.16b, v9.16b, v16.16b
	eor	v0.16b, v0.16b, v2.16b		// vpxor %xmm2, %xmm0, %xmm0
	eor	v8.16b, v8.16b, v10.16b
	b	Lenc_2x_entry

.align	4
Lenc_2x_loop:
	// middle of middle round
	add	x10, x11, #0x40
	tbl	v4.16b, {v25.16b}, v2.16b	// vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
	tbl	v12.16b, {v25.16b}, v10.16b
	ld1	{v1.2d}, [x11], #16		// vmovdqa -0x40(%r11,%r10), %xmm1 # Lk_mc_forward[]
	tbl	v0.16b, {v24.16b}, v3.16b	// vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
	tbl	v8.16b, {v24.16b}, v11.16b
	eor	v4.16b, v4.16b, v16.16b		// vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
	eor	v12.16b, v12.16b, v16.16b
	tbl	v5.16b, {v27.16b}, v2.16b	// vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
	tbl	v13.16b, {v27.16b}, v10.16b
	eor	v0.16b, v0.16b, v4.16b		// vpxor %xmm4, %xmm0, %xmm0 # 0 = A
	eor	v8.16b, v8.16b, v12.16b
	tbl	v2.16b, {v26.16b}, v3.16b	// vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
	tbl	v10.16b, {v26.16b}, v11.16b
	ld1	{v4.2d}, [x10]			// vmovdqa (%r11,%r10), %xmm4 # Lk_mc_backward[]
	tbl	v3.16b, {v0.16b}, v1.16b	// vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
	tbl	v11.16b, {v8.16b}, v1.16b
	eor	v2.16b, v2.16b, v5.16b		// vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
	eor	v10.16b, v10.16b, v13.16b
	tbl	v0.16b, {v0.16b}, v4.16b	// vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
	tbl	v8.16b, {v8.16b}, v4.16b
	eor	v3.16b, v3.16b, v2.16b		// vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
	eor	v11.16b, v11.16b, v10.16b
	tbl	v4.16b, {v3.16b}, v1.16b	// vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
	tbl	v12.16b, {v11.16b},v1.16b
	eor	v0.16b, v0.16b, v3.16b		// vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
	eor	v8.16b, v8.16b, v11.16b
	and	x11, x11, #~(1<<6)		// and $0x30, %r11 # ... mod 4
	eor	v0.16b, v0.16b, v4.16b		// vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
	eor	v8.16b, v8.16b, v12.16b
	sub	w8, w8, #1			// nr--

Lenc_2x_entry:
	// top of round
	and	v1.16b, v0.16b, v17.16b		// vpand %xmm0, %xmm9, %xmm1 # 0 = k
	ushr	v0.16b, v0.16b, #4		// vpsrlb $4, %xmm0, %xmm0 # 1 = i
	and	v9.16b, v8.16b, v17.16b
	ushr	v8.16b, v8.16b, #4
	tbl	v5.16b, {v19.16b},v1.16b	// vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
	tbl	v13.16b, {v19.16b},v9.16b
	eor	v1.16b, v1.16b, v0.16b		// vpxor %xmm0, %xmm1, %xmm1 # 0 = j
	eor	v9.16b, v9.16b, v8.16b
	tbl	v3.16b, {v18.16b},v0.16b	// vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
	tbl	v11.16b, {v18.16b},v8.16b
	tbl	v4.16b, {v18.16b},v1.16b	// vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
	tbl	v12.16b, {v18.16b},v9.16b
	eor	v3.16b, v3.16b, v5.16b		// vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
	eor	v11.16b, v11.16b, v13.16b
	eor	v4.16b, v4.16b, v5.16b		// vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
	eor	v12.16b, v12.16b, v13.16b
	tbl	v2.16b, {v18.16b},v3.16b	// vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
	tbl	v10.16b, {v18.16b},v11.16b
	tbl	v3.16b, {v18.16b},v4.16b	// vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
	tbl	v11.16b, {v18.16b},v12.16b
	eor	v2.16b, v2.16b, v1.16b		// vpxor %xmm1, %xmm2, %xmm2 # 2 = io
	eor	v10.16b, v10.16b, v9.16b
	eor	v3.16b, v3.16b, v0.16b		// vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
	eor	v11.16b, v11.16b, v8.16b
	ld1	{v16.2d}, [x9],#16		// vmovdqu (%r9), %xmm5
	cbnz	w8, Lenc_2x_loop

	// middle of last round
	add	x10, x11, #0x80
	// vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
	// vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
	tbl	v4.16b, {v22.16b}, v2.16b	// vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
	tbl	v12.16b, {v22.16b}, v10.16b
	ld1	{v1.2d}, [x10]			// vmovdqa 0x40(%r11,%r10), %xmm1 # Lk_sr[]
	tbl	v0.16b, {v23.16b}, v3.16b	// vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
	tbl	v8.16b, {v23.16b}, v11.16b
	eor	v4.16b, v4.16b, v16.16b		// vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
	eor	v12.16b, v12.16b, v16.16b
	eor	v0.16b, v0.16b, v4.16b		// vpxor %xmm4, %xmm0, %xmm0 # 0 = A
	eor	v8.16b, v8.16b, v12.16b
	tbl	v0.16b, {v0.16b},v1.16b		// vpshufb %xmm1, %xmm0, %xmm0
	tbl	v1.16b, {v8.16b},v1.16b
	ret

########################################################
##                                                    ##
##                  AES key schedule                  ##
##                                                    ##
########################################################
.def _vpaes_key_preheat
   .type 32
.endef
.align	4
_vpaes_key_preheat:
	adrp	x10, Lk_inv
	add	x10, x10, :lo12:Lk_inv
	movi	v16.16b, #0x5b			// Lk_s63
	adrp	x11, Lk_sb1
	add	x11, x11, :lo12:Lk_sb1
	movi	v17.16b, #0x0f			// Lk_s0F
	ld1	{v18.2d,v19.2d,v20.2d,v21.2d}, [x10]	// Lk_inv, Lk_ipt
	adrp	x10, Lk_dksd
	add	x10, x10, :lo12:Lk_dksd
	ld1	{v22.2d,v23.2d}, [x11]		// Lk_sb1
	adrp	x11, Lk_mc_forward
	add	x11, x11, :lo12:Lk_mc_forward
	ld1	{v24.2d,v25.2d,v26.2d,v27.2d}, [x10],#64	// Lk_dksd, Lk_dksb
	ld1	{v28.2d,v29.2d,v30.2d,v31.2d}, [x10],#64	// Lk_dkse, Lk_dks9
	ld1	{v8.2d}, [x10]			// Lk_rcon
	ld1	{v9.2d}, [x11]			// Lk_mc_forward[0]
	ret

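// Register map after _vpaes_key_preheat (derived from the loads above):
//   v16 = 0x5b (Lk_s63), v17 = 0x0F nibble mask
//   v18-v19 = Lk_inv, v20-v21 = Lk_ipt, v22-v23 = Lk_sb1
//   v24-v27 = Lk_dksd/Lk_dksb, v28-v31 = Lk_dkse/Lk_dks9
//   v8 = Lk_rcon, v9 = Lk_mc_forward[0]
//
// _vpaes_schedule_core below expands the user key (w1 = key size in bits)
// into the full set of round keys at [x2]: 11, 13 or 15 keys for 128-, 192-
// or 256-bit keys, counting the transformed round-0 key stored up front.
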
.def _vpaes_schedule_core
   .type 32
.endef
.align	4
_vpaes_schedule_core:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29, x30, [sp,#-16]!
	add	x29,sp,#0

	bl	_vpaes_key_preheat		// load the tables

	ld1	{v0.16b}, [x0],#16		// vmovdqu (%rdi), %xmm0 # load key (unaligned)

	// input transform
	mov	v3.16b, v0.16b			// vmovdqa %xmm0, %xmm3
	bl	_vpaes_schedule_transform
	mov	v7.16b, v0.16b			// vmovdqa %xmm0, %xmm7

	adrp	x10, Lk_sr			// lea Lk_sr(%rip),%r10
	add	x10, x10, :lo12:Lk_sr

	add	x8, x8, x10

	// encrypting, output zeroth round key after transform
	st1	{v0.2d}, [x2]			// vmovdqu %xmm0, (%rdx)

	cmp	w1, #192			// cmp $192, %esi
	b.hi	Lschedule_256
	b.eq	Lschedule_192
	// 128: fall through

##
##  .schedule_128
##
##  128-bit specific part of key schedule.
##
##  This schedule is really simple, because all its parts
##  are accomplished by the subroutines.
##
Lschedule_128:
	mov	x0, #10				// mov $10, %esi

Loop_schedule_128:
	sub	x0, x0, #1			// dec %esi
	bl	_vpaes_schedule_round
	cbz	x0, Lschedule_mangle_last
	bl	_vpaes_schedule_mangle		// write output
	b	Loop_schedule_128

##
##  .aes_schedule_192
##
##  192-bit specific part of key schedule.
##
##  The main body of this schedule is the same as the 128-bit
##  schedule, but with more smearing.  The long, high side is
##  stored in %xmm7 as before, and the short, low side is in
##  the high bits of %xmm6.
##
##  This schedule is somewhat nastier, however, because each
##  round produces 192 bits of key material, or 1.5 round keys.
##  Therefore, on each cycle we do 2 rounds and produce 3 round
##  keys.
##
.align	4
Lschedule_192:
	sub	x0, x0, #8
	ld1	{v0.16b}, [x0]			// vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
	bl	_vpaes_schedule_transform	// input transform
	mov	v6.16b, v0.16b			// vmovdqa %xmm0, %xmm6 # save short part
	eor	v4.16b, v4.16b, v4.16b		// vpxor %xmm4, %xmm4, %xmm4 # clear 4
	ins	v6.d[0], v4.d[0]		// vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros
	mov	x0, #4				// mov $4, %esi

Loop_schedule_192:
	sub	x0, x0, #1			// dec %esi
	bl	_vpaes_schedule_round
	ext	v0.16b, v6.16b, v0.16b, #8	// vpalignr $8,%xmm6,%xmm0,%xmm0
	bl	_vpaes_schedule_mangle		// save key n
	bl	_vpaes_schedule_192_smear
	bl	_vpaes_schedule_mangle		// save key n+1
	bl	_vpaes_schedule_round
	cbz	x0, Lschedule_mangle_last
	bl	_vpaes_schedule_mangle		// save key n+2
	bl	_vpaes_schedule_192_smear
	b	Loop_schedule_192

##
##  .aes_schedule_256
##
##  256-bit specific part of key schedule.
##
##  The structure here is very similar to the 128-bit
##  schedule, but with an additional "low side" in
##  %xmm6.  The low side's rounds are the same as the
##  high side's, except no rcon and no rotation.
##
.align	4
Lschedule_256:
	ld1	{v0.16b}, [x0]			// vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
	bl	_vpaes_schedule_transform	// input transform
	mov	x0, #7				// mov $7, %esi

Loop_schedule_256:
	sub	x0, x0, #1			// dec %esi
	bl	_vpaes_schedule_mangle		// output low result
	mov	v6.16b, v0.16b			// vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6

	// high round
	bl	_vpaes_schedule_round
	cbz	x0, Lschedule_mangle_last
	bl	_vpaes_schedule_mangle

	// low round. swap xmm7 and xmm6
	dup	v0.4s, v0.s[3]			// vpshufd $0xFF, %xmm0, %xmm0
	movi	v4.16b, #0
	mov	v5.16b, v7.16b			// vmovdqa %xmm7, %xmm5
	mov	v7.16b, v6.16b			// vmovdqa %xmm6, %xmm7
	bl	_vpaes_schedule_low_round
	mov	v7.16b, v5.16b			// vmovdqa %xmm5, %xmm7

	b	Loop_schedule_256

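// Note: the only caller of _vpaes_schedule_core in this file,
// vpaes_set_encrypt_key, passes w3 == 0, so Lschedule_mangle_last always
// takes the "encrypting" path below: x11 is repointed from Lk_deskew to
// Lk_opt before execution falls through into Lschedule_mangle_last_dec,
// which reloads v20/v21 for the output transform.
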
##
##  .aes_schedule_mangle_last
##
##  Mangler for last round of key schedule
##  Mangles %xmm0
##    when encrypting, outputs out(%xmm0) ^ 63
##    when decrypting, outputs unskew(%xmm0)
##
##  Always called right before return... jumps to cleanup and exits
##
.align	4
Lschedule_mangle_last:
	// schedule last round key from xmm0
	adrp	x11, Lk_deskew			// lea Lk_deskew(%rip),%r11 # prepare to deskew
	add	x11, x11, :lo12:Lk_deskew

	cbnz	w3, Lschedule_mangle_last_dec

	// encrypting
	ld1	{v1.2d}, [x8]			// vmovdqa (%r8,%r10),%xmm1
	adrp	x11, Lk_opt			// lea Lk_opt(%rip), %r11 # prepare to output transform
	add	x11, x11, :lo12:Lk_opt
	add	x2, x2, #32			// add $32, %rdx
	tbl	v0.16b, {v0.16b}, v1.16b	// vpshufb %xmm1, %xmm0, %xmm0 # output permute

Lschedule_mangle_last_dec:
	ld1	{v20.2d,v21.2d}, [x11]		// reload constants
	sub	x2, x2, #16			// add $-16, %rdx
	eor	v0.16b, v0.16b, v16.16b		// vpxor Lk_s63(%rip), %xmm0, %xmm0
	bl	_vpaes_schedule_transform	// output transform
	st1	{v0.2d}, [x2]			// vmovdqu %xmm0, (%rdx) # save last key

	// cleanup
	eor	v0.16b, v0.16b, v0.16b		// vpxor %xmm0, %xmm0, %xmm0
	eor	v1.16b, v1.16b, v1.16b		// vpxor %xmm1, %xmm1, %xmm1
	eor	v2.16b, v2.16b, v2.16b		// vpxor %xmm2, %xmm2, %xmm2
	eor	v3.16b, v3.16b, v3.16b		// vpxor %xmm3, %xmm3, %xmm3
	eor	v4.16b, v4.16b, v4.16b		// vpxor %xmm4, %xmm4, %xmm4
	eor	v5.16b, v5.16b, v5.16b		// vpxor %xmm5, %xmm5, %xmm5
	eor	v6.16b, v6.16b, v6.16b		// vpxor %xmm6, %xmm6, %xmm6
	eor	v7.16b, v7.16b, v7.16b		// vpxor %xmm7, %xmm7, %xmm7
	ldp	x29, x30, [sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret

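// The x86 instructions quoted in the trailing comments come from the
// vpaes-x86_64 code this was translated from: vpshufb corresponds to tbl,
// vpalignr and vpslldq to ext (the latter with a zeroed register), and the
// pshufd/vmovhlps shuffles to dup/ins lane moves.
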
##
##  .aes_schedule_192_smear
##
##  Smear the short, low side in the 192-bit key schedule.
##
##  Inputs:
##    %xmm7: high side, b a x y
##    %xmm6: low side, d c 0 0
##    %xmm13: 0
##
##  Outputs:
##    %xmm6: b+c+d b+c 0 0
##    %xmm0: b+c+d b+c b a
##
.def _vpaes_schedule_192_smear
   .type 32
.endef
.align	4
_vpaes_schedule_192_smear:
	movi	v1.16b, #0
	dup	v0.4s, v7.s[3]
	ins	v1.s[3], v6.s[2]		// vpshufd $0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0
	ins	v0.s[0], v7.s[2]		// vpshufd $0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a
	eor	v6.16b, v6.16b, v1.16b		// vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0
	eor	v1.16b, v1.16b, v1.16b		// vpxor %xmm1, %xmm1, %xmm1
	eor	v6.16b, v6.16b, v0.16b		// vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a
	mov	v0.16b, v6.16b			// vmovdqa %xmm6, %xmm0
	ins	v6.d[0], v1.d[0]		// vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros
	ret

##
##  .aes_schedule_round
##
##  Runs one main round of the key schedule on %xmm0, %xmm7
##
##  Specifically, runs subbytes on the high dword of %xmm0
##  then rotates it by one byte and xors into the low dword of
##  %xmm7.
##
##  Adds rcon from low byte of %xmm8, then rotates %xmm8 for
##  next rcon.
##
##  Smears the dwords of %xmm7 by xoring the low into the
##  second low, result into third, result into highest.
##
##  Returns results in %xmm7 = %xmm0.
##  Clobbers %xmm1-%xmm4, %r11.
##
.def _vpaes_schedule_round
   .type 32
.endef
.align	4
_vpaes_schedule_round:
	// extract rcon from xmm8
	movi	v4.16b, #0			// vpxor %xmm4, %xmm4, %xmm4
	ext	v1.16b, v8.16b, v4.16b, #15	// vpalignr $15, %xmm8, %xmm4, %xmm1
	ext	v8.16b, v8.16b, v8.16b, #15	// vpalignr $15, %xmm8, %xmm8, %xmm8
	eor	v7.16b, v7.16b, v1.16b		// vpxor %xmm1, %xmm7, %xmm7

	// rotate
	dup	v0.4s, v0.s[3]			// vpshufd $0xFF, %xmm0, %xmm0
	ext	v0.16b, v0.16b, v0.16b, #1	// vpalignr $1, %xmm0, %xmm0, %xmm0

	// fall through...

	// low round: same as high round, but no rotation and no rcon.
_vpaes_schedule_low_round:
	// smear xmm7
	ext	v1.16b, v4.16b, v7.16b, #12	// vpslldq $4, %xmm7, %xmm1
	eor	v7.16b, v7.16b, v1.16b		// vpxor %xmm1, %xmm7, %xmm7
	ext	v4.16b, v4.16b, v7.16b, #8	// vpslldq $8, %xmm7, %xmm4

	// subbytes
	and	v1.16b, v0.16b, v17.16b		// vpand %xmm9, %xmm0, %xmm1 # 0 = k
	ushr	v0.16b, v0.16b, #4		// vpsrlb $4, %xmm0, %xmm0 # 1 = i
	eor	v7.16b, v7.16b, v4.16b		// vpxor %xmm4, %xmm7, %xmm7
	tbl	v2.16b, {v19.16b}, v1.16b	// vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
	eor	v1.16b, v1.16b, v0.16b		// vpxor %xmm0, %xmm1, %xmm1 # 0 = j
	tbl	v3.16b, {v18.16b}, v0.16b	// vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
	eor	v3.16b, v3.16b, v2.16b		// vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
	tbl	v4.16b, {v18.16b}, v1.16b	// vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
	eor	v7.16b, v7.16b, v16.16b		// vpxor Lk_s63(%rip), %xmm7, %xmm7
	tbl	v3.16b, {v18.16b}, v3.16b	// vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak
	eor	v4.16b, v4.16b, v2.16b		// vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
	tbl	v2.16b, {v18.16b}, v4.16b	// vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak
	eor	v3.16b, v3.16b, v1.16b		// vpxor %xmm1, %xmm3, %xmm3 # 2 = io
	eor	v2.16b, v2.16b, v0.16b		// vpxor %xmm0, %xmm2, %xmm2 # 3 = jo
	tbl	v4.16b, {v23.16b}, v3.16b	// vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou
	tbl	v1.16b, {v22.16b}, v2.16b	// vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t
	eor	v1.16b, v1.16b, v4.16b		// vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output

	// add in smeared stuff
	eor	v0.16b, v1.16b, v7.16b		// vpxor %xmm7, %xmm1, %xmm0
	eor	v7.16b, v1.16b, v7.16b		// vmovdqa %xmm0, %xmm7
	ret

##
##  .aes_schedule_transform
##
##  Linear-transform %xmm0 according to tables at (%r11)
##
##  Requires that %xmm9 = 0x0F0F... as in preheat
##  Output in %xmm0
##  Clobbers %xmm1, %xmm2
##
.def _vpaes_schedule_transform
   .type 32
.endef
.align	4
_vpaes_schedule_transform:
	and	v1.16b, v0.16b, v17.16b		// vpand %xmm9, %xmm0, %xmm1
	ushr	v0.16b, v0.16b, #4		// vpsrlb $4, %xmm0, %xmm0
	// vmovdqa (%r11), %xmm2 # lo
	tbl	v2.16b, {v20.16b}, v1.16b	// vpshufb %xmm1, %xmm2, %xmm2
	// vmovdqa 16(%r11), %xmm1 # hi
	tbl	v0.16b, {v21.16b}, v0.16b	// vpshufb %xmm0, %xmm1, %xmm0
	eor	v0.16b, v0.16b, v2.16b		// vpxor %xmm2, %xmm0, %xmm0
	ret

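// _vpaes_schedule_transform above uses whichever table pair is currently in
// v20/v21: Lk_ipt (loaded by _vpaes_key_preheat) for the input transform,
// and Lk_opt (reloaded from [x11] in Lschedule_mangle_last_dec) for the
// final output transform.
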
##
##  .aes_schedule_mangle
##
##  Mangle xmm0 from (basis-transformed) standard version
##  to our version.
##
##  On encrypt,
##    xor with 0x63
##    multiply by circulant 0,1,1,1
##    apply shiftrows transform
##
##  On decrypt,
##    xor with 0x63
##    multiply by "inverse mixcolumns" circulant E,B,D,9
##    deskew
##    apply shiftrows transform
##
##
##  Writes out to (%rdx), and increments or decrements it
##  Keeps track of round number mod 4 in %r8
##  Preserves xmm0
##  Clobbers xmm1-xmm5
##
.def _vpaes_schedule_mangle
   .type 32
.endef
.align	4
_vpaes_schedule_mangle:
	mov	v4.16b, v0.16b			// vmovdqa %xmm0, %xmm4 # save xmm0 for later
	// vmovdqa .Lk_mc_forward(%rip),%xmm5

	// encrypting
	eor	v4.16b, v0.16b, v16.16b		// vpxor Lk_s63(%rip), %xmm0, %xmm4
	add	x2, x2, #16			// add $16, %rdx
	tbl	v4.16b, {v4.16b}, v9.16b	// vpshufb %xmm5, %xmm4, %xmm4
	tbl	v1.16b, {v4.16b}, v9.16b	// vpshufb %xmm5, %xmm4, %xmm1
	tbl	v3.16b, {v1.16b}, v9.16b	// vpshufb %xmm5, %xmm1, %xmm3
	eor	v4.16b, v4.16b, v1.16b		// vpxor %xmm1, %xmm4, %xmm4
	ld1	{v1.2d}, [x8]			// vmovdqa (%r8,%r10), %xmm1
	eor	v3.16b, v3.16b, v4.16b		// vpxor %xmm4, %xmm3, %xmm3

Lschedule_mangle_both:
	tbl	v3.16b, {v3.16b}, v1.16b	// vpshufb %xmm1, %xmm3, %xmm3
	add	x8, x8, #48			// add $-16, %r8
	and	x8, x8, #~(1<<6)		// and $0x30, %r8
	st1	{v3.2d}, [x2]			// vmovdqu %xmm3, (%rdx)
	ret

.globl vpaes_set_encrypt_key

.def vpaes_set_encrypt_key
   .type 32
.endef
.align	4
vpaes_set_encrypt_key:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#-16]!		// ABI spec says so

	lsr	w9, w1, #5			// shr $5,%eax
	add	w9, w9, #5			// $5,%eax
	str	w9, [x2,#240]			// mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;

	mov	w3, #0				// mov $0,%ecx
	mov	x8, #0x30			// mov $0x30,%r8d
	bl	_vpaes_schedule_core
	eor	x0, x0, x0

	ldp	d8,d9,[sp],#16
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret

.globl vpaes_ctr32_encrypt_blocks

.def vpaes_ctr32_encrypt_blocks
   .type 32
.endef
.align	4
vpaes_ctr32_encrypt_blocks:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#-16]!		// ABI spec says so
	stp	d10,d11,[sp,#-16]!
	stp	d12,d13,[sp,#-16]!
	stp	d14,d15,[sp,#-16]!

	cbz	x2, Lctr32_done

	// Note, unlike the other functions, x2 here is measured in blocks,
	// not bytes.
	mov	x17, x2
	mov	x2, x3

	// Load the IV and counter portion.
	ldr	w6, [x4, #12]
	ld1	{v7.16b}, [x4]

	bl	_vpaes_encrypt_preheat
	tst	x17, #1
	rev	w6, w6				// The counter is big-endian.
	b.eq	Lctr32_prep_loop

	// Handle one block so the remaining block count is even for
	// _vpaes_encrypt_2x.
	ld1	{v6.16b}, [x0], #16		// Load input ahead of time
	bl	_vpaes_encrypt_core
	eor	v0.16b, v0.16b, v6.16b		// XOR input and result
	st1	{v0.16b}, [x1], #16
	subs	x17, x17, #1
	// Update the counter.
	add	w6, w6, #1
	rev	w7, w6
	mov	v7.s[3], w7
	b.ls	Lctr32_done

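// Only the last 32-bit word of the IV is used as a (big-endian) counter,
// hence the ctr32 name; w6 tracks it in native byte order. Each pass of
// Lctr32_loop below encrypts two counter blocks, v14 (block n) and
// v15 (block n+1), with _vpaes_encrypt_2x and XORs the results with the
// corresponding input blocks.
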
Lctr32_prep_loop:
	// _vpaes_encrypt_core takes its input from v7, while _vpaes_encrypt_2x
	// uses v14 and v15.
	mov	v15.16b, v7.16b
	mov	v14.16b, v7.16b
	add	w6, w6, #1
	rev	w7, w6
	mov	v15.s[3], w7

Lctr32_loop:
	ld1	{v6.16b,v7.16b}, [x0], #32	// Load input ahead of time
	bl	_vpaes_encrypt_2x
	eor	v0.16b, v0.16b, v6.16b		// XOR input and result
	eor	v1.16b, v1.16b, v7.16b		// XOR input and result (#2)
	st1	{v0.16b,v1.16b}, [x1], #32
	subs	x17, x17, #2
	// Update the counter.
	add	w7, w6, #1
	add	w6, w6, #2
	rev	w7, w7
	mov	v14.s[3], w7
	rev	w7, w6
	mov	v15.s[3], w7
	b.hi	Lctr32_loop

Lctr32_done:
	ldp	d14,d15,[sp],#16
	ldp	d12,d13,[sp],#16
	ldp	d10,d11,[sp],#16
	ldp	d8,d9,[sp],#16
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret

#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32)