1// This file is generated from a similarly-named Perl script in the BoringSSL 2// source tree. Do not edit by hand. 3 4#include <openssl/asm_base.h> 5 6#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__) 7#include <openssl/arm_arch.h> 8 9.section .rodata 10 11.type _vpaes_consts,%object 12.align 7 // totally strategic alignment 13_vpaes_consts: 14.Lk_mc_forward: // mc_forward 15.quad 0x0407060500030201, 0x0C0F0E0D080B0A09 16.quad 0x080B0A0904070605, 0x000302010C0F0E0D 17.quad 0x0C0F0E0D080B0A09, 0x0407060500030201 18.quad 0x000302010C0F0E0D, 0x080B0A0904070605 19.Lk_mc_backward: // mc_backward 20.quad 0x0605040702010003, 0x0E0D0C0F0A09080B 21.quad 0x020100030E0D0C0F, 0x0A09080B06050407 22.quad 0x0E0D0C0F0A09080B, 0x0605040702010003 23.quad 0x0A09080B06050407, 0x020100030E0D0C0F 24.Lk_sr: // sr 25.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 26.quad 0x030E09040F0A0500, 0x0B06010C07020D08 27.quad 0x0F060D040B020900, 0x070E050C030A0108 28.quad 0x0B0E0104070A0D00, 0x0306090C0F020508 29 30// 31// "Hot" constants 32// 33.Lk_inv: // inv, inva 34.quad 0x0E05060F0D080180, 0x040703090A0B0C02 35.quad 0x01040A060F0B0780, 0x030D0E0C02050809 36.Lk_ipt: // input transform (lo, hi) 37.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 38.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 39.Lk_sbo: // sbou, sbot 40.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 41.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA 42.Lk_sb1: // sb1u, sb1t 43.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF 44.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 45.Lk_sb2: // sb2u, sb2t 46.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A 47.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD 48 49// 50// Decryption stuff 51// 52.Lk_dipt: // decryption input transform 53.quad 0x0F505B040B545F00, 0x154A411E114E451A 54.quad 0x86E383E660056500, 0x12771772F491F194 55.Lk_dsbo: // decryption sbox final output 56.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D 57.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C 58.Lk_dsb9: // decryption sbox output *9*u, *9*t 59.quad 0x851C03539A86D600, 0xCAD51F504F994CC9 60.quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565 61.Lk_dsbd: // decryption sbox output *D*u, *D*t 62.quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439 63.quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3 64.Lk_dsbb: // decryption sbox output *B*u, *B*t 65.quad 0xD022649296B44200, 0x602646F6B0F2D404 66.quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B 67.Lk_dsbe: // decryption sbox output *E*u, *E*t 68.quad 0x46F2929626D4D000, 0x2242600464B4F6B0 69.quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32 70 71// 72// Key schedule constants 73// 74.Lk_dksd: // decryption key schedule: invskew x*D 75.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9 76.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E 77.Lk_dksb: // decryption key schedule: invskew x*B 78.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99 79.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8 80.Lk_dkse: // decryption key schedule: invskew x*E + 0x63 81.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086 82.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487 83.Lk_dks9: // decryption key schedule: invskew x*9 84.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC 85.quad 0x4576516227143300, 0x8BB89FACE9DAFDCE 86 87.Lk_rcon: // rcon 88.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 89 90.Lk_opt: // output transform 91.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 92.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 93.Lk_deskew: // deskew tables: inverts the sbox's "skew" 94.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A 95.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 96 97.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,56,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0 98.align 2 99.size _vpaes_consts,.-_vpaes_consts 100.align 6 101 102.text 103## 104## _aes_preheat 105## 106## Fills register %r10 -> .aes_consts (so you can -fPIC) 107## and %xmm9-%xmm15 as specified below. 108## 109.type _vpaes_encrypt_preheat,%function 110.align 4 111_vpaes_encrypt_preheat: 112 adrp x10, .Lk_inv 113 add x10, x10, :lo12:.Lk_inv 114 movi v17.16b, #0x0f 115 ld1 {v18.2d,v19.2d}, [x10],#32 // .Lk_inv 116 ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x10],#64 // .Lk_ipt, .Lk_sbo 117 ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10] // .Lk_sb1, .Lk_sb2 118 ret 119.size _vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat 120 121## 122## _aes_encrypt_core 123## 124## AES-encrypt %xmm0. 125## 126## Inputs: 127## %xmm0 = input 128## %xmm9-%xmm15 as in _vpaes_preheat 129## (%rdx) = scheduled keys 130## 131## Output in %xmm0 132## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax 133## Preserves %xmm6 - %xmm8 so you get some local vectors 134## 135## 136.type _vpaes_encrypt_core,%function 137.align 4 138_vpaes_encrypt_core: 139 mov x9, x2 140 ldr w8, [x2,#240] // pull rounds 141 adrp x11, .Lk_mc_forward+16 142 add x11, x11, :lo12:.Lk_mc_forward+16 143 // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo 144 ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key 145 and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 146 ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0 147 tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1 148 // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi 149 tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2 150 eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0 151 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 152 b .Lenc_entry 153 154.align 4 155.Lenc_loop: 156 // middle of middle round 157 add x10, x11, #0x40 158 tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u 159 ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] 160 tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t 161 eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k 162 tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u 163 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A 164 tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t 165 ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] 166 tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B 167 eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A 168 tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D 169 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B 170 tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C 171 eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D 172 and x11, x11, #~(1<<6) // and $0x30, %r11 # ... mod 4 173 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D 174 sub w8, w8, #1 // nr-- 175 176.Lenc_entry: 177 // top of round 178 and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k 179 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i 180 tbl v5.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k 181 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j 182 tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i 183 tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j 184 eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k 185 eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k 186 tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak 187 tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak 188 eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io 189 eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo 190 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 191 cbnz w8, .Lenc_loop 192 193 // middle of last round 194 add x10, x11, #0x80 195 // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo 196 // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 197 tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou 198 ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] 199 tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t 200 eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k 201 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A 202 tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 203 ret 204.size _vpaes_encrypt_core,.-_vpaes_encrypt_core 205 206.globl vpaes_encrypt 207.hidden vpaes_encrypt 208.type vpaes_encrypt,%function 209.align 4 210vpaes_encrypt: 211 AARCH64_SIGN_LINK_REGISTER 212 stp x29,x30,[sp,#-16]! 213 add x29,sp,#0 214 215 ld1 {v7.16b}, [x0] 216 bl _vpaes_encrypt_preheat 217 bl _vpaes_encrypt_core 218 st1 {v0.16b}, [x1] 219 220 ldp x29,x30,[sp],#16 221 AARCH64_VALIDATE_LINK_REGISTER 222 ret 223.size vpaes_encrypt,.-vpaes_encrypt 224 225.type _vpaes_encrypt_2x,%function 226.align 4 227_vpaes_encrypt_2x: 228 mov x9, x2 229 ldr w8, [x2,#240] // pull rounds 230 adrp x11, .Lk_mc_forward+16 231 add x11, x11, :lo12:.Lk_mc_forward+16 232 // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo 233 ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key 234 and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 235 ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0 236 and v9.16b, v15.16b, v17.16b 237 ushr v8.16b, v15.16b, #4 238 tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1 239 tbl v9.16b, {v20.16b}, v9.16b 240 // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi 241 tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2 242 tbl v10.16b, {v21.16b}, v8.16b 243 eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0 244 eor v8.16b, v9.16b, v16.16b 245 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 246 eor v8.16b, v8.16b, v10.16b 247 b .Lenc_2x_entry 248 249.align 4 250.Lenc_2x_loop: 251 // middle of middle round 252 add x10, x11, #0x40 253 tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u 254 tbl v12.16b, {v25.16b}, v10.16b 255 ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] 256 tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t 257 tbl v8.16b, {v24.16b}, v11.16b 258 eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k 259 eor v12.16b, v12.16b, v16.16b 260 tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u 261 tbl v13.16b, {v27.16b}, v10.16b 262 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A 263 eor v8.16b, v8.16b, v12.16b 264 tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t 265 tbl v10.16b, {v26.16b}, v11.16b 266 ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] 267 tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B 268 tbl v11.16b, {v8.16b}, v1.16b 269 eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A 270 eor v10.16b, v10.16b, v13.16b 271 tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D 272 tbl v8.16b, {v8.16b}, v4.16b 273 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B 274 eor v11.16b, v11.16b, v10.16b 275 tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C 276 tbl v12.16b, {v11.16b},v1.16b 277 eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D 278 eor v8.16b, v8.16b, v11.16b 279 and x11, x11, #~(1<<6) // and $0x30, %r11 # ... mod 4 280 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D 281 eor v8.16b, v8.16b, v12.16b 282 sub w8, w8, #1 // nr-- 283 284.Lenc_2x_entry: 285 // top of round 286 and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k 287 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i 288 and v9.16b, v8.16b, v17.16b 289 ushr v8.16b, v8.16b, #4 290 tbl v5.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k 291 tbl v13.16b, {v19.16b},v9.16b 292 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j 293 eor v9.16b, v9.16b, v8.16b 294 tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i 295 tbl v11.16b, {v18.16b},v8.16b 296 tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j 297 tbl v12.16b, {v18.16b},v9.16b 298 eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k 299 eor v11.16b, v11.16b, v13.16b 300 eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k 301 eor v12.16b, v12.16b, v13.16b 302 tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak 303 tbl v10.16b, {v18.16b},v11.16b 304 tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak 305 tbl v11.16b, {v18.16b},v12.16b 306 eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io 307 eor v10.16b, v10.16b, v9.16b 308 eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo 309 eor v11.16b, v11.16b, v8.16b 310 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 311 cbnz w8, .Lenc_2x_loop 312 313 // middle of last round 314 add x10, x11, #0x80 315 // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo 316 // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 317 tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou 318 tbl v12.16b, {v22.16b}, v10.16b 319 ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] 320 tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t 321 tbl v8.16b, {v23.16b}, v11.16b 322 eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k 323 eor v12.16b, v12.16b, v16.16b 324 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A 325 eor v8.16b, v8.16b, v12.16b 326 tbl v0.16b, {v0.16b},v1.16b // vpshufb %xmm1, %xmm0, %xmm0 327 tbl v1.16b, {v8.16b},v1.16b 328 ret 329.size _vpaes_encrypt_2x,.-_vpaes_encrypt_2x 330 331.type _vpaes_decrypt_preheat,%function 332.align 4 333_vpaes_decrypt_preheat: 334 adrp x10, .Lk_inv 335 add x10, x10, :lo12:.Lk_inv 336 movi v17.16b, #0x0f 337 adrp x11, .Lk_dipt 338 add x11, x11, :lo12:.Lk_dipt 339 ld1 {v18.2d,v19.2d}, [x10],#32 // .Lk_inv 340 ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x11],#64 // .Lk_dipt, .Lk_dsbo 341 ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x11],#64 // .Lk_dsb9, .Lk_dsbd 342 ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x11] // .Lk_dsbb, .Lk_dsbe 343 ret 344.size _vpaes_decrypt_preheat,.-_vpaes_decrypt_preheat 345 346## 347## Decryption core 348## 349## Same API as encryption core. 350## 351.type _vpaes_decrypt_core,%function 352.align 4 353_vpaes_decrypt_core: 354 mov x9, x2 355 ldr w8, [x2,#240] // pull rounds 356 357 // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo 358 lsl x11, x8, #4 // mov %rax, %r11; shl $4, %r11 359 eor x11, x11, #0x30 // xor $0x30, %r11 360 adrp x10, .Lk_sr 361 add x10, x10, :lo12:.Lk_sr 362 and x11, x11, #0x30 // and $0x30, %r11 363 add x11, x11, x10 364 adrp x10, .Lk_mc_forward+48 365 add x10, x10, :lo12:.Lk_mc_forward+48 366 367 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key 368 and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 369 ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0 370 tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2 371 ld1 {v5.2d}, [x10] // vmovdqa .Lk_mc_forward+48(%rip), %xmm5 372 // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi 373 tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0 374 eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2 375 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 376 b .Ldec_entry 377 378.align 4 379.Ldec_loop: 380// 381// Inverse mix columns 382// 383 // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u 384 // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t 385 tbl v4.16b, {v24.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u 386 tbl v1.16b, {v25.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t 387 eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0 388 // vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu 389 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch 390 // vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt 391 392 tbl v4.16b, {v26.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu 393 tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch 394 tbl v1.16b, {v27.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt 395 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch 396 // vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu 397 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch 398 // vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt 399 400 tbl v4.16b, {v28.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu 401 tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch 402 tbl v1.16b, {v29.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt 403 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch 404 // vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu 405 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch 406 // vmovdqa 0x50(%r10), %xmm1 # 0 : sbet 407 408 tbl v4.16b, {v30.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu 409 tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch 410 tbl v1.16b, {v31.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet 411 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch 412 ext v5.16b, v5.16b, v5.16b, #12 // vpalignr $12, %xmm5, %xmm5, %xmm5 413 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch 414 sub w8, w8, #1 // sub $1,%rax # nr-- 415 416.Ldec_entry: 417 // top of round 418 and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k 419 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i 420 tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k 421 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j 422 tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i 423 tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j 424 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k 425 eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k 426 tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak 427 tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak 428 eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io 429 eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo 430 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0 431 cbnz w8, .Ldec_loop 432 433 // middle of last round 434 // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou 435 tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou 436 // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot 437 ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160 438 tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t 439 eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k 440 eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A 441 tbl v0.16b, {v0.16b}, v2.16b // vpshufb %xmm2, %xmm0, %xmm0 442 ret 443.size _vpaes_decrypt_core,.-_vpaes_decrypt_core 444 445.globl vpaes_decrypt 446.hidden vpaes_decrypt 447.type vpaes_decrypt,%function 448.align 4 449vpaes_decrypt: 450 AARCH64_SIGN_LINK_REGISTER 451 stp x29,x30,[sp,#-16]! 452 add x29,sp,#0 453 454 ld1 {v7.16b}, [x0] 455 bl _vpaes_decrypt_preheat 456 bl _vpaes_decrypt_core 457 st1 {v0.16b}, [x1] 458 459 ldp x29,x30,[sp],#16 460 AARCH64_VALIDATE_LINK_REGISTER 461 ret 462.size vpaes_decrypt,.-vpaes_decrypt 463 464// v14-v15 input, v0-v1 output 465.type _vpaes_decrypt_2x,%function 466.align 4 467_vpaes_decrypt_2x: 468 mov x9, x2 469 ldr w8, [x2,#240] // pull rounds 470 471 // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo 472 lsl x11, x8, #4 // mov %rax, %r11; shl $4, %r11 473 eor x11, x11, #0x30 // xor $0x30, %r11 474 adrp x10, .Lk_sr 475 add x10, x10, :lo12:.Lk_sr 476 and x11, x11, #0x30 // and $0x30, %r11 477 add x11, x11, x10 478 adrp x10, .Lk_mc_forward+48 479 add x10, x10, :lo12:.Lk_mc_forward+48 480 481 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key 482 and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 483 ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0 484 and v9.16b, v15.16b, v17.16b 485 ushr v8.16b, v15.16b, #4 486 tbl v2.16b, {v20.16b},v1.16b // vpshufb %xmm1, %xmm2, %xmm2 487 tbl v10.16b, {v20.16b},v9.16b 488 ld1 {v5.2d}, [x10] // vmovdqa .Lk_mc_forward+48(%rip), %xmm5 489 // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi 490 tbl v0.16b, {v21.16b},v0.16b // vpshufb %xmm0, %xmm1, %xmm0 491 tbl v8.16b, {v21.16b},v8.16b 492 eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2 493 eor v10.16b, v10.16b, v16.16b 494 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 495 eor v8.16b, v8.16b, v10.16b 496 b .Ldec_2x_entry 497 498.align 4 499.Ldec_2x_loop: 500// 501// Inverse mix columns 502// 503 // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u 504 // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t 505 tbl v4.16b, {v24.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u 506 tbl v12.16b, {v24.16b}, v10.16b 507 tbl v1.16b, {v25.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t 508 tbl v9.16b, {v25.16b}, v11.16b 509 eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0 510 eor v8.16b, v12.16b, v16.16b 511 // vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu 512 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch 513 eor v8.16b, v8.16b, v9.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch 514 // vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt 515 516 tbl v4.16b, {v26.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu 517 tbl v12.16b, {v26.16b}, v10.16b 518 tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch 519 tbl v8.16b, {v8.16b},v5.16b 520 tbl v1.16b, {v27.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt 521 tbl v9.16b, {v27.16b}, v11.16b 522 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch 523 eor v8.16b, v8.16b, v12.16b 524 // vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu 525 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch 526 eor v8.16b, v8.16b, v9.16b 527 // vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt 528 529 tbl v4.16b, {v28.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu 530 tbl v12.16b, {v28.16b}, v10.16b 531 tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch 532 tbl v8.16b, {v8.16b},v5.16b 533 tbl v1.16b, {v29.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt 534 tbl v9.16b, {v29.16b}, v11.16b 535 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch 536 eor v8.16b, v8.16b, v12.16b 537 // vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu 538 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch 539 eor v8.16b, v8.16b, v9.16b 540 // vmovdqa 0x50(%r10), %xmm1 # 0 : sbet 541 542 tbl v4.16b, {v30.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu 543 tbl v12.16b, {v30.16b}, v10.16b 544 tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch 545 tbl v8.16b, {v8.16b},v5.16b 546 tbl v1.16b, {v31.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet 547 tbl v9.16b, {v31.16b}, v11.16b 548 eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch 549 eor v8.16b, v8.16b, v12.16b 550 ext v5.16b, v5.16b, v5.16b, #12 // vpalignr $12, %xmm5, %xmm5, %xmm5 551 eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch 552 eor v8.16b, v8.16b, v9.16b 553 sub w8, w8, #1 // sub $1,%rax # nr-- 554 555.Ldec_2x_entry: 556 // top of round 557 and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k 558 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i 559 and v9.16b, v8.16b, v17.16b 560 ushr v8.16b, v8.16b, #4 561 tbl v2.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k 562 tbl v10.16b, {v19.16b},v9.16b 563 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j 564 eor v9.16b, v9.16b, v8.16b 565 tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i 566 tbl v11.16b, {v18.16b},v8.16b 567 tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j 568 tbl v12.16b, {v18.16b},v9.16b 569 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k 570 eor v11.16b, v11.16b, v10.16b 571 eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k 572 eor v12.16b, v12.16b, v10.16b 573 tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak 574 tbl v10.16b, {v18.16b},v11.16b 575 tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak 576 tbl v11.16b, {v18.16b},v12.16b 577 eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io 578 eor v10.16b, v10.16b, v9.16b 579 eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo 580 eor v11.16b, v11.16b, v8.16b 581 ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0 582 cbnz w8, .Ldec_2x_loop 583 584 // middle of last round 585 // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou 586 tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou 587 tbl v12.16b, {v22.16b}, v10.16b 588 // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot 589 tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t 590 tbl v9.16b, {v23.16b}, v11.16b 591 ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160 592 eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k 593 eor v12.16b, v12.16b, v16.16b 594 eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A 595 eor v8.16b, v9.16b, v12.16b 596 tbl v0.16b, {v0.16b},v2.16b // vpshufb %xmm2, %xmm0, %xmm0 597 tbl v1.16b, {v8.16b},v2.16b 598 ret 599.size _vpaes_decrypt_2x,.-_vpaes_decrypt_2x 600######################################################## 601## ## 602## AES key schedule ## 603## ## 604######################################################## 605.type _vpaes_key_preheat,%function 606.align 4 607_vpaes_key_preheat: 608 adrp x10, .Lk_inv 609 add x10, x10, :lo12:.Lk_inv 610 movi v16.16b, #0x5b // .Lk_s63 611 adrp x11, .Lk_sb1 612 add x11, x11, :lo12:.Lk_sb1 613 movi v17.16b, #0x0f // .Lk_s0F 614 ld1 {v18.2d,v19.2d,v20.2d,v21.2d}, [x10] // .Lk_inv, .Lk_ipt 615 adrp x10, .Lk_dksd 616 add x10, x10, :lo12:.Lk_dksd 617 ld1 {v22.2d,v23.2d}, [x11] // .Lk_sb1 618 adrp x11, .Lk_mc_forward 619 add x11, x11, :lo12:.Lk_mc_forward 620 ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10],#64 // .Lk_dksd, .Lk_dksb 621 ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x10],#64 // .Lk_dkse, .Lk_dks9 622 ld1 {v8.2d}, [x10] // .Lk_rcon 623 ld1 {v9.2d}, [x11] // .Lk_mc_forward[0] 624 ret 625.size _vpaes_key_preheat,.-_vpaes_key_preheat 626 627.type _vpaes_schedule_core,%function 628.align 4 629_vpaes_schedule_core: 630 AARCH64_SIGN_LINK_REGISTER 631 stp x29, x30, [sp,#-16]! 632 add x29,sp,#0 633 634 bl _vpaes_key_preheat // load the tables 635 636 ld1 {v0.16b}, [x0],#16 // vmovdqu (%rdi), %xmm0 # load key (unaligned) 637 638 // input transform 639 mov v3.16b, v0.16b // vmovdqa %xmm0, %xmm3 640 bl _vpaes_schedule_transform 641 mov v7.16b, v0.16b // vmovdqa %xmm0, %xmm7 642 643 adrp x10, .Lk_sr // lea .Lk_sr(%rip),%r10 644 add x10, x10, :lo12:.Lk_sr 645 646 add x8, x8, x10 647 cbnz w3, .Lschedule_am_decrypting 648 649 // encrypting, output zeroth round key after transform 650 st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx) 651 b .Lschedule_go 652 653.Lschedule_am_decrypting: 654 // decrypting, output zeroth round key after shiftrows 655 ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 656 tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 657 st1 {v3.2d}, [x2] // vmovdqu %xmm3, (%rdx) 658 eor x8, x8, #0x30 // xor $0x30, %r8 659 660.Lschedule_go: 661 cmp w1, #192 // cmp $192, %esi 662 b.hi .Lschedule_256 663 b.eq .Lschedule_192 664 // 128: fall though 665 666## 667## .schedule_128 668## 669## 128-bit specific part of key schedule. 670## 671## This schedule is really simple, because all its parts 672## are accomplished by the subroutines. 673## 674.Lschedule_128: 675 mov x0, #10 // mov $10, %esi 676 677.Loop_schedule_128: 678 sub x0, x0, #1 // dec %esi 679 bl _vpaes_schedule_round 680 cbz x0, .Lschedule_mangle_last 681 bl _vpaes_schedule_mangle // write output 682 b .Loop_schedule_128 683 684## 685## .aes_schedule_192 686## 687## 192-bit specific part of key schedule. 688## 689## The main body of this schedule is the same as the 128-bit 690## schedule, but with more smearing. The long, high side is 691## stored in %xmm7 as before, and the short, low side is in 692## the high bits of %xmm6. 693## 694## This schedule is somewhat nastier, however, because each 695## round produces 192 bits of key material, or 1.5 round keys. 696## Therefore, on each cycle we do 2 rounds and produce 3 round 697## keys. 698## 699.align 4 700.Lschedule_192: 701 sub x0, x0, #8 702 ld1 {v0.16b}, [x0] // vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned) 703 bl _vpaes_schedule_transform // input transform 704 mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save short part 705 eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 # clear 4 706 ins v6.d[0], v4.d[0] // vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros 707 mov x0, #4 // mov $4, %esi 708 709.Loop_schedule_192: 710 sub x0, x0, #1 // dec %esi 711 bl _vpaes_schedule_round 712 ext v0.16b, v6.16b, v0.16b, #8 // vpalignr $8,%xmm6,%xmm0,%xmm0 713 bl _vpaes_schedule_mangle // save key n 714 bl _vpaes_schedule_192_smear 715 bl _vpaes_schedule_mangle // save key n+1 716 bl _vpaes_schedule_round 717 cbz x0, .Lschedule_mangle_last 718 bl _vpaes_schedule_mangle // save key n+2 719 bl _vpaes_schedule_192_smear 720 b .Loop_schedule_192 721 722## 723## .aes_schedule_256 724## 725## 256-bit specific part of key schedule. 726## 727## The structure here is very similar to the 128-bit 728## schedule, but with an additional "low side" in 729## %xmm6. The low side's rounds are the same as the 730## high side's, except no rcon and no rotation. 731## 732.align 4 733.Lschedule_256: 734 ld1 {v0.16b}, [x0] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) 735 bl _vpaes_schedule_transform // input transform 736 mov x0, #7 // mov $7, %esi 737 738.Loop_schedule_256: 739 sub x0, x0, #1 // dec %esi 740 bl _vpaes_schedule_mangle // output low result 741 mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6 742 743 // high round 744 bl _vpaes_schedule_round 745 cbz x0, .Lschedule_mangle_last 746 bl _vpaes_schedule_mangle 747 748 // low round. swap xmm7 and xmm6 749 dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0 750 movi v4.16b, #0 751 mov v5.16b, v7.16b // vmovdqa %xmm7, %xmm5 752 mov v7.16b, v6.16b // vmovdqa %xmm6, %xmm7 753 bl _vpaes_schedule_low_round 754 mov v7.16b, v5.16b // vmovdqa %xmm5, %xmm7 755 756 b .Loop_schedule_256 757 758## 759## .aes_schedule_mangle_last 760## 761## Mangler for last round of key schedule 762## Mangles %xmm0 763## when encrypting, outputs out(%xmm0) ^ 63 764## when decrypting, outputs unskew(%xmm0) 765## 766## Always called right before return... jumps to cleanup and exits 767## 768.align 4 769.Lschedule_mangle_last: 770 // schedule last round key from xmm0 771 adrp x11, .Lk_deskew // lea .Lk_deskew(%rip),%r11 # prepare to deskew 772 add x11, x11, :lo12:.Lk_deskew 773 774 cbnz w3, .Lschedule_mangle_last_dec 775 776 // encrypting 777 ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10),%xmm1 778 adrp x11, .Lk_opt // lea .Lk_opt(%rip), %r11 # prepare to output transform 779 add x11, x11, :lo12:.Lk_opt 780 add x2, x2, #32 // add $32, %rdx 781 tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 # output permute 782 783.Lschedule_mangle_last_dec: 784 ld1 {v20.2d,v21.2d}, [x11] // reload constants 785 sub x2, x2, #16 // add $-16, %rdx 786 eor v0.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm0 787 bl _vpaes_schedule_transform // output transform 788 st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx) # save last key 789 790 // cleanup 791 eor v0.16b, v0.16b, v0.16b // vpxor %xmm0, %xmm0, %xmm0 792 eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1 793 eor v2.16b, v2.16b, v2.16b // vpxor %xmm2, %xmm2, %xmm2 794 eor v3.16b, v3.16b, v3.16b // vpxor %xmm3, %xmm3, %xmm3 795 eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 796 eor v5.16b, v5.16b, v5.16b // vpxor %xmm5, %xmm5, %xmm5 797 eor v6.16b, v6.16b, v6.16b // vpxor %xmm6, %xmm6, %xmm6 798 eor v7.16b, v7.16b, v7.16b // vpxor %xmm7, %xmm7, %xmm7 799 ldp x29, x30, [sp],#16 800 AARCH64_VALIDATE_LINK_REGISTER 801 ret 802.size _vpaes_schedule_core,.-_vpaes_schedule_core 803 804## 805## .aes_schedule_192_smear 806## 807## Smear the short, low side in the 192-bit key schedule. 808## 809## Inputs: 810## %xmm7: high side, b a x y 811## %xmm6: low side, d c 0 0 812## %xmm13: 0 813## 814## Outputs: 815## %xmm6: b+c+d b+c 0 0 816## %xmm0: b+c+d b+c b a 817## 818.type _vpaes_schedule_192_smear,%function 819.align 4 820_vpaes_schedule_192_smear: 821 movi v1.16b, #0 822 dup v0.4s, v7.s[3] 823 ins v1.s[3], v6.s[2] // vpshufd $0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0 824 ins v0.s[0], v7.s[2] // vpshufd $0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a 825 eor v6.16b, v6.16b, v1.16b // vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0 826 eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1 827 eor v6.16b, v6.16b, v0.16b // vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a 828 mov v0.16b, v6.16b // vmovdqa %xmm6, %xmm0 829 ins v6.d[0], v1.d[0] // vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros 830 ret 831.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear 832 833## 834## .aes_schedule_round 835## 836## Runs one main round of the key schedule on %xmm0, %xmm7 837## 838## Specifically, runs subbytes on the high dword of %xmm0 839## then rotates it by one byte and xors into the low dword of 840## %xmm7. 841## 842## Adds rcon from low byte of %xmm8, then rotates %xmm8 for 843## next rcon. 844## 845## Smears the dwords of %xmm7 by xoring the low into the 846## second low, result into third, result into highest. 847## 848## Returns results in %xmm7 = %xmm0. 849## Clobbers %xmm1-%xmm4, %r11. 850## 851.type _vpaes_schedule_round,%function 852.align 4 853_vpaes_schedule_round: 854 // extract rcon from xmm8 855 movi v4.16b, #0 // vpxor %xmm4, %xmm4, %xmm4 856 ext v1.16b, v8.16b, v4.16b, #15 // vpalignr $15, %xmm8, %xmm4, %xmm1 857 ext v8.16b, v8.16b, v8.16b, #15 // vpalignr $15, %xmm8, %xmm8, %xmm8 858 eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7 859 860 // rotate 861 dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0 862 ext v0.16b, v0.16b, v0.16b, #1 // vpalignr $1, %xmm0, %xmm0, %xmm0 863 864 // fall through... 865 866 // low round: same as high round, but no rotation and no rcon. 867_vpaes_schedule_low_round: 868 // smear xmm7 869 ext v1.16b, v4.16b, v7.16b, #12 // vpslldq $4, %xmm7, %xmm1 870 eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7 871 ext v4.16b, v4.16b, v7.16b, #8 // vpslldq $8, %xmm7, %xmm4 872 873 // subbytes 874 and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k 875 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i 876 eor v7.16b, v7.16b, v4.16b // vpxor %xmm4, %xmm7, %xmm7 877 tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k 878 eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j 879 tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i 880 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k 881 tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j 882 eor v7.16b, v7.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm7, %xmm7 883 tbl v3.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak 884 eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k 885 tbl v2.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak 886 eor v3.16b, v3.16b, v1.16b // vpxor %xmm1, %xmm3, %xmm3 # 2 = io 887 eor v2.16b, v2.16b, v0.16b // vpxor %xmm0, %xmm2, %xmm2 # 3 = jo 888 tbl v4.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou 889 tbl v1.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t 890 eor v1.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output 891 892 // add in smeared stuff 893 eor v0.16b, v1.16b, v7.16b // vpxor %xmm7, %xmm1, %xmm0 894 eor v7.16b, v1.16b, v7.16b // vmovdqa %xmm0, %xmm7 895 ret 896.size _vpaes_schedule_round,.-_vpaes_schedule_round 897 898## 899## .aes_schedule_transform 900## 901## Linear-transform %xmm0 according to tables at (%r11) 902## 903## Requires that %xmm9 = 0x0F0F... as in preheat 904## Output in %xmm0 905## Clobbers %xmm1, %xmm2 906## 907.type _vpaes_schedule_transform,%function 908.align 4 909_vpaes_schedule_transform: 910 and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 911 ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 912 // vmovdqa (%r11), %xmm2 # lo 913 tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2 914 // vmovdqa 16(%r11), %xmm1 # hi 915 tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0 916 eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 917 ret 918.size _vpaes_schedule_transform,.-_vpaes_schedule_transform 919 920## 921## .aes_schedule_mangle 922## 923## Mangle xmm0 from (basis-transformed) standard version 924## to our version. 925## 926## On encrypt, 927## xor with 0x63 928## multiply by circulant 0,1,1,1 929## apply shiftrows transform 930## 931## On decrypt, 932## xor with 0x63 933## multiply by "inverse mixcolumns" circulant E,B,D,9 934## deskew 935## apply shiftrows transform 936## 937## 938## Writes out to (%rdx), and increments or decrements it 939## Keeps track of round number mod 4 in %r8 940## Preserves xmm0 941## Clobbers xmm1-xmm5 942## 943.type _vpaes_schedule_mangle,%function 944.align 4 945_vpaes_schedule_mangle: 946 mov v4.16b, v0.16b // vmovdqa %xmm0, %xmm4 # save xmm0 for later 947 // vmovdqa .Lk_mc_forward(%rip),%xmm5 948 cbnz w3, .Lschedule_mangle_dec 949 950 // encrypting 951 eor v4.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm4 952 add x2, x2, #16 // add $16, %rdx 953 tbl v4.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm4 954 tbl v1.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm1 955 tbl v3.16b, {v1.16b}, v9.16b // vpshufb %xmm5, %xmm1, %xmm3 956 eor v4.16b, v4.16b, v1.16b // vpxor %xmm1, %xmm4, %xmm4 957 ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 958 eor v3.16b, v3.16b, v4.16b // vpxor %xmm4, %xmm3, %xmm3 959 960 b .Lschedule_mangle_both 961.align 4 962.Lschedule_mangle_dec: 963 // inverse mix columns 964 // lea .Lk_dksd(%rip),%r11 965 ushr v1.16b, v4.16b, #4 // vpsrlb $4, %xmm4, %xmm1 # 1 = hi 966 and v4.16b, v4.16b, v17.16b // vpand %xmm9, %xmm4, %xmm4 # 4 = lo 967 968 // vmovdqa 0x00(%r11), %xmm2 969 tbl v2.16b, {v24.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 970 // vmovdqa 0x10(%r11), %xmm3 971 tbl v3.16b, {v25.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 972 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 973 tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3 974 975 // vmovdqa 0x20(%r11), %xmm2 976 tbl v2.16b, {v26.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 977 eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2 978 // vmovdqa 0x30(%r11), %xmm3 979 tbl v3.16b, {v27.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 980 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 981 tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3 982 983 // vmovdqa 0x40(%r11), %xmm2 984 tbl v2.16b, {v28.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 985 eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2 986 // vmovdqa 0x50(%r11), %xmm3 987 tbl v3.16b, {v29.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 988 eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 989 990 // vmovdqa 0x60(%r11), %xmm2 991 tbl v2.16b, {v30.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 992 tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3 993 // vmovdqa 0x70(%r11), %xmm4 994 tbl v4.16b, {v31.16b}, v1.16b // vpshufb %xmm1, %xmm4, %xmm4 995 ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 996 eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2 997 eor v3.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm3 998 999 sub x2, x2, #16 // add $-16, %rdx 1000 1001.Lschedule_mangle_both: 1002 tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 1003 add x8, x8, #48 // add $-16, %r8 1004 and x8, x8, #~(1<<6) // and $0x30, %r8 1005 st1 {v3.2d}, [x2] // vmovdqu %xmm3, (%rdx) 1006 ret 1007.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle 1008 1009.globl vpaes_set_encrypt_key 1010.hidden vpaes_set_encrypt_key 1011.type vpaes_set_encrypt_key,%function 1012.align 4 1013vpaes_set_encrypt_key: 1014 AARCH64_SIGN_LINK_REGISTER 1015 stp x29,x30,[sp,#-16]! 1016 add x29,sp,#0 1017 stp d8,d9,[sp,#-16]! // ABI spec says so 1018 1019 lsr w9, w1, #5 // shr $5,%eax 1020 add w9, w9, #5 // $5,%eax 1021 str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; 1022 1023 mov w3, #0 // mov $0,%ecx 1024 mov x8, #0x30 // mov $0x30,%r8d 1025 bl _vpaes_schedule_core 1026 eor x0, x0, x0 1027 1028 ldp d8,d9,[sp],#16 1029 ldp x29,x30,[sp],#16 1030 AARCH64_VALIDATE_LINK_REGISTER 1031 ret 1032.size vpaes_set_encrypt_key,.-vpaes_set_encrypt_key 1033 1034.globl vpaes_set_decrypt_key 1035.hidden vpaes_set_decrypt_key 1036.type vpaes_set_decrypt_key,%function 1037.align 4 1038vpaes_set_decrypt_key: 1039 AARCH64_SIGN_LINK_REGISTER 1040 stp x29,x30,[sp,#-16]! 1041 add x29,sp,#0 1042 stp d8,d9,[sp,#-16]! // ABI spec says so 1043 1044 lsr w9, w1, #5 // shr $5,%eax 1045 add w9, w9, #5 // $5,%eax 1046 str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; 1047 lsl w9, w9, #4 // shl $4,%eax 1048 add x2, x2, #16 // lea 16(%rdx,%rax),%rdx 1049 add x2, x2, x9 1050 1051 mov w3, #1 // mov $1,%ecx 1052 lsr w8, w1, #1 // shr $1,%r8d 1053 and x8, x8, #32 // and $32,%r8d 1054 eor x8, x8, #32 // xor $32,%r8d # nbits==192?0:32 1055 bl _vpaes_schedule_core 1056 1057 ldp d8,d9,[sp],#16 1058 ldp x29,x30,[sp],#16 1059 AARCH64_VALIDATE_LINK_REGISTER 1060 ret 1061.size vpaes_set_decrypt_key,.-vpaes_set_decrypt_key 1062.globl vpaes_cbc_encrypt 1063.hidden vpaes_cbc_encrypt 1064.type vpaes_cbc_encrypt,%function 1065.align 4 1066vpaes_cbc_encrypt: 1067 AARCH64_SIGN_LINK_REGISTER 1068 cbz x2, .Lcbc_abort 1069 cmp w5, #0 // check direction 1070 b.eq vpaes_cbc_decrypt 1071 1072 stp x29,x30,[sp,#-16]! 1073 add x29,sp,#0 1074 1075 mov x17, x2 // reassign 1076 mov x2, x3 // reassign 1077 1078 ld1 {v0.16b}, [x4] // load ivec 1079 bl _vpaes_encrypt_preheat 1080 b .Lcbc_enc_loop 1081 1082.align 4 1083.Lcbc_enc_loop: 1084 ld1 {v7.16b}, [x0],#16 // load input 1085 eor v7.16b, v7.16b, v0.16b // xor with ivec 1086 bl _vpaes_encrypt_core 1087 st1 {v0.16b}, [x1],#16 // save output 1088 subs x17, x17, #16 1089 b.hi .Lcbc_enc_loop 1090 1091 st1 {v0.16b}, [x4] // write ivec 1092 1093 ldp x29,x30,[sp],#16 1094.Lcbc_abort: 1095 AARCH64_VALIDATE_LINK_REGISTER 1096 ret 1097.size vpaes_cbc_encrypt,.-vpaes_cbc_encrypt 1098 1099.type vpaes_cbc_decrypt,%function 1100.align 4 1101vpaes_cbc_decrypt: 1102 // Not adding AARCH64_SIGN_LINK_REGISTER here because vpaes_cbc_decrypt is jumped to 1103 // only from vpaes_cbc_encrypt which has already signed the return address. 1104 stp x29,x30,[sp,#-16]! 1105 add x29,sp,#0 1106 stp d8,d9,[sp,#-16]! // ABI spec says so 1107 stp d10,d11,[sp,#-16]! 1108 stp d12,d13,[sp,#-16]! 1109 stp d14,d15,[sp,#-16]! 1110 1111 mov x17, x2 // reassign 1112 mov x2, x3 // reassign 1113 ld1 {v6.16b}, [x4] // load ivec 1114 bl _vpaes_decrypt_preheat 1115 tst x17, #16 1116 b.eq .Lcbc_dec_loop2x 1117 1118 ld1 {v7.16b}, [x0], #16 // load input 1119 bl _vpaes_decrypt_core 1120 eor v0.16b, v0.16b, v6.16b // xor with ivec 1121 orr v6.16b, v7.16b, v7.16b // next ivec value 1122 st1 {v0.16b}, [x1], #16 1123 subs x17, x17, #16 1124 b.ls .Lcbc_dec_done 1125 1126.align 4 1127.Lcbc_dec_loop2x: 1128 ld1 {v14.16b,v15.16b}, [x0], #32 1129 bl _vpaes_decrypt_2x 1130 eor v0.16b, v0.16b, v6.16b // xor with ivec 1131 eor v1.16b, v1.16b, v14.16b 1132 orr v6.16b, v15.16b, v15.16b 1133 st1 {v0.16b,v1.16b}, [x1], #32 1134 subs x17, x17, #32 1135 b.hi .Lcbc_dec_loop2x 1136 1137.Lcbc_dec_done: 1138 st1 {v6.16b}, [x4] 1139 1140 ldp d14,d15,[sp],#16 1141 ldp d12,d13,[sp],#16 1142 ldp d10,d11,[sp],#16 1143 ldp d8,d9,[sp],#16 1144 ldp x29,x30,[sp],#16 1145 AARCH64_VALIDATE_LINK_REGISTER 1146 ret 1147.size vpaes_cbc_decrypt,.-vpaes_cbc_decrypt 1148.globl vpaes_ctr32_encrypt_blocks 1149.hidden vpaes_ctr32_encrypt_blocks 1150.type vpaes_ctr32_encrypt_blocks,%function 1151.align 4 1152vpaes_ctr32_encrypt_blocks: 1153 AARCH64_SIGN_LINK_REGISTER 1154 stp x29,x30,[sp,#-16]! 1155 add x29,sp,#0 1156 stp d8,d9,[sp,#-16]! // ABI spec says so 1157 stp d10,d11,[sp,#-16]! 1158 stp d12,d13,[sp,#-16]! 1159 stp d14,d15,[sp,#-16]! 1160 1161 cbz x2, .Lctr32_done 1162 1163 // Note, unlike the other functions, x2 here is measured in blocks, 1164 // not bytes. 1165 mov x17, x2 1166 mov x2, x3 1167 1168 // Load the IV and counter portion. 1169 ldr w6, [x4, #12] 1170 ld1 {v7.16b}, [x4] 1171 1172 bl _vpaes_encrypt_preheat 1173 tst x17, #1 1174 rev w6, w6 // The counter is big-endian. 1175 b.eq .Lctr32_prep_loop 1176 1177 // Handle one block so the remaining block count is even for 1178 // _vpaes_encrypt_2x. 1179 ld1 {v6.16b}, [x0], #16 // .Load input ahead of time 1180 bl _vpaes_encrypt_core 1181 eor v0.16b, v0.16b, v6.16b // XOR input and result 1182 st1 {v0.16b}, [x1], #16 1183 subs x17, x17, #1 1184 // Update the counter. 1185 add w6, w6, #1 1186 rev w7, w6 1187 mov v7.s[3], w7 1188 b.ls .Lctr32_done 1189 1190.Lctr32_prep_loop: 1191 // _vpaes_encrypt_core takes its input from v7, while _vpaes_encrypt_2x 1192 // uses v14 and v15. 1193 mov v15.16b, v7.16b 1194 mov v14.16b, v7.16b 1195 add w6, w6, #1 1196 rev w7, w6 1197 mov v15.s[3], w7 1198 1199.Lctr32_loop: 1200 ld1 {v6.16b,v7.16b}, [x0], #32 // .Load input ahead of time 1201 bl _vpaes_encrypt_2x 1202 eor v0.16b, v0.16b, v6.16b // XOR input and result 1203 eor v1.16b, v1.16b, v7.16b // XOR input and result (#2) 1204 st1 {v0.16b,v1.16b}, [x1], #32 1205 subs x17, x17, #2 1206 // Update the counter. 1207 add w7, w6, #1 1208 add w6, w6, #2 1209 rev w7, w7 1210 mov v14.s[3], w7 1211 rev w7, w6 1212 mov v15.s[3], w7 1213 b.hi .Lctr32_loop 1214 1215.Lctr32_done: 1216 ldp d14,d15,[sp],#16 1217 ldp d12,d13,[sp],#16 1218 ldp d10,d11,[sp],#16 1219 ldp d8,d9,[sp],#16 1220 ldp x29,x30,[sp],#16 1221 AARCH64_VALIDATE_LINK_REGISTER 1222 ret 1223.size vpaes_ctr32_encrypt_blocks,.-vpaes_ctr32_encrypt_blocks 1224#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) 1225