// This file is generated from a similarly-named Perl script in the BoringSSL
// source tree. Do not edit by hand.
//
// GHASH primitives for AArch64 built on the PMULL/PMULL2 polynomial-multiply
// instructions (the file selects ".arch armv8-a+crypto"). This variant is
// assembled only when OPENSSL_AARCH64 and _WIN32 are defined and
// OPENSSL_NO_ASM is not.
//
// Routines in this file:
//   gcm_init_v8     - expands H into a six-entry table (Htable[0..5])
//   gcm_gmult_v8    - multiplies one Xi by H and writes Xi back
//   gcm_ghash_v8    - folds input blocks into Xi, one or two blocks per step
//   gcm_ghash_v8_4x - four-blocks-per-step path, entered from gcm_ghash_v8
//
// Register footprint of the whole file: x0-x3, x12, v0-v7 and v16-v31.
// Nothing here touches v8-v15 or the stack.
//
// Recurring idioms:
//   * "movi v19.16b,#0xe1" followed by "shl v19.2d,v19.2d,#57" leaves
//     0xc200000000000000 in each 64-bit lane of v19; that value is the
//     multiplier used by both reduction phases.
//   * "rev64" followed by "ext ...,#8" of a register with itself reverses all
//     16 bytes of the vector. The rev64 half is skipped when __AARCH64EB__
//     is defined.

#include <openssl/asm_base.h>

#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32)
#include <openssl/arm_arch.h>

#if __ARM_MAX_ARCH__>=7
.text
.arch	armv8-a+crypto

// gcm_init_v8
//   In:   x0 = table to fill; 96 bytes are written (16 + 32 + 48)
//         x1 = pointer to the 16-byte input H
//   Out:  Htable[0] = twisted H
//         Htable[1] = packed Karatsuba pre-processed halves for H and H^2
//         Htable[2] = H^2
//         Htable[3] = H^3
//         Htable[4] = packed Karatsuba pre-processed halves for H^3 and H^4
//         Htable[5] = H^4
//   Uses: x0 (advanced by 48), v0-v7, v16-v22
//   NOTE(review): AARCH64_VALID_CALL_TARGET is not defined in this file;
//   presumably it comes from <openssl/arm_arch.h> - confirm.
.globl	gcm_init_v8

.def gcm_init_v8
   .type 32
.endef
.align	4
gcm_init_v8:
	AARCH64_VALID_CALL_TARGET
	ld1	{v17.2d},[x1]	//load input H
	movi	v19.16b,#0xe1
	shl	v19.2d,v19.2d,#57	//0xc2.0
	ext	v3.16b,v17.16b,v17.16b,#8
	ushr	v18.2d,v19.2d,#63
	dup	v17.4s,v17.s[1]
	ext	v16.16b,v18.16b,v19.16b,#8	//t0=0xc2....01
	ushr	v18.2d,v3.2d,#63
	sshr	v17.4s,v17.4s,#31	//broadcast carry bit
	and	v18.16b,v18.16b,v16.16b
	shl	v3.2d,v3.2d,#1
	ext	v18.16b,v18.16b,v18.16b,#8
	and	v16.16b,v16.16b,v17.16b
	orr	v3.16b,v3.16b,v18.16b	//H<<<=1
	eor	v20.16b,v3.16b,v16.16b	//twisted H
	st1	{v20.2d},[x0],#16	//store Htable[0]

	//calculate H^2
	//(twisted H in v20 is multiplied by itself; the result lands in v22)
	ext	v16.16b,v20.16b,v20.16b,#8	//Karatsuba pre-processing
	pmull	v0.1q,v20.1d,v20.1d
	eor	v16.16b,v16.16b,v20.16b
	pmull2	v2.1q,v20.2d,v20.2d
	pmull	v1.1q,v16.1d,v16.1d

	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d	//1st phase

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v22.16b,v0.16b,v18.16b

	ext	v17.16b,v22.16b,v22.16b,#8	//Karatsuba pre-processing
	eor	v17.16b,v17.16b,v22.16b
	ext	v21.16b,v16.16b,v17.16b,#8	//pack Karatsuba pre-processed
	st1	{v21.2d,v22.2d},[x0],#32	//store Htable[1..2]
	//calculate H^3 and H^4
	//(two independent products are interleaved: v20*v22 in v0/v1/v2 and
	// v22*v22 in v5/v6/v7; v16/v17 still hold the pre-processed halves)
	pmull	v0.1q,v20.1d, v22.1d
	pmull	v5.1q,v22.1d,v22.1d
	pmull2	v2.1q,v20.2d, v22.2d
	pmull2	v7.1q,v22.2d,v22.2d
	pmull	v1.1q,v16.1d,v17.1d
	pmull	v6.1q,v17.1d,v17.1d

	ext	v16.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	ext	v17.16b,v5.16b,v7.16b,#8
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v16.16b
	eor	v4.16b,v5.16b,v7.16b
	eor	v6.16b,v6.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d	//1st phase
	eor	v6.16b,v6.16b,v4.16b
	pmull	v4.1q,v5.1d,v19.1d

	ins	v2.d[0],v1.d[1]
	ins	v7.d[0],v6.d[1]
	ins	v1.d[1],v0.d[0]
	ins	v6.d[1],v5.d[0]
	eor	v0.16b,v1.16b,v18.16b
	eor	v5.16b,v6.16b,v4.16b

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase
	ext	v4.16b,v5.16b,v5.16b,#8
	pmull	v0.1q,v0.1d,v19.1d
	pmull	v5.1q,v5.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v4.16b,v4.16b,v7.16b
	eor	v20.16b, v0.16b,v18.16b	//H^3
	eor	v22.16b,v5.16b,v4.16b	//H^4

	ext	v16.16b,v20.16b, v20.16b,#8	//Karatsuba pre-processing
	ext	v17.16b,v22.16b,v22.16b,#8
	eor	v16.16b,v16.16b,v20.16b
	eor	v17.16b,v17.16b,v22.16b
	ext	v21.16b,v16.16b,v17.16b,#8	//pack Karatsuba pre-processed
	st1	{v20.2d,v21.2d,v22.2d},[x0]	//store Htable[3..5]
	ret

// gcm_gmult_v8
//   In:   x0 = Xi, 16 bytes, read at entry and overwritten with the result
//         x1 = table; the first 32 bytes are read (twisted H and the entry
//              that follows it)
//   Uses: v0-v3, v17-v21; x0 and x1 are not modified
//   Xi is byte-reversed on load and again before the store (see the idiom
//   note at the top of the file).
.globl	gcm_gmult_v8

.def gcm_gmult_v8
   .type 32
.endef
.align	4
gcm_gmult_v8:
	AARCH64_VALID_CALL_TARGET
	ld1	{v17.2d},[x0]	//load Xi
	movi	v19.16b,#0xe1
	ld1	{v20.2d,v21.2d},[x1]	//load twisted H, ...
	shl	v19.2d,v19.2d,#57	//v19 = 0xc2.0 reduction constant
#ifndef __AARCH64EB__
	rev64	v17.16b,v17.16b
#endif
	ext	v3.16b,v17.16b,v17.16b,#8

	pmull	v0.1q,v20.1d,v3.1d	//H.lo·Xi.lo
	eor	v17.16b,v17.16b,v3.16b	//Karatsuba pre-processing
	pmull2	v2.1q,v20.2d,v3.2d	//H.hi·Xi.hi
	pmull	v1.1q,v21.1d,v17.1d	//(H.lo+H.hi)·(Xi.lo+Xi.hi)

	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d	//1st phase of reduction

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b

#ifndef __AARCH64EB__
	rev64	v0.16b,v0.16b
#endif
	ext	v0.16b,v0.16b,v0.16b,#8
	st1	{v0.2d},[x0]	//write out Xi

	ret

// gcm_ghash_v8
//   In:   x0 = Xi, 16 bytes, read at entry and overwritten with the result
//         x1 = table written by gcm_init_v8 (advanced by 32 on this path)
//         x2 = input pointer (advanced as blocks are consumed)
//         x3 = input length in bytes
//   Uses: x12 (post-increment for x2), v0-v7, v16-v22
//   When x3 >= 64 (unsigned compare) control transfers to Lgcm_ghash_v8_4x
//   with x0-x3 untouched. Otherwise Loop_mod2x_v8 consumes 32 bytes per
//   iteration and Lodd_tail_v8 handles a single remaining 16-byte block.
//   NOTE(review): x3 is only ever compared against multiples of 16 and the
//   first block is loaded unconditionally, so callers presumably pass a
//   non-zero multiple of 16 - confirm against the C caller.
.globl	gcm_ghash_v8

.def gcm_ghash_v8
   .type 32
.endef
.align	4
gcm_ghash_v8:
	AARCH64_VALID_CALL_TARGET
	cmp	x3,#64
	b.hs	Lgcm_ghash_v8_4x
	ld1	{v0.2d},[x0]	//load [rotated] Xi
				//"[rotated]" means that
				//loaded value would have
				//to be rotated in order to
				//make it appear as in
				//algorithm specification
	subs	x3,x3,#32	//see if x3 is 32 or larger
	mov	x12,#16	//x12 is used as post-
				//increment for input pointer;
				//as loop is modulo-scheduled
				//x12 is zeroed just in time
				//to preclude overstepping
				//inp[len], which means that
				//last block[s] are actually
				//loaded twice, but last
				//copy is not processed
	ld1	{v20.2d,v21.2d},[x1],#32	//load twisted H, ..., H^2
	movi	v19.16b,#0xe1
	ld1	{v22.2d},[x1]
	csel	x12,xzr,x12,eq	//is it time to zero x12?
	ext	v0.16b,v0.16b,v0.16b,#8	//rotate Xi
	ld1	{v16.2d},[x2],#16	//load [rotated] I[0]
	shl	v19.2d,v19.2d,#57	//compose 0xc2.0 constant
#ifndef __AARCH64EB__
	rev64	v16.16b,v16.16b
	rev64	v0.16b,v0.16b
#endif
	ext	v3.16b,v16.16b,v16.16b,#8	//rotate I[0]
	//the flags tested here are still those of the "subs x3,x3,#32" above;
	//none of the instructions in between writes them
	b.lo	Lodd_tail_v8	//x3 was less than 32
	ld1	{v17.2d},[x2],x12	//load [rotated] I[1]
#ifndef __AARCH64EB__
	rev64	v17.16b,v17.16b
#endif
	ext	v7.16b,v17.16b,v17.16b,#8
	eor	v3.16b,v3.16b,v0.16b	//I[i]^=Xi
	pmull	v4.1q,v20.1d,v7.1d	//H·Ii+1
	eor	v17.16b,v17.16b,v7.16b	//Karatsuba pre-processing
	pmull2	v6.1q,v20.2d,v7.2d
	b	Loop_mod2x_v8

.align	4
Loop_mod2x_v8:
	//On entry: v3 = current block already xored with Xi, v4/v6 = low/high
	//products of H with the following block, v17 = that block after
	//Karatsuba pre-processing.
	ext	v18.16b,v3.16b,v3.16b,#8
	subs	x3,x3,#32	//is there more data?
	pmull	v0.1q,v22.1d,v3.1d	//H^2.lo·Xi.lo
	csel	x12,xzr,x12,lo	//is it time to zero x12?

	pmull	v5.1q,v21.1d,v17.1d
	eor	v18.16b,v18.16b,v3.16b	//Karatsuba pre-processing
	pmull2	v2.1q,v22.2d,v3.2d	//H^2.hi·Xi.hi
	eor	v0.16b,v0.16b,v4.16b	//accumulate
	pmull2	v1.1q,v21.2d,v18.2d	//(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
	ld1	{v16.2d},[x2],x12	//load [rotated] I[i+2]

	eor	v2.16b,v2.16b,v6.16b
	csel	x12,xzr,x12,eq	//is it time to zero x12?
	eor	v1.16b,v1.16b,v5.16b

	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	ld1	{v17.2d},[x2],x12	//load [rotated] I[i+3]
#ifndef __AARCH64EB__
	rev64	v16.16b,v16.16b
#endif
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d	//1st phase of reduction

#ifndef __AARCH64EB__
	rev64	v17.16b,v17.16b
#endif
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	ext	v7.16b,v17.16b,v17.16b,#8
	ext	v3.16b,v16.16b,v16.16b,#8
	eor	v0.16b,v1.16b,v18.16b
	pmull	v4.1q,v20.1d,v7.1d	//H·Ii+1
	eor	v3.16b,v3.16b,v2.16b	//accumulate v3.16b early

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v3.16b,v3.16b,v18.16b
	eor	v17.16b,v17.16b,v7.16b	//Karatsuba pre-processing
	eor	v3.16b,v3.16b,v0.16b
	pmull2	v6.1q,v20.2d,v7.2d
	//the branch below consumes the flags set by the "subs" at the top of
	//the loop
	b.hs	Loop_mod2x_v8	//there was at least 32 more bytes

	eor	v2.16b,v2.16b,v18.16b
	ext	v3.16b,v16.16b,v16.16b,#8	//re-construct v3.16b
	adds	x3,x3,#32	//re-construct x3
	eor	v0.16b,v0.16b,v2.16b	//re-construct v0.16b
	b.eq	Ldone_v8	//is x3 zero?
Lodd_tail_v8:
	//One block left: v16 holds it, v3 its rotated copy, v0 the running Xi.
	ext	v18.16b,v0.16b,v0.16b,#8
	eor	v3.16b,v3.16b,v0.16b	//inp^=Xi
	eor	v17.16b,v16.16b,v18.16b	//v17.16b is rotated inp^Xi

	pmull	v0.1q,v20.1d,v3.1d	//H.lo·Xi.lo
	eor	v17.16b,v17.16b,v3.16b	//Karatsuba pre-processing
	pmull2	v2.1q,v20.2d,v3.2d	//H.hi·Xi.hi
	pmull	v1.1q,v21.1d,v17.1d	//(H.lo+H.hi)·(Xi.lo+Xi.hi)

	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b
	pmull	v18.1q,v0.1d,v19.1d	//1st phase of reduction

	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b

Ldone_v8:
#ifndef __AARCH64EB__
	rev64	v0.16b,v0.16b
#endif
	ext	v0.16b,v0.16b,v0.16b,#8
	st1	{v0.2d},[x0]	//write out Xi

	ret

// gcm_ghash_v8_4x
//   Not exported (no .globl) and carries no AARCH64_VALID_CALL_TARGET; within
//   this file it is reached only through the "b.hs Lgcm_ghash_v8_4x" at the
//   top of gcm_ghash_v8, so it sees that routine's arguments unchanged:
//         x0 = Xi, x1 = table, x2 = input pointer, x3 = length in bytes
//   Uses: v0-v7, v16-v31; x1 is advanced by 48, x2 past the blocks consumed
//   Table registers: v20 = H, v22 = H^2, v26 = H^3, v28 = H^4, with v21 and
//   v27 holding the packed Karatsuba entries stored between them.
//   Loop4x consumes 64 bytes per iteration. v29/v30/v31 carry the summed
//   low/middle/high partial products of blocks 1..3 of the group in flight,
//   while block 0 (v4) is xored with Xi and multiplied by H^4.
//   Ltail4x finishes the last full group and then dispatches to Lthree, Ltwo
//   or Lone for a remainder of three, two or one block(s), or straight to
//   Ldone4x when nothing remains.
.def gcm_ghash_v8_4x
   .type 32
.endef
.align	4
gcm_ghash_v8_4x:
Lgcm_ghash_v8_4x:
	ld1	{v0.2d},[x0]	//load [rotated] Xi
	ld1	{v20.2d,v21.2d,v22.2d},[x1],#48	//load twisted H, ..., H^2
	movi	v19.16b,#0xe1
	ld1	{v26.2d,v27.2d,v28.2d},[x1]	//load twisted H^3, ..., H^4
	shl	v19.2d,v19.2d,#57	//compose 0xc2.0 constant

	ld1	{v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
#ifndef __AARCH64EB__
	rev64	v0.16b,v0.16b
	rev64	v5.16b,v5.16b
	rev64	v6.16b,v6.16b
	rev64	v7.16b,v7.16b
	rev64	v4.16b,v4.16b
#endif
	ext	v25.16b,v7.16b,v7.16b,#8
	ext	v24.16b,v6.16b,v6.16b,#8
	ext	v23.16b,v5.16b,v5.16b,#8

	pmull	v29.1q,v20.1d,v25.1d	//H·Ii+3
	eor	v7.16b,v7.16b,v25.16b
	pmull2	v31.1q,v20.2d,v25.2d
	pmull	v30.1q,v21.1d,v7.1d

	pmull	v16.1q,v22.1d,v24.1d	//H^2·Ii+2
	eor	v6.16b,v6.16b,v24.16b
	pmull2	v24.1q,v22.2d,v24.2d
	pmull2	v6.1q,v21.2d,v6.2d

	eor	v29.16b,v29.16b,v16.16b
	eor	v31.16b,v31.16b,v24.16b
	eor	v30.16b,v30.16b,v6.16b

	pmull	v7.1q,v26.1d,v23.1d	//H^3·Ii+1
	eor	v5.16b,v5.16b,v23.16b
	pmull2	v23.1q,v26.2d,v23.2d
	pmull	v5.1q,v27.1d,v5.1d

	eor	v29.16b,v29.16b,v7.16b
	eor	v31.16b,v31.16b,v23.16b
	eor	v30.16b,v30.16b,v5.16b

	//64 bytes are already loaded; stay out of the loop unless another
	//full 64-byte group follows them
	subs	x3,x3,#128
	b.lo	Ltail4x

	b	Loop4x

.align	4
Loop4x:
	eor	v16.16b,v4.16b,v0.16b
	ld1	{v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64
	ext	v3.16b,v16.16b,v16.16b,#8
#ifndef __AARCH64EB__
	rev64	v5.16b,v5.16b
	rev64	v6.16b,v6.16b
	rev64	v7.16b,v7.16b
	rev64	v4.16b,v4.16b
#endif

	pmull	v0.1q,v28.1d,v3.1d	//H^4·(Xi+Ii)
	eor	v16.16b,v16.16b,v3.16b
	pmull2	v2.1q,v28.2d,v3.2d
	ext	v25.16b,v7.16b,v7.16b,#8
	pmull2	v1.1q,v27.2d,v16.2d

	eor	v0.16b,v0.16b,v29.16b
	eor	v2.16b,v2.16b,v31.16b
	ext	v24.16b,v6.16b,v6.16b,#8
	eor	v1.16b,v1.16b,v30.16b
	ext	v23.16b,v5.16b,v5.16b,#8

	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	pmull	v29.1q,v20.1d,v25.1d	//H·Ii+3
	eor	v7.16b,v7.16b,v25.16b
	eor	v1.16b,v1.16b,v17.16b
	pmull2	v31.1q,v20.2d,v25.2d
	eor	v1.16b,v1.16b,v18.16b
	pmull	v30.1q,v21.1d,v7.1d

	pmull	v18.1q,v0.1d,v19.1d	//1st phase of reduction
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	pmull	v16.1q,v22.1d,v24.1d	//H^2·Ii+2
	eor	v6.16b,v6.16b,v24.16b
	pmull2	v24.1q,v22.2d,v24.2d
	eor	v0.16b,v1.16b,v18.16b
	pmull2	v6.1q,v21.2d,v6.2d

	eor	v29.16b,v29.16b,v16.16b
	eor	v31.16b,v31.16b,v24.16b
	eor	v30.16b,v30.16b,v6.16b

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	pmull	v7.1q,v26.1d,v23.1d	//H^3·Ii+1
	eor	v5.16b,v5.16b,v23.16b
	eor	v18.16b,v18.16b,v2.16b
	pmull2	v23.1q,v26.2d,v23.2d
	pmull	v5.1q,v27.1d,v5.1d

	eor	v0.16b,v0.16b,v18.16b
	eor	v29.16b,v29.16b,v7.16b
	eor	v31.16b,v31.16b,v23.16b
	ext	v0.16b,v0.16b,v0.16b,#8
	eor	v30.16b,v30.16b,v5.16b

	subs	x3,x3,#64
	b.hs	Loop4x

Ltail4x:
	//Finish the group in flight: multiply (block 0 ^ Xi) by H^4 and fold in
	//the v29/v30/v31 partial products. The result is left unreduced in
	//v0/v1/v2 for whichever path runs next.
	eor	v16.16b,v4.16b,v0.16b
	ext	v3.16b,v16.16b,v16.16b,#8

	pmull	v0.1q,v28.1d,v3.1d	//H^4·(Xi+Ii)
	eor	v16.16b,v16.16b,v3.16b
	pmull2	v2.1q,v28.2d,v3.2d
	pmull2	v1.1q,v27.2d,v16.2d

	eor	v0.16b,v0.16b,v29.16b
	eor	v2.16b,v2.16b,v31.16b
	eor	v1.16b,v1.16b,v30.16b

	adds	x3,x3,#64	//x3 = bytes still unread; zero means done
	b.eq	Ldone4x

	cmp	x3,#32
	b.lo	Lone	//fewer than 32 bytes left: one block
	b.eq	Ltwo	//exactly 32 bytes left: two blocks
Lthree:
	//More than 32 bytes left: three blocks are loaded and multiplied by
	//H^3, H^2 and H respectively.
	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	ld1	{v4.2d,v5.2d,v6.2d},[x2]
	eor	v1.16b,v1.16b,v18.16b
#ifndef __AARCH64EB__
	rev64	v5.16b,v5.16b
	rev64	v6.16b,v6.16b
	rev64	v4.16b,v4.16b
#endif

	pmull	v18.1q,v0.1d,v19.1d	//1st phase of reduction
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	ext	v24.16b,v6.16b,v6.16b,#8
	ext	v23.16b,v5.16b,v5.16b,#8
	eor	v0.16b,v1.16b,v18.16b

	pmull	v29.1q,v20.1d,v24.1d	//H·Ii+2
	eor	v6.16b,v6.16b,v24.16b

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	pmull2	v31.1q,v20.2d,v24.2d
	pmull	v30.1q,v21.1d,v6.1d
	eor	v0.16b,v0.16b,v18.16b
	pmull	v7.1q,v22.1d,v23.1d	//H^2·Ii+1
	eor	v5.16b,v5.16b,v23.16b
	ext	v0.16b,v0.16b,v0.16b,#8

	pmull2	v23.1q,v22.2d,v23.2d
	eor	v16.16b,v4.16b,v0.16b
	pmull2	v5.1q,v21.2d,v5.2d
	ext	v3.16b,v16.16b,v16.16b,#8

	eor	v29.16b,v29.16b,v7.16b
	eor	v31.16b,v31.16b,v23.16b
	eor	v30.16b,v30.16b,v5.16b

	pmull	v0.1q,v26.1d,v3.1d	//H^3·(Xi+Ii)
	eor	v16.16b,v16.16b,v3.16b
	pmull2	v2.1q,v26.2d,v3.2d
	pmull	v1.1q,v27.1d,v16.1d

	eor	v0.16b,v0.16b,v29.16b
	eor	v2.16b,v2.16b,v31.16b
	eor	v1.16b,v1.16b,v30.16b
	b	Ldone4x

.align	4
Ltwo:
	//Exactly 32 bytes left: two blocks are loaded and multiplied by H^2
	//and H respectively.
	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	ld1	{v4.2d,v5.2d},[x2]
	eor	v1.16b,v1.16b,v18.16b
#ifndef __AARCH64EB__
	rev64	v5.16b,v5.16b
	rev64	v4.16b,v4.16b
#endif

	pmull	v18.1q,v0.1d,v19.1d	//1st phase of reduction
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	ext	v23.16b,v5.16b,v5.16b,#8
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b
	ext	v0.16b,v0.16b,v0.16b,#8

	pmull	v29.1q,v20.1d,v23.1d	//H·Ii+1
	eor	v5.16b,v5.16b,v23.16b

	eor	v16.16b,v4.16b,v0.16b
	ext	v3.16b,v16.16b,v16.16b,#8

	pmull2	v31.1q,v20.2d,v23.2d
	pmull	v30.1q,v21.1d,v5.1d

	pmull	v0.1q,v22.1d,v3.1d	//H^2·(Xi+Ii)
	eor	v16.16b,v16.16b,v3.16b
	pmull2	v2.1q,v22.2d,v3.2d
	pmull2	v1.1q,v21.2d,v16.2d

	eor	v0.16b,v0.16b,v29.16b
	eor	v2.16b,v2.16b,v31.16b
	eor	v1.16b,v1.16b,v30.16b
	b	Ldone4x

.align	4
Lone:
	//Fewer than 32 bytes left: one block is loaded and multiplied by H.
	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	ld1	{v4.2d},[x2]
	eor	v1.16b,v1.16b,v18.16b
#ifndef __AARCH64EB__
	rev64	v4.16b,v4.16b
#endif

	pmull	v18.1q,v0.1d,v19.1d	//1st phase of reduction
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b
	ext	v0.16b,v0.16b,v0.16b,#8

	eor	v16.16b,v4.16b,v0.16b
	ext	v3.16b,v16.16b,v16.16b,#8

	pmull	v0.1q,v20.1d,v3.1d
	eor	v16.16b,v16.16b,v3.16b
	pmull2	v2.1q,v20.2d,v3.2d
	pmull	v1.1q,v21.1d,v16.1d

Ldone4x:
	//Common exit: reduce the product held in v0/v1/v2, undo the byte
	//reversal and store the new Xi.
	ext	v17.16b,v0.16b,v2.16b,#8	//Karatsuba post-processing
	eor	v18.16b,v0.16b,v2.16b
	eor	v1.16b,v1.16b,v17.16b
	eor	v1.16b,v1.16b,v18.16b

	pmull	v18.1q,v0.1d,v19.1d	//1st phase of reduction
	ins	v2.d[0],v1.d[1]
	ins	v1.d[1],v0.d[0]
	eor	v0.16b,v1.16b,v18.16b

	ext	v18.16b,v0.16b,v0.16b,#8	//2nd phase of reduction
	pmull	v0.1q,v0.1d,v19.1d
	eor	v18.16b,v18.16b,v2.16b
	eor	v0.16b,v0.16b,v18.16b
	ext	v0.16b,v0.16b,v0.16b,#8

#ifndef __AARCH64EB__
	rev64	v0.16b,v0.16b
#endif
	st1	{v0.2d},[x0]	//write out Xi

	ret

// NUL-terminated ASCII tag: "GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
.align	2
.align	2
#endif
#endif  // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32)