// NOTE: The descriptions for each of the vector methods on the traits below
// are pretty inscrutable. For this reason, there are tests for every method
// for every trait impl below. If you're confused about what an op does,
// consult its test. (They probably should be doc tests, but I couldn't figure
// out how to write them in a non-annoying way.)

use core::{
    fmt::Debug,
    panic::{RefUnwindSafe, UnwindSafe},
};

/// A trait for describing vector operations used by vectorized searchers.
///
/// The trait is highly constrained to the low level vector operations needed
/// for the specific algorithms used in this crate. In general, it was
/// invented mostly to be generic over x86's __m128i and __m256i types. At
/// time of writing, it also supports wasm and aarch64 128-bit vector types.
///
/// # Safety
///
/// All methods are `unsafe` since they are intended to be implemented using
/// vendor intrinsics, which are themselves not safe. Callers must ensure that
/// the appropriate target features are enabled in the calling function, and
/// that the current CPU supports them. All implementations should avoid
/// marking the routines with `#[target_feature]` and instead mark them as
/// `#[inline(always)]` to ensure they get appropriately inlined.
/// (`inline(always)` cannot be used with `target_feature`.)
pub(crate) trait Vector:
    Copy + Debug + Send + Sync + UnwindSafe + RefUnwindSafe
{
    /// The number of bits in the vector.
    const BITS: usize;
    /// The number of bytes in the vector. That is, this is the size of the
    /// vector in memory.
    const BYTES: usize;

    /// Create a vector with 8-bit lanes with the given byte repeated into each
    /// lane.
    ///
    /// # Safety
    ///
    /// Callers must ensure that this is okay to call in the current target for
    /// the current CPU.
    unsafe fn splat(byte: u8) -> Self;

    /// Read a vector-size number of bytes from the given pointer. The pointer
    /// does not need to be aligned.
    ///
    /// # Safety
    ///
    /// Callers must ensure that this is okay to call in the current target for
    /// the current CPU.
    ///
    /// Callers must guarantee that at least `BYTES` bytes are readable from
    /// `data`.
    unsafe fn load_unaligned(data: *const u8) -> Self;

    /// Returns true if and only if this vector has zero in all of its lanes.
    ///
    /// # Safety
    ///
    /// Callers must ensure that this is okay to call in the current target for
    /// the current CPU.
    unsafe fn is_zero(self) -> bool;

    /// Do an 8-bit pairwise equality check. If lane `i` is equal in this
    /// vector and the one given, then lane `i` in the resulting vector is set
    /// to `0xFF`. Otherwise, it is set to `0x00`.
    ///
    /// # Safety
    ///
    /// Callers must ensure that this is okay to call in the current target for
    /// the current CPU.
    unsafe fn cmpeq(self, vector2: Self) -> Self;

    /// Perform a bitwise 'and' of this vector and the one given and return
    /// the result.
    ///
    /// # Safety
    ///
    /// Callers must ensure that this is okay to call in the current target for
    /// the current CPU.
    unsafe fn and(self, vector2: Self) -> Self;

    /// Perform a bitwise 'or' of this vector and the one given and return
    /// the result.
    ///
    /// # Safety
    ///
    /// Callers must ensure that this is okay to call in the current target for
    /// the current CPU.
    #[allow(dead_code)] // unused, but useful enough to keep around?
    unsafe fn or(self, vector2: Self) -> Self;

    /// Shift each 8-bit lane in this vector to the right by the number of
    /// bits indicated by the `BITS` type parameter.
    ///
    /// # Safety
    ///
    /// Callers must ensure that this is okay to call in the current target for
    /// the current CPU.
    unsafe fn shift_8bit_lane_right<const BITS: i32>(self) -> Self;

    /// Shift this vector to the left by one byte and shift the most
    /// significant byte of `vector2` into the least significant position of
    /// this vector.
    ///
    /// Stated differently, this behaves as if `self` and `vector2` were
    /// concatenated into a `2 * Self::BITS` temporary buffer and then shifted
    /// right by `Self::BYTES - 1` bytes.
    ///
    /// With respect to the Teddy algorithm, `vector2` is usually a previous
    /// `Self::BYTES` chunk from the haystack and `self` is the chunk
    /// immediately following it. This permits combining the last byte
    /// from the previous chunk (`vector2`) with the first `Self::BYTES - 1`
    /// bytes from the current chunk. This permits aligning the result of
    /// various shuffles so that they can be and-ed together and a possible
    /// candidate discovered.
    ///
    /// # Safety
    ///
    /// Callers must ensure that this is okay to call in the current target for
    /// the current CPU.
    unsafe fn shift_in_one_byte(self, vector2: Self) -> Self;

    /// Shift this vector to the left by two bytes and shift the two most
    /// significant bytes of `vector2` into the least significant position of
    /// this vector.
    ///
    /// Stated differently, this behaves as if `self` and `vector2` were
    /// concatenated into a `2 * Self::BITS` temporary buffer and then shifted
    /// right by `Self::BYTES - 2` bytes.
    ///
    /// With respect to the Teddy algorithm, `vector2` is usually a previous
    /// `Self::BYTES` chunk from the haystack and `self` is the chunk
    /// immediately following it. This permits combining the last two bytes
    /// from the previous chunk (`vector2`) with the first `Self::BYTES - 2`
    /// bytes from the current chunk. This permits aligning the result of
    /// various shuffles so that they can be and-ed together and a possible
    /// candidate discovered.
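    ///
    /// For example, with 128-bit vectors, if `self` holds the bytes
    /// `[1, 2, ..., 16]` and `vector2` holds the bytes `[17, 18, ..., 32]`,
    /// then `self.shift_in_two_bytes(vector2)` holds the bytes
    /// `[31, 32, 1, 2, ..., 14]`. (This mirrors the
    /// `vector_shift_in_two_bytes` tests below.)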
    ///
    /// # Safety
    ///
    /// Callers must ensure that this is okay to call in the current target for
    /// the current CPU.
    unsafe fn shift_in_two_bytes(self, vector2: Self) -> Self;

    /// Shift this vector to the left by three bytes and shift the three most
    /// significant bytes of `vector2` into the least significant position of
    /// this vector.
    ///
    /// Stated differently, this behaves as if `self` and `vector2` were
    /// concatenated into a `2 * Self::BITS` temporary buffer and then shifted
    /// right by `Self::BYTES - 3` bytes.
    ///
    /// With respect to the Teddy algorithm, `vector2` is usually a previous
    /// `Self::BYTES` chunk from the haystack and `self` is the chunk
    /// immediately following it. This permits combining the last three bytes
    /// from the previous chunk (`vector2`) with the first `Self::BYTES - 3`
    /// bytes from the current chunk. This permits aligning the result of
    /// various shuffles so that they can be and-ed together and a possible
    /// candidate discovered.
    ///
    /// # Safety
    ///
    /// Callers must ensure that this is okay to call in the current target for
    /// the current CPU.
    unsafe fn shift_in_three_bytes(self, vector2: Self) -> Self;

    /// Shuffles the bytes in this vector according to the indices in each of
    /// the corresponding lanes in `indices`.
    ///
    /// If `i` is the index of corresponding lanes, `A` is this vector, `B` is
    /// the indices and `C` is the resulting vector, then `C[i] = A[B[i]]`.
    ///
    /// # Safety
    ///
    /// Callers must ensure that this is okay to call in the current target for
    /// the current CPU.
    unsafe fn shuffle_bytes(self, indices: Self) -> Self;

    /// Call the provided function for each 64-bit lane in this vector. The
    /// given function is provided the lane index and lane value as a `u64`.
    ///
    /// If `f` returns `Some`, then iteration over the lanes is stopped and the
    /// value is returned. Otherwise, this returns `None`.
    ///
    /// # Notes
    ///
    /// Conceptually it would be nice if we could have an
    /// `unpack64(self) -> [u64; BITS / 64]` method, but defining that is
    /// tricky given Rust's [current support for const generics][support].
    /// And even if we could, it would be tricky to write generic code over
    /// it. (Not impossible. We could introduce another layer that requires
    /// `AsRef<[u64]>` or something.)
    ///
    /// [support]: https://github.com/rust-lang/rust/issues/60551
    ///
    /// # Safety
    ///
    /// Callers must ensure that this is okay to call in the current target for
    /// the current CPU.
    unsafe fn for_each_64bit_lane<T>(
        self,
        f: impl FnMut(usize, u64) -> Option<T>,
    ) -> Option<T>;
}

/// This trait extends the `Vector` trait with additional operations to support
/// Fat Teddy.
///
/// Fat Teddy uses 16 buckets instead of 8, but reads half a vector's worth of
/// bytes per iteration instead of a full vector's worth. For example, when
/// using a 256-bit vector, Slim Teddy reads 32 bytes at a time but Fat Teddy
/// reads 16 bytes at a time.
///
/// Fat Teddy is useful when searching for a large number of literals.
/// The extra number of buckets spreads the literals out more and reduces
/// verification time.
///
/// Currently we only implement this for AVX on x86_64. It would be nice to
/// implement this for SSE on x86_64 and NEON on aarch64, with the latter two
/// only reading 8 bytes at a time. It's not clear how well it would work, but
/// there are some tricky things to figure out in terms of implementation. The
/// `half_shift_in_{one,two,three}_bytes` methods in particular are probably
/// the trickiest of the bunch. For AVX2, these are implemented by taking
/// advantage of the fact that `_mm256_alignr_epi8` operates on each 128-bit
/// half instead of the full 256-bit vector. (Whereas `_mm_alignr_epi8`
/// operates on the full 128-bit vector and not on each 64-bit half.)
/// I didn't do a careful survey of NEON to see if it could easily support
/// these operations.
pub(crate) trait FatVector: Vector {
    type Half: Vector;

    /// Read a half-vector-size number of bytes from the given pointer, and
    /// broadcast it across both halves of a full vector. The pointer does not
    /// need to be aligned.
    ///
    /// # Safety
    ///
    /// Callers must ensure that this is okay to call in the current target for
    /// the current CPU.
    ///
    /// Callers must guarantee that at least `Self::Half::BYTES` bytes are
    /// readable from `data`.
    unsafe fn load_half_unaligned(data: *const u8) -> Self;

    /// Like `Vector::shift_in_one_byte`, except this is done for each half
    /// of the vector instead.
    ///
    /// # Safety
    ///
    /// Callers must ensure that this is okay to call in the current target for
    /// the current CPU.
    unsafe fn half_shift_in_one_byte(self, vector2: Self) -> Self;

    /// Like `Vector::shift_in_two_bytes`, except this is done for each half
    /// of the vector instead.
    ///
    /// # Safety
    ///
    /// Callers must ensure that this is okay to call in the current target for
    /// the current CPU.
    unsafe fn half_shift_in_two_bytes(self, vector2: Self) -> Self;

    /// Like `Vector::shift_in_three_bytes`, except this is done for each half
    /// of the vector instead.
    ///
    /// # Safety
    ///
    /// Callers must ensure that this is okay to call in the current target for
    /// the current CPU.
    unsafe fn half_shift_in_three_bytes(self, vector2: Self) -> Self;

    /// Swap the 128-bit lanes in this vector.
    ///
    /// # Safety
    ///
    /// Callers must ensure that this is okay to call in the current target for
    /// the current CPU.
    unsafe fn swap_halves(self) -> Self;

    /// Unpack and interleave the 8-bit lanes from the low 128 bits of each
    /// vector and return the result.
    ///
    /// # Safety
    ///
    /// Callers must ensure that this is okay to call in the current target for
    /// the current CPU.
    unsafe fn interleave_low_8bit_lanes(self, vector2: Self) -> Self;

    /// Unpack and interleave the 8-bit lanes from the high 128 bits of each
    /// vector and return the result.
    ///
    /// # Safety
    ///
    /// Callers must ensure that this is okay to call in the current target for
    /// the current CPU.
    unsafe fn interleave_high_8bit_lanes(self, vector2: Self) -> Self;

    /// Call the provided function for each 64-bit lane in the lower half
    /// of this vector and then in the other vector. The given function is
    /// provided the lane index and lane value as a `u64`. (The high 128 bits
    /// of each vector are ignored.)
    ///
    /// If `f` returns `Some`, then iteration over the lanes is stopped and the
    /// value is returned. Otherwise, this returns `None`.
    ///
    /// # Safety
    ///
    /// Callers must ensure that this is okay to call in the current target for
    /// the current CPU.
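    ///
    /// For example, in the AVX2 implementation below, `f` is called with the
    /// two low 64-bit lanes of `self` as lane indices 0 and 1, followed by
    /// the two low 64-bit lanes of `vector2` as lane indices 2 and 3. (See
    /// the `fat_vector_for_each_low_64bit_lane` test.)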
for_each_low_64bit_lane<T>( self, vector2: Self, f: impl FnMut(usize, u64) -> Option<T>, ) -> Option<T>313 unsafe fn for_each_low_64bit_lane<T>( 314 self, 315 vector2: Self, 316 f: impl FnMut(usize, u64) -> Option<T>, 317 ) -> Option<T>; 318 } 319 320 #[cfg(all(target_arch = "x86_64", target_feature = "sse2"))] 321 mod x86_64_ssse3 { 322 use core::arch::x86_64::*; 323 324 use crate::util::int::{I32, I8}; 325 326 use super::Vector; 327 328 impl Vector for __m128i { 329 const BITS: usize = 128; 330 const BYTES: usize = 16; 331 332 #[inline(always)] splat(byte: u8) -> __m128i333 unsafe fn splat(byte: u8) -> __m128i { 334 _mm_set1_epi8(i8::from_bits(byte)) 335 } 336 337 #[inline(always)] load_unaligned(data: *const u8) -> __m128i338 unsafe fn load_unaligned(data: *const u8) -> __m128i { 339 _mm_loadu_si128(data.cast::<__m128i>()) 340 } 341 342 #[inline(always)] is_zero(self) -> bool343 unsafe fn is_zero(self) -> bool { 344 let cmp = self.cmpeq(Self::splat(0)); 345 _mm_movemask_epi8(cmp).to_bits() == 0xFFFF 346 } 347 348 #[inline(always)] cmpeq(self, vector2: Self) -> __m128i349 unsafe fn cmpeq(self, vector2: Self) -> __m128i { 350 _mm_cmpeq_epi8(self, vector2) 351 } 352 353 #[inline(always)] and(self, vector2: Self) -> __m128i354 unsafe fn and(self, vector2: Self) -> __m128i { 355 _mm_and_si128(self, vector2) 356 } 357 358 #[inline(always)] or(self, vector2: Self) -> __m128i359 unsafe fn or(self, vector2: Self) -> __m128i { 360 _mm_or_si128(self, vector2) 361 } 362 363 #[inline(always)] shift_8bit_lane_right<const BITS: i32>(self) -> Self364 unsafe fn shift_8bit_lane_right<const BITS: i32>(self) -> Self { 365 // Apparently there is no _mm_srli_epi8, so we emulate it by 366 // shifting 16-bit integers and masking out the high nybble of each 367 // 8-bit lane (since that nybble will contain bits from the low 368 // nybble of the previous lane). 369 let lomask = Self::splat(0xF); 370 _mm_srli_epi16(self, BITS).and(lomask) 371 } 372 373 #[inline(always)] shift_in_one_byte(self, vector2: Self) -> Self374 unsafe fn shift_in_one_byte(self, vector2: Self) -> Self { 375 _mm_alignr_epi8(self, vector2, 15) 376 } 377 378 #[inline(always)] shift_in_two_bytes(self, vector2: Self) -> Self379 unsafe fn shift_in_two_bytes(self, vector2: Self) -> Self { 380 _mm_alignr_epi8(self, vector2, 14) 381 } 382 383 #[inline(always)] shift_in_three_bytes(self, vector2: Self) -> Self384 unsafe fn shift_in_three_bytes(self, vector2: Self) -> Self { 385 _mm_alignr_epi8(self, vector2, 13) 386 } 387 388 #[inline(always)] shuffle_bytes(self, indices: Self) -> Self389 unsafe fn shuffle_bytes(self, indices: Self) -> Self { 390 _mm_shuffle_epi8(self, indices) 391 } 392 393 #[inline(always)] for_each_64bit_lane<T>( self, mut f: impl FnMut(usize, u64) -> Option<T>, ) -> Option<T>394 unsafe fn for_each_64bit_lane<T>( 395 self, 396 mut f: impl FnMut(usize, u64) -> Option<T>, 397 ) -> Option<T> { 398 // We could just use _mm_extract_epi64 here, but that requires 399 // SSE 4.1. It isn't necessarily a problem to just require SSE 4.1, 400 // but everything else works with SSSE3 so we stick to that subset. 
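            // Transmuting to a `[u64; 2]` should be fine here: `__m128i` is
            // 16 bytes with no padding, and on x86-64 (little endian) index 0
            // of the array corresponds to the low 64 bits of the vector, so
            // the lane order matches the `_mm256_extract_epi64` based
            // implementation for __m256i below.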
401 let lanes: [u64; 2] = core::mem::transmute(self); 402 if let Some(t) = f(0, lanes[0]) { 403 return Some(t); 404 } 405 if let Some(t) = f(1, lanes[1]) { 406 return Some(t); 407 } 408 None 409 } 410 } 411 } 412 413 #[cfg(all(target_arch = "x86_64", target_feature = "sse2"))] 414 mod x86_64_avx2 { 415 use core::arch::x86_64::*; 416 417 use crate::util::int::{I32, I64, I8}; 418 419 use super::{FatVector, Vector}; 420 421 impl Vector for __m256i { 422 const BITS: usize = 256; 423 const BYTES: usize = 32; 424 425 #[inline(always)] splat(byte: u8) -> __m256i426 unsafe fn splat(byte: u8) -> __m256i { 427 _mm256_set1_epi8(i8::from_bits(byte)) 428 } 429 430 #[inline(always)] load_unaligned(data: *const u8) -> __m256i431 unsafe fn load_unaligned(data: *const u8) -> __m256i { 432 _mm256_loadu_si256(data.cast::<__m256i>()) 433 } 434 435 #[inline(always)] is_zero(self) -> bool436 unsafe fn is_zero(self) -> bool { 437 let cmp = self.cmpeq(Self::splat(0)); 438 _mm256_movemask_epi8(cmp).to_bits() == 0xFFFFFFFF 439 } 440 441 #[inline(always)] cmpeq(self, vector2: Self) -> __m256i442 unsafe fn cmpeq(self, vector2: Self) -> __m256i { 443 _mm256_cmpeq_epi8(self, vector2) 444 } 445 446 #[inline(always)] and(self, vector2: Self) -> __m256i447 unsafe fn and(self, vector2: Self) -> __m256i { 448 _mm256_and_si256(self, vector2) 449 } 450 451 #[inline(always)] or(self, vector2: Self) -> __m256i452 unsafe fn or(self, vector2: Self) -> __m256i { 453 _mm256_or_si256(self, vector2) 454 } 455 456 #[inline(always)] shift_8bit_lane_right<const BITS: i32>(self) -> Self457 unsafe fn shift_8bit_lane_right<const BITS: i32>(self) -> Self { 458 let lomask = Self::splat(0xF); 459 _mm256_srli_epi16(self, BITS).and(lomask) 460 } 461 462 #[inline(always)] shift_in_one_byte(self, vector2: Self) -> Self463 unsafe fn shift_in_one_byte(self, vector2: Self) -> Self { 464 // Credit goes to jneem for figuring this out: 465 // https://github.com/jneem/teddy/blob/9ab5e899ad6ef6911aecd3cf1033f1abe6e1f66c/src/x86/teddy_simd.rs#L145-L184 466 // 467 // TL;DR avx2's PALIGNR instruction is actually just two 128-bit 468 // PALIGNR instructions, which is not what we want, so we need to 469 // do some extra shuffling. 470 let v = _mm256_permute2x128_si256(vector2, self, 0x21); 471 _mm256_alignr_epi8(self, v, 15) 472 } 473 474 #[inline(always)] shift_in_two_bytes(self, vector2: Self) -> Self475 unsafe fn shift_in_two_bytes(self, vector2: Self) -> Self { 476 // Credit goes to jneem for figuring this out: 477 // https://github.com/jneem/teddy/blob/9ab5e899ad6ef6911aecd3cf1033f1abe6e1f66c/src/x86/teddy_simd.rs#L145-L184 478 // 479 // TL;DR avx2's PALIGNR instruction is actually just two 128-bit 480 // PALIGNR instructions, which is not what we want, so we need to 481 // do some extra shuffling. 482 let v = _mm256_permute2x128_si256(vector2, self, 0x21); 483 _mm256_alignr_epi8(self, v, 14) 484 } 485 486 #[inline(always)] shift_in_three_bytes(self, vector2: Self) -> Self487 unsafe fn shift_in_three_bytes(self, vector2: Self) -> Self { 488 // Credit goes to jneem for figuring this out: 489 // https://github.com/jneem/teddy/blob/9ab5e899ad6ef6911aecd3cf1033f1abe6e1f66c/src/x86/teddy_simd.rs#L145-L184 490 // 491 // TL;DR avx2's PALIGNR instruction is actually just two 128-bit 492 // PALIGNR instructions, which is not what we want, so we need to 493 // do some extra shuffling. 
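            // A rough sketch of the trick (it's the same as the one and two
            // byte shifts above): with imm8 0x21, the permute below builds a
            // vector whose low 128 bits are the high 128 bits of `vector2`
            // and whose high 128 bits are the low 128 bits of `self`. The
            // per-128-bit-lane PALIGNR then pulls the last three bytes of
            // each of those halves into the bottom of the corresponding half
            // of `self`, which gives the full 256-bit `shift_in_three_bytes`
            // behavior described in the trait docs.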
494 let v = _mm256_permute2x128_si256(vector2, self, 0x21); 495 _mm256_alignr_epi8(self, v, 13) 496 } 497 498 #[inline(always)] shuffle_bytes(self, indices: Self) -> Self499 unsafe fn shuffle_bytes(self, indices: Self) -> Self { 500 _mm256_shuffle_epi8(self, indices) 501 } 502 503 #[inline(always)] for_each_64bit_lane<T>( self, mut f: impl FnMut(usize, u64) -> Option<T>, ) -> Option<T>504 unsafe fn for_each_64bit_lane<T>( 505 self, 506 mut f: impl FnMut(usize, u64) -> Option<T>, 507 ) -> Option<T> { 508 // NOTE: At one point in the past, I used transmute to this to 509 // get a [u64; 4], but it turned out to lead to worse codegen IIRC. 510 // I've tried it more recently, and it looks like that's no longer 511 // the case. But since there's no difference, we stick with the 512 // slightly more complicated but transmute-free version. 513 let lane = _mm256_extract_epi64(self, 0).to_bits(); 514 if let Some(t) = f(0, lane) { 515 return Some(t); 516 } 517 let lane = _mm256_extract_epi64(self, 1).to_bits(); 518 if let Some(t) = f(1, lane) { 519 return Some(t); 520 } 521 let lane = _mm256_extract_epi64(self, 2).to_bits(); 522 if let Some(t) = f(2, lane) { 523 return Some(t); 524 } 525 let lane = _mm256_extract_epi64(self, 3).to_bits(); 526 if let Some(t) = f(3, lane) { 527 return Some(t); 528 } 529 None 530 } 531 } 532 533 impl FatVector for __m256i { 534 type Half = __m128i; 535 536 #[inline(always)] load_half_unaligned(data: *const u8) -> Self537 unsafe fn load_half_unaligned(data: *const u8) -> Self { 538 let half = Self::Half::load_unaligned(data); 539 _mm256_broadcastsi128_si256(half) 540 } 541 542 #[inline(always)] half_shift_in_one_byte(self, vector2: Self) -> Self543 unsafe fn half_shift_in_one_byte(self, vector2: Self) -> Self { 544 _mm256_alignr_epi8(self, vector2, 15) 545 } 546 547 #[inline(always)] half_shift_in_two_bytes(self, vector2: Self) -> Self548 unsafe fn half_shift_in_two_bytes(self, vector2: Self) -> Self { 549 _mm256_alignr_epi8(self, vector2, 14) 550 } 551 552 #[inline(always)] half_shift_in_three_bytes(self, vector2: Self) -> Self553 unsafe fn half_shift_in_three_bytes(self, vector2: Self) -> Self { 554 _mm256_alignr_epi8(self, vector2, 13) 555 } 556 557 #[inline(always)] swap_halves(self) -> Self558 unsafe fn swap_halves(self) -> Self { 559 _mm256_permute4x64_epi64(self, 0x4E) 560 } 561 562 #[inline(always)] interleave_low_8bit_lanes(self, vector2: Self) -> Self563 unsafe fn interleave_low_8bit_lanes(self, vector2: Self) -> Self { 564 _mm256_unpacklo_epi8(self, vector2) 565 } 566 567 #[inline(always)] interleave_high_8bit_lanes(self, vector2: Self) -> Self568 unsafe fn interleave_high_8bit_lanes(self, vector2: Self) -> Self { 569 _mm256_unpackhi_epi8(self, vector2) 570 } 571 572 #[inline(always)] for_each_low_64bit_lane<T>( self, vector2: Self, mut f: impl FnMut(usize, u64) -> Option<T>, ) -> Option<T>573 unsafe fn for_each_low_64bit_lane<T>( 574 self, 575 vector2: Self, 576 mut f: impl FnMut(usize, u64) -> Option<T>, 577 ) -> Option<T> { 578 let lane = _mm256_extract_epi64(self, 0).to_bits(); 579 if let Some(t) = f(0, lane) { 580 return Some(t); 581 } 582 let lane = _mm256_extract_epi64(self, 1).to_bits(); 583 if let Some(t) = f(1, lane) { 584 return Some(t); 585 } 586 let lane = _mm256_extract_epi64(vector2, 0).to_bits(); 587 if let Some(t) = f(2, lane) { 588 return Some(t); 589 } 590 let lane = _mm256_extract_epi64(vector2, 1).to_bits(); 591 if let Some(t) = f(3, lane) { 592 return Some(t); 593 } 594 None 595 } 596 } 597 } 598 599 #[cfg(all( 600 target_arch = 
"aarch64", 601 target_feature = "neon", 602 target_endian = "little" 603 ))] 604 mod aarch64_neon { 605 use core::arch::aarch64::*; 606 607 use super::Vector; 608 609 impl Vector for uint8x16_t { 610 const BITS: usize = 128; 611 const BYTES: usize = 16; 612 613 #[inline(always)] splat(byte: u8) -> uint8x16_t614 unsafe fn splat(byte: u8) -> uint8x16_t { 615 vdupq_n_u8(byte) 616 } 617 618 #[inline(always)] load_unaligned(data: *const u8) -> uint8x16_t619 unsafe fn load_unaligned(data: *const u8) -> uint8x16_t { 620 vld1q_u8(data) 621 } 622 623 #[inline(always)] is_zero(self) -> bool624 unsafe fn is_zero(self) -> bool { 625 // Could also use vmaxvq_u8. 626 // ... I tried that and couldn't observe any meaningful difference 627 // in benchmarks. 628 let maxes = vreinterpretq_u64_u8(vpmaxq_u8(self, self)); 629 vgetq_lane_u64(maxes, 0) == 0 630 } 631 632 #[inline(always)] cmpeq(self, vector2: Self) -> uint8x16_t633 unsafe fn cmpeq(self, vector2: Self) -> uint8x16_t { 634 vceqq_u8(self, vector2) 635 } 636 637 #[inline(always)] and(self, vector2: Self) -> uint8x16_t638 unsafe fn and(self, vector2: Self) -> uint8x16_t { 639 vandq_u8(self, vector2) 640 } 641 642 #[inline(always)] or(self, vector2: Self) -> uint8x16_t643 unsafe fn or(self, vector2: Self) -> uint8x16_t { 644 vorrq_u8(self, vector2) 645 } 646 647 #[inline(always)] shift_8bit_lane_right<const BITS: i32>(self) -> Self648 unsafe fn shift_8bit_lane_right<const BITS: i32>(self) -> Self { 649 debug_assert!(BITS <= 7); 650 vshrq_n_u8(self, BITS) 651 } 652 653 #[inline(always)] shift_in_one_byte(self, vector2: Self) -> Self654 unsafe fn shift_in_one_byte(self, vector2: Self) -> Self { 655 vextq_u8(vector2, self, 15) 656 } 657 658 #[inline(always)] shift_in_two_bytes(self, vector2: Self) -> Self659 unsafe fn shift_in_two_bytes(self, vector2: Self) -> Self { 660 vextq_u8(vector2, self, 14) 661 } 662 663 #[inline(always)] shift_in_three_bytes(self, vector2: Self) -> Self664 unsafe fn shift_in_three_bytes(self, vector2: Self) -> Self { 665 vextq_u8(vector2, self, 13) 666 } 667 668 #[inline(always)] shuffle_bytes(self, indices: Self) -> Self669 unsafe fn shuffle_bytes(self, indices: Self) -> Self { 670 vqtbl1q_u8(self, indices) 671 } 672 673 #[inline(always)] for_each_64bit_lane<T>( self, mut f: impl FnMut(usize, u64) -> Option<T>, ) -> Option<T>674 unsafe fn for_each_64bit_lane<T>( 675 self, 676 mut f: impl FnMut(usize, u64) -> Option<T>, 677 ) -> Option<T> { 678 let this = vreinterpretq_u64_u8(self); 679 let lane = vgetq_lane_u64(this, 0); 680 if let Some(t) = f(0, lane) { 681 return Some(t); 682 } 683 let lane = vgetq_lane_u64(this, 1); 684 if let Some(t) = f(1, lane) { 685 return Some(t); 686 } 687 None 688 } 689 } 690 } 691 692 #[cfg(all(test, target_arch = "x86_64", target_feature = "sse2"))] 693 mod tests_x86_64_ssse3 { 694 use core::arch::x86_64::*; 695 696 use crate::util::int::{I32, U32}; 697 698 use super::*; 699 is_runnable() -> bool700 fn is_runnable() -> bool { 701 std::is_x86_feature_detected!("ssse3") 702 } 703 704 #[target_feature(enable = "ssse3")] load(lanes: [u8; 16]) -> __m128i705 unsafe fn load(lanes: [u8; 16]) -> __m128i { 706 __m128i::load_unaligned(&lanes as *const u8) 707 } 708 709 #[target_feature(enable = "ssse3")] unload(v: __m128i) -> [u8; 16]710 unsafe fn unload(v: __m128i) -> [u8; 16] { 711 [ 712 _mm_extract_epi8(v, 0).to_bits().low_u8(), 713 _mm_extract_epi8(v, 1).to_bits().low_u8(), 714 _mm_extract_epi8(v, 2).to_bits().low_u8(), 715 _mm_extract_epi8(v, 3).to_bits().low_u8(), 716 _mm_extract_epi8(v, 
4).to_bits().low_u8(), 717 _mm_extract_epi8(v, 5).to_bits().low_u8(), 718 _mm_extract_epi8(v, 6).to_bits().low_u8(), 719 _mm_extract_epi8(v, 7).to_bits().low_u8(), 720 _mm_extract_epi8(v, 8).to_bits().low_u8(), 721 _mm_extract_epi8(v, 9).to_bits().low_u8(), 722 _mm_extract_epi8(v, 10).to_bits().low_u8(), 723 _mm_extract_epi8(v, 11).to_bits().low_u8(), 724 _mm_extract_epi8(v, 12).to_bits().low_u8(), 725 _mm_extract_epi8(v, 13).to_bits().low_u8(), 726 _mm_extract_epi8(v, 14).to_bits().low_u8(), 727 _mm_extract_epi8(v, 15).to_bits().low_u8(), 728 ] 729 } 730 731 #[test] vector_splat()732 fn vector_splat() { 733 #[target_feature(enable = "ssse3")] 734 unsafe fn test() { 735 let v = __m128i::splat(0xAF); 736 assert_eq!( 737 unload(v), 738 [ 739 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 740 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF 741 ] 742 ); 743 } 744 if !is_runnable() { 745 return; 746 } 747 unsafe { test() } 748 } 749 750 #[test] vector_is_zero()751 fn vector_is_zero() { 752 #[target_feature(enable = "ssse3")] 753 unsafe fn test() { 754 let v = load([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]); 755 assert!(!v.is_zero()); 756 let v = load([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]); 757 assert!(v.is_zero()); 758 } 759 if !is_runnable() { 760 return; 761 } 762 unsafe { test() } 763 } 764 765 #[test] vector_cmpeq()766 fn vector_cmpeq() { 767 #[target_feature(enable = "ssse3")] 768 unsafe fn test() { 769 let v1 = 770 load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1]); 771 let v2 = 772 load([16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]); 773 assert_eq!( 774 unload(v1.cmpeq(v2)), 775 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xFF] 776 ); 777 } 778 if !is_runnable() { 779 return; 780 } 781 unsafe { test() } 782 } 783 784 #[test] vector_and()785 fn vector_and() { 786 #[target_feature(enable = "ssse3")] 787 unsafe fn test() { 788 let v1 = 789 load([0, 0, 0, 0, 0, 0b1001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]); 790 let v2 = 791 load([0, 0, 0, 0, 0, 0b1010, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]); 792 assert_eq!( 793 unload(v1.and(v2)), 794 [0, 0, 0, 0, 0, 0b1000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] 795 ); 796 } 797 if !is_runnable() { 798 return; 799 } 800 unsafe { test() } 801 } 802 803 #[test] vector_or()804 fn vector_or() { 805 #[target_feature(enable = "ssse3")] 806 unsafe fn test() { 807 let v1 = 808 load([0, 0, 0, 0, 0, 0b1001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]); 809 let v2 = 810 load([0, 0, 0, 0, 0, 0b1010, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]); 811 assert_eq!( 812 unload(v1.or(v2)), 813 [0, 0, 0, 0, 0, 0b1011, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] 814 ); 815 } 816 if !is_runnable() { 817 return; 818 } 819 unsafe { test() } 820 } 821 822 #[test] vector_shift_8bit_lane_right()823 fn vector_shift_8bit_lane_right() { 824 #[target_feature(enable = "ssse3")] 825 unsafe fn test() { 826 let v = load([ 827 0, 0, 0, 0, 0b1011, 0b0101, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 828 ]); 829 assert_eq!( 830 unload(v.shift_8bit_lane_right::<2>()), 831 [0, 0, 0, 0, 0b0010, 0b0001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] 832 ); 833 } 834 if !is_runnable() { 835 return; 836 } 837 unsafe { test() } 838 } 839 840 #[test] vector_shift_in_one_byte()841 fn vector_shift_in_one_byte() { 842 #[target_feature(enable = "ssse3")] 843 unsafe fn test() { 844 let v1 = 845 load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]); 846 let v2 = load([ 847 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 848 ]); 849 assert_eq!( 850 unload(v1.shift_in_one_byte(v2)), 851 [32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], 
852 ); 853 } 854 if !is_runnable() { 855 return; 856 } 857 unsafe { test() } 858 } 859 860 #[test] vector_shift_in_two_bytes()861 fn vector_shift_in_two_bytes() { 862 #[target_feature(enable = "ssse3")] 863 unsafe fn test() { 864 let v1 = 865 load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]); 866 let v2 = load([ 867 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 868 ]); 869 assert_eq!( 870 unload(v1.shift_in_two_bytes(v2)), 871 [31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], 872 ); 873 } 874 if !is_runnable() { 875 return; 876 } 877 unsafe { test() } 878 } 879 880 #[test] vector_shift_in_three_bytes()881 fn vector_shift_in_three_bytes() { 882 #[target_feature(enable = "ssse3")] 883 unsafe fn test() { 884 let v1 = 885 load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]); 886 let v2 = load([ 887 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 888 ]); 889 assert_eq!( 890 unload(v1.shift_in_three_bytes(v2)), 891 [30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], 892 ); 893 } 894 if !is_runnable() { 895 return; 896 } 897 unsafe { test() } 898 } 899 900 #[test] vector_shuffle_bytes()901 fn vector_shuffle_bytes() { 902 #[target_feature(enable = "ssse3")] 903 unsafe fn test() { 904 let v1 = 905 load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]); 906 let v2 = 907 load([0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12]); 908 assert_eq!( 909 unload(v1.shuffle_bytes(v2)), 910 [1, 1, 1, 1, 5, 5, 5, 5, 9, 9, 9, 9, 13, 13, 13, 13], 911 ); 912 } 913 if !is_runnable() { 914 return; 915 } 916 unsafe { test() } 917 } 918 919 #[test] vector_for_each_64bit_lane()920 fn vector_for_each_64bit_lane() { 921 #[target_feature(enable = "ssse3")] 922 unsafe fn test() { 923 let v = load([ 924 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 925 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 926 ]); 927 let mut lanes = [0u64; 2]; 928 v.for_each_64bit_lane(|i, lane| { 929 lanes[i] = lane; 930 None::<()> 931 }); 932 assert_eq!(lanes, [0x0807060504030201, 0x100F0E0D0C0B0A09],); 933 } 934 if !is_runnable() { 935 return; 936 } 937 unsafe { test() } 938 } 939 } 940 941 #[cfg(all(test, target_arch = "x86_64", target_feature = "sse2"))] 942 mod tests_x86_64_avx2 { 943 use core::arch::x86_64::*; 944 945 use crate::util::int::{I32, U32}; 946 947 use super::*; 948 is_runnable() -> bool949 fn is_runnable() -> bool { 950 std::is_x86_feature_detected!("avx2") 951 } 952 953 #[target_feature(enable = "avx2")] load(lanes: [u8; 32]) -> __m256i954 unsafe fn load(lanes: [u8; 32]) -> __m256i { 955 __m256i::load_unaligned(&lanes as *const u8) 956 } 957 958 #[target_feature(enable = "avx2")] load_half(lanes: [u8; 16]) -> __m256i959 unsafe fn load_half(lanes: [u8; 16]) -> __m256i { 960 __m256i::load_half_unaligned(&lanes as *const u8) 961 } 962 963 #[target_feature(enable = "avx2")] unload(v: __m256i) -> [u8; 32]964 unsafe fn unload(v: __m256i) -> [u8; 32] { 965 [ 966 _mm256_extract_epi8(v, 0).to_bits().low_u8(), 967 _mm256_extract_epi8(v, 1).to_bits().low_u8(), 968 _mm256_extract_epi8(v, 2).to_bits().low_u8(), 969 _mm256_extract_epi8(v, 3).to_bits().low_u8(), 970 _mm256_extract_epi8(v, 4).to_bits().low_u8(), 971 _mm256_extract_epi8(v, 5).to_bits().low_u8(), 972 _mm256_extract_epi8(v, 6).to_bits().low_u8(), 973 _mm256_extract_epi8(v, 7).to_bits().low_u8(), 974 _mm256_extract_epi8(v, 8).to_bits().low_u8(), 975 _mm256_extract_epi8(v, 9).to_bits().low_u8(), 976 _mm256_extract_epi8(v, 10).to_bits().low_u8(), 977 _mm256_extract_epi8(v, 11).to_bits().low_u8(), 978 
_mm256_extract_epi8(v, 12).to_bits().low_u8(), 979 _mm256_extract_epi8(v, 13).to_bits().low_u8(), 980 _mm256_extract_epi8(v, 14).to_bits().low_u8(), 981 _mm256_extract_epi8(v, 15).to_bits().low_u8(), 982 _mm256_extract_epi8(v, 16).to_bits().low_u8(), 983 _mm256_extract_epi8(v, 17).to_bits().low_u8(), 984 _mm256_extract_epi8(v, 18).to_bits().low_u8(), 985 _mm256_extract_epi8(v, 19).to_bits().low_u8(), 986 _mm256_extract_epi8(v, 20).to_bits().low_u8(), 987 _mm256_extract_epi8(v, 21).to_bits().low_u8(), 988 _mm256_extract_epi8(v, 22).to_bits().low_u8(), 989 _mm256_extract_epi8(v, 23).to_bits().low_u8(), 990 _mm256_extract_epi8(v, 24).to_bits().low_u8(), 991 _mm256_extract_epi8(v, 25).to_bits().low_u8(), 992 _mm256_extract_epi8(v, 26).to_bits().low_u8(), 993 _mm256_extract_epi8(v, 27).to_bits().low_u8(), 994 _mm256_extract_epi8(v, 28).to_bits().low_u8(), 995 _mm256_extract_epi8(v, 29).to_bits().low_u8(), 996 _mm256_extract_epi8(v, 30).to_bits().low_u8(), 997 _mm256_extract_epi8(v, 31).to_bits().low_u8(), 998 ] 999 } 1000 1001 #[test] vector_splat()1002 fn vector_splat() { 1003 #[target_feature(enable = "avx2")] 1004 unsafe fn test() { 1005 let v = __m256i::splat(0xAF); 1006 assert_eq!( 1007 unload(v), 1008 [ 1009 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 1010 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 1011 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 1012 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 1013 ] 1014 ); 1015 } 1016 if !is_runnable() { 1017 return; 1018 } 1019 unsafe { test() } 1020 } 1021 1022 #[test] vector_is_zero()1023 fn vector_is_zero() { 1024 #[target_feature(enable = "avx2")] 1025 unsafe fn test() { 1026 let v = load([ 1027 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1028 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1029 ]); 1030 assert!(!v.is_zero()); 1031 let v = load([ 1032 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1033 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1034 ]); 1035 assert!(v.is_zero()); 1036 } 1037 if !is_runnable() { 1038 return; 1039 } 1040 unsafe { test() } 1041 } 1042 1043 #[test] vector_cmpeq()1044 fn vector_cmpeq() { 1045 #[target_feature(enable = "avx2")] 1046 unsafe fn test() { 1047 let v1 = load([ 1048 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 1049 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 1, 1050 ]); 1051 let v2 = load([ 1052 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 1053 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 1054 ]); 1055 assert_eq!( 1056 unload(v1.cmpeq(v2)), 1057 [ 1058 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1059 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xFF 1060 ] 1061 ); 1062 } 1063 if !is_runnable() { 1064 return; 1065 } 1066 unsafe { test() } 1067 } 1068 1069 #[test] vector_and()1070 fn vector_and() { 1071 #[target_feature(enable = "avx2")] 1072 unsafe fn test() { 1073 let v1 = load([ 1074 0, 0, 0, 0, 0, 0b1001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1075 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1076 ]); 1077 let v2 = load([ 1078 0, 0, 0, 0, 0, 0b1010, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1079 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1080 ]); 1081 assert_eq!( 1082 unload(v1.and(v2)), 1083 [ 1084 0, 0, 0, 0, 0, 0b1000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1085 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1086 ] 1087 ); 1088 } 1089 if !is_runnable() { 1090 return; 1091 } 1092 unsafe { test() } 1093 } 1094 1095 #[test] vector_or()1096 fn vector_or() { 1097 #[target_feature(enable = "avx2")] 1098 unsafe fn test() { 1099 let v1 = load([ 
1100 0, 0, 0, 0, 0, 0b1001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1101 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1102 ]); 1103 let v2 = load([ 1104 0, 0, 0, 0, 0, 0b1010, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1105 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1106 ]); 1107 assert_eq!( 1108 unload(v1.or(v2)), 1109 [ 1110 0, 0, 0, 0, 0, 0b1011, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1111 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1112 ] 1113 ); 1114 } 1115 if !is_runnable() { 1116 return; 1117 } 1118 unsafe { test() } 1119 } 1120 1121 #[test] vector_shift_8bit_lane_right()1122 fn vector_shift_8bit_lane_right() { 1123 #[target_feature(enable = "avx2")] 1124 unsafe fn test() { 1125 let v = load([ 1126 0, 0, 0, 0, 0b1011, 0b0101, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1127 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1128 ]); 1129 assert_eq!( 1130 unload(v.shift_8bit_lane_right::<2>()), 1131 [ 1132 0, 0, 0, 0, 0b0010, 0b0001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1133 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1134 ] 1135 ); 1136 } 1137 if !is_runnable() { 1138 return; 1139 } 1140 unsafe { test() } 1141 } 1142 1143 #[test] vector_shift_in_one_byte()1144 fn vector_shift_in_one_byte() { 1145 #[target_feature(enable = "avx2")] 1146 unsafe fn test() { 1147 let v1 = load([ 1148 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 1149 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1150 ]); 1151 let v2 = load([ 1152 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 1153 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 1154 63, 64, 1155 ]); 1156 assert_eq!( 1157 unload(v1.shift_in_one_byte(v2)), 1158 [ 1159 64, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1160 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 1161 31, 1162 ], 1163 ); 1164 } 1165 if !is_runnable() { 1166 return; 1167 } 1168 unsafe { test() } 1169 } 1170 1171 #[test] vector_shift_in_two_bytes()1172 fn vector_shift_in_two_bytes() { 1173 #[target_feature(enable = "avx2")] 1174 unsafe fn test() { 1175 let v1 = load([ 1176 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 1177 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1178 ]); 1179 let v2 = load([ 1180 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 1181 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 1182 63, 64, 1183 ]); 1184 assert_eq!( 1185 unload(v1.shift_in_two_bytes(v2)), 1186 [ 1187 63, 64, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1188 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 1189 30, 1190 ], 1191 ); 1192 } 1193 if !is_runnable() { 1194 return; 1195 } 1196 unsafe { test() } 1197 } 1198 1199 #[test] vector_shift_in_three_bytes()1200 fn vector_shift_in_three_bytes() { 1201 #[target_feature(enable = "avx2")] 1202 unsafe fn test() { 1203 let v1 = load([ 1204 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 1205 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1206 ]); 1207 let v2 = load([ 1208 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 1209 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 1210 63, 64, 1211 ]); 1212 assert_eq!( 1213 unload(v1.shift_in_three_bytes(v2)), 1214 [ 1215 62, 63, 64, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1216 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 1217 29, 1218 ], 1219 ); 1220 } 1221 if !is_runnable() { 1222 return; 1223 } 1224 unsafe { test() } 1225 } 1226 1227 #[test] vector_shuffle_bytes()1228 fn vector_shuffle_bytes() { 1229 #[target_feature(enable = "avx2")] 1230 unsafe fn 
test() { 1231 let v1 = load([ 1232 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 1233 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1234 ]); 1235 let v2 = load([ 1236 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12, 16, 16, 1237 16, 16, 20, 20, 20, 20, 24, 24, 24, 24, 28, 28, 28, 28, 1238 ]); 1239 assert_eq!( 1240 unload(v1.shuffle_bytes(v2)), 1241 [ 1242 1, 1, 1, 1, 5, 5, 5, 5, 9, 9, 9, 9, 13, 13, 13, 13, 17, 1243 17, 17, 17, 21, 21, 21, 21, 25, 25, 25, 25, 29, 29, 29, 1244 29 1245 ], 1246 ); 1247 } 1248 if !is_runnable() { 1249 return; 1250 } 1251 unsafe { test() } 1252 } 1253 1254 #[test] vector_for_each_64bit_lane()1255 fn vector_for_each_64bit_lane() { 1256 #[target_feature(enable = "avx2")] 1257 unsafe fn test() { 1258 let v = load([ 1259 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 1260 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 1261 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 1262 0x1F, 0x20, 1263 ]); 1264 let mut lanes = [0u64; 4]; 1265 v.for_each_64bit_lane(|i, lane| { 1266 lanes[i] = lane; 1267 None::<()> 1268 }); 1269 assert_eq!( 1270 lanes, 1271 [ 1272 0x0807060504030201, 1273 0x100F0E0D0C0B0A09, 1274 0x1817161514131211, 1275 0x201F1E1D1C1B1A19 1276 ] 1277 ); 1278 } 1279 if !is_runnable() { 1280 return; 1281 } 1282 unsafe { test() } 1283 } 1284 1285 #[test] fat_vector_half_shift_in_one_byte()1286 fn fat_vector_half_shift_in_one_byte() { 1287 #[target_feature(enable = "avx2")] 1288 unsafe fn test() { 1289 let v1 = load_half([ 1290 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1291 ]); 1292 let v2 = load_half([ 1293 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1294 ]); 1295 assert_eq!( 1296 unload(v1.half_shift_in_one_byte(v2)), 1297 [ 1298 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32, 1299 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 1300 ], 1301 ); 1302 } 1303 if !is_runnable() { 1304 return; 1305 } 1306 unsafe { test() } 1307 } 1308 1309 #[test] fat_vector_half_shift_in_two_bytes()1310 fn fat_vector_half_shift_in_two_bytes() { 1311 #[target_feature(enable = "avx2")] 1312 unsafe fn test() { 1313 let v1 = load_half([ 1314 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1315 ]); 1316 let v2 = load_half([ 1317 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1318 ]); 1319 assert_eq!( 1320 unload(v1.half_shift_in_two_bytes(v2)), 1321 [ 1322 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 31, 1323 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1324 ], 1325 ); 1326 } 1327 if !is_runnable() { 1328 return; 1329 } 1330 unsafe { test() } 1331 } 1332 1333 #[test] fat_vector_half_shift_in_three_bytes()1334 fn fat_vector_half_shift_in_three_bytes() { 1335 #[target_feature(enable = "avx2")] 1336 unsafe fn test() { 1337 let v1 = load_half([ 1338 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1339 ]); 1340 let v2 = load_half([ 1341 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1342 ]); 1343 assert_eq!( 1344 unload(v1.half_shift_in_three_bytes(v2)), 1345 [ 1346 30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 30, 1347 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1348 ], 1349 ); 1350 } 1351 if !is_runnable() { 1352 return; 1353 } 1354 unsafe { test() } 1355 } 1356 1357 #[test] fat_vector_swap_halves()1358 fn fat_vector_swap_halves() { 1359 #[target_feature(enable = "avx2")] 1360 unsafe fn test() { 1361 let v = load([ 1362 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 1363 19, 20, 21, 
22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1364 ]); 1365 assert_eq!( 1366 unload(v.swap_halves()), 1367 [ 1368 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 1369 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1370 16, 1371 ], 1372 ); 1373 } 1374 if !is_runnable() { 1375 return; 1376 } 1377 unsafe { test() } 1378 } 1379 1380 #[test] fat_vector_interleave_low_8bit_lanes()1381 fn fat_vector_interleave_low_8bit_lanes() { 1382 #[target_feature(enable = "avx2")] 1383 unsafe fn test() { 1384 let v1 = load([ 1385 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 1386 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1387 ]); 1388 let v2 = load([ 1389 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 1390 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 1391 63, 64, 1392 ]); 1393 assert_eq!( 1394 unload(v1.interleave_low_8bit_lanes(v2)), 1395 [ 1396 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40, 1397 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55, 1398 24, 56, 1399 ], 1400 ); 1401 } 1402 if !is_runnable() { 1403 return; 1404 } 1405 unsafe { test() } 1406 } 1407 1408 #[test] fat_vector_interleave_high_8bit_lanes()1409 fn fat_vector_interleave_high_8bit_lanes() { 1410 #[target_feature(enable = "avx2")] 1411 unsafe fn test() { 1412 let v1 = load([ 1413 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 1414 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1415 ]); 1416 let v2 = load([ 1417 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 1418 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 1419 63, 64, 1420 ]); 1421 assert_eq!( 1422 unload(v1.interleave_high_8bit_lanes(v2)), 1423 [ 1424 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47, 16, 1425 48, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 1426 63, 32, 64, 1427 ], 1428 ); 1429 } 1430 if !is_runnable() { 1431 return; 1432 } 1433 unsafe { test() } 1434 } 1435 1436 #[test] fat_vector_for_each_low_64bit_lane()1437 fn fat_vector_for_each_low_64bit_lane() { 1438 #[target_feature(enable = "avx2")] 1439 unsafe fn test() { 1440 let v1 = load([ 1441 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 1442 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 1443 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 1444 0x1F, 0x20, 1445 ]); 1446 let v2 = load([ 1447 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2A, 1448 0x2B, 0x2C, 0x2D, 0x2E, 0x2F, 0x30, 0x31, 0x32, 0x33, 0x34, 1449 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 1450 0x3F, 0x40, 1451 ]); 1452 let mut lanes = [0u64; 4]; 1453 v1.for_each_low_64bit_lane(v2, |i, lane| { 1454 lanes[i] = lane; 1455 None::<()> 1456 }); 1457 assert_eq!( 1458 lanes, 1459 [ 1460 0x0807060504030201, 1461 0x100F0E0D0C0B0A09, 1462 0x2827262524232221, 1463 0x302F2E2D2C2B2A29 1464 ] 1465 ); 1466 } 1467 if !is_runnable() { 1468 return; 1469 } 1470 unsafe { test() } 1471 } 1472 } 1473 1474 #[cfg(all(test, target_arch = "aarch64", target_feature = "neon"))] 1475 mod tests_aarch64_neon { 1476 use core::arch::aarch64::*; 1477 1478 use super::*; 1479 1480 #[target_feature(enable = "neon")] load(lanes: [u8; 16]) -> uint8x16_t1481 unsafe fn load(lanes: [u8; 16]) -> uint8x16_t { 1482 uint8x16_t::load_unaligned(&lanes as *const u8) 1483 } 1484 1485 #[target_feature(enable = "neon")] unload(v: uint8x16_t) -> [u8; 16]1486 unsafe fn unload(v: uint8x16_t) -> [u8; 16] { 1487 [ 1488 vgetq_lane_u8(v, 0), 1489 vgetq_lane_u8(v, 1), 1490 vgetq_lane_u8(v, 2), 1491 
            vgetq_lane_u8(v, 3),
            vgetq_lane_u8(v, 4),
            vgetq_lane_u8(v, 5),
            vgetq_lane_u8(v, 6),
            vgetq_lane_u8(v, 7),
            vgetq_lane_u8(v, 8),
            vgetq_lane_u8(v, 9),
            vgetq_lane_u8(v, 10),
            vgetq_lane_u8(v, 11),
            vgetq_lane_u8(v, 12),
            vgetq_lane_u8(v, 13),
            vgetq_lane_u8(v, 14),
            vgetq_lane_u8(v, 15),
        ]
    }

    // Example functions. These don't test the Vector traits, but rather,
    // specific NEON instructions. They are basically little experiments I
    // wrote to figure out what an instruction does since their descriptions
    // are so dense. I decided to keep the experiments around as example tests
    // in case they're useful.

    #[test]
    fn example_vmaxvq_u8_non_zero() {
        #[target_feature(enable = "neon")]
        unsafe fn example() {
            let v = load([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
            assert_eq!(vmaxvq_u8(v), 1);
        }
        unsafe { example() }
    }

    #[test]
    fn example_vmaxvq_u8_zero() {
        #[target_feature(enable = "neon")]
        unsafe fn example() {
            let v = load([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
            assert_eq!(vmaxvq_u8(v), 0);
        }
        unsafe { example() }
    }

    #[test]
    fn example_vpmaxq_u8_non_zero() {
        #[target_feature(enable = "neon")]
        unsafe fn example() {
            let v = load([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
            let r = vpmaxq_u8(v, v);
            assert_eq!(
                unload(r),
                [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]
            );
        }
        unsafe { example() }
    }

    #[test]
    fn example_vpmaxq_u8_self() {
        #[target_feature(enable = "neon")]
        unsafe fn example() {
            let v =
                load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
            let r = vpmaxq_u8(v, v);
            assert_eq!(
                unload(r),
                [2, 4, 6, 8, 10, 12, 14, 16, 2, 4, 6, 8, 10, 12, 14, 16]
            );
        }
        unsafe { example() }
    }

    #[test]
    fn example_vpmaxq_u8_other() {
        #[target_feature(enable = "neon")]
        unsafe fn example() {
            let v1 =
                load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
            let v2 = load([
                17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
            ]);
            let r = vpmaxq_u8(v1, v2);
            assert_eq!(
                unload(r),
                [2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32]
            );
        }
        unsafe { example() }
    }

    // Now we test the actual methods on the Vector trait.
1581 1582 #[test] vector_splat()1583 fn vector_splat() { 1584 #[target_feature(enable = "neon")] 1585 unsafe fn test() { 1586 let v = uint8x16_t::splat(0xAF); 1587 assert_eq!( 1588 unload(v), 1589 [ 1590 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 1591 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF 1592 ] 1593 ); 1594 } 1595 unsafe { test() } 1596 } 1597 1598 #[test] vector_is_zero()1599 fn vector_is_zero() { 1600 #[target_feature(enable = "neon")] 1601 unsafe fn test() { 1602 let v = load([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]); 1603 assert!(!v.is_zero()); 1604 let v = load([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]); 1605 assert!(v.is_zero()); 1606 } 1607 unsafe { test() } 1608 } 1609 1610 #[test] vector_cmpeq()1611 fn vector_cmpeq() { 1612 #[target_feature(enable = "neon")] 1613 unsafe fn test() { 1614 let v1 = 1615 load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1]); 1616 let v2 = 1617 load([16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]); 1618 assert_eq!( 1619 unload(v1.cmpeq(v2)), 1620 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xFF] 1621 ); 1622 } 1623 unsafe { test() } 1624 } 1625 1626 #[test] vector_and()1627 fn vector_and() { 1628 #[target_feature(enable = "neon")] 1629 unsafe fn test() { 1630 let v1 = 1631 load([0, 0, 0, 0, 0, 0b1001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]); 1632 let v2 = 1633 load([0, 0, 0, 0, 0, 0b1010, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]); 1634 assert_eq!( 1635 unload(v1.and(v2)), 1636 [0, 0, 0, 0, 0, 0b1000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] 1637 ); 1638 } 1639 unsafe { test() } 1640 } 1641 1642 #[test] vector_or()1643 fn vector_or() { 1644 #[target_feature(enable = "neon")] 1645 unsafe fn test() { 1646 let v1 = 1647 load([0, 0, 0, 0, 0, 0b1001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]); 1648 let v2 = 1649 load([0, 0, 0, 0, 0, 0b1010, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]); 1650 assert_eq!( 1651 unload(v1.or(v2)), 1652 [0, 0, 0, 0, 0, 0b1011, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] 1653 ); 1654 } 1655 unsafe { test() } 1656 } 1657 1658 #[test] vector_shift_8bit_lane_right()1659 fn vector_shift_8bit_lane_right() { 1660 #[target_feature(enable = "neon")] 1661 unsafe fn test() { 1662 let v = load([ 1663 0, 0, 0, 0, 0b1011, 0b0101, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1664 ]); 1665 assert_eq!( 1666 unload(v.shift_8bit_lane_right::<2>()), 1667 [0, 0, 0, 0, 0b0010, 0b0001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] 1668 ); 1669 } 1670 unsafe { test() } 1671 } 1672 1673 #[test] vector_shift_in_one_byte()1674 fn vector_shift_in_one_byte() { 1675 #[target_feature(enable = "neon")] 1676 unsafe fn test() { 1677 let v1 = 1678 load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]); 1679 let v2 = load([ 1680 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1681 ]); 1682 assert_eq!( 1683 unload(v1.shift_in_one_byte(v2)), 1684 [32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], 1685 ); 1686 } 1687 unsafe { test() } 1688 } 1689 1690 #[test] vector_shift_in_two_bytes()1691 fn vector_shift_in_two_bytes() { 1692 #[target_feature(enable = "neon")] 1693 unsafe fn test() { 1694 let v1 = 1695 load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]); 1696 let v2 = load([ 1697 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1698 ]); 1699 assert_eq!( 1700 unload(v1.shift_in_two_bytes(v2)), 1701 [31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], 1702 ); 1703 } 1704 unsafe { test() } 1705 } 1706 1707 #[test] vector_shift_in_three_bytes()1708 fn vector_shift_in_three_bytes() { 1709 #[target_feature(enable = "neon")] 1710 unsafe fn test() { 1711 let v1 = 1712 
load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]); 1713 let v2 = load([ 1714 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 1715 ]); 1716 assert_eq!( 1717 unload(v1.shift_in_three_bytes(v2)), 1718 [30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13], 1719 ); 1720 } 1721 unsafe { test() } 1722 } 1723 1724 #[test] vector_shuffle_bytes()1725 fn vector_shuffle_bytes() { 1726 #[target_feature(enable = "neon")] 1727 unsafe fn test() { 1728 let v1 = 1729 load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]); 1730 let v2 = 1731 load([0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12]); 1732 assert_eq!( 1733 unload(v1.shuffle_bytes(v2)), 1734 [1, 1, 1, 1, 5, 5, 5, 5, 9, 9, 9, 9, 13, 13, 13, 13], 1735 ); 1736 } 1737 unsafe { test() } 1738 } 1739 1740 #[test] vector_for_each_64bit_lane()1741 fn vector_for_each_64bit_lane() { 1742 #[target_feature(enable = "neon")] 1743 unsafe fn test() { 1744 let v = load([ 1745 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 1746 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 1747 ]); 1748 let mut lanes = [0u64; 2]; 1749 v.for_each_64bit_lane(|i, lane| { 1750 lanes[i] = lane; 1751 None::<()> 1752 }); 1753 assert_eq!(lanes, [0x0807060504030201, 0x100F0E0D0C0B0A09],); 1754 } 1755 unsafe { test() } 1756 } 1757 } 1758
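
// The following module is an illustrative sketch rather than part of the
// Teddy implementation: it shows how the `Vector` ops above compose in
// practice by splatting a needle byte, comparing it against a chunk of
// haystack and scanning the comparison mask 64 bits at a time. The module
// and the `first_match_in_chunk` helper are hypothetical names added purely
// for illustration.
#[cfg(all(test, target_arch = "x86_64", target_feature = "sse2"))]
mod example_usage_x86_64_ssse3 {
    use core::arch::x86_64::*;

    use super::Vector;

    /// Returns the offset of the first occurrence of `needle` in the first
    /// `__m128i::BYTES` bytes of `chunk`, if any.
    #[target_feature(enable = "ssse3")]
    unsafe fn first_match_in_chunk(needle: u8, chunk: &[u8]) -> Option<usize> {
        assert!(chunk.len() >= __m128i::BYTES);
        let hay = __m128i::load_unaligned(chunk.as_ptr());
        let mask = hay.cmpeq(__m128i::splat(needle));
        if mask.is_zero() {
            return None;
        }
        // Every matching lane in `mask` is 0xFF, so counting the trailing
        // zeros of each 64-bit lane gives the byte offset of the first match
        // within that lane.
        mask.for_each_64bit_lane(|i, lane| {
            if lane == 0 {
                None
            } else {
                Some(i * 8 + (lane.trailing_zeros() / 8) as usize)
            }
        })
    }

    #[test]
    fn example_find_byte() {
        if !std::is_x86_feature_detected!("ssse3") {
            return;
        }
        unsafe {
            let haystack = b"teddy finds needles!";
            assert_eq!(first_match_in_chunk(b'n', haystack), Some(8));
            assert_eq!(first_match_in_chunk(b'z', haystack), None);
        }
    }
}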