// NOTE: The descriptions for each of the vector methods on the traits below
// are pretty inscrutable. For this reason, there are tests for every method
// for every trait impl below. If you're confused about what an op does,
// consult its test. (They probably should be doc tests, but I couldn't figure
// out how to write them in a non-annoying way.)

use core::{
    fmt::Debug,
    panic::{RefUnwindSafe, UnwindSafe},
};

/// A trait for describing vector operations used by vectorized searchers.
///
/// The trait is highly constrained to low level vector operations needed for
/// the specific algorithms used in this crate. In general, it was invented
/// mostly to be generic over x86's __m128i and __m256i types. At time of
/// writing, it also supports wasm and aarch64 128-bit vector types.
///
/// # Safety
///
/// All methods are not safe since they are intended to be implemented using
/// vendor intrinsics, which are also not safe. Callers must ensure that
/// the appropriate target features are enabled in the calling function,
/// and that the current CPU supports them. All implementations should
/// avoid marking the routines with `#[target_feature]` and instead mark
/// them as `#[inline(always)]` to ensure they get appropriately inlined.
/// (`inline(always)` cannot be used with target_feature.)
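// NOTE: As a sketch of the intended calling convention (the function names
// here are hypothetical, but the pattern mirrors the tests at the bottom of
// this file), a caller checks for CPU support at runtime and then enters a
// `#[target_feature]`-annotated routine that uses these methods:
//
//     fn search(haystack: &[u8]) {
//         if std::is_x86_feature_detected!("ssse3") {
//             // SAFETY: We just confirmed that ssse3 is available.
//             unsafe { search_ssse3(haystack) }
//         }
//     }
//
//     #[target_feature(enable = "ssse3")]
//     unsafe fn search_ssse3(haystack: &[u8]) {
//         // ... use Vector methods on __m128i here ...
//     }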
pub(crate) trait Vector:
    Copy + Debug + Send + Sync + UnwindSafe + RefUnwindSafe
{
    /// The number of bits in the vector.
    const BITS: usize;
    /// The number of bytes in the vector. That is, this is the size of the
    /// vector in memory.
    const BYTES: usize;

    /// Create a vector with 8-bit lanes with the given byte repeated into each
    /// lane.
    ///
    /// # Safety
    ///
    /// Callers must ensure that this is okay to call in the current target for
    /// the current CPU.
    unsafe fn splat(byte: u8) -> Self;

    /// Read a vector-size number of bytes from the given pointer. The pointer
    /// does not need to be aligned.
    ///
    /// # Safety
    ///
    /// Callers must ensure that this is okay to call in the current target for
    /// the current CPU.
    ///
    /// Callers must guarantee that at least `BYTES` bytes are readable from
    /// `data`.
    unsafe fn load_unaligned(data: *const u8) -> Self;

    /// Returns true if and only if this vector has zero in all of its lanes.
    ///
    /// # Safety
    ///
    /// Callers must ensure that this is okay to call in the current target for
    /// the current CPU.
    unsafe fn is_zero(self) -> bool;

    /// Do an 8-bit pairwise equality check. If lane `i` is equal in this
    /// vector and the one given, then lane `i` in the resulting vector is set
    /// to `0xFF`. Otherwise, it is set to `0x00`.
    ///
    /// # Safety
    ///
    /// Callers must ensure that this is okay to call in the current target for
    /// the current CPU.
    unsafe fn cmpeq(self, vector2: Self) -> Self;

    /// Perform a bitwise 'and' of this vector and the one given and return
    /// the result.
    ///
    /// # Safety
    ///
    /// Callers must ensure that this is okay to call in the current target for
    /// the current CPU.
    unsafe fn and(self, vector2: Self) -> Self;

    /// Perform a bitwise 'or' of this vector and the one given and return
    /// the result.
    ///
    /// # Safety
    ///
    /// Callers must ensure that this is okay to call in the current target for
    /// the current CPU.
    #[allow(dead_code)] // unused, but useful enough to keep around?
    unsafe fn or(self, vector2: Self) -> Self;

    /// Shift each 8-bit lane in this vector to the right by the number of
    /// bits indicated by the `BITS` type parameter.
    ///
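    /// For example, shifting the 8-bit lane `0b1011` right by two bits yields
    /// `0b0010` (see the `vector_shift_8bit_lane_right` tests below).
    ///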
    /// # Safety
    ///
    /// Callers must ensure that this is okay to call in the current target for
    /// the current CPU.
    unsafe fn shift_8bit_lane_right<const BITS: i32>(self) -> Self;

    /// Shift this vector to the left by one byte and shift the most
    /// significant byte of `vector2` into the least significant position of
    /// this vector.
    ///
    /// Stated differently, this behaves as if `self` and `vector2` were
    /// concatenated into a `2 * Self::BITS` temporary buffer and then shifted
    /// right by `Self::BYTES - 1` bytes.
    ///
    /// With respect to the Teddy algorithm, `vector2` is usually a previous
    /// `Self::BYTES` chunk from the haystack and `self` is the chunk
    /// immediately following it. This permits combining the last byte
    /// from the previous chunk (`vector2`) with the first `Self::BYTES - 1`
    /// bytes from the current chunk. This permits aligning the result of
    /// various shuffles so that they can be and-ed together and a possible
    /// candidate discovered.
    ///
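    /// For example, with 128-bit vectors, if `self` holds `[1, 2, ..., 16]`
    /// and `vector2` holds `[17, 18, ..., 32]`, then the result holds
    /// `[32, 1, 2, ..., 15]` (see the `vector_shift_in_one_byte` tests
    /// below).
    ///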
    /// # Safety
    ///
    /// Callers must ensure that this is okay to call in the current target for
    /// the current CPU.
    unsafe fn shift_in_one_byte(self, vector2: Self) -> Self;

    /// Shift this vector to the left by two bytes and shift the two most
    /// significant bytes of `vector2` into the least significant position of
    /// this vector.
    ///
    /// Stated differently, this behaves as if `self` and `vector2` were
    /// concatenated into a `2 * Self::BITS` temporary buffer and then shifted
    /// right by `Self::BYTES - 2` bytes.
    ///
    /// With respect to the Teddy algorithm, `vector2` is usually a previous
    /// `Self::BYTES` chunk from the haystack and `self` is the chunk
    /// immediately following it. This permits combining the last two bytes
    /// from the previous chunk (`vector2`) with the first `Self::BYTES - 2`
    /// bytes from the current chunk. This permits aligning the result of
    /// various shuffles so that they can be and-ed together and a possible
    /// candidate discovered.
    ///
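    /// For example, with the same inputs as the `shift_in_one_byte` example
    /// above, the result holds `[31, 32, 1, 2, ..., 14]` (see the
    /// `vector_shift_in_two_bytes` tests below).
    ///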
    /// # Safety
    ///
    /// Callers must ensure that this is okay to call in the current target for
    /// the current CPU.
    unsafe fn shift_in_two_bytes(self, vector2: Self) -> Self;

    /// Shift this vector to the left by three bytes and shift the three most
    /// significant bytes of `vector2` into the least significant position of
    /// this vector.
    ///
    /// Stated differently, this behaves as if `self` and `vector2` were
    /// concatenated into a `2 * Self::BITS` temporary buffer and then shifted
    /// right by `Self::BYTES - 3` bytes.
    ///
    /// With respect to the Teddy algorithm, `vector2` is usually a previous
    /// `Self::BYTES` chunk from the haystack and `self` is the chunk
    /// immediately following it. This permits combining the last three bytes
    /// from the previous chunk (`vector2`) with the first `Self::BYTES - 3`
    /// bytes from the current chunk. This permits aligning the result of
    /// various shuffles so that they can be and-ed together and a possible
    /// candidate discovered.
    ///
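    /// For example, with the same inputs as the `shift_in_one_byte` example
    /// above, the result holds `[30, 31, 32, 1, 2, ..., 13]` (see the
    /// `vector_shift_in_three_bytes` tests below).
    ///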
    /// # Safety
    ///
    /// Callers must ensure that this is okay to call in the current target for
    /// the current CPU.
    unsafe fn shift_in_three_bytes(self, vector2: Self) -> Self;

    /// Shuffles the bytes in this vector according to the indices in each of
    /// the corresponding lanes in `indices`.
    ///
    /// If `i` is the index of corresponding lanes, `A` is this vector, `B` is
    /// `indices` and `C` is the resulting vector, then `C[i] = A[B[i]]`.
    ///
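    /// For example, if `self` holds `[1, 2, ..., 16]` and `indices` holds
    /// `[0, 0, 0, 0, 4, 4, 4, 4, ...]`, then the result holds
    /// `[1, 1, 1, 1, 5, 5, 5, 5, ...]` (see the `vector_shuffle_bytes` tests
    /// below).
    ///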
    /// # Safety
    ///
    /// Callers must ensure that this is okay to call in the current target for
    /// the current CPU.
    unsafe fn shuffle_bytes(self, indices: Self) -> Self;

    /// Call the provided function for each 64-bit lane in this vector. The
    /// given function is provided the lane index and lane value as a `u64`.
    ///
    /// If `f` returns `Some`, then iteration over the lanes is stopped and the
    /// value is returned. Otherwise, this returns `None`.
    ///
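    /// For example, on a little-endian target, a 128-bit vector loaded from
    /// the bytes `0x01` through `0x10` yields the lanes
    /// `0x0807060504030201` and `0x100F0E0D0C0B0A09` (see the
    /// `vector_for_each_64bit_lane` tests below).
    ///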
    /// # Notes
    ///
    /// Conceptually it would be nice if we could have a
    /// `unpack64(self) -> [u64; BITS / 64]` method, but defining that is
    /// tricky given Rust's [current support for const generics][support].
    /// And even if we could, it would be tricky to write generic code over
    /// it. (Not impossible. We could introduce another layer that requires
    /// `AsRef<[u64]>` or something.)
    ///
    /// [support]: https://github.com/rust-lang/rust/issues/60551
    ///
    /// # Safety
    ///
    /// Callers must ensure that this is okay to call in the current target for
    /// the current CPU.
    unsafe fn for_each_64bit_lane<T>(
        self,
        f: impl FnMut(usize, u64) -> Option<T>,
    ) -> Option<T>;
}

/// This trait extends the `Vector` trait with additional operations to support
/// Fat Teddy.
///
/// Fat Teddy uses 16 buckets instead of 8, but reads half a vector's worth of
/// bytes per iteration instead of a full vector's worth. For example, when
/// using a 256-bit vector, Slim Teddy reads 32 bytes at a time but Fat Teddy
/// reads 16 bytes at a time.
///
/// Fat Teddy is useful when searching for a large number of literals.
/// The extra number of buckets spreads the literals out more and reduces
/// verification time.
///
/// Currently we only implement this for AVX on x86_64. It would be nice to
/// implement this for SSE on x86_64 and NEON on aarch64, with the latter two
/// only reading 8 bytes at a time. It's not clear how well it would work, but
/// there are some tricky things to figure out in terms of implementation. The
/// `half_shift_in_{one,two,three}_bytes` methods in particular are probably
/// the trickiest of the bunch. For AVX2, these are implemented by taking
/// advantage of the fact that `_mm256_alignr_epi8` operates on each 128-bit
/// half instead of the full 256-bit vector. (Whereas `_mm_alignr_epi8`
/// operates on the full 128-bit vector and not on each 64-bit half.) I didn't
/// do a careful survey of NEON to see if it could easily support these
/// operations.
pub(crate) trait FatVector: Vector {
    type Half: Vector;

    /// Read a half-vector-size number of bytes from the given pointer, and
    /// broadcast it across both halves of a full vector. The pointer does not
    /// need to be aligned.
    ///
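    /// For example, with a 256-bit vector, reading the 16 bytes
    /// `[1, 2, ..., 16]` yields a vector whose two 128-bit halves both hold
    /// `[1, 2, ..., 16]`.
    ///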
    /// # Safety
    ///
    /// Callers must ensure that this is okay to call in the current target for
    /// the current CPU.
    ///
    /// Callers must guarantee that at least `Self::Half::BYTES` bytes are
    /// readable from `data`.
    unsafe fn load_half_unaligned(data: *const u8) -> Self;

    /// Like `Vector::shift_in_one_byte`, except this is done for each half
    /// of the vector instead.
    ///
    /// # Safety
    ///
    /// Callers must ensure that this is okay to call in the current target for
    /// the current CPU.
    unsafe fn half_shift_in_one_byte(self, vector2: Self) -> Self;

    /// Like `Vector::shift_in_two_bytes`, except this is done for each half
    /// of the vector instead.
    ///
    /// # Safety
    ///
    /// Callers must ensure that this is okay to call in the current target for
    /// the current CPU.
    unsafe fn half_shift_in_two_bytes(self, vector2: Self) -> Self;

    /// Like `Vector::shift_in_three_bytes`, except this is done for each half
    /// of the vector instead.
    ///
    /// # Safety
    ///
    /// Callers must ensure that this is okay to call in the current target for
    /// the current CPU.
    unsafe fn half_shift_in_three_bytes(self, vector2: Self) -> Self;

    /// Swap the 128-bit lanes in this vector.
    ///
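    /// For example, `[1, ..., 16, 17, ..., 32]` becomes
    /// `[17, ..., 32, 1, ..., 16]` (see the `fat_vector_swap_halves` test
    /// below).
    ///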
    /// # Safety
    ///
    /// Callers must ensure that this is okay to call in the current target for
    /// the current CPU.
    unsafe fn swap_halves(self) -> Self;

    /// Unpack and interleave the 8-bit lanes from the low 128 bits of each
    /// vector and return the result.
    ///
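    /// For example, with `self = [1, 2, ..., 32]` and
    /// `vector2 = [33, 34, ..., 64]`, the result begins
    /// `[1, 33, 2, 34, 3, 35, ...]` (see the
    /// `fat_vector_interleave_low_8bit_lanes` test below).
    ///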
    /// # Safety
    ///
    /// Callers must ensure that this is okay to call in the current target for
    /// the current CPU.
    unsafe fn interleave_low_8bit_lanes(self, vector2: Self) -> Self;

    /// Unpack and interleave the 8-bit lanes from the high 128 bits of each
    /// vector and return the result.
    ///
    /// # Safety
    ///
    /// Callers must ensure that this is okay to call in the current target for
    /// the current CPU.
    unsafe fn interleave_high_8bit_lanes(self, vector2: Self) -> Self;

    /// Call the provided function for each 64-bit lane in the lower half
    /// of this vector and then in the other vector. The given function is
    /// provided the lane index and lane value as a `u64`. (The high 128 bits
    /// of each vector are ignored.)
    ///
    /// If `f` returns `Some`, then iteration over the lanes is stopped and the
    /// value is returned. Otherwise, this returns `None`.
    ///
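    /// For example, the lane index runs `0, 1` over the low 128 bits of
    /// `self` and then `2, 3` over the low 128 bits of `vector2` (see the
    /// `fat_vector_for_each_low_64bit_lane` test below).
    ///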
    /// # Safety
    ///
    /// Callers must ensure that this is okay to call in the current target for
    /// the current CPU.
    unsafe fn for_each_low_64bit_lane<T>(
        self,
        vector2: Self,
        f: impl FnMut(usize, u64) -> Option<T>,
    ) -> Option<T>;
}

#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
mod x86_64_ssse3 {
    use core::arch::x86_64::*;

    use crate::util::int::{I32, I8};

    use super::Vector;

    impl Vector for __m128i {
        const BITS: usize = 128;
        const BYTES: usize = 16;

        #[inline(always)]
        unsafe fn splat(byte: u8) -> __m128i {
            _mm_set1_epi8(i8::from_bits(byte))
        }

        #[inline(always)]
        unsafe fn load_unaligned(data: *const u8) -> __m128i {
            _mm_loadu_si128(data.cast::<__m128i>())
        }

        #[inline(always)]
        unsafe fn is_zero(self) -> bool {
            let cmp = self.cmpeq(Self::splat(0));
            _mm_movemask_epi8(cmp).to_bits() == 0xFFFF
        }

        #[inline(always)]
        unsafe fn cmpeq(self, vector2: Self) -> __m128i {
            _mm_cmpeq_epi8(self, vector2)
        }

        #[inline(always)]
        unsafe fn and(self, vector2: Self) -> __m128i {
            _mm_and_si128(self, vector2)
        }

        #[inline(always)]
        unsafe fn or(self, vector2: Self) -> __m128i {
            _mm_or_si128(self, vector2)
        }

        #[inline(always)]
        unsafe fn shift_8bit_lane_right<const BITS: i32>(self) -> Self {
            // Apparently there is no _mm_srli_epi8, so we emulate it by
            // shifting 16-bit integers and masking out the high nybble of each
            // 8-bit lane (since that nybble will contain bits from the low
            // nybble of the previous lane).
            let lomask = Self::splat(0xF);
            _mm_srli_epi16(self, BITS).and(lomask)
        }

        #[inline(always)]
        unsafe fn shift_in_one_byte(self, vector2: Self) -> Self {
            _mm_alignr_epi8(self, vector2, 15)
        }

        #[inline(always)]
        unsafe fn shift_in_two_bytes(self, vector2: Self) -> Self {
            _mm_alignr_epi8(self, vector2, 14)
        }

        #[inline(always)]
        unsafe fn shift_in_three_bytes(self, vector2: Self) -> Self {
            _mm_alignr_epi8(self, vector2, 13)
        }

        #[inline(always)]
        unsafe fn shuffle_bytes(self, indices: Self) -> Self {
            _mm_shuffle_epi8(self, indices)
        }

        #[inline(always)]
        unsafe fn for_each_64bit_lane<T>(
            self,
            mut f: impl FnMut(usize, u64) -> Option<T>,
        ) -> Option<T> {
            // We could just use _mm_extract_epi64 here, but that requires
            // SSE 4.1. It isn't necessarily a problem to just require SSE 4.1,
            // but everything else works with SSSE3 so we stick to that subset.
            let lanes: [u64; 2] = core::mem::transmute(self);
            if let Some(t) = f(0, lanes[0]) {
                return Some(t);
            }
            if let Some(t) = f(1, lanes[1]) {
                return Some(t);
            }
            None
        }
    }
}

#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
mod x86_64_avx2 {
    use core::arch::x86_64::*;

    use crate::util::int::{I32, I64, I8};

    use super::{FatVector, Vector};

    impl Vector for __m256i {
        const BITS: usize = 256;
        const BYTES: usize = 32;

        #[inline(always)]
        unsafe fn splat(byte: u8) -> __m256i {
            _mm256_set1_epi8(i8::from_bits(byte))
        }

        #[inline(always)]
        unsafe fn load_unaligned(data: *const u8) -> __m256i {
            _mm256_loadu_si256(data.cast::<__m256i>())
        }

        #[inline(always)]
        unsafe fn is_zero(self) -> bool {
            let cmp = self.cmpeq(Self::splat(0));
            _mm256_movemask_epi8(cmp).to_bits() == 0xFFFFFFFF
        }

        #[inline(always)]
        unsafe fn cmpeq(self, vector2: Self) -> __m256i {
            _mm256_cmpeq_epi8(self, vector2)
        }

        #[inline(always)]
        unsafe fn and(self, vector2: Self) -> __m256i {
            _mm256_and_si256(self, vector2)
        }

        #[inline(always)]
        unsafe fn or(self, vector2: Self) -> __m256i {
            _mm256_or_si256(self, vector2)
        }

        #[inline(always)]
        unsafe fn shift_8bit_lane_right<const BITS: i32>(self) -> Self {
            let lomask = Self::splat(0xF);
            _mm256_srli_epi16(self, BITS).and(lomask)
        }

        #[inline(always)]
        unsafe fn shift_in_one_byte(self, vector2: Self) -> Self {
            // Credit goes to jneem for figuring this out:
            // https://github.com/jneem/teddy/blob/9ab5e899ad6ef6911aecd3cf1033f1abe6e1f66c/src/x86/teddy_simd.rs#L145-L184
            //
            // TL;DR avx2's PALIGNR instruction is actually just two 128-bit
            // PALIGNR instructions, which is not what we want, so we need to
            // do some extra shuffling.
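            //
            // (Here, `0x21` builds `v = [vector2_hi, self_lo]`: the low 128
            // bits of `v` come from the high half of `vector2` and the high
            // 128 bits come from the low half of `self`, so each 128-bit
            // PALIGNR below sees the bytes that logically precede it.)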
            let v = _mm256_permute2x128_si256(vector2, self, 0x21);
            _mm256_alignr_epi8(self, v, 15)
        }

        #[inline(always)]
        unsafe fn shift_in_two_bytes(self, vector2: Self) -> Self {
            // Credit goes to jneem for figuring this out:
            // https://github.com/jneem/teddy/blob/9ab5e899ad6ef6911aecd3cf1033f1abe6e1f66c/src/x86/teddy_simd.rs#L145-L184
            //
            // TL;DR avx2's PALIGNR instruction is actually just two 128-bit
            // PALIGNR instructions, which is not what we want, so we need to
            // do some extra shuffling.
            let v = _mm256_permute2x128_si256(vector2, self, 0x21);
            _mm256_alignr_epi8(self, v, 14)
        }

        #[inline(always)]
        unsafe fn shift_in_three_bytes(self, vector2: Self) -> Self {
            // Credit goes to jneem for figuring this out:
            // https://github.com/jneem/teddy/blob/9ab5e899ad6ef6911aecd3cf1033f1abe6e1f66c/src/x86/teddy_simd.rs#L145-L184
            //
            // TL;DR avx2's PALIGNR instruction is actually just two 128-bit
            // PALIGNR instructions, which is not what we want, so we need to
            // do some extra shuffling.
            let v = _mm256_permute2x128_si256(vector2, self, 0x21);
            _mm256_alignr_epi8(self, v, 13)
        }

        #[inline(always)]
        unsafe fn shuffle_bytes(self, indices: Self) -> Self {
            _mm256_shuffle_epi8(self, indices)
        }

        #[inline(always)]
        unsafe fn for_each_64bit_lane<T>(
            self,
            mut f: impl FnMut(usize, u64) -> Option<T>,
        ) -> Option<T> {
            // NOTE: At one point in the past, I used a transmute on this to
            // get a [u64; 4], but it turned out to lead to worse codegen IIRC.
            // I've tried it more recently, and it looks like that's no longer
            // the case. But since there's no difference, we stick with the
            // slightly more complicated but transmute-free version.
            let lane = _mm256_extract_epi64(self, 0).to_bits();
            if let Some(t) = f(0, lane) {
                return Some(t);
            }
            let lane = _mm256_extract_epi64(self, 1).to_bits();
            if let Some(t) = f(1, lane) {
                return Some(t);
            }
            let lane = _mm256_extract_epi64(self, 2).to_bits();
            if let Some(t) = f(2, lane) {
                return Some(t);
            }
            let lane = _mm256_extract_epi64(self, 3).to_bits();
            if let Some(t) = f(3, lane) {
                return Some(t);
            }
            None
        }
    }

    impl FatVector for __m256i {
        type Half = __m128i;

        #[inline(always)]
        unsafe fn load_half_unaligned(data: *const u8) -> Self {
            let half = Self::Half::load_unaligned(data);
            _mm256_broadcastsi128_si256(half)
        }

        #[inline(always)]
        unsafe fn half_shift_in_one_byte(self, vector2: Self) -> Self {
            _mm256_alignr_epi8(self, vector2, 15)
        }

        #[inline(always)]
        unsafe fn half_shift_in_two_bytes(self, vector2: Self) -> Self {
            _mm256_alignr_epi8(self, vector2, 14)
        }

        #[inline(always)]
        unsafe fn half_shift_in_three_bytes(self, vector2: Self) -> Self {
            _mm256_alignr_epi8(self, vector2, 13)
        }

        #[inline(always)]
        unsafe fn swap_halves(self) -> Self {
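            // (0x4E is 0b01_00_11_10, which selects the 64-bit lanes
            // [2, 3, 0, 1] and therefore swaps the two 128-bit halves.)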
            _mm256_permute4x64_epi64(self, 0x4E)
        }

        #[inline(always)]
        unsafe fn interleave_low_8bit_lanes(self, vector2: Self) -> Self {
            _mm256_unpacklo_epi8(self, vector2)
        }

        #[inline(always)]
        unsafe fn interleave_high_8bit_lanes(self, vector2: Self) -> Self {
            _mm256_unpackhi_epi8(self, vector2)
        }

        #[inline(always)]
        unsafe fn for_each_low_64bit_lane<T>(
            self,
            vector2: Self,
            mut f: impl FnMut(usize, u64) -> Option<T>,
        ) -> Option<T> {
            let lane = _mm256_extract_epi64(self, 0).to_bits();
            if let Some(t) = f(0, lane) {
                return Some(t);
            }
            let lane = _mm256_extract_epi64(self, 1).to_bits();
            if let Some(t) = f(1, lane) {
                return Some(t);
            }
            let lane = _mm256_extract_epi64(vector2, 0).to_bits();
            if let Some(t) = f(2, lane) {
                return Some(t);
            }
            let lane = _mm256_extract_epi64(vector2, 1).to_bits();
            if let Some(t) = f(3, lane) {
                return Some(t);
            }
            None
        }
    }
}

#[cfg(all(
    target_arch = "aarch64",
    target_feature = "neon",
    target_endian = "little"
))]
mod aarch64_neon {
    use core::arch::aarch64::*;

    use super::Vector;

    impl Vector for uint8x16_t {
        const BITS: usize = 128;
        const BYTES: usize = 16;

        #[inline(always)]
        unsafe fn splat(byte: u8) -> uint8x16_t {
            vdupq_n_u8(byte)
        }

        #[inline(always)]
        unsafe fn load_unaligned(data: *const u8) -> uint8x16_t {
            vld1q_u8(data)
        }

        #[inline(always)]
        unsafe fn is_zero(self) -> bool {
            // Could also use vmaxvq_u8.
            // ... I tried that and couldn't observe any meaningful difference
            // in benchmarks.
            let maxes = vreinterpretq_u64_u8(vpmaxq_u8(self, self));
            vgetq_lane_u64(maxes, 0) == 0
        }

        #[inline(always)]
        unsafe fn cmpeq(self, vector2: Self) -> uint8x16_t {
            vceqq_u8(self, vector2)
        }

        #[inline(always)]
        unsafe fn and(self, vector2: Self) -> uint8x16_t {
            vandq_u8(self, vector2)
        }

        #[inline(always)]
        unsafe fn or(self, vector2: Self) -> uint8x16_t {
            vorrq_u8(self, vector2)
        }

        #[inline(always)]
        unsafe fn shift_8bit_lane_right<const BITS: i32>(self) -> Self {
            debug_assert!(BITS <= 7);
            vshrq_n_u8(self, BITS)
        }

        #[inline(always)]
        unsafe fn shift_in_one_byte(self, vector2: Self) -> Self {
            vextq_u8(vector2, self, 15)
        }

        #[inline(always)]
        unsafe fn shift_in_two_bytes(self, vector2: Self) -> Self {
            vextq_u8(vector2, self, 14)
        }

        #[inline(always)]
        unsafe fn shift_in_three_bytes(self, vector2: Self) -> Self {
            vextq_u8(vector2, self, 13)
        }

        #[inline(always)]
        unsafe fn shuffle_bytes(self, indices: Self) -> Self {
            vqtbl1q_u8(self, indices)
        }

        #[inline(always)]
        unsafe fn for_each_64bit_lane<T>(
            self,
            mut f: impl FnMut(usize, u64) -> Option<T>,
        ) -> Option<T> {
            let this = vreinterpretq_u64_u8(self);
            let lane = vgetq_lane_u64(this, 0);
            if let Some(t) = f(0, lane) {
                return Some(t);
            }
            let lane = vgetq_lane_u64(this, 1);
            if let Some(t) = f(1, lane) {
                return Some(t);
            }
            None
        }
    }
}

#[cfg(all(test, target_arch = "x86_64", target_feature = "sse2"))]
mod tests_x86_64_ssse3 {
    use core::arch::x86_64::*;

    use crate::util::int::{I32, U32};

    use super::*;

    fn is_runnable() -> bool {
        std::is_x86_feature_detected!("ssse3")
    }

    #[target_feature(enable = "ssse3")]
    unsafe fn load(lanes: [u8; 16]) -> __m128i {
        __m128i::load_unaligned(&lanes as *const u8)
    }

    #[target_feature(enable = "ssse3")]
    unsafe fn unload(v: __m128i) -> [u8; 16] {
        [
            _mm_extract_epi8(v, 0).to_bits().low_u8(),
            _mm_extract_epi8(v, 1).to_bits().low_u8(),
            _mm_extract_epi8(v, 2).to_bits().low_u8(),
            _mm_extract_epi8(v, 3).to_bits().low_u8(),
            _mm_extract_epi8(v, 4).to_bits().low_u8(),
            _mm_extract_epi8(v, 5).to_bits().low_u8(),
            _mm_extract_epi8(v, 6).to_bits().low_u8(),
            _mm_extract_epi8(v, 7).to_bits().low_u8(),
            _mm_extract_epi8(v, 8).to_bits().low_u8(),
            _mm_extract_epi8(v, 9).to_bits().low_u8(),
            _mm_extract_epi8(v, 10).to_bits().low_u8(),
            _mm_extract_epi8(v, 11).to_bits().low_u8(),
            _mm_extract_epi8(v, 12).to_bits().low_u8(),
            _mm_extract_epi8(v, 13).to_bits().low_u8(),
            _mm_extract_epi8(v, 14).to_bits().low_u8(),
            _mm_extract_epi8(v, 15).to_bits().low_u8(),
        ]
    }

    #[test]
    fn vector_splat() {
        #[target_feature(enable = "ssse3")]
        unsafe fn test() {
            let v = __m128i::splat(0xAF);
            assert_eq!(
                unload(v),
                [
                    0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF,
                    0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF
                ]
            );
        }
        if !is_runnable() {
            return;
        }
        unsafe { test() }
    }

    #[test]
    fn vector_is_zero() {
        #[target_feature(enable = "ssse3")]
        unsafe fn test() {
            let v = load([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
            assert!(!v.is_zero());
            let v = load([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
            assert!(v.is_zero());
        }
        if !is_runnable() {
            return;
        }
        unsafe { test() }
    }

    #[test]
    fn vector_cmpeq() {
        #[target_feature(enable = "ssse3")]
        unsafe fn test() {
            let v1 =
                load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1]);
            let v2 =
                load([16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]);
            assert_eq!(
                unload(v1.cmpeq(v2)),
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xFF]
            );
        }
        if !is_runnable() {
            return;
        }
        unsafe { test() }
    }

    #[test]
    fn vector_and() {
        #[target_feature(enable = "ssse3")]
        unsafe fn test() {
            let v1 =
                load([0, 0, 0, 0, 0, 0b1001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
            let v2 =
                load([0, 0, 0, 0, 0, 0b1010, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
            assert_eq!(
                unload(v1.and(v2)),
                [0, 0, 0, 0, 0, 0b1000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
            );
        }
        if !is_runnable() {
            return;
        }
        unsafe { test() }
    }

    #[test]
    fn vector_or() {
        #[target_feature(enable = "ssse3")]
        unsafe fn test() {
            let v1 =
                load([0, 0, 0, 0, 0, 0b1001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
            let v2 =
                load([0, 0, 0, 0, 0, 0b1010, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
            assert_eq!(
                unload(v1.or(v2)),
                [0, 0, 0, 0, 0, 0b1011, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
            );
        }
        if !is_runnable() {
            return;
        }
        unsafe { test() }
    }

    #[test]
    fn vector_shift_8bit_lane_right() {
        #[target_feature(enable = "ssse3")]
        unsafe fn test() {
            let v = load([
                0, 0, 0, 0, 0b1011, 0b0101, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            ]);
            assert_eq!(
                unload(v.shift_8bit_lane_right::<2>()),
                [0, 0, 0, 0, 0b0010, 0b0001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
            );
        }
        if !is_runnable() {
            return;
        }
        unsafe { test() }
    }

    #[test]
    fn vector_shift_in_one_byte() {
        #[target_feature(enable = "ssse3")]
        unsafe fn test() {
            let v1 =
                load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
            let v2 = load([
                17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
            ]);
            assert_eq!(
                unload(v1.shift_in_one_byte(v2)),
                [32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
            );
        }
        if !is_runnable() {
            return;
        }
        unsafe { test() }
    }

    #[test]
    fn vector_shift_in_two_bytes() {
        #[target_feature(enable = "ssse3")]
        unsafe fn test() {
            let v1 =
                load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
            let v2 = load([
                17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
            ]);
            assert_eq!(
                unload(v1.shift_in_two_bytes(v2)),
                [31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
            );
        }
        if !is_runnable() {
            return;
        }
        unsafe { test() }
    }

    #[test]
    fn vector_shift_in_three_bytes() {
        #[target_feature(enable = "ssse3")]
        unsafe fn test() {
            let v1 =
                load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
            let v2 = load([
                17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
            ]);
            assert_eq!(
                unload(v1.shift_in_three_bytes(v2)),
                [30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
            );
        }
        if !is_runnable() {
            return;
        }
        unsafe { test() }
    }

    #[test]
    fn vector_shuffle_bytes() {
        #[target_feature(enable = "ssse3")]
        unsafe fn test() {
            let v1 =
                load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
            let v2 =
                load([0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12]);
            assert_eq!(
                unload(v1.shuffle_bytes(v2)),
                [1, 1, 1, 1, 5, 5, 5, 5, 9, 9, 9, 9, 13, 13, 13, 13],
            );
        }
        if !is_runnable() {
            return;
        }
        unsafe { test() }
    }

    #[test]
    fn vector_for_each_64bit_lane() {
        #[target_feature(enable = "ssse3")]
        unsafe fn test() {
            let v = load([
                0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A,
                0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10,
            ]);
            let mut lanes = [0u64; 2];
            v.for_each_64bit_lane(|i, lane| {
                lanes[i] = lane;
                None::<()>
            });
            assert_eq!(lanes, [0x0807060504030201, 0x100F0E0D0C0B0A09],);
        }
        if !is_runnable() {
            return;
        }
        unsafe { test() }
    }
}
940 
941 #[cfg(all(test, target_arch = "x86_64", target_feature = "sse2"))]
942 mod tests_x86_64_avx2 {
943     use core::arch::x86_64::*;
944 
945     use crate::util::int::{I32, U32};
946 
947     use super::*;
948 
is_runnable() -> bool949     fn is_runnable() -> bool {
950         std::is_x86_feature_detected!("avx2")
951     }
952 
953     #[target_feature(enable = "avx2")]
load(lanes: [u8; 32]) -> __m256i954     unsafe fn load(lanes: [u8; 32]) -> __m256i {
955         __m256i::load_unaligned(&lanes as *const u8)
956     }
957 
958     #[target_feature(enable = "avx2")]
load_half(lanes: [u8; 16]) -> __m256i959     unsafe fn load_half(lanes: [u8; 16]) -> __m256i {
960         __m256i::load_half_unaligned(&lanes as *const u8)
961     }
962 
963     #[target_feature(enable = "avx2")]
unload(v: __m256i) -> [u8; 32]964     unsafe fn unload(v: __m256i) -> [u8; 32] {
965         [
966             _mm256_extract_epi8(v, 0).to_bits().low_u8(),
967             _mm256_extract_epi8(v, 1).to_bits().low_u8(),
968             _mm256_extract_epi8(v, 2).to_bits().low_u8(),
969             _mm256_extract_epi8(v, 3).to_bits().low_u8(),
970             _mm256_extract_epi8(v, 4).to_bits().low_u8(),
971             _mm256_extract_epi8(v, 5).to_bits().low_u8(),
972             _mm256_extract_epi8(v, 6).to_bits().low_u8(),
973             _mm256_extract_epi8(v, 7).to_bits().low_u8(),
974             _mm256_extract_epi8(v, 8).to_bits().low_u8(),
975             _mm256_extract_epi8(v, 9).to_bits().low_u8(),
976             _mm256_extract_epi8(v, 10).to_bits().low_u8(),
977             _mm256_extract_epi8(v, 11).to_bits().low_u8(),
978             _mm256_extract_epi8(v, 12).to_bits().low_u8(),
979             _mm256_extract_epi8(v, 13).to_bits().low_u8(),
980             _mm256_extract_epi8(v, 14).to_bits().low_u8(),
981             _mm256_extract_epi8(v, 15).to_bits().low_u8(),
982             _mm256_extract_epi8(v, 16).to_bits().low_u8(),
983             _mm256_extract_epi8(v, 17).to_bits().low_u8(),
984             _mm256_extract_epi8(v, 18).to_bits().low_u8(),
985             _mm256_extract_epi8(v, 19).to_bits().low_u8(),
986             _mm256_extract_epi8(v, 20).to_bits().low_u8(),
987             _mm256_extract_epi8(v, 21).to_bits().low_u8(),
988             _mm256_extract_epi8(v, 22).to_bits().low_u8(),
989             _mm256_extract_epi8(v, 23).to_bits().low_u8(),
990             _mm256_extract_epi8(v, 24).to_bits().low_u8(),
991             _mm256_extract_epi8(v, 25).to_bits().low_u8(),
992             _mm256_extract_epi8(v, 26).to_bits().low_u8(),
993             _mm256_extract_epi8(v, 27).to_bits().low_u8(),
994             _mm256_extract_epi8(v, 28).to_bits().low_u8(),
995             _mm256_extract_epi8(v, 29).to_bits().low_u8(),
996             _mm256_extract_epi8(v, 30).to_bits().low_u8(),
997             _mm256_extract_epi8(v, 31).to_bits().low_u8(),
998         ]
999     }
1000 
1001     #[test]
vector_splat()1002     fn vector_splat() {
1003         #[target_feature(enable = "avx2")]
1004         unsafe fn test() {
1005             let v = __m256i::splat(0xAF);
1006             assert_eq!(
1007                 unload(v),
1008                 [
1009                     0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF,
1010                     0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF,
1011                     0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF,
1012                     0xAF, 0xAF, 0xAF, 0xAF, 0xAF,
1013                 ]
1014             );
1015         }
1016         if !is_runnable() {
1017             return;
1018         }
1019         unsafe { test() }
1020     }
1021 
1022     #[test]
vector_is_zero()1023     fn vector_is_zero() {
1024         #[target_feature(enable = "avx2")]
1025         unsafe fn test() {
1026             let v = load([
1027                 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1028                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1029             ]);
1030             assert!(!v.is_zero());
1031             let v = load([
1032                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1033                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1034             ]);
1035             assert!(v.is_zero());
1036         }
1037         if !is_runnable() {
1038             return;
1039         }
1040         unsafe { test() }
1041     }
1042 
1043     #[test]
vector_cmpeq()1044     fn vector_cmpeq() {
1045         #[target_feature(enable = "avx2")]
1046         unsafe fn test() {
1047             let v1 = load([
1048                 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
1049                 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 1,
1050             ]);
1051             let v2 = load([
1052                 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18,
1053                 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1,
1054             ]);
1055             assert_eq!(
1056                 unload(v1.cmpeq(v2)),
1057                 [
1058                     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1059                     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xFF
1060                 ]
1061             );
1062         }
1063         if !is_runnable() {
1064             return;
1065         }
1066         unsafe { test() }
1067     }
1068 
1069     #[test]
vector_and()1070     fn vector_and() {
1071         #[target_feature(enable = "avx2")]
1072         unsafe fn test() {
1073             let v1 = load([
1074                 0, 0, 0, 0, 0, 0b1001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1075                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1076             ]);
1077             let v2 = load([
1078                 0, 0, 0, 0, 0, 0b1010, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1079                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1080             ]);
1081             assert_eq!(
1082                 unload(v1.and(v2)),
1083                 [
1084                     0, 0, 0, 0, 0, 0b1000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1085                     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1086                 ]
1087             );
1088         }
1089         if !is_runnable() {
1090             return;
1091         }
1092         unsafe { test() }
1093     }
1094 
1095     #[test]
vector_or()1096     fn vector_or() {
1097         #[target_feature(enable = "avx2")]
1098         unsafe fn test() {
1099             let v1 = load([
1100                 0, 0, 0, 0, 0, 0b1001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1101                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1102             ]);
1103             let v2 = load([
1104                 0, 0, 0, 0, 0, 0b1010, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1105                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1106             ]);
1107             assert_eq!(
1108                 unload(v1.or(v2)),
1109                 [
1110                     0, 0, 0, 0, 0, 0b1011, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1111                     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1112                 ]
1113             );
1114         }
1115         if !is_runnable() {
1116             return;
1117         }
1118         unsafe { test() }
1119     }
1120 
1121     #[test]
vector_shift_8bit_lane_right()1122     fn vector_shift_8bit_lane_right() {
1123         #[target_feature(enable = "avx2")]
1124         unsafe fn test() {
1125             let v = load([
1126                 0, 0, 0, 0, 0b1011, 0b0101, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1127                 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1128             ]);
1129             assert_eq!(
1130                 unload(v.shift_8bit_lane_right::<2>()),
1131                 [
1132                     0, 0, 0, 0, 0b0010, 0b0001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1133                     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1134                 ]
1135             );
1136         }
1137         if !is_runnable() {
1138             return;
1139         }
1140         unsafe { test() }
1141     }
1142 
1143     #[test]
vector_shift_in_one_byte()1144     fn vector_shift_in_one_byte() {
1145         #[target_feature(enable = "avx2")]
1146         unsafe fn test() {
1147             let v1 = load([
1148                 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
1149                 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
1150             ]);
1151             let v2 = load([
1152                 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
1153                 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62,
1154                 63, 64,
1155             ]);
1156             assert_eq!(
1157                 unload(v1.shift_in_one_byte(v2)),
1158                 [
1159                     64, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
1160                     17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
1161                     31,
1162                 ],
1163             );
1164         }
1165         if !is_runnable() {
1166             return;
1167         }
1168         unsafe { test() }
1169     }
1170 
1171     #[test]
vector_shift_in_two_bytes()1172     fn vector_shift_in_two_bytes() {
1173         #[target_feature(enable = "avx2")]
1174         unsafe fn test() {
1175             let v1 = load([
1176                 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
1177                 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
1178             ]);
1179             let v2 = load([
1180                 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
1181                 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62,
1182                 63, 64,
1183             ]);
1184             assert_eq!(
1185                 unload(v1.shift_in_two_bytes(v2)),
1186                 [
1187                     63, 64, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
1188                     16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
1189                     30,
1190                 ],
1191             );
1192         }
1193         if !is_runnable() {
1194             return;
1195         }
1196         unsafe { test() }
1197     }
1198 
1199     #[test]
vector_shift_in_three_bytes()1200     fn vector_shift_in_three_bytes() {
1201         #[target_feature(enable = "avx2")]
1202         unsafe fn test() {
1203             let v1 = load([
1204                 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
1205                 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
1206             ]);
1207             let v2 = load([
1208                 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
1209                 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62,
1210                 63, 64,
1211             ]);
1212             assert_eq!(
1213                 unload(v1.shift_in_three_bytes(v2)),
1214                 [
1215                     62, 63, 64, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
1216                     15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
1217                     29,
1218                 ],
1219             );
1220         }
1221         if !is_runnable() {
1222             return;
1223         }
1224         unsafe { test() }
1225     }
1226 
1227     #[test]
vector_shuffle_bytes()1228     fn vector_shuffle_bytes() {
1229         #[target_feature(enable = "avx2")]
1230         unsafe fn test() {
1231             let v1 = load([
1232                 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
1233                 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
1234             ]);
1235             let v2 = load([
1236                 0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12, 16, 16,
1237                 16, 16, 20, 20, 20, 20, 24, 24, 24, 24, 28, 28, 28, 28,
1238             ]);
1239             assert_eq!(
1240                 unload(v1.shuffle_bytes(v2)),
1241                 [
1242                     1, 1, 1, 1, 5, 5, 5, 5, 9, 9, 9, 9, 13, 13, 13, 13, 17,
1243                     17, 17, 17, 21, 21, 21, 21, 25, 25, 25, 25, 29, 29, 29,
1244                     29
1245                 ],
1246             );
1247         }
1248         if !is_runnable() {
1249             return;
1250         }
1251         unsafe { test() }
1252     }
1253 
1254     #[test]
vector_for_each_64bit_lane()1255     fn vector_for_each_64bit_lane() {
1256         #[target_feature(enable = "avx2")]
1257         unsafe fn test() {
1258             let v = load([
1259                 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A,
1260                 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14,
1261                 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E,
1262                 0x1F, 0x20,
1263             ]);
1264             let mut lanes = [0u64; 4];
1265             v.for_each_64bit_lane(|i, lane| {
1266                 lanes[i] = lane;
1267                 None::<()>
1268             });
1269             assert_eq!(
1270                 lanes,
1271                 [
1272                     0x0807060504030201,
1273                     0x100F0E0D0C0B0A09,
1274                     0x1817161514131211,
1275                     0x201F1E1D1C1B1A19
1276                 ]
1277             );
1278         }
1279         if !is_runnable() {
1280             return;
1281         }
1282         unsafe { test() }
1283     }
1284 
1285     #[test]
fat_vector_half_shift_in_one_byte()1286     fn fat_vector_half_shift_in_one_byte() {
1287         #[target_feature(enable = "avx2")]
1288         unsafe fn test() {
1289             let v1 = load_half([
1290                 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
1291             ]);
1292             let v2 = load_half([
1293                 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
1294             ]);
1295             assert_eq!(
1296                 unload(v1.half_shift_in_one_byte(v2)),
1297                 [
1298                     32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 32,
1299                     1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
1300                 ],
1301             );
1302         }
1303         if !is_runnable() {
1304             return;
1305         }
1306         unsafe { test() }
1307     }
1308 
1309     #[test]
fat_vector_half_shift_in_two_bytes()1310     fn fat_vector_half_shift_in_two_bytes() {
1311         #[target_feature(enable = "avx2")]
1312         unsafe fn test() {
1313             let v1 = load_half([
1314                 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
1315             ]);
1316             let v2 = load_half([
1317                 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
1318             ]);
1319             assert_eq!(
1320                 unload(v1.half_shift_in_two_bytes(v2)),
1321                 [
1322                     31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 31,
1323                     32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
1324                 ],
1325             );
1326         }
1327         if !is_runnable() {
1328             return;
1329         }
1330         unsafe { test() }
1331     }
1332 
1333     #[test]
fat_vector_half_shift_in_three_bytes()1334     fn fat_vector_half_shift_in_three_bytes() {
1335         #[target_feature(enable = "avx2")]
1336         unsafe fn test() {
1337             let v1 = load_half([
1338                 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
1339             ]);
1340             let v2 = load_half([
1341                 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
1342             ]);
1343             assert_eq!(
1344                 unload(v1.half_shift_in_three_bytes(v2)),
1345                 [
1346                     30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 30,
1347                     31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
1348                 ],
1349             );
1350         }
1351         if !is_runnable() {
1352             return;
1353         }
1354         unsafe { test() }
1355     }
1356 
1357     #[test]
fat_vector_swap_halves()1358     fn fat_vector_swap_halves() {
1359         #[target_feature(enable = "avx2")]
1360         unsafe fn test() {
1361             let v = load([
1362                 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
1363                 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
1364             ]);
1365             assert_eq!(
1366                 unload(v.swap_halves()),
1367                 [
1368                     17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
1369                     31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
1370                     16,
1371                 ],
1372             );
1373         }
1374         if !is_runnable() {
1375             return;
1376         }
1377         unsafe { test() }
1378     }
1379 
    #[test]
    fn fat_vector_interleave_low_8bit_lanes() {
        #[target_feature(enable = "avx2")]
        unsafe fn test() {
            let v1 = load([
                1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
                19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
            ]);
            let v2 = load([
                33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
                48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62,
                63, 64,
            ]);
            assert_eq!(
                unload(v1.interleave_low_8bit_lanes(v2)),
                [
                    1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, 8, 40,
                    17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55,
                    24, 56,
                ],
            );
        }
        if !is_runnable() {
            return;
        }
        unsafe { test() }
    }

    #[test]
    fn fat_vector_interleave_high_8bit_lanes() {
        #[target_feature(enable = "avx2")]
        unsafe fn test() {
            let v1 = load([
                1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
                19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
            ]);
            let v2 = load([
                33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
                48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62,
                63, 64,
            ]);
            assert_eq!(
                unload(v1.interleave_high_8bit_lanes(v2)),
                [
                    9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47, 16,
                    48, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31,
                    63, 32, 64,
                ],
            );
        }
        if !is_runnable() {
            return;
        }
        unsafe { test() }
    }

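    // `for_each_low_64bit_lane` visits the low 128-bit half of each vector as
    // 64-bit little-endian integers: two lanes from `self`, then two from the
    // argument. The closure returns an `Option` so a caller can short-circuit
    // the traversal; returning `None` (as below) visits every lane.
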
    #[test]
    fn fat_vector_for_each_low_64bit_lane() {
        #[target_feature(enable = "avx2")]
        unsafe fn test() {
            let v1 = load([
                0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A,
                0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14,
                0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E,
                0x1F, 0x20,
            ]);
            let v2 = load([
                0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2A,
                0x2B, 0x2C, 0x2D, 0x2E, 0x2F, 0x30, 0x31, 0x32, 0x33, 0x34,
                0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E,
                0x3F, 0x40,
            ]);
            let mut lanes = [0u64; 4];
            v1.for_each_low_64bit_lane(v2, |i, lane| {
                lanes[i] = lane;
                None::<()>
            });
            assert_eq!(
                lanes,
                [
                    0x0807060504030201,
                    0x100F0E0D0C0B0A09,
                    0x2827262524232221,
                    0x302F2E2D2C2B2A29
                ]
            );
        }
        if !is_runnable() {
            return;
        }
        unsafe { test() }
    }
}

#[cfg(all(test, target_arch = "aarch64", target_feature = "neon"))]
mod tests_aarch64_neon {
    use core::arch::aarch64::*;

    use super::*;

    #[target_feature(enable = "neon")]
    unsafe fn load(lanes: [u8; 16]) -> uint8x16_t {
        uint8x16_t::load_unaligned(&lanes as *const u8)
    }

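    // Unload a vector by reading each of its 16 lanes individually.
    // `vgetq_lane_u8` requires a const lane index, hence the spelled-out list
    // rather than a loop. (Storing to a stack buffer with `vst1q_u8` would
    // presumably also work; this form keeps the helper dependent on a single
    // intrinsic.)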
    #[target_feature(enable = "neon")]
    unsafe fn unload(v: uint8x16_t) -> [u8; 16] {
        [
            vgetq_lane_u8(v, 0),
            vgetq_lane_u8(v, 1),
            vgetq_lane_u8(v, 2),
            vgetq_lane_u8(v, 3),
            vgetq_lane_u8(v, 4),
            vgetq_lane_u8(v, 5),
            vgetq_lane_u8(v, 6),
            vgetq_lane_u8(v, 7),
            vgetq_lane_u8(v, 8),
            vgetq_lane_u8(v, 9),
            vgetq_lane_u8(v, 10),
            vgetq_lane_u8(v, 11),
            vgetq_lane_u8(v, 12),
            vgetq_lane_u8(v, 13),
            vgetq_lane_u8(v, 14),
            vgetq_lane_u8(v, 15),
        ]
    }

    // Example functions. These don't test the Vector trait, but rather,
    // specific NEON instructions. They are basically little experiments I
    // wrote to figure out what an instruction does since their descriptions
    // are so dense. I decided to keep the experiments around as example tests
    // in case they're useful.

    #[test]
    fn example_vmaxvq_u8_non_zero() {
        #[target_feature(enable = "neon")]
        unsafe fn example() {
            let v = load([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
            assert_eq!(vmaxvq_u8(v), 1);
        }
        unsafe { example() }
    }

    #[test]
    fn example_vmaxvq_u8_zero() {
        #[target_feature(enable = "neon")]
        unsafe fn example() {
            let v = load([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
            assert_eq!(vmaxvq_u8(v), 0);
        }
        unsafe { example() }
    }

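    // `vpmaxq_u8(a, b)` computes pairwise maxima: result lane i is
    // max(a[2i], a[2i+1]) for i in 0..8, and max(b[2j], b[2j+1]) with
    // j = i - 8 for i in 8..16. The three tests below poke at that from
    // different angles.
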
    #[test]
    fn example_vpmaxq_u8_non_zero() {
        #[target_feature(enable = "neon")]
        unsafe fn example() {
            let v = load([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
            let r = vpmaxq_u8(v, v);
            assert_eq!(
                unload(r),
                [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]
            );
        }
        unsafe { example() }
    }

    #[test]
    fn example_vpmaxq_u8_self() {
        #[target_feature(enable = "neon")]
        unsafe fn example() {
            let v =
                load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
            let r = vpmaxq_u8(v, v);
            assert_eq!(
                unload(r),
                [2, 4, 6, 8, 10, 12, 14, 16, 2, 4, 6, 8, 10, 12, 14, 16]
            );
        }
        unsafe { example() }
    }

    #[test]
    fn example_vpmaxq_u8_other() {
        #[target_feature(enable = "neon")]
        unsafe fn example() {
            let v1 =
                load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
            let v2 = load([
                17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
            ]);
            let r = vpmaxq_u8(v1, v2);
            assert_eq!(
                unload(r),
                [2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32]
            );
        }
        unsafe { example() }
    }
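
    // A plausible reading of the experiments above: `vmaxvq_u8` gives a cheap
    // "is any lane non-zero?" reduction (likely what backs `is_zero` below),
    // while the pairwise max can narrow comparison results before extracting
    // them, since NEON has no direct movemask equivalent.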

    // Now we test the actual methods on the Vector trait.

    #[test]
    fn vector_splat() {
        #[target_feature(enable = "neon")]
        unsafe fn test() {
            let v = uint8x16_t::splat(0xAF);
            assert_eq!(
                unload(v),
                [
                    0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF,
                    0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF, 0xAF
                ]
            );
        }
        unsafe { test() }
    }

    #[test]
    fn vector_is_zero() {
        #[target_feature(enable = "neon")]
        unsafe fn test() {
            let v = load([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
            assert!(!v.is_zero());
            let v = load([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
            assert!(v.is_zero());
        }
        unsafe { test() }
    }

    #[test]
    fn vector_cmpeq() {
        #[target_feature(enable = "neon")]
        unsafe fn test() {
            let v1 =
                load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1]);
            let v2 =
                load([16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1]);
            assert_eq!(
                unload(v1.cmpeq(v2)),
                [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xFF]
            );
        }
        unsafe { test() }
    }

    #[test]
    fn vector_and() {
        #[target_feature(enable = "neon")]
        unsafe fn test() {
            let v1 =
                load([0, 0, 0, 0, 0, 0b1001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
            let v2 =
                load([0, 0, 0, 0, 0, 0b1010, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
            assert_eq!(
                unload(v1.and(v2)),
                [0, 0, 0, 0, 0, 0b1000, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
            );
        }
        unsafe { test() }
    }

    #[test]
    fn vector_or() {
        #[target_feature(enable = "neon")]
        unsafe fn test() {
            let v1 =
                load([0, 0, 0, 0, 0, 0b1001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
            let v2 =
                load([0, 0, 0, 0, 0, 0b1010, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]);
            assert_eq!(
                unload(v1.or(v2)),
                [0, 0, 0, 0, 0, 0b1011, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
            );
        }
        unsafe { test() }
    }

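    // `shift_8bit_lane_right::<N>` does an independent logical right shift by
    // N bits in each 8-bit lane: 0b1011 >> 2 == 0b0010 and 0b0101 >> 2 ==
    // 0b0001 below. Despite the name, no bits move between lanes.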
    #[test]
    fn vector_shift_8bit_lane_right() {
        #[target_feature(enable = "neon")]
        unsafe fn test() {
            let v = load([
                0, 0, 0, 0, 0b1011, 0b0101, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            ]);
            assert_eq!(
                unload(v.shift_8bit_lane_right::<2>()),
                [0, 0, 0, 0, 0b0010, 0b0001, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
            );
        }
        unsafe { test() }
    }

    #[test]
    fn vector_shift_in_one_byte() {
        #[target_feature(enable = "neon")]
        unsafe fn test() {
            let v1 =
                load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
            let v2 = load([
                17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
            ]);
            assert_eq!(
                unload(v1.shift_in_one_byte(v2)),
                [32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
            );
        }
        unsafe { test() }
    }

    #[test]
    fn vector_shift_in_two_bytes() {
        #[target_feature(enable = "neon")]
        unsafe fn test() {
            let v1 =
                load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
            let v2 = load([
                17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
            ]);
            assert_eq!(
                unload(v1.shift_in_two_bytes(v2)),
                [31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14],
            );
        }
        unsafe { test() }
    }

    #[test]
    fn vector_shift_in_three_bytes() {
        #[target_feature(enable = "neon")]
        unsafe fn test() {
            let v1 =
                load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
            let v2 = load([
                17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
            ]);
            assert_eq!(
                unload(v1.shift_in_three_bytes(v2)),
                [30, 31, 32, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13],
            );
        }
        unsafe { test() }
    }

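    // `shuffle_bytes` uses each lane of the argument as an index into `self`:
    // below, the index vector [0, 0, 0, 0, 4, ...] selects bytes 1, 1, 1, 1,
    // 5, ... from `v1`. On NEON this is presumably a table lookup along the
    // lines of `vqtbl1q_u8`.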
    #[test]
    fn vector_shuffle_bytes() {
        #[target_feature(enable = "neon")]
        unsafe fn test() {
            let v1 =
                load([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
            let v2 =
                load([0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12]);
            assert_eq!(
                unload(v1.shuffle_bytes(v2)),
                [1, 1, 1, 1, 5, 5, 5, 5, 9, 9, 9, 9, 13, 13, 13, 13],
            );
        }
        unsafe { test() }
    }

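    // As with the fat variant, `for_each_64bit_lane` presents the vector as
    // 64-bit little-endian integers, which is why byte 0x01 ends up as the
    // least significant byte of the first lane below.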
    #[test]
    fn vector_for_each_64bit_lane() {
        #[target_feature(enable = "neon")]
        unsafe fn test() {
            let v = load([
                0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A,
                0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10,
            ]);
            let mut lanes = [0u64; 2];
            v.for_each_64bit_lane(|i, lane| {
                lanes[i] = lane;
                None::<()>
            });
            assert_eq!(lanes, [0x0807060504030201, 0x100F0E0D0C0B0A09],);
        }
        unsafe { test() }
    }
}