1 use crate::iter::Bytes;
2 use core::arch::aarch64::*;
3 
4 #[inline]
match_header_name_vectored(bytes: &mut Bytes)5 pub fn match_header_name_vectored(bytes: &mut Bytes) {
6     while bytes.as_ref().len() >= 16 {
7         // SAFETY: ensured that there are at least 16 bytes remaining
8         unsafe {
9             let advance = match_header_name_char_16_neon(bytes.as_ref().as_ptr());
10             bytes.advance(advance);
11 
12             if advance != 16 {
13                 return;
14             }
15         }
16     }
17     super::swar::match_header_name_vectored(bytes);
18 }
19 
20 #[inline]
match_header_value_vectored(bytes: &mut Bytes)21 pub fn match_header_value_vectored(bytes: &mut Bytes) {
22     while bytes.as_ref().len() >= 16 {
23         // SAFETY: ensured that there are at least 16 bytes remaining
24         unsafe {
25             let advance = match_header_value_char_16_neon(bytes.as_ref().as_ptr());
26             bytes.advance(advance);
27 
28             if advance != 16 {
29                 return;
30             }
31         }
32     }
33     super::swar::match_header_value_vectored(bytes);
34 }
35 
36 #[inline]
match_uri_vectored(bytes: &mut Bytes)37 pub fn match_uri_vectored(bytes: &mut Bytes) {
38     while bytes.as_ref().len() >= 16 {
39         // SAFETY: ensured that there are at least 16 bytes remaining
40         unsafe {
41             let advance = match_url_char_16_neon(bytes.as_ref().as_ptr());
42             bytes.advance(advance);
43 
44             if advance != 16 {
45                 return;
46             }
47         }
48     }
49     super::swar::match_uri_vectored(bytes);
50 }
51 
/// Returns `true` when `x` is a valid header-name (token) character.
///
/// See <https://tools.ietf.org/html/rfc7230#section-3.2.6>.
const fn bit_set(x: u8) -> bool {
    match x {
        // ALPHA / DIGIT
        b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z' => true,
        // '!' and the contiguous run  # $ % & '
        b'!' | b'#'..=b'\'' => true,
        // * +   and   - .
        b'*' | b'+' | b'-' | b'.' => true,
        // ^ _ `  are contiguous, followed by the two stragglers | and ~
        b'^'..=b'`' | b'|' | b'~' => true,
        _ => false,
    }
}
57 
58 // A 256-bit bitmap, split into two halves
59 // lower half contains bits whose higher nibble is <= 7
60 // higher half contains bits whose higher nibble is >= 8
build_bitmap() -> ([u8; 16], [u8; 16])61 const fn build_bitmap() -> ([u8; 16], [u8; 16]) {
62     let mut bitmap_0_7 = [0u8; 16]; // 0x00..0x7F
63     let mut bitmap_8_15 = [0u8; 16]; // 0x80..0xFF
64     let mut i = 0;
65     while i < 256 {
66         if bit_set(i as u8) {
67             // Nibbles
68             let (lo, hi) = (i & 0x0F, i >> 4);
69             if i < 128 {
70                 bitmap_0_7[lo] |= 1 << hi;
71             } else {
72                 bitmap_8_15[lo] |= 1 << hi;
73             }
74         }
75         i += 1;
76     }
77     (bitmap_0_7, bitmap_8_15)
78 }
79 
80 const BITMAPS: ([u8; 16], [u8; 16]) = build_bitmap();
81 
82 // NOTE: adapted from 256-bit version, with upper 128-bit ops commented out
83 #[inline]
match_header_name_char_16_neon(ptr: *const u8) -> usize84 unsafe fn match_header_name_char_16_neon(ptr: *const u8) -> usize {
85     let bitmaps = BITMAPS;
86     // NOTE: ideally compile-time constants
87     let (bitmap_0_7, _bitmap_8_15) = bitmaps;
88     let bitmap_0_7 = vld1q_u8(bitmap_0_7.as_ptr());
89     // let bitmap_8_15 = vld1q_u8(bitmap_8_15.as_ptr());
90 
91     // Initialize the bitmask_lookup.
92     const BITMASK_LOOKUP_DATA: [u8; 16] =
93         [1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128];
94     let bitmask_lookup = vld1q_u8(BITMASK_LOOKUP_DATA.as_ptr());
95 
96     // Load 16 input bytes.
97     let input = vld1q_u8(ptr);
98 
99     // Extract indices for row_0_7.
100     let indices_0_7 = vandq_u8(input, vdupq_n_u8(0x8F)); // 0b1000_1111;
101 
102     // Extract indices for row_8_15.
103     // let msb = vandq_u8(input, vdupq_n_u8(0x80));
104     // let indices_8_15 = veorq_u8(indices_0_7, msb);
105 
106     // Fetch row_0_7 and row_8_15.
107     let row_0_7 = vqtbl1q_u8(bitmap_0_7, indices_0_7);
108     // let row_8_15 = vqtbl1q_u8(bitmap_8_15, indices_8_15);
109 
110     // Calculate a bitmask, i.e. (1 << hi_nibble % 8).
111     let bitmask = vqtbl1q_u8(bitmask_lookup, vshrq_n_u8(input, 4));
112 
113     // Choose rows halves depending on higher nibbles.
114     // let bitsets = vorrq_u8(row_0_7, row_8_15);
115     let bitsets = row_0_7;
116 
117     // Finally check which bytes belong to the set.
118     let tmp = vandq_u8(bitsets, bitmask);
119     let result = vceqq_u8(tmp, bitmask);
120 
121     offsetz(result) as usize
122 }
123 
124 #[inline]
match_url_char_16_neon(ptr: *const u8) -> usize125 unsafe fn match_url_char_16_neon(ptr: *const u8) -> usize {
126     let input = vld1q_u8(ptr);
127 
128     // Check that b'!' <= input <= b'~'
129     let result = vandq_u8(
130         vcleq_u8(vdupq_n_u8(b'!'), input),
131         vcleq_u8(input, vdupq_n_u8(b'~')),
132     );
133     // Check that input != b'<' and input != b'>'
134     let lt = vceqq_u8(input, vdupq_n_u8(b'<'));
135     let gt = vceqq_u8(input, vdupq_n_u8(b'>'));
136     let ltgt = vorrq_u8(lt, gt);
137     // Nand with result
138     let result = vbicq_u8(result, ltgt);
139 
140     offsetz(result) as usize
141 }
142 
143 #[inline]
match_header_value_char_16_neon(ptr: *const u8) -> usize144 unsafe fn match_header_value_char_16_neon(ptr: *const u8) -> usize {
145     let input = vld1q_u8(ptr);
146 
147     // Check that b' ' <= and b != 127 or b == 9
148     let result = vcleq_u8(vdupq_n_u8(b' '), input);
149 
150     // Allow tab
151     let tab = vceqq_u8(input, vdupq_n_u8(0x09));
152     let result = vorrq_u8(result, tab);
153 
154     // Disallow del
155     let del = vceqq_u8(input, vdupq_n_u8(0x7F));
156     let result = vbicq_u8(result, del);
157 
158     offsetz(result) as usize
159 }
160 
161 #[inline]
offsetz(x: uint8x16_t) -> u32162 unsafe fn offsetz(x: uint8x16_t) -> u32 {
163     // NOT the vector since it's faster to operate with zeros instead
164     offsetnz(vmvnq_u8(x))
165 }
166 
167 #[inline]
offsetnz(x: uint8x16_t) -> u32168 unsafe fn offsetnz(x: uint8x16_t) -> u32 {
169     // Extract two u64
170     let x = vreinterpretq_u64_u8(x);
171     // Extract to general purpose registers to perform clz
172     let low: u64 = vgetq_lane_u64::<0>(x);
173     let high: u64 = vgetq_lane_u64::<1>(x);
174 
175     #[inline]
176     fn clz(x: u64) -> u32 {
177         // perf: rust will unroll this loop
178         // and it's much faster than rbit + clz so voila
179         for (i, b) in x.to_ne_bytes().iter().copied().enumerate() {
180             if b != 0 {
181                 return i as u32;
182             }
183         }
184         8 // Technically not reachable since zero-guarded
185     }
186 
187     if low != 0 {
188         clz(low)
189     } else if high != 0 {
190         return 8 + clz(high);
191     } else {
192         return 16;
193     }
194 }
195 
/// Checks the NEON URI matcher against every entry of `crate::URI_MAP`.
#[test]
fn neon_code_matches_uri_chars_table() {
    #[allow(clippy::undocumented_unsafe_blocks)]
    unsafe {
        // Sanity check: the filler byte used by `byte_is_allowed` must itself be accepted.
        assert!(byte_is_allowed(b'_', match_uri_vectored));

        for (code, &expected) in crate::URI_MAP.iter().enumerate() {
            let actual = byte_is_allowed(code as u8, match_uri_vectored);
            assert_eq!(
                actual, expected,
                "byte_is_allowed({:?}) should be {:?}",
                code, expected,
            );
        }
    }
}
213 
/// Checks the NEON header-value matcher against every entry of `crate::HEADER_VALUE_MAP`.
#[test]
fn neon_code_matches_header_value_chars_table() {
    #[allow(clippy::undocumented_unsafe_blocks)]
    unsafe {
        // Sanity check: the filler byte used by `byte_is_allowed` must itself be accepted.
        assert!(byte_is_allowed(b'_', match_header_value_vectored));

        for (code, &expected) in crate::HEADER_VALUE_MAP.iter().enumerate() {
            let actual = byte_is_allowed(code as u8, match_header_value_vectored);
            assert_eq!(
                actual, expected,
                "byte_is_allowed({:?}) should be {:?}",
                code, expected,
            );
        }
    }
}
231 
/// Checks the NEON header-name matcher against every entry of `crate::HEADER_NAME_MAP`.
#[test]
fn neon_code_matches_header_name_chars_table() {
    #[allow(clippy::undocumented_unsafe_blocks)]
    unsafe {
        // Sanity check: the filler byte used by `byte_is_allowed` must itself be accepted.
        assert!(byte_is_allowed(b'_', match_header_name_vectored));

        for (code, &expected) in crate::HEADER_NAME_MAP.iter().enumerate() {
            let actual = byte_is_allowed(code as u8, match_header_name_vectored);
            assert_eq!(
                actual, expected,
                "byte_is_allowed({:?}) should be {:?}",
                code, expected,
            );
        }
    }
}
249 
/// Test helper: reports whether matcher `f` accepts `byte`.
///
/// `byte` is placed at index 10 of a 16-byte buffer otherwise filled with
/// `b'_'`, and `f` is run over the buffer. Ending at position 16 means the
/// byte was accepted; ending at position 10 means the matcher stopped on it.
///
/// # Panics
///
/// Panics if the matcher stops at any other position.
#[cfg(test)]
unsafe fn byte_is_allowed(byte: u8, f: unsafe fn(bytes: &mut Bytes<'_>)) -> bool {
    let mut buffer = [b'_'; 16];
    buffer[10] = byte;

    let mut cursor = Bytes::new(&buffer);
    f(&mut cursor);

    let stopped_at = cursor.pos();
    if stopped_at == 16 {
        true
    } else if stopped_at == 10 {
        false
    } else {
        panic!("unexpected pos: {}", stopped_at)
    }
}
264