// xref: /aosp_15_r20/external/pigweed/pw_tokenizer/rust/pw_tokenizer_core.rs (revision 61c4878ac05f98d0ceed94b57d316916de578985)
// Copyright 2023 The Pigweed Authors
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License. You may obtain a copy of
// the License at
//
//     https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations under
// the License.

//! # pw_tokenizer_core
//!
//! This crate provides the core functionality of calculating a token hash
//! for a string or byte sequence.  This is intended to provide a minimal core
//! for both the main `pw_tokenizer` and `pw_tokenizer_macro` crates.  Users
//! should prefer depending on `pw_tokenizer` instead of this crate.
#![cfg_attr(not(feature = "std"), no_std)]
22 
/// Multiplier used by the "x65599" string hashing algorithm.
pub const HASH_CONSTANT: u32 = 65599u32;

/// Hasher is used to calculate a token's hash value.
///
/// The hash state is encapsulated in the `Hasher` to allow the hashing of
/// multi-part strings where the concatenated value cannot be known at macro
/// expansion time.  All functions are `const`.
pub struct Hasher {
    // Current per-byte multiplier; HASH_CONSTANT^(n+1) after n bytes hashed.
    coefficient: u32,
    // Accumulated hash value, seeded with the total data length.
    hash: u32,
    // Number of bytes folded into `hash` so far; never exceeds `hash_len`.
    bytes_hashed: usize,
    // Maximum number of bytes to include in the hash.
    hash_len: usize,
}

impl Hasher {
    /// Creates a new `Hasher`.
    ///
    /// `data_len` is the total length of the data to be hashed and seeds the
    /// hash state.  `hash_len` caps how many bytes [`Hasher::process_bytes`]
    /// will actually fold into the hash.
    pub const fn new(data_len: usize, hash_len: usize) -> Self {
        Self {
            hash: data_len as u32,
            coefficient: HASH_CONSTANT,
            hash_len,
            bytes_hashed: 0,
        }
    }

    /// Folds `bytes` into the hash state.
    ///
    /// Consumes `self` and returns a [`Hasher`] with the updated state.  Bytes
    /// beyond the `hash_len` limit given to [`Hasher::new`] are ignored.
    pub const fn process_bytes(mut self, bytes: &[u8]) -> Self {
        let remaining = self.hash_len - self.bytes_hashed;

        // Clamp the number of bytes consumed so that no more than `hash_len`
        // bytes are ever hashed in total.
        let count = if bytes.len() < remaining {
            bytes.len()
        } else {
            remaining
        };

        // `for` loops are not allowed in `const` functions, so iterate with a
        // manual index.  `u32::wrapping_*` is called directly (rather than
        // going through `core::num::Wrapping`) because trait methods can't be
        // used in `const` functions either.
        let mut index = 0;
        while index < count {
            self.hash = self
                .hash
                .wrapping_add(self.coefficient.wrapping_mul(bytes[index] as u32));
            self.coefficient = self.coefficient.wrapping_mul(HASH_CONSTANT);
            index += 1;
        }
        self.bytes_hashed += count;

        self
    }

    /// Consumes `self` and returns the computed hash.
    pub const fn hash(self) -> u32 {
        self.hash
    }
}
88 /// Calculate the hash for a sequence of bytes.
89 ///
90 /// ```
91 /// use pw_tokenizer_core::hash_bytes;
92 ///
93 /// let hash = hash_bytes(&[0x34, 0xd8, 0x3a, 0xbb, 0xf1, 0x0e, 0x07]);
94 /// assert_eq!(hash, 0x9e624642);
95 /// ```
hash_bytes(bytes: &[u8]) -> u3296 pub const fn hash_bytes(bytes: &[u8]) -> u32 {
97     hash_bytes_fixed(bytes, bytes.len())
98 }
99 
100 /// Calculate the hash for a sequence of bytes, examining at most `len` bytes.
101 ///
102 /// ```
103 /// use pw_tokenizer_core::hash_bytes_fixed;
104 ///
105 /// let hash = hash_bytes_fixed(&[0x34, 0xd8, 0x3a, 0xbb, 0xf1, 0x0e, 0x07], 4);
106 /// assert_eq!(hash, 0x92c5d2ac);
107 /// ```
hash_bytes_fixed(bytes: &[u8], len: usize) -> u32108 pub const fn hash_bytes_fixed(bytes: &[u8], len: usize) -> u32 {
109     Hasher::new(bytes.len(), len).process_bytes(bytes).hash()
110 }
111 
112 /// Calculate the hash for a string.
113 ///
114 /// ```
115 /// use pw_tokenizer_core::hash_string;
116 ///
117 /// let hash = hash_string("I �� Pigweed");
118 /// assert_eq!(hash, 0xe318d1b3);
119 /// ```
hash_string(s: &str) -> u32120 pub const fn hash_string(s: &str) -> u32 {
121     hash_bytes(s.as_bytes())
122 }
123 
/// Magic number identifying tokenizer database entries.
///
/// NOTE(review): purpose not evident from this file alone — presumably marks
/// token entries emitted into the binary's token database section and must
/// stay in sync with the C/C++ pw_tokenizer implementation; confirm before
/// changing.
pub const TOKENIZER_ENTRY_MAGIC: u32 = 0xBAA98DEE;
125 
#[cfg(test)]
mod tests {
    use super::{hash_bytes_fixed, Hasher};

    // One precomputed hash test vector.
    struct TestCase {
        string: &'static [u8],
        hash_length: usize,
        hash: u32,
    }

    // Generated file defines `test_cases()`.
    include!("pw_tokenizer_core_test_cases.rs");

    #[test]
    fn hash_bytes_fixed_generates_correct_hash() {
        for case in test_cases() {
            let hash = hash_bytes_fixed(case.string, case.hash_length);
            assert_eq!(
                hash, case.hash,
                "{:08x} != {:08x} string: {:x?} hash_size: {}",
                hash, case.hash, case.string, case.hash_length
            );
        }
    }

    #[test]
    fn multi_part_data_generates_correct_hash() {
        for case in test_cases() {
            // Feed each test string to the hasher in 8 byte chunks to verify
            // that incremental hashing matches the precomputed result.
            let hasher = case
                .string
                .chunks(8)
                .fold(Hasher::new(case.string.len(), case.hash_length), |h, c| {
                    h.process_bytes(c)
                });
            let hash = hasher.hash();
            assert_eq!(
                hash, case.hash,
                "{:08x} != {:08x} string: {:x?} hash_size: {}",
                hash, case.hash, case.string, case.hash_length
            );
        }
    }
}