xref: /aosp_15_r20/external/cronet/net/base/lookup_string_in_fixed_set.cc (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1 // Copyright 2015 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "net/base/lookup_string_in_fixed_set.h"
6 
7 #include "base/check.h"
8 
9 namespace net {
10 
11 namespace {
12 
13 // Read next offset from |pos|, increment |offset| by that amount, and increment
14 // |pos| either to point to the start of the next encoded offset in its node, or
15 // nullptr, if there are no remaining offsets.
16 //
17 // Returns true if an offset could be read; false otherwise.
GetNextOffset(const unsigned char ** pos,const unsigned char ** offset)18 inline bool GetNextOffset(const unsigned char** pos,
19                           const unsigned char** offset) {
20   if (*pos == nullptr)
21     return false;
22 
23   size_t bytes_consumed;
24   switch (**pos & 0x60) {
25     case 0x60:  // Read three byte offset
26       *offset += (((*pos)[0] & 0x1F) << 16) | ((*pos)[1] << 8) | (*pos)[2];
27       bytes_consumed = 3;
28       break;
29     case 0x40:  // Read two byte offset
30       *offset += (((*pos)[0] & 0x1F) << 8) | (*pos)[1];
31       bytes_consumed = 2;
32       break;
33     default:
34       *offset += (*pos)[0] & 0x3F;
35       bytes_consumed = 1;
36   }
37   if ((**pos & 0x80) != 0) {
38     *pos = nullptr;
39   } else {
40     *pos += bytes_consumed;
41   }
42   return true;
43 }
44 
45 // Check if byte at |offset| is last in label.
IsEOL(const unsigned char * offset)46 bool IsEOL(const unsigned char* offset) {
47   return (*offset & 0x80) != 0;
48 }
49 
50 // Check if byte at |offset| matches key. This version matches both end-of-label
51 // chars and not-end-of-label chars.
IsMatch(const unsigned char * offset,char key)52 bool IsMatch(const unsigned char* offset, char key) {
53   return (*offset & 0x7F) == key;
54 }
55 
56 // Read return value at |offset|, if it is a return value. Returns true if a
57 // return value could be read, false otherwise.
GetReturnValue(const unsigned char * offset,int * return_value)58 bool GetReturnValue(const unsigned char* offset, int* return_value) {
59   // Return values are always encoded as end-of-label chars (so the high bit is
60   // set). So byte values in the inclusive range [0x80, 0x9F] encode the return
61   // values 0 through 31 (though make_dafsa.py doesn't currently encode values
62   // higher than 7). The following code does that translation.
63   if ((*offset & 0xE0) == 0x80) {
64     *return_value = *offset & 0x1F;
65     return true;
66   }
67   return false;
68 }
69 
70 }  // namespace
71 
FixedSetIncrementalLookup(const unsigned char * graph,size_t length)72 FixedSetIncrementalLookup::FixedSetIncrementalLookup(const unsigned char* graph,
73                                                      size_t length)
74     : pos_(graph), end_(graph + length) {}
75 
76 FixedSetIncrementalLookup::FixedSetIncrementalLookup(
77     const FixedSetIncrementalLookup& other) = default;
78 
79 FixedSetIncrementalLookup& FixedSetIncrementalLookup::operator=(
80     const FixedSetIncrementalLookup& other) = default;
81 
82 FixedSetIncrementalLookup::~FixedSetIncrementalLookup() = default;
83 
Advance(char input)84 bool FixedSetIncrementalLookup::Advance(char input) {
85   if (!pos_) {
86     // A previous input exhausted the graph, so there are no possible matches.
87     return false;
88   }
89 
90   // Only ASCII printable chars are supported by the current DAFSA format -- the
91   // high bit (values 0x80-0xFF) is reserved as a label-end signifier, and the
92   // low values (values 0x00-0x1F) are reserved to encode the return values. So
93   // values outside this range will never be in the dictionary.
94   if (input >= 0x20) {
95     if (pos_is_label_character_) {
96       // Currently processing a label, so it is only necessary to check the byte
97       // at |pos_| to see if it encodes a character matching |input|.
98       bool is_last_char_in_label = IsEOL(pos_);
99       bool is_match = IsMatch(pos_, input);
100       if (is_match) {
101         // If this is not the last character in the label, the next byte should
102         // be interpreted as a character or return value. Otherwise, the next
103         // byte should be interpreted as a list of child node offsets.
104         ++pos_;
105         DCHECK(pos_ < end_);
106         pos_is_label_character_ = !is_last_char_in_label;
107         return true;
108       }
109     } else {
110       const unsigned char* offset = pos_;
111       // Read offsets from |pos_| until the label of the child node at |offset|
112       // matches |input|, or until there are no more offsets.
113       while (GetNextOffset(&pos_, &offset)) {
114         DCHECK(offset < end_);
115         DCHECK((pos_ == nullptr) || (pos_ < end_));
116 
117         // |offset| points to a DAFSA node that is a child of the original node.
118         //
119         // The low 7 bits of a node encodes a character value; the high bit
120         // indicates whether it's the last character in the label.
121         //
122         // Note that |*offset| could also be a result code value, but these are
123         // really just out-of-range ASCII values, encoded the same way as
124         // characters. Since |input| was already validated as a printable ASCII
125         // value ASCII value, IsMatch will never return true if |offset| is a
126         // result code.
127         bool is_last_char_in_label = IsEOL(offset);
128         bool is_match = IsMatch(offset, input);
129 
130         if (is_match) {
131           // If this is not the last character in the label, the next byte
132           // should be interpreted as a character or return value. Otherwise,
133           // the next byte should be interpreted as a list of child node
134           // offsets.
135           pos_ = offset + 1;
136           DCHECK(pos_ < end_);
137           pos_is_label_character_ = !is_last_char_in_label;
138           return true;
139         }
140       }
141     }
142   }
143 
144   // If no match was found, then end of the DAFSA has been reached.
145   pos_ = nullptr;
146   pos_is_label_character_ = false;
147   return false;
148 }
149 
GetResultForCurrentSequence() const150 int FixedSetIncrementalLookup::GetResultForCurrentSequence() const {
151   int value = kDafsaNotFound;
152   // Look to see if there is a next character that's a return value.
153   if (pos_is_label_character_) {
154     // Currently processing a label, so it is only necessary to check the byte
155     // at |pos_| to see if encodes a return value.
156     GetReturnValue(pos_, &value);
157   } else {
158     // Otherwise, |pos_| is an offset list (or nullptr). Explore the list of
159     // child nodes (given by their offsets) to find one whose label is a result
160     // code.
161     //
162     // This search uses a temporary copy of |pos_|, since mutating |pos_| could
163     // skip over a node that would be important to a subsequent Advance() call.
164     const unsigned char* temp_pos = pos_;
165 
166     // Read offsets from |temp_pos| until either |temp_pos| is nullptr or until
167     // the byte at |offset| contains a result code (encoded as an ASCII
168     // character below 0x20).
169     const unsigned char* offset = pos_;
170     while (GetNextOffset(&temp_pos, &offset)) {
171       DCHECK(offset < end_);
172       DCHECK((temp_pos == nullptr) || temp_pos < end_);
173       if (GetReturnValue(offset, &value))
174         break;
175     }
176   }
177   return value;
178 }
179 
LookupStringInFixedSet(const unsigned char * graph,size_t length,const char * key,size_t key_length)180 int LookupStringInFixedSet(const unsigned char* graph,
181                            size_t length,
182                            const char* key,
183                            size_t key_length) {
184   // Do an incremental lookup until either the end of the graph is reached, or
185   // until every character in |key| is consumed.
186   FixedSetIncrementalLookup lookup(graph, length);
187   const char* key_end = key + key_length;
188   while (key != key_end) {
189     if (!lookup.Advance(*key))
190       return kDafsaNotFound;
191     key++;
192   }
193   // The entire input was consumed without reaching the end of the graph. Return
194   // the result code (if present) for the current position, or kDafsaNotFound.
195   return lookup.GetResultForCurrentSequence();
196 }
197 
198 // This function is only used by GetRegistryLengthInStrippedHost(), but is
199 // implemented here to allow inlining of
200 // LookupStringInFixedSet::GetResultForCurrentSequence() and
201 // LookupStringInFixedSet::Advance() at compile time. Tests on x86_64 linux
202 // indicated about 10% increased runtime cost for GetRegistryLength() in average
203 // if the implementation of this function was separated from the lookup methods.
LookupSuffixInReversedSet(const unsigned char * graph,size_t length,bool include_private,std::string_view host,size_t * suffix_length)204 int LookupSuffixInReversedSet(const unsigned char* graph,
205                               size_t length,
206                               bool include_private,
207                               std::string_view host,
208                               size_t* suffix_length) {
209   FixedSetIncrementalLookup lookup(graph, length);
210   *suffix_length = 0;
211   int result = kDafsaNotFound;
212   std::string_view::const_iterator pos = host.end();
213   // Look up host from right to left.
214   while (pos != host.begin() && lookup.Advance(*--pos)) {
215     // Only host itself or a part that follows a dot can match.
216     if (pos == host.begin() || *(pos - 1) == '.') {
217       int value = lookup.GetResultForCurrentSequence();
218       if (value != kDafsaNotFound) {
219         // Break if private and private rules should be excluded.
220         if ((value & kDafsaPrivateRule) && !include_private)
221           break;
222         // Save length and return value. Since hosts are looked up from right to
223         // left, the last saved values will be from the longest match.
224         *suffix_length = host.end() - pos;
225         result = value;
226       }
227     }
228   }
229   return result;
230 }
231 
232 }  // namespace net
233