xref: /aosp_15_r20/external/cronet/base/substring_set_matcher/substring_set_matcher.cc (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1 // Copyright 2013 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "base/substring_set_matcher/substring_set_matcher.h"
6 
7 #include <stddef.h>
8 
9 #include <algorithm>
10 #include <queue>
11 
12 #ifdef __SSE2__
13 #include <immintrin.h>
14 #include "base/bits.h"
15 #endif
16 
17 #include "base/check_op.h"
18 #include "base/containers/contains.h"
19 #include "base/containers/queue.h"
20 #include "base/numerics/checked_math.h"
21 #include "base/trace_event/memory_usage_estimator.h"  // no-presubmit-check
22 
23 namespace base {
24 
25 namespace {
26 
27 // Compare MatcherStringPattern instances based on their string patterns.
ComparePatterns(const MatcherStringPattern * a,const MatcherStringPattern * b)28 bool ComparePatterns(const MatcherStringPattern* a,
29                      const MatcherStringPattern* b) {
30   return a->pattern() < b->pattern();
31 }
32 
GetVectorOfPointers(const std::vector<MatcherStringPattern> & patterns)33 std::vector<const MatcherStringPattern*> GetVectorOfPointers(
34     const std::vector<MatcherStringPattern>& patterns) {
35   std::vector<const MatcherStringPattern*> pattern_pointers;
36   pattern_pointers.reserve(patterns.size());
37 
38   for (const MatcherStringPattern& pattern : patterns)
39     pattern_pointers.push_back(&pattern);
40 
41   return pattern_pointers;
42 }
43 
44 }  // namespace
45 
Build(const std::vector<MatcherStringPattern> & patterns)46 bool SubstringSetMatcher::Build(
47     const std::vector<MatcherStringPattern>& patterns) {
48   return Build(GetVectorOfPointers(patterns));
49 }
50 
Build(std::vector<const MatcherStringPattern * > patterns)51 bool SubstringSetMatcher::Build(
52     std::vector<const MatcherStringPattern*> patterns) {
53   // Ensure there are no duplicate IDs and all pattern strings are distinct.
54 #if DCHECK_IS_ON()
55   {
56     std::set<MatcherStringPattern::ID> ids;
57     std::set<std::string> pattern_strings;
58     for (const MatcherStringPattern* pattern : patterns) {
59       CHECK(!base::Contains(ids, pattern->id()));
60       CHECK(!base::Contains(pattern_strings, pattern->pattern()));
61       ids.insert(pattern->id());
62       pattern_strings.insert(pattern->pattern());
63     }
64   }
65 #endif
66 
67   // Check that all the match labels fit into an edge.
68   for (const MatcherStringPattern* pattern : patterns) {
69     if (pattern->id() >= kInvalidNodeID) {
70       return false;
71     }
72   }
73 
74   // Compute the total number of tree nodes needed.
75   std::sort(patterns.begin(), patterns.end(), ComparePatterns);
76   NodeID tree_size = GetTreeSize(patterns);
77   if (tree_size >= kInvalidNodeID) {
78     return false;
79   }
80   tree_.reserve(GetTreeSize(patterns));
81   BuildAhoCorasickTree(patterns);
82 
83   // Sanity check that no new allocations happened in the tree and our computed
84   // size was correct.
85   DCHECK_EQ(tree_.size(), static_cast<size_t>(GetTreeSize(patterns)));
86 
87   is_empty_ = patterns.empty() && tree_.size() == 1u;
88   return true;
89 }
90 
// Constructor and destructor are compiler-generated; they are defaulted here,
// out of line, rather than in the class definition.
91 SubstringSetMatcher::SubstringSetMatcher() = default;
92 SubstringSetMatcher::~SubstringSetMatcher() = default;
93 
Match(const std::string & text,std::set<MatcherStringPattern::ID> * matches) const94 bool SubstringSetMatcher::Match(
95     const std::string& text,
96     std::set<MatcherStringPattern::ID>* matches) const {
97   const size_t old_number_of_matches = matches->size();
98 
99   // Handle patterns matching the empty string.
100   const AhoCorasickNode* const root = &tree_[kRootID];
101   AccumulateMatchesForNode(root, matches);
102 
103   const AhoCorasickNode* current_node = root;
104   for (const char c : text) {
105     NodeID child = current_node->GetEdge(static_cast<unsigned char>(c));
106 
107     // If the child not can't be found, progressively iterate over the longest
108     // proper suffix of the string represented by the current node. In a sense
109     // we are pruning prefixes from the text.
110     while (child == kInvalidNodeID && current_node != root) {
111       current_node = &tree_[current_node->failure()];
112       child = current_node->GetEdge(static_cast<unsigned char>(c));
113     }
114 
115     if (child != kInvalidNodeID) {
116       // The string represented by |child| is the longest possible suffix of the
117       // current position of |text| in the trie.
118       current_node = &tree_[child];
119       AccumulateMatchesForNode(current_node, matches);
120     } else {
121       // The empty string is the longest possible suffix of the current position
122       // of |text| in the trie.
123       DCHECK_EQ(root, current_node);
124     }
125   }
126 
127   return old_number_of_matches != matches->size();
128 }
129 
AnyMatch(const std::string & text) const130 bool SubstringSetMatcher::AnyMatch(const std::string& text) const {
131   // Handle patterns matching the empty string.
132   const AhoCorasickNode* const root = &tree_[kRootID];
133   if (root->has_outputs()) {
134     return true;
135   }
136 
137   const AhoCorasickNode* current_node = root;
138   for (const char c : text) {
139     NodeID child = current_node->GetEdge(static_cast<unsigned char>(c));
140 
141     // If the child not can't be found, progressively iterate over the longest
142     // proper suffix of the string represented by the current node. In a sense
143     // we are pruning prefixes from the text.
144     while (child == kInvalidNodeID && current_node != root) {
145       current_node = &tree_[current_node->failure()];
146       child = current_node->GetEdge(static_cast<unsigned char>(c));
147     }
148 
149     if (child != kInvalidNodeID) {
150       // The string represented by |child| is the longest possible suffix of the
151       // current position of |text| in the trie.
152       current_node = &tree_[child];
153       if (current_node->has_outputs()) {
154         return true;
155       }
156     } else {
157       // The empty string is the longest possible suffix of the current position
158       // of |text| in the trie.
159       DCHECK_EQ(root, current_node);
160     }
161   }
162 
163   return false;
164 }
165 
EstimateMemoryUsage() const166 size_t SubstringSetMatcher::EstimateMemoryUsage() const {
167   return base::trace_event::EstimateMemoryUsage(tree_);
168 }
169 
170 // static
// Namespace-scope definitions of the class's static constexpr NodeID
// constants; their values are given where they are declared in the class.
// NOTE(review): presumably kept so the constants can be ODR-used when built
// without C++17 inline-variable semantics -- confirm before removing.
171 constexpr SubstringSetMatcher::NodeID SubstringSetMatcher::kInvalidNodeID;
172 constexpr SubstringSetMatcher::NodeID SubstringSetMatcher::kRootID;
173 
GetTreeSize(const std::vector<const MatcherStringPattern * > & patterns) const174 SubstringSetMatcher::NodeID SubstringSetMatcher::GetTreeSize(
175     const std::vector<const MatcherStringPattern*>& patterns) const {
176   DCHECK(std::is_sorted(patterns.begin(), patterns.end(), ComparePatterns));
177 
178   base::CheckedNumeric<NodeID> result = 1u;  // 1 for the root node.
179   if (patterns.empty())
180     return result.ValueOrDie();
181 
182   auto last = patterns.begin();
183   auto current = last + 1;
184   // For the first pattern, each letter is a label of an edge to a new node.
185   result += (*last)->pattern().size();
186 
187   // For the subsequent patterns, only count the edges which were not counted
188   // yet. For this it suffices to test against the previous pattern, because the
189   // patterns are sorted.
190   for (; current != patterns.end(); ++last, ++current) {
191     const std::string& last_pattern = (*last)->pattern();
192     const std::string& current_pattern = (*current)->pattern();
193     size_t prefix_bound = std::min(last_pattern.size(), current_pattern.size());
194 
195     size_t common_prefix = 0;
196     while (common_prefix < prefix_bound &&
197            last_pattern[common_prefix] == current_pattern[common_prefix]) {
198       ++common_prefix;
199     }
200 
201     result -= common_prefix;
202     result += current_pattern.size();
203   }
204 
205   return result.ValueOrDie();
206 }
207 
BuildAhoCorasickTree(const SubstringPatternVector & patterns)208 void SubstringSetMatcher::BuildAhoCorasickTree(
209     const SubstringPatternVector& patterns) {
210   DCHECK(tree_.empty());
211 
212   // Initialize root node of tree.
213   tree_.emplace_back();
214 
215   // Build the initial trie for all the patterns.
216   for (const MatcherStringPattern* pattern : patterns)
217     InsertPatternIntoAhoCorasickTree(pattern);
218 
219   CreateFailureAndOutputEdges();
220 }
221 
InsertPatternIntoAhoCorasickTree(const MatcherStringPattern * pattern)222 void SubstringSetMatcher::InsertPatternIntoAhoCorasickTree(
223     const MatcherStringPattern* pattern) {
224   const std::string& text = pattern->pattern();
225   const std::string::const_iterator text_end = text.end();
226 
227   // Iterators on the tree and the text.
228   AhoCorasickNode* current_node = &tree_[kRootID];
229   std::string::const_iterator i = text.begin();
230 
231   // Follow existing paths for as long as possible.
232   while (i != text_end) {
233     NodeID child = current_node->GetEdge(static_cast<unsigned char>(*i));
234     if (child == kInvalidNodeID)
235       break;
236     current_node = &tree_[child];
237     ++i;
238   }
239 
240   // Create new nodes if necessary.
241   while (i != text_end) {
242     tree_.emplace_back();
243     current_node->SetEdge(static_cast<unsigned char>(*i),
244                           static_cast<NodeID>(tree_.size() - 1));
245     current_node = &tree_.back();
246     ++i;
247   }
248 
249   // Register match.
250   current_node->SetMatchID(pattern->id());
251 }
252 
CreateFailureAndOutputEdges()253 void SubstringSetMatcher::CreateFailureAndOutputEdges() {
254   base::queue<AhoCorasickNode*> queue;
255 
256   // Initialize the failure edges for |root| and its children.
257   AhoCorasickNode* const root = &tree_[0];
258 
259   root->SetOutputLink(kInvalidNodeID);
260 
261   NodeID root_output_link = root->IsEndOfPattern() ? kRootID : kInvalidNodeID;
262 
263   for (unsigned edge_idx = 0; edge_idx < root->num_edges(); ++edge_idx) {
264     const AhoCorasickEdge& edge = root->edges()[edge_idx];
265     if (edge.label >= kFirstSpecialLabel) {
266       continue;
267     }
268     AhoCorasickNode* child = &tree_[edge.node_id];
269     // Failure node is kept as the root.
270     child->SetOutputLink(root_output_link);
271     queue.push(child);
272   }
273 
274   // Do a breadth first search over the trie to create failure edges. We
275   // maintain the invariant that any node in |queue| has had its |failure_| and
276   // |output_link_| edge already initialized.
277   while (!queue.empty()) {
278     AhoCorasickNode* current_node = queue.front();
279     queue.pop();
280 
281     // Compute the failure and output edges of children using the failure edges
282     // of the current node.
283     for (unsigned edge_idx = 0; edge_idx < current_node->num_edges();
284          ++edge_idx) {
285       const AhoCorasickEdge& edge = current_node->edges()[edge_idx];
286       if (edge.label >= kFirstSpecialLabel) {
287         continue;
288       }
289       AhoCorasickNode* child = &tree_[edge.node_id];
290 
291       const AhoCorasickNode* failure_candidate_parent =
292           &tree_[current_node->failure()];
293       NodeID failure_candidate_id =
294           failure_candidate_parent->GetEdge(edge.label);
295       while (failure_candidate_id == kInvalidNodeID &&
296              failure_candidate_parent != root) {
297         failure_candidate_parent = &tree_[failure_candidate_parent->failure()];
298         failure_candidate_id = failure_candidate_parent->GetEdge(edge.label);
299       }
300 
301       if (failure_candidate_id == kInvalidNodeID) {
302         DCHECK_EQ(root, failure_candidate_parent);
303         // |failure_candidate| is invalid and we can't proceed further since we
304         // have reached the root. Hence the longest proper suffix of this string
305         // represented by this node is the empty string (represented by root).
306         failure_candidate_id = kRootID;
307       } else {
308         child->SetFailure(failure_candidate_id);
309       }
310 
311       const AhoCorasickNode* failure_candidate = &tree_[failure_candidate_id];
312       // Now |failure_candidate| is |child|'s longest possible proper suffix in
313       // the trie. We also know that since we are doing a breadth first search,
314       // we would have established |failure_candidate|'s output link by now.
315       // Hence we can define |child|'s output link as follows:
316       child->SetOutputLink(failure_candidate->IsEndOfPattern()
317                                ? failure_candidate_id
318                                : failure_candidate->output_link());
319 
320       queue.push(child);
321     }
322   }
323 }
324 
AccumulateMatchesForNode(const AhoCorasickNode * node,std::set<MatcherStringPattern::ID> * matches) const325 void SubstringSetMatcher::AccumulateMatchesForNode(
326     const AhoCorasickNode* node,
327     std::set<MatcherStringPattern::ID>* matches) const {
328   DCHECK(matches);
329 
330   if (!node->has_outputs()) {
331     // Fast reject.
332     return;
333   }
334   if (node->IsEndOfPattern())
335     matches->insert(node->GetMatchID());
336 
337   NodeID node_id = node->output_link();
338   while (node_id != kInvalidNodeID) {
339     node = &tree_[node_id];
340     matches->insert(node->GetMatchID());
341     node_id = node->output_link();
342   }
343 }
344 
AhoCorasickNode()345 SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode() {
346   static_assert(kNumInlineEdges == 2, "Code below needs updating");
347   edges_.inline_edges[0].label = kEmptyLabel;
348   edges_.inline_edges[1].label = kEmptyLabel;
349 }
350 
~AhoCorasickNode()351 SubstringSetMatcher::AhoCorasickNode::~AhoCorasickNode() {
352   if (edges_capacity_ != 0) {
353     delete[] edges_.edges;
354   }
355 }
356 
AhoCorasickNode(AhoCorasickNode && other)357 SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode(AhoCorasickNode&& other) {
358   *this = std::move(other);
359 }
360 
361 SubstringSetMatcher::AhoCorasickNode&
operator =(AhoCorasickNode && other)362 SubstringSetMatcher::AhoCorasickNode::operator=(AhoCorasickNode&& other) {
363   if (edges_capacity_ != 0) {
364     // Delete the old heap allocation if needed.
365     delete[] edges_.edges;
366   }
367   if (other.edges_capacity_ == 0) {
368     static_assert(kNumInlineEdges == 2, "Code below needs updating");
369     edges_.inline_edges[0] = other.edges_.inline_edges[0];
370     edges_.inline_edges[1] = other.edges_.inline_edges[1];
371   } else {
372     // Move over the heap allocation.
373     edges_.edges = other.edges_.edges;
374     other.edges_.edges = nullptr;
375   }
376   num_free_edges_ = other.num_free_edges_;
377   edges_capacity_ = other.edges_capacity_;
378   return *this;
379 }
380 
381 SubstringSetMatcher::NodeID
GetEdgeNoInline(uint32_t label) const382 SubstringSetMatcher::AhoCorasickNode::GetEdgeNoInline(uint32_t label) const {
383   DCHECK(edges_capacity_ != 0);
384 #ifdef __SSE2__
385   const __m128i lbl = _mm_set1_epi32(static_cast<int>(label));
386   const __m128i mask = _mm_set1_epi32(0x1ff);
387   for (unsigned edge_idx = 0; edge_idx < num_edges(); edge_idx += 4) {
388     const __m128i four = _mm_loadu_si128(
389         reinterpret_cast<const __m128i*>(&edges_.edges[edge_idx]));
390     const __m128i match = _mm_cmpeq_epi32(_mm_and_si128(four, mask), lbl);
391     const uint32_t match_mask = static_cast<uint32_t>(_mm_movemask_epi8(match));
392     if (match_mask != 0) {
393       if (match_mask & 0x1u) {
394         return edges_.edges[edge_idx].node_id;
395       }
396       if (match_mask & 0x10u) {
397         return edges_.edges[edge_idx + 1].node_id;
398       }
399       if (match_mask & 0x100u) {
400         return edges_.edges[edge_idx + 2].node_id;
401       }
402       DCHECK(match_mask & 0x1000u);
403       return edges_.edges[edge_idx + 3].node_id;
404     }
405   }
406 #else
407   for (unsigned edge_idx = 0; edge_idx < num_edges(); ++edge_idx) {
408     const AhoCorasickEdge& edge = edges_.edges[edge_idx];
409     if (edge.label == label)
410       return edge.node_id;
411   }
412 #endif
413   return kInvalidNodeID;
414 }
415 
SetEdge(uint32_t label,NodeID node)416 void SubstringSetMatcher::AhoCorasickNode::SetEdge(uint32_t label,
417                                                    NodeID node) {
418   DCHECK_LT(node, kInvalidNodeID);
419 
420 #if DCHECK_IS_ON()
421   // We don't support overwriting existing edges.
422   for (unsigned edge_idx = 0; edge_idx < num_edges(); ++edge_idx) {
423     DCHECK_NE(label, edges()[edge_idx].label);
424   }
425 #endif
426 
427   if (edges_capacity_ == 0 && num_free_edges_ > 0) {
428     // Still space in the inline storage, so use that.
429     edges_.inline_edges[num_edges()] = AhoCorasickEdge{label, node};
430     if (label == kFailureNodeLabel) {
431       // Make sure that kFailureNodeLabel is first.
432       // NOTE: We don't use std::swap here, because the compiler doesn't
433       // understand that inline_edges[] is 4-aligned and can give
434       // a warning or error.
435       AhoCorasickEdge temp = edges_.inline_edges[0];
436       edges_.inline_edges[0] = edges_.inline_edges[num_edges()];
437       edges_.inline_edges[num_edges()] = temp;
438     }
439     --num_free_edges_;
440     return;
441   }
442 
443   if (num_free_edges_ == 0) {
444     // We are out of space, so double our capacity (unless that would cause
445     // num_free_edges_ to overflow). This can either be because we are
446     // converting from inline to heap storage, or because we are increasing the
447     // size of our heap storage.
448     unsigned old_capacity =
449         edges_capacity_ == 0 ? kNumInlineEdges : edges_capacity_;
450     unsigned new_capacity = std::min(old_capacity * 2, kEmptyLabel + 1);
451     DCHECK_EQ(0u, new_capacity % 4);
452     AhoCorasickEdge* new_edges = new AhoCorasickEdge[new_capacity];
453     memcpy(new_edges, edges(), sizeof(AhoCorasickEdge) * old_capacity);
454     for (unsigned edge_idx = old_capacity; edge_idx < new_capacity;
455          ++edge_idx) {
456       new_edges[edge_idx].label = kEmptyLabel;
457     }
458     if (edges_capacity_ != 0) {
459       delete[] edges_.edges;
460     }
461     edges_.edges = new_edges;
462     // These casts are safe due to the DCHECK above.
463     edges_capacity_ = static_cast<uint16_t>(new_capacity);
464     num_free_edges_ = static_cast<uint8_t>(new_capacity - old_capacity);
465   }
466 
467   // Insert the new edge at the end of our heap storage.
468   edges_.edges[num_edges()] = AhoCorasickEdge{label, node};
469   if (label == kFailureNodeLabel) {
470     // Make sure that kFailureNodeLabel is first.
471     std::swap(edges_.edges[0], edges_.edges[num_edges()]);
472   }
473   --num_free_edges_;
474 }
475 
SetFailure(NodeID node)476 void SubstringSetMatcher::AhoCorasickNode::SetFailure(NodeID node) {
477   DCHECK_NE(kInvalidNodeID, node);
478   if (node != kRootID) {
479     SetEdge(kFailureNodeLabel, node);
480   }
481 }
482 
EstimateMemoryUsage() const483 size_t SubstringSetMatcher::AhoCorasickNode::EstimateMemoryUsage() const {
484   if (edges_capacity_ == 0) {
485     return 0;
486   } else {
487     return base::trace_event::EstimateMemoryUsage(edges_.edges,
488                                                   edges_capacity_);
489   }
490 }
491 
492 }  // namespace base
493