xref: /aosp_15_r20/external/cronet/base/substring_set_matcher/substring_set_matcher.h (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1 // Copyright 2013 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifndef BASE_SUBSTRING_SET_MATCHER_SUBSTRING_SET_MATCHER_H_
6 #define BASE_SUBSTRING_SET_MATCHER_SUBSTRING_SET_MATCHER_H_
7 
8 #include <stdint.h>
9 
10 #include <limits>
11 #include <set>
12 #include <string>
13 #include <vector>
14 
15 #include "base/base_export.h"
16 #include "base/check_op.h"
17 #include "base/memory/raw_ptr_exclusion.h"
18 #include "base/substring_set_matcher/matcher_string_pattern.h"
19 
20 namespace base {
21 
22 // Class that store a set of string patterns and can find for a string S,
23 // which string patterns occur in S.
24 class BASE_EXPORT SubstringSetMatcher {
25  public:
26   SubstringSetMatcher();
27   SubstringSetMatcher(const SubstringSetMatcher&) = delete;
28   SubstringSetMatcher& operator=(const SubstringSetMatcher&) = delete;
29 
30   ~SubstringSetMatcher();
31 
32   // Registers all |patterns|. Each pattern needs to have a unique ID and all
33   // pattern strings must be unique. Build() should be called exactly once
34   // (before it is called, the tree is empty).
35   //
36   // Complexity:
37   //    Let n = number of patterns.
38   //    Let S = sum of pattern lengths.
39   //    Let k = range of char. Generally 256.
40   // Complexity = O(nlogn + S * logk)
41   // nlogn comes from sorting the patterns.
42   // log(k) comes from our usage of std::map to store edges.
43   //
44   // Returns true on success (may fail if e.g. if the tree gets too many nodes).
45   bool Build(const std::vector<MatcherStringPattern>& patterns);
46   bool Build(std::vector<const MatcherStringPattern*> patterns);
47 
48   // Matches |text| against all registered MatcherStringPatterns. Stores the IDs
49   // of matching patterns in |matches|. |matches| is not cleared before adding
50   // to it.
51   // Complexity:
52   //    Let t = length of |text|.
53   //    Let k = range of char. Generally 256.
54   //    Let z = number of matches returned.
55   // Complexity = O(t * logk + zlogz)
56   bool Match(const std::string& text,
57              std::set<MatcherStringPattern::ID>* matches) const;
58 
59   // As Match(), except it returns immediately on the first match.
60   // This allows true/false matching to be done without any dynamic
61   // memory allocation.
62   // Complexity = O(t * logk)
63   bool AnyMatch(const std::string& text) const;
64 
65   // Returns true if this object retains no allocated data.
IsEmpty()66   bool IsEmpty() const { return is_empty_; }
67 
68   // Returns the dynamically allocated memory usage in bytes. See
69   // base/trace_event/memory_usage_estimator.h for details.
70   size_t EstimateMemoryUsage() const;
71 
72  private:
73   // Represents the index of the node within |tree_|. It is specifically
74   // uint32_t so that we can be sure it takes up 4 bytes when stored together
75   // with the 9-bit label (so 23 bits are allocated to the NodeID, even though
76   // it is exposed as uint32_t). If the computed size of |tree_| is
77   // larger than what can be stored within 23 bits, Build() will fail.
78   using NodeID = uint32_t;
79 
80   // This is the maximum possible size of |tree_| and hence can't be a valid ID.
81   static constexpr NodeID kInvalidNodeID = (1u << 23) - 1;
82 
83   static constexpr NodeID kRootID = 0;
84 
85   // A node of an Aho Corasick Tree. See
86   // http://web.stanford.edu/class/archive/cs/cs166/cs166.1166/lectures/02/Small02.pdf
87   // to understand the algorithm.
88   //
89   // The algorithm is based on the idea of building a trie of all registered
90   // patterns. Each node of the tree is annotated with a set of pattern
91   // IDs that are used to report matches.
92   //
93   // The root of the trie represents an empty match. If we were looking whether
94   // any registered pattern matches a text at the beginning of the text (i.e.
95   // whether any pattern is a prefix of the text), we could just follow
96   // nodes in the trie according to the matching characters in the text.
97   // E.g., if text == "foobar", we would follow the trie from the root node
98   // to its child labeled 'f', from there to child 'o', etc. In this process we
99   // would report all pattern IDs associated with the trie nodes as matches.
100   //
101   // As we are not looking for all prefix matches but all substring matches,
102   // this algorithm would need to compare text.substr(0), text.substr(1), ...
103   // against the trie, which is in O(|text|^2).
104   //
105   // The Aho Corasick algorithm improves this runtime by using failure edges.
106   // In case we have found a partial match of length k in the text
107   // (text[i, ..., i + k - 1]) in the trie starting at the root and ending at
108   // a node at depth k, but cannot find a match in the trie for character
109   // text[i + k] at depth k + 1, we follow a failure edge. This edge
110   // corresponds to the longest proper suffix of text[i, ..., i + k - 1] that
111   // is a prefix of any registered pattern.
112   //
113   // If your brain thinks "Forget it, let's go shopping.", don't worry.
114   // Take a nap and read an introductory text on the Aho Corasick algorithm.
115   // It will make sense. Eventually.
116 
117   // An edge internal to the tree. We pack the label (character we are
118   // matching on) and the destination node ID into 32 bits, to save memory.
119   // We also use these edges as a sort of generic key/value store for
120   // some special values that not all nodes will have; this also saves on
121   // memory over the otherwise obvious choice of having them as struct fields,
122   // as it means we do not to store them when they are not present.
123   struct AhoCorasickEdge {
124     // char (unsigned, so [0..255]), or a special label below.
125     uint32_t label : 9;
126     NodeID node_id : 23;
127   };
128 
129   // Node index that failure edge leads to. The failure node corresponds to
130   // the node which represents the longest proper suffix (include empty
131   // string) of the string represented by this node. Not stored if it is
132   // equal to kRootID (since that is the most common value).
133   //
134   // NOTE: Assigning |root| as the failure edge for itself doesn't strictly
135   // abide by the definition of "proper" suffix. The proper suffix of an empty
136   // string should probably be defined as null, but we assign it to the |root|
137   // to simplify the code and have the invariant that the failure edge is always
138   // defined.
139   static constexpr uint32_t kFailureNodeLabel = 0x100;
140   static constexpr uint32_t kFirstSpecialLabel = kFailureNodeLabel;
141 
142   // Node index that corresponds to the longest proper suffix (including empty
143   // suffix) of this node and which also represents the end of a pattern.
144   // Does not have to exist.
145   static constexpr uint32_t kOutputLinkLabel = 0x101;
146 
147   // If present, this node represents the end of a pattern. It stores the ID of
148   // the corresponding pattern (ie., it is not really a NodeID, but a
149   // MatcherStringPattern::ID).
150   static constexpr uint32_t kMatchIDLabel = 0x102;
151 
152   // Used for uninitialized label slots; used so that we do not have to test for
153   // them in other ways, since we know the data will be initialized and never
154   // match any other labels.
155   static constexpr uint32_t kEmptyLabel = 0x103;
156 
157   // A node in the trie, packed tightly together so that it occupies 12 bytes
158   // (both on 32- and 64-bit platforms), but aligned to at least 4 (see the
159   // comment on edges_).
160   class alignas(AhoCorasickEdge) AhoCorasickNode {
161    public:
162     AhoCorasickNode();
163     ~AhoCorasickNode();
164     AhoCorasickNode(AhoCorasickNode&& other);
165     AhoCorasickNode& operator=(AhoCorasickNode&& other);
166 
GetEdge(uint32_t label)167     NodeID GetEdge(uint32_t label) const {
168       if (edges_capacity_ != 0) {
169         return GetEdgeNoInline(label);
170       }
171       static_assert(kNumInlineEdges == 2, "Code below needs updating");
172       if (edges_.inline_edges[0].label == label) {
173         return edges_.inline_edges[0].node_id;
174       }
175       if (edges_.inline_edges[1].label == label) {
176         return edges_.inline_edges[1].node_id;
177       }
178       return kInvalidNodeID;
179     }
180     NodeID GetEdgeNoInline(uint32_t label) const;
181     void SetEdge(uint32_t label, NodeID node);
edges()182     const AhoCorasickEdge* edges() const {
183       // NOTE: Returning edges_.inline_edges here is fine, because it's
184       // the first thing in the struct (see the comment on edges_).
185       DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(edges_.inline_edges) %
186                         alignof(AhoCorasickEdge));
187       return edges_capacity_ == 0 ? edges_.inline_edges : edges_.edges;
188     }
189 
failure()190     NodeID failure() const {
191       // NOTE: Even if num_edges_ == 0, we are not doing anything
192       // undefined, as we will have room for at least two edges
193       // and empty edges are set to kEmptyLabel.
194       const AhoCorasickEdge& first_edge = *edges();
195       if (first_edge.label == kFailureNodeLabel) {
196         return first_edge.node_id;
197       } else {
198         return kRootID;
199       }
200     }
201     void SetFailure(NodeID failure);
202 
SetMatchID(MatcherStringPattern::ID id)203     void SetMatchID(MatcherStringPattern::ID id) {
204       DCHECK(!IsEndOfPattern());
205       DCHECK(id < kInvalidNodeID);  // This is enforced by Build().
206       SetEdge(kMatchIDLabel, static_cast<NodeID>(id));
207       has_outputs_ = true;
208     }
209 
210     // Returns true if this node corresponds to a pattern.
IsEndOfPattern()211     bool IsEndOfPattern() const {
212       if (!has_outputs_) {
213         // Fast reject.
214         return false;
215       }
216       return GetEdge(kMatchIDLabel) != kInvalidNodeID;
217     }
218 
219     // Must only be called if |IsEndOfPattern| returns true for this node.
GetMatchID()220     MatcherStringPattern::ID GetMatchID() const {
221       DCHECK(IsEndOfPattern());
222       return GetEdge(kMatchIDLabel);
223     }
224 
SetOutputLink(NodeID node)225     void SetOutputLink(NodeID node) {
226       if (node != kInvalidNodeID) {
227         SetEdge(kOutputLinkLabel, node);
228         has_outputs_ = true;
229       }
230     }
output_link()231     NodeID output_link() const { return GetEdge(kOutputLinkLabel); }
232 
233     size_t EstimateMemoryUsage() const;
num_edges()234     size_t num_edges() const {
235       if (edges_capacity_ == 0) {
236         return kNumInlineEdges - num_free_edges_;
237       } else {
238         return edges_capacity_ - num_free_edges_;
239       }
240     }
241 
has_outputs()242     bool has_outputs() const { return has_outputs_; }
243 
244    private:
245     // Outgoing edges of current node, including failure edge and output links.
246     // Most nodes have only one or two (or even zero) edges, not the last
247     // because many of them are leaves. Thus, we make an optimization for this
248     // common case; instead of a pointer to an edge array on the heap, we can
249     // pack two edges inline where the pointer would otherwise be. This reduces
250     // memory usage dramatically, as well as saving us a cache-line fetch.
251     //
252     // Note that even though most nodes have fewer outgoing edges, most nodes
253     // that we actually traverse will have any of them. This apparent
254     // contradiction is because we tend to spend more of our time near the root
255     // of the trie, where it is wide. This means that another layout would be
256     // possible: If we wanted to, non-inline nodes could simply store an array
257     // of 259 (256 possible characters plus the three special label types)
258     // edges, indexed directly by label type. This would use 20–50% more RAM,
259     // but also increases the speed of lookups due to removing the search loop.
260     //
261     // The nodes are generally unordered; since we typically index text, even
262     // the root will rarely be more than 20–30 wide, and at that point, it's
263     // better to just do a linear search than a binary one (which fares poorly
264     // on branch predictors). However, a special case, we put kFailureNodeLabel
265     // in the first slot if it exists (ie., is not equal to kRootID), since we
266     // need to access that label during every single node we look at during
267     // traversal.
268     //
269     // NOTE: Keep this the first member in the struct, so that inline_edges gets
270     // 4-aligned (since the class is marked as such, despite being packed.
271     // Otherwise, edges() can return an unaligned pointer marked as aligned
272     // (the unalignedness gets lost).
273     static constexpr int kNumInlineEdges = 2;
274     union {
275       // Out-of-line edge storage, having room for edges_capacity_ elements.
276       // Note that due to __attribute__((packed)) below, this pointer may be
277       // unaligned on 64-bit platforms, causing slightly less efficient
278       // access to it in some cases.
279       // This field is not a raw_ptr<> because it was filtered by the rewriter
280       // for: #union
281       RAW_PTR_EXCLUSION AhoCorasickEdge* edges;
282 
283       // Inline edge storage, used if edges_capacity_ == 0.
284       AhoCorasickEdge inline_edges[kNumInlineEdges];
285     } edges_;
286 
287     // Whether we have an edge for kMatchIDLabel or kOutputLinkLabel,
288     // ie., hitting this node during traversal will create one or more
289     // matches. This is redundant, but since every single lookup during
290     // traversal needs this, it saves a few searches for us.
291     bool has_outputs_ = false;
292 
293     // Number of unused left in edges_. Edges are always allocated from the
294     // beginning and never deleted; those after num_edges_ will be marked with
295     // kEmptyLabel (and have an undefined node_id). We store the number of
296     // free edges instead of the more common number of _used_ edges, to be
297     // sure that we are able to fit it in an uint8_t. num_edges() provides
298     // a useful abstraction over this.
299     uint8_t num_free_edges_ = kNumInlineEdges;
300 
301     // How many edges we have allocated room for (can never be more than
302     // kEmptyLabel + 1). If equal to zero, we are not using heap storage,
303     // but instead are using inline_edges.
304     //
305     // If not equal to zero, will be a multiple of 4, so that we can use
306     // SIMD to accelerate looking for edges.
307     uint16_t edges_capacity_ = 0;
308   } __attribute__((packed));
309 
310   using SubstringPatternVector = std::vector<const MatcherStringPattern*>;
311 
312   // Given the set of patterns, compute how many nodes will the corresponding
313   // Aho-Corasick tree have. Note that |patterns| need to be sorted.
314   NodeID GetTreeSize(
315       const std::vector<const MatcherStringPattern*>& patterns) const;
316 
317   void BuildAhoCorasickTree(const SubstringPatternVector& patterns);
318 
319   // Inserts a path for |pattern->pattern()| into the tree and adds
320   // |pattern->id()| to the set of matches.
321   void InsertPatternIntoAhoCorasickTree(const MatcherStringPattern* pattern);
322 
323   void CreateFailureAndOutputEdges();
324 
325   // Adds all pattern IDs to |matches| which are a suffix of the string
326   // represented by |node|.
327   void AccumulateMatchesForNode(
328       const AhoCorasickNode* node,
329       std::set<MatcherStringPattern::ID>* matches) const;
330 
331   // The nodes of a Aho-Corasick tree.
332   std::vector<AhoCorasickNode> tree_;
333 
334   bool is_empty_ = true;
335 };
336 
337 }  // namespace base
338 
339 #endif  // BASE_SUBSTRING_SET_MATCHER_SUBSTRING_SET_MATCHER_H_
340