1 // Copyright 2013 The Chromium Authors 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #ifndef BASE_SUBSTRING_SET_MATCHER_SUBSTRING_SET_MATCHER_H_ 6 #define BASE_SUBSTRING_SET_MATCHER_SUBSTRING_SET_MATCHER_H_ 7 8 #include <stdint.h> 9 10 #include <limits> 11 #include <set> 12 #include <string> 13 #include <vector> 14 15 #include "base/base_export.h" 16 #include "base/check_op.h" 17 #include "base/memory/raw_ptr_exclusion.h" 18 #include "base/substring_set_matcher/matcher_string_pattern.h" 19 20 namespace base { 21 22 // Class that store a set of string patterns and can find for a string S, 23 // which string patterns occur in S. 24 class BASE_EXPORT SubstringSetMatcher { 25 public: 26 SubstringSetMatcher(); 27 SubstringSetMatcher(const SubstringSetMatcher&) = delete; 28 SubstringSetMatcher& operator=(const SubstringSetMatcher&) = delete; 29 30 ~SubstringSetMatcher(); 31 32 // Registers all |patterns|. Each pattern needs to have a unique ID and all 33 // pattern strings must be unique. Build() should be called exactly once 34 // (before it is called, the tree is empty). 35 // 36 // Complexity: 37 // Let n = number of patterns. 38 // Let S = sum of pattern lengths. 39 // Let k = range of char. Generally 256. 40 // Complexity = O(nlogn + S * logk) 41 // nlogn comes from sorting the patterns. 42 // log(k) comes from our usage of std::map to store edges. 43 // 44 // Returns true on success (may fail if e.g. if the tree gets too many nodes). 45 bool Build(const std::vector<MatcherStringPattern>& patterns); 46 bool Build(std::vector<const MatcherStringPattern*> patterns); 47 48 // Matches |text| against all registered MatcherStringPatterns. Stores the IDs 49 // of matching patterns in |matches|. |matches| is not cleared before adding 50 // to it. 51 // Complexity: 52 // Let t = length of |text|. 53 // Let k = range of char. Generally 256. 54 // Let z = number of matches returned. 55 // Complexity = O(t * logk + zlogz) 56 bool Match(const std::string& text, 57 std::set<MatcherStringPattern::ID>* matches) const; 58 59 // As Match(), except it returns immediately on the first match. 60 // This allows true/false matching to be done without any dynamic 61 // memory allocation. 62 // Complexity = O(t * logk) 63 bool AnyMatch(const std::string& text) const; 64 65 // Returns true if this object retains no allocated data. IsEmpty()66 bool IsEmpty() const { return is_empty_; } 67 68 // Returns the dynamically allocated memory usage in bytes. See 69 // base/trace_event/memory_usage_estimator.h for details. 70 size_t EstimateMemoryUsage() const; 71 72 private: 73 // Represents the index of the node within |tree_|. It is specifically 74 // uint32_t so that we can be sure it takes up 4 bytes when stored together 75 // with the 9-bit label (so 23 bits are allocated to the NodeID, even though 76 // it is exposed as uint32_t). If the computed size of |tree_| is 77 // larger than what can be stored within 23 bits, Build() will fail. 78 using NodeID = uint32_t; 79 80 // This is the maximum possible size of |tree_| and hence can't be a valid ID. 81 static constexpr NodeID kInvalidNodeID = (1u << 23) - 1; 82 83 static constexpr NodeID kRootID = 0; 84 85 // A node of an Aho Corasick Tree. See 86 // http://web.stanford.edu/class/archive/cs/cs166/cs166.1166/lectures/02/Small02.pdf 87 // to understand the algorithm. 88 // 89 // The algorithm is based on the idea of building a trie of all registered 90 // patterns. Each node of the tree is annotated with a set of pattern 91 // IDs that are used to report matches. 92 // 93 // The root of the trie represents an empty match. If we were looking whether 94 // any registered pattern matches a text at the beginning of the text (i.e. 95 // whether any pattern is a prefix of the text), we could just follow 96 // nodes in the trie according to the matching characters in the text. 97 // E.g., if text == "foobar", we would follow the trie from the root node 98 // to its child labeled 'f', from there to child 'o', etc. In this process we 99 // would report all pattern IDs associated with the trie nodes as matches. 100 // 101 // As we are not looking for all prefix matches but all substring matches, 102 // this algorithm would need to compare text.substr(0), text.substr(1), ... 103 // against the trie, which is in O(|text|^2). 104 // 105 // The Aho Corasick algorithm improves this runtime by using failure edges. 106 // In case we have found a partial match of length k in the text 107 // (text[i, ..., i + k - 1]) in the trie starting at the root and ending at 108 // a node at depth k, but cannot find a match in the trie for character 109 // text[i + k] at depth k + 1, we follow a failure edge. This edge 110 // corresponds to the longest proper suffix of text[i, ..., i + k - 1] that 111 // is a prefix of any registered pattern. 112 // 113 // If your brain thinks "Forget it, let's go shopping.", don't worry. 114 // Take a nap and read an introductory text on the Aho Corasick algorithm. 115 // It will make sense. Eventually. 116 117 // An edge internal to the tree. We pack the label (character we are 118 // matching on) and the destination node ID into 32 bits, to save memory. 119 // We also use these edges as a sort of generic key/value store for 120 // some special values that not all nodes will have; this also saves on 121 // memory over the otherwise obvious choice of having them as struct fields, 122 // as it means we do not to store them when they are not present. 123 struct AhoCorasickEdge { 124 // char (unsigned, so [0..255]), or a special label below. 125 uint32_t label : 9; 126 NodeID node_id : 23; 127 }; 128 129 // Node index that failure edge leads to. The failure node corresponds to 130 // the node which represents the longest proper suffix (include empty 131 // string) of the string represented by this node. Not stored if it is 132 // equal to kRootID (since that is the most common value). 133 // 134 // NOTE: Assigning |root| as the failure edge for itself doesn't strictly 135 // abide by the definition of "proper" suffix. The proper suffix of an empty 136 // string should probably be defined as null, but we assign it to the |root| 137 // to simplify the code and have the invariant that the failure edge is always 138 // defined. 139 static constexpr uint32_t kFailureNodeLabel = 0x100; 140 static constexpr uint32_t kFirstSpecialLabel = kFailureNodeLabel; 141 142 // Node index that corresponds to the longest proper suffix (including empty 143 // suffix) of this node and which also represents the end of a pattern. 144 // Does not have to exist. 145 static constexpr uint32_t kOutputLinkLabel = 0x101; 146 147 // If present, this node represents the end of a pattern. It stores the ID of 148 // the corresponding pattern (ie., it is not really a NodeID, but a 149 // MatcherStringPattern::ID). 150 static constexpr uint32_t kMatchIDLabel = 0x102; 151 152 // Used for uninitialized label slots; used so that we do not have to test for 153 // them in other ways, since we know the data will be initialized and never 154 // match any other labels. 155 static constexpr uint32_t kEmptyLabel = 0x103; 156 157 // A node in the trie, packed tightly together so that it occupies 12 bytes 158 // (both on 32- and 64-bit platforms), but aligned to at least 4 (see the 159 // comment on edges_). 160 class alignas(AhoCorasickEdge) AhoCorasickNode { 161 public: 162 AhoCorasickNode(); 163 ~AhoCorasickNode(); 164 AhoCorasickNode(AhoCorasickNode&& other); 165 AhoCorasickNode& operator=(AhoCorasickNode&& other); 166 GetEdge(uint32_t label)167 NodeID GetEdge(uint32_t label) const { 168 if (edges_capacity_ != 0) { 169 return GetEdgeNoInline(label); 170 } 171 static_assert(kNumInlineEdges == 2, "Code below needs updating"); 172 if (edges_.inline_edges[0].label == label) { 173 return edges_.inline_edges[0].node_id; 174 } 175 if (edges_.inline_edges[1].label == label) { 176 return edges_.inline_edges[1].node_id; 177 } 178 return kInvalidNodeID; 179 } 180 NodeID GetEdgeNoInline(uint32_t label) const; 181 void SetEdge(uint32_t label, NodeID node); edges()182 const AhoCorasickEdge* edges() const { 183 // NOTE: Returning edges_.inline_edges here is fine, because it's 184 // the first thing in the struct (see the comment on edges_). 185 DCHECK_EQ(0u, reinterpret_cast<uintptr_t>(edges_.inline_edges) % 186 alignof(AhoCorasickEdge)); 187 return edges_capacity_ == 0 ? edges_.inline_edges : edges_.edges; 188 } 189 failure()190 NodeID failure() const { 191 // NOTE: Even if num_edges_ == 0, we are not doing anything 192 // undefined, as we will have room for at least two edges 193 // and empty edges are set to kEmptyLabel. 194 const AhoCorasickEdge& first_edge = *edges(); 195 if (first_edge.label == kFailureNodeLabel) { 196 return first_edge.node_id; 197 } else { 198 return kRootID; 199 } 200 } 201 void SetFailure(NodeID failure); 202 SetMatchID(MatcherStringPattern::ID id)203 void SetMatchID(MatcherStringPattern::ID id) { 204 DCHECK(!IsEndOfPattern()); 205 DCHECK(id < kInvalidNodeID); // This is enforced by Build(). 206 SetEdge(kMatchIDLabel, static_cast<NodeID>(id)); 207 has_outputs_ = true; 208 } 209 210 // Returns true if this node corresponds to a pattern. IsEndOfPattern()211 bool IsEndOfPattern() const { 212 if (!has_outputs_) { 213 // Fast reject. 214 return false; 215 } 216 return GetEdge(kMatchIDLabel) != kInvalidNodeID; 217 } 218 219 // Must only be called if |IsEndOfPattern| returns true for this node. GetMatchID()220 MatcherStringPattern::ID GetMatchID() const { 221 DCHECK(IsEndOfPattern()); 222 return GetEdge(kMatchIDLabel); 223 } 224 SetOutputLink(NodeID node)225 void SetOutputLink(NodeID node) { 226 if (node != kInvalidNodeID) { 227 SetEdge(kOutputLinkLabel, node); 228 has_outputs_ = true; 229 } 230 } output_link()231 NodeID output_link() const { return GetEdge(kOutputLinkLabel); } 232 233 size_t EstimateMemoryUsage() const; num_edges()234 size_t num_edges() const { 235 if (edges_capacity_ == 0) { 236 return kNumInlineEdges - num_free_edges_; 237 } else { 238 return edges_capacity_ - num_free_edges_; 239 } 240 } 241 has_outputs()242 bool has_outputs() const { return has_outputs_; } 243 244 private: 245 // Outgoing edges of current node, including failure edge and output links. 246 // Most nodes have only one or two (or even zero) edges, not the last 247 // because many of them are leaves. Thus, we make an optimization for this 248 // common case; instead of a pointer to an edge array on the heap, we can 249 // pack two edges inline where the pointer would otherwise be. This reduces 250 // memory usage dramatically, as well as saving us a cache-line fetch. 251 // 252 // Note that even though most nodes have fewer outgoing edges, most nodes 253 // that we actually traverse will have any of them. This apparent 254 // contradiction is because we tend to spend more of our time near the root 255 // of the trie, where it is wide. This means that another layout would be 256 // possible: If we wanted to, non-inline nodes could simply store an array 257 // of 259 (256 possible characters plus the three special label types) 258 // edges, indexed directly by label type. This would use 20–50% more RAM, 259 // but also increases the speed of lookups due to removing the search loop. 260 // 261 // The nodes are generally unordered; since we typically index text, even 262 // the root will rarely be more than 20–30 wide, and at that point, it's 263 // better to just do a linear search than a binary one (which fares poorly 264 // on branch predictors). However, a special case, we put kFailureNodeLabel 265 // in the first slot if it exists (ie., is not equal to kRootID), since we 266 // need to access that label during every single node we look at during 267 // traversal. 268 // 269 // NOTE: Keep this the first member in the struct, so that inline_edges gets 270 // 4-aligned (since the class is marked as such, despite being packed. 271 // Otherwise, edges() can return an unaligned pointer marked as aligned 272 // (the unalignedness gets lost). 273 static constexpr int kNumInlineEdges = 2; 274 union { 275 // Out-of-line edge storage, having room for edges_capacity_ elements. 276 // Note that due to __attribute__((packed)) below, this pointer may be 277 // unaligned on 64-bit platforms, causing slightly less efficient 278 // access to it in some cases. 279 // This field is not a raw_ptr<> because it was filtered by the rewriter 280 // for: #union 281 RAW_PTR_EXCLUSION AhoCorasickEdge* edges; 282 283 // Inline edge storage, used if edges_capacity_ == 0. 284 AhoCorasickEdge inline_edges[kNumInlineEdges]; 285 } edges_; 286 287 // Whether we have an edge for kMatchIDLabel or kOutputLinkLabel, 288 // ie., hitting this node during traversal will create one or more 289 // matches. This is redundant, but since every single lookup during 290 // traversal needs this, it saves a few searches for us. 291 bool has_outputs_ = false; 292 293 // Number of unused left in edges_. Edges are always allocated from the 294 // beginning and never deleted; those after num_edges_ will be marked with 295 // kEmptyLabel (and have an undefined node_id). We store the number of 296 // free edges instead of the more common number of _used_ edges, to be 297 // sure that we are able to fit it in an uint8_t. num_edges() provides 298 // a useful abstraction over this. 299 uint8_t num_free_edges_ = kNumInlineEdges; 300 301 // How many edges we have allocated room for (can never be more than 302 // kEmptyLabel + 1). If equal to zero, we are not using heap storage, 303 // but instead are using inline_edges. 304 // 305 // If not equal to zero, will be a multiple of 4, so that we can use 306 // SIMD to accelerate looking for edges. 307 uint16_t edges_capacity_ = 0; 308 } __attribute__((packed)); 309 310 using SubstringPatternVector = std::vector<const MatcherStringPattern*>; 311 312 // Given the set of patterns, compute how many nodes will the corresponding 313 // Aho-Corasick tree have. Note that |patterns| need to be sorted. 314 NodeID GetTreeSize( 315 const std::vector<const MatcherStringPattern*>& patterns) const; 316 317 void BuildAhoCorasickTree(const SubstringPatternVector& patterns); 318 319 // Inserts a path for |pattern->pattern()| into the tree and adds 320 // |pattern->id()| to the set of matches. 321 void InsertPatternIntoAhoCorasickTree(const MatcherStringPattern* pattern); 322 323 void CreateFailureAndOutputEdges(); 324 325 // Adds all pattern IDs to |matches| which are a suffix of the string 326 // represented by |node|. 327 void AccumulateMatchesForNode( 328 const AhoCorasickNode* node, 329 std::set<MatcherStringPattern::ID>* matches) const; 330 331 // The nodes of a Aho-Corasick tree. 332 std::vector<AhoCorasickNode> tree_; 333 334 bool is_empty_ = true; 335 }; 336 337 } // namespace base 338 339 #endif // BASE_SUBSTRING_SET_MATCHER_SUBSTRING_SET_MATCHER_H_ 340