1 // Copyright 2015 The Chromium Authors 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #ifndef NET_BASE_LOOKUP_STRING_IN_FIXED_SET_H_ 6 #define NET_BASE_LOOKUP_STRING_IN_FIXED_SET_H_ 7 8 #include <stddef.h> 9 10 #include <string_view> 11 12 #include "net/base/net_export.h" 13 14 namespace net { 15 16 enum { 17 kDafsaNotFound = -1, // key is not in set 18 kDafsaFound = 0, // key is in set 19 // The following return values are used by the implementation of 20 // GetDomainAndRegistry() and are probably not generally useful. 21 kDafsaExceptionRule = 1, // key excluded from set via exception 22 kDafsaWildcardRule = 2, // key matched a wildcard rule 23 kDafsaPrivateRule = 4, // key matched a private rule 24 }; 25 26 // Looks up the string |key| with length |key_length| in a fixed set of 27 // strings. The set of strings must be known at compile time. It is converted to 28 // a graph structure named a DAFSA (Deterministic Acyclic Finite State 29 // Automaton) by the script make_dafsa.py during compilation. This permits 30 // efficient (in time and space) lookup. The graph generated by make_dafsa.py 31 // takes the form of a constant byte array which should be supplied via the 32 // |graph| and |length| parameters. The return value is kDafsaNotFound, 33 // kDafsaFound, or a bitmap consisting of one or more of kDafsaExceptionRule, 34 // kDafsaWildcardRule and kDafsaPrivateRule ORed together. 35 // 36 // TODO(nick): Replace this with FixedSetIncrementalLookup everywhere. 37 NET_EXPORT int LookupStringInFixedSet(const unsigned char* graph, 38 size_t length, 39 const char* key, 40 size_t key_length); 41 42 // Looks up the longest matching suffix for |host| with length |length| in a 43 // reversed DAFSA. Partial matches must begin at a new component, i.e. host 44 // itself could match or a host part starting after a dot could match. 45 // If no match was found a value of 0 is written to |suffix_length| and the 46 // value kDafsaNotFound is returned, otherwise the length of the longest match 47 // is written to |suffix_length| and the type of the longest match is returned. 48 int LookupSuffixInReversedSet(const unsigned char* graph, 49 size_t length, 50 bool include_private, 51 std::string_view host, 52 size_t* suffix_length); 53 54 // FixedSetIncrementalLookup provides efficient membership and prefix queries 55 // against a fixed set of strings. The set of strings must be known at compile 56 // time. The set is converted to a graph structure named a DAFSA (Deterministic 57 // Acyclic Finite State Automaton) by the script //net/tools/dafsa/make_dafsa.py 58 // during compilation. The conversion generates a C++ header file defining the 59 // encoded graph as a constant byte array. This class provides a fast, constant- 60 // space lookup operation against such byte arrays. 61 // 62 // The lookup proceeds incrementally, with input characters provided one at a 63 // time. This approach allow queries of the form: "given an input string, which 64 // prefixes of that string that appear in the fixed set?" As the matching 65 // prefixes (and their result codes) are enumerated, the most suitable match 66 // among them can be selected in a single pass. 67 // 68 // This class can also be used to perform suffix queries (instead of prefix 69 // queries) against a fixed set, so long as the DAFSA is constructed on reversed 70 // values, and the input is provided in reverse order. 71 // 72 // Example usage for simple membership query; |input| is a std::string: 73 // 74 // FixedSetIncrementalLookup lookup(kDafsa, sizeof(kDafsa)); 75 // for (char c : input) { 76 // if (!lookup.Advance(c)) 77 // return false; 78 // } 79 // return lookup.GetResultForCurrentSequence() != kDafsaNotFound; 80 // 81 // Example usage for 'find longest prefix in set with result code == 3' query: 82 // 83 // FixedSetIncrementalLookup prefix_lookup(kDafsa, sizeof(kDafsa)); 84 // size_t longest_match_end = 0; 85 // for (size_t i = 0; i < input.length(); ++i) { 86 // if (!prefix_lookup.Advance(input[i])) 87 // break; 88 // if (prefix_lookup.GetResultForCurrentSequence() == 3) 89 // longest_match_end = (i + 1); 90 // } 91 // return input.substr(0, longest_match_end); 92 // 93 class NET_EXPORT FixedSetIncrementalLookup { 94 public: 95 // Begin a lookup against the provided fixed set. |graph| and |length| 96 // describe a byte buffer generated by the make_dafsa.py script, as described 97 // in the class comment. 98 // 99 // FixedSetIncrementalLookup is initialized to a state corresponding to the 100 // empty input sequence. Calling GetResultForCurrentSequence() in the initial 101 // state would indicate whether the empty string appears in the fixed set. 102 // Characters can be added to the sequence by calling Advance(), and the 103 // lookup result can be checked after each addition by calling 104 // GetResultForCurrentSequence(). 105 FixedSetIncrementalLookup(const unsigned char* graph, size_t length); 106 107 // FixedSetIncrementalLookup is copyable so that callers can save/restore 108 // their position in the search. This is for cases where branching or 109 // backtracking might be required (e.g. to probe for the presence of a 110 // designated wildcard character). 111 FixedSetIncrementalLookup(const FixedSetIncrementalLookup&); 112 FixedSetIncrementalLookup& operator=(const FixedSetIncrementalLookup&); 113 114 ~FixedSetIncrementalLookup(); 115 116 // Advance the query by adding a character to the input sequence. |input| can 117 // be any char value, but only ASCII characters will ever result in matches, 118 // since the fixed set itself is limited to ASCII strings. 119 // 120 // Returns true if the resulting input sequence either appears in the fixed 121 // set itself, or is a prefix of some longer string in the fixed set. Returns 122 // false otherwise, implying that the graph is exhausted and 123 // GetResultForCurrentSequence() will return kDafsaNotFound. 124 // 125 // Once Advance() has returned false, the caller can safely stop feeding more 126 // characters, as subsequent calls to Advance() will return false and have no 127 // effect. 128 bool Advance(char input); 129 130 // Returns the result code corresponding to the input sequence provided thus 131 // far to Advance(). 132 // 133 // If the sequence does not appear in the fixed set, the return value is 134 // kDafsaNotFound. Otherwise, the value is a non-negative integer (currently 135 // limited to 0-7) corresponding to the result code for that string, as listed 136 // in the .gperf file from which the DAFSA was generated. For 137 // GetDomainAndRegistry DAFSAs, these values should be interpreted as a 138 // bitmask of kDafsaExceptionRule, kDafsaWildcardRule, and kDafsaPrivateRule. 139 // 140 // It is okay to call this function, and then extend the sequence further by 141 // calling Advance(). 142 int GetResultForCurrentSequence() const; 143 144 private: 145 // Pointer to the current position in the graph indicating the current state 146 // of the automaton, or nullptr if the graph is exhausted. 147 const unsigned char* pos_; 148 149 // Pointer just past the end of the graph. |pos_| should never get here. This 150 // is used only in DCHECKs. 151 const unsigned char* end_; 152 153 // Contains the current decoder state. If true, |pos_| points to a label 154 // character or a return code. If false, |pos_| points to a sequence of 155 // offsets that indicate the child nodes of the current state. 156 bool pos_is_label_character_ = false; 157 }; 158 159 } // namespace net 160 161 #endif // NET_BASE_LOOKUP_STRING_IN_FIXED_SET_H_ 162