xref: /aosp_15_r20/external/cronet/net/base/lookup_string_in_fixed_set.h (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1 // Copyright 2015 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifndef NET_BASE_LOOKUP_STRING_IN_FIXED_SET_H_
6 #define NET_BASE_LOOKUP_STRING_IN_FIXED_SET_H_
7 
8 #include <stddef.h>
9 
10 #include <string_view>
11 
12 #include "net/base/net_export.h"
13 
14 namespace net {
15 
16 enum {
17   kDafsaNotFound = -1,  // key is not in set
18   kDafsaFound = 0,      // key is in set
19   // The following return values are used by the implementation of
20   // GetDomainAndRegistry() and are probably not generally useful.
21   kDafsaExceptionRule = 1,  // key excluded from set via exception
22   kDafsaWildcardRule = 2,   // key matched a wildcard rule
23   kDafsaPrivateRule = 4,    // key matched a private rule
24 };
25 
26 // Looks up the string |key| with length |key_length| in a fixed set of
27 // strings. The set of strings must be known at compile time. It is converted to
28 // a graph structure named a DAFSA (Deterministic Acyclic Finite State
29 // Automaton) by the script make_dafsa.py during compilation. This permits
30 // efficient (in time and space) lookup. The graph generated by make_dafsa.py
31 // takes the form of a constant byte array which should be supplied via the
32 // |graph| and |length| parameters.  The return value is kDafsaNotFound,
33 // kDafsaFound, or a bitmap consisting of one or more of kDafsaExceptionRule,
34 // kDafsaWildcardRule and kDafsaPrivateRule ORed together.
35 //
36 // TODO(nick): Replace this with FixedSetIncrementalLookup everywhere.
37 NET_EXPORT int LookupStringInFixedSet(const unsigned char* graph,
38                                       size_t length,
39                                       const char* key,
40                                       size_t key_length);
41 
42 // Looks up the longest matching suffix for |host| with length |length| in a
43 // reversed DAFSA. Partial matches must begin at a new component, i.e. host
44 // itself could match or a host part starting after a dot could match.
45 // If no match was found a value of 0 is written to |suffix_length| and the
46 // value kDafsaNotFound is returned, otherwise the length of the longest match
47 // is written to |suffix_length| and the type of the longest match is returned.
48 int LookupSuffixInReversedSet(const unsigned char* graph,
49                               size_t length,
50                               bool include_private,
51                               std::string_view host,
52                               size_t* suffix_length);
53 
54 // FixedSetIncrementalLookup provides efficient membership and prefix queries
55 // against a fixed set of strings. The set of strings must be known at compile
56 // time. The set is converted to a graph structure named a DAFSA (Deterministic
57 // Acyclic Finite State Automaton) by the script //net/tools/dafsa/make_dafsa.py
58 // during compilation. The conversion generates a C++ header file defining the
59 // encoded graph as a constant byte array. This class provides a fast, constant-
60 // space lookup operation against such byte arrays.
61 //
62 // The lookup proceeds incrementally, with input characters provided one at a
63 // time. This approach allow queries of the form: "given an input string, which
64 // prefixes of that string that appear in the fixed set?" As the matching
65 // prefixes (and their result codes) are enumerated, the most suitable match
66 // among them can be selected in a single pass.
67 //
68 // This class can also be used to perform suffix queries (instead of prefix
69 // queries) against a fixed set, so long as the DAFSA is constructed on reversed
70 // values, and the input is provided in reverse order.
71 //
72 // Example usage for simple membership query; |input| is a std::string:
73 //
74 //    FixedSetIncrementalLookup lookup(kDafsa, sizeof(kDafsa));
75 //    for (char c : input) {
76 //      if (!lookup.Advance(c))
77 //         return false;
78 //    }
79 //    return lookup.GetResultForCurrentSequence() != kDafsaNotFound;
80 //
81 // Example usage for 'find longest prefix in set with result code == 3' query:
82 //
83 //    FixedSetIncrementalLookup prefix_lookup(kDafsa, sizeof(kDafsa));
84 //    size_t longest_match_end = 0;
85 //    for (size_t i = 0; i < input.length(); ++i) {
86 //      if (!prefix_lookup.Advance(input[i]))
87 //         break;
88 //      if (prefix_lookup.GetResultForCurrentSequence() == 3)
89 //        longest_match_end = (i + 1);
90 //    }
91 //    return input.substr(0, longest_match_end);
92 //
93 class NET_EXPORT FixedSetIncrementalLookup {
94  public:
95   // Begin a lookup against the provided fixed set. |graph| and |length|
96   // describe a byte buffer generated by the make_dafsa.py script, as described
97   // in the class comment.
98   //
99   // FixedSetIncrementalLookup is initialized to a state corresponding to the
100   // empty input sequence. Calling GetResultForCurrentSequence() in the initial
101   // state would indicate whether the empty string appears in the fixed set.
102   // Characters can be added to the sequence by calling Advance(), and the
103   // lookup result can be checked after each addition by calling
104   // GetResultForCurrentSequence().
105   FixedSetIncrementalLookup(const unsigned char* graph, size_t length);
106 
107   // FixedSetIncrementalLookup is copyable so that callers can save/restore
108   // their position in the search. This is for cases where branching or
109   // backtracking might be required (e.g. to probe for the presence of a
110   // designated wildcard character).
111   FixedSetIncrementalLookup(const FixedSetIncrementalLookup&);
112   FixedSetIncrementalLookup& operator=(const FixedSetIncrementalLookup&);
113 
114   ~FixedSetIncrementalLookup();
115 
116   // Advance the query by adding a character to the input sequence. |input| can
117   // be any char value, but only ASCII characters will ever result in matches,
118   // since the fixed set itself is limited to ASCII strings.
119   //
120   // Returns true if the resulting input sequence either appears in the fixed
121   // set itself, or is a prefix of some longer string in the fixed set. Returns
122   // false otherwise, implying that the graph is exhausted and
123   // GetResultForCurrentSequence() will return kDafsaNotFound.
124   //
125   // Once Advance() has returned false, the caller can safely stop feeding more
126   // characters, as subsequent calls to Advance() will return false and have no
127   // effect.
128   bool Advance(char input);
129 
130   // Returns the result code corresponding to the input sequence provided thus
131   // far to Advance().
132   //
133   // If the sequence does not appear in the fixed set, the return value is
134   // kDafsaNotFound. Otherwise, the value is a non-negative integer (currently
135   // limited to 0-7) corresponding to the result code for that string, as listed
136   // in the .gperf file from which the DAFSA was generated. For
137   // GetDomainAndRegistry DAFSAs, these values should be interpreted as a
138   // bitmask of kDafsaExceptionRule, kDafsaWildcardRule, and kDafsaPrivateRule.
139   //
140   // It is okay to call this function, and then extend the sequence further by
141   // calling Advance().
142   int GetResultForCurrentSequence() const;
143 
144  private:
145   // Pointer to the current position in the graph indicating the current state
146   // of the automaton, or nullptr if the graph is exhausted.
147   const unsigned char* pos_;
148 
149   // Pointer just past the end of the graph. |pos_| should never get here. This
150   // is used only in DCHECKs.
151   const unsigned char* end_;
152 
153   // Contains the current decoder state. If true, |pos_| points to a label
154   // character or a return code. If false, |pos_| points to a sequence of
155   // offsets that indicate the child nodes of the current state.
156   bool pos_is_label_character_ = false;
157 };
158 
159 }  // namespace net
160 
161 #endif  // NET_BASE_LOOKUP_STRING_IN_FIXED_SET_H_
162