xref: /aosp_15_r20/external/libchrome/base/strings/string_tokenizer.h (revision 635a864187cb8b6c713ff48b7e790a6b21769273)
1*635a8641SAndroid Build Coastguard Worker // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2*635a8641SAndroid Build Coastguard Worker // Use of this source code is governed by a BSD-style license that can be
3*635a8641SAndroid Build Coastguard Worker // found in the LICENSE file.
4*635a8641SAndroid Build Coastguard Worker 
5*635a8641SAndroid Build Coastguard Worker #ifndef BASE_STRINGS_STRING_TOKENIZER_H_
6*635a8641SAndroid Build Coastguard Worker #define BASE_STRINGS_STRING_TOKENIZER_H_
7*635a8641SAndroid Build Coastguard Worker 
8*635a8641SAndroid Build Coastguard Worker #include <algorithm>
9*635a8641SAndroid Build Coastguard Worker #include <string>
10*635a8641SAndroid Build Coastguard Worker 
11*635a8641SAndroid Build Coastguard Worker #include "base/strings/string_piece.h"
12*635a8641SAndroid Build Coastguard Worker 
13*635a8641SAndroid Build Coastguard Worker namespace base {
14*635a8641SAndroid Build Coastguard Worker 
// StringTokenizerT is a simple string tokenizer class.  It works like an
// iterator that with each step (see the GetNext method) updates members that
// refer to the next token in the input string.  The user may optionally
// configure the tokenizer to return delimiters.
19*635a8641SAndroid Build Coastguard Worker //
20*635a8641SAndroid Build Coastguard Worker // EXAMPLE 1:
21*635a8641SAndroid Build Coastguard Worker //
22*635a8641SAndroid Build Coastguard Worker //   char input[] = "this is a test";
23*635a8641SAndroid Build Coastguard Worker //   CStringTokenizer t(input, input + strlen(input), " ");
24*635a8641SAndroid Build Coastguard Worker //   while (t.GetNext()) {
25*635a8641SAndroid Build Coastguard Worker //     printf("%s\n", t.token().c_str());
26*635a8641SAndroid Build Coastguard Worker //   }
27*635a8641SAndroid Build Coastguard Worker //
28*635a8641SAndroid Build Coastguard Worker // Output:
29*635a8641SAndroid Build Coastguard Worker //
30*635a8641SAndroid Build Coastguard Worker //   this
31*635a8641SAndroid Build Coastguard Worker //   is
32*635a8641SAndroid Build Coastguard Worker //   a
33*635a8641SAndroid Build Coastguard Worker //   test
34*635a8641SAndroid Build Coastguard Worker //
35*635a8641SAndroid Build Coastguard Worker //
36*635a8641SAndroid Build Coastguard Worker // EXAMPLE 2:
37*635a8641SAndroid Build Coastguard Worker //
38*635a8641SAndroid Build Coastguard Worker //   std::string input = "no-cache=\"foo, bar\", private";
39*635a8641SAndroid Build Coastguard Worker //   StringTokenizer t(input, ", ");
40*635a8641SAndroid Build Coastguard Worker //   t.set_quote_chars("\"");
41*635a8641SAndroid Build Coastguard Worker //   while (t.GetNext()) {
42*635a8641SAndroid Build Coastguard Worker //     printf("%s\n", t.token().c_str());
43*635a8641SAndroid Build Coastguard Worker //   }
44*635a8641SAndroid Build Coastguard Worker //
45*635a8641SAndroid Build Coastguard Worker // Output:
46*635a8641SAndroid Build Coastguard Worker //
47*635a8641SAndroid Build Coastguard Worker //   no-cache="foo, bar"
48*635a8641SAndroid Build Coastguard Worker //   private
49*635a8641SAndroid Build Coastguard Worker //
50*635a8641SAndroid Build Coastguard Worker //
51*635a8641SAndroid Build Coastguard Worker // EXAMPLE 3:
52*635a8641SAndroid Build Coastguard Worker //
53*635a8641SAndroid Build Coastguard Worker //   bool next_is_option = false, next_is_value = false;
54*635a8641SAndroid Build Coastguard Worker //   std::string input = "text/html; charset=UTF-8; foo=bar";
55*635a8641SAndroid Build Coastguard Worker //   StringTokenizer t(input, "; =");
56*635a8641SAndroid Build Coastguard Worker //   t.set_options(StringTokenizer::RETURN_DELIMS);
57*635a8641SAndroid Build Coastguard Worker //   while (t.GetNext()) {
58*635a8641SAndroid Build Coastguard Worker //     if (t.token_is_delim()) {
59*635a8641SAndroid Build Coastguard Worker //       switch (*t.token_begin()) {
60*635a8641SAndroid Build Coastguard Worker //         case ';':
61*635a8641SAndroid Build Coastguard Worker //           next_is_option = true;
62*635a8641SAndroid Build Coastguard Worker //           break;
63*635a8641SAndroid Build Coastguard Worker //         case '=':
64*635a8641SAndroid Build Coastguard Worker //           next_is_value = true;
65*635a8641SAndroid Build Coastguard Worker //           break;
66*635a8641SAndroid Build Coastguard Worker //       }
67*635a8641SAndroid Build Coastguard Worker //     } else {
68*635a8641SAndroid Build Coastguard Worker //       const char* label;
69*635a8641SAndroid Build Coastguard Worker //       if (next_is_option) {
70*635a8641SAndroid Build Coastguard Worker //         label = "option-name";
71*635a8641SAndroid Build Coastguard Worker //         next_is_option = false;
72*635a8641SAndroid Build Coastguard Worker //       } else if (next_is_value) {
73*635a8641SAndroid Build Coastguard Worker //         label = "option-value";
74*635a8641SAndroid Build Coastguard Worker //         next_is_value = false;
75*635a8641SAndroid Build Coastguard Worker //       } else {
76*635a8641SAndroid Build Coastguard Worker //         label = "mime-type";
77*635a8641SAndroid Build Coastguard Worker //       }
78*635a8641SAndroid Build Coastguard Worker //       printf("%s: %s\n", label, t.token().c_str());
79*635a8641SAndroid Build Coastguard Worker //     }
80*635a8641SAndroid Build Coastguard Worker //   }
81*635a8641SAndroid Build Coastguard Worker //
82*635a8641SAndroid Build Coastguard Worker //
83*635a8641SAndroid Build Coastguard Worker template <class str, class const_iterator>
84*635a8641SAndroid Build Coastguard Worker class StringTokenizerT {
85*635a8641SAndroid Build Coastguard Worker  public:
86*635a8641SAndroid Build Coastguard Worker   typedef typename str::value_type char_type;
87*635a8641SAndroid Build Coastguard Worker 
88*635a8641SAndroid Build Coastguard Worker   // Options that may be pass to set_options()
89*635a8641SAndroid Build Coastguard Worker   enum {
90*635a8641SAndroid Build Coastguard Worker     // Specifies the delimiters should be returned as tokens
91*635a8641SAndroid Build Coastguard Worker     RETURN_DELIMS = 1 << 0,
92*635a8641SAndroid Build Coastguard Worker   };
93*635a8641SAndroid Build Coastguard Worker 
94*635a8641SAndroid Build Coastguard Worker   // The string object must live longer than the tokenizer. In particular, this
95*635a8641SAndroid Build Coastguard Worker   // should not be constructed with a temporary. The deleted rvalue constructor
96*635a8641SAndroid Build Coastguard Worker   // blocks the most obvious instances of this (e.g. passing a string literal to
97*635a8641SAndroid Build Coastguard Worker   // the constructor), but caution must still be exercised.
StringTokenizerT(const str & string,const str & delims)98*635a8641SAndroid Build Coastguard Worker   StringTokenizerT(const str& string,
99*635a8641SAndroid Build Coastguard Worker                    const str& delims) {
100*635a8641SAndroid Build Coastguard Worker     Init(string.begin(), string.end(), delims);
101*635a8641SAndroid Build Coastguard Worker   }
102*635a8641SAndroid Build Coastguard Worker 
103*635a8641SAndroid Build Coastguard Worker   // Don't allow temporary strings to be used with string tokenizer, since
104*635a8641SAndroid Build Coastguard Worker   // Init() would otherwise save iterators to a temporary string.
105*635a8641SAndroid Build Coastguard Worker   StringTokenizerT(str&&, const str& delims) = delete;
106*635a8641SAndroid Build Coastguard Worker 
StringTokenizerT(const_iterator string_begin,const_iterator string_end,const str & delims)107*635a8641SAndroid Build Coastguard Worker   StringTokenizerT(const_iterator string_begin,
108*635a8641SAndroid Build Coastguard Worker                    const_iterator string_end,
109*635a8641SAndroid Build Coastguard Worker                    const str& delims) {
110*635a8641SAndroid Build Coastguard Worker     Init(string_begin, string_end, delims);
111*635a8641SAndroid Build Coastguard Worker   }
112*635a8641SAndroid Build Coastguard Worker 
113*635a8641SAndroid Build Coastguard Worker   // Set the options for this tokenizer.  By default, this is 0.
set_options(int options)114*635a8641SAndroid Build Coastguard Worker   void set_options(int options) { options_ = options; }
115*635a8641SAndroid Build Coastguard Worker 
116*635a8641SAndroid Build Coastguard Worker   // Set the characters to regard as quotes.  By default, this is empty.  When
117*635a8641SAndroid Build Coastguard Worker   // a quote char is encountered, the tokenizer will switch into a mode where
118*635a8641SAndroid Build Coastguard Worker   // it ignores delimiters that it finds.  It switches out of this mode once it
119*635a8641SAndroid Build Coastguard Worker   // finds another instance of the quote char.  If a backslash is encountered
120*635a8641SAndroid Build Coastguard Worker   // within a quoted string, then the next character is skipped.
set_quote_chars(const str & quotes)121*635a8641SAndroid Build Coastguard Worker   void set_quote_chars(const str& quotes) { quotes_ = quotes; }
122*635a8641SAndroid Build Coastguard Worker 
123*635a8641SAndroid Build Coastguard Worker   // Call this method to advance the tokenizer to the next delimiter.  This
124*635a8641SAndroid Build Coastguard Worker   // returns false if the tokenizer is complete.  This method must be called
125*635a8641SAndroid Build Coastguard Worker   // before calling any of the token* methods.
GetNext()126*635a8641SAndroid Build Coastguard Worker   bool GetNext() {
127*635a8641SAndroid Build Coastguard Worker     if (quotes_.empty() && options_ == 0)
128*635a8641SAndroid Build Coastguard Worker       return QuickGetNext();
129*635a8641SAndroid Build Coastguard Worker     else
130*635a8641SAndroid Build Coastguard Worker       return FullGetNext();
131*635a8641SAndroid Build Coastguard Worker   }
132*635a8641SAndroid Build Coastguard Worker 
133*635a8641SAndroid Build Coastguard Worker   // Start iterating through tokens from the beginning of the string.
Reset()134*635a8641SAndroid Build Coastguard Worker   void Reset() {
135*635a8641SAndroid Build Coastguard Worker     token_end_ = start_pos_;
136*635a8641SAndroid Build Coastguard Worker   }
137*635a8641SAndroid Build Coastguard Worker 
138*635a8641SAndroid Build Coastguard Worker   // Returns true if token is a delimiter.  When the tokenizer is constructed
139*635a8641SAndroid Build Coastguard Worker   // with the RETURN_DELIMS option, this method can be used to check if the
140*635a8641SAndroid Build Coastguard Worker   // returned token is actually a delimiter.
token_is_delim()141*635a8641SAndroid Build Coastguard Worker   bool token_is_delim() const { return token_is_delim_; }
142*635a8641SAndroid Build Coastguard Worker 
143*635a8641SAndroid Build Coastguard Worker   // If GetNext() returned true, then these methods may be used to read the
144*635a8641SAndroid Build Coastguard Worker   // value of the token.
token_begin()145*635a8641SAndroid Build Coastguard Worker   const_iterator token_begin() const { return token_begin_; }
token_end()146*635a8641SAndroid Build Coastguard Worker   const_iterator token_end() const { return token_end_; }
token()147*635a8641SAndroid Build Coastguard Worker   str token() const { return str(token_begin_, token_end_); }
token_piece()148*635a8641SAndroid Build Coastguard Worker   BasicStringPiece<str> token_piece() const {
149*635a8641SAndroid Build Coastguard Worker     return BasicStringPiece<str>(&*token_begin_,
150*635a8641SAndroid Build Coastguard Worker                                  std::distance(token_begin_, token_end_));
151*635a8641SAndroid Build Coastguard Worker   }
152*635a8641SAndroid Build Coastguard Worker 
153*635a8641SAndroid Build Coastguard Worker  private:
Init(const_iterator string_begin,const_iterator string_end,const str & delims)154*635a8641SAndroid Build Coastguard Worker   void Init(const_iterator string_begin,
155*635a8641SAndroid Build Coastguard Worker             const_iterator string_end,
156*635a8641SAndroid Build Coastguard Worker             const str& delims) {
157*635a8641SAndroid Build Coastguard Worker     start_pos_ = string_begin;
158*635a8641SAndroid Build Coastguard Worker     token_begin_ = string_begin;
159*635a8641SAndroid Build Coastguard Worker     token_end_ = string_begin;
160*635a8641SAndroid Build Coastguard Worker     end_ = string_end;
161*635a8641SAndroid Build Coastguard Worker     delims_ = delims;
162*635a8641SAndroid Build Coastguard Worker     options_ = 0;
163*635a8641SAndroid Build Coastguard Worker     token_is_delim_ = false;
164*635a8641SAndroid Build Coastguard Worker   }
165*635a8641SAndroid Build Coastguard Worker 
166*635a8641SAndroid Build Coastguard Worker   // Implementation of GetNext() for when we have no quote characters. We have
167*635a8641SAndroid Build Coastguard Worker   // two separate implementations because AdvanceOne() is a hot spot in large
168*635a8641SAndroid Build Coastguard Worker   // text files with large tokens.
QuickGetNext()169*635a8641SAndroid Build Coastguard Worker   bool QuickGetNext() {
170*635a8641SAndroid Build Coastguard Worker     token_is_delim_ = false;
171*635a8641SAndroid Build Coastguard Worker     for (;;) {
172*635a8641SAndroid Build Coastguard Worker       token_begin_ = token_end_;
173*635a8641SAndroid Build Coastguard Worker       if (token_end_ == end_)
174*635a8641SAndroid Build Coastguard Worker         return false;
175*635a8641SAndroid Build Coastguard Worker       ++token_end_;
176*635a8641SAndroid Build Coastguard Worker       if (delims_.find(*token_begin_) == str::npos)
177*635a8641SAndroid Build Coastguard Worker         break;
178*635a8641SAndroid Build Coastguard Worker       // else skip over delimiter.
179*635a8641SAndroid Build Coastguard Worker     }
180*635a8641SAndroid Build Coastguard Worker     while (token_end_ != end_ && delims_.find(*token_end_) == str::npos)
181*635a8641SAndroid Build Coastguard Worker       ++token_end_;
182*635a8641SAndroid Build Coastguard Worker     return true;
183*635a8641SAndroid Build Coastguard Worker   }
184*635a8641SAndroid Build Coastguard Worker 
185*635a8641SAndroid Build Coastguard Worker   // Implementation of GetNext() for when we have to take quotes into account.
FullGetNext()186*635a8641SAndroid Build Coastguard Worker   bool FullGetNext() {
187*635a8641SAndroid Build Coastguard Worker     AdvanceState state;
188*635a8641SAndroid Build Coastguard Worker     token_is_delim_ = false;
189*635a8641SAndroid Build Coastguard Worker     for (;;) {
190*635a8641SAndroid Build Coastguard Worker       token_begin_ = token_end_;
191*635a8641SAndroid Build Coastguard Worker       if (token_end_ == end_)
192*635a8641SAndroid Build Coastguard Worker         return false;
193*635a8641SAndroid Build Coastguard Worker       ++token_end_;
194*635a8641SAndroid Build Coastguard Worker       if (AdvanceOne(&state, *token_begin_))
195*635a8641SAndroid Build Coastguard Worker         break;
196*635a8641SAndroid Build Coastguard Worker       if (options_ & RETURN_DELIMS) {
197*635a8641SAndroid Build Coastguard Worker         token_is_delim_ = true;
198*635a8641SAndroid Build Coastguard Worker         return true;
199*635a8641SAndroid Build Coastguard Worker       }
200*635a8641SAndroid Build Coastguard Worker       // else skip over delimiter.
201*635a8641SAndroid Build Coastguard Worker     }
202*635a8641SAndroid Build Coastguard Worker     while (token_end_ != end_ && AdvanceOne(&state, *token_end_))
203*635a8641SAndroid Build Coastguard Worker       ++token_end_;
204*635a8641SAndroid Build Coastguard Worker     return true;
205*635a8641SAndroid Build Coastguard Worker   }
206*635a8641SAndroid Build Coastguard Worker 
IsDelim(char_type c)207*635a8641SAndroid Build Coastguard Worker   bool IsDelim(char_type c) const {
208*635a8641SAndroid Build Coastguard Worker     return delims_.find(c) != str::npos;
209*635a8641SAndroid Build Coastguard Worker   }
210*635a8641SAndroid Build Coastguard Worker 
IsQuote(char_type c)211*635a8641SAndroid Build Coastguard Worker   bool IsQuote(char_type c) const {
212*635a8641SAndroid Build Coastguard Worker     return quotes_.find(c) != str::npos;
213*635a8641SAndroid Build Coastguard Worker   }
214*635a8641SAndroid Build Coastguard Worker 
215*635a8641SAndroid Build Coastguard Worker   struct AdvanceState {
216*635a8641SAndroid Build Coastguard Worker     bool in_quote;
217*635a8641SAndroid Build Coastguard Worker     bool in_escape;
218*635a8641SAndroid Build Coastguard Worker     char_type quote_char;
AdvanceStateAdvanceState219*635a8641SAndroid Build Coastguard Worker     AdvanceState() : in_quote(false), in_escape(false), quote_char('\0') {}
220*635a8641SAndroid Build Coastguard Worker   };
221*635a8641SAndroid Build Coastguard Worker 
222*635a8641SAndroid Build Coastguard Worker   // Returns true if a delimiter was not hit.
AdvanceOne(AdvanceState * state,char_type c)223*635a8641SAndroid Build Coastguard Worker   bool AdvanceOne(AdvanceState* state, char_type c) {
224*635a8641SAndroid Build Coastguard Worker     if (state->in_quote) {
225*635a8641SAndroid Build Coastguard Worker       if (state->in_escape) {
226*635a8641SAndroid Build Coastguard Worker         state->in_escape = false;
227*635a8641SAndroid Build Coastguard Worker       } else if (c == '\\') {
228*635a8641SAndroid Build Coastguard Worker         state->in_escape = true;
229*635a8641SAndroid Build Coastguard Worker       } else if (c == state->quote_char) {
230*635a8641SAndroid Build Coastguard Worker         state->in_quote = false;
231*635a8641SAndroid Build Coastguard Worker       }
232*635a8641SAndroid Build Coastguard Worker     } else {
233*635a8641SAndroid Build Coastguard Worker       if (IsDelim(c))
234*635a8641SAndroid Build Coastguard Worker         return false;
235*635a8641SAndroid Build Coastguard Worker       state->in_quote = IsQuote(state->quote_char = c);
236*635a8641SAndroid Build Coastguard Worker     }
237*635a8641SAndroid Build Coastguard Worker     return true;
238*635a8641SAndroid Build Coastguard Worker   }
239*635a8641SAndroid Build Coastguard Worker 
240*635a8641SAndroid Build Coastguard Worker   const_iterator start_pos_;
241*635a8641SAndroid Build Coastguard Worker   const_iterator token_begin_;
242*635a8641SAndroid Build Coastguard Worker   const_iterator token_end_;
243*635a8641SAndroid Build Coastguard Worker   const_iterator end_;
244*635a8641SAndroid Build Coastguard Worker   str delims_;
245*635a8641SAndroid Build Coastguard Worker   str quotes_;
246*635a8641SAndroid Build Coastguard Worker   int options_;
247*635a8641SAndroid Build Coastguard Worker   bool token_is_delim_;
248*635a8641SAndroid Build Coastguard Worker };
249*635a8641SAndroid Build Coastguard Worker 
250*635a8641SAndroid Build Coastguard Worker typedef StringTokenizerT<std::string, std::string::const_iterator>
251*635a8641SAndroid Build Coastguard Worker     StringTokenizer;
252*635a8641SAndroid Build Coastguard Worker typedef StringTokenizerT<std::wstring, std::wstring::const_iterator>
253*635a8641SAndroid Build Coastguard Worker     WStringTokenizer;
254*635a8641SAndroid Build Coastguard Worker typedef StringTokenizerT<std::string, const char*> CStringTokenizer;
255*635a8641SAndroid Build Coastguard Worker 
256*635a8641SAndroid Build Coastguard Worker }  // namespace base
257*635a8641SAndroid Build Coastguard Worker 
258*635a8641SAndroid Build Coastguard Worker #endif  // BASE_STRINGS_STRING_TOKENIZER_H_
259