xref: /aosp_15_r20/external/icing/icing/util/character-iterator.cc (revision 8b6cd535a057e39b3b86660c4aa06c99747c2136)
1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "icing/util/character-iterator.h"
16 
17 #include <string_view>
18 
19 #include "icing/util/i18n-utils.h"
20 #include "unicode/utypes.h"
21 
22 namespace icing {
23 namespace lib {
24 
25 namespace {
26 
27 // Returns the lead byte of the UTF-8 character that includes the byte at
28 // current_byte_index within it.
GetUTF8StartPosition(std::string_view text,int current_byte_index)29 int GetUTF8StartPosition(std::string_view text, int current_byte_index) {
30   while (!i18n_utils::IsLeadUtf8Byte(text[current_byte_index])) {
31     --current_byte_index;
32   }
33   return current_byte_index;
34 }
35 
36 }  // namespace
37 
GetCurrentChar() const38 UChar32 CharacterIterator::GetCurrentChar() const {
39   if (cached_current_char_ == i18n_utils::kInvalidUChar32) {
40     // Our indices point to the right character, we just need to read that
41     // character. No need to worry about an error. If GetUChar32At fails, then
42     // current_char will be i18n_utils::kInvalidUChar32.
43     cached_current_char_ =
44         i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
45   }
46   return cached_current_char_;
47 }
48 
MoveToUtf8(int desired_utf8_index)49 bool CharacterIterator::MoveToUtf8(int desired_utf8_index) {
50   return (desired_utf8_index > utf8_index_) ? AdvanceToUtf8(desired_utf8_index)
51                                             : RewindToUtf8(desired_utf8_index);
52 }
53 
AdvanceToUtf8(int desired_utf8_index)54 bool CharacterIterator::AdvanceToUtf8(int desired_utf8_index) {
55   ResetToStartIfNecessary();
56 
57   if (desired_utf8_index > text_.length()) {
58     // Enforce the requirement.
59     return false;
60   }
61   // Need to work forwards.
62   UChar32 uchar32 = cached_current_char_;
63   while (utf8_index_ < desired_utf8_index) {
64     uchar32 =
65         i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
66     if (uchar32 == i18n_utils::kInvalidUChar32) {
67       // Unable to retrieve a valid UTF-32 character at the previous position.
68       cached_current_char_ = i18n_utils::kInvalidUChar32;
69       return false;
70     }
71     int utf8_length = i18n_utils::GetUtf8Length(uchar32);
72     if (utf8_index_ + utf8_length > desired_utf8_index) {
73       // Ah! Don't go too far!
74       break;
75     }
76     utf8_index_ += utf8_length;
77     utf16_index_ += i18n_utils::GetUtf16Length(uchar32);
78     ++utf32_index_;
79   }
80   cached_current_char_ =
81       i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
82   return true;
83 }
84 
RewindToUtf8(int desired_utf8_index)85 bool CharacterIterator::RewindToUtf8(int desired_utf8_index) {
86   if (desired_utf8_index < 0) {
87     // Enforce the requirement.
88     return false;
89   }
90   // Need to work backwards.
91   UChar32 uchar32 = cached_current_char_;
92   while (utf8_index_ > desired_utf8_index) {
93     int utf8_index = utf8_index_ - 1;
94     utf8_index = GetUTF8StartPosition(text_, utf8_index);
95     if (utf8_index < 0) {
96       // Somehow, there wasn't a single UTF-8 lead byte at
97       // requested_byte_index or an earlier byte.
98       cached_current_char_ = i18n_utils::kInvalidUChar32;
99       return false;
100     }
101     // We've found the start of a unicode char!
102     uchar32 =
103         i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index);
104     int expected_length = utf8_index_ - utf8_index;
105     if (uchar32 == i18n_utils::kInvalidUChar32 ||
106         expected_length != i18n_utils::GetUtf8Length(uchar32)) {
107       // Either unable to retrieve a valid UTF-32 character at the previous
108       // position or we skipped past an invalid sequence while seeking the
109       // previous start position.
110       cached_current_char_ = i18n_utils::kInvalidUChar32;
111       return false;
112     }
113     cached_current_char_ = uchar32;
114     utf8_index_ = utf8_index;
115     utf16_index_ -= i18n_utils::GetUtf16Length(uchar32);
116     --utf32_index_;
117   }
118   return true;
119 }
120 
MoveToUtf16(int desired_utf16_index)121 bool CharacterIterator::MoveToUtf16(int desired_utf16_index) {
122   return (desired_utf16_index > utf16_index_)
123              ? AdvanceToUtf16(desired_utf16_index)
124              : RewindToUtf16(desired_utf16_index);
125 }
126 
AdvanceToUtf16(int desired_utf16_index)127 bool CharacterIterator::AdvanceToUtf16(int desired_utf16_index) {
128   ResetToStartIfNecessary();
129 
130   UChar32 uchar32 = cached_current_char_;
131   while (utf16_index_ < desired_utf16_index) {
132     uchar32 =
133         i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
134     if (uchar32 == i18n_utils::kInvalidUChar32) {
135       // Unable to retrieve a valid UTF-32 character at the previous position.
136       cached_current_char_ = i18n_utils::kInvalidUChar32;
137       return false;
138     }
139     int utf16_length = i18n_utils::GetUtf16Length(uchar32);
140     if (utf16_index_ + utf16_length > desired_utf16_index) {
141       // Ah! Don't go too far!
142       break;
143     }
144     int utf8_length = i18n_utils::GetUtf8Length(uchar32);
145     if (utf8_index_ + utf8_length > text_.length()) {
146       // Enforce the requirement.
147       cached_current_char_ = i18n_utils::kInvalidUChar32;
148       return false;
149     }
150     utf8_index_ += utf8_length;
151     utf16_index_ += utf16_length;
152     ++utf32_index_;
153   }
154   cached_current_char_ =
155       i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
156   return true;
157 }
158 
RewindToUtf16(int desired_utf16_index)159 bool CharacterIterator::RewindToUtf16(int desired_utf16_index) {
160   if (desired_utf16_index < 0) {
161     return false;
162   }
163   UChar32 uchar32 = cached_current_char_;
164   while (utf16_index_ > desired_utf16_index) {
165     int utf8_index = utf8_index_ - 1;
166     utf8_index = GetUTF8StartPosition(text_, utf8_index);
167     if (utf8_index < 0) {
168       // Somehow, there wasn't a single UTF-8 lead byte at
169       // requested_byte_index or an earlier byte.
170       cached_current_char_ = i18n_utils::kInvalidUChar32;
171       return false;
172     }
173     // We've found the start of a unicode char!
174     uchar32 =
175         i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index);
176     int expected_length = utf8_index_ - utf8_index;
177     if (uchar32 == i18n_utils::kInvalidUChar32 ||
178         expected_length != i18n_utils::GetUtf8Length(uchar32)) {
179       // Either unable to retrieve a valid UTF-32 character at the previous
180       // position or we skipped past an invalid sequence while seeking the
181       // previous start position.
182       cached_current_char_ = i18n_utils::kInvalidUChar32;
183       return false;
184     }
185     cached_current_char_ = uchar32;
186     utf8_index_ = utf8_index;
187     utf16_index_ -= i18n_utils::GetUtf16Length(uchar32);
188     --utf32_index_;
189   }
190   return true;
191 }
192 
MoveToUtf32(int desired_utf32_index)193 bool CharacterIterator::MoveToUtf32(int desired_utf32_index) {
194   return (desired_utf32_index > utf32_index_)
195              ? AdvanceToUtf32(desired_utf32_index)
196              : RewindToUtf32(desired_utf32_index);
197 }
198 
AdvanceToUtf32(int desired_utf32_index)199 bool CharacterIterator::AdvanceToUtf32(int desired_utf32_index) {
200   ResetToStartIfNecessary();
201 
202   UChar32 uchar32 = cached_current_char_;
203   while (utf32_index_ < desired_utf32_index) {
204     uchar32 =
205         i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
206     if (uchar32 == i18n_utils::kInvalidUChar32) {
207       // Unable to retrieve a valid UTF-32 character at the previous position.
208       cached_current_char_ = i18n_utils::kInvalidUChar32;
209       return false;
210     }
211     int utf16_length = i18n_utils::GetUtf16Length(uchar32);
212     int utf8_length = i18n_utils::GetUtf8Length(uchar32);
213     if (utf8_index_ + utf8_length > text_.length()) {
214       // Enforce the requirement.
215       cached_current_char_ = i18n_utils::kInvalidUChar32;
216       return false;
217     }
218     utf8_index_ += utf8_length;
219     utf16_index_ += utf16_length;
220     ++utf32_index_;
221   }
222   cached_current_char_ =
223       i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
224   return true;
225 }
226 
RewindToUtf32(int desired_utf32_index)227 bool CharacterIterator::RewindToUtf32(int desired_utf32_index) {
228   if (desired_utf32_index < 0) {
229     return false;
230   }
231   UChar32 uchar32 = cached_current_char_;
232   while (utf32_index_ > desired_utf32_index) {
233     int utf8_index = utf8_index_ - 1;
234     utf8_index = GetUTF8StartPosition(text_, utf8_index);
235     if (utf8_index < 0) {
236       // Somehow, there wasn't a single UTF-8 lead byte at
237       // requested_byte_index or an earlier byte.
238       cached_current_char_ = i18n_utils::kInvalidUChar32;
239       return false;
240     }
241     // We've found the start of a unicode char!
242     uchar32 =
243         i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index);
244     int expected_length = utf8_index_ - utf8_index;
245     if (uchar32 == i18n_utils::kInvalidUChar32 ||
246         expected_length != i18n_utils::GetUtf8Length(uchar32)) {
247       // Either unable to retrieve a valid UTF-32 character at the previous
248       // position or we skipped past an invalid sequence while seeking the
249       // previous start position.
250       cached_current_char_ = i18n_utils::kInvalidUChar32;
251       return false;
252     }
253     cached_current_char_ = uchar32;
254     utf8_index_ = utf8_index;
255     utf16_index_ -= i18n_utils::GetUtf16Length(uchar32);
256     --utf32_index_;
257   }
258   return true;
259 }
260 
ResetToStartIfNecessary()261 void CharacterIterator::ResetToStartIfNecessary() {
262   if (utf8_index_ < 0 || utf16_index_ < 0 || utf32_index_ < 0) {
263     utf8_index_ = 0;
264     utf16_index_ = 0;
265     utf32_index_ = 0;
266     cached_current_char_ =
267         i18n_utils::GetUChar32At(text_.data(), text_.length(), 0);
268   }
269 }
270 
271 }  // namespace lib
272 }  // namespace icing
273