1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "icing/util/character-iterator.h"
16
17 #include <string_view>
18
19 #include "icing/util/i18n-utils.h"
20 #include "unicode/utypes.h"
21
22 namespace icing {
23 namespace lib {
24
25 namespace {
26
27 // Returns the lead byte of the UTF-8 character that includes the byte at
28 // current_byte_index within it.
GetUTF8StartPosition(std::string_view text,int current_byte_index)29 int GetUTF8StartPosition(std::string_view text, int current_byte_index) {
30 while (!i18n_utils::IsLeadUtf8Byte(text[current_byte_index])) {
31 --current_byte_index;
32 }
33 return current_byte_index;
34 }
35
36 } // namespace
37
GetCurrentChar() const38 UChar32 CharacterIterator::GetCurrentChar() const {
39 if (cached_current_char_ == i18n_utils::kInvalidUChar32) {
40 // Our indices point to the right character, we just need to read that
41 // character. No need to worry about an error. If GetUChar32At fails, then
42 // current_char will be i18n_utils::kInvalidUChar32.
43 cached_current_char_ =
44 i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
45 }
46 return cached_current_char_;
47 }
48
MoveToUtf8(int desired_utf8_index)49 bool CharacterIterator::MoveToUtf8(int desired_utf8_index) {
50 return (desired_utf8_index > utf8_index_) ? AdvanceToUtf8(desired_utf8_index)
51 : RewindToUtf8(desired_utf8_index);
52 }
53
AdvanceToUtf8(int desired_utf8_index)54 bool CharacterIterator::AdvanceToUtf8(int desired_utf8_index) {
55 ResetToStartIfNecessary();
56
57 if (desired_utf8_index > text_.length()) {
58 // Enforce the requirement.
59 return false;
60 }
61 // Need to work forwards.
62 UChar32 uchar32 = cached_current_char_;
63 while (utf8_index_ < desired_utf8_index) {
64 uchar32 =
65 i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
66 if (uchar32 == i18n_utils::kInvalidUChar32) {
67 // Unable to retrieve a valid UTF-32 character at the previous position.
68 cached_current_char_ = i18n_utils::kInvalidUChar32;
69 return false;
70 }
71 int utf8_length = i18n_utils::GetUtf8Length(uchar32);
72 if (utf8_index_ + utf8_length > desired_utf8_index) {
73 // Ah! Don't go too far!
74 break;
75 }
76 utf8_index_ += utf8_length;
77 utf16_index_ += i18n_utils::GetUtf16Length(uchar32);
78 ++utf32_index_;
79 }
80 cached_current_char_ =
81 i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
82 return true;
83 }
84
RewindToUtf8(int desired_utf8_index)85 bool CharacterIterator::RewindToUtf8(int desired_utf8_index) {
86 if (desired_utf8_index < 0) {
87 // Enforce the requirement.
88 return false;
89 }
90 // Need to work backwards.
91 UChar32 uchar32 = cached_current_char_;
92 while (utf8_index_ > desired_utf8_index) {
93 int utf8_index = utf8_index_ - 1;
94 utf8_index = GetUTF8StartPosition(text_, utf8_index);
95 if (utf8_index < 0) {
96 // Somehow, there wasn't a single UTF-8 lead byte at
97 // requested_byte_index or an earlier byte.
98 cached_current_char_ = i18n_utils::kInvalidUChar32;
99 return false;
100 }
101 // We've found the start of a unicode char!
102 uchar32 =
103 i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index);
104 int expected_length = utf8_index_ - utf8_index;
105 if (uchar32 == i18n_utils::kInvalidUChar32 ||
106 expected_length != i18n_utils::GetUtf8Length(uchar32)) {
107 // Either unable to retrieve a valid UTF-32 character at the previous
108 // position or we skipped past an invalid sequence while seeking the
109 // previous start position.
110 cached_current_char_ = i18n_utils::kInvalidUChar32;
111 return false;
112 }
113 cached_current_char_ = uchar32;
114 utf8_index_ = utf8_index;
115 utf16_index_ -= i18n_utils::GetUtf16Length(uchar32);
116 --utf32_index_;
117 }
118 return true;
119 }
120
MoveToUtf16(int desired_utf16_index)121 bool CharacterIterator::MoveToUtf16(int desired_utf16_index) {
122 return (desired_utf16_index > utf16_index_)
123 ? AdvanceToUtf16(desired_utf16_index)
124 : RewindToUtf16(desired_utf16_index);
125 }
126
AdvanceToUtf16(int desired_utf16_index)127 bool CharacterIterator::AdvanceToUtf16(int desired_utf16_index) {
128 ResetToStartIfNecessary();
129
130 UChar32 uchar32 = cached_current_char_;
131 while (utf16_index_ < desired_utf16_index) {
132 uchar32 =
133 i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
134 if (uchar32 == i18n_utils::kInvalidUChar32) {
135 // Unable to retrieve a valid UTF-32 character at the previous position.
136 cached_current_char_ = i18n_utils::kInvalidUChar32;
137 return false;
138 }
139 int utf16_length = i18n_utils::GetUtf16Length(uchar32);
140 if (utf16_index_ + utf16_length > desired_utf16_index) {
141 // Ah! Don't go too far!
142 break;
143 }
144 int utf8_length = i18n_utils::GetUtf8Length(uchar32);
145 if (utf8_index_ + utf8_length > text_.length()) {
146 // Enforce the requirement.
147 cached_current_char_ = i18n_utils::kInvalidUChar32;
148 return false;
149 }
150 utf8_index_ += utf8_length;
151 utf16_index_ += utf16_length;
152 ++utf32_index_;
153 }
154 cached_current_char_ =
155 i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
156 return true;
157 }
158
RewindToUtf16(int desired_utf16_index)159 bool CharacterIterator::RewindToUtf16(int desired_utf16_index) {
160 if (desired_utf16_index < 0) {
161 return false;
162 }
163 UChar32 uchar32 = cached_current_char_;
164 while (utf16_index_ > desired_utf16_index) {
165 int utf8_index = utf8_index_ - 1;
166 utf8_index = GetUTF8StartPosition(text_, utf8_index);
167 if (utf8_index < 0) {
168 // Somehow, there wasn't a single UTF-8 lead byte at
169 // requested_byte_index or an earlier byte.
170 cached_current_char_ = i18n_utils::kInvalidUChar32;
171 return false;
172 }
173 // We've found the start of a unicode char!
174 uchar32 =
175 i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index);
176 int expected_length = utf8_index_ - utf8_index;
177 if (uchar32 == i18n_utils::kInvalidUChar32 ||
178 expected_length != i18n_utils::GetUtf8Length(uchar32)) {
179 // Either unable to retrieve a valid UTF-32 character at the previous
180 // position or we skipped past an invalid sequence while seeking the
181 // previous start position.
182 cached_current_char_ = i18n_utils::kInvalidUChar32;
183 return false;
184 }
185 cached_current_char_ = uchar32;
186 utf8_index_ = utf8_index;
187 utf16_index_ -= i18n_utils::GetUtf16Length(uchar32);
188 --utf32_index_;
189 }
190 return true;
191 }
192
MoveToUtf32(int desired_utf32_index)193 bool CharacterIterator::MoveToUtf32(int desired_utf32_index) {
194 return (desired_utf32_index > utf32_index_)
195 ? AdvanceToUtf32(desired_utf32_index)
196 : RewindToUtf32(desired_utf32_index);
197 }
198
AdvanceToUtf32(int desired_utf32_index)199 bool CharacterIterator::AdvanceToUtf32(int desired_utf32_index) {
200 ResetToStartIfNecessary();
201
202 UChar32 uchar32 = cached_current_char_;
203 while (utf32_index_ < desired_utf32_index) {
204 uchar32 =
205 i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
206 if (uchar32 == i18n_utils::kInvalidUChar32) {
207 // Unable to retrieve a valid UTF-32 character at the previous position.
208 cached_current_char_ = i18n_utils::kInvalidUChar32;
209 return false;
210 }
211 int utf16_length = i18n_utils::GetUtf16Length(uchar32);
212 int utf8_length = i18n_utils::GetUtf8Length(uchar32);
213 if (utf8_index_ + utf8_length > text_.length()) {
214 // Enforce the requirement.
215 cached_current_char_ = i18n_utils::kInvalidUChar32;
216 return false;
217 }
218 utf8_index_ += utf8_length;
219 utf16_index_ += utf16_length;
220 ++utf32_index_;
221 }
222 cached_current_char_ =
223 i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index_);
224 return true;
225 }
226
RewindToUtf32(int desired_utf32_index)227 bool CharacterIterator::RewindToUtf32(int desired_utf32_index) {
228 if (desired_utf32_index < 0) {
229 return false;
230 }
231 UChar32 uchar32 = cached_current_char_;
232 while (utf32_index_ > desired_utf32_index) {
233 int utf8_index = utf8_index_ - 1;
234 utf8_index = GetUTF8StartPosition(text_, utf8_index);
235 if (utf8_index < 0) {
236 // Somehow, there wasn't a single UTF-8 lead byte at
237 // requested_byte_index or an earlier byte.
238 cached_current_char_ = i18n_utils::kInvalidUChar32;
239 return false;
240 }
241 // We've found the start of a unicode char!
242 uchar32 =
243 i18n_utils::GetUChar32At(text_.data(), text_.length(), utf8_index);
244 int expected_length = utf8_index_ - utf8_index;
245 if (uchar32 == i18n_utils::kInvalidUChar32 ||
246 expected_length != i18n_utils::GetUtf8Length(uchar32)) {
247 // Either unable to retrieve a valid UTF-32 character at the previous
248 // position or we skipped past an invalid sequence while seeking the
249 // previous start position.
250 cached_current_char_ = i18n_utils::kInvalidUChar32;
251 return false;
252 }
253 cached_current_char_ = uchar32;
254 utf8_index_ = utf8_index;
255 utf16_index_ -= i18n_utils::GetUtf16Length(uchar32);
256 --utf32_index_;
257 }
258 return true;
259 }
260
ResetToStartIfNecessary()261 void CharacterIterator::ResetToStartIfNecessary() {
262 if (utf8_index_ < 0 || utf16_index_ < 0 || utf32_index_ < 0) {
263 utf8_index_ = 0;
264 utf16_index_ = 0;
265 utf32_index_ = 0;
266 cached_current_char_ =
267 i18n_utils::GetUChar32At(text_.data(), text_.length(), 0);
268 }
269 }
270
271 } // namespace lib
272 } // namespace icing
273