1 // Copyright 2011 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "base/strings/utf_offset_string_conversions.h"
6
7 #include <stdint.h>
8
9 #include <algorithm>
10 #include <memory>
11
12 #include "base/check_op.h"
13 #include "base/strings/string_piece.h"
14 #include "base/strings/utf_string_conversion_utils.h"
15
16 namespace base {
17
Adjustment(size_t original_offset,size_t original_length,size_t output_length)18 OffsetAdjuster::Adjustment::Adjustment(size_t original_offset,
19 size_t original_length,
20 size_t output_length)
21 : original_offset(original_offset),
22 original_length(original_length),
23 output_length(output_length) {
24 }
25
26 // static
AdjustOffsets(const Adjustments & adjustments,std::vector<size_t> * offsets_for_adjustment,size_t limit)27 void OffsetAdjuster::AdjustOffsets(const Adjustments& adjustments,
28 std::vector<size_t>* offsets_for_adjustment,
29 size_t limit) {
30 DCHECK(offsets_for_adjustment);
31 for (auto& i : *offsets_for_adjustment)
32 AdjustOffset(adjustments, &i, limit);
33 }
34
35 // static
AdjustOffset(const Adjustments & adjustments,size_t * offset,size_t limit)36 void OffsetAdjuster::AdjustOffset(const Adjustments& adjustments,
37 size_t* offset,
38 size_t limit) {
39 DCHECK(offset);
40 if (*offset == std::u16string::npos)
41 return;
42 size_t original_lengths = 0;
43 size_t output_lengths = 0;
44 for (const auto& i : adjustments) {
45 if (*offset <= i.original_offset)
46 break;
47 if (*offset < (i.original_offset + i.original_length)) {
48 *offset = std::u16string::npos;
49 return;
50 }
51 original_lengths += i.original_length;
52 output_lengths += i.output_length;
53 }
54 *offset += output_lengths - original_lengths;
55
56 if (*offset > limit)
57 *offset = std::u16string::npos;
58 }
59
60 // static
UnadjustOffsets(const Adjustments & adjustments,std::vector<size_t> * offsets_for_unadjustment)61 void OffsetAdjuster::UnadjustOffsets(
62 const Adjustments& adjustments,
63 std::vector<size_t>* offsets_for_unadjustment) {
64 if (!offsets_for_unadjustment || adjustments.empty())
65 return;
66 for (auto& i : *offsets_for_unadjustment)
67 UnadjustOffset(adjustments, &i);
68 }
69
70 // static
UnadjustOffset(const Adjustments & adjustments,size_t * offset)71 void OffsetAdjuster::UnadjustOffset(const Adjustments& adjustments,
72 size_t* offset) {
73 if (*offset == std::u16string::npos)
74 return;
75 size_t original_lengths = 0;
76 size_t output_lengths = 0;
77 for (const auto& i : adjustments) {
78 if (*offset + original_lengths - output_lengths <= i.original_offset)
79 break;
80 original_lengths += i.original_length;
81 output_lengths += i.output_length;
82 if ((*offset + original_lengths - output_lengths) <
83 (i.original_offset + i.original_length)) {
84 *offset = std::u16string::npos;
85 return;
86 }
87 }
88 *offset += original_lengths - output_lengths;
89 }
90
91 // static
MergeSequentialAdjustments(const Adjustments & first_adjustments,Adjustments * adjustments_on_adjusted_string)92 void OffsetAdjuster::MergeSequentialAdjustments(
93 const Adjustments& first_adjustments,
94 Adjustments* adjustments_on_adjusted_string) {
95 auto adjusted_iter = adjustments_on_adjusted_string->begin();
96 auto first_iter = first_adjustments.begin();
97 // Simultaneously iterate over all |adjustments_on_adjusted_string| and
98 // |first_adjustments|, pushing adjustments at the end of
99 // |adjustments_builder| as we go. |shift| keeps track of the current number
100 // of characters collapsed by |first_adjustments| up to this point.
101 // |currently_collapsing| keeps track of the number of characters collapsed by
102 // |first_adjustments| into the current |adjusted_iter|'s length. These are
103 // characters that will change |shift| as soon as we're done processing the
104 // current |adjusted_iter|; they are not yet reflected in |shift|.
105 size_t shift = 0;
106 size_t currently_collapsing = 0;
107 // While we *could* update |adjustments_on_adjusted_string| in place by
108 // inserting new adjustments into the middle, we would be repeatedly calling
109 // |std::vector::insert|. That would cost O(n) time per insert, relative to
110 // distance from end of the string. By instead allocating
111 // |adjustments_builder| and calling |std::vector::push_back|, we only pay
112 // amortized constant time per push. We are trading space for time.
113 Adjustments adjustments_builder;
114 while (adjusted_iter != adjustments_on_adjusted_string->end()) {
115 if ((first_iter == first_adjustments.end()) ||
116 ((adjusted_iter->original_offset + shift +
117 adjusted_iter->original_length) <= first_iter->original_offset)) {
118 // Entire |adjusted_iter| (accounting for its shift and including its
119 // whole original length) comes before |first_iter|.
120 //
121 // Correct the offset at |adjusted_iter| and move onto the next
122 // adjustment that needs revising.
123 adjusted_iter->original_offset += shift;
124 shift += currently_collapsing;
125 currently_collapsing = 0;
126 adjustments_builder.push_back(*adjusted_iter);
127 ++adjusted_iter;
128 } else if ((adjusted_iter->original_offset + shift) >
129 first_iter->original_offset) {
130 // |first_iter| comes before the |adjusted_iter| (as adjusted by |shift|).
131
132 // It's not possible for the adjustments to overlap. (It shouldn't
133 // be possible that we have an |adjusted_iter->original_offset| that,
134 // when adjusted by the computed |shift|, is in the middle of
135 // |first_iter|'s output's length. After all, that would mean the
136 // current adjustment_on_adjusted_string somehow points to an offset
137 // that was supposed to have been eliminated by the first set of
138 // adjustments.)
139 DCHECK_LE(first_iter->original_offset + first_iter->output_length,
140 adjusted_iter->original_offset + shift);
141
142 // Add the |first_iter| to the full set of adjustments.
143 shift += first_iter->original_length - first_iter->output_length;
144 adjustments_builder.push_back(*first_iter);
145 ++first_iter;
146 } else {
147 // The first adjustment adjusted something that then got further adjusted
148 // by the second set of adjustments. In other words, |first_iter| points
149 // to something in the range covered by |adjusted_iter|'s length (after
150 // accounting for |shift|). Precisely,
151 // adjusted_iter->original_offset + shift
152 // <=
153 // first_iter->original_offset
154 // <=
155 // adjusted_iter->original_offset + shift +
156 // adjusted_iter->original_length
157 // Modify the current |adjusted_iter| to include whatever collapsing
158 // happened in |first_iter|, then advance to the next |first_adjustments|
159 // because we dealt with the current one.
160
161 // This function does not know how to deal with a string that expands and
162 // then gets modified, only strings that collapse and then get modified.
163 DCHECK_GT(first_iter->original_length, first_iter->output_length);
164 const size_t collapse =
165 first_iter->original_length - first_iter->output_length;
166 adjusted_iter->original_length += collapse;
167 currently_collapsing += collapse;
168 ++first_iter;
169 }
170 }
171 DCHECK_EQ(0u, currently_collapsing);
172 if (first_iter != first_adjustments.end()) {
173 // Only first adjustments are left. These do not need to be modified.
174 // (Their offsets are already correct with respect to the original string.)
175 // Append them all.
176 DCHECK(adjusted_iter == adjustments_on_adjusted_string->end());
177 adjustments_builder.insert(adjustments_builder.end(), first_iter,
178 first_adjustments.end());
179 }
180 *adjustments_on_adjusted_string = std::move(adjustments_builder);
181 }
182
183 // Converts the given source Unicode character type to the given destination
184 // Unicode character type as a STL string. The given input buffer and size
185 // determine the source, and the given output STL string will be replaced by
186 // the result. If non-NULL, |adjustments| is set to reflect the all the
187 // alterations to the string that are not one-character-to-one-character.
188 // It will always be sorted by increasing offset.
189 template<typename SrcChar, typename DestStdString>
ConvertUnicode(const SrcChar * src,size_t src_len,DestStdString * output,OffsetAdjuster::Adjustments * adjustments)190 bool ConvertUnicode(const SrcChar* src,
191 size_t src_len,
192 DestStdString* output,
193 OffsetAdjuster::Adjustments* adjustments) {
194 if (adjustments)
195 adjustments->clear();
196 bool success = true;
197 for (size_t i = 0; i < src_len; i++) {
198 base_icu::UChar32 code_point;
199 size_t original_i = i;
200 size_t chars_written = 0;
201 if (ReadUnicodeCharacter(src, src_len, &i, &code_point)) {
202 chars_written = WriteUnicodeCharacter(code_point, output);
203 } else {
204 chars_written = WriteUnicodeCharacter(0xFFFD, output);
205 success = false;
206 }
207
208 // Only bother writing an adjustment if this modification changed the
209 // length of this character.
210 // NOTE: ReadUnicodeCharacter() adjusts |i| to point _at_ the last
211 // character read, not after it (so that incrementing it in the loop
212 // increment will place it at the right location), so we need to account
213 // for that in determining the amount that was read.
214 if (adjustments && ((i - original_i + 1) != chars_written)) {
215 adjustments->push_back(OffsetAdjuster::Adjustment(
216 original_i, i - original_i + 1, chars_written));
217 }
218 }
219 return success;
220 }
221
UTF8ToUTF16WithAdjustments(const char * src,size_t src_len,std::u16string * output,base::OffsetAdjuster::Adjustments * adjustments)222 bool UTF8ToUTF16WithAdjustments(
223 const char* src,
224 size_t src_len,
225 std::u16string* output,
226 base::OffsetAdjuster::Adjustments* adjustments) {
227 PrepareForUTF16Or32Output(src, src_len, output);
228 return ConvertUnicode(src, src_len, output, adjustments);
229 }
230
UTF8ToUTF16WithAdjustments(const base::StringPiece & utf8,base::OffsetAdjuster::Adjustments * adjustments)231 std::u16string UTF8ToUTF16WithAdjustments(
232 const base::StringPiece& utf8,
233 base::OffsetAdjuster::Adjustments* adjustments) {
234 std::u16string result;
235 UTF8ToUTF16WithAdjustments(utf8.data(), utf8.length(), &result, adjustments);
236 return result;
237 }
238
UTF8ToUTF16AndAdjustOffsets(const base::StringPiece & utf8,std::vector<size_t> * offsets_for_adjustment)239 std::u16string UTF8ToUTF16AndAdjustOffsets(
240 const base::StringPiece& utf8,
241 std::vector<size_t>* offsets_for_adjustment) {
242 for (size_t& offset : *offsets_for_adjustment) {
243 if (offset > utf8.length())
244 offset = std::u16string::npos;
245 }
246 OffsetAdjuster::Adjustments adjustments;
247 std::u16string result = UTF8ToUTF16WithAdjustments(utf8, &adjustments);
248 OffsetAdjuster::AdjustOffsets(adjustments, offsets_for_adjustment);
249 return result;
250 }
251
UTF16ToUTF8AndAdjustOffsets(const base::StringPiece16 & utf16,std::vector<size_t> * offsets_for_adjustment)252 std::string UTF16ToUTF8AndAdjustOffsets(
253 const base::StringPiece16& utf16,
254 std::vector<size_t>* offsets_for_adjustment) {
255 for (size_t& offset : *offsets_for_adjustment) {
256 if (offset > utf16.length())
257 offset = std::u16string::npos;
258 }
259 std::string result;
260 PrepareForUTF8Output(utf16.data(), utf16.length(), &result);
261 OffsetAdjuster::Adjustments adjustments;
262 ConvertUnicode(utf16.data(), utf16.length(), &result, &adjustments);
263 OffsetAdjuster::AdjustOffsets(adjustments, offsets_for_adjustment);
264 return result;
265 }
266
267 } // namespace base
268