xref: /aosp_15_r20/external/libchrome/base/strings/utf_offset_string_conversions.cc (revision 635a864187cb8b6c713ff48b7e790a6b21769273)
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "base/strings/utf_offset_string_conversions.h"
6 
7 #include <stdint.h>
8 
9 #include <algorithm>
10 #include <memory>
11 
12 #include "base/logging.h"
13 #include "base/strings/string_piece.h"
14 #include "base/strings/utf_string_conversion_utils.h"
15 
16 namespace base {
17 
Adjustment(size_t original_offset,size_t original_length,size_t output_length)18 OffsetAdjuster::Adjustment::Adjustment(size_t original_offset,
19                                        size_t original_length,
20                                        size_t output_length)
21     : original_offset(original_offset),
22       original_length(original_length),
23       output_length(output_length) {
24 }
25 
26 // static
AdjustOffsets(const Adjustments & adjustments,std::vector<size_t> * offsets_for_adjustment,size_t limit)27 void OffsetAdjuster::AdjustOffsets(const Adjustments& adjustments,
28                                    std::vector<size_t>* offsets_for_adjustment,
29                                    size_t limit) {
30   DCHECK(offsets_for_adjustment);
31   for (std::vector<size_t>::iterator i(offsets_for_adjustment->begin());
32        i != offsets_for_adjustment->end(); ++i)
33     AdjustOffset(adjustments, &(*i), limit);
34 }
35 
36 // static
AdjustOffset(const Adjustments & adjustments,size_t * offset,size_t limit)37 void OffsetAdjuster::AdjustOffset(const Adjustments& adjustments,
38                                   size_t* offset,
39                                   size_t limit) {
40   DCHECK(offset);
41   if (*offset == string16::npos)
42     return;
43   int adjustment = 0;
44   for (Adjustments::const_iterator i = adjustments.begin();
45        i != adjustments.end(); ++i) {
46     if (*offset <= i->original_offset)
47       break;
48     if (*offset < (i->original_offset + i->original_length)) {
49       *offset = string16::npos;
50       return;
51     }
52     adjustment += static_cast<int>(i->original_length - i->output_length);
53   }
54   *offset -= adjustment;
55 
56   if (*offset > limit)
57     *offset = string16::npos;
58 }
59 
60 // static
UnadjustOffsets(const Adjustments & adjustments,std::vector<size_t> * offsets_for_unadjustment)61 void OffsetAdjuster::UnadjustOffsets(
62     const Adjustments& adjustments,
63     std::vector<size_t>* offsets_for_unadjustment) {
64   if (!offsets_for_unadjustment || adjustments.empty())
65     return;
66   for (std::vector<size_t>::iterator i(offsets_for_unadjustment->begin());
67        i != offsets_for_unadjustment->end(); ++i)
68     UnadjustOffset(adjustments, &(*i));
69 }
70 
71 // static
UnadjustOffset(const Adjustments & adjustments,size_t * offset)72 void OffsetAdjuster::UnadjustOffset(const Adjustments& adjustments,
73                                     size_t* offset) {
74   if (*offset == string16::npos)
75     return;
76   int adjustment = 0;
77   for (Adjustments::const_iterator i = adjustments.begin();
78        i != adjustments.end(); ++i) {
79     if (*offset + adjustment <= i->original_offset)
80       break;
81     adjustment += static_cast<int>(i->original_length - i->output_length);
82     if ((*offset + adjustment) <
83         (i->original_offset + i->original_length)) {
84       *offset = string16::npos;
85       return;
86     }
87   }
88   *offset += adjustment;
89 }
90 
91 // static
MergeSequentialAdjustments(const Adjustments & first_adjustments,Adjustments * adjustments_on_adjusted_string)92 void OffsetAdjuster::MergeSequentialAdjustments(
93     const Adjustments& first_adjustments,
94     Adjustments* adjustments_on_adjusted_string) {
95   Adjustments::iterator adjusted_iter = adjustments_on_adjusted_string->begin();
96   Adjustments::const_iterator first_iter = first_adjustments.begin();
97   // Simultaneously iterate over all |adjustments_on_adjusted_string| and
98   // |first_adjustments|, adding adjustments to or correcting the adjustments
99   // in |adjustments_on_adjusted_string| as we go.  |shift| keeps track of the
100   // current number of characters collapsed by |first_adjustments| up to this
101   // point.  |currently_collapsing| keeps track of the number of characters
102   // collapsed by |first_adjustments| into the current |adjusted_iter|'s
103   // length.  These are characters that will change |shift| as soon as we're
104   // done processing the current |adjusted_iter|; they are not yet reflected in
105   // |shift|.
106   size_t shift = 0;
107   size_t currently_collapsing = 0;
108   while (adjusted_iter != adjustments_on_adjusted_string->end()) {
109     if ((first_iter == first_adjustments.end()) ||
110         ((adjusted_iter->original_offset + shift +
111           adjusted_iter->original_length) <= first_iter->original_offset)) {
112       // Entire |adjusted_iter| (accounting for its shift and including its
113       // whole original length) comes before |first_iter|.
114       //
115       // Correct the offset at |adjusted_iter| and move onto the next
116       // adjustment that needs revising.
117       adjusted_iter->original_offset += shift;
118       shift += currently_collapsing;
119       currently_collapsing = 0;
120       ++adjusted_iter;
121     } else if ((adjusted_iter->original_offset + shift) >
122                first_iter->original_offset) {
123       // |first_iter| comes before the |adjusted_iter| (as adjusted by |shift|).
124 
125       // It's not possible for the adjustments to overlap.  (It shouldn't
126       // be possible that we have an |adjusted_iter->original_offset| that,
127       // when adjusted by the computed |shift|, is in the middle of
128       // |first_iter|'s output's length.  After all, that would mean the
129       // current adjustment_on_adjusted_string somehow points to an offset
130       // that was supposed to have been eliminated by the first set of
131       // adjustments.)
132       DCHECK_LE(first_iter->original_offset + first_iter->output_length,
133                 adjusted_iter->original_offset + shift);
134 
135       // Add the |first_adjustment_iter| to the full set of adjustments while
136       // making sure |adjusted_iter| continues pointing to the same element.
137       // We do this by inserting the |first_adjustment_iter| right before
138       // |adjusted_iter|, then incrementing |adjusted_iter| so it points to
139       // the following element.
140       shift += first_iter->original_length - first_iter->output_length;
141       adjusted_iter = adjustments_on_adjusted_string->insert(
142           adjusted_iter, *first_iter);
143       ++adjusted_iter;
144       ++first_iter;
145     } else {
146       // The first adjustment adjusted something that then got further adjusted
147       // by the second set of adjustments.  In other words, |first_iter| points
148       // to something in the range covered by |adjusted_iter|'s length (after
149       // accounting for |shift|).  Precisely,
150       //   adjusted_iter->original_offset + shift
151       //   <=
152       //   first_iter->original_offset
153       //   <=
154       //   adjusted_iter->original_offset + shift +
155       //       adjusted_iter->original_length
156 
157       // Modify the current |adjusted_iter| to include whatever collapsing
158       // happened in |first_iter|, then advance to the next |first_adjustments|
159       // because we dealt with the current one.
160       const int collapse = static_cast<int>(first_iter->original_length) -
161           static_cast<int>(first_iter->output_length);
162       // This function does not know how to deal with a string that expands and
163       // then gets modified, only strings that collapse and then get modified.
164       DCHECK_GT(collapse, 0);
165       adjusted_iter->original_length += collapse;
166       currently_collapsing += collapse;
167       ++first_iter;
168     }
169   }
170   DCHECK_EQ(0u, currently_collapsing);
171   if (first_iter != first_adjustments.end()) {
172     // Only first adjustments are left.  These do not need to be modified.
173     // (Their offsets are already correct with respect to the original string.)
174     // Append them all.
175     DCHECK(adjusted_iter == adjustments_on_adjusted_string->end());
176     adjustments_on_adjusted_string->insert(
177         adjustments_on_adjusted_string->end(), first_iter,
178         first_adjustments.end());
179   }
180 }
181 
182 // Converts the given source Unicode character type to the given destination
183 // Unicode character type as a STL string. The given input buffer and size
184 // determine the source, and the given output STL string will be replaced by
185 // the result.  If non-NULL, |adjustments| is set to reflect all the
186 // alterations to the string that are not one-character-to-one-character.
187 // It will always be sorted by increasing offset.
188 template<typename SrcChar, typename DestStdString>
ConvertUnicode(const SrcChar * src,size_t src_len,DestStdString * output,OffsetAdjuster::Adjustments * adjustments)189 bool ConvertUnicode(const SrcChar* src,
190                     size_t src_len,
191                     DestStdString* output,
192                     OffsetAdjuster::Adjustments* adjustments) {
193   if (adjustments)
194     adjustments->clear();
195   // ICU requires 32-bit numbers.
196   bool success = true;
197   int32_t src_len32 = static_cast<int32_t>(src_len);
198   for (int32_t i = 0; i < src_len32; i++) {
199     uint32_t code_point;
200     size_t original_i = i;
201     size_t chars_written = 0;
202     if (ReadUnicodeCharacter(src, src_len32, &i, &code_point)) {
203       chars_written = WriteUnicodeCharacter(code_point, output);
204     } else {
205       chars_written = WriteUnicodeCharacter(0xFFFD, output);
206       success = false;
207     }
208 
209     // Only bother writing an adjustment if this modification changed the
210     // length of this character.
211     // NOTE: ReadUnicodeCharacter() adjusts |i| to point _at_ the last
212     // character read, not after it (so that incrementing it in the loop
213     // increment will place it at the right location), so we need to account
214     // for that in determining the amount that was read.
215     if (adjustments && ((i - original_i + 1) != chars_written)) {
216       adjustments->push_back(OffsetAdjuster::Adjustment(
217           original_i, i - original_i + 1, chars_written));
218     }
219   }
220   return success;
221 }
222 
UTF8ToUTF16WithAdjustments(const char * src,size_t src_len,string16 * output,base::OffsetAdjuster::Adjustments * adjustments)223 bool UTF8ToUTF16WithAdjustments(
224     const char* src,
225     size_t src_len,
226     string16* output,
227     base::OffsetAdjuster::Adjustments* adjustments) {
228   PrepareForUTF16Or32Output(src, src_len, output);
229   return ConvertUnicode(src, src_len, output, adjustments);
230 }
231 
UTF8ToUTF16WithAdjustments(const base::StringPiece & utf8,base::OffsetAdjuster::Adjustments * adjustments)232 string16 UTF8ToUTF16WithAdjustments(
233     const base::StringPiece& utf8,
234     base::OffsetAdjuster::Adjustments* adjustments) {
235   string16 result;
236   UTF8ToUTF16WithAdjustments(utf8.data(), utf8.length(), &result, adjustments);
237   return result;
238 }
239 
UTF8ToUTF16AndAdjustOffsets(const base::StringPiece & utf8,std::vector<size_t> * offsets_for_adjustment)240 string16 UTF8ToUTF16AndAdjustOffsets(
241     const base::StringPiece& utf8,
242     std::vector<size_t>* offsets_for_adjustment) {
243   for (size_t& offset : *offsets_for_adjustment) {
244     if (offset > utf8.length())
245       offset = string16::npos;
246   }
247   OffsetAdjuster::Adjustments adjustments;
248   string16 result = UTF8ToUTF16WithAdjustments(utf8, &adjustments);
249   OffsetAdjuster::AdjustOffsets(adjustments, offsets_for_adjustment);
250   return result;
251 }
252 
UTF16ToUTF8AndAdjustOffsets(const base::StringPiece16 & utf16,std::vector<size_t> * offsets_for_adjustment)253 std::string UTF16ToUTF8AndAdjustOffsets(
254     const base::StringPiece16& utf16,
255     std::vector<size_t>* offsets_for_adjustment) {
256   for (size_t& offset : *offsets_for_adjustment) {
257     if (offset > utf16.length())
258       offset = string16::npos;
259   }
260   std::string result;
261   PrepareForUTF8Output(utf16.data(), utf16.length(), &result);
262   OffsetAdjuster::Adjustments adjustments;
263   ConvertUnicode(utf16.data(), utf16.length(), &result, &adjustments);
264   OffsetAdjuster::AdjustOffsets(adjustments, offsets_for_adjustment);
265   return result;
266 }
267 
268 }  // namespace base
269