xref: /aosp_15_r20/external/cronet/base/strings/utf_offset_string_conversions.cc (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1 // Copyright 2011 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "base/strings/utf_offset_string_conversions.h"
6 
7 #include <stdint.h>
8 
9 #include <algorithm>
10 #include <memory>
11 
12 #include "base/check_op.h"
13 #include "base/strings/string_piece.h"
14 #include "base/strings/utf_string_conversion_utils.h"
15 
16 namespace base {
17 
Adjustment(size_t original_offset,size_t original_length,size_t output_length)18 OffsetAdjuster::Adjustment::Adjustment(size_t original_offset,
19                                        size_t original_length,
20                                        size_t output_length)
21     : original_offset(original_offset),
22       original_length(original_length),
23       output_length(output_length) {
24 }
25 
26 // static
AdjustOffsets(const Adjustments & adjustments,std::vector<size_t> * offsets_for_adjustment,size_t limit)27 void OffsetAdjuster::AdjustOffsets(const Adjustments& adjustments,
28                                    std::vector<size_t>* offsets_for_adjustment,
29                                    size_t limit) {
30   DCHECK(offsets_for_adjustment);
31   for (auto& i : *offsets_for_adjustment)
32     AdjustOffset(adjustments, &i, limit);
33 }
34 
35 // static
AdjustOffset(const Adjustments & adjustments,size_t * offset,size_t limit)36 void OffsetAdjuster::AdjustOffset(const Adjustments& adjustments,
37                                   size_t* offset,
38                                   size_t limit) {
39   DCHECK(offset);
40   if (*offset == std::u16string::npos)
41     return;
42   size_t original_lengths = 0;
43   size_t output_lengths = 0;
44   for (const auto& i : adjustments) {
45     if (*offset <= i.original_offset)
46       break;
47     if (*offset < (i.original_offset + i.original_length)) {
48       *offset = std::u16string::npos;
49       return;
50     }
51     original_lengths += i.original_length;
52     output_lengths += i.output_length;
53   }
54   *offset += output_lengths - original_lengths;
55 
56   if (*offset > limit)
57     *offset = std::u16string::npos;
58 }
59 
60 // static
UnadjustOffsets(const Adjustments & adjustments,std::vector<size_t> * offsets_for_unadjustment)61 void OffsetAdjuster::UnadjustOffsets(
62     const Adjustments& adjustments,
63     std::vector<size_t>* offsets_for_unadjustment) {
64   if (!offsets_for_unadjustment || adjustments.empty())
65     return;
66   for (auto& i : *offsets_for_unadjustment)
67     UnadjustOffset(adjustments, &i);
68 }
69 
70 // static
UnadjustOffset(const Adjustments & adjustments,size_t * offset)71 void OffsetAdjuster::UnadjustOffset(const Adjustments& adjustments,
72                                     size_t* offset) {
73   if (*offset == std::u16string::npos)
74     return;
75   size_t original_lengths = 0;
76   size_t output_lengths = 0;
77   for (const auto& i : adjustments) {
78     if (*offset + original_lengths - output_lengths <= i.original_offset)
79       break;
80     original_lengths += i.original_length;
81     output_lengths += i.output_length;
82     if ((*offset + original_lengths - output_lengths) <
83         (i.original_offset + i.original_length)) {
84       *offset = std::u16string::npos;
85       return;
86     }
87   }
88   *offset += original_lengths - output_lengths;
89 }
90 
91 // static
MergeSequentialAdjustments(const Adjustments & first_adjustments,Adjustments * adjustments_on_adjusted_string)92 void OffsetAdjuster::MergeSequentialAdjustments(
93     const Adjustments& first_adjustments,
94     Adjustments* adjustments_on_adjusted_string) {
95   auto adjusted_iter = adjustments_on_adjusted_string->begin();
96   auto first_iter = first_adjustments.begin();
97   // Simultaneously iterate over all |adjustments_on_adjusted_string| and
98   // |first_adjustments|, pushing adjustments at the end of
99   // |adjustments_builder| as we go.  |shift| keeps track of the current number
100   // of characters collapsed by |first_adjustments| up to this point.
101   // |currently_collapsing| keeps track of the number of characters collapsed by
102   // |first_adjustments| into the current |adjusted_iter|'s length.  These are
103   // characters that will change |shift| as soon as we're done processing the
104   // current |adjusted_iter|; they are not yet reflected in |shift|.
105   size_t shift = 0;
106   size_t currently_collapsing = 0;
107   // While we *could* update |adjustments_on_adjusted_string| in place by
108   // inserting new adjustments into the middle, we would be repeatedly calling
109   // |std::vector::insert|. That would cost O(n) time per insert, relative to
110   // distance from end of the string.  By instead allocating
111   // |adjustments_builder| and calling |std::vector::push_back|, we only pay
112   // amortized constant time per push. We are trading space for time.
113   Adjustments adjustments_builder;
114   while (adjusted_iter != adjustments_on_adjusted_string->end()) {
115     if ((first_iter == first_adjustments.end()) ||
116         ((adjusted_iter->original_offset + shift +
117           adjusted_iter->original_length) <= first_iter->original_offset)) {
118       // Entire |adjusted_iter| (accounting for its shift and including its
119       // whole original length) comes before |first_iter|.
120       //
121       // Correct the offset at |adjusted_iter| and move onto the next
122       // adjustment that needs revising.
123       adjusted_iter->original_offset += shift;
124       shift += currently_collapsing;
125       currently_collapsing = 0;
126       adjustments_builder.push_back(*adjusted_iter);
127       ++adjusted_iter;
128     } else if ((adjusted_iter->original_offset + shift) >
129                first_iter->original_offset) {
130       // |first_iter| comes before the |adjusted_iter| (as adjusted by |shift|).
131 
132       // It's not possible for the adjustments to overlap.  (It shouldn't
133       // be possible that we have an |adjusted_iter->original_offset| that,
134       // when adjusted by the computed |shift|, is in the middle of
135       // |first_iter|'s output's length.  After all, that would mean the
136       // current adjustment_on_adjusted_string somehow points to an offset
137       // that was supposed to have been eliminated by the first set of
138       // adjustments.)
139       DCHECK_LE(first_iter->original_offset + first_iter->output_length,
140                 adjusted_iter->original_offset + shift);
141 
142       // Add the |first_iter| to the full set of adjustments.
143       shift += first_iter->original_length - first_iter->output_length;
144       adjustments_builder.push_back(*first_iter);
145       ++first_iter;
146     } else {
147       // The first adjustment adjusted something that then got further adjusted
148       // by the second set of adjustments.  In other words, |first_iter| points
149       // to something in the range covered by |adjusted_iter|'s length (after
150       // accounting for |shift|).  Precisely,
151       //   adjusted_iter->original_offset + shift
152       //   <=
153       //   first_iter->original_offset
154       //   <=
155       //   adjusted_iter->original_offset + shift +
156       //       adjusted_iter->original_length
157       // Modify the current |adjusted_iter| to include whatever collapsing
158       // happened in |first_iter|, then advance to the next |first_adjustments|
159       // because we dealt with the current one.
160 
161       // This function does not know how to deal with a string that expands and
162       // then gets modified, only strings that collapse and then get modified.
163       DCHECK_GT(first_iter->original_length, first_iter->output_length);
164       const size_t collapse =
165           first_iter->original_length - first_iter->output_length;
166       adjusted_iter->original_length += collapse;
167       currently_collapsing += collapse;
168       ++first_iter;
169     }
170   }
171   DCHECK_EQ(0u, currently_collapsing);
172   if (first_iter != first_adjustments.end()) {
173     // Only first adjustments are left.  These do not need to be modified.
174     // (Their offsets are already correct with respect to the original string.)
175     // Append them all.
176     DCHECK(adjusted_iter == adjustments_on_adjusted_string->end());
177     adjustments_builder.insert(adjustments_builder.end(), first_iter,
178                                first_adjustments.end());
179   }
180   *adjustments_on_adjusted_string = std::move(adjustments_builder);
181 }
182 
183 // Converts the given source Unicode character type to the given destination
184 // Unicode character type as a STL string. The given input buffer and size
185 // determine the source, and the given output STL string will be replaced by
186 // the result.  If non-NULL, |adjustments| is set to reflect the all the
187 // alterations to the string that are not one-character-to-one-character.
188 // It will always be sorted by increasing offset.
189 template<typename SrcChar, typename DestStdString>
ConvertUnicode(const SrcChar * src,size_t src_len,DestStdString * output,OffsetAdjuster::Adjustments * adjustments)190 bool ConvertUnicode(const SrcChar* src,
191                     size_t src_len,
192                     DestStdString* output,
193                     OffsetAdjuster::Adjustments* adjustments) {
194   if (adjustments)
195     adjustments->clear();
196   bool success = true;
197   for (size_t i = 0; i < src_len; i++) {
198     base_icu::UChar32 code_point;
199     size_t original_i = i;
200     size_t chars_written = 0;
201     if (ReadUnicodeCharacter(src, src_len, &i, &code_point)) {
202       chars_written = WriteUnicodeCharacter(code_point, output);
203     } else {
204       chars_written = WriteUnicodeCharacter(0xFFFD, output);
205       success = false;
206     }
207 
208     // Only bother writing an adjustment if this modification changed the
209     // length of this character.
210     // NOTE: ReadUnicodeCharacter() adjusts |i| to point _at_ the last
211     // character read, not after it (so that incrementing it in the loop
212     // increment will place it at the right location), so we need to account
213     // for that in determining the amount that was read.
214     if (adjustments && ((i - original_i + 1) != chars_written)) {
215       adjustments->push_back(OffsetAdjuster::Adjustment(
216           original_i, i - original_i + 1, chars_written));
217     }
218   }
219   return success;
220 }
221 
UTF8ToUTF16WithAdjustments(const char * src,size_t src_len,std::u16string * output,base::OffsetAdjuster::Adjustments * adjustments)222 bool UTF8ToUTF16WithAdjustments(
223     const char* src,
224     size_t src_len,
225     std::u16string* output,
226     base::OffsetAdjuster::Adjustments* adjustments) {
227   PrepareForUTF16Or32Output(src, src_len, output);
228   return ConvertUnicode(src, src_len, output, adjustments);
229 }
230 
UTF8ToUTF16WithAdjustments(const base::StringPiece & utf8,base::OffsetAdjuster::Adjustments * adjustments)231 std::u16string UTF8ToUTF16WithAdjustments(
232     const base::StringPiece& utf8,
233     base::OffsetAdjuster::Adjustments* adjustments) {
234   std::u16string result;
235   UTF8ToUTF16WithAdjustments(utf8.data(), utf8.length(), &result, adjustments);
236   return result;
237 }
238 
UTF8ToUTF16AndAdjustOffsets(const base::StringPiece & utf8,std::vector<size_t> * offsets_for_adjustment)239 std::u16string UTF8ToUTF16AndAdjustOffsets(
240     const base::StringPiece& utf8,
241     std::vector<size_t>* offsets_for_adjustment) {
242   for (size_t& offset : *offsets_for_adjustment) {
243     if (offset > utf8.length())
244       offset = std::u16string::npos;
245   }
246   OffsetAdjuster::Adjustments adjustments;
247   std::u16string result = UTF8ToUTF16WithAdjustments(utf8, &adjustments);
248   OffsetAdjuster::AdjustOffsets(adjustments, offsets_for_adjustment);
249   return result;
250 }
251 
UTF16ToUTF8AndAdjustOffsets(const base::StringPiece16 & utf16,std::vector<size_t> * offsets_for_adjustment)252 std::string UTF16ToUTF8AndAdjustOffsets(
253     const base::StringPiece16& utf16,
254     std::vector<size_t>* offsets_for_adjustment) {
255   for (size_t& offset : *offsets_for_adjustment) {
256     if (offset > utf16.length())
257       offset = std::u16string::npos;
258   }
259   std::string result;
260   PrepareForUTF8Output(utf16.data(), utf16.length(), &result);
261   OffsetAdjuster::Adjustments adjustments;
262   ConvertUnicode(utf16.data(), utf16.length(), &result, &adjustments);
263   OffsetAdjuster::AdjustOffsets(adjustments, offsets_for_adjustment);
264   return result;
265 }
266 
267 }  // namespace base
268