xref: /aosp_15_r20/external/cronet/base/i18n/break_iterator.cc (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1 // Copyright 2011 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "base/i18n/break_iterator.h"
6 
7 #include <stdint.h>
8 #include <ostream>
9 #include <string_view>
10 
11 #include "base/check.h"
12 #include "base/lazy_instance.h"
13 #include "base/memory/raw_ptr.h"
14 #include "base/notreached.h"
15 #include "base/synchronization/lock.h"
16 #include "third_party/icu/source/common/unicode/ubrk.h"
17 #include "third_party/icu/source/common/unicode/uchar.h"
18 #include "third_party/icu/source/common/unicode/ustring.h"
19 
20 namespace base {
21 namespace i18n {
22 
23 namespace {
24 
// The usual usage pattern of a break iterator is: create, use, destroy.
// The following cache supports multiple break iterators on the same thread
// while avoiding creating a break iterator many times. For each kind of break
// iterator (character, word, line and sentence, but NOT rule), we keep one of
// them in |main_| and lease it out. If some other code requests a lease
// before |main_| is returned, we create a new instance of the iterator.
// This keeps at most 4 break iterators (one for each kind) unreleased until
// program destruction time.
33 template <UBreakIteratorType break_type>
34 class DefaultLocaleBreakIteratorCache {
35  public:
DefaultLocaleBreakIteratorCache()36   DefaultLocaleBreakIteratorCache() {
37     main_ = UBreakIteratorPtr(
38         ubrk_open(break_type, nullptr, nullptr, 0, &main_status_));
39     if (U_FAILURE(main_status_)) {
40       NOTREACHED() << "ubrk_open failed for type " << break_type
41                    << " with error " << main_status_;
42     }
43   }
Lease(UErrorCode & status)44   UBreakIteratorPtr Lease(UErrorCode& status) {
45     if (U_FAILURE(status)) {
46       return nullptr;
47     }
48     if (U_FAILURE(main_status_)) {
49       status = main_status_;
50       return nullptr;
51     }
52     {
53       AutoLock scoped_lock(lock_);
54       if (main_) {
55         return std::move(main_);
56       }
57     }
58 
59     // The main_ is already leased out to some other places, return a new
60     // object instead.
61     UBreakIteratorPtr result(
62         ubrk_open(break_type, nullptr, nullptr, 0, &status));
63     if (U_FAILURE(status)) {
64       NOTREACHED() << "ubrk_open failed for type " << break_type
65                    << " with error " << status;
66     }
67     return result;
68   }
69 
Return(UBreakIteratorPtr item)70   void Return(UBreakIteratorPtr item) {
71     AutoLock scoped_lock(lock_);
72     if (!main_) {
73       main_ = std::move(item);
74     }
75   }
76 
77  private:
78   UErrorCode main_status_ = U_ZERO_ERROR;
79   UBreakIteratorPtr main_ GUARDED_BY(lock_);
80   Lock lock_;
81 };
82 
// One lazily constructed, leaky cache per built-in break iterator kind.
// BreakIterator::Init() leases from these and ~BreakIterator() returns to
// them; BREAK_LINE and BREAK_NEWLINE both use |line_break_cache|.
static LazyInstance<DefaultLocaleBreakIteratorCache<UBRK_CHARACTER>>::Leaky
    char_break_cache = LAZY_INSTANCE_INITIALIZER;
static LazyInstance<DefaultLocaleBreakIteratorCache<UBRK_WORD>>::Leaky
    word_break_cache = LAZY_INSTANCE_INITIALIZER;
static LazyInstance<DefaultLocaleBreakIteratorCache<UBRK_SENTENCE>>::Leaky
    sentence_break_cache = LAZY_INSTANCE_INITIALIZER;
static LazyInstance<DefaultLocaleBreakIteratorCache<UBRK_LINE>>::Leaky
    line_break_cache = LAZY_INSTANCE_INITIALIZER;
91 
92 }  // namespace
93 
operator ()(UBreakIterator * ptr)94 void UBreakIteratorDeleter::operator()(UBreakIterator* ptr) {
95   if (ptr) {
96     ubrk_close(ptr);
97   }
98 }
99 
// Records the text view and the requested break type. No ICU iterator is
// obtained here; that happens in Init().
BreakIterator::BreakIterator(std::u16string_view str, BreakType break_type)
    : string_(str), break_type_(break_type) {}
102 
// Records the text view and a custom rule string, and marks the iterator as
// RULE_BASED. The rules are only handed to ICU later, in Init().
BreakIterator::BreakIterator(std::u16string_view str,
                             const std::u16string& rules)
    : string_(str), rules_(rules), break_type_(RULE_BASED) {}
106 
~BreakIterator()107 BreakIterator::~BreakIterator() {
108   switch (break_type_) {
109     case RULE_BASED:
110       return;
111     case BREAK_CHARACTER:
112       char_break_cache.Pointer()->Return(std::move(iter_));
113       return;
114     case BREAK_WORD:
115       word_break_cache.Pointer()->Return(std::move(iter_));
116       return;
117     case BREAK_SENTENCE:
118       sentence_break_cache.Pointer()->Return(std::move(iter_));
119       return;
120     case BREAK_LINE:
121     case BREAK_NEWLINE:
122       line_break_cache.Pointer()->Return(std::move(iter_));
123       return;
124   }
125 }
126 
Init()127 bool BreakIterator::Init() {
128   UErrorCode status = U_ZERO_ERROR;
129   UParseError parse_error;
130   switch (break_type_) {
131     case BREAK_CHARACTER:
132       iter_ = char_break_cache.Pointer()->Lease(status);
133       break;
134     case BREAK_WORD:
135       iter_ = word_break_cache.Pointer()->Lease(status);
136       break;
137     case BREAK_SENTENCE:
138       iter_ = sentence_break_cache.Pointer()->Lease(status);
139       break;
140     case BREAK_LINE:
141     case BREAK_NEWLINE:
142       iter_ = line_break_cache.Pointer()->Lease(status);
143       break;
144     case RULE_BASED:
145       iter_ = UBreakIteratorPtr(
146           ubrk_openRules(rules_.c_str(), static_cast<int32_t>(rules_.length()),
147                          nullptr, 0, &parse_error, &status));
148       if (U_FAILURE(status)) {
149         NOTREACHED() << "ubrk_openRules failed to parse rule string at line "
150                      << parse_error.line << ", offset " << parse_error.offset;
151       }
152       break;
153   }
154 
155   if (U_FAILURE(status) || iter_ == nullptr) {
156     return false;
157   }
158 
159   if (string_.data() != nullptr) {
160     ubrk_setText(iter_.get(), string_.data(),
161                  static_cast<int32_t>(string_.size()), &status);
162     if (U_FAILURE(status)) {
163       return false;
164     }
165   }
166 
167   // Move the iterator to the beginning of the string.
168   ubrk_first(iter_.get());
169   return true;
170 }
171 
Advance()172 bool BreakIterator::Advance() {
173   int32_t pos;
174   int32_t status;
175   prev_ = pos_;
176   switch (break_type_) {
177     case BREAK_CHARACTER:
178     case BREAK_WORD:
179     case BREAK_LINE:
180     case BREAK_SENTENCE:
181     case RULE_BASED:
182       pos = ubrk_next(iter_.get());
183       if (pos == UBRK_DONE) {
184         pos_ = npos;
185         return false;
186       }
187       pos_ = static_cast<size_t>(pos);
188       return true;
189     case BREAK_NEWLINE:
190       do {
191         pos = ubrk_next(iter_.get());
192         if (pos == UBRK_DONE)
193           break;
194         pos_ = static_cast<size_t>(pos);
195         status = ubrk_getRuleStatus(iter_.get());
196       } while (status >= UBRK_LINE_SOFT && status < UBRK_LINE_SOFT_LIMIT);
197       if (pos == UBRK_DONE && prev_ == pos_) {
198         pos_ = npos;
199         return false;
200       }
201       return true;
202   }
203 }
204 
SetText(std::u16string_view text)205 bool BreakIterator::SetText(std::u16string_view text) {
206   UErrorCode status = U_ZERO_ERROR;
207   ubrk_setText(iter_.get(), text.data(), text.length(), &status);
208   pos_ = 0;  // implicit when ubrk_setText is done
209   prev_ = npos;
210   if (U_FAILURE(status)) {
211     NOTREACHED() << "ubrk_setText failed";
212     return false;
213   }
214   string_ = text;
215   return true;
216 }
217 
IsWord() const218 bool BreakIterator::IsWord() const {
219   return GetWordBreakStatus() == IS_WORD_BREAK;
220 }
221 
GetWordBreakStatus() const222 BreakIterator::WordBreakStatus BreakIterator::GetWordBreakStatus() const {
223   int32_t status = ubrk_getRuleStatus(iter_.get());
224   if (break_type_ != BREAK_WORD && break_type_ != RULE_BASED)
225     return IS_LINE_OR_CHAR_BREAK;
226   // In ICU 60, trying to advance past the end of the text does not change
227   // |status| so that |pos_| has to be checked as well as |status|.
228   // See http://bugs.icu-project.org/trac/ticket/13447 .
229   return (status == UBRK_WORD_NONE || pos_ == npos) ? IS_SKIPPABLE_WORD
230                                                     : IS_WORD_BREAK;
231 }
232 
IsEndOfWord(size_t position) const233 bool BreakIterator::IsEndOfWord(size_t position) const {
234   if (break_type_ != BREAK_WORD && break_type_ != RULE_BASED)
235     return false;
236 
237   UBool boundary = ubrk_isBoundary(iter_.get(), static_cast<int32_t>(position));
238   int32_t status = ubrk_getRuleStatus(iter_.get());
239   return (!!boundary && status != UBRK_WORD_NONE);
240 }
241 
IsStartOfWord(size_t position) const242 bool BreakIterator::IsStartOfWord(size_t position) const {
243   if (break_type_ != BREAK_WORD && break_type_ != RULE_BASED)
244     return false;
245 
246   UBool boundary = ubrk_isBoundary(iter_.get(), static_cast<int32_t>(position));
247   ubrk_next(iter_.get());
248   int32_t next_status = ubrk_getRuleStatus(iter_.get());
249   return (!!boundary && next_status != UBRK_WORD_NONE);
250 }
251 
IsSentenceBoundary(size_t position) const252 bool BreakIterator::IsSentenceBoundary(size_t position) const {
253   if (break_type_ != BREAK_SENTENCE && break_type_ != RULE_BASED)
254     return false;
255 
256   return !!ubrk_isBoundary(iter_.get(), static_cast<int32_t>(position));
257 }
258 
IsGraphemeBoundary(size_t position) const259 bool BreakIterator::IsGraphemeBoundary(size_t position) const {
260   if (break_type_ != BREAK_CHARACTER)
261     return false;
262 
263   return !!ubrk_isBoundary(iter_.get(), static_cast<int32_t>(position));
264 }
265 
GetString() const266 std::u16string BreakIterator::GetString() const {
267   return std::u16string(GetStringPiece());
268 }
269 
GetStringPiece() const270 std::u16string_view BreakIterator::GetStringPiece() const {
271   DCHECK(prev_ != npos && pos_ != npos);
272   return string_.substr(prev_, pos_ - prev_);
273 }
274 
275 }  // namespace i18n
276 }  // namespace base
277