1 // Copyright 2011 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "base/i18n/break_iterator.h"
6
7 #include <stdint.h>
8 #include <ostream>
9 #include <string_view>
10
11 #include "base/check.h"
12 #include "base/lazy_instance.h"
13 #include "base/memory/raw_ptr.h"
14 #include "base/notreached.h"
15 #include "base/synchronization/lock.h"
16 #include "third_party/icu/source/common/unicode/ubrk.h"
17 #include "third_party/icu/source/common/unicode/uchar.h"
18 #include "third_party/icu/source/common/unicode/ustring.h"
19
20 namespace base {
21 namespace i18n {
22
23 namespace {
24
// The typical usage pattern of a break iterator is to create it, use it and
// then destroy it. The following cache supports multiple break iterators in
// the same thread and also avoids creating a break iterator many times. For
// each kind of break iterator (character, word, line and sentence, but NOT
// rule), we keep one instance in |main_| and lease it out. If some other code
// requests a lease before |main_| is returned, we create a new instance of
// the iterator. This keeps at most 4 break iterators (one for each kind)
// unreleased until program destruction time.
33 template <UBreakIteratorType break_type>
34 class DefaultLocaleBreakIteratorCache {
35 public:
DefaultLocaleBreakIteratorCache()36 DefaultLocaleBreakIteratorCache() {
37 main_ = UBreakIteratorPtr(
38 ubrk_open(break_type, nullptr, nullptr, 0, &main_status_));
39 if (U_FAILURE(main_status_)) {
40 NOTREACHED() << "ubrk_open failed for type " << break_type
41 << " with error " << main_status_;
42 }
43 }
Lease(UErrorCode & status)44 UBreakIteratorPtr Lease(UErrorCode& status) {
45 if (U_FAILURE(status)) {
46 return nullptr;
47 }
48 if (U_FAILURE(main_status_)) {
49 status = main_status_;
50 return nullptr;
51 }
52 {
53 AutoLock scoped_lock(lock_);
54 if (main_) {
55 return std::move(main_);
56 }
57 }
58
59 // The main_ is already leased out to some other places, return a new
60 // object instead.
61 UBreakIteratorPtr result(
62 ubrk_open(break_type, nullptr, nullptr, 0, &status));
63 if (U_FAILURE(status)) {
64 NOTREACHED() << "ubrk_open failed for type " << break_type
65 << " with error " << status;
66 }
67 return result;
68 }
69
Return(UBreakIteratorPtr item)70 void Return(UBreakIteratorPtr item) {
71 AutoLock scoped_lock(lock_);
72 if (!main_) {
73 main_ = std::move(item);
74 }
75 }
76
77 private:
78 UErrorCode main_status_ = U_ZERO_ERROR;
79 UBreakIteratorPtr main_ GUARDED_BY(lock_);
80 Lock lock_;
81 };
82
83 static LazyInstance<DefaultLocaleBreakIteratorCache<UBRK_CHARACTER>>::Leaky
84 char_break_cache = LAZY_INSTANCE_INITIALIZER;
85 static LazyInstance<DefaultLocaleBreakIteratorCache<UBRK_WORD>>::Leaky
86 word_break_cache = LAZY_INSTANCE_INITIALIZER;
87 static LazyInstance<DefaultLocaleBreakIteratorCache<UBRK_SENTENCE>>::Leaky
88 sentence_break_cache = LAZY_INSTANCE_INITIALIZER;
89 static LazyInstance<DefaultLocaleBreakIteratorCache<UBRK_LINE>>::Leaky
90 line_break_cache = LAZY_INSTANCE_INITIALIZER;
91
92 } // namespace
93
operator ()(UBreakIterator * ptr)94 void UBreakIteratorDeleter::operator()(UBreakIterator* ptr) {
95 if (ptr) {
96 ubrk_close(ptr);
97 }
98 }
99
BreakIterator(std::u16string_view str,BreakType break_type)100 BreakIterator::BreakIterator(std::u16string_view str, BreakType break_type)
101 : string_(str), break_type_(break_type) {}
102
BreakIterator(std::u16string_view str,const std::u16string & rules)103 BreakIterator::BreakIterator(std::u16string_view str,
104 const std::u16string& rules)
105 : string_(str), rules_(rules), break_type_(RULE_BASED) {}
106
~BreakIterator()107 BreakIterator::~BreakIterator() {
108 switch (break_type_) {
109 case RULE_BASED:
110 return;
111 case BREAK_CHARACTER:
112 char_break_cache.Pointer()->Return(std::move(iter_));
113 return;
114 case BREAK_WORD:
115 word_break_cache.Pointer()->Return(std::move(iter_));
116 return;
117 case BREAK_SENTENCE:
118 sentence_break_cache.Pointer()->Return(std::move(iter_));
119 return;
120 case BREAK_LINE:
121 case BREAK_NEWLINE:
122 line_break_cache.Pointer()->Return(std::move(iter_));
123 return;
124 }
125 }
126
Init()127 bool BreakIterator::Init() {
128 UErrorCode status = U_ZERO_ERROR;
129 UParseError parse_error;
130 switch (break_type_) {
131 case BREAK_CHARACTER:
132 iter_ = char_break_cache.Pointer()->Lease(status);
133 break;
134 case BREAK_WORD:
135 iter_ = word_break_cache.Pointer()->Lease(status);
136 break;
137 case BREAK_SENTENCE:
138 iter_ = sentence_break_cache.Pointer()->Lease(status);
139 break;
140 case BREAK_LINE:
141 case BREAK_NEWLINE:
142 iter_ = line_break_cache.Pointer()->Lease(status);
143 break;
144 case RULE_BASED:
145 iter_ = UBreakIteratorPtr(
146 ubrk_openRules(rules_.c_str(), static_cast<int32_t>(rules_.length()),
147 nullptr, 0, &parse_error, &status));
148 if (U_FAILURE(status)) {
149 NOTREACHED() << "ubrk_openRules failed to parse rule string at line "
150 << parse_error.line << ", offset " << parse_error.offset;
151 }
152 break;
153 }
154
155 if (U_FAILURE(status) || iter_ == nullptr) {
156 return false;
157 }
158
159 if (string_.data() != nullptr) {
160 ubrk_setText(iter_.get(), string_.data(),
161 static_cast<int32_t>(string_.size()), &status);
162 if (U_FAILURE(status)) {
163 return false;
164 }
165 }
166
167 // Move the iterator to the beginning of the string.
168 ubrk_first(iter_.get());
169 return true;
170 }
171
Advance()172 bool BreakIterator::Advance() {
173 int32_t pos;
174 int32_t status;
175 prev_ = pos_;
176 switch (break_type_) {
177 case BREAK_CHARACTER:
178 case BREAK_WORD:
179 case BREAK_LINE:
180 case BREAK_SENTENCE:
181 case RULE_BASED:
182 pos = ubrk_next(iter_.get());
183 if (pos == UBRK_DONE) {
184 pos_ = npos;
185 return false;
186 }
187 pos_ = static_cast<size_t>(pos);
188 return true;
189 case BREAK_NEWLINE:
190 do {
191 pos = ubrk_next(iter_.get());
192 if (pos == UBRK_DONE)
193 break;
194 pos_ = static_cast<size_t>(pos);
195 status = ubrk_getRuleStatus(iter_.get());
196 } while (status >= UBRK_LINE_SOFT && status < UBRK_LINE_SOFT_LIMIT);
197 if (pos == UBRK_DONE && prev_ == pos_) {
198 pos_ = npos;
199 return false;
200 }
201 return true;
202 }
203 }
204
SetText(std::u16string_view text)205 bool BreakIterator::SetText(std::u16string_view text) {
206 UErrorCode status = U_ZERO_ERROR;
207 ubrk_setText(iter_.get(), text.data(), text.length(), &status);
208 pos_ = 0; // implicit when ubrk_setText is done
209 prev_ = npos;
210 if (U_FAILURE(status)) {
211 NOTREACHED() << "ubrk_setText failed";
212 return false;
213 }
214 string_ = text;
215 return true;
216 }
217
IsWord() const218 bool BreakIterator::IsWord() const {
219 return GetWordBreakStatus() == IS_WORD_BREAK;
220 }
221
GetWordBreakStatus() const222 BreakIterator::WordBreakStatus BreakIterator::GetWordBreakStatus() const {
223 int32_t status = ubrk_getRuleStatus(iter_.get());
224 if (break_type_ != BREAK_WORD && break_type_ != RULE_BASED)
225 return IS_LINE_OR_CHAR_BREAK;
226 // In ICU 60, trying to advance past the end of the text does not change
227 // |status| so that |pos_| has to be checked as well as |status|.
228 // See http://bugs.icu-project.org/trac/ticket/13447 .
229 return (status == UBRK_WORD_NONE || pos_ == npos) ? IS_SKIPPABLE_WORD
230 : IS_WORD_BREAK;
231 }
232
IsEndOfWord(size_t position) const233 bool BreakIterator::IsEndOfWord(size_t position) const {
234 if (break_type_ != BREAK_WORD && break_type_ != RULE_BASED)
235 return false;
236
237 UBool boundary = ubrk_isBoundary(iter_.get(), static_cast<int32_t>(position));
238 int32_t status = ubrk_getRuleStatus(iter_.get());
239 return (!!boundary && status != UBRK_WORD_NONE);
240 }
241
IsStartOfWord(size_t position) const242 bool BreakIterator::IsStartOfWord(size_t position) const {
243 if (break_type_ != BREAK_WORD && break_type_ != RULE_BASED)
244 return false;
245
246 UBool boundary = ubrk_isBoundary(iter_.get(), static_cast<int32_t>(position));
247 ubrk_next(iter_.get());
248 int32_t next_status = ubrk_getRuleStatus(iter_.get());
249 return (!!boundary && next_status != UBRK_WORD_NONE);
250 }
251
IsSentenceBoundary(size_t position) const252 bool BreakIterator::IsSentenceBoundary(size_t position) const {
253 if (break_type_ != BREAK_SENTENCE && break_type_ != RULE_BASED)
254 return false;
255
256 return !!ubrk_isBoundary(iter_.get(), static_cast<int32_t>(position));
257 }
258
IsGraphemeBoundary(size_t position) const259 bool BreakIterator::IsGraphemeBoundary(size_t position) const {
260 if (break_type_ != BREAK_CHARACTER)
261 return false;
262
263 return !!ubrk_isBoundary(iter_.get(), static_cast<int32_t>(position));
264 }
265
GetString() const266 std::u16string BreakIterator::GetString() const {
267 return std::u16string(GetStringPiece());
268 }
269
GetStringPiece() const270 std::u16string_view BreakIterator::GetStringPiece() const {
271 DCHECK(prev_ != npos && pos_ != npos);
272 return string_.substr(prev_, pos_ - prev_);
273 }
274
275 } // namespace i18n
276 } // namespace base
277