1 /*
2 * Copyright 2023 Google Inc.
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7 #include "modules/skunicode/include/SkUnicode_icu4x.h"
8
9 #include "include/core/SkSpan.h"
10 #include "include/core/SkString.h"
11 #include "include/core/SkTypes.h"
12 #include "include/private/base/SkTArray.h"
13 #include "include/private/base/SkTo.h"
14 #include "modules/skunicode/include/SkUnicode.h"
15 #include "modules/skunicode/src/SkUnicode_hardcoded.h"
16 #include "src/base/SkBitmaskEnum.h"
17 #include "src/base/SkUTF.h"
18
19 #include <ICU4XBidi.hpp>
20 #include <ICU4XCaseMapper.hpp>
21 #include <ICU4XCodePointMapData8.hpp>
22 #include <ICU4XCodePointSetData.hpp>
23 #include <ICU4XDataProvider.hpp>
24 #include <ICU4XGraphemeClusterSegmenter.hpp>
25 #include <ICU4XLineSegmenter.hpp>
26 #include <ICU4XWordSegmenter.hpp>
27
28 #include <algorithm>
29 #include <cstdint>
30 #include <memory>
31 #include <string>
32 #include <utility>
33 #include <vector>
34
35 class SkUnicode_icu4x : public SkUnicode {
36 public:
SkUnicode_icu4x()37 SkUnicode_icu4x() {
38 fLocale = ICU4XLocale::create_from_string("tr").ok().value();
39 fDataProvider = ICU4XDataProvider::create_compiled();
40 fCaseMapper = ICU4XCaseMapper::create(fDataProvider).ok().value();
41 const auto general = ICU4XCodePointMapData8::load_general_category(fDataProvider).ok().value();
42 fControls = general.get_set_for_value(/*Control*/15);
43 fWhitespaces = general.get_set_for_value(/*SpaceSeparator*/12);
44 fSpaces = general.get_set_for_value(/*SpaceSeparator*/12);
45 // TODO: u_isSpace
46 fBlanks = ICU4XCodePointSetData::load_blank(fDataProvider).ok().value();
47 fEmoji = ICU4XCodePointSetData::load_emoji(fDataProvider).ok().value();
48 fEmojiComponent = ICU4XCodePointSetData::load_emoji_component(fDataProvider).ok().value();
49 fEmojiModifier = ICU4XCodePointSetData::load_emoji_modifier(fDataProvider).ok().value();
50 fEmojiModifierBase = ICU4XCodePointSetData::load_emoji_modifier_base(fDataProvider).ok().value();
51 fEmoji = ICU4XCodePointSetData::load_emoji(fDataProvider).ok().value();
52 fRegionalIndicator = ICU4XCodePointSetData::load_regional_indicator(fDataProvider).ok().value();
53 fIdeographic = ICU4XCodePointSetData::load_ideographic(fDataProvider).ok().value();
54 fLineBreaks = ICU4XCodePointMapData8::load_line_break(fDataProvider).ok().value();
55 }
56
57 ~SkUnicode_icu4x() override = default;
58
59 void reset();
60
61 // SkUnicode properties
isControl(SkUnichar utf8)62 bool isControl(SkUnichar utf8) override { return fControls.contains(utf8); }
isWhitespace(SkUnichar utf8)63 bool isWhitespace(SkUnichar utf8) override { return fWhitespaces.contains(utf8); }
isSpace(SkUnichar utf8)64 bool isSpace(SkUnichar utf8) override { return fBlanks.contains(utf8); }
isHardBreak(SkUnichar utf8)65 bool isHardBreak(SkUnichar utf8) override {
66 auto value = fLineBreaks.get(utf8);
67 return (value == /*MandatoryBreak*/6) ||
68 (value == /*CarriageReturn*/10) ||
69 (value == /*LineFeed*/17) ||
70 (value == /*NextLine*/29);
71 }
isEmoji(SkUnichar utf8)72 bool isEmoji(SkUnichar utf8) override { return fEmoji.contains(utf8); }
isEmojiComponent(SkUnichar utf8)73 bool isEmojiComponent(SkUnichar utf8) override { return fEmojiComponent.contains(utf8); }
isEmojiModifierBase(SkUnichar utf8)74 bool isEmojiModifierBase(SkUnichar utf8) override { return fEmojiModifierBase.contains(utf8); }
isEmojiModifier(SkUnichar utf8)75 bool isEmojiModifier(SkUnichar utf8) override { return fEmojiModifier.contains(utf8); }
isRegionalIndicator(SkUnichar utf8)76 bool isRegionalIndicator(SkUnichar utf8) override { return fRegionalIndicator.contains(utf8); }
isIdeographic(SkUnichar utf8)77 bool isIdeographic(SkUnichar utf8) override { return fIdeographic.contains(utf8); }
78
79 // TODO: is there a check for tabulation
isTabulation(SkUnichar utf8)80 bool isTabulation(SkUnichar utf8) override {
81 return utf8 == '\t';
82 }
83
84 // For SkShaper
85 std::unique_ptr<SkBidiIterator> makeBidiIterator(const uint16_t text[], int count,
86 SkBidiIterator::Direction dir) override;
87 std::unique_ptr<SkBidiIterator> makeBidiIterator(const char text[],
88 int count,
89 SkBidiIterator::Direction dir) override;
90 std::unique_ptr<SkBreakIterator> makeBreakIterator(const char locale[],
91 BreakType breakType) override;
92 std::unique_ptr<SkBreakIterator> makeBreakIterator(BreakType breakType) override;
93 // For SkParagraph
getBidiRegions(const char utf8[],int utf8Units,TextDirection dir,std::vector<BidiRegion> * results)94 bool getBidiRegions(const char utf8[],
95 int utf8Units,
96 TextDirection dir,
97 std::vector<BidiRegion>* results) override {
98
99 const auto bidi = ICU4XBidi::create(fDataProvider).ok().value();
100 std::string_view string_view(utf8, utf8Units);
101 auto info = bidi.for_text(string_view, dir == TextDirection::kLTR ? 0 : 1);
102 auto currentLevel = info.level_at(0);
103 size_t start = 0;
104
105 for (size_t i = 1; i < info.size(); i++) {
106 const auto level = info.level_at(i);
107 if (level != currentLevel) {
108 (*results).emplace_back(start, i, currentLevel);
109 currentLevel = level;
110 start = i;
111 }
112 }
113 (*results).emplace_back(start, info.size(), currentLevel);
114 return true;
115 }
116
getBidiRegions(const uint16_t utf16[],int utf16Units,TextDirection dir,std::vector<BidiRegion> * results)117 bool getBidiRegions(const uint16_t utf16[],
118 int utf16Units,
119 TextDirection dir,
120 std::vector<BidiRegion>* results) {
121 auto utf8 = SkUnicode::convertUtf16ToUtf8((char16_t*)utf16, utf16Units);
122 return this->getBidiRegions(utf8.data(), utf8.size(), dir, results);
123 }
124
computeCodeUnitFlags(char utf8[],int utf8Units,bool replaceTabs,skia_private::TArray<SkUnicode::CodeUnitFlags,true> * results)125 bool computeCodeUnitFlags(char utf8[],
126 int utf8Units,
127 bool replaceTabs,
128 skia_private::TArray<SkUnicode::CodeUnitFlags, true>* results) override {
129 results->clear();
130 results->push_back_n(utf8Units + 1, CodeUnitFlags::kNoCodeUnitFlag);
131 this->markLineBreaks(utf8, utf8Units, /*hardLineBreaks=*/false, results);
132 this->markHardLineBreaksHack(utf8, utf8Units, results);
133 this->markGraphemes(utf8, utf8Units, results);
134 this->markCharacters(utf8, utf8Units, replaceTabs, results);
135 return true;
136 }
137
computeCodeUnitFlags(char16_t utf16[],int utf16Units,bool replaceTabs,skia_private::TArray<SkUnicode::CodeUnitFlags,true> * results)138 bool computeCodeUnitFlags(char16_t utf16[], int utf16Units, bool replaceTabs,
139 skia_private::TArray<SkUnicode::CodeUnitFlags, true>* results) override {
140 SkASSERT(false);
141 return true;
142 }
143
getWords(const char utf8[],int utf8Units,const char * locale,std::vector<Position> * results)144 bool getWords(const char utf8[],
145 int utf8Units,
146 const char* locale,
147 std::vector<Position>* results) override {
148 auto utf16 = SkUnicode::convertUtf8ToUtf16(utf8, utf8Units);
149 const diplomat::span<const uint16_t> span((uint16_t*)utf16.data(), utf16.size());
150 const auto segmenter = ICU4XWordSegmenter::create_dictionary(fDataProvider).ok().value();
151 auto iterator = segmenter.segment_utf16(span);
152 while (true) {
153 int32_t breakpoint = iterator.next();
154 if (breakpoint == -1) {
155 break;
156 }
157 results->emplace_back(breakpoint);
158 }
159 return true;
160 }
161
toUpper(const SkString & str)162 SkString toUpper(const SkString& str) override {
163 return toUpper(str, "und");
164 }
165
toUpper(const SkString & str,const char * localeStr)166 SkString toUpper(const SkString& str, const char* localeStr) override {
167 auto locale = ICU4XLocale::create_from_string(localeStr).ok().value();
168 std::string std_string(str.data(), str.size());
169 // TODO: upper case
170 auto result = fCaseMapper.uppercase(std_string, locale).ok().value();
171 return SkString(result.data(), result.size());
172 }
173
reorderVisual(const BidiLevel runLevels[],int levelsCount,int32_t logicalFromVisual[])174 void reorderVisual(const BidiLevel runLevels[],
175 int levelsCount,
176 int32_t logicalFromVisual[]) override {
177
178 const auto bidi = ICU4XBidi::create(fDataProvider).ok().value();
179 const diplomat::span<const uint8_t> levels(&runLevels[0], levelsCount);
180 auto map = bidi.reorder_visual(levels);
181 SkASSERT(levelsCount == map.len());
182 std::vector<int32_t> results;
183 for (size_t i = 0; i < map.len(); i++) {
184 auto level = map.get(i);
185 logicalFromVisual[i] = SkToS32(level);
186 }
187 }
188
189 private:
190 friend class SkBreakIterator_icu4x;
191 friend class SkBidiIterator_icu4x;
192
markHardLineBreaksHack(char utf8[],int utf8Units,skia_private::TArray<SkUnicode::CodeUnitFlags,true> * results)193 bool markHardLineBreaksHack(char utf8[],
194 int utf8Units,
195 skia_private::TArray<SkUnicode::CodeUnitFlags, true>* results) {
196 const char* end = utf8 + utf8Units;
197 const char* ch = utf8;
198 while (ch < end) {
199 auto unichar = SkUTF::NextUTF8(&ch, end);
200 if (this->isHardBreak(unichar)) {
201 (*results)[ch - utf8] |= CodeUnitFlags::kHardLineBreakBefore;
202 }
203 }
204 return true;
205 }
206
getChar32(const char * pointer,const char * end)207 SkUnichar getChar32(const char* pointer, const char* end) {
208 if (pointer < end) {
209 return SkUTF::NextUTF8(&pointer, end);
210 }
211 return -1;
212 }
213
markLineBreaks(char utf8[],int utf8Units,bool hardLineBreaks,skia_private::TArray<SkUnicode::CodeUnitFlags,true> * results)214 bool markLineBreaks(char utf8[],
215 int utf8Units,
216 bool hardLineBreaks,
217 skia_private::TArray<SkUnicode::CodeUnitFlags, true>* results) {
218 if (utf8Units == 0) {
219 return true;
220 }
221 // TODO: Remove hard line break hack and detect it here
222 SkASSERT(!hardLineBreaks);
223 const auto lineBreakingOptions = hardLineBreaks
224 ? ICU4XLineBreakOptionsV1{ICU4XLineBreakStrictness::Strict, ICU4XLineBreakWordOption::Normal}
225 : ICU4XLineBreakOptionsV1{ICU4XLineBreakStrictness::Loose, ICU4XLineBreakWordOption::Normal};
226 const auto segmenter = ICU4XLineSegmenter::create_auto_with_options_v1(fDataProvider, lineBreakingOptions).ok().value();
227 std::string_view string_view(utf8, utf8Units);
228 auto iterator = segmenter.segment_utf8(string_view);
229
230 while (true) {
231 int32_t lineBreak = iterator.next();
232 if (lineBreak == -1) {
233 break;
234 }
235 if (hardLineBreaks) {
236 (*results)[lineBreak] |= CodeUnitFlags::kHardLineBreakBefore;
237 } else {
238 (*results)[lineBreak] |= CodeUnitFlags::kSoftLineBreakBefore;
239 }
240 }
241 if (!hardLineBreaks) {
242 (*results)[0] |= CodeUnitFlags::kSoftLineBreakBefore;
243 (*results)[utf8Units] |= CodeUnitFlags::kSoftLineBreakBefore;
244 }
245 return true;
246 }
247
markGraphemes(const char utf8[],int utf8Units,skia_private::TArray<SkUnicode::CodeUnitFlags,true> * results)248 bool markGraphemes(const char utf8[],
249 int utf8Units,
250 skia_private::TArray<SkUnicode::CodeUnitFlags, true>* results) {
251 const auto segmenter = ICU4XGraphemeClusterSegmenter::create(fDataProvider).ok().value();
252 std::string_view string_view(utf8, utf8Units);
253 auto iterator = segmenter.segment_utf8(string_view);
254 while (true) {
255 int32_t graphemeStart = iterator.next();
256 if (graphemeStart == -1) {
257 break;
258 }
259 (*results)[graphemeStart] |= CodeUnitFlags::kGraphemeStart;
260 }
261 return true;
262 }
263
markCharacters(char utf8[],int utf8Units,bool replaceTabs,skia_private::TArray<SkUnicode::CodeUnitFlags,true> * results)264 bool markCharacters(char utf8[],
265 int utf8Units,
266 bool replaceTabs,
267 skia_private::TArray<SkUnicode::CodeUnitFlags, true>* results) {
268 const char* current = utf8;
269 const char* end = utf8 + utf8Units;
270 while (current < end) {
271 auto before = current - utf8;
272 SkUnichar unichar = SkUTF::NextUTF8(¤t, end);
273 if (unichar < 0) unichar = 0xFFFD;
274 auto after = current - utf8;
275 if (replaceTabs && SkUnicode_icu4x::isTabulation(unichar)) {
276 results->at(before) |= SkUnicode::kTabulation;
277 if (replaceTabs) {
278 unichar = ' ';
279 utf8[before] = ' ';
280 }
281 }
282 for (auto i = before; i < after; ++i) {
283 bool isHardBreak = this->isHardBreak(unichar);
284 bool isSpace = this->isSpace(unichar) || isHardBreak;
285 bool isWhitespace = this->isWhitespace(unichar) || isHardBreak;
286 if (isSpace) {
287 results->at(i) |= SkUnicode::kPartOfIntraWordBreak;
288 }
289 if (isWhitespace) {
290 results->at(i) |= SkUnicode::kPartOfWhiteSpaceBreak;
291 }
292 if (this->isControl(unichar)) {
293 results->at(i) |= SkUnicode::kControl;
294 }
295 }
296 }
297 return true;
298 }
299
getUtf8Words(const char utf8[],int utf8Units,const char * locale,std::vector<Position> * results)300 bool getUtf8Words(const char utf8[],
301 int utf8Units,
302 const char* locale,
303 std::vector<Position>* results) override {
304 SkDEBUGF("Method 'getUtf8Words' is not implemented\n");
305 return false;
306 }
307
getSentences(const char utf8[],int utf8Units,const char * locale,std::vector<SkUnicode::Position> * results)308 bool getSentences(const char utf8[],
309 int utf8Units,
310 const char* locale,
311 std::vector<SkUnicode::Position>* results) override {
312 SkDEBUGF("Method 'getSentences' is not implemented\n");
313 return false;
314 }
315
316 std::shared_ptr<std::vector<SkUnicode::BidiRegion>> fRegions;
317 ICU4XLocale fLocale;
318 ICU4XDataProvider fDataProvider;
319 ICU4XCaseMapper fCaseMapper;
320 ICU4XCodePointSetData fWhitespaces;
321 ICU4XCodePointSetData fSpaces;
322 ICU4XCodePointSetData fBlanks;
323 ICU4XCodePointSetData fEmoji;
324 ICU4XCodePointSetData fEmojiComponent;
325 ICU4XCodePointSetData fEmojiModifier;
326 ICU4XCodePointSetData fEmojiModifierBase;
327 ICU4XCodePointSetData fRegionalIndicator;
328 ICU4XCodePointSetData fIdeographic;
329 ICU4XCodePointSetData fControls;
330 ICU4XCodePointMapData8 fLineBreaks;
331 };
332
333 class SkBreakIterator_icu4x: public SkBreakIterator {
334 Position fLastResult;
335 Position fStart;
336 Position fEnd;
337 public:
SkBreakIterator_icu4x()338 SkBreakIterator_icu4x() { }
first()339 Position first() override { SkASSERT(false); return -1; }
current()340 Position current() override { SkASSERT(false); return -1; }
next()341 Position next() override { SkASSERT(false); return -1; }
status()342 Status status() override { SkASSERT(false); return -1; }
isDone()343 bool isDone() override { SkASSERT(false); return false; }
setText(const char utftext8[],int utf8Units)344 bool setText(const char utftext8[], int utf8Units) override { SkASSERT(false); return false; }
setText(const char16_t utftext16[],int utf16Units)345 bool setText(const char16_t utftext16[], int utf16Units) override { SkASSERT(false); return false; }
346 };
347
348 class SkBidiIterator_icu4x : public SkBidiIterator {
349 std::shared_ptr<std::vector<SkUnicode::BidiRegion>> fRegions;
350 public:
SkBidiIterator_icu4x(std::shared_ptr<std::vector<SkUnicode::BidiRegion>> regions)351 explicit SkBidiIterator_icu4x(std::shared_ptr<std::vector<SkUnicode::BidiRegion>> regions)
352 : fRegions(regions) { }
getLength()353 Position getLength() override { return fRegions->size(); }
getLevelAt(Position pos)354 Level getLevelAt(Position pos) override {
355 auto found = std::lower_bound(
356 fRegions->begin(),
357 fRegions->end(),
358 SkUnicode::BidiRegion(pos, pos, 0),
359 [](const SkUnicode::BidiRegion& a, const SkUnicode::BidiRegion& b) {
360 return a.start <= b.start && a.end <= b.end;
361 });
362 return found->level;
363 }
364 };
365
makeBidiIterator(const uint16_t text[],int count,SkBidiIterator::Direction dir)366 std::unique_ptr<SkBidiIterator> SkUnicode_icu4x::makeBidiIterator(const uint16_t text[], int count,
367 SkBidiIterator::Direction dir) {
368 if (fRegions) {
369 fRegions->clear();
370 } else {
371 fRegions = std::make_shared<std::vector<SkUnicode::BidiRegion>>();
372 }
373
374 if (this->getBidiRegions(text, count, dir == SkBidiIterator::Direction::kLTR ? TextDirection::kLTR : TextDirection::kRTL, fRegions.get())) {
375 return std::make_unique<SkBidiIterator_icu4x>(fRegions);
376 } else {
377 return nullptr;
378 }
379 }
380
makeBidiIterator(const char text[],int count,SkBidiIterator::Direction dir)381 std::unique_ptr<SkBidiIterator> SkUnicode_icu4x::makeBidiIterator(const char text[],
382 int count,
383 SkBidiIterator::Direction dir) {
384 if (fRegions) {
385 fRegions->clear();
386 } else {
387 fRegions = std::make_shared<std::vector<SkUnicode::BidiRegion>>();
388 }
389 if (this->getBidiRegions(text, count, dir == SkBidiIterator::Direction::kLTR ? TextDirection::kLTR : TextDirection::kRTL, fRegions.get())) {
390 return std::make_unique<SkBidiIterator_icu4x>(fRegions);
391 } else {
392 return nullptr;
393 }
394 }
395
makeBreakIterator(const char locale[],BreakType breakType)396 std::unique_ptr<SkBreakIterator> SkUnicode_icu4x::makeBreakIterator(const char locale[],
397 BreakType breakType) {
398 SkASSERT(false); return nullptr;
399 }
400
makeBreakIterator(BreakType breakType)401 std::unique_ptr<SkBreakIterator> SkUnicode_icu4x::makeBreakIterator(BreakType breakType) {
402 SkASSERT(false); return nullptr;
403 }
404
405 namespace SkUnicodes::ICU4X {
Make()406 sk_sp<SkUnicode> Make() {
407 return sk_make_sp<SkUnicode_icu4x>();
408 }
409 }
410