xref: /aosp_15_r20/external/skia/modules/skunicode/src/SkUnicode_icu4x.cpp (revision c8dee2aa9b3f27cf6c858bd81872bdeb2c07ed17)
1 /*
2 * Copyright 2023 Google Inc.
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7 #include "modules/skunicode/include/SkUnicode_icu4x.h"
8 
9 #include "include/core/SkSpan.h"
10 #include "include/core/SkString.h"
11 #include "include/core/SkTypes.h"
12 #include "include/private/base/SkTArray.h"
13 #include "include/private/base/SkTo.h"
14 #include "modules/skunicode/include/SkUnicode.h"
15 #include "modules/skunicode/src/SkUnicode_hardcoded.h"
16 #include "src/base/SkBitmaskEnum.h"
17 #include "src/base/SkUTF.h"
18 
19 #include <ICU4XBidi.hpp>
20 #include <ICU4XCaseMapper.hpp>
21 #include <ICU4XCodePointMapData8.hpp>
22 #include <ICU4XCodePointSetData.hpp>
23 #include <ICU4XDataProvider.hpp>
24 #include <ICU4XGraphemeClusterSegmenter.hpp>
25 #include <ICU4XLineSegmenter.hpp>
26 #include <ICU4XWordSegmenter.hpp>
27 
28 #include <algorithm>
29 #include <cstdint>
30 #include <memory>
31 #include <string>
32 #include <utility>
33 #include <vector>
34 
35 class SkUnicode_icu4x :  public SkUnicode {
36 public:
SkUnicode_icu4x()37     SkUnicode_icu4x() {
38         fLocale = ICU4XLocale::create_from_string("tr").ok().value();
39         fDataProvider = ICU4XDataProvider::create_compiled();
40         fCaseMapper = ICU4XCaseMapper::create(fDataProvider).ok().value();
41         const auto general = ICU4XCodePointMapData8::load_general_category(fDataProvider).ok().value();
42         fControls = general.get_set_for_value(/*Control*/15);
43         fWhitespaces = general.get_set_for_value(/*SpaceSeparator*/12);
44         fSpaces = general.get_set_for_value(/*SpaceSeparator*/12);
45         // TODO: u_isSpace
46         fBlanks = ICU4XCodePointSetData::load_blank(fDataProvider).ok().value();
47         fEmoji = ICU4XCodePointSetData::load_emoji(fDataProvider).ok().value();
48         fEmojiComponent = ICU4XCodePointSetData::load_emoji_component(fDataProvider).ok().value();
49         fEmojiModifier = ICU4XCodePointSetData::load_emoji_modifier(fDataProvider).ok().value();
50         fEmojiModifierBase = ICU4XCodePointSetData::load_emoji_modifier_base(fDataProvider).ok().value();
51         fEmoji = ICU4XCodePointSetData::load_emoji(fDataProvider).ok().value();
52         fRegionalIndicator = ICU4XCodePointSetData::load_regional_indicator(fDataProvider).ok().value();
53         fIdeographic = ICU4XCodePointSetData::load_ideographic(fDataProvider).ok().value();
54         fLineBreaks = ICU4XCodePointMapData8::load_line_break(fDataProvider).ok().value();
55     }
56 
57     ~SkUnicode_icu4x() override = default;
58 
59     void reset();
60 
61     // SkUnicode properties
isControl(SkUnichar utf8)62     bool isControl(SkUnichar utf8) override { return fControls.contains(utf8); }
isWhitespace(SkUnichar utf8)63     bool isWhitespace(SkUnichar utf8) override { return fWhitespaces.contains(utf8); }
isSpace(SkUnichar utf8)64     bool isSpace(SkUnichar utf8) override { return fBlanks.contains(utf8); }
isHardBreak(SkUnichar utf8)65     bool isHardBreak(SkUnichar utf8) override {
66         auto value = fLineBreaks.get(utf8);
67         return (value == /*MandatoryBreak*/6) ||
68                (value == /*CarriageReturn*/10) ||
69                (value == /*LineFeed*/17) ||
70                (value == /*NextLine*/29);
71     }
isEmoji(SkUnichar utf8)72     bool isEmoji(SkUnichar utf8) override { return fEmoji.contains(utf8); }
isEmojiComponent(SkUnichar utf8)73     bool isEmojiComponent(SkUnichar utf8) override { return fEmojiComponent.contains(utf8); }
isEmojiModifierBase(SkUnichar utf8)74     bool isEmojiModifierBase(SkUnichar utf8) override { return fEmojiModifierBase.contains(utf8); }
isEmojiModifier(SkUnichar utf8)75     bool isEmojiModifier(SkUnichar utf8) override { return fEmojiModifier.contains(utf8); }
isRegionalIndicator(SkUnichar utf8)76     bool isRegionalIndicator(SkUnichar utf8) override { return fRegionalIndicator.contains(utf8); }
isIdeographic(SkUnichar utf8)77     bool isIdeographic(SkUnichar utf8) override { return fIdeographic.contains(utf8); }
78 
79     // TODO: is there a check for tabulation
isTabulation(SkUnichar utf8)80     bool isTabulation(SkUnichar utf8) override {
81         return utf8 == '\t';
82     }
83 
84     // For SkShaper
85     std::unique_ptr<SkBidiIterator> makeBidiIterator(const uint16_t text[], int count,
86                                                      SkBidiIterator::Direction dir) override;
87     std::unique_ptr<SkBidiIterator> makeBidiIterator(const char text[],
88                                                      int count,
89                                                      SkBidiIterator::Direction dir) override;
90     std::unique_ptr<SkBreakIterator> makeBreakIterator(const char locale[],
91                                                        BreakType breakType) override;
92     std::unique_ptr<SkBreakIterator> makeBreakIterator(BreakType breakType) override;
93     // For SkParagraph
getBidiRegions(const char utf8[],int utf8Units,TextDirection dir,std::vector<BidiRegion> * results)94     bool getBidiRegions(const char utf8[],
95                         int utf8Units,
96                         TextDirection dir,
97                         std::vector<BidiRegion>* results) override {
98 
99         const auto bidi = ICU4XBidi::create(fDataProvider).ok().value();
100         std::string_view string_view(utf8, utf8Units);
101         auto info = bidi.for_text(string_view, dir == TextDirection::kLTR ? 0 : 1);
102         auto currentLevel = info.level_at(0);
103         size_t start = 0;
104 
105         for (size_t i = 1; i < info.size(); i++) {
106             const auto level =  info.level_at(i);
107             if (level != currentLevel) {
108                 (*results).emplace_back(start, i, currentLevel);
109                 currentLevel = level;
110                 start = i;
111             }
112         }
113         (*results).emplace_back(start, info.size(), currentLevel);
114         return true;
115     }
116 
getBidiRegions(const uint16_t utf16[],int utf16Units,TextDirection dir,std::vector<BidiRegion> * results)117     bool getBidiRegions(const uint16_t utf16[],
118                         int utf16Units,
119                         TextDirection dir,
120                         std::vector<BidiRegion>* results) {
121         auto utf8 = SkUnicode::convertUtf16ToUtf8((char16_t*)utf16, utf16Units);
122         return this->getBidiRegions(utf8.data(), utf8.size(), dir, results);
123     }
124 
computeCodeUnitFlags(char utf8[],int utf8Units,bool replaceTabs,skia_private::TArray<SkUnicode::CodeUnitFlags,true> * results)125     bool computeCodeUnitFlags(char utf8[],
126                               int utf8Units,
127                               bool replaceTabs,
128                               skia_private::TArray<SkUnicode::CodeUnitFlags, true>* results) override {
129         results->clear();
130         results->push_back_n(utf8Units + 1, CodeUnitFlags::kNoCodeUnitFlag);
131         this->markLineBreaks(utf8, utf8Units, /*hardLineBreaks=*/false, results);
132         this->markHardLineBreaksHack(utf8, utf8Units, results);
133         this->markGraphemes(utf8, utf8Units, results);
134         this->markCharacters(utf8, utf8Units, replaceTabs, results);
135         return true;
136     }
137 
computeCodeUnitFlags(char16_t utf16[],int utf16Units,bool replaceTabs,skia_private::TArray<SkUnicode::CodeUnitFlags,true> * results)138     bool computeCodeUnitFlags(char16_t utf16[], int utf16Units, bool replaceTabs,
139                           skia_private::TArray<SkUnicode::CodeUnitFlags, true>* results) override {
140         SkASSERT(false);
141         return true;
142     }
143 
getWords(const char utf8[],int utf8Units,const char * locale,std::vector<Position> * results)144     bool getWords(const char utf8[],
145                   int utf8Units,
146                   const char* locale,
147                   std::vector<Position>* results) override {
148         auto utf16 = SkUnicode::convertUtf8ToUtf16(utf8, utf8Units);
149         const diplomat::span<const uint16_t> span((uint16_t*)utf16.data(), utf16.size());
150         const auto segmenter = ICU4XWordSegmenter::create_dictionary(fDataProvider).ok().value();
151         auto iterator = segmenter.segment_utf16(span);
152         while (true) {
153             int32_t breakpoint = iterator.next();
154             if (breakpoint == -1) {
155                 break;
156             }
157             results->emplace_back(breakpoint);
158         }
159         return true;
160     }
161 
toUpper(const SkString & str)162     SkString toUpper(const SkString& str) override {
163         return toUpper(str, "und");
164     }
165 
toUpper(const SkString & str,const char * localeStr)166     SkString toUpper(const SkString& str, const char* localeStr) override {
167         auto locale = ICU4XLocale::create_from_string(localeStr).ok().value();
168         std::string std_string(str.data(), str.size());
169         // TODO: upper case
170         auto result = fCaseMapper.uppercase(std_string, locale).ok().value();
171         return SkString(result.data(), result.size());
172     }
173 
reorderVisual(const BidiLevel runLevels[],int levelsCount,int32_t logicalFromVisual[])174     void reorderVisual(const BidiLevel runLevels[],
175                        int levelsCount,
176                        int32_t logicalFromVisual[]) override {
177 
178         const auto bidi = ICU4XBidi::create(fDataProvider).ok().value();
179         const diplomat::span<const uint8_t> levels(&runLevels[0], levelsCount);
180         auto map = bidi.reorder_visual(levels);
181         SkASSERT(levelsCount == map.len());
182         std::vector<int32_t> results;
183         for (size_t i = 0; i < map.len(); i++) {
184             auto level = map.get(i);
185             logicalFromVisual[i] = SkToS32(level);
186         }
187     }
188 
189 private:
190     friend class SkBreakIterator_icu4x;
191     friend class SkBidiIterator_icu4x;
192 
markHardLineBreaksHack(char utf8[],int utf8Units,skia_private::TArray<SkUnicode::CodeUnitFlags,true> * results)193     bool markHardLineBreaksHack(char utf8[],
194                                 int utf8Units,
195                                 skia_private::TArray<SkUnicode::CodeUnitFlags, true>* results) {
196         const char* end = utf8 + utf8Units;
197         const char* ch = utf8;
198         while (ch < end) {
199             auto unichar = SkUTF::NextUTF8(&ch, end);
200             if (this->isHardBreak(unichar)) {
201                 (*results)[ch - utf8] |= CodeUnitFlags::kHardLineBreakBefore;
202             }
203         }
204         return true;
205     }
206 
getChar32(const char * pointer,const char * end)207     SkUnichar getChar32(const char* pointer, const char* end) {
208         if (pointer < end) {
209             return SkUTF::NextUTF8(&pointer, end);
210         }
211         return -1;
212     }
213 
markLineBreaks(char utf8[],int utf8Units,bool hardLineBreaks,skia_private::TArray<SkUnicode::CodeUnitFlags,true> * results)214     bool markLineBreaks(char utf8[],
215                         int utf8Units,
216                         bool hardLineBreaks,
217                         skia_private::TArray<SkUnicode::CodeUnitFlags, true>* results) {
218         if (utf8Units == 0) {
219             return true;
220         }
221         // TODO: Remove hard line break hack and detect it here
222         SkASSERT(!hardLineBreaks);
223         const auto lineBreakingOptions = hardLineBreaks
224                                               ? ICU4XLineBreakOptionsV1{ICU4XLineBreakStrictness::Strict, ICU4XLineBreakWordOption::Normal}
225                                               : ICU4XLineBreakOptionsV1{ICU4XLineBreakStrictness::Loose, ICU4XLineBreakWordOption::Normal};
226         const auto segmenter = ICU4XLineSegmenter::create_auto_with_options_v1(fDataProvider, lineBreakingOptions).ok().value();
227         std::string_view string_view(utf8, utf8Units);
228         auto iterator = segmenter.segment_utf8(string_view);
229 
230         while (true) {
231             int32_t lineBreak = iterator.next();
232             if (lineBreak == -1) {
233                 break;
234             }
235             if (hardLineBreaks) {
236                 (*results)[lineBreak] |= CodeUnitFlags::kHardLineBreakBefore;
237             } else {
238                 (*results)[lineBreak] |= CodeUnitFlags::kSoftLineBreakBefore;
239             }
240         }
241         if (!hardLineBreaks) {
242             (*results)[0] |= CodeUnitFlags::kSoftLineBreakBefore;
243             (*results)[utf8Units] |= CodeUnitFlags::kSoftLineBreakBefore;
244         }
245         return true;
246     }
247 
markGraphemes(const char utf8[],int utf8Units,skia_private::TArray<SkUnicode::CodeUnitFlags,true> * results)248     bool markGraphemes(const char utf8[],
249                        int utf8Units,
250                        skia_private::TArray<SkUnicode::CodeUnitFlags, true>* results) {
251         const auto segmenter = ICU4XGraphemeClusterSegmenter::create(fDataProvider).ok().value();
252         std::string_view string_view(utf8, utf8Units);
253         auto iterator = segmenter.segment_utf8(string_view);
254         while (true) {
255             int32_t graphemeStart = iterator.next();
256             if (graphemeStart == -1) {
257                 break;
258             }
259             (*results)[graphemeStart] |= CodeUnitFlags::kGraphemeStart;
260         }
261         return true;
262     }
263 
markCharacters(char utf8[],int utf8Units,bool replaceTabs,skia_private::TArray<SkUnicode::CodeUnitFlags,true> * results)264     bool markCharacters(char utf8[],
265                         int utf8Units,
266                         bool replaceTabs,
267                         skia_private::TArray<SkUnicode::CodeUnitFlags, true>* results) {
268         const char* current = utf8;
269         const char* end = utf8 + utf8Units;
270         while (current < end) {
271             auto before = current - utf8;
272             SkUnichar unichar = SkUTF::NextUTF8(&current, end);
273             if (unichar < 0) unichar = 0xFFFD;
274             auto after = current - utf8;
275             if (replaceTabs && SkUnicode_icu4x::isTabulation(unichar)) {
276                 results->at(before) |= SkUnicode::kTabulation;
277                 if (replaceTabs) {
278                     unichar = ' ';
279                     utf8[before] = ' ';
280                 }
281             }
282             for (auto i = before; i < after; ++i) {
283                 bool isHardBreak = this->isHardBreak(unichar);
284                 bool isSpace = this->isSpace(unichar) || isHardBreak;
285                 bool isWhitespace = this->isWhitespace(unichar) || isHardBreak;
286                 if (isSpace) {
287                     results->at(i) |= SkUnicode::kPartOfIntraWordBreak;
288                 }
289                 if (isWhitespace) {
290                     results->at(i) |= SkUnicode::kPartOfWhiteSpaceBreak;
291                 }
292                 if (this->isControl(unichar)) {
293                     results->at(i) |= SkUnicode::kControl;
294                 }
295             }
296         }
297         return true;
298     }
299 
getUtf8Words(const char utf8[],int utf8Units,const char * locale,std::vector<Position> * results)300     bool getUtf8Words(const char utf8[],
301                       int utf8Units,
302                       const char* locale,
303                       std::vector<Position>* results) override {
304         SkDEBUGF("Method 'getUtf8Words' is not implemented\n");
305         return false;
306     }
307 
getSentences(const char utf8[],int utf8Units,const char * locale,std::vector<SkUnicode::Position> * results)308     bool getSentences(const char utf8[],
309                       int utf8Units,
310                       const char* locale,
311                       std::vector<SkUnicode::Position>* results) override {
312         SkDEBUGF("Method 'getSentences' is not implemented\n");
313         return false;
314     }
315 
316     std::shared_ptr<std::vector<SkUnicode::BidiRegion>> fRegions;
317     ICU4XLocale fLocale;
318     ICU4XDataProvider fDataProvider;
319     ICU4XCaseMapper fCaseMapper;
320     ICU4XCodePointSetData fWhitespaces;
321     ICU4XCodePointSetData fSpaces;
322     ICU4XCodePointSetData fBlanks;
323     ICU4XCodePointSetData fEmoji;
324     ICU4XCodePointSetData fEmojiComponent;
325     ICU4XCodePointSetData fEmojiModifier;
326     ICU4XCodePointSetData fEmojiModifierBase;
327     ICU4XCodePointSetData fRegionalIndicator;
328     ICU4XCodePointSetData fIdeographic;
329     ICU4XCodePointSetData fControls;
330     ICU4XCodePointMapData8 fLineBreaks;
331 };
332 
333 class SkBreakIterator_icu4x: public SkBreakIterator {
334     Position fLastResult;
335     Position fStart;
336     Position fEnd;
337 public:
SkBreakIterator_icu4x()338     SkBreakIterator_icu4x() { }
first()339     Position first() override { SkASSERT(false); return -1; }
current()340     Position current() override { SkASSERT(false); return -1; }
next()341     Position next() override { SkASSERT(false); return -1; }
status()342     Status status() override { SkASSERT(false); return -1; }
isDone()343     bool isDone() override { SkASSERT(false); return false; }
setText(const char utftext8[],int utf8Units)344     bool setText(const char utftext8[], int utf8Units) override { SkASSERT(false); return false; }
setText(const char16_t utftext16[],int utf16Units)345     bool setText(const char16_t utftext16[], int utf16Units) override { SkASSERT(false); return false; }
346 };
347 
348 class SkBidiIterator_icu4x : public SkBidiIterator {
349     std::shared_ptr<std::vector<SkUnicode::BidiRegion>> fRegions;
350 public:
SkBidiIterator_icu4x(std::shared_ptr<std::vector<SkUnicode::BidiRegion>> regions)351     explicit SkBidiIterator_icu4x(std::shared_ptr<std::vector<SkUnicode::BidiRegion>> regions)
352             : fRegions(regions) { }
getLength()353     Position getLength() override { return fRegions->size(); }
getLevelAt(Position pos)354     Level getLevelAt(Position pos) override {
355         auto found = std::lower_bound(
356                 fRegions->begin(),
357                 fRegions->end(),
358                 SkUnicode::BidiRegion(pos, pos, 0),
359                 [](const SkUnicode::BidiRegion& a, const SkUnicode::BidiRegion& b) {
360                     return a.start <= b.start && a.end <= b.end;
361                 });
362         return found->level;
363     }
364 };
365 
makeBidiIterator(const uint16_t text[],int count,SkBidiIterator::Direction dir)366 std::unique_ptr<SkBidiIterator> SkUnicode_icu4x::makeBidiIterator(const uint16_t text[], int count,
367                                                  SkBidiIterator::Direction dir) {
368     if (fRegions) {
369         fRegions->clear();
370     } else {
371         fRegions = std::make_shared<std::vector<SkUnicode::BidiRegion>>();
372     }
373 
374     if (this->getBidiRegions(text, count, dir == SkBidiIterator::Direction::kLTR ? TextDirection::kLTR : TextDirection::kRTL, fRegions.get())) {
375         return std::make_unique<SkBidiIterator_icu4x>(fRegions);
376     } else {
377         return nullptr;
378     }
379 }
380 
makeBidiIterator(const char text[],int count,SkBidiIterator::Direction dir)381 std::unique_ptr<SkBidiIterator> SkUnicode_icu4x::makeBidiIterator(const char text[],
382                                                  int count,
383                                                  SkBidiIterator::Direction dir) {
384     if (fRegions) {
385         fRegions->clear();
386     } else {
387         fRegions = std::make_shared<std::vector<SkUnicode::BidiRegion>>();
388     }
389     if (this->getBidiRegions(text, count, dir == SkBidiIterator::Direction::kLTR ? TextDirection::kLTR : TextDirection::kRTL, fRegions.get())) {
390         return std::make_unique<SkBidiIterator_icu4x>(fRegions);
391     } else {
392         return nullptr;
393     }
394 }
395 
makeBreakIterator(const char locale[],BreakType breakType)396 std::unique_ptr<SkBreakIterator> SkUnicode_icu4x::makeBreakIterator(const char locale[],
397                                                    BreakType breakType) {
398     SkASSERT(false); return nullptr;
399 }
400 
makeBreakIterator(BreakType breakType)401 std::unique_ptr<SkBreakIterator> SkUnicode_icu4x::makeBreakIterator(BreakType breakType) {
402     SkASSERT(false); return nullptr;
403 }
404 
405 namespace SkUnicodes::ICU4X {
Make()406 sk_sp<SkUnicode> Make() {
407     return sk_make_sp<SkUnicode_icu4x>();
408 }
409 }
410