xref: /aosp_15_r20/frameworks/base/core/jni/android_text_Hyphenator.cpp (revision d57664e9bc4670b3ecf6748a746a57c557b6bc9e)
1 /*
2  * Copyright (C) 2017 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include <core_jni_helpers.h>
18 #include <cutils/trace.h>
19 #include <fcntl.h>
20 #include <minikin/Hyphenator.h>
21 #ifdef __ANDROID__
22 #include <sys/mman.h>
23 #else
24 #include <android-base/mapped_file.h>
25 #include <android-base/properties.h>
26 #endif
27 #include <sys/stat.h>
28 #include <sys/types.h>
29 #ifdef __ANDROID__
30 #include <tracing_perfetto.h>
31 #endif
32 #include <unicode/uloc.h>
33 #include <unistd.h>
34 
35 #include <algorithm>
36 
37 namespace android {
38 
buildFileName(const std::string & locale)39 static std::string buildFileName(const std::string& locale) {
40 #ifdef __ANDROID__
41     constexpr char SYSTEM_HYPHENATOR_PREFIX[] = "/system/usr/hyphen-data/hyph-";
42 #else
43     std::string hyphenPath = base::GetProperty("ro.hyphen.data.dir", "/system/usr/hyphen-data");
44     std::string SYSTEM_HYPHENATOR_PREFIX = hyphenPath + "/hyph-";
45 #endif
46     constexpr char SYSTEM_HYPHENATOR_SUFFIX[] = ".hyb";
47     std::string lowerLocale;
48     lowerLocale.reserve(locale.size());
49     std::transform(locale.begin(), locale.end(), std::back_inserter(lowerLocale), ::tolower);
50     return SYSTEM_HYPHENATOR_PREFIX + lowerLocale + SYSTEM_HYPHENATOR_SUFFIX;
51 }
52 
mmapPatternFile(const std::string & locale)53 static std::pair<const uint8_t*, size_t> mmapPatternFile(const std::string& locale) {
54     const std::string hyFilePath = buildFileName(locale);
55     const int fd = open(hyFilePath.c_str(), O_RDONLY | O_CLOEXEC);
56     if (fd == -1) {
57         return std::make_pair(nullptr, 0); // Open failed.
58     }
59 
60     struct stat st = {};
61     if (fstat(fd, &st) == -1) {  // Unlikely to happen.
62         close(fd);
63         return std::make_pair(nullptr, 0);
64     }
65 
66 #ifdef __ANDROID__
67     void* ptr = mmap(nullptr, st.st_size, PROT_READ, MAP_SHARED, fd, 0 /* offset */);
68     close(fd);
69     if (ptr == MAP_FAILED) {
70         return std::make_pair(nullptr, 0);
71     }
72 #else
73     std::unique_ptr<base::MappedFile> patternFile =
74             base::MappedFile::FromFd(fd, 0, st.st_size, PROT_READ);
75     close(fd);
76     if (patternFile == nullptr) {
77         return std::make_pair(nullptr, 0);
78     }
79     auto* mappedPtr = new base::MappedFile(std::move(*patternFile));
80     char* ptr = mappedPtr->data();
81 #endif
82     return std::make_pair(reinterpret_cast<const uint8_t*>(ptr), st.st_size);
83 }
84 
addHyphenatorWithoutPatternFile(const std::string & locale,int minPrefix,int minSuffix)85 static void addHyphenatorWithoutPatternFile(const std::string& locale, int minPrefix,
86         int minSuffix) {
87     minikin::addHyphenator(locale,
88                            minikin::Hyphenator::loadBinary(nullptr, 0, minPrefix, minSuffix,
89                                                            locale));
90 }
91 
addHyphenator(const std::string & locale,int minPrefix,int minSuffix)92 static void addHyphenator(const std::string& locale, int minPrefix, int minSuffix) {
93     std::pair<const uint8_t*, size_t> r = mmapPatternFile(locale);
94     if (r.first == nullptr) {
95         ALOGE("Unable to find pattern file or unable to map it for %s", locale.c_str());
96         return;
97     }
98     minikin::addHyphenator(locale,
99                            minikin::Hyphenator::loadBinary(r.first, r.second, minPrefix, minSuffix,
100                                                            locale));
101 }
102 
addHyphenatorAlias(const std::string & from,const std::string & to)103 static void addHyphenatorAlias(const std::string& from, const std::string& to) {
104     minikin::addHyphenatorAlias(from, to);
105 }
106 
107 /*
108  * Cache the subtag key map by calling uloc_forLanguageTag with a subtag.
109  * minikin calls uloc_forLanguageTag with an Unicode extension specifying
110  * the line breaking strictness. Parsing the extension requires loading the key map
111  * from keyTypeData.res in the ICU.
112  * "lb" is the key commonly used by minikin. "ca" is a common legacy key mapping to
113  * the "calendar" key. It ensures that the key map is loaded and cached in icu4c.
114  * "en-Latn-US" is a common locale used in the Android system regardless what default locale
115  * is selected in the Settings app.
116  */
cacheUnicodeExtensionSubtagsKeyMap()117 inline static void cacheUnicodeExtensionSubtagsKeyMap() {
118     UErrorCode status = U_ZERO_ERROR;
119     char localeID[ULOC_FULLNAME_CAPACITY] = {};
120     uloc_forLanguageTag("en-Latn-US-u-lb-loose-ca-gregory", localeID, ULOC_FULLNAME_CAPACITY,
121                         nullptr, &status);
122 }
123 
init()124 static void init() {
125     // TODO: Confirm that these are the best values. Various sources suggest (1, 1), but that
126     // appears too small.
127     constexpr int INDIC_MIN_PREFIX = 2;
128     constexpr int INDIC_MIN_SUFFIX = 2;
129 
130     addHyphenator("af", 1, 1);  // Afrikaans
131     addHyphenator("am", 1, 1);  // Amharic
132     addHyphenator("as", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX);  // Assamese
133     addHyphenator("be", 2, 2);  // Belarusian
134     addHyphenator("bg", 2, 2);  // Bulgarian
135     addHyphenator("bn", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX);  // Bengali
136     addHyphenator("cs", 2, 2);  // Czech
137     addHyphenator("cu", 1, 2);  // Church Slavonic
138     addHyphenator("cy", 2, 3);  // Welsh
139     addHyphenator("da", 2, 2);  // Danish
140     addHyphenator("de-1901", 2, 2);  // German 1901 orthography
141     addHyphenator("de-1996", 2, 2);  // German 1996 orthography
142     addHyphenator("de-CH-1901", 2, 2);  // Swiss High German 1901 orthography
143     addHyphenator("el", 1, 1);  // Greek
144     addHyphenator("en-GB", 2, 3);  // British English
145     addHyphenator("en-US", 2, 3);  // American English
146     addHyphenator("es", 2, 2);  // Spanish
147     addHyphenator("et", 2, 3);  // Estonian
148     addHyphenator("eu", 2, 2);  // Basque
149     addHyphenator("fr", 2, 3);  // French
150     addHyphenator("ga", 2, 3);  // Irish
151     addHyphenator("gl", 2, 2);  // Galician
152     addHyphenator("gu", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX);  // Gujarati
153     addHyphenator("hi", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX);  // Hindi
154     addHyphenator("hr", 2, 2);  // Croatian
155     addHyphenator("hu", 2, 2);  // Hungarian
156     // texhyphen sources say Armenian may be (1, 2); but that it needs confirmation.
157     // Going with a more conservative value of (2, 2) for now.
158     addHyphenator("hy", 2, 2);  // Armenian
159     addHyphenator("it", 2, 2);  // Italian
160     addHyphenator("ka", 1, 2);  // Georgian
161     addHyphenator("kn", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX);  // Kannada
162     addHyphenator("la", 2, 2);  // Latin
163     addHyphenator("lt", 2, 2);  // Lithuanian
164     addHyphenator("lv", 2, 2);  // Latvian
165     addHyphenator("ml", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX);  // Malayalam
166     addHyphenator("mn-Cyrl", 2, 2);  // Mongolian in Cyrillic script
167     addHyphenator("mr", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX);  // Marathi
168     addHyphenator("nb", 2, 2);  // Norwegian Bokmål
169     addHyphenator("nl", 2, 2);  // Dutch
170     addHyphenator("nn", 2, 2);  // Norwegian Nynorsk
171     addHyphenator("or", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX);  // Oriya
172     addHyphenator("pa", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX);  // Punjabi
173     addHyphenator("pl", 2, 2);  // Polish
174     addHyphenator("pt", 2, 3);  // Portuguese
175     addHyphenator("ru", 2, 2);  // Russian
176     addHyphenator("sk", 2, 2);  // Slovak
177     addHyphenator("sl", 2, 2);  // Slovenian
178     addHyphenator("sq", 2, 2);  // Albanian
179     addHyphenator("sv", 1, 2);  // Swedish
180     addHyphenator("ta", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX);  // Tamil
181     addHyphenator("te", INDIC_MIN_PREFIX, INDIC_MIN_SUFFIX);  // Telugu
182     addHyphenator("tk", 2, 2);  // Turkmen
183     addHyphenator("uk", 2, 2);  // Ukrainian
184     addHyphenator("und-Ethi", 1, 1);  // Any language in Ethiopic script
185 
186     // Following two hyphenators do not have pattern files but there is some special logic based on
187     // language.
188     addHyphenatorWithoutPatternFile("ca", 2, 2);  // Catalan
189 
190     // English locales that fall back to en-US. The data is from CLDR. It's all English locales,
191     // minus the locales whose parent is en-001 (from supplementalData.xml, under <parentLocales>).
192     // TODO: Figure out how to get this from ICU.
193     addHyphenatorAlias("en-AS", "en-US");  // English (American Samoa)
194     addHyphenatorAlias("en-GU", "en-US");  // English (Guam)
195     addHyphenatorAlias("en-MH", "en-US");  // English (Marshall Islands)
196     addHyphenatorAlias("en-MP", "en-US");  // English (Northern Mariana Islands)
197     addHyphenatorAlias("en-PR", "en-US");  // English (Puerto Rico)
198     addHyphenatorAlias("en-UM", "en-US");  // English (United States Minor Outlying Islands)
199     addHyphenatorAlias("en-VI", "en-US");  // English (Virgin Islands)
200 
201     // All English locales other than those falling back to en-US are mapped to en-GB.
202     addHyphenatorAlias("en", "en-GB");
203 
204     // For German, we're assuming the 1996 (and later) orthography by default.
205     addHyphenatorAlias("de", "de-1996");
206     // Liechtenstein uses the Swiss hyphenation rules for the 1901 orthography.
207     addHyphenatorAlias("de-LI-1901", "de-CH-1901");
208 
209     // Norwegian is very probably Norwegian Bokmål.
210     addHyphenatorAlias("no", "nb");
211 
212     // Use mn-Cyrl. According to CLDR's likelySubtags.xml, mn is most likely to be mn-Cyrl.
213     addHyphenatorAlias("mn", "mn-Cyrl");  // Mongolian
214 
215     // Fall back to Ethiopic script for languages likely to be written in Ethiopic.
216     // Data is from CLDR's likelySubtags.xml.
217     // TODO: Convert this to a mechanism using ICU4J's ULocale#addLikelySubtags().
218     addHyphenatorAlias("am", "und-Ethi");  // Amharic
219     addHyphenatorAlias("byn", "und-Ethi");  // Blin
220     addHyphenatorAlias("gez", "und-Ethi");  // Geʻez
221     addHyphenatorAlias("ti", "und-Ethi");  // Tigrinya
222     addHyphenatorAlias("wal", "und-Ethi");  // Wolaytta
223 
224     // Use Hindi as a fallback hyphenator for all languages written in Devanagari, etc. This makes
225     // sense because our Indic patterns are not really linguistic, but script-based.
226     addHyphenatorAlias("und-Beng", "bn");  // Bengali
227     addHyphenatorAlias("und-Deva", "hi");  // Devanagari -> Hindi
228     addHyphenatorAlias("und-Gujr", "gu");  // Gujarati
229     addHyphenatorAlias("und-Guru", "pa");  // Gurmukhi -> Punjabi
230     addHyphenatorAlias("und-Knda", "kn");  // Kannada
231     addHyphenatorAlias("und-Mlym", "ml");  // Malayalam
232     addHyphenatorAlias("und-Orya", "or");  // Oriya
233     addHyphenatorAlias("und-Taml", "ta");  // Tamil
234     addHyphenatorAlias("und-Telu", "te");  // Telugu
235 
236 #ifdef __ANDROID__
237     tracing_perfetto::traceBegin(ATRACE_TAG_VIEW, "CacheUnicodeExtensionSubtagsKeyMap");
238 #endif
239     cacheUnicodeExtensionSubtagsKeyMap();
240 #ifdef __ANDROID__
241     tracing_perfetto::traceEnd(ATRACE_TAG_VIEW); // CacheUnicodeExtensionSubtagsKeyMap
242 #endif
243 }
244 
245 static const JNINativeMethod gMethods[] = {
246     {"nInit", "()V", (void*) init},
247 };
248 
register_android_text_Hyphenator(JNIEnv * env)249 int register_android_text_Hyphenator(JNIEnv* env) {
250     return RegisterMethodsOrDie(env, "android/text/Hyphenator", gMethods, NELEM(gMethods));
251 }
252 
253 }  // namespace android
254