xref: /aosp_15_r20/external/noto-fonts/scripts/subset_noto_cjk.py (revision e5825d3be9fd13b272e7df556d285d1f07f3b027)
1#!/usr/bin/env python3
2# coding=UTF-8
3#
4# Copyright 2016 Google Inc. All rights reserved.
5#
6# Licensed under the Apache License, Version 2.0 (the "License");
7# you may not use this file except in compliance with the License.
8# You may obtain a copy of the License at
9#
10#     http://www.apache.org/licenses/LICENSE-2.0
11#
12# Unless required by applicable law or agreed to in writing, software
13# distributed under the License is distributed on an "AS IS" BASIS,
14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15# See the License for the specific language governing permissions and
16# limitations under the License.
17
18"""Create a curated subset of Noto CJK for Android."""
19
20import argparse
21import logging
22import os
23from pathlib import Path
24
25from fontTools import ttLib
26from nototools import font_data
27from nototools import tool_utils
28from nototools import ttc_utils
29
# Characters supported in Noto CJK fonts that UTR #51 recommends default to
# emoji-style. They are folded into EXCLUDED_CODEPOINTS below and are
# therefore deleted from the cmap of the subsetted fonts.
EMOJI_IN_CJK = {
    0x26BD, # ⚽ SOCCER BALL
    0x26BE, # ⚾ BASEBALL
    0x1F18E, # 🆎 NEGATIVE SQUARED AB
    0x1F191, # 🆑 SQUARED CL
    0x1F192, # 🆒 SQUARED COOL
    0x1F193, # 🆓 SQUARED FREE
    0x1F194, # 🆔 SQUARED ID
    0x1F195, # 🆕 SQUARED NEW
    0x1F196, # 🆖 SQUARED NG
    0x1F197, # 🆗 SQUARED OK
    0x1F198, # 🆘 SQUARED SOS
    0x1F199, # 🆙 SQUARED UP WITH EXCLAMATION MARK
    0x1F19A, # 🆚 SQUARED VS
    0x1F201, # 🈁 SQUARED KATAKANA KOKO
    0x1F21A, # 🈚 SQUARED CJK UNIFIED IDEOGRAPH-7121
    0x1F22F, # 🈯 SQUARED CJK UNIFIED IDEOGRAPH-6307
    0x1F232, # 🈲 SQUARED CJK UNIFIED IDEOGRAPH-7981
    0x1F233, # 🈳 SQUARED CJK UNIFIED IDEOGRAPH-7A7A
    0x1F234, # 🈴 SQUARED CJK UNIFIED IDEOGRAPH-5408
    0x1F235, # 🈵 SQUARED CJK UNIFIED IDEOGRAPH-6E80
    0x1F236, # 🈶 SQUARED CJK UNIFIED IDEOGRAPH-6709
    0x1F238, # 🈸 SQUARED CJK UNIFIED IDEOGRAPH-7533
    0x1F239, # 🈹 SQUARED CJK UNIFIED IDEOGRAPH-5272
    0x1F23A, # 🈺 SQUARED CJK UNIFIED IDEOGRAPH-55B6
    0x1F250, # 🉐 CIRCLED IDEOGRAPH ADVANTAGE
    0x1F251, # 🉑 CIRCLED IDEOGRAPH ACCEPT
}
60
# Characters that Android renders emoji-style even though UTR #51 does not
# recommend that default for them. They are folded into EXCLUDED_CODEPOINTS
# below and are therefore deleted from the cmap of the subsetted fonts.
ANDROID_EMOJI = {
    0x2600, # ☀ BLACK SUN WITH RAYS
    0x2601, # ☁ CLOUD
    0X260E, # ☎ BLACK TELEPHONE
    0x261D, # ☝ WHITE UP POINTING INDEX
    0x263A, # ☺ WHITE SMILING FACE
    0x2660, # ♠ BLACK SPADE SUIT
    0x2663, # ♣ BLACK CLUB SUIT
    0x2665, # ♥ BLACK HEART SUIT
    0x2666, # ♦ BLACK DIAMOND SUIT
    0x270C, # ✌ VICTORY HAND
    0x2744, # ❄ SNOWFLAKE
    0x2764, # ❤ HEAVY BLACK HEART
}
77
# We don't want support for ASCII control chars (U+0000 through U+001F).
CONTROL_CHARS = tool_utils.parse_int_ranges('0000-001F')

# Every code point that gets deleted from the fonts' cmap tables, in
# ascending order: both emoji sets plus the control characters.
EXCLUDED_CODEPOINTS = sorted(EMOJI_IN_CJK | ANDROID_EMOJI | CONTROL_CHARS)

# File names of the font collections that main() subsets.
TTC_NAMES = ('NotoSansCJK-Regular.ttc', 'NotoSerifCJK-Regular.ttc')
84
85
def remove_from_cmap(infile, outfile, exclude=frozenset()):
    """Remove a set of characters from a font file's cmap table.

    Args:
        infile: Path of the font file to load.
        outfile: Path the modified font is saved to. The caller in this file
            passes the same path as ``infile`` to rewrite the font in place.
        exclude: Code points to delete from the cmap; nothing is deleted when
            left at its empty default.
    """
    font = ttLib.TTFont(infile)
    try:
        font_data.delete_from_cmap(font, exclude)
        font.save(outfile)
    finally:
        # TTFont holds the input file open for on-demand table loading;
        # close it explicitly instead of leaving the handle to the garbage
        # collector, since this function is called in a loop and the files
        # are deleted shortly afterwards.
        font.close()
91
92
def remove_codepoints_from_ttc_using_ttc_utils(ttc_name, out_dir):
    """Subset a TTC by splitting it into its member fonts with ttc_utils.

    The collection ``ttc_name`` is extracted into ``out_dir``, each extracted
    font has EXCLUDED_CODEPOINTS deleted from its cmap in place, the fonts
    are rebuilt into a collection named ``ttc_name``, and the extracted font
    files are removed again.

    Args:
        ttc_name: File name of the collection; used both as the extraction
            source and as the name of the rebuilt collection.
        out_dir: Directory receiving the extracted fonts and the result.
    """
    extracted = ttc_utils.ttcfile_extract(ttc_name, out_dir)

    # The extracted names are used as-is below, so do the work from inside
    # out_dir (temp_chdir presumably restores the previous directory on exit).
    with tool_utils.temp_chdir(out_dir):
        for font_file in extracted:
            logging.info('Subsetting %s...', font_file)
            remove_from_cmap(font_file, font_file, exclude=EXCLUDED_CODEPOINTS)

        ttc_utils.ttcfile_build(ttc_name, extracted)

        # The per-font files were only intermediates for the rebuild.
        for font_file in extracted:
            os.remove(font_file)
103
104
def remove_codepoints_from_ttc(ttc_path, out_dir):
    """Delete EXCLUDED_CODEPOINTS from every font of a TTC file's cmap.

    The collection is loaded from ``ttc_path``, each member font is modified
    in memory, and the result is saved under the same file name in
    ``out_dir``. The file sizes before and after are logged at INFO level.

    Args:
        ttc_path: ``pathlib.Path`` of the collection to read.
        out_dir: ``pathlib.Path`` of the directory to write the result into.
    """
    logging.info('Loading %s', ttc_path)
    collection = ttLib.ttCollection.TTCollection(ttc_path)

    logging.info('Subsetting %d fonts in the collection', len(collection))
    for member in collection:
        font_data.delete_from_cmap(member, EXCLUDED_CODEPOINTS)

    destination = out_dir / ttc_path.name
    logging.info('Saving to %s', destination)
    collection.save(destination)

    # Both sizes are read after saving, one stat() per file.
    size_before = ttc_path.stat().st_size
    size_after = destination.stat().st_size
    logging.info('Size: %d --> %d, delta=%d',
                 size_before,
                 size_after,
                 size_after - size_before)
121
122
def main():
    """Parse the command line and subset every collection in TTC_NAMES.

    Command-line arguments:
        input: Directory containing the source .ttc files (default ``.``).
        -o/--output: Directory for the subsetted files (default
            ``subsetted``); created if it does not exist.
        --use-ttc-utils: Extract, subset and rebuild through
            nototools.ttc_utils instead of fontTools' TTCollection.
        -v/--verbose: Once for INFO logging, twice or more for DEBUG.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('input', default='.', nargs='?',
                        help='directory containing the source .ttc files')
    parser.add_argument('-o', '--output', default='subsetted',
                        help='directory to write the subsetted files to')
    parser.add_argument('--use-ttc-utils', action='store_true',
                        help='extract and rebuild with nototools.ttc_utils')
    parser.add_argument('-v', '--verbose', action='count',
                        help='-v for INFO logging, -vv for DEBUG')
    args = parser.parse_args()

    # Without -v, logging is left unconfigured (args.verbose is None).
    if args.verbose:
        level = logging.DEBUG if args.verbose > 1 else logging.INFO
        logging.basicConfig(level=level)

    in_dir = Path(args.input)
    out_dir = Path(args.output)
    out_dir.mkdir(parents=True, exist_ok=True)

    if args.use_ttc_utils:
        # The ttc_utils helper receives only a bare file name, which resolves
        # against the working directory; enter in_dir so the `input` argument
        # is honoured here just as in the other branch. out_dir is made
        # absolute first so that it still names the same directory after the
        # working directory changes.
        out_dir = out_dir.resolve()
        with tool_utils.temp_chdir(in_dir):
            for ttc_name in TTC_NAMES:
                remove_codepoints_from_ttc_using_ttc_utils(ttc_name, out_dir)
    else:
        for ttc_name in TTC_NAMES:
            remove_codepoints_from_ttc(in_dir / ttc_name, out_dir)
143
144
145if __name__ == "__main__":
146    main()
147