#!/usr/bin/env python3
"""
Tools to parse data files from the Unicode Character Database.
"""


try:
    from urllib.request import urlopen
except ImportError:
    from urllib2 import urlopen
from contextlib import closing, contextmanager
import re
from codecs import iterdecode
import logging
import os
from io import open
from os.path import abspath, dirname, join as pjoin, pardir, sep


try:  # pragma: no cover
    unicode
except NameError:
    unicode = str


UNIDATA_URL = "https://unicode.org/Public/UNIDATA/"
UNIDATA_LICENSE_URL = "http://unicode.org/copyright.html#License"

# by default save output files to ../Lib/fontTools/unicodedata/
UNIDATA_PATH = (
    pjoin(abspath(dirname(__file__)), pardir, "Lib", "fontTools", "unicodedata") + sep
)

SRC_ENCODING = "# -*- coding: utf-8 -*-\n"

NOTICE = "# NOTE: This file was auto-generated with MetaTools/buildUCD.py.\n"

MAX_UNICODE = 0x10FFFF

log = logging.getLogger()


@contextmanager
def open_unidata_file(filename):
    """Open a text file from https://unicode.org/Public/UNIDATA/"""
    url = UNIDATA_URL + filename
    with closing(urlopen(url)) as response:
        yield iterdecode(response, encoding="utf-8")
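# A minimal usage sketch (it performs a network request and mirrors what
# build_ranges() below does): download a UCD file and feed the decoded lines
# to the parsing helpers defined further down in this module:
#
#     with open_unidata_file("Blocks.txt") as lines:
#         header = parse_unidata_header(lines)
#         ranges = parse_range_properties(lines, default="No_Block")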


def parse_unidata_header(infile):
    """Read the top header of data files, until the first line
    that does not start with '#'.
    """
    header = []
    line = next(infile)
    while line.startswith("#"):
        header.append(line)
        line = next(infile)
    return "".join(header)
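# A small illustration (made-up header text, not a real UCD file) of what
# parse_unidata_header() returns; note that the first non-'#' line is also
# consumed from the iterator:
#
#     >>> from io import StringIO
#     >>> f = StringIO("# Blocks-X.Y.Z.txt\n# Date: ...\n\n0000..007F; Basic Latin\n")
#     >>> parse_unidata_header(f)
#     '# Blocks-X.Y.Z.txt\n# Date: ...\n'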


def parse_range_properties(infile, default=None, is_set=False):
    """Parse a Unicode data file with one column containing a single character
    or a range of characters, and another column containing a property value;
    the two columns are separated by a semicolon. Comments after '#' are
    ignored.

    If the ranges defined in the data file are not contiguous, assign the
    'default' property to the unassigned codepoints.

    Return a list of (start, end, property_value) tuples covering the whole
    Unicode repertoire.
    """
    ranges = []
    line_regex = re.compile(
        r"^"
        r"([0-9A-F]{4,6})"  # first character code
        r"(?:\.\.([0-9A-F]{4,6}))?"  # optional second character code
        r"\s*;\s*"
        r"([^#]+)"  # everything up to the potential comment
    )
    for line in infile:
        match = line_regex.match(line)
        if not match:
            continue

        first, last, data = match.groups()
        if last is None:
            last = first

        first = int(first, 16)
        last = int(last, 16)
        data = str(data.rstrip())

        ranges.append((first, last, data))

    ranges.sort()

    if isinstance(default, unicode):
        default = str(default)

    # fill the gaps between explicitly defined ranges
    last_start, last_end = -1, -1
    full_ranges = []
    for start, end, value in ranges:
        assert last_end < start
        assert start <= end
        if start - last_end > 1:
            full_ranges.append((last_end + 1, start - 1, default))
        if is_set:
            value = set(value.split())
        full_ranges.append((start, end, value))
        last_start, last_end = start, end
    if last_end != MAX_UNICODE:
        full_ranges.append((last_end + 1, MAX_UNICODE, default))

    # reduce the total number of ranges by merging consecutive ranges that
    # share the same value
    last_start, last_end, last_value = full_ranges.pop(0)
    merged_ranges = []
    for start, end, value in full_ranges:
        if value == last_value:
            continue
        else:
            merged_ranges.append((last_start, start - 1, last_value))
            last_start, last_end, last_value = start, end, value
    merged_ranges.append((last_start, MAX_UNICODE, last_value))

    # make sure that the ranges cover the full unicode repertoire
    assert merged_ranges[0][0] == 0
    for (cs, ce, cv), (ns, ne, nv) in zip(merged_ranges, merged_ranges[1:]):
        assert ce + 1 == ns
    assert merged_ranges[-1][1] == MAX_UNICODE

    return merged_ranges
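# A small doctest-style sketch (made-up property data, not from a real UCD
# file) of how gaps are filled with the default value and how consecutive
# ranges with equal values are merged:
#
#     >>> from io import StringIO
#     >>> f = StringIO("0000..001F; Foo\n0020..002F; Foo\n0041; Bar\n")
#     >>> ranges = parse_range_properties(f, default="Unknown")
#     >>> ranges[:3]
#     [(0, 47, 'Foo'), (48, 64, 'Unknown'), (65, 65, 'Bar')]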


def parse_semicolon_separated_data(infile):
    """Parse a Unicode data file where each line contains a list of values
    separated by semicolons (e.g. "PropertyValueAliases.txt").
    The number of values may differ from one line to the next.

    Return a list of lists, each containing the values as strings.
    """
    data = []
    for line in infile:
        line = line.split("#", 1)[0].strip()  # remove the comment
        if not line:
            continue
        fields = [str(field.strip()) for field in line.split(";")]
        data.append(fields)
    return data
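# A small illustration (made-up lines in the "PropertyValueAliases.txt"
# layout) of the returned structure:
#
#     >>> from io import StringIO
#     >>> f = StringIO("sc ; Latn ; Latin\nsc ; Grek ; Greek\n# a comment\n")
#     >>> parse_semicolon_separated_data(f)
#     [['sc', 'Latn', 'Latin'], ['sc', 'Grek', 'Greek']]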


def _set_repr(value):
    # render a set as a stable, sorted repr string; None stays "None"
    return (
        "None"
        if value is None
        else "{{{}}}".format(", ".join(repr(v) for v in sorted(value)))
    )
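# For example (hypothetical input), _set_repr({"Grek", "Latn"}) returns
# "{'Grek', 'Latn'}" regardless of the set's iteration order, and
# _set_repr(None) returns "None".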


def build_ranges(
    filename, local_ucd=None, output_path=None, default=None, is_set=False, aliases=None
):
    """Fetch the 'filename' UCD data file from the official Unicode website,
    parse the property ranges and values, and write them as two parallel
    Python lists to 'fontTools.unicodedata.<filename>.py'.

    'aliases' is an optional mapping of property codes (short names) to long
    name aliases (list of strings, with the first item being the preferred
    alias). When this is provided, the property values are written using the
    short notation, and an additional 'NAMES' dict with the aliases is
    written to the output module.

    To load the data file from a local directory, you can use the
    'local_ucd' argument.
    """
    modname = os.path.splitext(filename)[0] + ".py"
    if not output_path:
        output_path = UNIDATA_PATH + modname

    if local_ucd:
        log.info("loading '%s' from local directory '%s'", filename, local_ucd)
        cm = open(pjoin(local_ucd, filename), "r", encoding="utf-8")
    else:
        log.info("downloading '%s' from '%s'", filename, UNIDATA_URL)
        cm = open_unidata_file(filename)

    with cm as f:
        header = parse_unidata_header(f)
        ranges = parse_range_properties(f, default=default, is_set=is_set)

    if aliases:
        reversed_aliases = {normalize(v[0]): k for k, v in aliases.items()}
        max_value_length = 6  # 4-letter tags plus two quotes for repr
    else:
        max_value_length = min(56, max(len(repr(v)) for _, _, v in ranges))

    with open(output_path, "w", encoding="utf-8") as f:
        f.write(SRC_ENCODING)
        f.write("#\n")
        f.write(NOTICE)
        f.write("# Source: {}{}\n".format(UNIDATA_URL, filename))
        f.write("# License: {}\n".format(UNIDATA_LICENSE_URL))
        f.write("#\n")
        f.write(header + "\n\n")

        f.write("RANGES = [\n")
        for first, last, value in ranges:
            f.write(
                "    0x{:0>4X},  # .. 0x{:0>4X} ; {}\n".format(
                    first, last, _set_repr(value) if is_set else value
                )
            )
        f.write("]\n")

        f.write("\n")
        f.write("VALUES = [\n")
        for first, last, value in ranges:
            comment = "# {:0>4X}..{:0>4X}".format(first, last)
            if is_set:
                value_repr = "{},".format(_set_repr(value))
            else:
                if aliases:
                    # append long name to comment and use the short code
                    comment += " ; {}".format(value)
                    value = reversed_aliases[normalize(value)]
                value_repr = "{!r},".format(value)
            f.write(
                "    {}  {}\n".format(value_repr.ljust(max_value_length + 1), comment)
            )
        f.write("]\n")

        if aliases:
            f.write("\n")
            f.write("NAMES = {\n")
            for value, names in sorted(aliases.items()):
                # we only write the first preferred alias
                f.write("    {!r}: {!r},\n".format(value, names[0]))
            f.write("}\n")

    log.info("saved new file: '%s'", os.path.normpath(output_path))
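# A sketch (abridged; real values come from the downloaded file) of the module
# this writes, e.g. Lib/fontTools/unicodedata/Blocks.py: RANGES holds the first
# codepoint of each range and VALUES the property value at the same index, so a
# codepoint's value can be found by bisecting RANGES:
#
#     RANGES = [
#         0x0000,  # .. 0x007F ; Basic Latin
#         0x0080,  # .. 0x00FF ; Latin-1 Supplement
#         ...
#     ]
#     VALUES = [
#         'Basic Latin',  # 0000..007F
#         'Latin-1 Supplement',  # 0080..00FF
#         ...
#     ]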


_normalize_re = re.compile(r"[-_ ]+")


def normalize(string):
    """Lowercase the string and strip spaces, '-' and '_', for loose matching."""
    return _normalize_re.sub("", string).lower()
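# For example:
#
#     >>> normalize("Latin-1 Supplement")
#     'latin1supplement'
#     >>> normalize("Old_Italic") == normalize("old italic")
#     True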


def parse_property_value_aliases(property_tag, local_ucd=None):
    """Fetch the current 'PropertyValueAliases.txt' from the Unicode website,
    parse the values for the specified 'property_tag' and return a dictionary
    of name aliases (list of strings) keyed by short value codes (strings).

    To load the data file from a local directory, you can use the
    'local_ucd' argument.
    """
    filename = "PropertyValueAliases.txt"
    if local_ucd:
        log.info("loading '%s' from local directory '%s'", filename, local_ucd)
        cm = open(pjoin(local_ucd, filename), "r", encoding="utf-8")
    else:
        log.info("downloading '%s' from '%s'", filename, UNIDATA_URL)
        cm = open_unidata_file(filename)

    with cm as f:
        header = parse_unidata_header(f)
        data = parse_semicolon_separated_data(f)

    aliases = {item[1]: item[2:] for item in data if item[0] == property_tag}

    return aliases
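# A sketch of the returned mapping for property_tag="sc" (Script), assuming the
# usual "sc ; <short code> ; <long name> [; more aliases]" layout of
# PropertyValueAliases.txt:
#
#     >>> aliases = parse_property_value_aliases("sc")  # downloads the file
#     >>> aliases["Latn"]
#     ['Latin']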


def main():
    import argparse

    parser = argparse.ArgumentParser(
        description="Generate fontTools.unicodedata from UCD data files"
    )
    parser.add_argument(
        "--ucd-path", help="Path to local folder containing UCD data files"
    )
    parser.add_argument("-q", "--quiet", action="store_true")
    options = parser.parse_args()

    level = "WARNING" if options.quiet else "INFO"
    logging.basicConfig(level=level, format="%(message)s")

    build_ranges("Blocks.txt", local_ucd=options.ucd_path, default="No_Block")

    script_aliases = parse_property_value_aliases("sc", options.ucd_path)
    build_ranges(
        "Scripts.txt",
        local_ucd=options.ucd_path,
        default="Unknown",
        aliases=script_aliases,
    )
    build_ranges("ScriptExtensions.txt", local_ucd=options.ucd_path, is_set=True)
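# Typical invocations (the local UCD path below is a placeholder):
#
#     python MetaTools/buildUCD.py                        # download files from unicode.org
#     python MetaTools/buildUCD.py --ucd-path /path/to/UCD -q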


if __name__ == "__main__":
    import sys

    sys.exit(main())