#!/usr/bin/env python3
#
# Copyright 2011-2022 The Rust Project Developers. See the COPYRIGHT
# file at the top-level directory of this distribution and at
# http://rust-lang.org/COPYRIGHT.
#
# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
# option. This file may not be copied, modified, or distributed
# except according to those terms.

# This script uses the following Unicode tables:
# - EastAsianWidth.txt
# - ReadMe.txt
# - UnicodeData.txt
#
# Since this should not require frequent updates, we just store this
# out-of-line and check the generated module into git.

import enum
import math
import os
import re
import sys

NUM_CODEPOINTS = 0x110000
"""An upper bound for which `range(0, NUM_CODEPOINTS)` contains Unicode's codespace."""

MAX_CODEPOINT_BITS = math.ceil(math.log2(NUM_CODEPOINTS - 1))
"""The maximum number of bits required to represent a Unicode codepoint."""


class OffsetType(enum.IntEnum):
    """Represents the data type of a lookup table's offsets. Each variant's value is the
    number of bits required to represent that variant's type."""

    U2 = 2
    """Offsets are 2-bit unsigned integers, packed four-per-byte."""
    U4 = 4
    """Offsets are 4-bit unsigned integers, packed two-per-byte."""
    U8 = 8
    """Each offset is a single byte (u8)."""


TABLE_CFGS = [
    (13, MAX_CODEPOINT_BITS, OffsetType.U8),
    (6, 13, OffsetType.U8),
    (0, 6, OffsetType.U2),
]
"""Represents the format of each level of the multi-level lookup table.
A level's entry is of the form `(low_bit, cap_bit, offset_type)`.
This means that every sub-table in that level is indexed by bits `low_bit..cap_bit` of the
codepoint, and those tables' offsets are stored according to `offset_type`.

If this is edited, you must ensure that `emit_module` reflects your changes."""
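
# Illustrative sketch only (not used by the generator): with the configuration
# above, a codepoint is decomposed into a level-0 index (bits 13..21), a
# level-1 index (bits 6..13), and a level-2 index (bits 0..6). For example,
# U+1F336 (cp = 0x1F336) decomposes as:
#
#     cp >> 13 == 0x0F          # level-0 index
#     (cp >> 6) & 0x7F == 0x4C  # level-1 index
#     cp & 0x3F == 0x36         # level-2 index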

MODULE_FILENAME = "tables.rs"
"""The filename of the emitted Rust module (will be created in the working directory)."""

Codepoint = int
BitPos = int


def fetch_open(filename: str):
    """Opens `filename` and returns its corresponding file object. If `filename` isn't on disk,
    fetches it from `http://www.unicode.org/Public/UNIDATA/`. Exits with code 1 on failure."""
    if not os.path.exists(os.path.basename(filename)):
        os.system(f"curl -O http://www.unicode.org/Public/UNIDATA/{filename}")
    try:
        return open(filename, encoding="utf-8")
    except OSError:
        sys.stderr.write(f"cannot load {filename}\n")
        sys.exit(1)


def load_unicode_version() -> "tuple[int, int, int]":
    """Returns the current Unicode version by fetching and processing `ReadMe.txt`."""
    with fetch_open("ReadMe.txt") as readme:
        pattern = r"for Version (\d+)\.(\d+)\.(\d+) of the Unicode"
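        # e.g. this matches a sentence like "for Version 15.0.0 of the Unicode
        # Standard" (the exact version number varies from release to release)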
        return tuple(map(int, re.search(pattern, readme.read()).groups()))


class EffectiveWidth(enum.IntEnum):
    """Represents the width of a Unicode character. All East Asian Width classes resolve into
    either `EffectiveWidth.NARROW`, `EffectiveWidth.WIDE`, or `EffectiveWidth.AMBIGUOUS`."""

    ZERO = 0
    """ Zero columns wide. """
    NARROW = 1
    """ One column wide. """
    WIDE = 2
    """ Two columns wide. """
    AMBIGUOUS = 3
    """ Two columns wide in a CJK context. One column wide in all other contexts. """

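# Note: every `EffectiveWidth` value fits in 2 bits. This is what lets the
# final lookup-table level use `OffsetType.U2`, and it is why the generated
# Rust code can treat the packed value 3 (`AMBIGUOUS`) as a sentinel that is
# resolved against `is_cjk` at lookup time.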

def load_east_asian_widths() -> "list[EffectiveWidth]":
    """Return a list of effective widths, indexed by codepoint.
    Widths are determined by fetching and parsing `EastAsianWidth.txt`.

    `Neutral`, `Narrow`, and `Halfwidth` characters are assigned `EffectiveWidth.NARROW`.

    `Wide` and `Fullwidth` characters are assigned `EffectiveWidth.WIDE`.

    `Ambiguous` characters are assigned `EffectiveWidth.AMBIGUOUS`."""
    with fetch_open("EastAsianWidth.txt") as eaw:
        # matches a width assignment for a single codepoint, i.e. "1F336;N  # ..."
        single = re.compile(r"^([0-9A-F]+)\s*;\s*(\w+) +# (\w+)")
        # matches a width assignment for a range of codepoints, i.e. "3001..3003;W  # ..."
        multiple = re.compile(r"^([0-9A-F]+)\.\.([0-9A-F]+)\s*;\s*(\w+) +# (\w+)")
        # map between width category code and condensed width
        width_codes = {
            **{c: EffectiveWidth.NARROW for c in ["N", "Na", "H"]},
            **{c: EffectiveWidth.WIDE for c in ["W", "F"]},
            "A": EffectiveWidth.AMBIGUOUS,
        }
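        # e.g. the line "3001..3003;W  # ..." assigns `EffectiveWidth.WIDE`
        # to codepoints U+3001..U+3003 via the mapping above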

        width_map = []
        current = 0
        for line in eaw.readlines():
            raw_data = None  # (low, high, width)
            if match := single.match(line):
                raw_data = (match.group(1), match.group(1), match.group(2))
            elif match := multiple.match(line):
                raw_data = (match.group(1), match.group(2), match.group(3))
            else:
                continue
            low = int(raw_data[0], 16)
            high = int(raw_data[1], 16)
            width = width_codes[raw_data[2]]

            assert current <= high
            while current <= high:
                # Some codepoints don't fall into any of the ranges in EastAsianWidth.txt.
                # All such codepoints are implicitly given Neutral width (which resolves to narrow).
                width_map.append(EffectiveWidth.NARROW if current < low else width)
                current += 1

        while len(width_map) < NUM_CODEPOINTS:
            # Catch any leftover codepoints and assign them implicit Neutral/narrow width.
            width_map.append(EffectiveWidth.NARROW)

        return width_map


def load_zero_widths() -> "list[bool]":
    """Returns a list `l` where `l[c]` is true if codepoint `c` is considered a zero-width
    character. `c` is considered a zero-width character if `c` is in general categories
    `Cc`, `Cf`, `Mn`, or `Me` (determined by fetching and processing `UnicodeData.txt`)."""
    with fetch_open("UnicodeData.txt") as categories:
        zw_map = []
        current = 0
        for line in categories.readlines():
            if len(raw_data := line.split(";")) != 15:
                continue
            [codepoint, name, cat_code] = [
                int(raw_data[0], 16),
                raw_data[1],
                raw_data[2],
            ]
            zero_width = cat_code in ["Cc", "Cf", "Mn", "Me"]

            assert current <= codepoint
            while current <= codepoint:
                if name.endswith(", Last>") or current == codepoint:
                    # if name ends with Last, we backfill the width value to all codepoints since
                    # the previous codepoint (aka the start of the range)
                    zw_map.append(zero_width)
                else:
                    # unassigned characters are implicitly given Neutral width, which is nonzero
                    zw_map.append(False)
                current += 1

        while len(zw_map) < NUM_CODEPOINTS:
            # Catch any leftover codepoints. They must be unassigned (so nonzero width)
            zw_map.append(False)

        return zw_map


class Bucket:
    """A bucket contains a group of codepoints and an ordered width list. If one bucket's width
    list is a prefix of another's width list, those buckets can be merged via `try_extend`."""

    def __init__(self):
        """Creates an empty bucket."""
        self.entry_set = set()
        self.widths = []

    def append(self, codepoint: Codepoint, width: EffectiveWidth):
        """Adds a codepoint/width pair to the bucket, and appends `width` to the width list."""
        self.entry_set.add((codepoint, width))
        self.widths.append(width)

    def try_extend(self, attempt: "Bucket") -> bool:
        """If either `self`'s or `attempt`'s width list starts with the other bucket's width list,
        set `self`'s width list to the longer of the two, add all of `attempt`'s codepoints
        into `self`, and return `True`. Otherwise, return `False`."""
        (less, more) = (self.widths, attempt.widths)
        if len(self.widths) > len(attempt.widths):
            (less, more) = (attempt.widths, self.widths)
        if less != more[: len(less)]:
            return False
        self.entry_set |= attempt.entry_set
        self.widths = more
        return True
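
    # Example (illustrative only): if `a.widths == [NARROW, NARROW]` and
    # `b.widths == [NARROW, NARROW, WIDE]`, then `a.try_extend(b)` returns
    # `True`, `a` absorbs `b`'s codepoints, and `a.widths` becomes the longer
    # list `[NARROW, NARROW, WIDE]`.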

    def entries(self) -> "list[tuple[Codepoint, EffectiveWidth]]":
        """Return a list of the codepoint/width pairs in this bucket, sorted by codepoint."""
        result = list(self.entry_set)
        result.sort()
        return result

    def width(self) -> "EffectiveWidth":
        """If all codepoints in this bucket have the same width, return that width; otherwise,
        return `None`."""
        if len(self.widths) == 0:
            return None
        potential_width = self.widths[0]
        for width in self.widths[1:]:
            if potential_width != width:
                return None
        return potential_width
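
    # E.g. a bucket whose width list is `[NARROW, NARROW]` has width `NARROW`,
    # while one whose width list is `[NARROW, WIDE]` has no single width, so
    # `width()` returns `None`.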


def make_buckets(entries, low_bit: BitPos, cap_bit: BitPos) -> "list[Bucket]":
    """Partitions the `(Codepoint, EffectiveWidth)` tuples in `entries` into `Bucket`s. All
    codepoints with identical bits from `low_bit` to `cap_bit` (exclusive) are placed in the
    same bucket. Returns a list of the buckets in increasing order of those bits."""
    num_bits = cap_bit - low_bit
    assert num_bits > 0
    buckets = [Bucket() for _ in range(0, 2 ** num_bits)]
    mask = (1 << num_bits) - 1
    for (codepoint, width) in entries:
        buckets[(codepoint >> low_bit) & mask].append(codepoint, width)
    return buckets
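
# Illustrative sketch: `make_buckets(entries, 0, 2)` yields four buckets keyed
# by each codepoint's low two bits: codepoints 0x00, 0x04, 0x08, ... land in
# bucket 0, codepoints 0x01, 0x05, 0x09, ... in bucket 1, and so on.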


class Table:
    """Represents a lookup table. Each table contains a certain number of subtables; each
    subtable is indexed by a contiguous bit range of the codepoint and contains a list
    of `2**(number of bits in bit range)` entries. (The bit range is the same for all subtables.)

    Typically, tables contain a list of buckets of codepoints. Bucket `i`'s codepoints should
    be indexed by sub-table `i` in the next-level lookup table. The entries of this table are
    indexes into the bucket list (~= indexes into the sub-tables of the next-level table.) The
    key to compression is that two different buckets in two different sub-tables may have the
    same width list, which means that they can be merged into the same bucket.

    If no bucket contains two codepoints with different widths, calling `indices_to_widths` will
    discard the buckets and convert the entries into `EffectiveWidth` values."""

    def __init__(
        self, entry_groups, low_bit: BitPos, cap_bit: BitPos, offset_type: OffsetType
    ):
        """Create a lookup table with a sub-table for each `(Codepoint, EffectiveWidth)` iterator
        in `entry_groups`. Each sub-table is indexed by codepoint bits in `low_bit..cap_bit`,
        and each table entry is represented in the format specified by `offset_type`. Asserts
        that this table is actually representable with `offset_type`."""
        self.low_bit = low_bit
        self.cap_bit = cap_bit
        self.offset_type = offset_type
        self.entries = []
        self.indexed = []

        buckets = []
        for entries in entry_groups:
            buckets.extend(make_buckets(entries, self.low_bit, self.cap_bit))

        for bucket in buckets:
            for (i, existing) in enumerate(self.indexed):
                if existing.try_extend(bucket):
                    self.entries.append(i)
                    break
            else:
                # (for-else: runs only when no existing bucket could absorb this one)
                self.entries.append(len(self.indexed))
                self.indexed.append(bucket)

        # Validate offset type
        for index in self.entries:
            assert index < (1 << int(self.offset_type))

    def indices_to_widths(self):
        """Destructively converts the indices in this table to the `EffectiveWidth` values of
        their buckets. Assumes that no bucket contains codepoints with different widths."""
        self.entries = list(map(lambda i: int(self.indexed[i].width()), self.entries))
        del self.indexed

    def buckets(self):
        """Returns an iterator over this table's buckets."""
        return self.indexed

    def to_bytes(self) -> "list[int]":
        """Returns this table's entries as a list of bytes. The bytes are formatted according to
        the `OffsetType` which the table was created with, converting any `EffectiveWidth` entries
        to their enum variant's integer value. For example, with `OffsetType.U2`, each byte will
        contain four packed 2-bit entries."""
        entries_per_byte = 8 // int(self.offset_type)
        byte_array = []
        for i in range(0, len(self.entries), entries_per_byte):
            byte = 0
            for j in range(0, entries_per_byte):
                byte |= self.entries[i + j] << (j * int(self.offset_type))
            byte_array.append(byte)
        return byte_array
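
    # Example (illustrative only): with `OffsetType.U2`, the entries
    # [0, 1, 2, 3] pack into the single byte 0b11_10_01_00 == 0xE4, since
    # entry `j` occupies bits `2*j..2*j + 2` of its byte.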


def make_tables(
    table_cfgs: "list[tuple[BitPos, BitPos, OffsetType]]", entries
) -> "list[Table]":
    """Creates a table for each configuration in `table_cfgs`, with the first config corresponding
    to the top-level lookup table, the second config corresponding to the second-level lookup
    table, and so forth. `entries` is an iterator over the `(Codepoint, EffectiveWidth)` pairs
    to include in the top-level table."""
    tables = []
    entry_groups = [entries]
    for (low_bit, cap_bit, offset_type) in table_cfgs:
        table = Table(entry_groups, low_bit, cap_bit, offset_type)
        entry_groups = map(lambda bucket: bucket.entries(), table.buckets())
        tables.append(table)
    return tables
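
# Sketch of the resulting structure (with the default `TABLE_CFGS`): tables[0]
# is indexed by codepoint bits 13..21, each of its buckets becomes a sub-table
# of tables[1] (bits 6..13), and those buckets in turn become the sub-tables
# of tables[2] (bits 0..6).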


def emit_module(
    out_name: str, unicode_version: "tuple[int, int, int]", tables: "list[Table]"
):
    """Outputs a Rust module to `out_name` using table data from `tables`.
    If `TABLE_CFGS` is edited, you may need to edit the included code for `lookup_width`."""
    if os.path.exists(out_name):
        os.remove(out_name)
    with open(out_name, "w", newline="\n", encoding="utf-8") as module:
        module.write(
            """// Copyright 2012-2022 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// NOTE: The following code was generated by "scripts/unicode.py", do not edit directly
"""
        )
        module.write(
            f"""
/// The version of [Unicode](http://www.unicode.org/)
/// that this version of unicode-width is based on.
pub const UNICODE_VERSION: (u8, u8, u8) = {unicode_version};
"""
        )

        module.write(
            """
pub mod charwidth {
    use core::option::Option::{self, None, Some};

    /// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c` by
    /// consulting a multi-level lookup table.
    /// If `is_cjk == true`, ambiguous width characters are treated as double width; otherwise,
    /// they're treated as single width.
    ///
    /// # Maintenance
    /// The tables themselves are autogenerated but this function is hardcoded. You should have
    /// nothing to worry about if you re-run `unicode.py` (for example, when updating Unicode.)
    /// However, if you change the *actual structure* of the lookup tables (perhaps by editing the
    /// `TABLE_CFGS` global in `unicode.py`) you must ensure that this code reflects those changes.
    #[inline]
    fn lookup_width(c: char, is_cjk: bool) -> usize {
        let cp = c as usize;

        let t1_offset = TABLES_0[cp >> 13 & 0xFF];

        // Each sub-table in TABLES_1 is 7 bits, and each stored entry is a byte,
        // so each sub-table is 128 bytes in size.
        // (Sub-tables are selected using the computed offset from the previous table.)
        let t2_offset = TABLES_1[128 * usize::from(t1_offset) + (cp >> 6 & 0x7F)];

        // Each sub-table in TABLES_2 is 6 bits, but each stored entry is 2 bits.
        // This is accomplished by packing four stored entries into one byte.
        // So each sub-table is 2**(6-2) == 16 bytes in size.
        // Since this is the last table, each entry represents an encoded width.
        let packed_widths = TABLES_2[16 * usize::from(t2_offset) + (cp >> 2 & 0xF)];

        // Extract the packed width
        let width = packed_widths >> (2 * (cp & 0b11)) & 0b11;

        // A width of 3 signifies that the codepoint is ambiguous width.
        if width == 3 {
            if is_cjk {
                2
            } else {
                1
            }
        } else {
            width.into()
        }
    }
"""
        )

        module.write(
            """
    /// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c`, or
    /// `None` if `c` is a control character other than `'\\x00'`.
    /// If `is_cjk == true`, ambiguous width characters are treated as double width; otherwise,
    /// they're treated as single width.
    #[inline]
    pub fn width(c: char, is_cjk: bool) -> Option<usize> {
        if c < '\\u{7F}' {
            if c >= '\\u{20}' {
                // U+0020 to U+007F (exclusive) are single-width ASCII codepoints
                Some(1)
            } else if c == '\\0' {
                // U+0000 *is* a control code, but it's special-cased
                Some(0)
            } else {
                // U+0001 to U+0020 (exclusive) are control codes
                None
            }
        } else if c >= '\\u{A0}' {
            // No characters >= U+00A0 are control codes, so we can consult the lookup tables
            Some(lookup_width(c, is_cjk))
        } else {
            // U+007F to U+00A0 (exclusive) are control codes
            None
        }
    }
"""
        )

        subtable_count = 1
        for (i, table) in enumerate(tables):
            new_subtable_count = len(table.buckets())
            if i == len(tables) - 1:
                table.indices_to_widths()  # for the last table, indices == widths
            byte_array = table.to_bytes()
            module.write(
                f"""
    /// Autogenerated. {subtable_count} sub-table(s). Consult [`lookup_width`] for layout info.
    static TABLES_{i}: [u8; {len(byte_array)}] = ["""
            )
            for (j, byte) in enumerate(byte_array):
                # Add line breaks for every 15th entry (chosen to match what rustfmt does)
                if j % 15 == 0:
                    module.write("\n       ")
                module.write(f" 0x{byte:02X},")
            module.write("\n    ];\n")
            subtable_count = new_subtable_count
        module.write("}\n")


def main(module_filename: str):
    """Obtain character data from the latest version of Unicode, transform it into a multi-level
    lookup table for character width, and write a Rust module utilizing that table to
    `module_filename`.

    We obey the following rules, in decreasing order of importance:
    - The soft hyphen (`U+00AD`) is single-width.
    - Hangul Jamo medial vowels & final consonants (`U+1160..=U+11FF`) are zero-width.
    - All codepoints in general categories `Cc`, `Cf`, `Mn`, and `Me` are zero-width.
    - All codepoints with an East Asian Width of `Ambiguous` are ambiguous-width.
    - All codepoints with an East Asian Width of `Wide` or `Fullwidth` are double-width.
    - All other codepoints (including unassigned codepoints and codepoints with an East Asian Width
      of `Neutral`, `Narrow`, or `Halfwidth`) are single-width.

    These rules are based on Markus Kuhn's free `wcwidth()` implementation:
    http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c"""
    version = load_unicode_version()
    print(f"Generating module for Unicode {version[0]}.{version[1]}.{version[2]}")

    eaw_map = load_east_asian_widths()
    zw_map = load_zero_widths()

    # Characters marked as zero-width in zw_map should be zero-width in the final map
    width_map = list(
        map(lambda x: EffectiveWidth.ZERO if x[1] else x[0], zip(eaw_map, zw_map))
    )

    # Override for soft hyphen
    width_map[0x00AD] = EffectiveWidth.NARROW

    # Override for Hangul Jamo medial vowels & final consonants
    for i in range(0x1160, 0x11FF + 1):
        width_map[i] = EffectiveWidth.ZERO

    tables = make_tables(TABLE_CFGS, enumerate(width_map))

    print("------------------------")
    total_size = 0
    for (i, table) in enumerate(tables):
        size_bytes = len(table.to_bytes())
        print(f"Table {i} Size: {size_bytes} bytes")
        total_size += size_bytes
    print("------------------------")
    print(f"  Total Size: {total_size} bytes")

    emit_module(module_filename, version, tables)
    print(f'Wrote to "{module_filename}"')


if __name__ == "__main__":
    main(MODULE_FILENAME)