1#!/usr/bin/env python3 2# 3# Copyright 2011-2022 The Rust Project Developers. See the COPYRIGHT 4# file at the top-level directory of this distribution and at 5# http://rust-lang.org/COPYRIGHT. 6# 7# Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or 8# http://www.apache.org/licenses/LICENSE-2.0> or the MIT license 9# <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your 10# option. This file may not be copied, modified, or distributed 11# except according to those terms. 12 13# This script uses the following Unicode tables: 14# - EastAsianWidth.txt 15# - ReadMe.txt 16# - UnicodeData.txt 17# 18# Since this should not require frequent updates, we just store this 19# out-of-line and check the generated module into git. 20 21import enum 22import math 23import os 24import re 25import sys 26 27NUM_CODEPOINTS = 0x110000 28"""An upper bound for which `range(0, NUM_CODEPOINTS)` contains Unicode's codespace.""" 29 30MAX_CODEPOINT_BITS = math.ceil(math.log2(NUM_CODEPOINTS - 1)) 31"""The maximum number of bits required to represent a Unicode codepoint.""" 32 33 34class OffsetType(enum.IntEnum): 35 """Represents the data type of a lookup table's offsets. Each variant's value represents the 36 number of bits required to represent that variant's type.""" 37 38 U2 = 2 39 """Offsets are 2-bit unsigned integers, packed four-per-byte.""" 40 U4 = 4 41 """Offsets are 4-bit unsigned integers, packed two-per-byte.""" 42 U8 = 8 43 """Each offset is a single byte (u8).""" 44 45 46TABLE_CFGS = [ 47 (13, MAX_CODEPOINT_BITS, OffsetType.U8), 48 (6, 13, OffsetType.U8), 49 (0, 6, OffsetType.U2), 50] 51"""Represents the format of each level of the multi-level lookup table. 52A level's entry is of the form `(low_bit, cap_bit, offset_type)`. 53This means that every sub-table in that level is indexed by bits `low_bit..cap_bit` of the 54codepoint and those tables offsets are stored according to `offset_type`. 55 56If this is edited, you must ensure that `emit_module` reflects your changes.""" 57 58MODULE_FILENAME = "tables.rs" 59"""The filename of the emitted Rust module (will be created in the working directory)""" 60 61Codepoint = int 62BitPos = int 63 64 65def fetch_open(filename: str): 66 """Opens `filename` and return its corresponding file object. If `filename` isn't on disk, 67 fetches it from `http://www.unicode.org/Public/UNIDATA/`. Exits with code 1 on failure.""" 68 if not os.path.exists(os.path.basename(filename)): 69 os.system(f"curl -O http://www.unicode.org/Public/UNIDATA/{filename}") 70 try: 71 return open(filename, encoding="utf-8") 72 except OSError: 73 sys.stderr.write(f"cannot load {filename}") 74 sys.exit(1) 75 76 77def load_unicode_version() -> "tuple[int, int, int]": 78 """Returns the current Unicode version by fetching and processing `ReadMe.txt`.""" 79 with fetch_open("ReadMe.txt") as readme: 80 pattern = r"for Version (\d+)\.(\d+)\.(\d+) of the Unicode" 81 return tuple(map(int, re.search(pattern, readme.read()).groups())) 82 83 84class EffectiveWidth(enum.IntEnum): 85 """Represents the width of a Unicode character. All East Asian Width classes resolve into 86 either `EffectiveWidth.NARROW`, `EffectiveWidth.WIDE`, or `EffectiveWidth.AMBIGUOUS`.""" 87 88 ZERO = 0 89 """ Zero columns wide. """ 90 NARROW = 1 91 """ One column wide. """ 92 WIDE = 2 93 """ Two columns wide. """ 94 AMBIGUOUS = 3 95 """ Two columns wide in a CJK context. One column wide in all other contexts. """ 96 97 98def load_east_asian_widths() -> "list[EffectiveWidth]": 99 """Return a list of effective widths, indexed by codepoint. 100 Widths are determined by fetching and parsing `EastAsianWidth.txt`. 101 102 `Neutral`, `Narrow`, and `Halfwidth` characters are assigned `EffectiveWidth.NARROW`. 103 104 `Wide` and `Fullwidth` characters are assigned `EffectiveWidth.WIDE`. 105 106 `Ambiguous` chracters are assigned `EffectiveWidth.AMBIGUOUS`.""" 107 with fetch_open("EastAsianWidth.txt") as eaw: 108 # matches a width assignment for a single codepoint, i.e. "1F336;N # ..." 109 single = re.compile(r"^([0-9A-F]+)\s+;\s+(\w+) +# (\w+)") 110 # matches a width assignment for a range of codepoints, i.e. "3001..3003;W # ..." 111 multiple = re.compile(r"^([0-9A-F]+)\.\.([0-9A-F]+)\s+;\s+(\w+) +# (\w+)") 112 # map between width category code and condensed width 113 width_codes = { 114 **{c: EffectiveWidth.NARROW for c in ["N", "Na", "H"]}, 115 **{c: EffectiveWidth.WIDE for c in ["W", "F"]}, 116 "A": EffectiveWidth.AMBIGUOUS, 117 } 118 119 width_map = [] 120 current = 0 121 for line in eaw.readlines(): 122 raw_data = None # (low, high, width) 123 if match := single.match(line): 124 raw_data = (match.group(1), match.group(1), match.group(2)) 125 elif match := multiple.match(line): 126 raw_data = (match.group(1), match.group(2), match.group(3)) 127 else: 128 continue 129 low = int(raw_data[0], 16) 130 high = int(raw_data[1], 16) 131 width = width_codes[raw_data[2]] 132 133 assert current <= high 134 while current <= high: 135 # Some codepoints don't fall into any of the ranges in EastAsianWidth.txt. 136 # All such codepoints are implicitly given Neural width (resolves to narrow) 137 width_map.append(EffectiveWidth.NARROW if current < low else width) 138 current += 1 139 140 while len(width_map) < NUM_CODEPOINTS: 141 # Catch any leftover codepoints and assign them implicit Neutral/narrow width. 142 width_map.append(EffectiveWidth.NARROW) 143 144 return width_map 145 146 147def load_zero_widths() -> "list[bool]": 148 """Returns a list `l` where `l[c]` is true if codepoint `c` is considered a zero-width 149 character. `c` is considered a zero-width character if `c` is in general categories 150 `Cc`, `Cf`, `Mn`, or `Me` (determined by fetching and processing `UnicodeData.txt`).""" 151 with fetch_open("UnicodeData.txt") as categories: 152 zw_map = [] 153 current = 0 154 for line in categories.readlines(): 155 if len(raw_data := line.split(";")) != 15: 156 continue 157 [codepoint, name, cat_code] = [ 158 int(raw_data[0], 16), 159 raw_data[1], 160 raw_data[2], 161 ] 162 zero_width = cat_code in ["Cc", "Cf", "Mn", "Me"] 163 164 assert current <= codepoint 165 while current <= codepoint: 166 if name.endswith(", Last>") or current == codepoint: 167 # if name ends with Last, we backfill the width value to all codepoints since 168 # the previous codepoint (aka the start of the range) 169 zw_map.append(zero_width) 170 else: 171 # unassigned characters are implicitly given Neutral width, which is nonzero 172 zw_map.append(False) 173 current += 1 174 175 while len(zw_map) < NUM_CODEPOINTS: 176 # Catch any leftover codepoints. They must be unassigned (so nonzero width) 177 zw_map.append(False) 178 179 return zw_map 180 181 182class Bucket: 183 """A bucket contains a group of codepoints and an ordered width list. If one bucket's width 184 list overlaps with another's width list, those buckets can be merged via `try_extend`.""" 185 186 def __init__(self): 187 """Creates an empty bucket.""" 188 self.entry_set = set() 189 self.widths = [] 190 191 def append(self, codepoint: Codepoint, width: EffectiveWidth): 192 """Adds a codepoint/width pair to the bucket, and appends `width` to the width list.""" 193 self.entry_set.add((codepoint, width)) 194 self.widths.append(width) 195 196 def try_extend(self, attempt: "Bucket") -> bool: 197 """If either `self` or `attempt`'s width list starts with the other bucket's width list, 198 set `self`'s width list to the longer of the two, add all of `attempt`'s codepoints 199 into `self`, and return `True`. Otherwise, return `False`.""" 200 (less, more) = (self.widths, attempt.widths) 201 if len(self.widths) > len(attempt.widths): 202 (less, more) = (attempt.widths, self.widths) 203 if less != more[: len(less)]: 204 return False 205 self.entry_set |= attempt.entry_set 206 self.widths = more 207 return True 208 209 def entries(self) -> "list[tuple[Codepoint, EffectiveWidth]]": 210 """Return a list of the codepoint/width pairs in this bucket, sorted by codepoint.""" 211 result = list(self.entry_set) 212 result.sort() 213 return result 214 215 def width(self) -> "EffectiveWidth": 216 """If all codepoints in this bucket have the same width, return that width; otherwise, 217 return `None`.""" 218 if len(self.widths) == 0: 219 return None 220 potential_width = self.widths[0] 221 for width in self.widths[1:]: 222 if potential_width != width: 223 return None 224 return potential_width 225 226 227def make_buckets(entries, low_bit: BitPos, cap_bit: BitPos) -> "list[Bucket]": 228 """Partitions the `(Codepoint, EffectiveWidth)` tuples in `entries` into `Bucket`s. All 229 codepoints with identical bits from `low_bit` to `cap_bit` (exclusive) are placed in the 230 same bucket. Returns a list of the buckets in increasing order of those bits.""" 231 num_bits = cap_bit - low_bit 232 assert num_bits > 0 233 buckets = [Bucket() for _ in range(0, 2 ** num_bits)] 234 mask = (1 << num_bits) - 1 235 for (codepoint, width) in entries: 236 buckets[(codepoint >> low_bit) & mask].append(codepoint, width) 237 return buckets 238 239 240class Table: 241 """Represents a lookup table. Each table contains a certain number of subtables; each 242 subtable is indexed by a contiguous bit range of the codepoint and contains a list 243 of `2**(number of bits in bit range)` entries. (The bit range is the same for all subtables.) 244 245 Typically, tables contain a list of buckets of codepoints. Bucket `i`'s codepoints should 246 be indexed by sub-table `i` in the next-level lookup table. The entries of this table are 247 indexes into the bucket list (~= indexes into the sub-tables of the next-level table.) The 248 key to compression is that two different buckets in two different sub-tables may have the 249 same width list, which means that they can be merged into the same bucket. 250 251 If no bucket contains two codepoints with different widths, calling `indices_to_widths` will 252 discard the buckets and convert the entries into `EffectiveWidth` values.""" 253 254 def __init__( 255 self, entry_groups, low_bit: BitPos, cap_bit: BitPos, offset_type: OffsetType 256 ): 257 """Create a lookup table with a sub-table for each `(Codepoint, EffectiveWidth)` iterator 258 in `entry_groups`. Each sub-table is indexed by codepoint bits in `low_bit..cap_bit`, 259 and each table entry is represented in the format specified by `offset_type`. Asserts 260 that this table is actually representable with `offset_type`.""" 261 self.low_bit = low_bit 262 self.cap_bit = cap_bit 263 self.offset_type = offset_type 264 self.entries = [] 265 self.indexed = [] 266 267 buckets = [] 268 for entries in entry_groups: 269 buckets.extend(make_buckets(entries, self.low_bit, self.cap_bit)) 270 271 for bucket in buckets: 272 for (i, existing) in enumerate(self.indexed): 273 if existing.try_extend(bucket): 274 self.entries.append(i) 275 break 276 else: 277 self.entries.append(len(self.indexed)) 278 self.indexed.append(bucket) 279 280 # Validate offset type 281 for index in self.entries: 282 assert index < (1 << int(self.offset_type)) 283 284 def indices_to_widths(self): 285 """Destructively converts the indices in this table to the `EffectiveWidth` values of 286 their buckets. Assumes that no bucket contains codepoints with different widths.""" 287 self.entries = list(map(lambda i: int(self.indexed[i].width()), self.entries)) 288 del self.indexed 289 290 def buckets(self): 291 """Returns an iterator over this table's buckets.""" 292 return self.indexed 293 294 def to_bytes(self) -> "list[int]": 295 """Returns this table's entries as a list of bytes. The bytes are formatted according to 296 the `OffsetType` which the table was created with, converting any `EffectiveWidth` entries 297 to their enum variant's integer value. For example, with `OffsetType.U2`, each byte will 298 contain four packed 2-bit entries.""" 299 entries_per_byte = 8 // int(self.offset_type) 300 byte_array = [] 301 for i in range(0, len(self.entries), entries_per_byte): 302 byte = 0 303 for j in range(0, entries_per_byte): 304 byte |= self.entries[i + j] << (j * int(self.offset_type)) 305 byte_array.append(byte) 306 return byte_array 307 308 309def make_tables( 310 table_cfgs: "list[tuple[BitPos, BitPos, OffsetType]]", entries 311) -> "list[Table]": 312 """Creates a table for each configuration in `table_cfgs`, with the first config corresponding 313 to the top-level lookup table, the second config corresponding to the second-level lookup 314 table, and so forth. `entries` is an iterator over the `(Codepoint, EffectiveWidth)` pairs 315 to include in the top-level table.""" 316 tables = [] 317 entry_groups = [entries] 318 for (low_bit, cap_bit, offset_type) in table_cfgs: 319 table = Table(entry_groups, low_bit, cap_bit, offset_type) 320 entry_groups = map(lambda bucket: bucket.entries(), table.buckets()) 321 tables.append(table) 322 return tables 323 324 325def emit_module( 326 out_name: str, unicode_version: "tuple[int, int, int]", tables: "list[Table]" 327): 328 """Outputs a Rust module to `out_name` using table data from `tables`. 329 If `TABLE_CFGS` is edited, you may need to edit the included code for `lookup_width`.""" 330 if os.path.exists(out_name): 331 os.remove(out_name) 332 with open(out_name, "w", newline="\n", encoding="utf-8") as module: 333 module.write( 334 """// Copyright 2012-2022 The Rust Project Developers. See the COPYRIGHT 335// file at the top-level directory of this distribution and at 336// http://rust-lang.org/COPYRIGHT. 337// 338// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or 339// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license 340// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your 341// option. This file may not be copied, modified, or distributed 342// except according to those terms. 343 344// NOTE: The following code was generated by "scripts/unicode.py", do not edit directly 345""" 346 ) 347 module.write( 348 f""" 349/// The version of [Unicode](http://www.unicode.org/) 350/// that this version of unicode-width is based on. 351pub const UNICODE_VERSION: (u8, u8, u8) = {unicode_version}; 352""" 353 ) 354 355 module.write( 356 """ 357pub mod charwidth { 358 use core::option::Option::{self, None, Some}; 359 360 /// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c` by 361 /// consulting a multi-level lookup table. 362 /// If `is_cjk == true`, ambiguous width characters are treated as double width; otherwise, 363 /// they're treated as single width. 364 /// 365 /// # Maintenance 366 /// The tables themselves are autogenerated but this function is hardcoded. You should have 367 /// nothing to worry about if you re-run `unicode.py` (for example, when updating Unicode.) 368 /// However, if you change the *actual structure* of the lookup tables (perhaps by editing the 369 /// `TABLE_CFGS` global in `unicode.py`) you must ensure that this code reflects those changes. 370 #[inline] 371 fn lookup_width(c: char, is_cjk: bool) -> usize { 372 let cp = c as usize; 373 374 let t1_offset = TABLES_0[cp >> 13 & 0xFF]; 375 376 // Each sub-table in TABLES_1 is 7 bits, and each stored entry is a byte, 377 // so each sub-table is 128 bytes in size. 378 // (Sub-tables are selected using the computed offset from the previous table.) 379 let t2_offset = TABLES_1[128 * usize::from(t1_offset) + (cp >> 6 & 0x7F)]; 380 381 // Each sub-table in TABLES_2 is 6 bits, but each stored entry is 2 bits. 382 // This is accomplished by packing four stored entries into one byte. 383 // So each sub-table is 2**(6-2) == 16 bytes in size. 384 // Since this is the last table, each entry represents an encoded width. 385 let packed_widths = TABLES_2[16 * usize::from(t2_offset) + (cp >> 2 & 0xF)]; 386 387 // Extract the packed width 388 let width = packed_widths >> (2 * (cp & 0b11)) & 0b11; 389 390 // A width of 3 signifies that the codepoint is ambiguous width. 391 if width == 3 { 392 if is_cjk { 393 2 394 } else { 395 1 396 } 397 } else { 398 width.into() 399 } 400 } 401""" 402 ) 403 404 module.write( 405 """ 406 /// Returns the [UAX #11](https://www.unicode.org/reports/tr11/) based width of `c`, or 407 /// `None` if `c` is a control character other than `'\\x00'`. 408 /// If `is_cjk == true`, ambiguous width characters are treated as double width; otherwise, 409 /// they're treated as single width. 410 #[inline] 411 pub fn width(c: char, is_cjk: bool) -> Option<usize> { 412 if c < '\\u{7F}' { 413 if c >= '\\u{20}' { 414 // U+0020 to U+007F (exclusive) are single-width ASCII codepoints 415 Some(1) 416 } else if c == '\\0' { 417 // U+0000 *is* a control code, but it's special-cased 418 Some(0) 419 } else { 420 // U+0001 to U+0020 (exclusive) are control codes 421 None 422 } 423 } else if c >= '\\u{A0}' { 424 // No characters >= U+00A0 are control codes, so we can consult the lookup tables 425 Some(lookup_width(c, is_cjk)) 426 } else { 427 // U+007F to U+00A0 (exclusive) are control codes 428 None 429 } 430 } 431""" 432 ) 433 434 subtable_count = 1 435 for (i, table) in enumerate(tables): 436 new_subtable_count = len(table.buckets()) 437 if i == len(tables) - 1: 438 table.indices_to_widths() # for the last table, indices == widths 439 byte_array = table.to_bytes() 440 module.write( 441 f""" 442 /// Autogenerated. {subtable_count} sub-table(s). Consult [`lookup_width`] for layout info. 443 static TABLES_{i}: [u8; {len(byte_array)}] = [""" 444 ) 445 for (j, byte) in enumerate(byte_array): 446 # Add line breaks for every 15th entry (chosen to match what rustfmt does) 447 if j % 15 == 0: 448 module.write("\n ") 449 module.write(f" 0x{byte:02X},") 450 module.write("\n ];\n") 451 subtable_count = new_subtable_count 452 module.write("}\n") 453 454 455def main(module_filename: str): 456 """Obtain character data from the latest version of Unicode, transform it into a multi-level 457 lookup table for character width, and write a Rust module utilizing that table to 458 `module_filename`. 459 460 We obey the following rules in decreasing order of importance: 461 - The soft hyphen (`U+00AD`) is single-width. 462 - Hangul Jamo medial vowels & final consonants (`U+1160..=U+11FF`) are zero-width. 463 - All codepoints in general categories `Cc`, `Cf`, `Mn`, and `Me` are zero-width. 464 - All codepoints with an East Asian Width of `Ambigous` are ambiguous-width. 465 - All codepoints with an East Asian Width of `Wide` or `Fullwidth` are double-width. 466 - All other codepoints (including unassigned codepoints and codepoints with an East Asian Width 467 of `Neutral`, `Narrow`, or `Halfwidth`) are single-width. 468 469 These rules are based off of Markus Kuhn's free `wcwidth()` implementation: 470 http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c""" 471 version = load_unicode_version() 472 print(f"Generating module for Unicode {version[0]}.{version[1]}.{version[2]}") 473 474 eaw_map = load_east_asian_widths() 475 zw_map = load_zero_widths() 476 477 # Characters marked as zero-width in zw_map should be zero-width in the final map 478 width_map = list( 479 map(lambda x: EffectiveWidth.ZERO if x[1] else x[0], zip(eaw_map, zw_map)) 480 ) 481 482 # Override for soft hyphen 483 width_map[0x00AD] = EffectiveWidth.NARROW 484 485 # Override for Hangul Jamo medial vowels & final consonants 486 for i in range(0x1160, 0x11FF + 1): 487 width_map[i] = EffectiveWidth.ZERO 488 489 tables = make_tables(TABLE_CFGS, enumerate(width_map)) 490 491 print("------------------------") 492 total_size = 0 493 for (i, table) in enumerate(tables): 494 size_bytes = len(table.to_bytes()) 495 print(f"Table {i} Size: {size_bytes} bytes") 496 total_size += size_bytes 497 print("------------------------") 498 print(f" Total Size: {total_size} bytes") 499 500 emit_module(module_filename, version, tables) 501 print(f'Wrote to "{module_filename}"') 502 503 504if __name__ == "__main__": 505 main(MODULE_FILENAME) 506