1#!/usr/bin/env python3 2""" 3Tools to parse data files from the Unicode Character Database. 4""" 5 6 7try: 8 from urllib.request import urlopen 9except ImportError: 10 from urllib2 import urlopen 11from contextlib import closing, contextmanager 12import re 13from codecs import iterdecode 14import logging 15import os 16from io import open 17from os.path import abspath, dirname, join as pjoin, pardir, sep 18 19 20try: # pragma: no cover 21 unicode 22except NameError: 23 unicode = str 24 25 26UNIDATA_URL = "https://unicode.org/Public/UNIDATA/" 27UNIDATA_LICENSE_URL = "http://unicode.org/copyright.html#License" 28 29# by default save output files to ../Lib/fontTools/unicodedata/ 30UNIDATA_PATH = ( 31 pjoin(abspath(dirname(__file__)), pardir, "Lib", "fontTools", "unicodedata") + sep 32) 33 34SRC_ENCODING = "# -*- coding: utf-8 -*-\n" 35 36NOTICE = "# NOTE: This file was auto-generated with MetaTools/buildUCD.py.\n" 37 38MAX_UNICODE = 0x10FFFF 39 40log = logging.getLogger() 41 42 43@contextmanager 44def open_unidata_file(filename): 45 """Open a text file from https://unicode.org/Public/UNIDATA/""" 46 url = UNIDATA_URL + filename 47 with closing(urlopen(url)) as response: 48 yield iterdecode(response, encoding="utf-8") 49 50 51def parse_unidata_header(infile): 52 """Read the top header of data files, until the first line 53 that does not start with '#'. 54 """ 55 header = [] 56 line = next(infile) 57 while line.startswith("#"): 58 header.append(line) 59 line = next(infile) 60 return "".join(header) 61 62 63def parse_range_properties(infile, default=None, is_set=False): 64 """Parse a Unicode data file containing a column with one character or 65 a range of characters, and another column containing a property value 66 separated by a semicolon. Comments after '#' are ignored. 67 68 If the ranges defined in the data file are not continuous, assign the 69 'default' property to the unassigned codepoints. 70 71 Return a list of (start, end, property_name) tuples. 72 """ 73 ranges = [] 74 line_regex = re.compile( 75 r"^" 76 r"([0-9A-F]{4,6})" # first character code 77 r"(?:\.\.([0-9A-F]{4,6}))?" # optional second character code 78 r"\s*;\s*" 79 r"([^#]+)" 80 ) # everything up to the potential comment 81 for line in infile: 82 match = line_regex.match(line) 83 if not match: 84 continue 85 86 first, last, data = match.groups() 87 if last is None: 88 last = first 89 90 first = int(first, 16) 91 last = int(last, 16) 92 data = str(data.rstrip()) 93 94 ranges.append((first, last, data)) 95 96 ranges.sort() 97 98 if isinstance(default, unicode): 99 default = str(default) 100 101 # fill the gaps between explicitly defined ranges 102 last_start, last_end = -1, -1 103 full_ranges = [] 104 for start, end, value in ranges: 105 assert last_end < start 106 assert start <= end 107 if start - last_end > 1: 108 full_ranges.append((last_end + 1, start - 1, default)) 109 if is_set: 110 value = set(value.split()) 111 full_ranges.append((start, end, value)) 112 last_start, last_end = start, end 113 if last_end != MAX_UNICODE: 114 full_ranges.append((last_end + 1, MAX_UNICODE, default)) 115 116 # reduce total number of ranges by combining continuous ones 117 last_start, last_end, last_value = full_ranges.pop(0) 118 merged_ranges = [] 119 for start, end, value in full_ranges: 120 if value == last_value: 121 continue 122 else: 123 merged_ranges.append((last_start, start - 1, last_value)) 124 last_start, line_end, last_value = start, end, value 125 merged_ranges.append((last_start, MAX_UNICODE, last_value)) 126 127 # make sure that the ranges cover the full unicode repertoire 128 assert merged_ranges[0][0] == 0 129 for (cs, ce, cv), (ns, ne, nv) in zip(merged_ranges, merged_ranges[1:]): 130 assert ce + 1 == ns 131 assert merged_ranges[-1][1] == MAX_UNICODE 132 133 return merged_ranges 134 135 136def parse_semicolon_separated_data(infile): 137 """Parse a Unicode data file where each line contains a lists of values 138 separated by a semicolon (e.g. "PropertyValueAliases.txt"). 139 The number of the values on different lines may be different. 140 141 Returns a list of lists each containing the values as strings. 142 """ 143 data = [] 144 for line in infile: 145 line = line.split("#", 1)[0].strip() # remove the comment 146 if not line: 147 continue 148 fields = [str(field.strip()) for field in line.split(";")] 149 data.append(fields) 150 return data 151 152 153def _set_repr(value): 154 return ( 155 "None" 156 if value is None 157 else "{{{}}}".format(", ".join(repr(v) for v in sorted(value))) 158 ) 159 160 161def build_ranges( 162 filename, local_ucd=None, output_path=None, default=None, is_set=False, aliases=None 163): 164 """Fetch 'filename' UCD data file from Unicode official website, parse 165 the property ranges and values and write them as two Python lists 166 to 'fontTools.unicodedata.<filename>.py'. 167 168 'aliases' is an optional mapping of property codes (short names) to long 169 name aliases (list of strings, with the first item being the preferred 170 alias). When this is provided, the property values are written using the 171 short notation, and an additional 'NAMES' dict with the aliases is 172 written to the output module. 173 174 To load the data file from a local directory, you can use the 175 'local_ucd' argument. 176 """ 177 modname = os.path.splitext(filename)[0] + ".py" 178 if not output_path: 179 output_path = UNIDATA_PATH + modname 180 181 if local_ucd: 182 log.info("loading '%s' from local directory '%s'", filename, local_ucd) 183 cm = open(pjoin(local_ucd, filename), "r", encoding="utf-8") 184 else: 185 log.info("downloading '%s' from '%s'", filename, UNIDATA_URL) 186 cm = open_unidata_file(filename) 187 188 with cm as f: 189 header = parse_unidata_header(f) 190 ranges = parse_range_properties(f, default=default, is_set=is_set) 191 192 if aliases: 193 reversed_aliases = {normalize(v[0]): k for k, v in aliases.items()} 194 max_value_length = 6 # 4-letter tags plus two quotes for repr 195 else: 196 max_value_length = min(56, max(len(repr(v)) for _, _, v in ranges)) 197 198 with open(output_path, "w", encoding="utf-8") as f: 199 f.write(SRC_ENCODING) 200 f.write("#\n") 201 f.write(NOTICE) 202 f.write("# Source: {}{}\n".format(UNIDATA_URL, filename)) 203 f.write("# License: {}\n".format(UNIDATA_LICENSE_URL)) 204 f.write("#\n") 205 f.write(header + "\n\n") 206 207 f.write("RANGES = [\n") 208 for first, last, value in ranges: 209 f.write( 210 " 0x{:0>4X}, # .. 0x{:0>4X} ; {}\n".format( 211 first, last, _set_repr(value) if is_set else value 212 ) 213 ) 214 f.write("]\n") 215 216 f.write("\n") 217 f.write("VALUES = [\n") 218 for first, last, value in ranges: 219 comment = "# {:0>4X}..{:0>4X}".format(first, last) 220 if is_set: 221 value_repr = "{},".format(_set_repr(value)) 222 else: 223 if aliases: 224 # append long name to comment and use the short code 225 comment += " ; {}".format(value) 226 value = reversed_aliases[normalize(value)] 227 value_repr = "{!r},".format(value) 228 f.write( 229 " {} {}\n".format(value_repr.ljust(max_value_length + 1), comment) 230 ) 231 f.write("]\n") 232 233 if aliases: 234 f.write("\n") 235 f.write("NAMES = {\n") 236 for value, names in sorted(aliases.items()): 237 # we only write the first preferred alias 238 f.write(" {!r}: {!r},\n".format(value, names[0])) 239 f.write("}\n") 240 241 log.info("saved new file: '%s'", os.path.normpath(output_path)) 242 243 244_normalize_re = re.compile(r"[-_ ]+") 245 246 247def normalize(string): 248 """Remove case, strip space, '-' and '_' for loose matching.""" 249 return _normalize_re.sub("", string).lower() 250 251 252def parse_property_value_aliases(property_tag, local_ucd=None): 253 """Fetch the current 'PropertyValueAliases.txt' from the Unicode website, 254 parse the values for the specified 'property_tag' and return a dictionary 255 of name aliases (list of strings) keyed by short value codes (strings). 256 257 To load the data file from a local directory, you can use the 258 'local_ucd' argument. 259 """ 260 filename = "PropertyValueAliases.txt" 261 if local_ucd: 262 log.info("loading '%s' from local directory '%s'", filename, local_ucd) 263 cm = open(pjoin(local_ucd, filename), "r", encoding="utf-8") 264 else: 265 log.info("downloading '%s' from '%s'", filename, UNIDATA_URL) 266 cm = open_unidata_file(filename) 267 268 with cm as f: 269 header = parse_unidata_header(f) 270 data = parse_semicolon_separated_data(f) 271 272 aliases = {item[1]: item[2:] for item in data if item[0] == property_tag} 273 274 return aliases 275 276 277def main(): 278 import argparse 279 280 parser = argparse.ArgumentParser( 281 description="Generate fontTools.unicodedata from UCD data files" 282 ) 283 parser.add_argument( 284 "--ucd-path", help="Path to local folder containing UCD data files" 285 ) 286 parser.add_argument("-q", "--quiet", action="store_true") 287 options = parser.parse_args() 288 289 level = "WARNING" if options.quiet else "INFO" 290 logging.basicConfig(level=level, format="%(message)s") 291 292 build_ranges("Blocks.txt", local_ucd=options.ucd_path, default="No_Block") 293 294 script_aliases = parse_property_value_aliases("sc", options.ucd_path) 295 build_ranges( 296 "Scripts.txt", 297 local_ucd=options.ucd_path, 298 default="Unknown", 299 aliases=script_aliases, 300 ) 301 build_ranges("ScriptExtensions.txt", local_ucd=options.ucd_path, is_set=True) 302 303 304if __name__ == "__main__": 305 import sys 306 307 sys.exit(main()) 308