"""Extend the Python codecs module with a few encodings that are used in OpenType (name table)
but missing from Python.  See https://github.com/fonttools/fonttools/issues/236 for details."""

import codecs
import encodings


class ExtendCodec(codecs.Codec):
    """Codec that layers a small byte <-> character mapping over a base encoding.

    The base encoding does the bulk of the work.  Whenever it reports an
    error, the handler registered under this codec's own name
    (:meth:`error`) is consulted and substitutes an entry from ``mapping``
    (when decoding) or from its inverse (when encoding).
    """

    def __init__(self, name, base_encoding, mapping):
        """Build the codec and register its error handler.

        Args:
            name: Name of the new codec.  The same name is used to register
                :meth:`error` as a ``codecs`` error handler.
            base_encoding: Name of the existing codec that handles everything
                not covered by ``mapping``.
            mapping: Dict from ``bytes`` sequences to the ``str`` each one
                decodes to.
        """
        self.name = name
        self.base_encoding = base_encoding
        self.mapping = mapping
        self.reverse = {v: k for k, v in mapping.items()}
        # Longest decoded string in the mapping; bounds the look-ahead used
        # when encoding.  ``default=0`` keeps an empty mapping from raising.
        self.max_len = max((len(v) for v in mapping.values()), default=0)
        self.info = codecs.CodecInfo(
            name=self.name, encode=self.encode, decode=self.decode
        )
        codecs.register_error(name, self.error)

    def _map(self, mapper, output_type, exc_type, input, errors):
        """Run ``mapper`` over ``input``, applying the extension mapping.

        Args:
            mapper: ``codecs.encode`` or ``codecs.decode``.
            output_type: ``bytes`` when encoding, ``str`` when decoding.
            exc_type: The ``UnicodeError`` subclass ``mapper`` raises.
            input: The data to convert.
            errors: Name of the error handler requested by the caller; it is
                applied to anything neither the base encoding nor the
                extension mapping can convert.

        Returns:
            A ``(output, length_consumed)`` tuple, where ``length_consumed``
            is the length of the original ``input``.
        """
        base_error_handler = codecs.lookup_error(errors)
        length = len(input)
        out = output_type()
        while input:
            # first try to use self.error as the error handler
            try:
                part = mapper(input, self.base_encoding, errors=self.name)
                out += part
                break  # All converted
            except exc_type as e:
                # else convert the correct part, handle error as requested and continue
                out += mapper(input[: e.start], self.base_encoding, self.name)
                replacement, pos = base_error_handler(e)
                if not isinstance(replacement, output_type):
                    # Encoding error handlers such as "replace", "ignore" and
                    # "xmlcharrefreplace" return a str replacement; it has to
                    # be encoded before it can be appended to bytes output.
                    replacement = mapper(replacement, self.base_encoding, self.name)
                out += replacement
                input = input[pos:]
        return out, length

    def encode(self, input, errors="strict"):
        """Encode the str ``input``; return ``(bytes, length_consumed)``."""
        return self._map(codecs.encode, bytes, UnicodeEncodeError, input, errors)

    def decode(self, input, errors="strict"):
        """Decode the bytes ``input``; return ``(str, length_consumed)``."""
        return self._map(codecs.decode, str, UnicodeDecodeError, input, errors)

    def error(self, e):
        """Error handler that substitutes entries from the extension mapping.

        Returns a ``(replacement, resume_position)`` tuple when the offending
        input starts with a sequence known to the mapping.  Otherwise the
        exception is re-raised with its ``encoding`` set to this codec's name.
        """
        if isinstance(e, UnicodeDecodeError):
            # Try successively longer prefixes of the failing byte range.
            for end in range(e.start + 1, e.end + 1):
                s = e.object[e.start : end]
                if s in self.mapping:
                    return self.mapping[s], end
        elif isinstance(e, UnicodeEncodeError):
            # A mapped string may be longer than the reported error range,
            # so look ahead up to the longest string in the mapping.
            for end in range(e.start + 1, e.start + self.max_len + 1):
                s = e.object[e.start : end]
                if s in self.reverse:
                    return self.reverse[s], end
        e.encoding = self.name
        raise e


# Codec name -> (fallback base encoding, extra byte-sequence -> character mapping).
_extended_encodings = {
    "x_mac_japanese_ttx": (
        "shift_jis",
        {
            b"\xFC": chr(0x007C),
            b"\x7E": chr(0x007E),
            b"\x80": chr(0x005C),
            b"\xA0": chr(0x00A0),
            b"\xFD": chr(0x00A9),
            b"\xFE": chr(0x2122),
            b"\xFF": chr(0x2026),
        },
    ),
    "x_mac_trad_chinese_ttx": (
        "big5",
        {
            b"\x80": chr(0x005C),
            b"\xA0": chr(0x00A0),
            b"\xFD": chr(0x00A9),
            b"\xFE": chr(0x2122),
            b"\xFF": chr(0x2026),
        },
    ),
    "x_mac_korean_ttx": (
        "euc_kr",
        {
            b"\x80": chr(0x00A0),
            b"\x81": chr(0x20A9),
            b"\x82": chr(0x2014),
            b"\x83": chr(0x00A9),
            b"\xFE": chr(0x2122),
            b"\xFF": chr(0x2026),
        },
    ),
    "x_mac_simp_chinese_ttx": (
        "gb2312",
        {
            b"\x80": chr(0x00FC),
            b"\xA0": chr(0x00A0),
            b"\xFD": chr(0x00A9),
            b"\xFE": chr(0x2122),
            b"\xFF": chr(0x2026),
        },
    ),
}

# Codec name -> ExtendCodec instance, filled lazily by search_function().
_cache = {}


def search_function(name):
    """Codec search function for the ``*_ttx`` extended encodings.

    Args:
        name: Encoding name as passed in by the ``codecs`` machinery.

    Returns:
        The ``codecs.CodecInfo`` of the matching extended codec, or ``None``
        when ``name`` is not one of the extended encodings or none of its
        candidate base encodings can be looked up.
    """
    name = encodings.normalize_encoding(name)  # Rather undocumented...
    if name not in _extended_encodings:
        return None

    if name not in _cache:
        fallback_encoding, mapping = _extended_encodings[name]
        assert name[-4:] == "_ttx"
        # Python 2 didn't have any of the encodings that we are implementing
        # in this file.  Python 3 added aliases for the East Asian ones, mapping
        # them "temporarily" to the same base encoding as us, with a comment
        # suggesting that full implementation will appear some time later.
        # As such, try the Python version of the x_mac_... first, if that is found,
        # use *that* as our base encoding.  This would make our encoding upgrade
        # to the full encoding when and if Python finally implements that.
        # http://bugs.python.org/issue24041
        for base_encoding in (name[:-4], fallback_encoding):
            try:
                codecs.lookup(base_encoding)
            except LookupError:
                continue
            _cache[name] = ExtendCodec(name, base_encoding, mapping)
            break
        else:
            # No usable base encoding: report "not found" instead of failing
            # with a KeyError on the cache lookup below.
            return None

    return _cache[name].info


codecs.register(search_function)