"""Tokenization help for Python programs.

tokenize(readline) is a generator that breaks a stream of bytes into
Python tokens. It decodes the bytes according to PEP-0263 for
determining source file encoding.

It accepts a readline-like method which is called repeatedly to get the
next line of input (or b"" for EOF). It generates 5-tuples with these
members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators. Additionally, all token lists start with an ENCODING token
which tells you which encoding was used to decode the bytes stream.
"""

# NOTE(review): the address below reads '[email protected]', which looks like
# an e-mail redaction placeholder left by the source listing -- restore the
# real value from upstream before shipping.
__author__ = 'Ka-Ping Yee <[email protected]>'
__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
               'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
               'Michael Foord')
from builtins import open as _builtin_open
from codecs import lookup, BOM_UTF8
import collections
import functools
from io import TextIOWrapper
import itertools as _itertools
import re
import sys
from token import *
from token import EXACT_TOKEN_TYPES

# Matches a comment line carrying a "coding[:=]name" declaration; group 1 is
# the declared encoding name.
cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
# Matches (as bytes) a line that is empty, whitespace-only or comment-only.
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)

# The public API is everything token exports plus this module's entry points.
# `token` is imported by name only to read its __all__, then dropped again.
import token
__all__ = token.__all__ + ["tokenize", "generate_tokens", "detect_encoding",
                           "untokenize", "TokenInfo"]
del token

class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
    """A 5-field named tuple: (type, string, start, end, line)."""

    def __repr__(self):
        """Return the tuple repr with `type` shown as 'number (NAME)'."""
        annotated_type = '%d (%s)' % (self.type, tok_name[self.type])
        # _replace() yields a 5-tuple, which feeds the five % placeholders.
        return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' %
                self._replace(type=annotated_type))

    @property
    def exact_type(self):
        """Return the specific token type for OP tokens.

        When `type` is OP and `string` is a key of EXACT_TOKEN_TYPES, the
        mapped value is returned; otherwise `type` is returned unchanged.
        """
        if self.type == OP and self.string in EXACT_TOKEN_TYPES:
            return EXACT_TOKEN_TYPES[self.string]
        else:
            return self.type

# Small builders for regular-expression source text.
# group('a', 'b') -> '(a|b)'; any(...) appends '*'; maybe(...) appends '?'.
# NOTE: `any` shadows the builtin of the same name inside this module; the
# name is kept because the pattern definitions that follow call it.
def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'

# Note: we use unicode matching for names ("\w") but ascii matching for
# number literals.
# Building blocks for the token patterns.  Everything here is regular
# expression *source text*; nothing is compiled at this point.
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
# Leading whitespace, any number of backslash-newline continuations (each
# followed by more whitespace), then an optional comment.
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'\w+'

# Integer literals.  '_' is accepted only between digits (or right after the
# base prefix), never doubled and never trailing.
Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+'
Binnumber = r'0[bB](?:_?[01])+'
Octnumber = r'0[oO](?:_?[0-7])+'
Decnumber = r'(?:0(?:_?0)*|[1-9](?:_?[0-9])*)'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
# Float and imaginary literals.
Exponent = r'[eE][-+]?[0-9](?:_?[0-9])*'
Pointfloat = group(r'[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?',
                   r'\.[0-9](?:_?[0-9])*') + maybe(Exponent)
Expfloat = r'[0-9](?:_?[0-9])*' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'[0-9](?:_?[0-9])*[jJ]', Floatnumber + r'[jJ]')
# Alternatives are ordered imaginary, float, int.
Number = group(Imagnumber, Floatnumber, Intnumber)

# Return the empty string, plus all of the valid string prefixes.
def _all_string_prefixes():
    """Return a set holding '' and every spelling of each string prefix.

    For each base prefix, every ordering of its letters is combined with
    every upper/lower-case choice per letter (so 'br' yields 'br', 'bR',
    'Br', 'BR', 'rb', 'rB', 'Rb' and 'RB').
    """
    # The valid string prefixes. Only contain the lower case versions,
    # and don't contain any permutations (include 'fr', but not
    # 'rf'). The various permutations will be generated.
    _valid_string_prefixes = ['b', 'r', 'u', 'f', 'br', 'fr']
    # if we add binary f-strings, add: ['fb', 'fbr']
    result = {''}
    for prefix in _valid_string_prefixes:
        for t in _itertools.permutations(prefix):
            # create a list with upper and lower versions of each
            # character
            for u in _itertools.product(*[(c, c.upper()) for c in t]):
                result.add(''.join(u))
    return result

@functools.lru_cache
def _compile(expr):
    """Compile `expr` with re.UNICODE, memoising the result per pattern."""
    return re.compile(expr, re.UNICODE)

# Note that since _all_string_prefixes includes the empty string,
# StringPrefix can be the empty string (making it optional).
StringPrefix = group(*_all_string_prefixes())

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
# Opening of a triple-quoted string: optional prefix plus ''' or """.
Triple = group(StringPrefix + "'''", StringPrefix + '"""')
# Single-line ' or " string.
String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Sorting in reverse order puts the long operators before their prefixes.
# Otherwise if = came before ==, == would get recognized as two instances
# of =.
Special = group(*map(re.escape, sorted(EXACT_TOKEN_TYPES, reverse=True)))
# A line ending (with optional carriage return) or any operator in Special.
Funny = group(r'\r?\n', Special)

# One token with no leading filler, and the same preceded by Ignore
# (whitespace, line continuations and an optional comment).
PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
# Each alternative is an optional prefix, an opening quote and the string
# body, ended either by the matching quote or by a backslash-newline (the
# string carries on onto the next line).
ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
# Things that are not tokens by themselves: a backslash continuation, the
# end of the input (\Z), a comment, or the opening of a triple-quoted string.
PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
# Leading whitespace sits outside the group, so group 1 spans only the
# pseudo-token itself.
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

# For a given string prefix plus quotes, endpats maps it to a regex
# to match the remainder of that string. _prefix can be empty, for
# a normal single or triple quoted string (with no prefix).
endpats = {}
for _prefix in _all_string_prefixes():
    endpats[_prefix + "'"] = Single
    endpats[_prefix + '"'] = Double
    endpats[_prefix + "'''"] = Single3
    endpats[_prefix + '"""'] = Double3
# Remove the loop variable so it does not linger in the module namespace.
del _prefix

# A set of all of the single and triple quoted string prefixes,
# including the opening quotes.
150*cda5da8dSAndroid Build Coastguard Workersingle_quoted = set() 151*cda5da8dSAndroid Build Coastguard Workertriple_quoted = set() 152*cda5da8dSAndroid Build Coastguard Workerfor t in _all_string_prefixes(): 153*cda5da8dSAndroid Build Coastguard Worker for u in (t + '"', t + "'"): 154*cda5da8dSAndroid Build Coastguard Worker single_quoted.add(u) 155*cda5da8dSAndroid Build Coastguard Worker for u in (t + '"""', t + "'''"): 156*cda5da8dSAndroid Build Coastguard Worker triple_quoted.add(u) 157*cda5da8dSAndroid Build Coastguard Workerdel t, u 158*cda5da8dSAndroid Build Coastguard Worker 159*cda5da8dSAndroid Build Coastguard Workertabsize = 8 160*cda5da8dSAndroid Build Coastguard Worker 161*cda5da8dSAndroid Build Coastguard Workerclass TokenError(Exception): pass 162*cda5da8dSAndroid Build Coastguard Worker 163*cda5da8dSAndroid Build Coastguard Workerclass StopTokenizing(Exception): pass 164*cda5da8dSAndroid Build Coastguard Worker 165*cda5da8dSAndroid Build Coastguard Worker 166*cda5da8dSAndroid Build Coastguard Workerclass Untokenizer: 167*cda5da8dSAndroid Build Coastguard Worker 168*cda5da8dSAndroid Build Coastguard Worker def __init__(self): 169*cda5da8dSAndroid Build Coastguard Worker self.tokens = [] 170*cda5da8dSAndroid Build Coastguard Worker self.prev_row = 1 171*cda5da8dSAndroid Build Coastguard Worker self.prev_col = 0 172*cda5da8dSAndroid Build Coastguard Worker self.encoding = None 173*cda5da8dSAndroid Build Coastguard Worker 174*cda5da8dSAndroid Build Coastguard Worker def add_whitespace(self, start): 175*cda5da8dSAndroid Build Coastguard Worker row, col = start 176*cda5da8dSAndroid Build Coastguard Worker if row < self.prev_row or row == self.prev_row and col < self.prev_col: 177*cda5da8dSAndroid Build Coastguard Worker raise ValueError("start ({},{}) precedes previous end ({},{})" 178*cda5da8dSAndroid Build Coastguard Worker .format(row, col, self.prev_row, self.prev_col)) 179*cda5da8dSAndroid Build Coastguard Worker row_offset = row - self.prev_row 
180*cda5da8dSAndroid Build Coastguard Worker if row_offset: 181*cda5da8dSAndroid Build Coastguard Worker self.tokens.append("\\\n" * row_offset) 182*cda5da8dSAndroid Build Coastguard Worker self.prev_col = 0 183*cda5da8dSAndroid Build Coastguard Worker col_offset = col - self.prev_col 184*cda5da8dSAndroid Build Coastguard Worker if col_offset: 185*cda5da8dSAndroid Build Coastguard Worker self.tokens.append(" " * col_offset) 186*cda5da8dSAndroid Build Coastguard Worker 187*cda5da8dSAndroid Build Coastguard Worker def untokenize(self, iterable): 188*cda5da8dSAndroid Build Coastguard Worker it = iter(iterable) 189*cda5da8dSAndroid Build Coastguard Worker indents = [] 190*cda5da8dSAndroid Build Coastguard Worker startline = False 191*cda5da8dSAndroid Build Coastguard Worker for t in it: 192*cda5da8dSAndroid Build Coastguard Worker if len(t) == 2: 193*cda5da8dSAndroid Build Coastguard Worker self.compat(t, it) 194*cda5da8dSAndroid Build Coastguard Worker break 195*cda5da8dSAndroid Build Coastguard Worker tok_type, token, start, end, line = t 196*cda5da8dSAndroid Build Coastguard Worker if tok_type == ENCODING: 197*cda5da8dSAndroid Build Coastguard Worker self.encoding = token 198*cda5da8dSAndroid Build Coastguard Worker continue 199*cda5da8dSAndroid Build Coastguard Worker if tok_type == ENDMARKER: 200*cda5da8dSAndroid Build Coastguard Worker break 201*cda5da8dSAndroid Build Coastguard Worker if tok_type == INDENT: 202*cda5da8dSAndroid Build Coastguard Worker indents.append(token) 203*cda5da8dSAndroid Build Coastguard Worker continue 204*cda5da8dSAndroid Build Coastguard Worker elif tok_type == DEDENT: 205*cda5da8dSAndroid Build Coastguard Worker indents.pop() 206*cda5da8dSAndroid Build Coastguard Worker self.prev_row, self.prev_col = end 207*cda5da8dSAndroid Build Coastguard Worker continue 208*cda5da8dSAndroid Build Coastguard Worker elif tok_type in (NEWLINE, NL): 209*cda5da8dSAndroid Build Coastguard Worker startline = True 210*cda5da8dSAndroid Build Coastguard 
Worker elif startline and indents: 211*cda5da8dSAndroid Build Coastguard Worker indent = indents[-1] 212*cda5da8dSAndroid Build Coastguard Worker if start[1] >= len(indent): 213*cda5da8dSAndroid Build Coastguard Worker self.tokens.append(indent) 214*cda5da8dSAndroid Build Coastguard Worker self.prev_col = len(indent) 215*cda5da8dSAndroid Build Coastguard Worker startline = False 216*cda5da8dSAndroid Build Coastguard Worker self.add_whitespace(start) 217*cda5da8dSAndroid Build Coastguard Worker self.tokens.append(token) 218*cda5da8dSAndroid Build Coastguard Worker self.prev_row, self.prev_col = end 219*cda5da8dSAndroid Build Coastguard Worker if tok_type in (NEWLINE, NL): 220*cda5da8dSAndroid Build Coastguard Worker self.prev_row += 1 221*cda5da8dSAndroid Build Coastguard Worker self.prev_col = 0 222*cda5da8dSAndroid Build Coastguard Worker return "".join(self.tokens) 223*cda5da8dSAndroid Build Coastguard Worker 224*cda5da8dSAndroid Build Coastguard Worker def compat(self, token, iterable): 225*cda5da8dSAndroid Build Coastguard Worker indents = [] 226*cda5da8dSAndroid Build Coastguard Worker toks_append = self.tokens.append 227*cda5da8dSAndroid Build Coastguard Worker startline = token[0] in (NEWLINE, NL) 228*cda5da8dSAndroid Build Coastguard Worker prevstring = False 229*cda5da8dSAndroid Build Coastguard Worker 230*cda5da8dSAndroid Build Coastguard Worker for tok in _itertools.chain([token], iterable): 231*cda5da8dSAndroid Build Coastguard Worker toknum, tokval = tok[:2] 232*cda5da8dSAndroid Build Coastguard Worker if toknum == ENCODING: 233*cda5da8dSAndroid Build Coastguard Worker self.encoding = tokval 234*cda5da8dSAndroid Build Coastguard Worker continue 235*cda5da8dSAndroid Build Coastguard Worker 236*cda5da8dSAndroid Build Coastguard Worker if toknum in (NAME, NUMBER): 237*cda5da8dSAndroid Build Coastguard Worker tokval += ' ' 238*cda5da8dSAndroid Build Coastguard Worker 239*cda5da8dSAndroid Build Coastguard Worker # Insert a space between two consecutive 
strings 240*cda5da8dSAndroid Build Coastguard Worker if toknum == STRING: 241*cda5da8dSAndroid Build Coastguard Worker if prevstring: 242*cda5da8dSAndroid Build Coastguard Worker tokval = ' ' + tokval 243*cda5da8dSAndroid Build Coastguard Worker prevstring = True 244*cda5da8dSAndroid Build Coastguard Worker else: 245*cda5da8dSAndroid Build Coastguard Worker prevstring = False 246*cda5da8dSAndroid Build Coastguard Worker 247*cda5da8dSAndroid Build Coastguard Worker if toknum == INDENT: 248*cda5da8dSAndroid Build Coastguard Worker indents.append(tokval) 249*cda5da8dSAndroid Build Coastguard Worker continue 250*cda5da8dSAndroid Build Coastguard Worker elif toknum == DEDENT: 251*cda5da8dSAndroid Build Coastguard Worker indents.pop() 252*cda5da8dSAndroid Build Coastguard Worker continue 253*cda5da8dSAndroid Build Coastguard Worker elif toknum in (NEWLINE, NL): 254*cda5da8dSAndroid Build Coastguard Worker startline = True 255*cda5da8dSAndroid Build Coastguard Worker elif startline and indents: 256*cda5da8dSAndroid Build Coastguard Worker toks_append(indents[-1]) 257*cda5da8dSAndroid Build Coastguard Worker startline = False 258*cda5da8dSAndroid Build Coastguard Worker toks_append(tokval) 259*cda5da8dSAndroid Build Coastguard Worker 260*cda5da8dSAndroid Build Coastguard Worker 261*cda5da8dSAndroid Build Coastguard Workerdef untokenize(iterable): 262*cda5da8dSAndroid Build Coastguard Worker """Transform tokens back into Python source code. 263*cda5da8dSAndroid Build Coastguard Worker It returns a bytes object, encoded using the ENCODING 264*cda5da8dSAndroid Build Coastguard Worker token, which is the first token sequence output by tokenize. 265*cda5da8dSAndroid Build Coastguard Worker 266*cda5da8dSAndroid Build Coastguard Worker Each element returned by the iterable must be a token sequence 267*cda5da8dSAndroid Build Coastguard Worker with at least two elements, a token number and token value. 
If 268*cda5da8dSAndroid Build Coastguard Worker only two tokens are passed, the resulting output is poor. 269*cda5da8dSAndroid Build Coastguard Worker 270*cda5da8dSAndroid Build Coastguard Worker Round-trip invariant for full input: 271*cda5da8dSAndroid Build Coastguard Worker Untokenized source will match input source exactly 272*cda5da8dSAndroid Build Coastguard Worker 273*cda5da8dSAndroid Build Coastguard Worker Round-trip invariant for limited input: 274*cda5da8dSAndroid Build Coastguard Worker # Output bytes will tokenize back to the input 275*cda5da8dSAndroid Build Coastguard Worker t1 = [tok[:2] for tok in tokenize(f.readline)] 276*cda5da8dSAndroid Build Coastguard Worker newcode = untokenize(t1) 277*cda5da8dSAndroid Build Coastguard Worker readline = BytesIO(newcode).readline 278*cda5da8dSAndroid Build Coastguard Worker t2 = [tok[:2] for tok in tokenize(readline)] 279*cda5da8dSAndroid Build Coastguard Worker assert t1 == t2 280*cda5da8dSAndroid Build Coastguard Worker """ 281*cda5da8dSAndroid Build Coastguard Worker ut = Untokenizer() 282*cda5da8dSAndroid Build Coastguard Worker out = ut.untokenize(iterable) 283*cda5da8dSAndroid Build Coastguard Worker if ut.encoding is not None: 284*cda5da8dSAndroid Build Coastguard Worker out = out.encode(ut.encoding) 285*cda5da8dSAndroid Build Coastguard Worker return out 286*cda5da8dSAndroid Build Coastguard Worker 287*cda5da8dSAndroid Build Coastguard Worker 288*cda5da8dSAndroid Build Coastguard Workerdef _get_normal_name(orig_enc): 289*cda5da8dSAndroid Build Coastguard Worker """Imitates get_normal_name in tokenizer.c.""" 290*cda5da8dSAndroid Build Coastguard Worker # Only care about the first 12 characters. 
291*cda5da8dSAndroid Build Coastguard Worker enc = orig_enc[:12].lower().replace("_", "-") 292*cda5da8dSAndroid Build Coastguard Worker if enc == "utf-8" or enc.startswith("utf-8-"): 293*cda5da8dSAndroid Build Coastguard Worker return "utf-8" 294*cda5da8dSAndroid Build Coastguard Worker if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \ 295*cda5da8dSAndroid Build Coastguard Worker enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")): 296*cda5da8dSAndroid Build Coastguard Worker return "iso-8859-1" 297*cda5da8dSAndroid Build Coastguard Worker return orig_enc 298*cda5da8dSAndroid Build Coastguard Worker 299*cda5da8dSAndroid Build Coastguard Workerdef detect_encoding(readline): 300*cda5da8dSAndroid Build Coastguard Worker """ 301*cda5da8dSAndroid Build Coastguard Worker The detect_encoding() function is used to detect the encoding that should 302*cda5da8dSAndroid Build Coastguard Worker be used to decode a Python source file. It requires one argument, readline, 303*cda5da8dSAndroid Build Coastguard Worker in the same way as the tokenize() generator. 304*cda5da8dSAndroid Build Coastguard Worker 305*cda5da8dSAndroid Build Coastguard Worker It will call readline a maximum of twice, and return the encoding used 306*cda5da8dSAndroid Build Coastguard Worker (as a string) and a list of any lines (left as bytes) it has read in. 307*cda5da8dSAndroid Build Coastguard Worker 308*cda5da8dSAndroid Build Coastguard Worker It detects the encoding from the presence of a utf-8 bom or an encoding 309*cda5da8dSAndroid Build Coastguard Worker cookie as specified in pep-0263. If both a bom and a cookie are present, 310*cda5da8dSAndroid Build Coastguard Worker but disagree, a SyntaxError will be raised. If the encoding cookie is an 311*cda5da8dSAndroid Build Coastguard Worker invalid charset, raise a SyntaxError. Note that if a utf-8 bom is found, 312*cda5da8dSAndroid Build Coastguard Worker 'utf-8-sig' is returned. 
313*cda5da8dSAndroid Build Coastguard Worker 314*cda5da8dSAndroid Build Coastguard Worker If no encoding is specified, then the default of 'utf-8' will be returned. 315*cda5da8dSAndroid Build Coastguard Worker """ 316*cda5da8dSAndroid Build Coastguard Worker try: 317*cda5da8dSAndroid Build Coastguard Worker filename = readline.__self__.name 318*cda5da8dSAndroid Build Coastguard Worker except AttributeError: 319*cda5da8dSAndroid Build Coastguard Worker filename = None 320*cda5da8dSAndroid Build Coastguard Worker bom_found = False 321*cda5da8dSAndroid Build Coastguard Worker encoding = None 322*cda5da8dSAndroid Build Coastguard Worker default = 'utf-8' 323*cda5da8dSAndroid Build Coastguard Worker def read_or_stop(): 324*cda5da8dSAndroid Build Coastguard Worker try: 325*cda5da8dSAndroid Build Coastguard Worker return readline() 326*cda5da8dSAndroid Build Coastguard Worker except StopIteration: 327*cda5da8dSAndroid Build Coastguard Worker return b'' 328*cda5da8dSAndroid Build Coastguard Worker 329*cda5da8dSAndroid Build Coastguard Worker def find_cookie(line): 330*cda5da8dSAndroid Build Coastguard Worker try: 331*cda5da8dSAndroid Build Coastguard Worker # Decode as UTF-8. Either the line is an encoding declaration, 332*cda5da8dSAndroid Build Coastguard Worker # in which case it should be pure ASCII, or it must be UTF-8 333*cda5da8dSAndroid Build Coastguard Worker # per default encoding. 
334*cda5da8dSAndroid Build Coastguard Worker line_string = line.decode('utf-8') 335*cda5da8dSAndroid Build Coastguard Worker except UnicodeDecodeError: 336*cda5da8dSAndroid Build Coastguard Worker msg = "invalid or missing encoding declaration" 337*cda5da8dSAndroid Build Coastguard Worker if filename is not None: 338*cda5da8dSAndroid Build Coastguard Worker msg = '{} for {!r}'.format(msg, filename) 339*cda5da8dSAndroid Build Coastguard Worker raise SyntaxError(msg) 340*cda5da8dSAndroid Build Coastguard Worker 341*cda5da8dSAndroid Build Coastguard Worker match = cookie_re.match(line_string) 342*cda5da8dSAndroid Build Coastguard Worker if not match: 343*cda5da8dSAndroid Build Coastguard Worker return None 344*cda5da8dSAndroid Build Coastguard Worker encoding = _get_normal_name(match.group(1)) 345*cda5da8dSAndroid Build Coastguard Worker try: 346*cda5da8dSAndroid Build Coastguard Worker codec = lookup(encoding) 347*cda5da8dSAndroid Build Coastguard Worker except LookupError: 348*cda5da8dSAndroid Build Coastguard Worker # This behaviour mimics the Python interpreter 349*cda5da8dSAndroid Build Coastguard Worker if filename is None: 350*cda5da8dSAndroid Build Coastguard Worker msg = "unknown encoding: " + encoding 351*cda5da8dSAndroid Build Coastguard Worker else: 352*cda5da8dSAndroid Build Coastguard Worker msg = "unknown encoding for {!r}: {}".format(filename, 353*cda5da8dSAndroid Build Coastguard Worker encoding) 354*cda5da8dSAndroid Build Coastguard Worker raise SyntaxError(msg) 355*cda5da8dSAndroid Build Coastguard Worker 356*cda5da8dSAndroid Build Coastguard Worker if bom_found: 357*cda5da8dSAndroid Build Coastguard Worker if encoding != 'utf-8': 358*cda5da8dSAndroid Build Coastguard Worker # This behaviour mimics the Python interpreter 359*cda5da8dSAndroid Build Coastguard Worker if filename is None: 360*cda5da8dSAndroid Build Coastguard Worker msg = 'encoding problem: utf-8' 361*cda5da8dSAndroid Build Coastguard Worker else: 362*cda5da8dSAndroid Build Coastguard 
Worker msg = 'encoding problem for {!r}: utf-8'.format(filename) 363*cda5da8dSAndroid Build Coastguard Worker raise SyntaxError(msg) 364*cda5da8dSAndroid Build Coastguard Worker encoding += '-sig' 365*cda5da8dSAndroid Build Coastguard Worker return encoding 366*cda5da8dSAndroid Build Coastguard Worker 367*cda5da8dSAndroid Build Coastguard Worker first = read_or_stop() 368*cda5da8dSAndroid Build Coastguard Worker if first.startswith(BOM_UTF8): 369*cda5da8dSAndroid Build Coastguard Worker bom_found = True 370*cda5da8dSAndroid Build Coastguard Worker first = first[3:] 371*cda5da8dSAndroid Build Coastguard Worker default = 'utf-8-sig' 372*cda5da8dSAndroid Build Coastguard Worker if not first: 373*cda5da8dSAndroid Build Coastguard Worker return default, [] 374*cda5da8dSAndroid Build Coastguard Worker 375*cda5da8dSAndroid Build Coastguard Worker encoding = find_cookie(first) 376*cda5da8dSAndroid Build Coastguard Worker if encoding: 377*cda5da8dSAndroid Build Coastguard Worker return encoding, [first] 378*cda5da8dSAndroid Build Coastguard Worker if not blank_re.match(first): 379*cda5da8dSAndroid Build Coastguard Worker return default, [first] 380*cda5da8dSAndroid Build Coastguard Worker 381*cda5da8dSAndroid Build Coastguard Worker second = read_or_stop() 382*cda5da8dSAndroid Build Coastguard Worker if not second: 383*cda5da8dSAndroid Build Coastguard Worker return default, [first] 384*cda5da8dSAndroid Build Coastguard Worker 385*cda5da8dSAndroid Build Coastguard Worker encoding = find_cookie(second) 386*cda5da8dSAndroid Build Coastguard Worker if encoding: 387*cda5da8dSAndroid Build Coastguard Worker return encoding, [first, second] 388*cda5da8dSAndroid Build Coastguard Worker 389*cda5da8dSAndroid Build Coastguard Worker return default, [first, second] 390*cda5da8dSAndroid Build Coastguard Worker 391*cda5da8dSAndroid Build Coastguard Worker 392*cda5da8dSAndroid Build Coastguard Workerdef open(filename): 393*cda5da8dSAndroid Build Coastguard Worker """Open a file in read 
only mode using the encoding detected by 394*cda5da8dSAndroid Build Coastguard Worker detect_encoding(). 395*cda5da8dSAndroid Build Coastguard Worker """ 396*cda5da8dSAndroid Build Coastguard Worker buffer = _builtin_open(filename, 'rb') 397*cda5da8dSAndroid Build Coastguard Worker try: 398*cda5da8dSAndroid Build Coastguard Worker encoding, lines = detect_encoding(buffer.readline) 399*cda5da8dSAndroid Build Coastguard Worker buffer.seek(0) 400*cda5da8dSAndroid Build Coastguard Worker text = TextIOWrapper(buffer, encoding, line_buffering=True) 401*cda5da8dSAndroid Build Coastguard Worker text.mode = 'r' 402*cda5da8dSAndroid Build Coastguard Worker return text 403*cda5da8dSAndroid Build Coastguard Worker except: 404*cda5da8dSAndroid Build Coastguard Worker buffer.close() 405*cda5da8dSAndroid Build Coastguard Worker raise 406*cda5da8dSAndroid Build Coastguard Worker 407*cda5da8dSAndroid Build Coastguard Worker 408*cda5da8dSAndroid Build Coastguard Workerdef tokenize(readline): 409*cda5da8dSAndroid Build Coastguard Worker """ 410*cda5da8dSAndroid Build Coastguard Worker The tokenize() generator requires one argument, readline, which 411*cda5da8dSAndroid Build Coastguard Worker must be a callable object which provides the same interface as the 412*cda5da8dSAndroid Build Coastguard Worker readline() method of built-in file objects. Each call to the function 413*cda5da8dSAndroid Build Coastguard Worker should return one line of input as bytes. 
Alternatively, readline 414*cda5da8dSAndroid Build Coastguard Worker can be a callable function terminating with StopIteration: 415*cda5da8dSAndroid Build Coastguard Worker readline = open(myfile, 'rb').__next__ # Example of alternate readline 416*cda5da8dSAndroid Build Coastguard Worker 417*cda5da8dSAndroid Build Coastguard Worker The generator produces 5-tuples with these members: the token type; the 418*cda5da8dSAndroid Build Coastguard Worker token string; a 2-tuple (srow, scol) of ints specifying the row and 419*cda5da8dSAndroid Build Coastguard Worker column where the token begins in the source; a 2-tuple (erow, ecol) of 420*cda5da8dSAndroid Build Coastguard Worker ints specifying the row and column where the token ends in the source; 421*cda5da8dSAndroid Build Coastguard Worker and the line on which the token was found. The line passed is the 422*cda5da8dSAndroid Build Coastguard Worker physical line. 423*cda5da8dSAndroid Build Coastguard Worker 424*cda5da8dSAndroid Build Coastguard Worker The first token sequence will always be an ENCODING token 425*cda5da8dSAndroid Build Coastguard Worker which tells you which encoding was used to decode the bytes stream. 
426*cda5da8dSAndroid Build Coastguard Worker """ 427*cda5da8dSAndroid Build Coastguard Worker encoding, consumed = detect_encoding(readline) 428*cda5da8dSAndroid Build Coastguard Worker empty = _itertools.repeat(b"") 429*cda5da8dSAndroid Build Coastguard Worker rl_gen = _itertools.chain(consumed, iter(readline, b""), empty) 430*cda5da8dSAndroid Build Coastguard Worker return _tokenize(rl_gen.__next__, encoding) 431*cda5da8dSAndroid Build Coastguard Worker 432*cda5da8dSAndroid Build Coastguard Worker 433*cda5da8dSAndroid Build Coastguard Workerdef _tokenize(readline, encoding): 434*cda5da8dSAndroid Build Coastguard Worker lnum = parenlev = continued = 0 435*cda5da8dSAndroid Build Coastguard Worker numchars = '0123456789' 436*cda5da8dSAndroid Build Coastguard Worker contstr, needcont = '', 0 437*cda5da8dSAndroid Build Coastguard Worker contline = None 438*cda5da8dSAndroid Build Coastguard Worker indents = [0] 439*cda5da8dSAndroid Build Coastguard Worker 440*cda5da8dSAndroid Build Coastguard Worker if encoding is not None: 441*cda5da8dSAndroid Build Coastguard Worker if encoding == "utf-8-sig": 442*cda5da8dSAndroid Build Coastguard Worker # BOM will already have been stripped. 443*cda5da8dSAndroid Build Coastguard Worker encoding = "utf-8" 444*cda5da8dSAndroid Build Coastguard Worker yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '') 445*cda5da8dSAndroid Build Coastguard Worker last_line = b'' 446*cda5da8dSAndroid Build Coastguard Worker line = b'' 447*cda5da8dSAndroid Build Coastguard Worker while True: # loop over lines in stream 448*cda5da8dSAndroid Build Coastguard Worker try: 449*cda5da8dSAndroid Build Coastguard Worker # We capture the value of the line variable here because 450*cda5da8dSAndroid Build Coastguard Worker # readline uses the empty string '' to signal end of input, 451*cda5da8dSAndroid Build Coastguard Worker # hence `line` itself will always be overwritten at the end 452*cda5da8dSAndroid Build Coastguard Worker # of this loop. 
453*cda5da8dSAndroid Build Coastguard Worker last_line = line 454*cda5da8dSAndroid Build Coastguard Worker line = readline() 455*cda5da8dSAndroid Build Coastguard Worker except StopIteration: 456*cda5da8dSAndroid Build Coastguard Worker line = b'' 457*cda5da8dSAndroid Build Coastguard Worker 458*cda5da8dSAndroid Build Coastguard Worker if encoding is not None: 459*cda5da8dSAndroid Build Coastguard Worker line = line.decode(encoding) 460*cda5da8dSAndroid Build Coastguard Worker lnum += 1 461*cda5da8dSAndroid Build Coastguard Worker pos, max = 0, len(line) 462*cda5da8dSAndroid Build Coastguard Worker 463*cda5da8dSAndroid Build Coastguard Worker if contstr: # continued string 464*cda5da8dSAndroid Build Coastguard Worker if not line: 465*cda5da8dSAndroid Build Coastguard Worker raise TokenError("EOF in multi-line string", strstart) 466*cda5da8dSAndroid Build Coastguard Worker endmatch = endprog.match(line) 467*cda5da8dSAndroid Build Coastguard Worker if endmatch: 468*cda5da8dSAndroid Build Coastguard Worker pos = end = endmatch.end(0) 469*cda5da8dSAndroid Build Coastguard Worker yield TokenInfo(STRING, contstr + line[:end], 470*cda5da8dSAndroid Build Coastguard Worker strstart, (lnum, end), contline + line) 471*cda5da8dSAndroid Build Coastguard Worker contstr, needcont = '', 0 472*cda5da8dSAndroid Build Coastguard Worker contline = None 473*cda5da8dSAndroid Build Coastguard Worker elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n': 474*cda5da8dSAndroid Build Coastguard Worker yield TokenInfo(ERRORTOKEN, contstr + line, 475*cda5da8dSAndroid Build Coastguard Worker strstart, (lnum, len(line)), contline) 476*cda5da8dSAndroid Build Coastguard Worker contstr = '' 477*cda5da8dSAndroid Build Coastguard Worker contline = None 478*cda5da8dSAndroid Build Coastguard Worker continue 479*cda5da8dSAndroid Build Coastguard Worker else: 480*cda5da8dSAndroid Build Coastguard Worker contstr = contstr + line 481*cda5da8dSAndroid Build Coastguard Worker contline = contline + 
line 482*cda5da8dSAndroid Build Coastguard Worker continue 483*cda5da8dSAndroid Build Coastguard Worker 484*cda5da8dSAndroid Build Coastguard Worker elif parenlev == 0 and not continued: # new statement 485*cda5da8dSAndroid Build Coastguard Worker if not line: break 486*cda5da8dSAndroid Build Coastguard Worker column = 0 487*cda5da8dSAndroid Build Coastguard Worker while pos < max: # measure leading whitespace 488*cda5da8dSAndroid Build Coastguard Worker if line[pos] == ' ': 489*cda5da8dSAndroid Build Coastguard Worker column += 1 490*cda5da8dSAndroid Build Coastguard Worker elif line[pos] == '\t': 491*cda5da8dSAndroid Build Coastguard Worker column = (column//tabsize + 1)*tabsize 492*cda5da8dSAndroid Build Coastguard Worker elif line[pos] == '\f': 493*cda5da8dSAndroid Build Coastguard Worker column = 0 494*cda5da8dSAndroid Build Coastguard Worker else: 495*cda5da8dSAndroid Build Coastguard Worker break 496*cda5da8dSAndroid Build Coastguard Worker pos += 1 497*cda5da8dSAndroid Build Coastguard Worker if pos == max: 498*cda5da8dSAndroid Build Coastguard Worker break 499*cda5da8dSAndroid Build Coastguard Worker 500*cda5da8dSAndroid Build Coastguard Worker if line[pos] in '#\r\n': # skip comments or blank lines 501*cda5da8dSAndroid Build Coastguard Worker if line[pos] == '#': 502*cda5da8dSAndroid Build Coastguard Worker comment_token = line[pos:].rstrip('\r\n') 503*cda5da8dSAndroid Build Coastguard Worker yield TokenInfo(COMMENT, comment_token, 504*cda5da8dSAndroid Build Coastguard Worker (lnum, pos), (lnum, pos + len(comment_token)), line) 505*cda5da8dSAndroid Build Coastguard Worker pos += len(comment_token) 506*cda5da8dSAndroid Build Coastguard Worker 507*cda5da8dSAndroid Build Coastguard Worker yield TokenInfo(NL, line[pos:], 508*cda5da8dSAndroid Build Coastguard Worker (lnum, pos), (lnum, len(line)), line) 509*cda5da8dSAndroid Build Coastguard Worker continue 510*cda5da8dSAndroid Build Coastguard Worker 511*cda5da8dSAndroid Build Coastguard Worker if column > 
indents[-1]: # count indents or dedents 512*cda5da8dSAndroid Build Coastguard Worker indents.append(column) 513*cda5da8dSAndroid Build Coastguard Worker yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line) 514*cda5da8dSAndroid Build Coastguard Worker while column < indents[-1]: 515*cda5da8dSAndroid Build Coastguard Worker if column not in indents: 516*cda5da8dSAndroid Build Coastguard Worker raise IndentationError( 517*cda5da8dSAndroid Build Coastguard Worker "unindent does not match any outer indentation level", 518*cda5da8dSAndroid Build Coastguard Worker ("<tokenize>", lnum, pos, line)) 519*cda5da8dSAndroid Build Coastguard Worker indents = indents[:-1] 520*cda5da8dSAndroid Build Coastguard Worker 521*cda5da8dSAndroid Build Coastguard Worker yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line) 522*cda5da8dSAndroid Build Coastguard Worker 523*cda5da8dSAndroid Build Coastguard Worker else: # continued statement 524*cda5da8dSAndroid Build Coastguard Worker if not line: 525*cda5da8dSAndroid Build Coastguard Worker raise TokenError("EOF in multi-line statement", (lnum, 0)) 526*cda5da8dSAndroid Build Coastguard Worker continued = 0 527*cda5da8dSAndroid Build Coastguard Worker 528*cda5da8dSAndroid Build Coastguard Worker while pos < max: 529*cda5da8dSAndroid Build Coastguard Worker pseudomatch = _compile(PseudoToken).match(line, pos) 530*cda5da8dSAndroid Build Coastguard Worker if pseudomatch: # scan for tokens 531*cda5da8dSAndroid Build Coastguard Worker start, end = pseudomatch.span(1) 532*cda5da8dSAndroid Build Coastguard Worker spos, epos, pos = (lnum, start), (lnum, end), end 533*cda5da8dSAndroid Build Coastguard Worker if start == end: 534*cda5da8dSAndroid Build Coastguard Worker continue 535*cda5da8dSAndroid Build Coastguard Worker token, initial = line[start:end], line[start] 536*cda5da8dSAndroid Build Coastguard Worker 537*cda5da8dSAndroid Build Coastguard Worker if (initial in numchars or # ordinary number 538*cda5da8dSAndroid Build 
Coastguard Worker (initial == '.' and token != '.' and token != '...')): 539*cda5da8dSAndroid Build Coastguard Worker yield TokenInfo(NUMBER, token, spos, epos, line) 540*cda5da8dSAndroid Build Coastguard Worker elif initial in '\r\n': 541*cda5da8dSAndroid Build Coastguard Worker if parenlev > 0: 542*cda5da8dSAndroid Build Coastguard Worker yield TokenInfo(NL, token, spos, epos, line) 543*cda5da8dSAndroid Build Coastguard Worker else: 544*cda5da8dSAndroid Build Coastguard Worker yield TokenInfo(NEWLINE, token, spos, epos, line) 545*cda5da8dSAndroid Build Coastguard Worker 546*cda5da8dSAndroid Build Coastguard Worker elif initial == '#': 547*cda5da8dSAndroid Build Coastguard Worker assert not token.endswith("\n") 548*cda5da8dSAndroid Build Coastguard Worker yield TokenInfo(COMMENT, token, spos, epos, line) 549*cda5da8dSAndroid Build Coastguard Worker 550*cda5da8dSAndroid Build Coastguard Worker elif token in triple_quoted: 551*cda5da8dSAndroid Build Coastguard Worker endprog = _compile(endpats[token]) 552*cda5da8dSAndroid Build Coastguard Worker endmatch = endprog.match(line, pos) 553*cda5da8dSAndroid Build Coastguard Worker if endmatch: # all on one line 554*cda5da8dSAndroid Build Coastguard Worker pos = endmatch.end(0) 555*cda5da8dSAndroid Build Coastguard Worker token = line[start:pos] 556*cda5da8dSAndroid Build Coastguard Worker yield TokenInfo(STRING, token, spos, (lnum, pos), line) 557*cda5da8dSAndroid Build Coastguard Worker else: 558*cda5da8dSAndroid Build Coastguard Worker strstart = (lnum, start) # multiple lines 559*cda5da8dSAndroid Build Coastguard Worker contstr = line[start:] 560*cda5da8dSAndroid Build Coastguard Worker contline = line 561*cda5da8dSAndroid Build Coastguard Worker break 562*cda5da8dSAndroid Build Coastguard Worker 563*cda5da8dSAndroid Build Coastguard Worker # Check up to the first 3 chars of the token to see if 564*cda5da8dSAndroid Build Coastguard Worker # they're in the single_quoted set. 
If so, they start 565*cda5da8dSAndroid Build Coastguard Worker # a string. 566*cda5da8dSAndroid Build Coastguard Worker # We're using the first 3, because we're looking for 567*cda5da8dSAndroid Build Coastguard Worker # "rb'" (for example) at the start of the token. If 568*cda5da8dSAndroid Build Coastguard Worker # we switch to longer prefixes, this needs to be 569*cda5da8dSAndroid Build Coastguard Worker # adjusted. 570*cda5da8dSAndroid Build Coastguard Worker # Note that initial == token[:1]. 571*cda5da8dSAndroid Build Coastguard Worker # Also note that single quote checking must come after 572*cda5da8dSAndroid Build Coastguard Worker # triple quote checking (above). 573*cda5da8dSAndroid Build Coastguard Worker elif (initial in single_quoted or 574*cda5da8dSAndroid Build Coastguard Worker token[:2] in single_quoted or 575*cda5da8dSAndroid Build Coastguard Worker token[:3] in single_quoted): 576*cda5da8dSAndroid Build Coastguard Worker if token[-1] == '\n': # continued string 577*cda5da8dSAndroid Build Coastguard Worker strstart = (lnum, start) 578*cda5da8dSAndroid Build Coastguard Worker # Again, using the first 3 chars of the 579*cda5da8dSAndroid Build Coastguard Worker # token. This is looking for the matching end 580*cda5da8dSAndroid Build Coastguard Worker # regex for the correct type of quote 581*cda5da8dSAndroid Build Coastguard Worker # character. So it's really looking for 582*cda5da8dSAndroid Build Coastguard Worker # endpats["'"] or endpats['"'], by trying to 583*cda5da8dSAndroid Build Coastguard Worker # skip string prefix characters, if any. 
584*cda5da8dSAndroid Build Coastguard Worker endprog = _compile(endpats.get(initial) or 585*cda5da8dSAndroid Build Coastguard Worker endpats.get(token[1]) or 586*cda5da8dSAndroid Build Coastguard Worker endpats.get(token[2])) 587*cda5da8dSAndroid Build Coastguard Worker contstr, needcont = line[start:], 1 588*cda5da8dSAndroid Build Coastguard Worker contline = line 589*cda5da8dSAndroid Build Coastguard Worker break 590*cda5da8dSAndroid Build Coastguard Worker else: # ordinary string 591*cda5da8dSAndroid Build Coastguard Worker yield TokenInfo(STRING, token, spos, epos, line) 592*cda5da8dSAndroid Build Coastguard Worker 593*cda5da8dSAndroid Build Coastguard Worker elif initial.isidentifier(): # ordinary name 594*cda5da8dSAndroid Build Coastguard Worker yield TokenInfo(NAME, token, spos, epos, line) 595*cda5da8dSAndroid Build Coastguard Worker elif initial == '\\': # continued stmt 596*cda5da8dSAndroid Build Coastguard Worker continued = 1 597*cda5da8dSAndroid Build Coastguard Worker else: 598*cda5da8dSAndroid Build Coastguard Worker if initial in '([{': 599*cda5da8dSAndroid Build Coastguard Worker parenlev += 1 600*cda5da8dSAndroid Build Coastguard Worker elif initial in ')]}': 601*cda5da8dSAndroid Build Coastguard Worker parenlev -= 1 602*cda5da8dSAndroid Build Coastguard Worker yield TokenInfo(OP, token, spos, epos, line) 603*cda5da8dSAndroid Build Coastguard Worker else: 604*cda5da8dSAndroid Build Coastguard Worker yield TokenInfo(ERRORTOKEN, line[pos], 605*cda5da8dSAndroid Build Coastguard Worker (lnum, pos), (lnum, pos+1), line) 606*cda5da8dSAndroid Build Coastguard Worker pos += 1 607*cda5da8dSAndroid Build Coastguard Worker 608*cda5da8dSAndroid Build Coastguard Worker # Add an implicit NEWLINE if the input doesn't end in one 609*cda5da8dSAndroid Build Coastguard Worker if last_line and last_line[-1] not in '\r\n' and not last_line.strip().startswith("#"): 610*cda5da8dSAndroid Build Coastguard Worker yield TokenInfo(NEWLINE, '', (lnum - 1, len(last_line)), 
(lnum - 1, len(last_line) + 1), '') 611*cda5da8dSAndroid Build Coastguard Worker for indent in indents[1:]: # pop remaining indent levels 612*cda5da8dSAndroid Build Coastguard Worker yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '') 613*cda5da8dSAndroid Build Coastguard Worker yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '') 614*cda5da8dSAndroid Build Coastguard Worker 615*cda5da8dSAndroid Build Coastguard Worker 616*cda5da8dSAndroid Build Coastguard Workerdef generate_tokens(readline): 617*cda5da8dSAndroid Build Coastguard Worker """Tokenize a source reading Python code as unicode strings. 618*cda5da8dSAndroid Build Coastguard Worker 619*cda5da8dSAndroid Build Coastguard Worker This has the same API as tokenize(), except that it expects the *readline* 620*cda5da8dSAndroid Build Coastguard Worker callable to return str objects instead of bytes. 621*cda5da8dSAndroid Build Coastguard Worker """ 622*cda5da8dSAndroid Build Coastguard Worker return _tokenize(readline, None) 623*cda5da8dSAndroid Build Coastguard Worker 624*cda5da8dSAndroid Build Coastguard Workerdef main(): 625*cda5da8dSAndroid Build Coastguard Worker import argparse 626*cda5da8dSAndroid Build Coastguard Worker 627*cda5da8dSAndroid Build Coastguard Worker # Helper error handling routines 628*cda5da8dSAndroid Build Coastguard Worker def perror(message): 629*cda5da8dSAndroid Build Coastguard Worker sys.stderr.write(message) 630*cda5da8dSAndroid Build Coastguard Worker sys.stderr.write('\n') 631*cda5da8dSAndroid Build Coastguard Worker 632*cda5da8dSAndroid Build Coastguard Worker def error(message, filename=None, location=None): 633*cda5da8dSAndroid Build Coastguard Worker if location: 634*cda5da8dSAndroid Build Coastguard Worker args = (filename,) + location + (message,) 635*cda5da8dSAndroid Build Coastguard Worker perror("%s:%d:%d: error: %s" % args) 636*cda5da8dSAndroid Build Coastguard Worker elif filename: 637*cda5da8dSAndroid Build Coastguard Worker perror("%s: error: %s" % (filename, 
message)) 638*cda5da8dSAndroid Build Coastguard Worker else: 639*cda5da8dSAndroid Build Coastguard Worker perror("error: %s" % message) 640*cda5da8dSAndroid Build Coastguard Worker sys.exit(1) 641*cda5da8dSAndroid Build Coastguard Worker 642*cda5da8dSAndroid Build Coastguard Worker # Parse the arguments and options 643*cda5da8dSAndroid Build Coastguard Worker parser = argparse.ArgumentParser(prog='python -m tokenize') 644*cda5da8dSAndroid Build Coastguard Worker parser.add_argument(dest='filename', nargs='?', 645*cda5da8dSAndroid Build Coastguard Worker metavar='filename.py', 646*cda5da8dSAndroid Build Coastguard Worker help='the file to tokenize; defaults to stdin') 647*cda5da8dSAndroid Build Coastguard Worker parser.add_argument('-e', '--exact', dest='exact', action='store_true', 648*cda5da8dSAndroid Build Coastguard Worker help='display token names using the exact type') 649*cda5da8dSAndroid Build Coastguard Worker args = parser.parse_args() 650*cda5da8dSAndroid Build Coastguard Worker 651*cda5da8dSAndroid Build Coastguard Worker try: 652*cda5da8dSAndroid Build Coastguard Worker # Tokenize the input 653*cda5da8dSAndroid Build Coastguard Worker if args.filename: 654*cda5da8dSAndroid Build Coastguard Worker filename = args.filename 655*cda5da8dSAndroid Build Coastguard Worker with _builtin_open(filename, 'rb') as f: 656*cda5da8dSAndroid Build Coastguard Worker tokens = list(tokenize(f.readline)) 657*cda5da8dSAndroid Build Coastguard Worker else: 658*cda5da8dSAndroid Build Coastguard Worker filename = "<stdin>" 659*cda5da8dSAndroid Build Coastguard Worker tokens = _tokenize(sys.stdin.readline, None) 660*cda5da8dSAndroid Build Coastguard Worker 661*cda5da8dSAndroid Build Coastguard Worker # Output the tokenization 662*cda5da8dSAndroid Build Coastguard Worker for token in tokens: 663*cda5da8dSAndroid Build Coastguard Worker token_type = token.type 664*cda5da8dSAndroid Build Coastguard Worker if args.exact: 665*cda5da8dSAndroid Build Coastguard Worker token_type = 
token.exact_type 666*cda5da8dSAndroid Build Coastguard Worker token_range = "%d,%d-%d,%d:" % (token.start + token.end) 667*cda5da8dSAndroid Build Coastguard Worker print("%-20s%-15s%-15r" % 668*cda5da8dSAndroid Build Coastguard Worker (token_range, tok_name[token_type], token.string)) 669*cda5da8dSAndroid Build Coastguard Worker except IndentationError as err: 670*cda5da8dSAndroid Build Coastguard Worker line, column = err.args[1][1:3] 671*cda5da8dSAndroid Build Coastguard Worker error(err.args[0], filename, (line, column)) 672*cda5da8dSAndroid Build Coastguard Worker except TokenError as err: 673*cda5da8dSAndroid Build Coastguard Worker line, column = err.args[1] 674*cda5da8dSAndroid Build Coastguard Worker error(err.args[0], filename, (line, column)) 675*cda5da8dSAndroid Build Coastguard Worker except SyntaxError as err: 676*cda5da8dSAndroid Build Coastguard Worker error(err, filename) 677*cda5da8dSAndroid Build Coastguard Worker except OSError as err: 678*cda5da8dSAndroid Build Coastguard Worker error(err) 679*cda5da8dSAndroid Build Coastguard Worker except KeyboardInterrupt: 680*cda5da8dSAndroid Build Coastguard Worker print("interrupted\n") 681*cda5da8dSAndroid Build Coastguard Worker except Exception as err: 682*cda5da8dSAndroid Build Coastguard Worker perror("unexpected error: %s" % err) 683*cda5da8dSAndroid Build Coastguard Worker raise 684*cda5da8dSAndroid Build Coastguard Worker 685*cda5da8dSAndroid Build Coastguard Workerdef _generate_tokens_from_c_tokenizer(source): 686*cda5da8dSAndroid Build Coastguard Worker """Tokenize a source reading Python code as unicode strings using the internal C tokenizer""" 687*cda5da8dSAndroid Build Coastguard Worker import _tokenize as c_tokenizer 688*cda5da8dSAndroid Build Coastguard Worker for info in c_tokenizer.TokenizerIter(source): 689*cda5da8dSAndroid Build Coastguard Worker tok, type, lineno, end_lineno, col_off, end_col_off, line = info 690*cda5da8dSAndroid Build Coastguard Worker yield TokenInfo(type, tok, 
(lineno, col_off), (end_lineno, end_col_off), line) 691*cda5da8dSAndroid Build Coastguard Worker 692*cda5da8dSAndroid Build Coastguard Worker 693*cda5da8dSAndroid Build Coastguard Workerif __name__ == "__main__": 694*cda5da8dSAndroid Build Coastguard Worker main() 695