import argparse
import sys
import time
import token
import tokenize
import traceback
from abc import abstractmethod
from typing import Any, Callable, ClassVar, Dict, Optional, Tuple, Type, TypeVar, cast

from pegen.tokenizer import Mark, Tokenizer, exact_token_types

T = TypeVar("T")
P = TypeVar("P", bound="Parser")
F = TypeVar("F", bound=Callable[..., Any])


def logger(method: F) -> F:
    """For non-memoized functions that we want to be logged.

    (In practice this is only used for non-leader left-recursive functions.)
    """
    method_name = method.__name__

    def logger_wrapper(self: P, *args: object) -> T:
        if not self._verbose:
            return method(self, *args)
        argsr = ",".join(repr(arg) for arg in args)
        fill = "  " * self._level
        print(f"{fill}{method_name}({argsr}) .... (looking at {self.showpeek()})")
        self._level += 1
        tree = method(self, *args)
        self._level -= 1
        print(f"{fill}... {method_name}({argsr}) --> {tree!s:.200}")
        return tree

    logger_wrapper.__wrapped__ = method  # type: ignore
    return cast(F, logger_wrapper)


def memoize(method: F) -> F:
    """Memoize a symbol method."""
    method_name = method.__name__

    def memoize_wrapper(self: P, *args: object) -> T:
        mark = self._mark()
        key = mark, method_name, args
        # Fast path: cache hit, and not verbose.
        if key in self._cache and not self._verbose:
            tree, endmark = self._cache[key]
            self._reset(endmark)
            return tree
        # Slow path: no cache hit, or verbose.
        verbose = self._verbose
        argsr = ",".join(repr(arg) for arg in args)
        fill = "  " * self._level
        if key not in self._cache:
            if verbose:
                print(f"{fill}{method_name}({argsr}) ... (looking at {self.showpeek()})")
            self._level += 1
            tree = method(self, *args)
            self._level -= 1
            if verbose:
                print(f"{fill}... {method_name}({argsr}) -> {tree!s:.200}")
            endmark = self._mark()
            self._cache[key] = tree, endmark
        else:
            tree, endmark = self._cache[key]
            if verbose:
                print(f"{fill}{method_name}({argsr}) -> {tree!s:.200}")
            self._reset(endmark)
        return tree

    memoize_wrapper.__wrapped__ = method  # type: ignore
    return cast(F, memoize_wrapper)


def memoize_left_rec(method: Callable[[P], Optional[T]]) -> Callable[[P], Optional[T]]:
    """Memoize a left-recursive symbol method."""
    method_name = method.__name__

    def memoize_left_rec_wrapper(self: P) -> Optional[T]:
        mark = self._mark()
        key = mark, method_name, ()
        # Fast path: cache hit, and not verbose.
        if key in self._cache and not self._verbose:
            tree, endmark = self._cache[key]
            self._reset(endmark)
            return tree
        # Slow path: no cache hit, or verbose.
        verbose = self._verbose
        fill = "  " * self._level
        if key not in self._cache:
            if verbose:
                print(f"{fill}{method_name} ... (looking at {self.showpeek()})")
            self._level += 1

            # For left-recursive rules we manipulate the cache and
            # loop until the rule shows no progress, then pick the
            # previous result.  For an explanation of why this works, see
            # https://github.com/PhilippeSigaud/Pegged/wiki/Left-Recursion
            # (but we use the memoization cache instead of a static
            # variable; this is perhaps similar to the technique in the
            # paper by Warth et al.,
            # http://web.cs.ucla.edu/~todd/research/pub.php?id=pepm08).

            # Prime the cache with a failure.
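            # (Seeding the cache with (None, mark) makes the first recursive
            # invocation of this rule below fail immediately, so only the
            # non-left-recursive alternatives can produce the seed result
            # that later iterations of the loop grow.)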
            self._cache[key] = None, mark
            lastresult, lastmark = None, mark
            depth = 0
            if verbose:
                print(f"{fill}Recursive {method_name} at {mark} depth {depth}")

            while True:
                self._reset(mark)
                self.in_recursive_rule += 1
                try:
                    result = method(self)
                finally:
                    self.in_recursive_rule -= 1
                endmark = self._mark()
                depth += 1
                if verbose:
                    print(
                        f"{fill}Recursive {method_name} at {mark} depth {depth}: {result!s:.200} to {endmark}"
                    )
                if not result:
                    if verbose:
                        print(f"{fill}Fail with {lastresult!s:.200} to {lastmark}")
                    break
                if endmark <= lastmark:
                    if verbose:
                        print(f"{fill}Bailing with {lastresult!s:.200} to {lastmark}")
                    break
                self._cache[key] = lastresult, lastmark = result, endmark

            self._reset(lastmark)
            tree = lastresult

            self._level -= 1
            if verbose:
                print(f"{fill}{method_name}() -> {tree!s:.200} [cached]")
            if tree:
                endmark = self._mark()
            else:
                endmark = mark
                self._reset(endmark)
            self._cache[key] = tree, endmark
        else:
            tree, endmark = self._cache[key]
            if verbose:
                print(f"{fill}{method_name}() -> {tree!s:.200} [fresh]")
            if tree:
                self._reset(endmark)
        return tree

    memoize_left_rec_wrapper.__wrapped__ = method  # type: ignore
    return memoize_left_rec_wrapper


class Parser:
    """Parsing base class."""

    KEYWORDS: ClassVar[Tuple[str, ...]]

    SOFT_KEYWORDS: ClassVar[Tuple[str, ...]]

    def __init__(self, tokenizer: Tokenizer, *, verbose: bool = False):
        self._tokenizer = tokenizer
        self._verbose = verbose
        self._level = 0
        self._cache: Dict[Tuple[Mark, str, Tuple[Any, ...]], Tuple[Any, Mark]] = {}
        # Integer tracking whether we are in a left recursive rule or not. Can be useful
        # for error reporting.
        self.in_recursive_rule = 0
        # Pass through common tokenizer methods.
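        # _mark() reports the tokenizer's current position; _reset() moves it
        # back (or forward, for already-buffered tokens) to a saved position.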
        self._mark = self._tokenizer.mark
        self._reset = self._tokenizer.reset

    @abstractmethod
    def start(self) -> Any:
        pass

    def showpeek(self) -> str:
        tok = self._tokenizer.peek()
        return f"{tok.start[0]}.{tok.start[1]}: {token.tok_name[tok.type]}:{tok.string!r}"

    @memoize
    def name(self) -> Optional[tokenize.TokenInfo]:
        tok = self._tokenizer.peek()
        if tok.type == token.NAME and tok.string not in self.KEYWORDS:
            return self._tokenizer.getnext()
        return None

    @memoize
    def number(self) -> Optional[tokenize.TokenInfo]:
        tok = self._tokenizer.peek()
        if tok.type == token.NUMBER:
            return self._tokenizer.getnext()
        return None

    @memoize
    def string(self) -> Optional[tokenize.TokenInfo]:
        tok = self._tokenizer.peek()
        if tok.type == token.STRING:
            return self._tokenizer.getnext()
        return None

    @memoize
    def op(self) -> Optional[tokenize.TokenInfo]:
        tok = self._tokenizer.peek()
        if tok.type == token.OP:
            return self._tokenizer.getnext()
        return None

    @memoize
    def type_comment(self) -> Optional[tokenize.TokenInfo]:
        tok = self._tokenizer.peek()
        if tok.type == token.TYPE_COMMENT:
            return self._tokenizer.getnext()
        return None

    @memoize
    def soft_keyword(self) -> Optional[tokenize.TokenInfo]:
        tok = self._tokenizer.peek()
        if tok.type == token.NAME and tok.string in self.SOFT_KEYWORDS:
            return self._tokenizer.getnext()
        return None

    @memoize
    def expect(self, type: str) -> Optional[tokenize.TokenInfo]:
        tok = self._tokenizer.peek()
        if tok.string == type:
            return self._tokenizer.getnext()
        if type in exact_token_types:
            if tok.type == exact_token_types[type]:
                return self._tokenizer.getnext()
        if type in token.__dict__:
            if tok.type == token.__dict__[type]:
                return self._tokenizer.getnext()
        if tok.type == token.OP and tok.string == type:
            return self._tokenizer.getnext()
        return None

    def expect_forced(self, res: Any, expectation: str) -> Optional[tokenize.TokenInfo]:
        if res is None:
            raise self.make_syntax_error(f"expected {expectation}")
        return res

    def positive_lookahead(self, func: Callable[..., T], *args: object) -> T:
        mark = self._mark()
        ok = func(*args)
        self._reset(mark)
        return ok

    def negative_lookahead(self, func: Callable[..., object], *args: object) -> bool:
        mark = self._mark()
        ok = func(*args)
        self._reset(mark)
        return not ok

    def make_syntax_error(self, message: str, filename: str = "<unknown>") -> SyntaxError:
        tok = self._tokenizer.diagnose()
        return SyntaxError(message, (filename, tok.start[0], 1 + tok.start[1], tok.line))


def simple_parser_main(parser_class: Type[Parser]) -> None:
    argparser = argparse.ArgumentParser()
    argparser.add_argument(
        "-v",
        "--verbose",
        action="count",
        default=0,
        help="Print timing stats; repeat for more debug output",
    )
    argparser.add_argument(
        "-q", "--quiet", action="store_true", help="Don't print the parsed program"
    )
    argparser.add_argument("filename", help="Input file ('-' to use stdin)")

    args = argparser.parse_args()
    verbose = args.verbose
    verbose_tokenizer = verbose >= 3
    verbose_parser = verbose == 2 or verbose >= 4

    t0 = time.time()

    filename = args.filename
    if filename == "" or filename == "-":
        filename = "<stdin>"
        file = sys.stdin
    else:
        file = open(args.filename)
    try:
        tokengen = tokenize.generate_tokens(file.readline)
        tokenizer = Tokenizer(tokengen, verbose=verbose_tokenizer)
        parser = parser_class(tokenizer, verbose=verbose_parser)
        tree = parser.start()
        try:
            if file.isatty():
                endpos = 0
            else:
                endpos = file.tell()
        except IOError:
            endpos = 0
    finally:
        if file is not sys.stdin:
            file.close()

    t1 = time.time()

    if not tree:
        err = parser.make_syntax_error("invalid syntax", filename)
        traceback.print_exception(err.__class__, err, None)
        sys.exit(1)

    if not args.quiet:
        print(tree)

    if verbose:
        dt = t1 - t0
        diag = tokenizer.diagnose()
        nlines = diag.end[0]
        if diag.type == token.ENDMARKER:
            nlines -= 1
        print(f"Total time: {dt:.3f} sec; {nlines} lines", end="")
        if endpos:
            print(f" ({endpos} bytes)", end="")
        if dt:
            print(f"; {nlines / dt:.0f} lines/sec")
        else:
            print()
        print("Cache sizes:")
        print(f"  token array : {len(tokenizer._tokens):10}")
        print(f"        cache : {len(parser._cache):10}")
        ## print_memstats()
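

# Illustrative sketch (not part of pegen itself): how a hand-written subclass
# might use this base class and the decorators above.  The ToyParser name and
# its one-rule grammar are assumptions for demonstration only; real parsers
# are normally generated by pegen from a grammar file.
#
#     import io
#
#     class ToyParser(Parser):
#         KEYWORDS = ()
#         SOFT_KEYWORDS = ()
#
#         @memoize
#         def start(self) -> Optional[Any]:
#             # start: NUMBER '+' NUMBER | NUMBER
#             mark = self._mark()
#             if (a := self.number()) and self.expect("+") and (b := self.number()):
#                 return ("add", a.string, b.string)
#             self._reset(mark)
#             if a := self.number():
#                 return ("num", a.string)
#             self._reset(mark)
#             return None
#
#     tokengen = tokenize.generate_tokens(io.StringIO("1 + 2\n").readline)
#     print(ToyParser(Tokenizer(tokengen)).start())  # -> ('add', '1', '2')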