import argparse
import sys
import time
import token
import tokenize
import traceback
from abc import abstractmethod
from typing import Any, Callable, ClassVar, Dict, Optional, Tuple, Type, TypeVar, cast

from pegen.tokenizer import Mark, Tokenizer, exact_token_types

T = TypeVar("T")
P = TypeVar("P", bound="Parser")
F = TypeVar("F", bound=Callable[..., Any])


def logger(method: F) -> F:
    """For non-memoized functions that we want to be logged.

    (In practice these are only the non-leader left-recursive functions.)
    """
    method_name = method.__name__

    def logger_wrapper(self: P, *args: object) -> T:
        if not self._verbose:
            return method(self, *args)
        argsr = ",".join(repr(arg) for arg in args)
        fill = "  " * self._level
        print(f"{fill}{method_name}({argsr}) .... (looking at {self.showpeek()})")
        self._level += 1
        tree = method(self, *args)
        self._level -= 1
        print(f"{fill}... {method_name}({argsr}) --> {tree!s:.200}")
        return tree

    logger_wrapper.__wrapped__ = method  # type: ignore
    return cast(F, logger_wrapper)

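# A hypothetical example of use (in generated parsers, only the non-leader
# left-recursive rule methods are wrapped this way):
#
#     @logger
#     def term(self) -> Optional[Any]:
#         ...  # alternatives for the rule body
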

def memoize(method: F) -> F:
    """Memoize a symbol method."""
    method_name = method.__name__

    def memoize_wrapper(self: P, *args: object) -> T:
        mark = self._mark()
        key = mark, method_name, args
        # Fast path: cache hit, and not verbose.
        if key in self._cache and not self._verbose:
            tree, endmark = self._cache[key]
            self._reset(endmark)
            return tree
        # Slow path: no cache hit, or verbose.
        verbose = self._verbose
        argsr = ",".join(repr(arg) for arg in args)
        fill = "  " * self._level
        if key not in self._cache:
            if verbose:
                print(f"{fill}{method_name}({argsr}) ... (looking at {self.showpeek()})")
            self._level += 1
            tree = method(self, *args)
            self._level -= 1
            if verbose:
                print(f"{fill}... {method_name}({argsr}) -> {tree!s:.200}")
            endmark = self._mark()
            self._cache[key] = tree, endmark
        else:
            tree, endmark = self._cache[key]
            if verbose:
                print(f"{fill}{method_name}({argsr}) -> {tree!s:.200}")
            self._reset(endmark)
        return tree

    memoize_wrapper.__wrapped__ = method  # type: ignore
    return cast(F, memoize_wrapper)

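# Keyed on (position, rule name, args) and paired with the tokenizer's
# mark()/reset(), this decorator is what makes the parser a packrat parser:
# each rule is attempted at most once per position (and argument tuple).
# A hypothetical decorated rule:
#
#     @memoize
#     def atom(self) -> Optional[Any]:
#         mark = self._mark()
#         if name := self.name():
#             return name
#         self._reset(mark)
#         return None
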

def memoize_left_rec(method: Callable[[P], Optional[T]]) -> Callable[[P], Optional[T]]:
    """Memoize a left-recursive symbol method."""
    method_name = method.__name__

    def memoize_left_rec_wrapper(self: P) -> Optional[T]:
        mark = self._mark()
        key = mark, method_name, ()
        # Fast path: cache hit, and not verbose.
        if key in self._cache and not self._verbose:
            tree, endmark = self._cache[key]
            self._reset(endmark)
            return tree
        # Slow path: no cache hit, or verbose.
        verbose = self._verbose
        fill = "  " * self._level
        if key not in self._cache:
            if verbose:
                print(f"{fill}{method_name} ... (looking at {self.showpeek()})")
            self._level += 1

            # For left-recursive rules we manipulate the cache and
            # loop until the rule shows no progress, then pick the
            # previous result.  For an explanation why this works, see
            # https://github.com/PhilippeSigaud/Pegged/wiki/Left-Recursion
            # (But we use the memoization cache instead of a static
            # variable; this is perhaps similar to the approach of Warth et al.,
            # http://web.cs.ucla.edu/~todd/research/pub.php?id=pepm08.)

            # Prime the cache with a failure.
            self._cache[key] = None, mark
            lastresult, lastmark = None, mark
            depth = 0
            if verbose:
                print(f"{fill}Recursive {method_name} at {mark} depth {depth}")

            while True:
                self._reset(mark)
                self.in_recursive_rule += 1
                try:
                    result = method(self)
                finally:
                    self.in_recursive_rule -= 1
                endmark = self._mark()
                depth += 1
                if verbose:
                    print(
                        f"{fill}Recursive {method_name} at {mark} depth {depth}: {result!s:.200} to {endmark}"
                    )
                if not result:
                    if verbose:
                        print(f"{fill}Fail with {lastresult!s:.200} to {lastmark}")
                    break
                if endmark <= lastmark:
                    if verbose:
                        print(f"{fill}Bailing with {lastresult!s:.200} to {lastmark}")
                    break
                self._cache[key] = lastresult, lastmark = result, endmark

            self._reset(lastmark)
            tree = lastresult

            self._level -= 1
            if verbose:
                print(f"{fill}{method_name}() -> {tree!s:.200} [cached]")
            if tree:
                endmark = self._mark()
            else:
                endmark = mark
                self._reset(endmark)
            self._cache[key] = tree, endmark
        else:
            tree, endmark = self._cache[key]
            if verbose:
                print(f"{fill}{method_name}() -> {tree!s:.200} [fresh]")
            if tree:
                self._reset(endmark)
        return tree

    memoize_left_rec_wrapper.__wrapped__ = method  # type: ignore
    return memoize_left_rec_wrapper

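# Worked example (hypothetical grammar): with the rule
#
#     expr: expr '+' term | term
#
# on input "a + b + c", the first pass through the loop can only use the
# seeded failure for the recursive call, so it matches just "a".  The second
# pass finds "a" in the cache and grows the match to "a + b", the third to
# "a + b + c", and the fourth makes no further progress, so the loop breaks
# and the previous (longest) result is kept.
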

class Parser:
    """Parsing base class."""

    KEYWORDS: ClassVar[Tuple[str, ...]]

    SOFT_KEYWORDS: ClassVar[Tuple[str, ...]]

    def __init__(self, tokenizer: Tokenizer, *, verbose: bool = False):
        self._tokenizer = tokenizer
        self._verbose = verbose
        self._level = 0
        self._cache: Dict[Tuple[Mark, str, Tuple[Any, ...]], Tuple[Any, Mark]] = {}
        # Counter tracking whether we are inside a left-recursive rule
        # (and how deeply nested). Can be useful for error reporting.
        self.in_recursive_rule = 0
        # Pass through common tokenizer methods.
        self._mark = self._tokenizer.mark
        self._reset = self._tokenizer.reset

    @abstractmethod
    def start(self) -> Any:
        pass

    def showpeek(self) -> str:
        tok = self._tokenizer.peek()
        return f"{tok.start[0]}.{tok.start[1]}: {token.tok_name[tok.type]}:{tok.string!r}"

    @memoize
    def name(self) -> Optional[tokenize.TokenInfo]:
        tok = self._tokenizer.peek()
        if tok.type == token.NAME and tok.string not in self.KEYWORDS:
            return self._tokenizer.getnext()
        return None

    @memoize
    def number(self) -> Optional[tokenize.TokenInfo]:
        tok = self._tokenizer.peek()
        if tok.type == token.NUMBER:
            return self._tokenizer.getnext()
        return None

    @memoize
    def string(self) -> Optional[tokenize.TokenInfo]:
        tok = self._tokenizer.peek()
        if tok.type == token.STRING:
            return self._tokenizer.getnext()
        return None

    @memoize
    def op(self) -> Optional[tokenize.TokenInfo]:
        tok = self._tokenizer.peek()
        if tok.type == token.OP:
            return self._tokenizer.getnext()
        return None

    @memoize
    def type_comment(self) -> Optional[tokenize.TokenInfo]:
        tok = self._tokenizer.peek()
        if tok.type == token.TYPE_COMMENT:
            return self._tokenizer.getnext()
        return None

    @memoize
    def soft_keyword(self) -> Optional[tokenize.TokenInfo]:
        tok = self._tokenizer.peek()
        if tok.type == token.NAME and tok.string in self.SOFT_KEYWORDS:
            return self._tokenizer.getnext()
        return None

    @memoize
    def expect(self, type: str) -> Optional[tokenize.TokenInfo]:
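        # Try, in order: a literal string match, exact token types (e.g. "+"),
        # token names from the token module (e.g. "NEWLINE"), and finally a
        # generic OP token whose string matches.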
        tok = self._tokenizer.peek()
        if tok.string == type:
            return self._tokenizer.getnext()
        if type in exact_token_types:
            if tok.type == exact_token_types[type]:
                return self._tokenizer.getnext()
        if type in token.__dict__:
            if tok.type == token.__dict__[type]:
                return self._tokenizer.getnext()
        if tok.type == token.OP and tok.string == type:
            return self._tokenizer.getnext()
        return None

    def expect_forced(self, res: Any, expectation: str) -> Optional[tokenize.TokenInfo]:
        if res is None:
            raise self.make_syntax_error(f"expected {expectation}")
        return res

    def positive_lookahead(self, func: Callable[..., T], *args: object) -> T:
        mark = self._mark()
        ok = func(*args)
        self._reset(mark)
        return ok

    def negative_lookahead(self, func: Callable[..., object], *args: object) -> bool:
        mark = self._mark()
        ok = func(*args)
        self._reset(mark)
        return not ok
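
    # The two lookahead helpers implement the PEG "&" and "!" operators:
    # they run func at the current position, restore the position, and
    # report whether it succeeded.  Generated code for &'(' is roughly
    # self.positive_lookahead(self.expect, "(").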

    def make_syntax_error(self, message: str, filename: str = "<unknown>") -> SyntaxError:
        tok = self._tokenizer.diagnose()
        return SyntaxError(message, (filename, tok.start[0], 1 + tok.start[1], tok.line))

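# A minimal sketch of a concrete subclass (hypothetical and hand-written;
# real subclasses are produced by the pegen code generator but follow the
# same shape):
#
#     class ToyParser(Parser):
#         KEYWORDS = ()
#         SOFT_KEYWORDS = ()
#
#         @memoize
#         def start(self) -> Optional[Any]:
#             # start: NAME '=' NUMBER
#             mark = self._mark()
#             if (n := self.name()) and self.expect("=") and (v := self.number()):
#                 return (n.string, v.string)
#             self._reset(mark)
#             return None
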

def simple_parser_main(parser_class: Type[Parser]) -> None:
    argparser = argparse.ArgumentParser()
    argparser.add_argument(
        "-v",
        "--verbose",
        action="count",
        default=0,
        help="Print timing stats; repeat for more debug output",
    )
    argparser.add_argument(
        "-q", "--quiet", action="store_true", help="Don't print the parsed program"
    )
    argparser.add_argument("filename", help="Input file ('-' to use stdin)")

    args = argparser.parse_args()
    verbose = args.verbose
    verbose_tokenizer = verbose >= 3
    verbose_parser = verbose == 2 or verbose >= 4
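    # With these thresholds: -v prints timing stats only, -vv adds parser
    # tracing, -vvv switches to tokenizer tracing, and -vvvv traces both.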

    t0 = time.time()

    filename = args.filename
    if filename == "" or filename == "-":
        filename = "<stdin>"
        file = sys.stdin
    else:
        file = open(args.filename)
    try:
        tokengen = tokenize.generate_tokens(file.readline)
        tokenizer = Tokenizer(tokengen, verbose=verbose_tokenizer)
        parser = parser_class(tokenizer, verbose=verbose_parser)
        tree = parser.start()
        try:
            if file.isatty():
                endpos = 0
            else:
                endpos = file.tell()
        except IOError:
            endpos = 0
    finally:
        if file is not sys.stdin:
            file.close()

    t1 = time.time()

    if not tree:
        err = parser.make_syntax_error("invalid syntax", filename)
        traceback.print_exception(err.__class__, err, None)
        sys.exit(1)

    if not args.quiet:
        print(tree)

    if verbose:
        dt = t1 - t0
        diag = tokenizer.diagnose()
        nlines = diag.end[0]
        if diag.type == token.ENDMARKER:
            nlines -= 1
        print(f"Total time: {dt:.3f} sec; {nlines} lines", end="")
        if endpos:
            print(f" ({endpos} bytes)", end="")
        if dt:
            print(f"; {nlines / dt:.0f} lines/sec")
        else:
            print()
        print("Cache sizes:")
        print(f"  token array : {len(tokenizer._tokens):10}")
        print(f"        cache : {len(parser._cache):10}")
        ## print_memstats()
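
# A generated parser module typically ends with something like the following
# (hypothetical class name):
#
#     if __name__ == "__main__":
#         from pegen.parser import simple_parser_main
#         simple_parser_main(GeneratedParser)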