1# mako/lexer.py
2# Copyright 2006-2023 the Mako authors and contributors <see AUTHORS file>
3#
4# This module is part of Mako and is released under
5# the MIT License: http://www.opensource.org/licenses/mit-license.php
6
7"""provides the Lexer class for parsing template strings into parse trees."""
8
9import codecs
10import re
11
12from mako import exceptions
13from mako import parsetree
14from mako.pygen import adjust_whitespace
15
# Module-wide cache of compiled regular expressions, shared by every Lexer.
# Keys are (pattern_string, flags) tuples; entries are added lazily by
# Lexer.match() the first time a given pattern/flags pair is requested.
_regexp_cache = {}
17
18
19class Lexer:
20    def __init__(
21        self, text, filename=None, input_encoding=None, preprocessor=None
22    ):
23        self.text = text
24        self.filename = filename
25        self.template = parsetree.TemplateNode(self.filename)
26        self.matched_lineno = 1
27        self.matched_charpos = 0
28        self.lineno = 1
29        self.match_position = 0
30        self.tag = []
31        self.control_line = []
32        self.ternary_stack = []
33        self.encoding = input_encoding
34
35        if preprocessor is None:
36            self.preprocessor = []
37        elif not hasattr(preprocessor, "__iter__"):
38            self.preprocessor = [preprocessor]
39        else:
40            self.preprocessor = preprocessor
41
42    @property
43    def exception_kwargs(self):
44        return {
45            "source": self.text,
46            "lineno": self.matched_lineno,
47            "pos": self.matched_charpos,
48            "filename": self.filename,
49        }
50
51    def match(self, regexp, flags=None):
52        """compile the given regexp, cache the reg, and call match_reg()."""
53
54        try:
55            reg = _regexp_cache[(regexp, flags)]
56        except KeyError:
57            reg = re.compile(regexp, flags) if flags else re.compile(regexp)
58            _regexp_cache[(regexp, flags)] = reg
59
60        return self.match_reg(reg)
61
62    def match_reg(self, reg):
63        """match the given regular expression object to the current text
64        position.
65
66        if a match occurs, update the current text and line position.
67
68        """
69
70        mp = self.match_position
71
72        match = reg.match(self.text, self.match_position)
73        if match:
74            (start, end) = match.span()
75            self.match_position = end + 1 if end == start else end
76            self.matched_lineno = self.lineno
77            cp = mp - 1
78            if cp >= 0 and cp < self.textlength:
79                cp = self.text[: cp + 1].rfind("\n")
80            self.matched_charpos = mp - cp
81            self.lineno += self.text[mp : self.match_position].count("\n")
82        return match
83
84    def parse_until_text(self, watch_nesting, *text):
85        startpos = self.match_position
86        text_re = r"|".join(text)
87        brace_level = 0
88        paren_level = 0
89        bracket_level = 0
90        while True:
91            match = self.match(r"#.*\n")
92            if match:
93                continue
94            match = self.match(
95                r"(\"\"\"|\'\'\'|\"|\')[^\\]*?(\\.[^\\]*?)*\1", re.S
96            )
97            if match:
98                continue
99            match = self.match(r"(%s)" % text_re)
100            if match and not (
101                watch_nesting
102                and (brace_level > 0 or paren_level > 0 or bracket_level > 0)
103            ):
104                return (
105                    self.text[
106                        startpos : self.match_position - len(match.group(1))
107                    ],
108                    match.group(1),
109                )
110            elif not match:
111                match = self.match(r"(.*?)(?=\"|\'|#|%s)" % text_re, re.S)
112            if match:
113                brace_level += match.group(1).count("{")
114                brace_level -= match.group(1).count("}")
115                paren_level += match.group(1).count("(")
116                paren_level -= match.group(1).count(")")
117                bracket_level += match.group(1).count("[")
118                bracket_level -= match.group(1).count("]")
119                continue
120            raise exceptions.SyntaxException(
121                "Expected: %s" % ",".join(text), **self.exception_kwargs
122            )
123
124    def append_node(self, nodecls, *args, **kwargs):
125        kwargs.setdefault("source", self.text)
126        kwargs.setdefault("lineno", self.matched_lineno)
127        kwargs.setdefault("pos", self.matched_charpos)
128        kwargs["filename"] = self.filename
129        node = nodecls(*args, **kwargs)
130        if len(self.tag):
131            self.tag[-1].nodes.append(node)
132        else:
133            self.template.nodes.append(node)
134        # build a set of child nodes for the control line
135        # (used for loop variable detection)
136        # also build a set of child nodes on ternary control lines
137        # (used for determining if a pass needs to be auto-inserted
138        if self.control_line:
139            control_frame = self.control_line[-1]
140            control_frame.nodes.append(node)
141            if (
142                not (
143                    isinstance(node, parsetree.ControlLine)
144                    and control_frame.is_ternary(node.keyword)
145                )
146                and self.ternary_stack
147                and self.ternary_stack[-1]
148            ):
149                self.ternary_stack[-1][-1].nodes.append(node)
150        if isinstance(node, parsetree.Tag):
151            if len(self.tag):
152                node.parent = self.tag[-1]
153            self.tag.append(node)
154        elif isinstance(node, parsetree.ControlLine):
155            if node.isend:
156                self.control_line.pop()
157                self.ternary_stack.pop()
158            elif node.is_primary:
159                self.control_line.append(node)
160                self.ternary_stack.append([])
161            elif self.control_line and self.control_line[-1].is_ternary(
162                node.keyword
163            ):
164                self.ternary_stack[-1].append(node)
165            elif self.control_line and not self.control_line[-1].is_ternary(
166                node.keyword
167            ):
168                raise exceptions.SyntaxException(
169                    "Keyword '%s' not a legal ternary for keyword '%s'"
170                    % (node.keyword, self.control_line[-1].keyword),
171                    **self.exception_kwargs,
172                )
173
    # Matches a source-encoding comment such as "# -*- coding: utf-8 -*-"
    # through the end of its line; group 1 captures the encoding name.
    # Used with .match(), so the comment is only recognised at the very
    # start of the text (or at the lexer's current position in parse()).
    _coding_re = re.compile(r"#.*coding[:=]\s*([-\w.]+).*\r?\n")
175
176    def decode_raw_stream(self, text, decode_raw, known_encoding, filename):
177        """given string/unicode or bytes/string, determine encoding
178        from magic encoding comment, return body as unicode
179        or raw if decode_raw=False
180
181        """
182        if isinstance(text, str):
183            m = self._coding_re.match(text)
184            encoding = m and m.group(1) or known_encoding or "utf-8"
185            return encoding, text
186
187        if text.startswith(codecs.BOM_UTF8):
188            text = text[len(codecs.BOM_UTF8) :]
189            parsed_encoding = "utf-8"
190            m = self._coding_re.match(text.decode("utf-8", "ignore"))
191            if m is not None and m.group(1) != "utf-8":
192                raise exceptions.CompileException(
193                    "Found utf-8 BOM in file, with conflicting "
194                    "magic encoding comment of '%s'" % m.group(1),
195                    text.decode("utf-8", "ignore"),
196                    0,
197                    0,
198                    filename,
199                )
200        else:
201            m = self._coding_re.match(text.decode("utf-8", "ignore"))
202            parsed_encoding = m.group(1) if m else known_encoding or "utf-8"
203        if decode_raw:
204            try:
205                text = text.decode(parsed_encoding)
206            except UnicodeDecodeError:
207                raise exceptions.CompileException(
208                    "Unicode decode operation of encoding '%s' failed"
209                    % parsed_encoding,
210                    text.decode("utf-8", "ignore"),
211                    0,
212                    0,
213                    filename,
214                )
215
216        return parsed_encoding, text
217
218    def parse(self):
219        self.encoding, self.text = self.decode_raw_stream(
220            self.text, True, self.encoding, self.filename
221        )
222
223        for preproc in self.preprocessor:
224            self.text = preproc(self.text)
225
226        # push the match marker past the
227        # encoding comment.
228        self.match_reg(self._coding_re)
229
230        self.textlength = len(self.text)
231
232        while True:
233            if self.match_position > self.textlength:
234                break
235
236            if self.match_end():
237                break
238            if self.match_expression():
239                continue
240            if self.match_control_line():
241                continue
242            if self.match_comment():
243                continue
244            if self.match_tag_start():
245                continue
246            if self.match_tag_end():
247                continue
248            if self.match_python_block():
249                continue
250            if self.match_text():
251                continue
252
253            if self.match_position > self.textlength:
254                break
255            # TODO: no coverage here
256            raise exceptions.MakoException("assertion failed")
257
258        if len(self.tag):
259            raise exceptions.SyntaxException(
260                "Unclosed tag: <%%%s>" % self.tag[-1].keyword,
261                **self.exception_kwargs,
262            )
263        if len(self.control_line):
264            raise exceptions.SyntaxException(
265                "Unterminated control keyword: '%s'"
266                % self.control_line[-1].keyword,
267                self.text,
268                self.control_line[-1].lineno,
269                self.control_line[-1].pos,
270                self.filename,
271            )
272        return self.template
273
274    def match_tag_start(self):
275        reg = r"""
276            \<%     # opening tag
277
278            ([\w\.\:]+)   # keyword
279
280            ((?:\s+\w+|\s*=\s*|"[^"]*?"|'[^']*?'|\s*,\s*)*)  # attrname, = \
281                                               #        sign, string expression
282                                               # comma is for backwards compat
283                                               # identified in #366
284
285            \s*     # more whitespace
286
287            (/)?>   # closing
288
289        """
290
291        match = self.match(
292            reg,
293            re.I | re.S | re.X,
294        )
295
296        if not match:
297            return False
298
299        keyword, attr, isend = match.groups()
300        self.keyword = keyword
301        attributes = {}
302        if attr:
303            for att in re.findall(
304                r"\s*(\w+)\s*=\s*(?:'([^']*)'|\"([^\"]*)\")", attr
305            ):
306                key, val1, val2 = att
307                text = val1 or val2
308                text = text.replace("\r\n", "\n")
309                attributes[key] = text
310        self.append_node(parsetree.Tag, keyword, attributes)
311        if isend:
312            self.tag.pop()
313        elif keyword == "text":
314            match = self.match(r"(.*?)(?=\</%text>)", re.S)
315            if not match:
316                raise exceptions.SyntaxException(
317                    "Unclosed tag: <%%%s>" % self.tag[-1].keyword,
318                    **self.exception_kwargs,
319                )
320            self.append_node(parsetree.Text, match.group(1))
321            return self.match_tag_end()
322        return True
323
324    def match_tag_end(self):
325        match = self.match(r"\</%[\t ]*([^\t ]+?)[\t ]*>")
326        if match:
327            if not len(self.tag):
328                raise exceptions.SyntaxException(
329                    "Closing tag without opening tag: </%%%s>"
330                    % match.group(1),
331                    **self.exception_kwargs,
332                )
333            elif self.tag[-1].keyword != match.group(1):
334                raise exceptions.SyntaxException(
335                    "Closing tag </%%%s> does not match tag: <%%%s>"
336                    % (match.group(1), self.tag[-1].keyword),
337                    **self.exception_kwargs,
338                )
339            self.tag.pop()
340            return True
341        else:
342            return False
343
344    def match_end(self):
345        match = self.match(r"\Z", re.S)
346        if not match:
347            return False
348
349        string = match.group()
350        if string:
351            return string
352        else:
353            return True
354
355    def match_text(self):
356        match = self.match(
357            r"""
358                (.*?)         # anything, followed by:
359                (
360                 (?<=\n)(?=[ \t]*(?=%|\#\#)) # an eval or line-based
361                                             # comment preceded by a
362                                             # consumed newline and whitespace
363                 |
364                 (?=\${)      # an expression
365                 |
366                 (?=</?[%&])  # a substitution or block or call start or end
367                              # - don't consume
368                 |
369                 (\\\r?\n)    # an escaped newline  - throw away
370                 |
371                 \Z           # end of string
372                )""",
373            re.X | re.S,
374        )
375
376        if match:
377            text = match.group(1)
378            if text:
379                self.append_node(parsetree.Text, text)
380            return True
381        else:
382            return False
383
384    def match_python_block(self):
385        match = self.match(r"<%(!)?")
386        if match:
387            line, pos = self.matched_lineno, self.matched_charpos
388            text, end = self.parse_until_text(False, r"%>")
389            # the trailing newline helps
390            # compiler.parse() not complain about indentation
391            text = adjust_whitespace(text) + "\n"
392            self.append_node(
393                parsetree.Code,
394                text,
395                match.group(1) == "!",
396                lineno=line,
397                pos=pos,
398            )
399            return True
400        else:
401            return False
402
403    def match_expression(self):
404        match = self.match(r"\${")
405        if not match:
406            return False
407
408        line, pos = self.matched_lineno, self.matched_charpos
409        text, end = self.parse_until_text(True, r"\|", r"}")
410        if end == "|":
411            escapes, end = self.parse_until_text(True, r"}")
412        else:
413            escapes = ""
414        text = text.replace("\r\n", "\n")
415        self.append_node(
416            parsetree.Expression,
417            text,
418            escapes.strip(),
419            lineno=line,
420            pos=pos,
421        )
422        return True
423
424    def match_control_line(self):
425        match = self.match(
426            r"(?<=^)[\t ]*(%(?!%)|##)[\t ]*((?:(?:\\\r?\n)|[^\r\n])*)"
427            r"(?:\r?\n|\Z)",
428            re.M,
429        )
430        if not match:
431            return False
432
433        operator = match.group(1)
434        text = match.group(2)
435        if operator == "%":
436            m2 = re.match(r"(end)?(\w+)\s*(.*)", text)
437            if not m2:
438                raise exceptions.SyntaxException(
439                    "Invalid control line: '%s'" % text,
440                    **self.exception_kwargs,
441                )
442            isend, keyword = m2.group(1, 2)
443            isend = isend is not None
444
445            if isend:
446                if not len(self.control_line):
447                    raise exceptions.SyntaxException(
448                        "No starting keyword '%s' for '%s'" % (keyword, text),
449                        **self.exception_kwargs,
450                    )
451                elif self.control_line[-1].keyword != keyword:
452                    raise exceptions.SyntaxException(
453                        "Keyword '%s' doesn't match keyword '%s'"
454                        % (text, self.control_line[-1].keyword),
455                        **self.exception_kwargs,
456                    )
457            self.append_node(parsetree.ControlLine, keyword, isend, text)
458        else:
459            self.append_node(parsetree.Comment, text)
460        return True
461
462    def match_comment(self):
463        """matches the multiline version of a comment"""
464        match = self.match(r"<%doc>(.*?)</%doc>", re.S)
465        if match:
466            self.append_node(parsetree.Comment, match.group(1))
467            return True
468        else:
469            return False
470