# mako/lexer.py
# Copyright 2006-2023 the Mako authors and contributors <see AUTHORS file>
#
# This module is part of Mako and is released under
# the MIT License: http://www.opensource.org/licenses/mit-license.php

"""provides the Lexer class for parsing template strings into parse trees."""

import codecs
import re

from mako import exceptions
from mako import parsetree
from mako.pygen import adjust_whitespace

# compiled patterns, keyed on (pattern string, flags); shared by all lexers
_regexp_cache = {}


def _is_utf8_encoding(name):
    """Return True if ``name`` is any spelling of the UTF-8 codec.

    The name is resolved through the codec registry, so aliases such as
    ``UTF-8``, ``utf8`` or ``utf_8`` are all accepted.  Unknown encoding
    names are reported as "not UTF-8" rather than raising.
    """
    try:
        return codecs.lookup(name).name == "utf-8"
    except LookupError:
        return False


class Lexer:
    """Scans template text and builds a ``parsetree.TemplateNode`` tree.

    The lexer keeps a cursor (``match_position``) into ``text`` together
    with line / column bookkeeping, plus stacks of the currently open
    tags and control lines.
    """

    def __init__(
        self, text, filename=None, input_encoding=None, preprocessor=None
    ):
        """Set up lexer state.

        :param text: template source, as ``str`` or ``bytes``.
        :param filename: stored on the lexer and passed to every node and
         exception that is created.
        :param input_encoding: fallback encoding used by :meth:`parse`
         when the source carries no magic encoding comment.
        :param preprocessor: ``None``, a single callable, or an iterable
         of callables; each is applied to the text in :meth:`parse`.
        """
        self.text = text
        self.filename = filename
        self.template = parsetree.TemplateNode(self.filename)
        self.matched_lineno = 1
        self.matched_charpos = 0
        self.lineno = 1
        self.match_position = 0
        self.tag = []
        self.control_line = []
        self.ternary_stack = []
        self.encoding = input_encoding

        # normalize the preprocessor argument to an iterable
        if preprocessor is None:
            self.preprocessor = []
        elif not hasattr(preprocessor, "__iter__"):
            self.preprocessor = [preprocessor]
        else:
            self.preprocessor = preprocessor

    @property
    def exception_kwargs(self):
        """Keyword arguments locating the most recent match, for use
        when constructing exceptions."""
        return {
            "source": self.text,
            "lineno": self.matched_lineno,
            "pos": self.matched_charpos,
            "filename": self.filename,
        }

    def match(self, regexp, flags=None):
        """compile the given regexp, cache the reg, and call match_reg()."""

        try:
            reg = _regexp_cache[(regexp, flags)]
        except KeyError:
            reg = re.compile(regexp, flags) if flags else re.compile(regexp)
            _regexp_cache[(regexp, flags)] = reg

        return self.match_reg(reg)

    def match_reg(self, reg):
        """match the given regular expression object to the current text
        position.

        if a match occurs, update the current text and line position.

        Returns the match object, or ``None`` if there was no match.

        """

        mp = self.match_position

        match = reg.match(self.text, self.match_position)
        if match:
            (start, end) = match.span()
            # an empty match still advances by one so that the cursor
            # cannot stay in place
            self.match_position = end + 1 if end == start else end
            self.matched_lineno = self.lineno
            cp = mp - 1
            if cp >= 0 and cp < self.textlength:
                # index of the last newline before the match start
                cp = self.text[: cp + 1].rfind("\n")
            self.matched_charpos = mp - cp
            self.lineno += self.text[mp : self.match_position].count("\n")
        return match

    def parse_until_text(self, watch_nesting, *text):
        """Advance the cursor until one of the ``text`` patterns matches.

        Comments running to a newline and quoted string literals are
        skipped whole.  When ``watch_nesting`` is true, a terminator that
        is seen while a brace, paren or bracket is still open is counted
        as ordinary text rather than ending the scan.

        :param watch_nesting: whether to track ``{}``, ``()``, ``[]``.
        :param text: one or more regular expression fragments, any of
         which terminates the scan.
        :return: tuple of (text consumed before the terminator, the
         terminator that matched).
        :raises exceptions.SyntaxException: if no terminator is found.
        """
        startpos = self.match_position
        text_re = r"|".join(text)
        brace_level = 0
        paren_level = 0
        bracket_level = 0
        while True:
            # skip a comment up to and including its newline
            match = self.match(r"#.*\n")
            if match:
                continue
            # skip a complete (possibly triple-) quoted string
            match = self.match(
                r"(\"\"\"|\'\'\'|\"|\')[^\\]*?(\\.[^\\]*?)*\1", re.S
            )
            if match:
                continue
            match = self.match(r"(%s)" % text_re)
            if match and not (
                watch_nesting
                and (brace_level > 0 or paren_level > 0 or bracket_level > 0)
            ):
                return (
                    self.text[
                        startpos : self.match_position - len(match.group(1))
                    ],
                    match.group(1),
                )
            elif not match:
                # consume up to the next quote, comment or terminator
                match = self.match(r"(.*?)(?=\"|\'|#|%s)" % text_re, re.S)
            if match:
                # reached both for plain text and for a terminator that
                # was seen while nested
                brace_level += match.group(1).count("{")
                brace_level -= match.group(1).count("}")
                paren_level += match.group(1).count("(")
                paren_level -= match.group(1).count(")")
                bracket_level += match.group(1).count("[")
                bracket_level -= match.group(1).count("]")
                continue
            raise exceptions.SyntaxException(
                "Expected: %s" % ",".join(text), **self.exception_kwargs
            )

    def append_node(self, nodecls, *args, **kwargs):
        """Create a node of ``nodecls`` and attach it to the tree.

        ``source``, ``lineno`` and ``pos`` default to the position of the
        most recent match; ``filename`` is always set from the lexer.
        The node is appended to the innermost open tag, or to the
        template itself when no tag is open, and the tag / control line
        stacks are updated according to the kind of node created.

        :raises exceptions.SyntaxException: if a non-primary control
         line is not a legal ternary for the enclosing control line.
        """
        kwargs.setdefault("source", self.text)
        kwargs.setdefault("lineno", self.matched_lineno)
        kwargs.setdefault("pos", self.matched_charpos)
        kwargs["filename"] = self.filename
        node = nodecls(*args, **kwargs)
        if self.tag:
            self.tag[-1].nodes.append(node)
        else:
            self.template.nodes.append(node)
        # build a set of child nodes for the control line
        # (used for loop variable detection)
        # also build a set of child nodes on ternary control lines
        # (used for determining if a pass needs to be auto-inserted)
        if self.control_line:
            control_frame = self.control_line[-1]
            control_frame.nodes.append(node)
            if (
                not (
                    isinstance(node, parsetree.ControlLine)
                    and control_frame.is_ternary(node.keyword)
                )
                and self.ternary_stack
                and self.ternary_stack[-1]
            ):
                self.ternary_stack[-1][-1].nodes.append(node)
        if isinstance(node, parsetree.Tag):
            if self.tag:
                node.parent = self.tag[-1]
            self.tag.append(node)
        elif isinstance(node, parsetree.ControlLine):
            if node.isend:
                self.control_line.pop()
                self.ternary_stack.pop()
            elif node.is_primary:
                self.control_line.append(node)
                self.ternary_stack.append([])
            elif self.control_line and self.control_line[-1].is_ternary(
                node.keyword
            ):
                self.ternary_stack[-1].append(node)
            elif self.control_line and not self.control_line[-1].is_ternary(
                node.keyword
            ):
                raise exceptions.SyntaxException(
                    "Keyword '%s' not a legal ternary for keyword '%s'"
                    % (node.keyword, self.control_line[-1].keyword),
                    **self.exception_kwargs,
                )

    # magic encoding comment at the very start of the text
    _coding_re = re.compile(r"#.*coding[:=]\s*([-\w.]+).*\r?\n")

    def decode_raw_stream(self, text, decode_raw, known_encoding, filename):
        """given string/unicode or bytes/string, determine encoding
        from magic encoding comment, return body as unicode
        or raw if decode_raw=False

        :param text: template source, ``str`` or ``bytes``.
        :param decode_raw: when true, ``bytes`` input is decoded with
         the determined encoding; ``str`` input is returned unchanged
         either way.
        :param known_encoding: fallback used when there is no magic
         encoding comment; ``"utf-8"`` is used if this is also empty.
        :param filename: passed to any exception raised.
        :return: tuple of (encoding name, text).
        :raises exceptions.CompileException: if a UTF-8 BOM is followed
         by a magic comment naming a non-UTF-8 encoding, or if decoding
         fails.
        """
        if isinstance(text, str):
            m = self._coding_re.match(text)
            encoding = m.group(1) if m else (known_encoding or "utf-8")
            return encoding, text

        if text.startswith(codecs.BOM_UTF8):
            text = text[len(codecs.BOM_UTF8) :]
            parsed_encoding = "utf-8"
            m = self._coding_re.match(text.decode("utf-8", "ignore"))
            # only a magic comment naming a different codec conflicts
            # with the BOM; other spellings of UTF-8 ("UTF-8", "utf8",
            # "utf_8") are fine
            if m is not None and not _is_utf8_encoding(m.group(1)):
                raise exceptions.CompileException(
                    "Found utf-8 BOM in file, with conflicting "
                    "magic encoding comment of '%s'" % m.group(1),
                    text.decode("utf-8", "ignore"),
                    0,
                    0,
                    filename,
                )
        else:
            m = self._coding_re.match(text.decode("utf-8", "ignore"))
            parsed_encoding = m.group(1) if m else known_encoding or "utf-8"
        if decode_raw:
            try:
                text = text.decode(parsed_encoding)
            except UnicodeDecodeError as err:
                raise exceptions.CompileException(
                    "Unicode decode operation of encoding '%s' failed"
                    % parsed_encoding,
                    text.decode("utf-8", "ignore"),
                    0,
                    0,
                    filename,
                ) from err

        return parsed_encoding, text

    def parse(self):
        """Lex the whole text and return the populated template node.

        The text is decoded, run through each preprocessor, and then
        scanned by trying each ``match_*`` method in turn until the end
        of the text is reached.

        :raises exceptions.SyntaxException: if a tag or control line is
         still open at the end of the text.
        :raises exceptions.MakoException: if no matcher makes progress.
        """
        self.encoding, self.text = self.decode_raw_stream(
            self.text, True, self.encoding, self.filename
        )

        for preproc in self.preprocessor:
            self.text = preproc(self.text)

        # match_reg() reads textlength, so it is set before the first
        # match is attempted
        self.textlength = len(self.text)

        # push the match marker past the
        # encoding comment.
        self.match_reg(self._coding_re)

        while True:
            if self.match_position > self.textlength:
                break

            if self.match_end():
                break
            if self.match_expression():
                continue
            if self.match_control_line():
                continue
            if self.match_comment():
                continue
            if self.match_tag_start():
                continue
            if self.match_tag_end():
                continue
            if self.match_python_block():
                continue
            if self.match_text():
                continue

            if self.match_position > self.textlength:
                break
            # TODO: no coverage here
            raise exceptions.MakoException("assertion failed")

        if self.tag:
            raise exceptions.SyntaxException(
                "Unclosed tag: <%%%s>" % self.tag[-1].keyword,
                **self.exception_kwargs,
            )
        if self.control_line:
            raise exceptions.SyntaxException(
                "Unterminated control keyword: '%s'"
                % self.control_line[-1].keyword,
                self.text,
                self.control_line[-1].lineno,
                self.control_line[-1].pos,
                self.filename,
            )
        return self.template

    def match_tag_start(self):
        """Match an opening ``<%keyword attr="value" ...>`` tag.

        A self-closing tag (``/>``) is popped off the tag stack again
        straight away.  The body of a ``<%text>`` tag is consumed
        verbatim up to its closing tag.

        :return: ``True`` if a tag was consumed, ``False`` otherwise.
        :raises exceptions.SyntaxException: if ``<%text>`` is not closed.
        """
        reg = r"""
            \<%     # opening tag

            ([\w\.\:]+)   # keyword

            ((?:\s+\w+|\s*=\s*|"[^"]*?"|'[^']*?'|\s*,\s*)*)  # attrname, = \
                                               #        sign, string expression
                                               # comma is for backwards compat
                                               # identified in #366

            \s*     # more whitespace

            (/)?>   # closing

        """

        match = self.match(
            reg,
            re.I | re.S | re.X,
        )

        if not match:
            return False

        keyword, attr, isend = match.groups()
        self.keyword = keyword
        attributes = {}
        if attr:
            for att in re.findall(
                r"\s*(\w+)\s*=\s*(?:'([^']*)'|\"([^\"]*)\")", attr
            ):
                # val1 is the single-quoted form, val2 the double-quoted
                key, val1, val2 = att
                text = val1 or val2
                text = text.replace("\r\n", "\n")
                attributes[key] = text
        self.append_node(parsetree.Tag, keyword, attributes)
        if isend:
            self.tag.pop()
        elif keyword == "text":
            match = self.match(r"(.*?)(?=\</%text>)", re.S)
            if not match:
                raise exceptions.SyntaxException(
                    "Unclosed tag: <%%%s>" % self.tag[-1].keyword,
                    **self.exception_kwargs,
                )
            self.append_node(parsetree.Text, match.group(1))
            return self.match_tag_end()
        return True

    def match_tag_end(self):
        """Match a closing ``</%keyword>`` tag and pop the tag stack.

        :return: ``True`` if a closing tag was consumed, else ``False``.
        :raises exceptions.SyntaxException: if no tag is open, or the
         keyword differs from that of the innermost open tag.
        """
        match = self.match(r"\</%[\t ]*([^\t ]+?)[\t ]*>")
        if not match:
            return False

        if not self.tag:
            raise exceptions.SyntaxException(
                "Closing tag without opening tag: </%%%s>" % match.group(1),
                **self.exception_kwargs,
            )
        elif self.tag[-1].keyword != match.group(1):
            raise exceptions.SyntaxException(
                "Closing tag </%%%s> does not match tag: <%%%s>"
                % (match.group(1), self.tag[-1].keyword),
                **self.exception_kwargs,
            )
        self.tag.pop()
        return True

    def match_end(self):
        """Return ``True`` if the cursor is at the end of the text."""
        # the end-of-string anchor only ever matches the empty string,
        # so the matched text carries no information
        return bool(self.match(r"\Z", re.S))

    def match_text(self):
        """Match plain template text up to the next construct.

        The text stops before a control or comment line, an expression,
        a tag start or end, or the end of the string; an escaped newline
        ends the text and is discarded.  A ``Text`` node is appended
        only when the text is non-empty.

        :return: ``True`` if the pattern matched, else ``False``.
        """
        match = self.match(
            r"""
                (.*?)         # anything, followed by:
                (
                 (?<=\n)(?=[ \t]*(?=%|\#\#)) # an eval or line-based
                                             # comment preceded by a
                                             # consumed newline and whitespace
                 |
                 (?=\${)      # an expression
                 |
                 (?=</?[%&])  # a substitution or block or call start or end
                              # - don't consume
                 |
                 (\\\r?\n)    # an escaped newline  - throw away
                 |
                 \Z           # end of string
                )""",
            re.X | re.S,
        )

        if not match:
            return False

        text = match.group(1)
        if text:
            self.append_node(parsetree.Text, text)
        return True

    def match_python_block(self):
        """Match a ``<% ... %>`` or ``<%! ... %>`` block of Python code.

        :return: ``True`` if a block was consumed, else ``False``.
        """
        match = self.match(r"<%(!)?")
        if not match:
            return False

        # remember where the block opened; parse_until_text() moves the
        # matched position on
        line, pos = self.matched_lineno, self.matched_charpos
        text, _ = self.parse_until_text(False, r"%>")
        # the trailing newline helps
        # compiler.parse() not complain about indentation
        text = adjust_whitespace(text) + "\n"
        self.append_node(
            parsetree.Code,
            text,
            match.group(1) == "!",
            lineno=line,
            pos=pos,
        )
        return True

    def match_expression(self):
        """Match a ``${expression}`` or ``${expression | escapes}``.

        :return: ``True`` if an expression was consumed, else ``False``.
        """
        match = self.match(r"\${")
        if not match:
            return False

        line, pos = self.matched_lineno, self.matched_charpos
        text, end = self.parse_until_text(True, r"\|", r"}")
        if end == "|":
            # everything between the pipe and the closing brace
            escapes, _ = self.parse_until_text(True, r"}")
        else:
            escapes = ""
        text = text.replace("\r\n", "\n")
        self.append_node(
            parsetree.Expression,
            text,
            escapes.strip(),
            lineno=line,
            pos=pos,
        )
        return True

    def match_control_line(self):
        """Match a ``%`` control line or a ``##`` comment line.

        :return: ``True`` if a line was consumed, else ``False``.
        :raises exceptions.SyntaxException: if a ``%`` line has no
         keyword, or an ``end`` keyword has no matching open control
         line.
        """
        match = self.match(
            r"(?<=^)[\t ]*(%(?!%)|##)[\t ]*((?:(?:\\\r?\n)|[^\r\n])*)"
            r"(?:\r?\n|\Z)",
            re.M,
        )
        if not match:
            return False

        operator = match.group(1)
        text = match.group(2)
        if operator == "%":
            m2 = re.match(r"(end)?(\w+)\s*(.*)", text)
            if not m2:
                raise exceptions.SyntaxException(
                    "Invalid control line: '%s'" % text,
                    **self.exception_kwargs,
                )
            isend, keyword = m2.group(1, 2)
            isend = isend is not None

            if isend:
                if not self.control_line:
                    raise exceptions.SyntaxException(
                        "No starting keyword '%s' for '%s'" % (keyword, text),
                        **self.exception_kwargs,
                    )
                elif self.control_line[-1].keyword != keyword:
                    raise exceptions.SyntaxException(
                        "Keyword '%s' doesn't match keyword '%s'"
                        % (text, self.control_line[-1].keyword),
                        **self.exception_kwargs,
                    )
            self.append_node(parsetree.ControlLine, keyword, isend, text)
        else:
            self.append_node(parsetree.Comment, text)
        return True

    def match_comment(self):
        """matches the multiline version of a comment"""
        match = self.match(r"<%doc>(.*?)</%doc>", re.S)
        if not match:
            return False

        self.append_node(parsetree.Comment, match.group(1))
        return True