# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tokenization for the Emboss definition language.

This module exports the tokenize function and various errors.

In addition, a couple of lists are exported for the use of
generate_grammar_md.py:

LITERAL_TOKEN_PATTERNS: A list of literal strings which are matched against
  input.
REGEX_TOKEN_PATTERNS: A list of regexes used for tokenization.
  REGEX_TOKEN_PATTERNS[n].regex is a compiled regular expression object
  (REGEX_TOKEN_PATTERNS[n].regex.pattern contains the text of the pattern), and
  REGEX_TOKEN_PATTERNS[n].symbol is the name of the symbol assigned to tokens
  which match the pattern.
"""

import collections
import re

from compiler.util import error
from compiler.util import parser_types


def tokenize(text, file_name):
  # TODO(bolms): suppress end-of-line, indent, and dedent tokens between
  # matched delimiters ([], (), and {}).
  """Tokenizes its argument.

  Arguments:
    text: The raw text of a .emb file.
    file_name: The name of the file to use in errors.

  Returns:
    A tuple of:
      a list of parser_types.Tokens, or None if there were errors, and
      a possibly-empty list of errors.
  """
  tokens = []
  indent_stack = [""]
  line_number = 0
  for line in text.splitlines():
    line_number += 1

    # _tokenize_line splits the actual text into tokens.
    line_tokens, errors = _tokenize_line(line, line_number, file_name)
    if errors:
      return None, errors

    # Lines with only whitespace and comments are not used for Indent/Dedent
    # calculation: their tokens and an end-of-line token are emitted, but
    # their indentation is ignored.
    for token in line_tokens:
      if token.symbol != "Comment":
        break
    else:
      tokens.extend(line_tokens)
      tokens.append(parser_types.Token(
          '"\\n"', "\n", parser_types.make_location(
              (line_number, len(line) + 1), (line_number, len(line) + 1))))
      continue

    # Leading whitespace is whatever .lstrip() removes.
    leading_whitespace = line[0:len(line) - len(line.lstrip())]
    if leading_whitespace == indent_stack[-1]:
      # If the current leading whitespace is equal to the last leading
      # whitespace, do not emit an Indent or Dedent token.
      pass
    elif leading_whitespace.startswith(indent_stack[-1]):
      # If the current leading whitespace is longer than the last leading
      # whitespace, emit an Indent token. For the token text, take the new
      # part of the whitespace.
      tokens.append(
          parser_types.Token(
              "Indent", leading_whitespace[len(indent_stack[-1]):],
              parser_types.make_location(
                  (line_number, len(indent_stack[-1]) + 1),
                  (line_number, len(leading_whitespace) + 1))))
      indent_stack.append(leading_whitespace)
    else:
      # Otherwise, search for the unclosed indentation level that matches
      # the current indentation level. Emit a Dedent token for each
      # newly-closed indentation level.
      for i in range(len(indent_stack) - 1, -1, -1):
        if leading_whitespace == indent_stack[i]:
          break
        tokens.append(
            parser_types.Token("Dedent", "", parser_types.make_location(
                (line_number, len(leading_whitespace) + 1),
                (line_number, len(leading_whitespace) + 1))))
        del indent_stack[i]
      else:
        return None, [[error.error(
            file_name, parser_types.make_location(
                (line_number, 1), (line_number, len(leading_whitespace) + 1)),
            "Bad indentation")]]

    tokens.extend(line_tokens)

    # Append an end-of-line token (for non-whitespace lines).
    tokens.append(parser_types.Token(
        '"\\n"', "\n", parser_types.make_location(
            (line_number, len(line) + 1), (line_number, len(line) + 1))))
  for i in range(len(indent_stack) - 1):
    tokens.append(parser_types.Token("Dedent", "", parser_types.make_location(
        (line_number + 1, 1), (line_number + 1, 1))))
  return tokens, []
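
# For illustration, the (hypothetical) call
#
#   tokenize("struct Foo:\n  0 [+4]  UInt  bar\n", "m.emb")
#
# (where "m.emb" is only a placeholder name used in error messages) returns a
# list of tokens whose symbols are, in order,
#
#   '"struct"'  CamelWord  '":"'  '"\\n"'
#   Indent  Number  '"["'  '"+"'  Number  '"]"'  CamelWord  SnakeWord  '"\\n"'
#   Dedent
#
# plus an empty list of errors.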

# Token patterns used by _tokenize_line.
LITERAL_TOKEN_PATTERNS = (
    "[ ] ( ) : = + - * . ? == != && || < > <= >= , "
    "$static_size_in_bits $is_statically_sized "
    "$max $present $upper_bound $lower_bound $next "
    "$size_in_bits $size_in_bytes "
    "$max_size_in_bits $max_size_in_bytes $min_size_in_bits $min_size_in_bytes "
    "$default struct bits enum external import as if let").split()
_T = collections.namedtuple("T", ["regex", "symbol"])
REGEX_TOKEN_PATTERNS = [
    # Words starting with variations of "emboss reserved" are reserved for
    # internal use by the Emboss compiler.
    _T(re.compile(r"EmbossReserved[A-Za-z0-9]*"), "BadWord"),
    _T(re.compile(r"emboss_reserved[_a-z0-9]*"), "BadWord"),
    _T(re.compile(r"EMBOSS_RESERVED[_A-Z0-9]*"), "BadWord"),
    _T(re.compile(r'"(?:[^"\n\\]|\\[n\\"])*"'), "String"),
    _T(re.compile("[0-9]+"), "Number"),
    _T(re.compile("[0-9]{1,3}(?:_[0-9]{3})*"), "Number"),
    _T(re.compile("0x[0-9a-fA-F]+"), "Number"),
    _T(re.compile("0x_?[0-9a-fA-F]{1,4}(?:_[0-9a-fA-F]{4})*"), "Number"),
    _T(re.compile("0x_?[0-9a-fA-F]{1,8}(?:_[0-9a-fA-F]{8})*"), "Number"),
    _T(re.compile("0b[01]+"), "Number"),
    _T(re.compile("0b_?[01]{1,4}(?:_[01]{4})*"), "Number"),
    _T(re.compile("0b_?[01]{1,8}(?:_[01]{8})*"), "Number"),
    _T(re.compile("true|false"), "BooleanConstant"),
    _T(re.compile("[a-z][a-z_0-9]*"), "SnakeWord"),
    # Single-letter ShoutyWords (like "A") and single-letter-followed-by-number
    # ShoutyWords ("A100") are disallowed due to ambiguity with CamelWords. A
    # ShoutyWord must start with an upper case letter and contain at least one
    # more upper case letter or '_'.
    _T(re.compile("[A-Z][A-Z_0-9]*[A-Z_][A-Z_0-9]*"), "ShoutyWord"),
    # A CamelWord starts with A-Z and contains at least one a-z, and no '_'.
    _T(re.compile("[A-Z][a-zA-Z0-9]*[a-z][a-zA-Z0-9]*"), "CamelWord"),
    _T(re.compile("-- .*"), "Documentation"),
    _T(re.compile("--$"), "Documentation"),
    _T(re.compile("--.*"), "BadDocumentation"),
    _T(re.compile(r"\s+"), None),
    _T(re.compile("#.*"), "Comment"),
    # BadWord and BadNumber are catch-alls for words and numbers, so that
    # something like "abcDef" doesn't tokenize to [SnakeWord, CamelWord].
    #
    # This is preferable to returning an error because the BadWord and
    # BadNumber token types can be used in example-based errors.
    _T(re.compile("[0-9][bxBX]?[0-9a-fA-F_]*"), "BadNumber"),
    _T(re.compile("[a-zA-Z_$0-9]+"), "BadWord"),
]
del _T


def _tokenize_line(line, line_number, file_name):
  """Tokenizes a single line of input.

  Arguments:
    line: The line of text to tokenize.
    line_number: The line number (used when constructing token objects).
    file_name: The name of a file to use in errors.

  Returns:
    A tuple of:
      A list of token objects or None.
      A possibly-empty list of errors.
  """
  tokens = []
  offset = 0
  while offset < len(line):
    best_candidate = ""
    best_candidate_symbol = None
    # Find the longest match. Ties go to the first match. This way, keywords
    # ("struct") are matched as themselves, but words that only happen to start
    # with keywords ("structure") are matched as words.
    #
    # There is never a reason to try to match a literal after a regex that
    # could also match that literal, so check literals first.
    for literal in LITERAL_TOKEN_PATTERNS:
      if line[offset:].startswith(literal) and len(literal) > len(
          best_candidate):
        best_candidate = literal
        # For Emboss, the name of a literal token is just the literal in
        # quotes, so that the grammar can read a little more naturally, e.g.:
        #
        #     expression -> expression "+" expression
        #
        # instead of
        #
        #     expression -> expression Plus expression
        best_candidate_symbol = '"' + literal + '"'
    for pattern in REGEX_TOKEN_PATTERNS:
      match_result = pattern.regex.match(line[offset:])
      if match_result and len(match_result.group(0)) > len(best_candidate):
        best_candidate = match_result.group(0)
        best_candidate_symbol = pattern.symbol
    if not best_candidate:
      return None, [[error.error(
          file_name, parser_types.make_location(
              (line_number, offset + 1), (line_number, offset + 2)),
          "Unrecognized token")]]
    if best_candidate_symbol:
      tokens.append(parser_types.Token(
          best_candidate_symbol, best_candidate, parser_types.make_location(
              (line_number, offset + 1),
              (line_number, offset + len(best_candidate) + 1))))
    offset += len(best_candidate)
  return tokens, None
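

# A minimal, illustrative usage sketch: run this module directly (in a context
# where the compiler.util imports above resolve) to tokenize the same small,
# hypothetical .emb fragment used in the comment after tokenize() and print
# the grammar symbol assigned to each token. The fragment and the
# "example.emb" file name are made up for this demo.
if __name__ == "__main__":
  _demo_tokens, _demo_errors = tokenize(
      "struct Foo:\n  0 [+4]  UInt  bar\n", "example.emb")
  if _demo_errors:
    print(_demo_errors)
  else:
    for _demo_token in _demo_tokens:
      # Each token's .symbol is the name of the grammar symbol assigned to it,
      # e.g. '"struct"', "CamelWord", "Indent", "Number", "Dedent".
      print(_demo_token.symbol)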