# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tests for tokenizer."""

import unittest
from compiler.front_end import tokenizer
from compiler.util import error
from compiler.util import parser_types


def _token_symbols(token_list):
  """Given a list of tokens, returns a list of their symbol names."""
  return [token.symbol for token in token_list]


class TokenizerTest(unittest.TestCase):
  """Tests for the tokenizer.tokenize function."""

  def test_bad_indent_tab_versus_space(self):
    # A bad indent is one that doesn't match a previous unmatched indent.
    tokens, errors = tokenizer.tokenize("  a\n\tb", "file")
    self.assertFalse(tokens)
    self.assertEqual([[error.error("file", parser_types.make_location(
        (2, 1), (2, 2)), "Bad indentation")]], errors)

  def test_bad_indent_tab_versus_eight_spaces(self):
    tokens, errors = tokenizer.tokenize("        a\n\tb", "file")
    self.assertFalse(tokens)
    self.assertEqual([[error.error("file", parser_types.make_location(
        (2, 1), (2, 2)), "Bad indentation")]], errors)

  def test_bad_indent_tab_versus_four_spaces(self):
    tokens, errors = tokenizer.tokenize("    a\n\tb", "file")
    self.assertFalse(tokens)
    self.assertEqual([[error.error("file", parser_types.make_location(
        (2, 1), (2, 2)), "Bad indentation")]], errors)

  def test_bad_indent_two_spaces_versus_one_space(self):
    tokens, errors = tokenizer.tokenize("  a\n b", "file")
    self.assertFalse(tokens)
    self.assertEqual([[error.error("file", parser_types.make_location(
        (2, 1), (2, 2)), "Bad indentation")]], errors)

  def test_bad_indent_matches_closed_indent(self):
    tokens, errors = tokenizer.tokenize(" a\nb\n  c\n d", "file")
    self.assertFalse(tokens)
    self.assertEqual([[error.error("file", parser_types.make_location(
        (4, 1), (4, 2)), "Bad indentation")]], errors)

  def test_bad_string_after_string_with_escaped_backslash_at_end(self):
    tokens, errors = tokenizer.tokenize(r'"\\""', "name")
    self.assertFalse(tokens)
    self.assertEqual([[error.error("name", parser_types.make_location(
        (1, 5), (1, 6)), "Unrecognized token")]], errors)


def _make_short_token_match_tests():
  """Makes tests for short, simple tokenization cases."""
  eol = '"\\n"'
  cases = {
      "Cam": ["CamelWord", eol],
      "Ca9": ["CamelWord", eol],
      "CanB": ["CamelWord", eol],
      "CanBee": ["CamelWord", eol],
      "CBa": ["CamelWord", eol],
      "cam": ["SnakeWord", eol],
      "ca9": ["SnakeWord", eol],
      "can_b": ["SnakeWord", eol],
      "can_bee": ["SnakeWord", eol],
      "c_ba": ["SnakeWord", eol],
      "cba_": ["SnakeWord", eol],
      "c_b_a_": ["SnakeWord", eol],
      "CAM": ["ShoutyWord", eol],
      "CA9": ["ShoutyWord", eol],
      "CAN_B": ["ShoutyWord", eol],
      "CAN_BEE": ["ShoutyWord", eol],
      "C_BA": ["ShoutyWord", eol],
      "C": ["BadWord", eol],
      "C1": ["BadWord", eol],
      "c": ["SnakeWord", eol],
      "$": ["BadWord", eol],
      "_": ["BadWord", eol],
      "_a": ["BadWord", eol],
      "_A": ["BadWord", eol],
      "Cb_A": ["BadWord", eol],
      "aCb": ["BadWord", eol],
      "a b": ["SnakeWord", "SnakeWord", eol],
      "a\tb": ["SnakeWord", "SnakeWord", eol],
      "a \t b ": ["SnakeWord", "SnakeWord", eol],
      " \t ": [eol],
      "a #b": ["SnakeWord", "Comment", eol],
      "a#": ["SnakeWord", "Comment", eol],
      "# b": ["Comment", eol],
      " # b": ["Comment", eol],
      " #": ["Comment", eol],
      "": [],
      "\n": [eol],
      "\na": [eol, "SnakeWord", eol],
      "a--example": ["SnakeWord", "BadDocumentation", eol],
      "a ---- example": ["SnakeWord", "BadDocumentation", eol],
      "a --- example": ["SnakeWord", "BadDocumentation", eol],
      "a-- example": ["SnakeWord", "Documentation", eol],
      "a -- -- example": ["SnakeWord", "Documentation", eol],
      "a -- - example": ["SnakeWord", "Documentation", eol],
      "--": ["Documentation", eol],
      "-- ": ["Documentation", eol],
      "--  ": ["Documentation", eol],
      "$default": ['"$default"', eol],
      "$defaultx": ["BadWord", eol],
      "$def": ["BadWord", eol],
      "x$default": ["BadWord", eol],
      "9$default": ["BadWord", eol],
      "struct": ['"struct"', eol],
      "external": ['"external"', eol],
      "bits": ['"bits"', eol],
      "enum": ['"enum"', eol],
      "as": ['"as"', eol],
      "import": ['"import"', eol],
      "true": ["BooleanConstant", eol],
      "false": ["BooleanConstant", eol],
      "truex": ["SnakeWord", eol],
      "falsex": ["SnakeWord", eol],
      "structx": ["SnakeWord", eol],
      "bitsx": ["SnakeWord", eol],
      "enumx": ["SnakeWord", eol],
      "0b": ["BadNumber", eol],
      "0x": ["BadNumber", eol],
      "0b011101": ["Number", eol],
      "0b0": ["Number", eol],
      "0b0111_1111_0000": ["Number", eol],
      "0b00_000_00": ["BadNumber", eol],
      "0b0_0_0": ["BadNumber", eol],
      "0b0111012": ["BadNumber", eol],
      "0b011101x": ["BadWord", eol],
      "0b011101b": ["BadNumber", eol],
      "0B0": ["BadNumber", eol],
      "0X0": ["BadNumber", eol],
      "0b_": ["BadNumber", eol],
      "0x_": ["BadNumber", eol],
      "0b__": ["BadNumber", eol],
      "0x__": ["BadNumber", eol],
      "0b_0000": ["Number", eol],
      "0b0000_": ["BadNumber", eol],
      "0b00_____00": ["BadNumber", eol],
      "0x00_000_00": ["BadNumber", eol],
      "0x0_0_0": ["BadNumber", eol],
      "0b____0____": ["BadNumber", eol],
      "0b00000000000000000000": ["Number", eol],
      "0b_00000000": ["Number", eol],
      "0b0000_0000_0000": ["Number", eol],
      "0b000_0000_0000": ["Number", eol],
      "0b00_0000_0000": ["Number", eol],
      "0b0_0000_0000": ["Number", eol],
      "0b_0000_0000_0000": ["Number", eol],
      "0b_000_0000_0000": ["Number", eol],
      "0b_00_0000_0000": ["Number", eol],
      "0b_0_0000_0000": ["Number", eol],
      "0b00000000_00000000_00000000": ["Number", eol],
      "0b0000000_00000000_00000000": ["Number", eol],
      "0b000000_00000000_00000000": ["Number", eol],
      "0b00000_00000000_00000000": ["Number", eol],
      "0b0000_00000000_00000000": ["Number", eol],
      "0b000_00000000_00000000": ["Number", eol],
      "0b00_00000000_00000000": ["Number", eol],
      "0b0_00000000_00000000": ["Number", eol],
      "0b_00000000_00000000_00000000": ["Number", eol],
      "0b_0000000_00000000_00000000": ["Number", eol],
      "0b_000000_00000000_00000000": ["Number", eol],
      "0b_00000_00000000_00000000": ["Number", eol],
      "0b_0000_00000000_00000000": ["Number", eol],
      "0b_000_00000000_00000000": ["Number", eol],
      "0b_00_00000000_00000000": ["Number", eol],
      "0b_0_00000000_00000000": ["Number", eol],
      "0x0": ["Number", eol],
      "0x00000000000000000000": ["Number", eol],
      "0x_0000": ["Number", eol],
      "0x_00000000": ["Number", eol],
      "0x0000_0000_0000": ["Number", eol],
      "0x000_0000_0000": ["Number", eol],
      "0x00_0000_0000": ["Number", eol],
      "0x0_0000_0000": ["Number", eol],
      "0x_0000_0000_0000": ["Number", eol],
      "0x_000_0000_0000": ["Number", eol],
      "0x_00_0000_0000": ["Number", eol],
      "0x_0_0000_0000": ["Number", eol],
      "0x00000000_00000000_00000000": ["Number", eol],
      "0x0000000_00000000_00000000": ["Number", eol],
      "0x000000_00000000_00000000": ["Number", eol],
      "0x00000_00000000_00000000": ["Number", eol],
      "0x0000_00000000_00000000": ["Number", eol],
      "0x000_00000000_00000000": ["Number", eol],
      "0x00_00000000_00000000": ["Number", eol],
      "0x0_00000000_00000000": ["Number", eol],
      "0x_00000000_00000000_00000000": ["Number", eol],
      "0x_0000000_00000000_00000000": ["Number", eol],
      "0x_000000_00000000_00000000": ["Number", eol],
      "0x_00000_00000000_00000000": ["Number", eol],
      "0x_0000_00000000_00000000": ["Number", eol],
      "0x_000_00000000_00000000": ["Number", eol],
      "0x_00_00000000_00000000": ["Number", eol],
      "0x_0_00000000_00000000": ["Number", eol],
      "0x__00000000_00000000": ["BadNumber", eol],
      "0x00000000_00000000_0000": ["BadNumber", eol],
      "0x00000000_0000_0000": ["BadNumber", eol],
      "0x_00000000000000000000": ["BadNumber", eol],
      "0b_00000000000000000000": ["BadNumber", eol],
      "0b00000000_00000000_0000": ["BadNumber", eol],
      "0b00000000_0000_0000": ["BadNumber", eol],
      "0x0000_": ["BadNumber", eol],
      "0x00_____00": ["BadNumber", eol],
      "0x____0____": ["BadNumber", eol],
      "EmbossReserved": ["BadWord", eol],
      "EmbossReservedA": ["BadWord", eol],
      "EmbossReserved_": ["BadWord", eol],
      "EMBOSS_RESERVED": ["BadWord", eol],
      "EMBOSS_RESERVED_": ["BadWord", eol],
      "EMBOSS_RESERVEDA": ["BadWord", eol],
      "emboss_reserved": ["BadWord", eol],
      "emboss_reserved_": ["BadWord", eol],
      "emboss_reserveda": ["BadWord", eol],
      "0x0123456789abcdefABCDEF": ["Number", eol],
      "0": ["Number", eol],
      "1": ["Number", eol],
      "1a": ["BadNumber", eol],
      "1g": ["BadWord", eol],
      "1234567890": ["Number", eol],
      "1_234_567_890": ["Number", eol],
      "234_567_890": ["Number", eol],
      "34_567_890": ["Number", eol],
      "4_567_890": ["Number", eol],
      "1_2_3_4_5_6_7_8_9_0": ["BadNumber", eol],
      "1234567890_": ["BadNumber", eol],
      "1__234567890": ["BadNumber", eol],
      "_1234567890": ["BadWord", eol],
      "[]": ['"["', '"]"', eol],
      "()": ['"("', '")"', eol],
      "..": ['"."', '"."', eol],
      "...": ['"."', '"."', '"."', eol],
      "....": ['"."', '"."', '"."', '"."', eol],
      '"abc"': ["String", eol],
      '""': ["String", eol],
      r'"\\"': ["String", eol],
      r'"\""': ["String", eol],
      r'"\n"': ["String", eol],
      r'"\\n"': ["String", eol],
      r'"\\xyz"': ["String", eol],
      r'"\\\\"': ["String", eol],
  }
  for c in ("[ ] ( ) ? : = + - * . == != < <= > >= && || , $max $present "
            "$upper_bound $lower_bound $size_in_bits $size_in_bytes "
            "$max_size_in_bits $max_size_in_bytes $min_size_in_bits "
            "$min_size_in_bytes "
            "$default struct bits enum external import as if let").split():
    cases[c] = ['"' + c + '"', eol]

  def make_test_case(case):

    def test_case(self):
      tokens, errors = tokenizer.tokenize(case, "name")
      symbols = _token_symbols(tokens)
      self.assertFalse(errors)
      self.assertEqual(symbols, cases[case])

    return test_case

  for c in cases:
    setattr(TokenizerTest, "testShortTokenMatch{!r}".format(c),
            make_test_case(c))


def _make_bad_char_tests():
  """Makes tests that an error is returned for bad characters."""

  def make_test_case(case):

    def test_case(self):
      tokens, errors = tokenizer.tokenize(case, "name")
      self.assertFalse(tokens)
      self.assertEqual([[error.error("name", parser_types.make_location(
          (1, 1), (1, 2)), "Unrecognized token")]], errors)

    return test_case

  for c in "~`!@%^&\\|;'\"/{}":
    setattr(TokenizerTest, "testBadChar{!r}".format(c), make_test_case(c))


def _make_bad_string_tests():
  """Makes tests that an error is returned for bad strings."""
  bad_strings = (r'"\"', '"\\\n"', r'"\\\"', r'"', r'"\q"', r'"\\\q"')

  def make_test_case(string):

    def test_case(self):
      tokens, errors = tokenizer.tokenize(string, "name")
      self.assertFalse(tokens)
      self.assertEqual([[error.error("name", parser_types.make_location(
          (1, 1), (1, 2)), "Unrecognized token")]], errors)

    return test_case

  for s in bad_strings:
    setattr(TokenizerTest, "testBadString{!r}".format(s), make_test_case(s))


def _make_multiline_tests():
  """Makes tests for indent/dedent insertion and eol insertion."""

  c = "Comment"
  eol = '"\\n"'
  sw = "SnakeWord"
  ind = "Indent"
  ded = "Dedent"
  cases = {
      "a\nb\n": [sw, eol, sw, eol],
      "a\n\nb\n": [sw, eol, eol, sw, eol],
      "a\n#foo\nb\n": [sw, eol, c, eol, sw, eol],
      "a\n #foo\nb\n": [sw, eol, c, eol, sw, eol],
      "a\n b\n": [sw, eol, ind, sw, eol, ded],
      "a\n b\n\n": [sw, eol, ind, sw, eol, eol, ded],
      "a\n b\n  c\n": [sw, eol, ind, sw, eol, ind, sw, eol, ded, ded],
      "a\n b\n c\n": [sw, eol, ind, sw, eol, sw, eol, ded],
      "a\n b\n\n c\n": [sw, eol, ind, sw, eol, eol, sw, eol, ded],
      "a\n b\n #\n c\n": [sw, eol, ind, sw, eol, c, eol, sw, eol, ded],
      "a\n\tb\n #\n\tc\n": [sw, eol, ind, sw, eol, c, eol, sw, eol, ded],
      " a\n  b\n   c\n d\n": [ind, sw, eol, ind, sw, eol, ind, sw, eol, ded,
                              ded, sw, eol, ded],
  }

  def make_test_case(case):

    def test_case(self):
      tokens, errors = tokenizer.tokenize(case, "file")
      self.assertFalse(errors)
      self.assertEqual(_token_symbols(tokens), cases[case])

    return test_case

  for c in cases:
    setattr(TokenizerTest, "testMultiline{!r}".format(c), make_test_case(c))


def _make_offset_tests():
  """Makes tests that the tokenizer fills in correct source locations."""
  cases = {
      "a+": ["1:1-1:2", "1:2-1:3", "1:3-1:3"],
      "a   +   ": ["1:1-1:2", "1:5-1:6", "1:9-1:9"],
      "a\n\nb": ["1:1-1:2", "1:2-1:2", "2:1-2:1", "3:1-3:2", "3:2-3:2"],
      "a\n  b": ["1:1-1:2", "1:2-1:2", "2:1-2:3", "2:3-2:4", "2:4-2:4",
                 "3:1-3:1"],
      "a\n  b\nc": ["1:1-1:2", "1:2-1:2", "2:1-2:3", "2:3-2:4", "2:4-2:4",
                    "3:1-3:1", "3:1-3:2", "3:2-3:2"],
      "a\n b\n  c": ["1:1-1:2", "1:2-1:2", "2:1-2:2", "2:2-2:3", "2:3-2:3",
                     "3:2-3:3", "3:3-3:4", "3:4-3:4", "4:1-4:1", "4:1-4:1"],
  }

  def make_test_case(case):

    def test_case(self):
      self.assertEqual([parser_types.format_location(l.source_location)
                        for l in tokenizer.tokenize(case, "file")[0]],
                       cases[case])

    return test_case

  for c in cases:
    setattr(TokenizerTest, "testOffset{!r}".format(c), make_test_case(c))


_make_short_token_match_tests()
_make_bad_char_tests()
_make_bad_string_tests()
_make_multiline_tests()
_make_offset_tests()

if __name__ == "__main__":
  unittest.main()