# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

15"""Tests for tokenizer."""
16
17import unittest
18from compiler.front_end import tokenizer
19from compiler.util import error
20from compiler.util import parser_types
21
22
23def _token_symbols(token_list):
24  """Given a list of tokens, returns a list of their symbol names."""
25  return [token.symbol for token in token_list]
26
27
28class TokenizerTest(unittest.TestCase):
29  """Tests for the tokenizer.tokenize function."""
30
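  # tokenizer.tokenize(text, file_name) returns a (tokens, errors) pair; on
  # failure the tests below expect no tokens and a list of error groups
  # (lists of error.error values) pointing at the offending source location.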
  def test_bad_indent_tab_versus_space(self):
    # A bad indent is one that does not match any indentation level that is
    # still open.
    tokens, errors = tokenizer.tokenize(" a\n\tb", "file")
    self.assertFalse(tokens)
    self.assertEqual([[error.error("file", parser_types.make_location(
        (2, 1), (2, 2)), "Bad indentation")]], errors)

  def test_bad_indent_tab_versus_eight_spaces(self):
    tokens, errors = tokenizer.tokenize("        a\n\tb", "file")
    self.assertFalse(tokens)
    self.assertEqual([[error.error("file", parser_types.make_location(
        (2, 1), (2, 2)), "Bad indentation")]], errors)

  def test_bad_indent_tab_versus_four_spaces(self):
    tokens, errors = tokenizer.tokenize("    a\n\tb", "file")
    self.assertFalse(tokens)
    self.assertEqual([[error.error("file", parser_types.make_location(
        (2, 1), (2, 2)), "Bad indentation")]], errors)

  def test_bad_indent_two_spaces_versus_one_space(self):
    tokens, errors = tokenizer.tokenize("  a\n b", "file")
    self.assertFalse(tokens)
    self.assertEqual([[error.error("file", parser_types.make_location(
        (2, 1), (2, 2)), "Bad indentation")]], errors)

  def test_bad_indent_matches_closed_indent(self):
    tokens, errors = tokenizer.tokenize(" a\nb\n  c\n d", "file")
    self.assertFalse(tokens)
    self.assertEqual([[error.error("file", parser_types.make_location(
        (4, 1), (4, 2)), "Bad indentation")]], errors)

  def test_bad_string_after_string_with_escaped_backslash_at_end(self):
    tokens, errors = tokenizer.tokenize(r'"\\""', "name")
    self.assertFalse(tokens)
    self.assertEqual([[error.error("name", parser_types.make_location(
        (1, 5), (1, 6)), "Unrecognized token")]], errors)


def _make_short_token_match_tests():
  """Makes tests for short, simple tokenization cases."""
  eol = '"\\n"'
  cases = {
      "Cam": ["CamelWord", eol],
      "Ca9": ["CamelWord", eol],
      "CanB": ["CamelWord", eol],
      "CanBee": ["CamelWord", eol],
      "CBa": ["CamelWord", eol],
      "cam": ["SnakeWord", eol],
      "ca9": ["SnakeWord", eol],
      "can_b": ["SnakeWord", eol],
      "can_bee": ["SnakeWord", eol],
      "c_ba": ["SnakeWord", eol],
      "cba_": ["SnakeWord", eol],
      "c_b_a_": ["SnakeWord", eol],
      "CAM": ["ShoutyWord", eol],
      "CA9": ["ShoutyWord", eol],
      "CAN_B": ["ShoutyWord", eol],
      "CAN_BEE": ["ShoutyWord", eol],
      "C_BA": ["ShoutyWord", eol],
      "C": ["BadWord", eol],
      "C1": ["BadWord", eol],
      "c": ["SnakeWord", eol],
      "$": ["BadWord", eol],
      "_": ["BadWord", eol],
      "_a": ["BadWord", eol],
      "_A": ["BadWord", eol],
      "Cb_A": ["BadWord", eol],
      "aCb": ["BadWord", eol],
      "a  b": ["SnakeWord", "SnakeWord", eol],
      "a\tb": ["SnakeWord", "SnakeWord", eol],
      "a \t b ": ["SnakeWord", "SnakeWord", eol],
      " \t ": [eol],
      "a #b": ["SnakeWord", "Comment", eol],
      "a#": ["SnakeWord", "Comment", eol],
      "# b": ["Comment", eol],
      "    # b": ["Comment", eol],
      "    #": ["Comment", eol],
      "": [],
      "\n": [eol],
      "\na": [eol, "SnakeWord", eol],
      "a--example": ["SnakeWord", "BadDocumentation", eol],
      "a ---- example": ["SnakeWord", "BadDocumentation", eol],
      "a --- example": ["SnakeWord", "BadDocumentation", eol],
      "a-- example": ["SnakeWord", "Documentation", eol],
      "a --    -- example": ["SnakeWord", "Documentation", eol],
      "a -- - example": ["SnakeWord", "Documentation", eol],
      "--": ["Documentation", eol],
      "-- ": ["Documentation", eol],
      "--  ": ["Documentation", eol],
      "$default": ['"$default"', eol],
      "$defaultx": ["BadWord", eol],
      "$def": ["BadWord", eol],
      "x$default": ["BadWord", eol],
      "9$default": ["BadWord", eol],
      "struct": ['"struct"', eol],
      "external": ['"external"', eol],
      "bits": ['"bits"', eol],
      "enum": ['"enum"', eol],
      "as": ['"as"', eol],
      "import": ['"import"', eol],
      "true": ["BooleanConstant", eol],
      "false": ["BooleanConstant", eol],
      "truex": ["SnakeWord", eol],
      "falsex": ["SnakeWord", eol],
      "structx": ["SnakeWord", eol],
      "bitsx": ["SnakeWord", eol],
      "enumx": ["SnakeWord", eol],
      "0b": ["BadNumber", eol],
      "0x": ["BadNumber", eol],
      "0b011101": ["Number", eol],
      "0b0": ["Number", eol],
      "0b0111_1111_0000": ["Number", eol],
      "0b00_000_00": ["BadNumber", eol],
      "0b0_0_0": ["BadNumber", eol],
      "0b0111012": ["BadNumber", eol],
      "0b011101x": ["BadWord", eol],
      "0b011101b": ["BadNumber", eol],
      "0B0": ["BadNumber", eol],
      "0X0": ["BadNumber", eol],
      "0b_": ["BadNumber", eol],
      "0x_": ["BadNumber", eol],
      "0b__": ["BadNumber", eol],
      "0x__": ["BadNumber", eol],
      "0b_0000": ["Number", eol],
      "0b0000_": ["BadNumber", eol],
      "0b00_____00": ["BadNumber", eol],
      "0x00_000_00": ["BadNumber", eol],
      "0x0_0_0": ["BadNumber", eol],
      "0b____0____": ["BadNumber", eol],
      "0b00000000000000000000": ["Number", eol],
      "0b_00000000": ["Number", eol],
      "0b0000_0000_0000": ["Number", eol],
      "0b000_0000_0000": ["Number", eol],
      "0b00_0000_0000": ["Number", eol],
      "0b0_0000_0000": ["Number", eol],
      "0b_0000_0000_0000": ["Number", eol],
      "0b_000_0000_0000": ["Number", eol],
      "0b_00_0000_0000": ["Number", eol],
      "0b_0_0000_0000": ["Number", eol],
      "0b00000000_00000000_00000000": ["Number", eol],
      "0b0000000_00000000_00000000": ["Number", eol],
      "0b000000_00000000_00000000": ["Number", eol],
      "0b00000_00000000_00000000": ["Number", eol],
      "0b0000_00000000_00000000": ["Number", eol],
      "0b000_00000000_00000000": ["Number", eol],
      "0b00_00000000_00000000": ["Number", eol],
      "0b0_00000000_00000000": ["Number", eol],
      "0b_00000000_00000000_00000000": ["Number", eol],
      "0b_0000000_00000000_00000000": ["Number", eol],
      "0b_000000_00000000_00000000": ["Number", eol],
      "0b_00000_00000000_00000000": ["Number", eol],
      "0b_0000_00000000_00000000": ["Number", eol],
      "0b_000_00000000_00000000": ["Number", eol],
      "0b_00_00000000_00000000": ["Number", eol],
      "0b_0_00000000_00000000": ["Number", eol],
      "0x0": ["Number", eol],
      "0x00000000000000000000": ["Number", eol],
      "0x_0000": ["Number", eol],
      "0x_00000000": ["Number", eol],
      "0x0000_0000_0000": ["Number", eol],
      "0x000_0000_0000": ["Number", eol],
      "0x00_0000_0000": ["Number", eol],
      "0x0_0000_0000": ["Number", eol],
      "0x_0000_0000_0000": ["Number", eol],
      "0x_000_0000_0000": ["Number", eol],
      "0x_00_0000_0000": ["Number", eol],
      "0x_0_0000_0000": ["Number", eol],
      "0x00000000_00000000_00000000": ["Number", eol],
      "0x0000000_00000000_00000000": ["Number", eol],
      "0x000000_00000000_00000000": ["Number", eol],
      "0x00000_00000000_00000000": ["Number", eol],
      "0x0000_00000000_00000000": ["Number", eol],
      "0x000_00000000_00000000": ["Number", eol],
      "0x00_00000000_00000000": ["Number", eol],
      "0x0_00000000_00000000": ["Number", eol],
      "0x_00000000_00000000_00000000": ["Number", eol],
      "0x_0000000_00000000_00000000": ["Number", eol],
      "0x_000000_00000000_00000000": ["Number", eol],
      "0x_00000_00000000_00000000": ["Number", eol],
      "0x_0000_00000000_00000000": ["Number", eol],
      "0x_000_00000000_00000000": ["Number", eol],
      "0x_00_00000000_00000000": ["Number", eol],
      "0x_0_00000000_00000000": ["Number", eol],
      "0x__00000000_00000000": ["BadNumber", eol],
      "0x00000000_00000000_0000": ["BadNumber", eol],
      "0x00000000_0000_0000": ["BadNumber", eol],
      "0x_00000000000000000000": ["BadNumber", eol],
      "0b_00000000000000000000": ["BadNumber", eol],
      "0b00000000_00000000_0000": ["BadNumber", eol],
      "0b00000000_0000_0000": ["BadNumber", eol],
      "0x0000_": ["BadNumber", eol],
      "0x00_____00": ["BadNumber", eol],
      "0x____0____": ["BadNumber", eol],
      "EmbossReserved": ["BadWord", eol],
      "EmbossReservedA": ["BadWord", eol],
      "EmbossReserved_": ["BadWord", eol],
      "EMBOSS_RESERVED": ["BadWord", eol],
      "EMBOSS_RESERVED_": ["BadWord", eol],
      "EMBOSS_RESERVEDA": ["BadWord", eol],
      "emboss_reserved": ["BadWord", eol],
      "emboss_reserved_": ["BadWord", eol],
      "emboss_reserveda": ["BadWord", eol],
      "0x0123456789abcdefABCDEF": ["Number", eol],
      "0": ["Number", eol],
      "1": ["Number", eol],
      "1a": ["BadNumber", eol],
      "1g": ["BadWord", eol],
      "1234567890": ["Number", eol],
      "1_234_567_890": ["Number", eol],
      "234_567_890": ["Number", eol],
      "34_567_890": ["Number", eol],
      "4_567_890": ["Number", eol],
      "1_2_3_4_5_6_7_8_9_0": ["BadNumber", eol],
      "1234567890_": ["BadNumber", eol],
      "1__234567890": ["BadNumber", eol],
      "_1234567890": ["BadWord", eol],
      "[]": ['"["', '"]"', eol],
      "()": ['"("', '")"', eol],
      "..": ['"."', '"."', eol],
      "...": ['"."', '"."', '"."', eol],
      "....": ['"."', '"."', '"."', '"."', eol],
      '"abc"': ["String", eol],
      '""': ["String", eol],
      r'"\\"': ["String", eol],
      r'"\""': ["String", eol],
      r'"\n"': ["String", eol],
      r'"\\n"': ["String", eol],
      r'"\\xyz"': ["String", eol],
      r'"\\\\"': ["String", eol],
  }
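  # Operators and keywords tokenize to a symbol that is the token text itself
  # wrapped in double quotes, e.g. "+" tokenizes to the symbol '"+"'.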
261  for c in ("[ ] ( ) ? : = + - * . == != < <= > >= && || , $max $present "
262            "$upper_bound $lower_bound $size_in_bits $size_in_bytes "
263            "$max_size_in_bits $max_size_in_bytes $min_size_in_bits "
264            "$min_size_in_bytes "
265            "$default struct bits enum external import as if let").split():
266    cases[c] = ['"' + c + '"', eol]
267
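  # A factory function is used so that each generated test method closes over
  # its own `case` value rather than the final value of the loop variable.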
  def make_test_case(case):

    def test_case(self):
      tokens, errors = tokenizer.tokenize(case, "name")
      symbols = _token_symbols(tokens)
      self.assertFalse(errors)
      self.assertEqual(symbols, cases[case])

    return test_case

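  # Register one generated test method per case; {!r} embeds the repr of the
  # input in the method name so a failure identifies the exact case.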
  for c in cases:
    setattr(TokenizerTest, "testShortTokenMatch{!r}".format(c),
            make_test_case(c))


def _make_bad_char_tests():
  """Makes tests that an error is returned for bad characters."""

  def make_test_case(case):

    def test_case(self):
      tokens, errors = tokenizer.tokenize(case, "name")
      self.assertFalse(tokens)
      self.assertEqual([[error.error("name", parser_types.make_location(
          (1, 1), (1, 2)), "Unrecognized token")]], errors)

    return test_case

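  # None of these characters can start a token on its own, so each should
  # produce an "Unrecognized token" error at line 1, column 1.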
296  for c in "~`!@%^&\\|;'\"/{}":
297    setattr(TokenizerTest, "testBadChar{!r}".format(c), make_test_case(c))
298
299
300def _make_bad_string_tests():
301  """Makes tests that an error is returned for bad strings."""
  bad_strings = (r'"\"', '"\\\n"', r'"\\\"', r'"', r'"\q"', r'"\\\q"')

  def make_test_case(string):

    def test_case(self):
      tokens, errors = tokenizer.tokenize(string, "name")
      self.assertFalse(tokens)
      self.assertEqual([[error.error("name", parser_types.make_location(
          (1, 1), (1, 2)), "Unrecognized token")]], errors)

    return test_case

  for s in bad_strings:
    setattr(TokenizerTest, "testBadString{!r}".format(s), make_test_case(s))


def _make_multiline_tests():
  """Makes tests for indent/dedent insertion and eol insertion."""

  c = "Comment"
  eol = '"\\n"'
  sw = "SnakeWord"
  ind = "Indent"
  ded = "Dedent"
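  # Each case maps source text to its expected token symbols: the tokenizer
  # synthesizes Indent/Dedent tokens around indentation changes and closes
  # any still-open indents with Dedents at end of input.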
  cases = {
      "a\nb\n": [sw, eol, sw, eol],
      "a\n\nb\n": [sw, eol, eol, sw, eol],
      "a\n#foo\nb\n": [sw, eol, c, eol, sw, eol],
      "a\n   #foo\nb\n": [sw, eol, c, eol, sw, eol],
      "a\n b\n": [sw, eol, ind, sw, eol, ded],
      "a\n b\n\n": [sw, eol, ind, sw, eol, eol, ded],
      "a\n b\n  c\n": [sw, eol, ind, sw, eol, ind, sw, eol, ded, ded],
      "a\n b\n c\n": [sw, eol, ind, sw, eol, sw, eol, ded],
      "a\n b\n\n c\n": [sw, eol, ind, sw, eol, eol, sw, eol, ded],
      "a\n b\n    #\n c\n": [sw, eol, ind, sw, eol, c, eol, sw, eol, ded],
      "a\n\tb\n    #\n\tc\n": [sw, eol, ind, sw, eol, c, eol, sw, eol, ded],
      " a\n  b\n   c\n d\n": [ind, sw, eol, ind, sw, eol, ind, sw, eol, ded,
                              ded, sw, eol, ded],
  }

  def make_test_case(case):

    def test_case(self):
      tokens, errors = tokenizer.tokenize(case, "file")
      self.assertFalse(errors)
      self.assertEqual(_token_symbols(tokens), cases[case])

    return test_case

  for c in cases:
    setattr(TokenizerTest, "testMultiline{!r}".format(c), make_test_case(c))


def _make_offset_tests():
  """Makes tests that the tokenizer fills in correct source locations."""
  cases = {
      "a+": ["1:1-1:2", "1:2-1:3", "1:3-1:3"],
      "a   +   ": ["1:1-1:2", "1:5-1:6", "1:9-1:9"],
      "a\n\nb": ["1:1-1:2", "1:2-1:2", "2:1-2:1", "3:1-3:2", "3:2-3:2"],
      "a\n  b": ["1:1-1:2", "1:2-1:2", "2:1-2:3", "2:3-2:4", "2:4-2:4",
                 "3:1-3:1"],
      "a\n  b\nc": ["1:1-1:2", "1:2-1:2", "2:1-2:3", "2:3-2:4", "2:4-2:4",
                    "3:1-3:1", "3:1-3:2", "3:2-3:2"],
      "a\n b\n  c": ["1:1-1:2", "1:2-1:2", "2:1-2:2", "2:2-2:3", "2:3-2:3",
                     "3:2-3:3", "3:3-3:4", "3:4-3:4", "4:1-4:1", "4:1-4:1"],
  }

  def make_test_case(case):

    def test_case(self):
      self.assertEqual([parser_types.format_location(l.source_location)
                        for l in tokenizer.tokenize(case, "file")[0]],
                       cases[case])

    return test_case

  for c in cases:
    setattr(TokenizerTest, "testOffset{!r}".format(c), make_test_case(c))

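# Generate and attach all of the parameterized test methods to TokenizerTest
# before unittest collects them.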
_make_short_token_match_tests()
_make_bad_char_tests()
_make_bad_string_tests()
_make_multiline_tests()
_make_offset_tests()

if __name__ == "__main__":
  unittest.main()
389