xref: /aosp_15_r20/external/antlr/runtime/JavaScript/tests/functional/rhino-python.extensions (revision 16467b971bd3e2009fad32dd79016f2c7e421deb)
1*16467b97STreehugger Robot/*
2*16467b97STreehugger Robot [The "BSD licence"]
3*16467b97STreehugger Robot Copyright (c) 2004 Terence Parr and Loring Craymer
4*16467b97STreehugger Robot All rights reserved.
5*16467b97STreehugger Robot
6*16467b97STreehugger Robot Redistribution and use in source and binary forms, with or without
7*16467b97STreehugger Robot modification, are permitted provided that the following conditions
8*16467b97STreehugger Robot are met:
9*16467b97STreehugger Robot 1. Redistributions of source code must retain the above copyright
10*16467b97STreehugger Robot    notice, this list of conditions and the following disclaimer.
11*16467b97STreehugger Robot 2. Redistributions in binary form must reproduce the above copyright
12*16467b97STreehugger Robot    notice, this list of conditions and the following disclaimer in the
13*16467b97STreehugger Robot    documentation and/or other materials provided with the distribution.
14*16467b97STreehugger Robot 3. The name of the author may not be used to endorse or promote products
15*16467b97STreehugger Robot    derived from this software without specific prior written permission.
16*16467b97STreehugger Robot
17*16467b97STreehugger Robot THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18*16467b97STreehugger Robot IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19*16467b97STreehugger Robot OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20*16467b97STreehugger Robot IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21*16467b97STreehugger Robot INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22*16467b97STreehugger Robot NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23*16467b97STreehugger Robot DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24*16467b97STreehugger Robot THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25*16467b97STreehugger Robot (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26*16467b97STreehugger Robot THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27*16467b97STreehugger Robot*/
28*16467b97STreehugger Robot
29*16467b97STreehugger Robot
30*16467b97STreehugger Robot/** Python does not explicitly provide begin and end nesting signals.
31*16467b97STreehugger Robot Rather, the indentation level indicates when you begin and end.
32*16467b97STreehugger Robot This is an interesting lexical problem because multiple DEDENT
33*16467b97STreehugger Robot tokens should be sent to the parser sometimes without a corresponding
34*16467b97STreehugger Robot input symbol!  Consider the following example:
35*16467b97STreehugger Robot
36*16467b97STreehugger Robot a=1
37*16467b97STreehugger Robot if a>1:
38*16467b97STreehugger Robot     print a
39*16467b97STreehugger Robot b=3
40*16467b97STreehugger Robot
41*16467b97STreehugger Robot Here the "b" token on the left edge signals that a DEDENT is needed
42*16467b97STreehugger Robot after the "print a \n" and before the "b".  The sequence should be
43*16467b97STreehugger Robot
44*16467b97STreehugger Robot ... 1 COLON NEWLINE INDENT PRINT a NEWLINE DEDENT b ASSIGN 3 ...
45*16467b97STreehugger Robot
46*16467b97STreehugger Robot For more examples, see the big comment at the bottom of this file.
47*16467b97STreehugger Robot
48*16467b97STreehugger Robot This TokenStream normally just passes tokens through to the parser.
49*16467b97STreehugger Robot Upon NEWLINE token from the lexer, however, an INDENT or DEDENT token
50*16467b97STreehugger Robot may need to be sent to the parser.  The NEWLINE is the trigger for
51*16467b97STreehugger Robot this class to do it's job.  NEWLINE is saved and then the first token
52*16467b97STreehugger Robot of the next line is examined.  If non-leading-whitespace token,
53*16467b97STreehugger Robot then check against stack for indent vs dedent.  If LEADING_WS, then
54*16467b97STreehugger Robot the column of the next non-whitespace token will dictate indent vs
55*16467b97STreehugger Robot dedent.  The column of the next real token is number of spaces
56*16467b97STreehugger Robot in the LEADING_WS token + 1 (to move past the whitespace).  The
57*16467b97STreehugger Robot lexer grammar must set the text of the LEADING_WS token to be
58*16467b97STreehugger Robot the proper number of spaces (and do tab conversion etc...).
59*16467b97STreehugger Robot
60*16467b97STreehugger Robot A stack of column numbers is tracked and used to detect changes
61*16467b97STreehugger Robot in indent level from one token to the next.
62*16467b97STreehugger Robot
63*16467b97STreehugger Robot A queue of tokens is built up to hold multiple DEDENT tokens that
64*16467b97STreehugger Robot are generated.  Before asking the lexer for another token via
65*16467b97STreehugger Robot nextToken(), the queue is flushed first one token at a time.
66*16467b97STreehugger Robot
67*16467b97STreehugger Robot Terence Parr and Loring Craymer
68*16467b97STreehugger Robot February 2004
69*16467b97STreehugger Robot */
70*16467b97STreehugger RobotPythonTokenSource = function(stream) {
71*16467b97STreehugger Robot    this.stream = stream;
72*16467b97STreehugger Robot	/** The stack of indent levels (column numbers) */
73*16467b97STreehugger Robot	this.indentStack = new Array(PythonTokenSource.MAX_INDENTS);
74*16467b97STreehugger Robot	/** stack pointer */
75*16467b97STreehugger Robot	this.sp=-1; // grow upwards
76*16467b97STreehugger Robot
77*16467b97STreehugger Robot	/** The queue of tokens */
78*16467b97STreehugger Robot	this.tokens = [];
79*16467b97STreehugger Robot	this.lastTokenAddedIndex = -1;
80*16467b97STreehugger Robot	this.push(PythonTokenSource.FIRST_CHAR_POSITION);
81*16467b97STreehugger Robot};
82*16467b97STreehugger Robot
83*16467b97STreehugger RobotANTLR.lang.augmentObject(PythonTokenSource, {
84*16467b97STreehugger Robot	MAX_INDENTS: 100,
85*16467b97STreehugger Robot	FIRST_CHAR_POSITION: 0,
86*16467b97STreehugger Robot});
87*16467b97STreehugger Robot
88*16467b97STreehugger RobotPythonTokenSource.prototype = {
89*16467b97STreehugger Robot	getSourceName: function() {
90*16467b97STreehugger Robot		return this.stream.getSourceName();
91*16467b97STreehugger Robot	},
92*16467b97STreehugger Robot
93*16467b97STreehugger Robot	/** From http://www.python.org/doc/2.2.3/ref/indentation.html
94*16467b97STreehugger Robot
95*16467b97STreehugger Robot	 "Before the first line of the file is read, a single zero is
96*16467b97STreehugger Robot	 pushed on the stack; this will never be popped off again. The
97*16467b97STreehugger Robot	 numbers pushed on the stack will always be strictly increasing
98*16467b97STreehugger Robot	 from bottom to top. At the beginning of each logical line, the
99*16467b97STreehugger Robot	 line's indentation level is compared to the top of the
100*16467b97STreehugger Robot	 stack. If it is equal, nothing happens. If it is larger, it is
101*16467b97STreehugger Robot	 pushed on the stack, and one INDENT token is generated. If it
102*16467b97STreehugger Robot	 is smaller, it must be one of the numbers occurring on the
103*16467b97STreehugger Robot	 stack; all numbers on the stack that are larger are popped
104*16467b97STreehugger Robot	 off, and for each number popped off a DEDENT token is
105*16467b97STreehugger Robot	 generated. At the end of the file, a DEDENT token is generated
106*16467b97STreehugger Robot	 for each number remaining on the stack that is larger than
107*16467b97STreehugger Robot	 zero."
108*16467b97STreehugger Robot
109*16467b97STreehugger Robot	 I use char position in line 0..n-1 instead.
110*16467b97STreehugger Robot
111*16467b97STreehugger Robot	 The DEDENTS possibly needed at EOF are gracefully handled by forcing
112*16467b97STreehugger Robot	 EOF to have char pos 0 even though with UNIX it's hard to get EOF
113*16467b97STreehugger Robot	 at a non left edge.
114*16467b97STreehugger Robot	 */
115*16467b97STreehugger Robot	nextToken: function() {
116*16467b97STreehugger Robot		// if something in queue, just remove and return it
117*16467b97STreehugger Robot		if (this.tokens.length>0 ) {
118*16467b97STreehugger Robot			var t = this.tokens[0];
119*16467b97STreehugger Robot			this.tokens.splice(0,1);
120*16467b97STreehugger Robot			return t;
121*16467b97STreehugger Robot		}
122*16467b97STreehugger Robot
123*16467b97STreehugger Robot		this.insertImaginaryIndentDedentTokens();
124*16467b97STreehugger Robot
125*16467b97STreehugger Robot		return this.nextToken();
126*16467b97STreehugger Robot	},
127*16467b97STreehugger Robot
128*16467b97STreehugger Robot	insertImaginaryIndentDedentTokens: function()
129*16467b97STreehugger Robot	{
130*16467b97STreehugger Robot		var t = this.stream.LT(1);
131*16467b97STreehugger Robot		this.stream.consume();
132*16467b97STreehugger Robot
133*16467b97STreehugger Robot		// if not a NEWLINE, doesn't signal indent/dedent work; just enqueue
134*16467b97STreehugger Robot		if ( t.getType()!=PythonLexer.NEWLINE ) {
135*16467b97STreehugger Robot			var hiddenTokens = this.stream.getTokens(this.lastTokenAddedIndex+1,t.getTokenIndex()-1);
136*16467b97STreehugger Robot			if ( hiddenTokens!=null ) {
137*16467b97STreehugger Robot				this.tokens = this.tokens.concat(hiddenTokens);
138*16467b97STreehugger Robot			}
139*16467b97STreehugger Robot			this.lastTokenAddedIndex = t.getTokenIndex();
140*16467b97STreehugger Robot			this.tokens.push(t);
141*16467b97STreehugger Robot			return;
142*16467b97STreehugger Robot		}
143*16467b97STreehugger Robot
144*16467b97STreehugger Robot		// save NEWLINE in the queue
145*16467b97STreehugger Robot		var hiddenTokens = this.stream.getTokens(this.lastTokenAddedIndex+1,t.getTokenIndex()-1);
146*16467b97STreehugger Robot		if ( hiddenTokens!=null ) {
147*16467b97STreehugger Robot			this.tokens = this.tokens.concat(hiddenTokens);
148*16467b97STreehugger Robot		}
149*16467b97STreehugger Robot		this.lastTokenAddedIndex = t.getTokenIndex();
150*16467b97STreehugger Robot		this.tokens.push(t);
151*16467b97STreehugger Robot
152*16467b97STreehugger Robot		// grab first token of next line
153*16467b97STreehugger Robot		t = this.stream.LT(1);
154*16467b97STreehugger Robot		this.stream.consume();
155*16467b97STreehugger Robot
156*16467b97STreehugger Robot		hiddenTokens = this.stream.getTokens(this.lastTokenAddedIndex+1,t.getTokenIndex()-1);
157*16467b97STreehugger Robot		if ( hiddenTokens!=null ) {
158*16467b97STreehugger Robot			this.tokens = this.tokens.concat(hiddenTokens);
159*16467b97STreehugger Robot		}
160*16467b97STreehugger Robot		this.lastTokenAddedIndex = t.getTokenIndex();
161*16467b97STreehugger Robot
162*16467b97STreehugger Robot		// compute cpos as the char pos of next non-WS token in line
163*16467b97STreehugger Robot		var cpos = t.getCharPositionInLine(); // column dictates indent/dedent
164*16467b97STreehugger Robot		if ( t.getType()==ANTLR.runtime.Token.EOF ) {
165*16467b97STreehugger Robot			cpos = -1; // pretend EOF always happens at left edge
166*16467b97STreehugger Robot		}
167*16467b97STreehugger Robot		else if ( t.getType()==PythonLexer.LEADING_WS ) {
168*16467b97STreehugger Robot			cpos = t.getText().length;
169*16467b97STreehugger Robot		}
170*16467b97STreehugger Robot
171*16467b97STreehugger Robot		// compare to last indent level
172*16467b97STreehugger Robot		var lastIndent = this.peek();
173*16467b97STreehugger Robot		if ( cpos > lastIndent ) { // they indented; track and gen INDENT
174*16467b97STreehugger Robot			this.push(cpos);
175*16467b97STreehugger Robot			var indent = new ANTLR.runtime.CommonToken(PythonParser.INDENT, "");
176*16467b97STreehugger Robot			indent.setCharPositionInLine(t.getCharPositionInLine());
177*16467b97STreehugger Robot			indent.setLine(t.getLine());
178*16467b97STreehugger Robot			this.tokens.push(indent);
179*16467b97STreehugger Robot		}
180*16467b97STreehugger Robot		else if ( cpos < lastIndent ) { // they dedented
181*16467b97STreehugger Robot			// how far back did we dedent?
182*16467b97STreehugger Robot			var prevIndex = this.findPreviousIndent(cpos);
183*16467b97STreehugger Robot			// generate DEDENTs for each indent level we backed up over
184*16467b97STreehugger Robot			for (var d=this.sp-1; d>=prevIndex; d--) {
185*16467b97STreehugger Robot				var dedent = new ANTLR.runtime.CommonToken(PythonParser.DEDENT, "");
186*16467b97STreehugger Robot				dedent.setCharPositionInLine(t.getCharPositionInLine());
187*16467b97STreehugger Robot				dedent.setLine(t.getLine());
188*16467b97STreehugger Robot				this.tokens.push(dedent);
189*16467b97STreehugger Robot			}
190*16467b97STreehugger Robot			this.sp = prevIndex; // pop those off indent level
191*16467b97STreehugger Robot		}
192*16467b97STreehugger Robot		if ( t.getType()!=PythonLexer.LEADING_WS ) { // discard WS
193*16467b97STreehugger Robot			this.tokens.push(t);
194*16467b97STreehugger Robot		}
195*16467b97STreehugger Robot	},
196*16467b97STreehugger Robot
197*16467b97STreehugger Robot	//  T O K E N  S T A C K  M E T H O D S
198*16467b97STreehugger Robot
199*16467b97STreehugger Robot	push: function(i) {
200*16467b97STreehugger Robot		if (this.sp>=PythonTokenSource.MAX_INDENTS) {
201*16467b97STreehugger Robot			throw new Error("stack overflow");
202*16467b97STreehugger Robot		}
203*16467b97STreehugger Robot		this.sp++;
204*16467b97STreehugger Robot		this.indentStack[this.sp] = i;
205*16467b97STreehugger Robot	},
206*16467b97STreehugger Robot
207*16467b97STreehugger Robot	pop: function() {
208*16467b97STreehugger Robot		if (this.sp<0) {
209*16467b97STreehugger Robot			throw new Error("stack underflow");
210*16467b97STreehugger Robot		}
211*16467b97STreehugger Robot		var top = this.indentStack[this.sp];
212*16467b97STreehugger Robot		this.sp--;
213*16467b97STreehugger Robot		return top;
214*16467b97STreehugger Robot	},
215*16467b97STreehugger Robot
216*16467b97STreehugger Robot	peek: function() {
217*16467b97STreehugger Robot		return this.indentStack[this.sp];
218*16467b97STreehugger Robot	},
219*16467b97STreehugger Robot
220*16467b97STreehugger Robot	/** Return the index on stack of previous indent level == i else -1 */
221*16467b97STreehugger Robot	findPreviousIndent: function(i) {
222*16467b97STreehugger Robot		for (var j=this.sp-1; j>=0; j--) {
223*16467b97STreehugger Robot			if (this.indentStack[j]==i ) {
224*16467b97STreehugger Robot				return j;
225*16467b97STreehugger Robot			}
226*16467b97STreehugger Robot		}
227*16467b97STreehugger Robot		return PythonTokenSource.FIRST_CHAR_POSITION;
228*16467b97STreehugger Robot	},
229*16467b97STreehugger Robot
230*16467b97STreehugger Robot	stackString: function() {
231*16467b97STreehugger Robot		var buf = [];
232*16467b97STreehugger Robot		for (var j=this.sp; j>=0; j--) {
233*16467b97STreehugger Robot			buf.push(this.indentStack[j]);
234*16467b97STreehugger Robot		}
235*16467b97STreehugger Robot		return buf.join(" ");
236*16467b97STreehugger Robot	}
237*16467b97STreehugger Robot
238*16467b97STreehugger Robot}
239*16467b97STreehugger Robot
240*16467b97STreehugger Robot/* More example input / output pairs with code simplified to single chars
241*16467b97STreehugger Robot------- t1 -------
242*16467b97STreehugger Robota a
243*16467b97STreehugger Robot        b b
244*16467b97STreehugger Robot        c
245*16467b97STreehugger Robotd
246*16467b97STreehugger Robota a \n INDENT b b \n c \n DEDENT d \n EOF
247*16467b97STreehugger Robot------- t2 -------
248*16467b97STreehugger Robota  c
249*16467b97STreehugger Robot b
250*16467b97STreehugger Robotc
251*16467b97STreehugger Robota c \n INDENT b \n DEDENT c \n EOF
252*16467b97STreehugger Robot------- t3 -------
253*16467b97STreehugger Robota
254*16467b97STreehugger Robot        b
255*16467b97STreehugger Robot                c
256*16467b97STreehugger Robotd
257*16467b97STreehugger Robota \n INDENT b \n INDENT c \n DEDENT DEDENT d \n EOF
258*16467b97STreehugger Robot------- t4 -------
259*16467b97STreehugger Robota
260*16467b97STreehugger Robot    c
261*16467b97STreehugger Robot                  d
262*16467b97STreehugger Robot    e
263*16467b97STreehugger Robot    f
264*16467b97STreehugger Robot             g
265*16467b97STreehugger Robot             h
266*16467b97STreehugger Robot             i
267*16467b97STreehugger Robot              j
268*16467b97STreehugger Robot    k
269*16467b97STreehugger Robota \n INDENT c \n INDENT d \n DEDENT e \n f \n INDENT g \n h \n i \n INDENT j \n DEDENT DEDENT k \n DEDENT EOF
270*16467b97STreehugger Robot------- t5 -------
271*16467b97STreehugger Robota
272*16467b97STreehugger Robot        b
273*16467b97STreehugger Robot        c
274*16467b97STreehugger Robot                d
275*16467b97STreehugger Robot                e
276*16467b97STreehugger Robota \n INDENT b \n c \n INDENT d \n e \n DEDENT DEDENT EOF
277*16467b97STreehugger Robot*/
278