xref: /aosp_15_r20/external/antlr/runtime/Cpp/include/antlr3baserecognizer.hpp (revision 16467b971bd3e2009fad32dd79016f2c7e421deb)
1*16467b97STreehugger Robot /** \file
2*16467b97STreehugger Robot  * Defines the basic structure to support recognizing by either a lexer,
3*16467b97STreehugger Robot  * parser, or tree parser.
4*16467b97STreehugger Robot  * \addtogroup BaseRecognizer
5*16467b97STreehugger Robot  * @{
6*16467b97STreehugger Robot  */
7*16467b97STreehugger Robot #ifndef	_ANTLR3_BASERECOGNIZER_HPP
8*16467b97STreehugger Robot #define	_ANTLR3_BASERECOGNIZER_HPP
9*16467b97STreehugger Robot 
10*16467b97STreehugger Robot // [The "BSD licence"]
11*16467b97STreehugger Robot // Copyright (c) 2005-2009 Gokulakannan Somasundaram, ElectronDB
12*16467b97STreehugger Robot 
13*16467b97STreehugger Robot //
14*16467b97STreehugger Robot // All rights reserved.
15*16467b97STreehugger Robot //
16*16467b97STreehugger Robot // Redistribution and use in source and binary forms, with or without
17*16467b97STreehugger Robot // modification, are permitted provided that the following conditions
18*16467b97STreehugger Robot // are met:
19*16467b97STreehugger Robot // 1. Redistributions of source code must retain the above copyright
20*16467b97STreehugger Robot //    notice, this list of conditions and the following disclaimer.
21*16467b97STreehugger Robot // 2. Redistributions in binary form must reproduce the above copyright
22*16467b97STreehugger Robot //    notice, this list of conditions and the following disclaimer in the
23*16467b97STreehugger Robot //    documentation and/or other materials provided with the distribution.
24*16467b97STreehugger Robot // 3. The name of the author may not be used to endorse or promote products
25*16467b97STreehugger Robot //    derived from this software without specific prior written permission.
26*16467b97STreehugger Robot //
27*16467b97STreehugger Robot // THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
28*16467b97STreehugger Robot // IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
29*16467b97STreehugger Robot // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
30*16467b97STreehugger Robot // IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
31*16467b97STreehugger Robot // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
32*16467b97STreehugger Robot // NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
33*16467b97STreehugger Robot // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
34*16467b97STreehugger Robot // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
35*16467b97STreehugger Robot // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
36*16467b97STreehugger Robot // THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37*16467b97STreehugger Robot 
38*16467b97STreehugger Robot #include    "antlr3defs.hpp"
39*16467b97STreehugger Robot #include    "antlr3collections.hpp"
40*16467b97STreehugger Robot 
41*16467b97STreehugger Robot ANTLR_BEGIN_NAMESPACE()
42*16467b97STreehugger Robot 
43*16467b97STreehugger Robot /** \brief Base tracking context structure for all types of
44*16467b97STreehugger Robot  * recognizers.
45*16467b97STreehugger Robot  */
46*16467b97STreehugger Robot template< class ImplTraits, class StreamType >
47*16467b97STreehugger Robot class BaseRecognizer : public ImplTraits::AllocPolicyType
48*16467b97STreehugger Robot {
49*16467b97STreehugger Robot public:
50*16467b97STreehugger Robot 	typedef typename ImplTraits::AllocPolicyType	AllocPolicyType;
51*16467b97STreehugger Robot 	typedef typename StreamType::IntStreamType	IntStreamType;
52*16467b97STreehugger Robot 	typedef typename ComponentTypeFinder<ImplTraits, StreamType>::ComponentType  SuperType;
53*16467b97STreehugger Robot 	typedef typename StreamType::UnitType		UnitType;
54*16467b97STreehugger Robot 	typedef typename ImplTraits::template ExceptionBaseType<StreamType> ExceptionBaseType;
55*16467b97STreehugger Robot 	typedef typename ImplTraits::BitsetType BitsetType;
56*16467b97STreehugger Robot 	typedef typename ImplTraits::BitsetListType		BitsetListType;
57*16467b97STreehugger Robot 	typedef typename ImplTraits::StringType	StringType;
58*16467b97STreehugger Robot 	typedef typename ImplTraits::template RecognizerSharedStateType<StreamType>  RecognizerSharedStateType;
59*16467b97STreehugger Robot 	typedef typename ImplTraits::DebugEventListenerType DebugEventListenerType;
60*16467b97STreehugger Robot 	typedef typename ImplTraits::LexerType LexerType;
61*16467b97STreehugger Robot 	typedef typename ImplTraits::ParserType ParserType;
62*16467b97STreehugger Robot 	typedef typename ImplTraits::TreeParserType TreeParserType;
63*16467b97STreehugger Robot 
64*16467b97STreehugger Robot 	typedef typename AllocPolicyType::template StackType<StringType>  StringStackType;
65*16467b97STreehugger Robot 	typedef typename AllocPolicyType::template ListType<StringType>  StringListType;
66*16467b97STreehugger Robot 
67*16467b97STreehugger Robot private:
68*16467b97STreehugger Robot 	/// A pointer to the shared recognizer state, such that multiple
69*16467b97STreehugger Robot 	/// recognizers can use the same inputs streams and so on (in
70*16467b97STreehugger Robot 	/// the case of grammar inheritance for instance.
71*16467b97STreehugger Robot 	///
72*16467b97STreehugger Robot 	RecognizerSharedStateType*		m_state;
73*16467b97STreehugger Robot 
74*16467b97STreehugger Robot 	/// If set to something other than NULL, then this structure is
75*16467b97STreehugger Robot 	/// points to an instance of the debugger interface. In general, the
76*16467b97STreehugger Robot 	/// debugger is only referenced internally in recovery/error operations
77*16467b97STreehugger Robot 	/// so that it does not cause overhead by having to check this pointer
78*16467b97STreehugger Robot 	/// in every function/method
79*16467b97STreehugger Robot 	///
80*16467b97STreehugger Robot 	DebugEventListenerType*		m_debugger;
81*16467b97STreehugger Robot 
82*16467b97STreehugger Robot 
83*16467b97STreehugger Robot public:
84*16467b97STreehugger Robot 	BaseRecognizer(ANTLR_UINT32 sizeHint, RecognizerSharedStateType* state);
85*16467b97STreehugger Robot 
86*16467b97STreehugger Robot 	SuperType* get_super();
87*16467b97STreehugger Robot 	RecognizerSharedStateType* get_state() const;
88*16467b97STreehugger Robot 	DebugEventListenerType* get_debugger() const;
89*16467b97STreehugger Robot 	void  set_state( RecognizerSharedStateType* state );
90*16467b97STreehugger Robot 	void  set_debugger( DebugEventListenerType* debugger );
91*16467b97STreehugger Robot 
92*16467b97STreehugger Robot     /// Match current input symbol against ttype.  Upon error, do one token
93*16467b97STreehugger Robot 	/// insertion or deletion if possible.
94*16467b97STreehugger Robot 	/// To turn off single token insertion or deletion error
95*16467b97STreehugger Robot 	/// recovery, override mismatchRecover() and have it call
96*16467b97STreehugger Robot 	/// plain mismatch(), which does not recover.  Then any error
97*16467b97STreehugger Robot 	/// in a rule will cause an exception and immediate exit from
98*16467b97STreehugger Robot 	/// rule.  Rule would recover by resynchronizing to the set of
99*16467b97STreehugger Robot 	/// symbols that can follow rule ref.
100*16467b97STreehugger Robot 	///
101*16467b97STreehugger Robot     const UnitType*	match(ANTLR_UINT32 ttype, BitsetListType* follow);
102*16467b97STreehugger Robot 
103*16467b97STreehugger Robot 	/// Consumes the next token, whatever it is, and resets the recognizer state
104*16467b97STreehugger Robot 	/// so that it is not in error.
105*16467b97STreehugger Robot 	///
106*16467b97STreehugger Robot 	/// \param recognizer
107*16467b97STreehugger Robot 	/// Recognizer context pointer
108*16467b97STreehugger Robot 	///
109*16467b97STreehugger Robot     void	matchAny();
110*16467b97STreehugger Robot 
111*16467b97STreehugger Robot 	/// function that decides if the token ahead of the current one is the
112*16467b97STreehugger Robot 	/// one we were loking for, in which case the curernt one is very likely extraneous
113*16467b97STreehugger Robot 	/// and can be reported that way.
114*16467b97STreehugger Robot 	///
115*16467b97STreehugger Robot 	bool mismatchIsUnwantedToken(IntStreamType* input, ANTLR_UINT32 ttype);
116*16467b97STreehugger Robot 
117*16467b97STreehugger Robot 	/// function that decides if the current token is one that can logically
118*16467b97STreehugger Robot 	/// follow the one we were looking for, in which case the one we were looking for is
119*16467b97STreehugger Robot 	/// probably missing from the input.
120*16467b97STreehugger Robot 	///
121*16467b97STreehugger Robot 	bool mismatchIsMissingToken(IntStreamType* input, BitsetListType* follow);
122*16467b97STreehugger Robot 
123*16467b97STreehugger Robot     /// Factor out what to do upon token mismatch so tree parsers can behave
124*16467b97STreehugger Robot 	/// differently.  Override and call mismatchRecover(input, ttype, follow)
125*16467b97STreehugger Robot 	/// to get single token insertion and deletion.  Use this to turn off
126*16467b97STreehugger Robot 	/// single token insertion and deletion. Override mismatchRecover
127*16467b97STreehugger Robot 	/// to call this instead.
128*16467b97STreehugger Robot 	///
129*16467b97STreehugger Robot 	/// \remark mismatch only works for parsers and must be overridden for anything else.
130*16467b97STreehugger Robot 	///
131*16467b97STreehugger Robot     void mismatch(ANTLR_UINT32 ttype, BitsetListType* follow);
132*16467b97STreehugger Robot 
133*16467b97STreehugger Robot     /// Report a recognition problem.
134*16467b97STreehugger Robot 	///
135*16467b97STreehugger Robot 	/// This method sets errorRecovery to indicate the parser is recovering
136*16467b97STreehugger Robot 	/// not parsing.  Once in recovery mode, no errors are generated.
137*16467b97STreehugger Robot 	/// To get out of recovery mode, the parser must successfully match
138*16467b97STreehugger Robot 	/// a token (after a resync).  So it will go:
139*16467b97STreehugger Robot 	///
140*16467b97STreehugger Robot 	///		1. error occurs
141*16467b97STreehugger Robot 	///		2. enter recovery mode, report error
142*16467b97STreehugger Robot 	///		3. consume until token found in resynch set
143*16467b97STreehugger Robot 	///		4. try to resume parsing
144*16467b97STreehugger Robot 	///		5. next match() will reset errorRecovery mode
145*16467b97STreehugger Robot 	///
146*16467b97STreehugger Robot 	/// If you override, make sure to update errorCount if you care about that.
147*16467b97STreehugger Robot 	///
148*16467b97STreehugger Robot     void	reportError();
149*16467b97STreehugger Robot 	void	reportError( ClassForwarder<LexerType> );
150*16467b97STreehugger Robot 	template<typename CompType>
151*16467b97STreehugger Robot 	void	reportError( ClassForwarder<CompType> );
152*16467b97STreehugger Robot 
153*16467b97STreehugger Robot     /** Function that is called to display a recognition error message. You may
154*16467b97STreehugger Robot      *  override this function independently of (*reportError)() above as that function calls
155*16467b97STreehugger Robot      *  this one to do the actual exception printing.
156*16467b97STreehugger Robot      */
157*16467b97STreehugger Robot     void	displayRecognitionError(ANTLR_UINT8** tokenNames);
158*16467b97STreehugger Robot 
159*16467b97STreehugger Robot 	/// Get number of recognition errors (lexer, parser, tree parser).  Each
160*16467b97STreehugger Robot 	/// recognizer tracks its own number.  So parser and lexer each have
161*16467b97STreehugger Robot 	/// separate count.  Does not count the spurious errors found between
162*16467b97STreehugger Robot 	/// an error and next valid token match
163*16467b97STreehugger Robot 	///
164*16467b97STreehugger Robot 	/// \see reportError()
165*16467b97STreehugger Robot 	///
166*16467b97STreehugger Robot 	ANTLR_UINT32 getNumberOfSyntaxErrors();
167*16467b97STreehugger Robot 
168*16467b97STreehugger Robot     /** Function that recovers from an error found in the input stream.
169*16467b97STreehugger Robot      *  Generally, this will be a #ANTLR3_EXCEPTION_NOVIABLE_ALT but it could also
170*16467b97STreehugger Robot      *  be from a mismatched token that the (*match)() could not recover from.
171*16467b97STreehugger Robot      */
172*16467b97STreehugger Robot     void	recover();
173*16467b97STreehugger Robot 
174*16467b97STreehugger Robot     /** function that is a hook to listen to token consumption during error recovery.
175*16467b97STreehugger Robot      *  This is mainly used by the debug parser to send events to the listener.
176*16467b97STreehugger Robot      */
177*16467b97STreehugger Robot     void	beginResync();
178*16467b97STreehugger Robot 
179*16467b97STreehugger Robot     /** function that is a hook to listen to token consumption during error recovery.
180*16467b97STreehugger Robot      *  This is mainly used by the debug parser to send events to the listener.
181*16467b97STreehugger Robot      */
182*16467b97STreehugger Robot     void	endResync();
183*16467b97STreehugger Robot 
184*16467b97STreehugger Robot 	/** function that is a hook to listen to token consumption during error recovery.
185*16467b97STreehugger Robot      *  This is mainly used by the debug parser to send events to the listener.
186*16467b97STreehugger Robot      */
187*16467b97STreehugger Robot     void	beginBacktrack(ANTLR_UINT32 level);
188*16467b97STreehugger Robot 
189*16467b97STreehugger Robot     /** function that is a hook to listen to token consumption during error recovery.
190*16467b97STreehugger Robot      *  This is mainly used by the debug parser to send events to the listener.
191*16467b97STreehugger Robot      */
192*16467b97STreehugger Robot     void	endBacktrack(ANTLR_UINT32 level, bool successful);
193*16467b97STreehugger Robot 
194*16467b97STreehugger Robot     /// Compute the error recovery set for the current rule.
195*16467b97STreehugger Robot 	/// Documentation below is from the Java implementation.
196*16467b97STreehugger Robot 	///
197*16467b97STreehugger Robot 	/// During rule invocation, the parser pushes the set of tokens that can
198*16467b97STreehugger Robot 	/// follow that rule reference on the stack; this amounts to
199*16467b97STreehugger Robot 	/// computing FIRST of what follows the rule reference in the
200*16467b97STreehugger Robot 	/// enclosing rule. This local follow set only includes tokens
201*16467b97STreehugger Robot 	/// from within the rule; i.e., the FIRST computation done by
202*16467b97STreehugger Robot 	/// ANTLR stops at the end of a rule.
203*16467b97STreehugger Robot 	//
204*16467b97STreehugger Robot 	/// EXAMPLE
205*16467b97STreehugger Robot 	//
206*16467b97STreehugger Robot 	/// When you find a "no viable alt exception", the input is not
207*16467b97STreehugger Robot 	/// consistent with any of the alternatives for rule r.  The best
208*16467b97STreehugger Robot 	/// thing to do is to consume tokens until you see something that
209*16467b97STreehugger Robot 	/// can legally follow a call to r *or* any rule that called r.
210*16467b97STreehugger Robot 	/// You don't want the exact set of viable next tokens because the
211*16467b97STreehugger Robot 	/// input might just be missing a token--you might consume the
212*16467b97STreehugger Robot 	/// rest of the input looking for one of the missing tokens.
213*16467b97STreehugger Robot 	///
214*16467b97STreehugger Robot 	/// Consider grammar:
215*16467b97STreehugger Robot 	///
216*16467b97STreehugger Robot 	/// a : '[' b ']'
217*16467b97STreehugger Robot 	///   | '(' b ')'
218*16467b97STreehugger Robot 	///   ;
219*16467b97STreehugger Robot 	/// b : c '^' INT ;
220*16467b97STreehugger Robot 	/// c : ID
221*16467b97STreehugger Robot 	///   | INT
222*16467b97STreehugger Robot 	///   ;
223*16467b97STreehugger Robot 	///
224*16467b97STreehugger Robot 	/// At each rule invocation, the set of tokens that could follow
225*16467b97STreehugger Robot 	/// that rule is pushed on a stack.  Here are the various "local"
226*16467b97STreehugger Robot 	/// follow sets:
227*16467b97STreehugger Robot 	///
228*16467b97STreehugger Robot 	/// FOLLOW(b1_in_a) = FIRST(']') = ']'
229*16467b97STreehugger Robot 	/// FOLLOW(b2_in_a) = FIRST(')') = ')'
230*16467b97STreehugger Robot 	/// FOLLOW(c_in_b) = FIRST('^') = '^'
231*16467b97STreehugger Robot 	///
232*16467b97STreehugger Robot 	/// Upon erroneous input "[]", the call chain is
233*16467b97STreehugger Robot 	///
234*16467b97STreehugger Robot 	/// a -> b -> c
235*16467b97STreehugger Robot 	///
236*16467b97STreehugger Robot 	/// and, hence, the follow context stack is:
237*16467b97STreehugger Robot 	///
238*16467b97STreehugger Robot 	/// depth  local follow set     after call to rule
239*16467b97STreehugger Robot 	///   0         <EOF>                    a (from main())
240*16467b97STreehugger Robot 	///   1          ']'                     b
241*16467b97STreehugger Robot 	///   3          '^'                     c
242*16467b97STreehugger Robot 	///
243*16467b97STreehugger Robot 	/// Notice that ')' is not included, because b would have to have
244*16467b97STreehugger Robot 	/// been called from a different context in rule a for ')' to be
245*16467b97STreehugger Robot 	/// included.
246*16467b97STreehugger Robot 	///
247*16467b97STreehugger Robot 	/// For error recovery, we cannot consider FOLLOW(c)
248*16467b97STreehugger Robot 	/// (context-sensitive or otherwise).  We need the combined set of
249*16467b97STreehugger Robot 	/// all context-sensitive FOLLOW sets--the set of all tokens that
250*16467b97STreehugger Robot 	/// could follow any reference in the call chain.  We need to
251*16467b97STreehugger Robot 	/// resync to one of those tokens.  Note that FOLLOW(c)='^' and if
252*16467b97STreehugger Robot 	/// we resync'd to that token, we'd consume until EOF.  We need to
253*16467b97STreehugger Robot 	/// sync to context-sensitive FOLLOWs for a, b, and c: {']','^'}.
254*16467b97STreehugger Robot 	/// In this case, for input "[]", LA(1) is in this set so we would
255*16467b97STreehugger Robot 	/// not consume anything and after printing an error rule c would
256*16467b97STreehugger Robot 	/// return normally.  It would not find the required '^' though.
257*16467b97STreehugger Robot 	/// At this point, it gets a mismatched token error and throws an
258*16467b97STreehugger Robot 	/// exception (since LA(1) is not in the viable following token
259*16467b97STreehugger Robot 	/// set).  The rule exception handler tries to recover, but finds
260*16467b97STreehugger Robot 	/// the same recovery set and doesn't consume anything.  Rule b
261*16467b97STreehugger Robot 	/// exits normally returning to rule a.  Now it finds the ']' (and
262*16467b97STreehugger Robot 	/// with the successful match exits errorRecovery mode).
263*16467b97STreehugger Robot 	///
264*16467b97STreehugger Robot 	/// So, you can see that the parser walks up call chain looking
265*16467b97STreehugger Robot 	/// for the token that was a member of the recovery set.
266*16467b97STreehugger Robot 	///
267*16467b97STreehugger Robot 	/// Errors are not generated in errorRecovery mode.
268*16467b97STreehugger Robot 	///
269*16467b97STreehugger Robot 	/// ANTLR's error recovery mechanism is based upon original ideas:
270*16467b97STreehugger Robot 	///
271*16467b97STreehugger Robot 	/// "Algorithms + Data Structures = Programs" by Niklaus Wirth
272*16467b97STreehugger Robot 	///
273*16467b97STreehugger Robot 	/// and
274*16467b97STreehugger Robot 	///
275*16467b97STreehugger Robot 	/// "A note on error recovery in recursive descent parsers":
276*16467b97STreehugger Robot 	/// http://portal.acm.org/citation.cfm?id=947902.947905
277*16467b97STreehugger Robot 	///
278*16467b97STreehugger Robot 	/// Later, Josef Grosch had some good ideas:
279*16467b97STreehugger Robot 	///
280*16467b97STreehugger Robot 	/// "Efficient and Comfortable Error Recovery in Recursive Descent
281*16467b97STreehugger Robot 	/// Parsers":
282*16467b97STreehugger Robot 	/// ftp://www.cocolab.com/products/cocktail/doca4.ps/ell.ps.zip
283*16467b97STreehugger Robot 	///
284*16467b97STreehugger Robot 	/// Like Grosch I implemented local FOLLOW sets that are combined
285*16467b97STreehugger Robot 	/// at run-time upon error to avoid overhead during parsing.
286*16467b97STreehugger Robot 	///
287*16467b97STreehugger Robot     BitsetType*	computeErrorRecoverySet();
288*16467b97STreehugger Robot 
289*16467b97STreehugger Robot     /// Compute the context-sensitive FOLLOW set for current rule.
290*16467b97STreehugger Robot 	/// Documentation below is from the Java runtime.
291*16467b97STreehugger Robot 	///
292*16467b97STreehugger Robot 	/// This is the set of token types that can follow a specific rule
293*16467b97STreehugger Robot 	/// reference given a specific call chain.  You get the set of
294*16467b97STreehugger Robot 	/// viable tokens that can possibly come next (look ahead depth 1)
295*16467b97STreehugger Robot 	/// given the current call chain.  Contrast this with the
296*16467b97STreehugger Robot 	/// definition of plain FOLLOW for rule r:
297*16467b97STreehugger Robot 	///
298*16467b97STreehugger Robot 	///  FOLLOW(r)={x | S=>*alpha r beta in G and x in FIRST(beta)}
299*16467b97STreehugger Robot 	///
300*16467b97STreehugger Robot 	/// where x in T* and alpha, beta in V*; T is set of terminals and
301*16467b97STreehugger Robot 	/// V is the set of terminals and non terminals.  In other words,
302*16467b97STreehugger Robot 	/// FOLLOW(r) is the set of all tokens that can possibly follow
303*16467b97STreehugger Robot 	/// references to r in///any* sentential form (context).  At
304*16467b97STreehugger Robot 	/// runtime, however, we know precisely which context applies as
305*16467b97STreehugger Robot 	/// we have the call chain.  We may compute the exact (rather
306*16467b97STreehugger Robot 	/// than covering superset) set of following tokens.
307*16467b97STreehugger Robot 	///
308*16467b97STreehugger Robot 	/// For example, consider grammar:
309*16467b97STreehugger Robot 	///
310*16467b97STreehugger Robot 	/// stat : ID '=' expr ';'      // FOLLOW(stat)=={EOF}
311*16467b97STreehugger Robot 	///      | "return" expr '.'
312*16467b97STreehugger Robot 	///      ;
313*16467b97STreehugger Robot 	/// expr : atom ('+' atom)* ;   // FOLLOW(expr)=={';','.',')'}
314*16467b97STreehugger Robot 	/// atom : INT                  // FOLLOW(atom)=={'+',')',';','.'}
315*16467b97STreehugger Robot 	///      | '(' expr ')'
316*16467b97STreehugger Robot 	///      ;
317*16467b97STreehugger Robot 	///
318*16467b97STreehugger Robot 	/// The FOLLOW sets are all inclusive whereas context-sensitive
319*16467b97STreehugger Robot 	/// FOLLOW sets are precisely what could follow a rule reference.
320*16467b97STreehugger Robot 	/// For input input "i=(3);", here is the derivation:
321*16467b97STreehugger Robot 	///
322*16467b97STreehugger Robot 	/// stat => ID '=' expr ';'
323*16467b97STreehugger Robot 	///      => ID '=' atom ('+' atom)* ';'
324*16467b97STreehugger Robot 	///      => ID '=' '(' expr ')' ('+' atom)* ';'
325*16467b97STreehugger Robot 	///      => ID '=' '(' atom ')' ('+' atom)* ';'
326*16467b97STreehugger Robot 	///      => ID '=' '(' INT ')' ('+' atom)* ';'
327*16467b97STreehugger Robot 	///      => ID '=' '(' INT ')' ';'
328*16467b97STreehugger Robot 	///
329*16467b97STreehugger Robot 	/// At the "3" token, you'd have a call chain of
330*16467b97STreehugger Robot 	///
331*16467b97STreehugger Robot 	///   stat -> expr -> atom -> expr -> atom
332*16467b97STreehugger Robot 	///
333*16467b97STreehugger Robot 	/// What can follow that specific nested ref to atom?  Exactly ')'
334*16467b97STreehugger Robot 	/// as you can see by looking at the derivation of this specific
335*16467b97STreehugger Robot 	/// input.  Contrast this with the FOLLOW(atom)={'+',')',';','.'}.
336*16467b97STreehugger Robot 	///
337*16467b97STreehugger Robot 	/// You want the exact viable token set when recovering from a
338*16467b97STreehugger Robot 	/// token mismatch.  Upon token mismatch, if LA(1) is member of
339*16467b97STreehugger Robot 	/// the viable next token set, then you know there is most likely
340*16467b97STreehugger Robot 	/// a missing token in the input stream.  "Insert" one by just not
341*16467b97STreehugger Robot 	/// throwing an exception.
342*16467b97STreehugger Robot 	///
343*16467b97STreehugger Robot     BitsetType*	computeCSRuleFollow();
344*16467b97STreehugger Robot 
345*16467b97STreehugger Robot     /// Compute the current followset for the input stream.
346*16467b97STreehugger Robot 	///
347*16467b97STreehugger Robot     BitsetType*	combineFollows(bool exact);
348*16467b97STreehugger Robot 
349*16467b97STreehugger Robot     /// Attempt to recover from a single missing or extra token.
350*16467b97STreehugger Robot 	///
351*16467b97STreehugger Robot 	/// EXTRA TOKEN
352*16467b97STreehugger Robot 	///
353*16467b97STreehugger Robot 	/// LA(1) is not what we are looking for.  If LA(2) has the right token,
354*16467b97STreehugger Robot 	/// however, then assume LA(1) is some extra spurious token.  Delete it
355*16467b97STreehugger Robot 	/// and LA(2) as if we were doing a normal match(), which advances the
356*16467b97STreehugger Robot 	/// input.
357*16467b97STreehugger Robot 	///
358*16467b97STreehugger Robot 	/// MISSING TOKEN
359*16467b97STreehugger Robot 	///
360*16467b97STreehugger Robot 	/// If current token is consistent with what could come after
361*16467b97STreehugger Robot 	/// ttype then it is ok to "insert" the missing token, else throw
362*16467b97STreehugger Robot 	/// exception For example, Input "i=(3;" is clearly missing the
363*16467b97STreehugger Robot 	/// ')'.  When the parser returns from the nested call to expr, it
364*16467b97STreehugger Robot 	/// will have call chain:
365*16467b97STreehugger Robot 	///
366*16467b97STreehugger Robot 	///    stat -> expr -> atom
367*16467b97STreehugger Robot 	///
368*16467b97STreehugger Robot 	/// and it will be trying to match the ')' at this point in the
369*16467b97STreehugger Robot 	/// derivation:
370*16467b97STreehugger Robot 	///
371*16467b97STreehugger Robot 	///       => ID '=' '(' INT ')' ('+' atom)* ';'
372*16467b97STreehugger Robot 	///                          ^
373*16467b97STreehugger Robot 	/// match() will see that ';' doesn't match ')' and report a
374*16467b97STreehugger Robot 	/// mismatched token error.  To recover, it sees that LA(1)==';'
375*16467b97STreehugger Robot 	/// is in the set of tokens that can follow the ')' token
376*16467b97STreehugger Robot 	/// reference in rule atom.  It can assume that you forgot the ')'.
377*16467b97STreehugger Robot 	///
378*16467b97STreehugger Robot 	/// The exception that was passed in, in the java implementation is
379*16467b97STreehugger Robot 	/// sorted in the recognizer exception stack in the C version. To 'throw' it we set the
380*16467b97STreehugger Robot 	/// error flag and rules cascade back when this is set.
381*16467b97STreehugger Robot 	///
382*16467b97STreehugger Robot     const UnitType* recoverFromMismatchedToken( ANTLR_UINT32	ttype, BitsetListType*	follow);
383*16467b97STreehugger Robot 
384*16467b97STreehugger Robot     /** Function that recovers from a mismatched set in the token stream, in a similar manner
385*16467b97STreehugger Robot      *  to (*recoverFromMismatchedToken)
386*16467b97STreehugger Robot      */
387*16467b97STreehugger Robot     const UnitType* recoverFromMismatchedSet(BitsetListType*	follow);
388*16467b97STreehugger Robot 
389*16467b97STreehugger Robot     /** common routine to handle single token insertion for recovery functions.
390*16467b97STreehugger Robot      */
391*16467b97STreehugger Robot 	/// This code is factored out from mismatched token and mismatched set
392*16467b97STreehugger Robot 	///  recovery.  It handles "single token insertion" error recovery for
393*16467b97STreehugger Robot 	/// both.  No tokens are consumed to recover from insertions.  Return
394*16467b97STreehugger Robot 	/// true if recovery was possible else return false.
395*16467b97STreehugger Robot 	///
396*16467b97STreehugger Robot     bool	recoverFromMismatchedElement(BitsetListType*	follow);
397*16467b97STreehugger Robot 
398*16467b97STreehugger Robot     /** function that consumes input until the next token matches
399*16467b97STreehugger Robot      *  the given token.
400*16467b97STreehugger Robot      */
401*16467b97STreehugger Robot     void	consumeUntil(ANTLR_UINT32   tokenType);
402*16467b97STreehugger Robot 
403*16467b97STreehugger Robot     /** function that consumes input until the next token matches
404*16467b97STreehugger Robot      *  one in the given set.
405*16467b97STreehugger Robot      */
406*16467b97STreehugger Robot     void	consumeUntilSet(BitsetType*	set);
407*16467b97STreehugger Robot 
408*16467b97STreehugger Robot     /** function that returns an ANTLR3_LIST of the strings that identify
409*16467b97STreehugger Robot      *  the rules in the parser that got you to this point. Can be overridden by installing your
410*16467b97STreehugger Robot      *	own function set.
411*16467b97STreehugger Robot      *
412*16467b97STreehugger Robot      * \todo Document how to override invocation stack functions.
413*16467b97STreehugger Robot      */
414*16467b97STreehugger Robot 	StringStackType	getRuleInvocationStack();
415*16467b97STreehugger Robot 	StringStackType	getRuleInvocationStackNamed(ANTLR_UINT8*    name);
416*16467b97STreehugger Robot 
417*16467b97STreehugger Robot     /** function that converts an ANLR3_LIST of tokens to an ANTLR3_LIST of
418*16467b97STreehugger Robot      *  string token names. As this is mostly used in string template processing it may not be useful
419*16467b97STreehugger Robot      *  in the C runtime.
420*16467b97STreehugger Robot      */
421*16467b97STreehugger Robot     StringListType	toStrings( const StringListType& );
422*16467b97STreehugger Robot 
423*16467b97STreehugger Robot     /** function to return whether the rule has parsed input starting at the supplied
424*16467b97STreehugger Robot      *  start index before. If the rule has not parsed input starting from the supplied start index,
425*16467b97STreehugger Robot      *  then it will return ANTLR3_MEMO_RULE_UNKNOWN. If it has parsed from the suppled start point
426*16467b97STreehugger Robot      *  then it will return the point where it last stopped parsing after that start point.
427*16467b97STreehugger Robot      */
428*16467b97STreehugger Robot     ANTLR_MARKER	getRuleMemoization( ANTLR_INTKEY	ruleIndex,
429*16467b97STreehugger Robot 												ANTLR_MARKER	ruleParseStart);
430*16467b97STreehugger Robot 
431*16467b97STreehugger Robot     /** function that determines whether the rule has parsed input at the current index
432*16467b97STreehugger Robot      *  in the input stream
433*16467b97STreehugger Robot      */
434*16467b97STreehugger Robot     bool	alreadyParsedRule(ANTLR_MARKER	ruleIndex);
435*16467b97STreehugger Robot 
436*16467b97STreehugger Robot     /** Function that records whether the rule has parsed the input at a
437*16467b97STreehugger Robot      *  current position successfully or not.
438*16467b97STreehugger Robot      */
439*16467b97STreehugger Robot     void	memoize(ANTLR_MARKER	ruleIndex,
440*16467b97STreehugger Robot 								ANTLR_MARKER	ruleParseStart);
441*16467b97STreehugger Robot 
442*16467b97STreehugger Robot 	/// Function that returns the current input symbol.
443*16467b97STreehugger Robot     /// The is placed into any label for the associated token ref; e.g., x=ID.  Token
444*16467b97STreehugger Robot 	/// and tree parsers need to return different objects. Rather than test
445*16467b97STreehugger Robot 	/// for input stream type or change the IntStream interface, I use
446*16467b97STreehugger Robot 	/// a simple method to ask the recognizer to tell me what the current
447*16467b97STreehugger Robot 	/// input symbol is.
448*16467b97STreehugger Robot 	///
449*16467b97STreehugger Robot 	/// This is ignored for lexers and the lexer implementation of this
450*16467b97STreehugger Robot 	/// function should return NULL.
451*16467b97STreehugger Robot 	///
452*16467b97STreehugger Robot 	const UnitType*	getCurrentInputSymbol(IntStreamType* istream);
453*16467b97STreehugger Robot 	const UnitType*	getCurrentInputSymbol(IntStreamType* istream, ClassForwarder<LexerType>);
454*16467b97STreehugger Robot 	const UnitType*	getCurrentInputSymbol(IntStreamType* istream, ClassForwarder<ParserType>);
455*16467b97STreehugger Robot 	const UnitType*	getCurrentInputSymbol(IntStreamType* istream, ClassForwarder<TreeParserType>);
456*16467b97STreehugger Robot 
457*16467b97STreehugger Robot 	/// Conjure up a missing token during error recovery.
458*16467b97STreehugger Robot 	///
459*16467b97STreehugger Robot 	/// The recognizer attempts to recover from single missing
460*16467b97STreehugger Robot 	/// symbols. But, actions might refer to that missing symbol.
461*16467b97STreehugger Robot 	/// For example, x=ID {f($x);}. The action clearly assumes
462*16467b97STreehugger Robot 	/// that there has been an identifier matched previously and that
463*16467b97STreehugger Robot 	/// $x points at that token. If that token is missing, but
464*16467b97STreehugger Robot 	/// the next token in the stream is what we want we assume that
465*16467b97STreehugger Robot 	/// this token is missing and we keep going. Because we
466*16467b97STreehugger Robot 	/// have to return some token to replace the missing token,
467*16467b97STreehugger Robot 	/// we have to conjure one up. This method gives the user control
468*16467b97STreehugger Robot 	/// over the tokens returned for missing tokens. Mostly,
469*16467b97STreehugger Robot 	/// you will want to create something special for identifier
470*16467b97STreehugger Robot 	/// tokens. For literals such as '{' and ',', the default
471*16467b97STreehugger Robot 	/// action in the parser or tree parser works. It simply creates
472*16467b97STreehugger Robot 	/// a CommonToken of the appropriate type. The text will be the token.
473*16467b97STreehugger Robot 	/// If you change what tokens must be created by the lexer,
474*16467b97STreehugger Robot 	/// override this method to create the appropriate tokens.
475*16467b97STreehugger Robot 	///
476*16467b97STreehugger Robot 	UnitType*	getMissingSymbol( IntStreamType*		istream, ExceptionBaseType*		e,
477*16467b97STreehugger Robot 												ANTLR_UINT32			expectedTokenType,
478*16467b97STreehugger Robot 												BitsetListType*		follow);
479*16467b97STreehugger Robot 
480*16467b97STreehugger Robot     /** Function that returns whether the supplied grammar function
481*16467b97STreehugger Robot      *  will parse the current input stream or not. This is the way that syntactic
482*16467b97STreehugger Robot      *  predicates are evaluated. Unlike java, C is perfectly happy to invoke code
483*16467b97STreehugger Robot      *  via a pointer to a function (hence that's what all the ANTLR3 C interfaces
484*16467b97STreehugger Robot      *  do.
485*16467b97STreehugger Robot      */
486*16467b97STreehugger Robot 	template<typename Predicate>
487*16467b97STreehugger Robot     bool  synpred( ClassForwarder<Predicate> );
488*16467b97STreehugger Robot 
489*16467b97STreehugger Robot 	//In place of exConstruct, just directly instantiate the Exception Object
490*16467b97STreehugger Robot 
491*16467b97STreehugger Robot     /** Reset the recognizer
492*16467b97STreehugger Robot      */
493*16467b97STreehugger Robot     void  reset();
494*16467b97STreehugger Robot 	void  reset( ClassForwarder<LexerType> );
495*16467b97STreehugger Robot 	template<typename CompType>
496*16467b97STreehugger Robot 	void  reset( ClassForwarder<CompType> );
497*16467b97STreehugger Robot 
498*16467b97STreehugger Robot 	void exConstruct();
499*16467b97STreehugger Robot 
500*16467b97STreehugger Robot     ~BaseRecognizer();
501*16467b97STreehugger Robot 
502*16467b97STreehugger Robot };
503*16467b97STreehugger Robot 
504*16467b97STreehugger Robot ANTLR_END_NAMESPACE()
505*16467b97STreehugger Robot 
506*16467b97STreehugger Robot #include "antlr3baserecognizer.inl"
507*16467b97STreehugger Robot 
508*16467b97STreehugger Robot /// @}
509*16467b97STreehugger Robot ///
510*16467b97STreehugger Robot 
511*16467b97STreehugger Robot #endif	    /* _ANTLR3_BASERECOGNIZER_H	*/
512*16467b97STreehugger Robot 
513