Cpp/include/antlr3baserecognizer.hpp

/** \file
 * Defines the basic structure to support recognizing by either a lexer,
 * parser, or tree parser.
 * \addtogroup BaseRecognizer
 * @{
 */
*16467b97STreehugger Robot#ifndef	_ANTLR3_BASERECOGNIZER_HPP
*16467b97STreehugger Robot#define	_ANTLR3_BASERECOGNIZER_HPP
*16467b97STreehugger Robot
// [The "BSD licence"]
// Copyright (c) 2005-2009 Gokulakannan Somasundaram, ElectronDB

//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
// 3. The name of the author may not be used to endorse or promote products
//    derived from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
// IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
// NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

*16467b97STreehugger Robot#include    "antlr3defs.hpp"
*16467b97STreehugger Robot#include    "antlr3collections.hpp"
*16467b97STreehugger Robot
*16467b97STreehugger RobotANTLR_BEGIN_NAMESPACE()
*16467b97STreehugger Robot
*16467b97STreehugger Robot/** \brief Base tracking context structure for all types of
*16467b97STreehugger Robot * recognizers.
*16467b97STreehugger Robot */
*16467b97STreehugger Robottemplate< class ImplTraits, class StreamType >
*16467b97STreehugger Robotclass BaseRecognizer : public ImplTraits::AllocPolicyType
*16467b97STreehugger Robot{
*16467b97STreehugger Robotpublic:
*16467b97STreehugger Robot	typedef typename ImplTraits::AllocPolicyType	AllocPolicyType;
*16467b97STreehugger Robot	typedef typename StreamType::IntStreamType	IntStreamType;
*16467b97STreehugger Robot	typedef typename ComponentTypeFinder<ImplTraits, StreamType>::ComponentType  SuperType;
*16467b97STreehugger Robot	typedef typename StreamType::UnitType		UnitType;
*16467b97STreehugger Robot	typedef typename ImplTraits::template ExceptionBaseType<StreamType> ExceptionBaseType;
*16467b97STreehugger Robot	typedef typename ImplTraits::BitsetType BitsetType;
*16467b97STreehugger Robot	typedef typename ImplTraits::BitsetListType		BitsetListType;
*16467b97STreehugger Robot	typedef typename ImplTraits::StringType	StringType;
*16467b97STreehugger Robot	typedef typename ImplTraits::template RecognizerSharedStateType<StreamType>  RecognizerSharedStateType;
*16467b97STreehugger Robot	typedef typename ImplTraits::DebugEventListenerType DebugEventListenerType;
*16467b97STreehugger Robot	typedef typename ImplTraits::LexerType LexerType;
*16467b97STreehugger Robot	typedef typename ImplTraits::ParserType ParserType;
*16467b97STreehugger Robot	typedef typename ImplTraits::TreeParserType TreeParserType;
*16467b97STreehugger Robot
*16467b97STreehugger Robot	typedef typename AllocPolicyType::template StackType<StringType>  StringStackType;
*16467b97STreehugger Robot	typedef typename AllocPolicyType::template ListType<StringType>  StringListType;
*16467b97STreehugger Robot
*16467b97STreehugger Robotprivate:
*16467b97STreehugger Robot	/// A pointer to the shared recognizer state, such that multiple
*16467b97STreehugger Robot	/// recognizers can use the same inputs streams and so on (in
*16467b97STreehugger Robot	/// the case of grammar inheritance for instance.
*16467b97STreehugger Robot	///
*16467b97STreehugger Robot	RecognizerSharedStateType*		m_state;
*16467b97STreehugger Robot
*16467b97STreehugger Robot	/// If set to something other than NULL, then this structure is
*16467b97STreehugger Robot	/// points to an instance of the debugger interface. In general, the
*16467b97STreehugger Robot	/// debugger is only referenced internally in recovery/error operations
*16467b97STreehugger Robot	/// so that it does not cause overhead by having to check this pointer
*16467b97STreehugger Robot	/// in every function/method
*16467b97STreehugger Robot	///
*16467b97STreehugger Robot	DebugEventListenerType*		m_debugger;
*16467b97STreehugger Robot
*16467b97STreehugger Robot
*16467b97STreehugger Robotpublic:
*16467b97STreehugger Robot	BaseRecognizer(ANTLR_UINT32 sizeHint, RecognizerSharedStateType* state);
*16467b97STreehugger Robot
*16467b97STreehugger Robot	SuperType* get_super();
*16467b97STreehugger Robot	RecognizerSharedStateType* get_state() const;
*16467b97STreehugger Robot	DebugEventListenerType* get_debugger() const;
*16467b97STreehugger Robot	void  set_state( RecognizerSharedStateType* state );
*16467b97STreehugger Robot	void  set_debugger( DebugEventListenerType* debugger );
*16467b97STreehugger Robot
*16467b97STreehugger Robot    /// Match current input symbol against ttype.  Upon error, do one token
*16467b97STreehugger Robot	/// insertion or deletion if possible.
*16467b97STreehugger Robot	/// To turn off single token insertion or deletion error
*16467b97STreehugger Robot	/// recovery, override mismatchRecover() and have it call
*16467b97STreehugger Robot	/// plain mismatch(), which does not recover.  Then any error
*16467b97STreehugger Robot	/// in a rule will cause an exception and immediate exit from
*16467b97STreehugger Robot	/// rule.  Rule would recover by resynchronizing to the set of
*16467b97STreehugger Robot	/// symbols that can follow rule ref.
*16467b97STreehugger Robot	///
*16467b97STreehugger Robot    const UnitType*	match(ANTLR_UINT32 ttype, BitsetListType* follow);
*16467b97STreehugger Robot
*16467b97STreehugger Robot	/// Consumes the next token, whatever it is, and resets the recognizer state
*16467b97STreehugger Robot	/// so that it is not in error.
*16467b97STreehugger Robot	///
*16467b97STreehugger Robot	/// \param recognizer
*16467b97STreehugger Robot	/// Recognizer context pointer
*16467b97STreehugger Robot	///
*16467b97STreehugger Robot    void	matchAny();
*16467b97STreehugger Robot
*16467b97STreehugger Robot	/// function that decides if the token ahead of the current one is the
*16467b97STreehugger Robot	/// one we were loking for, in which case the curernt one is very likely extraneous
*16467b97STreehugger Robot	/// and can be reported that way.
*16467b97STreehugger Robot	///
*16467b97STreehugger Robot	bool mismatchIsUnwantedToken(IntStreamType* input, ANTLR_UINT32 ttype);
*16467b97STreehugger Robot
*16467b97STreehugger Robot	/// function that decides if the current token is one that can logically
*16467b97STreehugger Robot	/// follow the one we were looking for, in which case the one we were looking for is
*16467b97STreehugger Robot	/// probably missing from the input.
*16467b97STreehugger Robot	///
*16467b97STreehugger Robot	bool mismatchIsMissingToken(IntStreamType* input, BitsetListType* follow);
*16467b97STreehugger Robot
*16467b97STreehugger Robot    /// Factor out what to do upon token mismatch so tree parsers can behave
*16467b97STreehugger Robot	/// differently.  Override and call mismatchRecover(input, ttype, follow)
*16467b97STreehugger Robot	/// to get single token insertion and deletion.  Use this to turn off
*16467b97STreehugger Robot	/// single token insertion and deletion. Override mismatchRecover
*16467b97STreehugger Robot	/// to call this instead.
*16467b97STreehugger Robot	///
*16467b97STreehugger Robot	/// \remark mismatch only works for parsers and must be overridden for anything else.
*16467b97STreehugger Robot	///
*16467b97STreehugger Robot    void mismatch(ANTLR_UINT32 ttype, BitsetListType* follow);
*16467b97STreehugger Robot
*16467b97STreehugger Robot    /// Report a recognition problem.
*16467b97STreehugger Robot	///
*16467b97STreehugger Robot	/// This method sets errorRecovery to indicate the parser is recovering
*16467b97STreehugger Robot	/// not parsing.  Once in recovery mode, no errors are generated.
*16467b97STreehugger Robot	/// To get out of recovery mode, the parser must successfully match
*16467b97STreehugger Robot	/// a token (after a resync).  So it will go:
*16467b97STreehugger Robot	///
*16467b97STreehugger Robot	///		1. error occurs
*16467b97STreehugger Robot	///		2. enter recovery mode, report error
*16467b97STreehugger Robot	///		3. consume until token found in resynch set
*16467b97STreehugger Robot	///		4. try to resume parsing
*16467b97STreehugger Robot	///		5. next match() will reset errorRecovery mode
*16467b97STreehugger Robot	///
*16467b97STreehugger Robot	/// If you override, make sure to update errorCount if you care about that.
*16467b97STreehugger Robot	///
*16467b97STreehugger Robot    void	reportError();
*16467b97STreehugger Robot	void	reportError( ClassForwarder<LexerType> );
*16467b97STreehugger Robot	template<typename CompType>
*16467b97STreehugger Robot	void	reportError( ClassForwarder<CompType> );
*16467b97STreehugger Robot
*16467b97STreehugger Robot    /** Function that is called to display a recognition error message. You may
*16467b97STreehugger Robot     *  override this function independently of (*reportError)() above as that function calls
*16467b97STreehugger Robot     *  this one to do the actual exception printing.
*16467b97STreehugger Robot     */
*16467b97STreehugger Robot    void	displayRecognitionError(ANTLR_UINT8** tokenNames);
*16467b97STreehugger Robot
*16467b97STreehugger Robot	/// Get number of recognition errors (lexer, parser, tree parser).  Each
*16467b97STreehugger Robot	/// recognizer tracks its own number.  So parser and lexer each have
*16467b97STreehugger Robot	/// separate count.  Does not count the spurious errors found between
*16467b97STreehugger Robot	/// an error and next valid token match
*16467b97STreehugger Robot	///
*16467b97STreehugger Robot	/// \see reportError()
*16467b97STreehugger Robot	///
*16467b97STreehugger Robot	ANTLR_UINT32 getNumberOfSyntaxErrors();
*16467b97STreehugger Robot
*16467b97STreehugger Robot    /** Function that recovers from an error found in the input stream.
*16467b97STreehugger Robot     *  Generally, this will be a #ANTLR3_EXCEPTION_NOVIABLE_ALT but it could also
*16467b97STreehugger Robot     *  be from a mismatched token that the (*match)() could not recover from.
*16467b97STreehugger Robot     */
*16467b97STreehugger Robot    void	recover();
*16467b97STreehugger Robot
*16467b97STreehugger Robot    /** function that is a hook to listen to token consumption during error recovery.
*16467b97STreehugger Robot     *  This is mainly used by the debug parser to send events to the listener.
*16467b97STreehugger Robot     */
*16467b97STreehugger Robot    void	beginResync();
*16467b97STreehugger Robot
*16467b97STreehugger Robot    /** function that is a hook to listen to token consumption during error recovery.
*16467b97STreehugger Robot     *  This is mainly used by the debug parser to send events to the listener.
*16467b97STreehugger Robot     */
*16467b97STreehugger Robot    void	endResync();
*16467b97STreehugger Robot
*16467b97STreehugger Robot	/** function that is a hook to listen to token consumption during error recovery.
*16467b97STreehugger Robot     *  This is mainly used by the debug parser to send events to the listener.
*16467b97STreehugger Robot     */
*16467b97STreehugger Robot    void	beginBacktrack(ANTLR_UINT32 level);
*16467b97STreehugger Robot
*16467b97STreehugger Robot    /** function that is a hook to listen to token consumption during error recovery.
*16467b97STreehugger Robot     *  This is mainly used by the debug parser to send events to the listener.
*16467b97STreehugger Robot     */
*16467b97STreehugger Robot    void	endBacktrack(ANTLR_UINT32 level, bool successful);
*16467b97STreehugger Robot
*16467b97STreehugger Robot    /// Compute the error recovery set for the current rule.
*16467b97STreehugger Robot	/// Documentation below is from the Java implementation.
*16467b97STreehugger Robot	///
*16467b97STreehugger Robot	/// During rule invocation, the parser pushes the set of tokens that can
*16467b97STreehugger Robot	/// follow that rule reference on the stack; this amounts to
*16467b97STreehugger Robot	/// computing FIRST of what follows the rule reference in the
*16467b97STreehugger Robot	/// enclosing rule. This local follow set only includes tokens
*16467b97STreehugger Robot	/// from within the rule; i.e., the FIRST computation done by
*16467b97STreehugger Robot	/// ANTLR stops at the end of a rule.
*16467b97STreehugger Robot	//
*16467b97STreehugger Robot	/// EXAMPLE
*16467b97STreehugger Robot	//
*16467b97STreehugger Robot	/// When you find a "no viable alt exception", the input is not
*16467b97STreehugger Robot	/// consistent with any of the alternatives for rule r.  The best
*16467b97STreehugger Robot	/// thing to do is to consume tokens until you see something that
*16467b97STreehugger Robot	/// can legally follow a call to r *or* any rule that called r.
*16467b97STreehugger Robot	/// You don't want the exact set of viable next tokens because the
*16467b97STreehugger Robot	/// input might just be missing a token--you might consume the
*16467b97STreehugger Robot	/// rest of the input looking for one of the missing tokens.
*16467b97STreehugger Robot	///
*16467b97STreehugger Robot	/// Consider grammar:
*16467b97STreehugger Robot	///
*16467b97STreehugger Robot	/// a : '[' b ']'
*16467b97STreehugger Robot	///   | '(' b ')'
*16467b97STreehugger Robot	///   ;
*16467b97STreehugger Robot	/// b : c '^' INT ;
*16467b97STreehugger Robot	/// c : ID
*16467b97STreehugger Robot	///   | INT
*16467b97STreehugger Robot	///   ;
*16467b97STreehugger Robot	///
*16467b97STreehugger Robot	/// At each rule invocation, the set of tokens that could follow
*16467b97STreehugger Robot	/// that rule is pushed on a stack.  Here are the various "local"
*16467b97STreehugger Robot	/// follow sets:
*16467b97STreehugger Robot	///
*16467b97STreehugger Robot	/// FOLLOW(b1_in_a) = FIRST(']') = ']'
*16467b97STreehugger Robot	/// FOLLOW(b2_in_a) = FIRST(')') = ')'
*16467b97STreehugger Robot	/// FOLLOW(c_in_b) = FIRST('^') = '^'
*16467b97STreehugger Robot	///
*16467b97STreehugger Robot	/// Upon erroneous input "[]", the call chain is
*16467b97STreehugger Robot	///
*16467b97STreehugger Robot	/// a -> b -> c
*16467b97STreehugger Robot	///
*16467b97STreehugger Robot	/// and, hence, the follow context stack is:
*16467b97STreehugger Robot	///
*16467b97STreehugger Robot	/// depth  local follow set     after call to rule
*16467b97STreehugger Robot	///   0         <EOF>                    a (from main())
*16467b97STreehugger Robot	///   1          ']'                     b
*16467b97STreehugger Robot	///   3          '^'                     c
*16467b97STreehugger Robot	///
*16467b97STreehugger Robot	/// Notice that ')' is not included, because b would have to have
*16467b97STreehugger Robot	/// been called from a different context in rule a for ')' to be
*16467b97STreehugger Robot	/// included.
*16467b97STreehugger Robot	///
*16467b97STreehugger Robot	/// For error recovery, we cannot consider FOLLOW(c)
*16467b97STreehugger Robot	/// (context-sensitive or otherwise).  We need the combined set of
*16467b97STreehugger Robot	/// all context-sensitive FOLLOW sets--the set of all tokens that
*16467b97STreehugger Robot	/// could follow any reference in the call chain.  We need to
*16467b97STreehugger Robot	/// resync to one of those tokens.  Note that FOLLOW(c)='^' and if
*16467b97STreehugger Robot	/// we resync'd to that token, we'd consume until EOF.  We need to
*16467b97STreehugger Robot	/// sync to context-sensitive FOLLOWs for a, b, and c: {']','^'}.
*16467b97STreehugger Robot	/// In this case, for input "[]", LA(1) is in this set so we would
*16467b97STreehugger Robot	/// not consume anything and after printing an error rule c would
*16467b97STreehugger Robot	/// return normally.  It would not find the required '^' though.
*16467b97STreehugger Robot	/// At this point, it gets a mismatched token error and throws an
*16467b97STreehugger Robot	/// exception (since LA(1) is not in the viable following token
*16467b97STreehugger Robot	/// set).  The rule exception handler tries to recover, but finds
*16467b97STreehugger Robot	/// the same recovery set and doesn't consume anything.  Rule b
*16467b97STreehugger Robot	/// exits normally returning to rule a.  Now it finds the ']' (and
*16467b97STreehugger Robot	/// with the successful match exits errorRecovery mode).
*16467b97STreehugger Robot	///
*16467b97STreehugger Robot	/// So, you can see that the parser walks up call chain looking
*16467b97STreehugger Robot	/// for the token that was a member of the recovery set.
*16467b97STreehugger Robot	///
*16467b97STreehugger Robot	/// Errors are not generated in errorRecovery mode.
*16467b97STreehugger Robot	///
*16467b97STreehugger Robot	/// ANTLR's error recovery mechanism is based upon original ideas:
*16467b97STreehugger Robot	///
*16467b97STreehugger Robot	/// "Algorithms + Data Structures = Programs" by Niklaus Wirth
*16467b97STreehugger Robot	///
*16467b97STreehugger Robot	/// and
*16467b97STreehugger Robot	///
*16467b97STreehugger Robot	/// "A note on error recovery in recursive descent parsers":
*16467b97STreehugger Robot	/// http://portal.acm.org/citation.cfm?id=947902.947905
*16467b97STreehugger Robot	///
*16467b97STreehugger Robot	/// Later, Josef Grosch had some good ideas:
*16467b97STreehugger Robot	///
*16467b97STreehugger Robot	/// "Efficient and Comfortable Error Recovery in Recursive Descent
*16467b97STreehugger Robot	/// Parsers":
*16467b97STreehugger Robot	/// ftp://www.cocolab.com/products/cocktail/doca4.ps/ell.ps.zip
*16467b97STreehugger Robot	///
*16467b97STreehugger Robot	/// Like Grosch I implemented local FOLLOW sets that are combined
*16467b97STreehugger Robot	/// at run-time upon error to avoid overhead during parsing.
*16467b97STreehugger Robot	///
*16467b97STreehugger Robot    BitsetType*	computeErrorRecoverySet();
*16467b97STreehugger Robot
*16467b97STreehugger Robot    /// Compute the context-sensitive FOLLOW set for current rule.
*16467b97STreehugger Robot	/// Documentation below is from the Java runtime.
*16467b97STreehugger Robot	///
*16467b97STreehugger Robot	/// This is the set of token types that can follow a specific rule
*16467b97STreehugger Robot	/// reference given a specific call chain.  You get the set of
*16467b97STreehugger Robot	/// viable tokens that can possibly come next (look ahead depth 1)
*16467b97STreehugger Robot	/// given the current call chain.  Contrast this with the
*16467b97STreehugger Robot	/// definition of plain FOLLOW for rule r:
*16467b97STreehugger Robot	///
*16467b97STreehugger Robot	///  FOLLOW(r)={x | S=>*alpha r beta in G and x in FIRST(beta)}
*16467b97STreehugger Robot	///
*16467b97STreehugger Robot	/// where x in T* and alpha, beta in V*; T is set of terminals and
*16467b97STreehugger Robot	/// V is the set of terminals and non terminals.  In other words,
*16467b97STreehugger Robot	/// FOLLOW(r) is the set of all tokens that can possibly follow
*16467b97STreehugger Robot	/// references to r in///any* sentential form (context).  At
*16467b97STreehugger Robot	/// runtime, however, we know precisely which context applies as
*16467b97STreehugger Robot	/// we have the call chain.  We may compute the exact (rather
*16467b97STreehugger Robot	/// than covering superset) set of following tokens.
*16467b97STreehugger Robot	///
*16467b97STreehugger Robot	/// For example, consider grammar:
*16467b97STreehugger Robot	///
*16467b97STreehugger Robot	/// stat : ID '=' expr ';'      // FOLLOW(stat)=={EOF}
*16467b97STreehugger Robot	///      | "return" expr '.'
*16467b97STreehugger Robot	///      ;
*16467b97STreehugger Robot	/// expr : atom ('+' atom)* ;   // FOLLOW(expr)=={';','.',')'}
*16467b97STreehugger Robot	/// atom : INT                  // FOLLOW(atom)=={'+',')',';','.'}
*16467b97STreehugger Robot	///      | '(' expr ')'
*16467b97STreehugger Robot	///      ;
*16467b97STreehugger Robot	///
*16467b97STreehugger Robot	/// The FOLLOW sets are all inclusive whereas context-sensitive
*16467b97STreehugger Robot	/// FOLLOW sets are precisely what could follow a rule reference.
*16467b97STreehugger Robot	/// For input input "i=(3);", here is the derivation:
*16467b97STreehugger Robot	///
*16467b97STreehugger Robot	/// stat => ID '=' expr ';'
*16467b97STreehugger Robot	///      => ID '=' atom ('+' atom)* ';'
*16467b97STreehugger Robot	///      => ID '=' '(' expr ')' ('+' atom)* ';'
*16467b97STreehugger Robot	///      => ID '=' '(' atom ')' ('+' atom)* ';'
*16467b97STreehugger Robot	///      => ID '=' '(' INT ')' ('+' atom)* ';'
*16467b97STreehugger Robot	///      => ID '=' '(' INT ')' ';'
*16467b97STreehugger Robot	///
*16467b97STreehugger Robot	/// At the "3" token, you'd have a call chain of
*16467b97STreehugger Robot	///
*16467b97STreehugger Robot	///   stat -> expr -> atom -> expr -> atom
*16467b97STreehugger Robot	///
*16467b97STreehugger Robot	/// What can follow that specific nested ref to atom?  Exactly ')'
*16467b97STreehugger Robot	/// as you can see by looking at the derivation of this specific
*16467b97STreehugger Robot	/// input.  Contrast this with the FOLLOW(atom)={'+',')',';','.'}.
*16467b97STreehugger Robot	///
*16467b97STreehugger Robot	/// You want the exact viable token set when recovering from a
*16467b97STreehugger Robot	/// token mismatch.  Upon token mismatch, if LA(1) is member of
*16467b97STreehugger Robot	/// the viable next token set, then you know there is most likely
*16467b97STreehugger Robot	/// a missing token in the input stream.  "Insert" one by just not
*16467b97STreehugger Robot	/// throwing an exception.
*16467b97STreehugger Robot	///
*16467b97STreehugger Robot    BitsetType*	computeCSRuleFollow();
*16467b97STreehugger Robot
*16467b97STreehugger Robot    /// Compute the current followset for the input stream.
*16467b97STreehugger Robot	///
*16467b97STreehugger Robot    BitsetType*	combineFollows(bool exact);
*16467b97STreehugger Robot
*16467b97STreehugger Robot    /// Attempt to recover from a single missing or extra token.
*16467b97STreehugger Robot	///
*16467b97STreehugger Robot	/// EXTRA TOKEN
*16467b97STreehugger Robot	///
*16467b97STreehugger Robot	/// LA(1) is not what we are looking for.  If LA(2) has the right token,
*16467b97STreehugger Robot	/// however, then assume LA(1) is some extra spurious token.  Delete it
*16467b97STreehugger Robot	/// and LA(2) as if we were doing a normal match(), which advances the
*16467b97STreehugger Robot	/// input.
*16467b97STreehugger Robot	///
*16467b97STreehugger Robot	/// MISSING TOKEN
*16467b97STreehugger Robot	///
*16467b97STreehugger Robot	/// If current token is consistent with what could come after
*16467b97STreehugger Robot	/// ttype then it is ok to "insert" the missing token, else throw
*16467b97STreehugger Robot	/// exception For example, Input "i=(3;" is clearly missing the
*16467b97STreehugger Robot	/// ')'.  When the parser returns from the nested call to expr, it
*16467b97STreehugger Robot	/// will have call chain:
*16467b97STreehugger Robot	///
*16467b97STreehugger Robot	///    stat -> expr -> atom
*16467b97STreehugger Robot	///
*16467b97STreehugger Robot	/// and it will be trying to match the ')' at this point in the
*16467b97STreehugger Robot	/// derivation:
*16467b97STreehugger Robot	///
*16467b97STreehugger Robot	///       => ID '=' '(' INT ')' ('+' atom)* ';'
*16467b97STreehugger Robot	///                          ^
*16467b97STreehugger Robot	/// match() will see that ';' doesn't match ')' and report a
*16467b97STreehugger Robot	/// mismatched token error.  To recover, it sees that LA(1)==';'
*16467b97STreehugger Robot	/// is in the set of tokens that can follow the ')' token
*16467b97STreehugger Robot	/// reference in rule atom.  It can assume that you forgot the ')'.
*16467b97STreehugger Robot	///
*16467b97STreehugger Robot	/// The exception that was passed in, in the java implementation is
*16467b97STreehugger Robot	/// sorted in the recognizer exception stack in the C version. To 'throw' it we set the
*16467b97STreehugger Robot	/// error flag and rules cascade back when this is set.
*16467b97STreehugger Robot	///
*16467b97STreehugger Robot    const UnitType* recoverFromMismatchedToken( ANTLR_UINT32	ttype, BitsetListType*	follow);
*16467b97STreehugger Robot
*16467b97STreehugger Robot    /** Function that recovers from a mismatched set in the token stream, in a similar manner
*16467b97STreehugger Robot     *  to (*recoverFromMismatchedToken)
*16467b97STreehugger Robot     */
*16467b97STreehugger Robot    const UnitType* recoverFromMismatchedSet(BitsetListType*	follow);
*16467b97STreehugger Robot
*16467b97STreehugger Robot    /** common routine to handle single token insertion for recovery functions.
*16467b97STreehugger Robot     */
*16467b97STreehugger Robot	/// This code is factored out from mismatched token and mismatched set
*16467b97STreehugger Robot	///  recovery.  It handles "single token insertion" error recovery for
*16467b97STreehugger Robot	/// both.  No tokens are consumed to recover from insertions.  Return
*16467b97STreehugger Robot	/// true if recovery was possible else return false.
*16467b97STreehugger Robot	///
*16467b97STreehugger Robot    bool	recoverFromMismatchedElement(BitsetListType*	follow);
*16467b97STreehugger Robot
*16467b97STreehugger Robot    /** function that consumes input until the next token matches
*16467b97STreehugger Robot     *  the given token.
*16467b97STreehugger Robot     */
*16467b97STreehugger Robot    void	consumeUntil(ANTLR_UINT32   tokenType);
*16467b97STreehugger Robot
*16467b97STreehugger Robot    /** function that consumes input until the next token matches
*16467b97STreehugger Robot     *  one in the given set.
*16467b97STreehugger Robot     */
*16467b97STreehugger Robot    void	consumeUntilSet(BitsetType*	set);
*16467b97STreehugger Robot
*16467b97STreehugger Robot    /** function that returns an ANTLR3_LIST of the strings that identify
*16467b97STreehugger Robot     *  the rules in the parser that got you to this point. Can be overridden by installing your
*16467b97STreehugger Robot     *	own function set.
*16467b97STreehugger Robot     *
*16467b97STreehugger Robot     * \todo Document how to override invocation stack functions.
*16467b97STreehugger Robot     */
*16467b97STreehugger Robot	StringStackType	getRuleInvocationStack();
*16467b97STreehugger Robot	StringStackType	getRuleInvocationStackNamed(ANTLR_UINT8*    name);
*16467b97STreehugger Robot
*16467b97STreehugger Robot    /** function that converts an ANLR3_LIST of tokens to an ANTLR3_LIST of
*16467b97STreehugger Robot     *  string token names. As this is mostly used in string template processing it may not be useful
*16467b97STreehugger Robot     *  in the C runtime.
*16467b97STreehugger Robot     */
*16467b97STreehugger Robot    StringListType	toStrings( const StringListType& );
*16467b97STreehugger Robot
*16467b97STreehugger Robot    /** function to return whether the rule has parsed input starting at the supplied
*16467b97STreehugger Robot     *  start index before. If the rule has not parsed input starting from the supplied start index,
*16467b97STreehugger Robot     *  then it will return ANTLR3_MEMO_RULE_UNKNOWN. If it has parsed from the suppled start point
*16467b97STreehugger Robot     *  then it will return the point where it last stopped parsing after that start point.
*16467b97STreehugger Robot     */
*16467b97STreehugger Robot    ANTLR_MARKER	getRuleMemoization( ANTLR_INTKEY	ruleIndex,
*16467b97STreehugger Robot												ANTLR_MARKER	ruleParseStart);
*16467b97STreehugger Robot
*16467b97STreehugger Robot    /** function that determines whether the rule has parsed input at the current index
*16467b97STreehugger Robot     *  in the input stream
*16467b97STreehugger Robot     */
*16467b97STreehugger Robot    bool	alreadyParsedRule(ANTLR_MARKER	ruleIndex);
*16467b97STreehugger Robot
*16467b97STreehugger Robot    /** Function that records whether the rule has parsed the input at a
*16467b97STreehugger Robot     *  current position successfully or not.
*16467b97STreehugger Robot     */
*16467b97STreehugger Robot    void	memoize(ANTLR_MARKER	ruleIndex,
*16467b97STreehugger Robot								ANTLR_MARKER	ruleParseStart);
*16467b97STreehugger Robot
*16467b97STreehugger Robot	/// Function that returns the current input symbol.
*16467b97STreehugger Robot    /// The is placed into any label for the associated token ref; e.g., x=ID.  Token
*16467b97STreehugger Robot	/// and tree parsers need to return different objects. Rather than test
*16467b97STreehugger Robot	/// for input stream type or change the IntStream interface, I use
*16467b97STreehugger Robot	/// a simple method to ask the recognizer to tell me what the current
*16467b97STreehugger Robot	/// input symbol is.
*16467b97STreehugger Robot	///
*16467b97STreehugger Robot	/// This is ignored for lexers and the lexer implementation of this
*16467b97STreehugger Robot	/// function should return NULL.
*16467b97STreehugger Robot	///
*16467b97STreehugger Robot	const UnitType*	getCurrentInputSymbol(IntStreamType* istream);
*16467b97STreehugger Robot	const UnitType*	getCurrentInputSymbol(IntStreamType* istream, ClassForwarder<LexerType>);
*16467b97STreehugger Robot	const UnitType*	getCurrentInputSymbol(IntStreamType* istream, ClassForwarder<ParserType>);
*16467b97STreehugger Robot	const UnitType*	getCurrentInputSymbol(IntStreamType* istream, ClassForwarder<TreeParserType>);
*16467b97STreehugger Robot
*16467b97STreehugger Robot	/// Conjure up a missing token during error recovery.
*16467b97STreehugger Robot	///
*16467b97STreehugger Robot	/// The recognizer attempts to recover from single missing
*16467b97STreehugger Robot	/// symbols. But, actions might refer to that missing symbol.
*16467b97STreehugger Robot	/// For example, x=ID {f($x);}. The action clearly assumes
*16467b97STreehugger Robot	/// that there has been an identifier matched previously and that
*16467b97STreehugger Robot	/// $x points at that token. If that token is missing, but
*16467b97STreehugger Robot	/// the next token in the stream is what we want we assume that
*16467b97STreehugger Robot	/// this token is missing and we keep going. Because we
*16467b97STreehugger Robot	/// have to return some token to replace the missing token,
*16467b97STreehugger Robot	/// we have to conjure one up. This method gives the user control
*16467b97STreehugger Robot	/// over the tokens returned for missing tokens. Mostly,
*16467b97STreehugger Robot	/// you will want to create something special for identifier
*16467b97STreehugger Robot	/// tokens. For literals such as '{' and ',', the default
*16467b97STreehugger Robot	/// action in the parser or tree parser works. It simply creates
*16467b97STreehugger Robot	/// a CommonToken of the appropriate type. The text will be the token.
*16467b97STreehugger Robot	/// If you change what tokens must be created by the lexer,
*16467b97STreehugger Robot	/// override this method to create the appropriate tokens.
*16467b97STreehugger Robot	///
*16467b97STreehugger Robot	UnitType*	getMissingSymbol( IntStreamType*		istream, ExceptionBaseType*		e,
*16467b97STreehugger Robot												ANTLR_UINT32			expectedTokenType,
*16467b97STreehugger Robot												BitsetListType*		follow);
*16467b97STreehugger Robot
*16467b97STreehugger Robot    /** Function that returns whether the supplied grammar function
*16467b97STreehugger Robot     *  will parse the current input stream or not. This is the way that syntactic
*16467b97STreehugger Robot     *  predicates are evaluated. Unlike java, C is perfectly happy to invoke code
*16467b97STreehugger Robot     *  via a pointer to a function (hence that's what all the ANTLR3 C interfaces
*16467b97STreehugger Robot     *  do.
*16467b97STreehugger Robot     */
*16467b97STreehugger Robot	template<typename Predicate>
*16467b97STreehugger Robot    bool  synpred( ClassForwarder<Predicate> );
*16467b97STreehugger Robot
*16467b97STreehugger Robot	//In place of exConstruct, just directly instantiate the Exception Object
*16467b97STreehugger Robot
*16467b97STreehugger Robot    /** Reset the recognizer
*16467b97STreehugger Robot     */
*16467b97STreehugger Robot    void  reset();
*16467b97STreehugger Robot	void  reset( ClassForwarder<LexerType> );
*16467b97STreehugger Robot	template<typename CompType>
*16467b97STreehugger Robot	void  reset( ClassForwarder<CompType> );
*16467b97STreehugger Robot
*16467b97STreehugger Robot	void exConstruct();
*16467b97STreehugger Robot
*16467b97STreehugger Robot    ~BaseRecognizer();
*16467b97STreehugger Robot
*16467b97STreehugger Robot};
*16467b97STreehugger Robot
*16467b97STreehugger RobotANTLR_END_NAMESPACE()
*16467b97STreehugger Robot
*16467b97STreehugger Robot#include "antlr3baserecognizer.inl"
*16467b97STreehugger Robot
*16467b97STreehugger Robot/// @}
*16467b97STreehugger Robot///
*16467b97STreehugger Robot
*16467b97STreehugger Robot#endif	    /* _ANTLR3_BASERECOGNIZER_H	*/
*16467b97STreehugger Robot