1*16467b97STreehugger Robot /** \file 2*16467b97STreehugger Robot * Defines the basic structure to support recognizing by either a lexer, 3*16467b97STreehugger Robot * parser, or tree parser. 4*16467b97STreehugger Robot * \addtogroup BaseRecognizer 5*16467b97STreehugger Robot * @{ 6*16467b97STreehugger Robot */ 7*16467b97STreehugger Robot #ifndef _ANTLR3_BASERECOGNIZER_HPP 8*16467b97STreehugger Robot #define _ANTLR3_BASERECOGNIZER_HPP 9*16467b97STreehugger Robot 10*16467b97STreehugger Robot // [The "BSD licence"] 11*16467b97STreehugger Robot // Copyright (c) 2005-2009 Gokulakannan Somasundaram, ElectronDB 12*16467b97STreehugger Robot 13*16467b97STreehugger Robot // 14*16467b97STreehugger Robot // All rights reserved. 15*16467b97STreehugger Robot // 16*16467b97STreehugger Robot // Redistribution and use in source and binary forms, with or without 17*16467b97STreehugger Robot // modification, are permitted provided that the following conditions 18*16467b97STreehugger Robot // are met: 19*16467b97STreehugger Robot // 1. Redistributions of source code must retain the above copyright 20*16467b97STreehugger Robot // notice, this list of conditions and the following disclaimer. 21*16467b97STreehugger Robot // 2. Redistributions in binary form must reproduce the above copyright 22*16467b97STreehugger Robot // notice, this list of conditions and the following disclaimer in the 23*16467b97STreehugger Robot // documentation and/or other materials provided with the distribution. 24*16467b97STreehugger Robot // 3. The name of the author may not be used to endorse or promote products 25*16467b97STreehugger Robot // derived from this software without specific prior written permission. 26*16467b97STreehugger Robot // 27*16467b97STreehugger Robot // THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 28*16467b97STreehugger Robot // IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 29*16467b97STreehugger Robot // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 30*16467b97STreehugger Robot // IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 31*16467b97STreehugger Robot // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 32*16467b97STreehugger Robot // NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 33*16467b97STreehugger Robot // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 34*16467b97STreehugger Robot // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 35*16467b97STreehugger Robot // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 36*16467b97STreehugger Robot // THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 37*16467b97STreehugger Robot 38*16467b97STreehugger Robot #include "antlr3defs.hpp" 39*16467b97STreehugger Robot #include "antlr3collections.hpp" 40*16467b97STreehugger Robot 41*16467b97STreehugger Robot ANTLR_BEGIN_NAMESPACE() 42*16467b97STreehugger Robot 43*16467b97STreehugger Robot /** \brief Base tracking context structure for all types of 44*16467b97STreehugger Robot * recognizers. 45*16467b97STreehugger Robot */ 46*16467b97STreehugger Robot template< class ImplTraits, class StreamType > 47*16467b97STreehugger Robot class BaseRecognizer : public ImplTraits::AllocPolicyType 48*16467b97STreehugger Robot { 49*16467b97STreehugger Robot public: 50*16467b97STreehugger Robot typedef typename ImplTraits::AllocPolicyType AllocPolicyType; 51*16467b97STreehugger Robot typedef typename StreamType::IntStreamType IntStreamType; 52*16467b97STreehugger Robot typedef typename ComponentTypeFinder<ImplTraits, StreamType>::ComponentType SuperType; 53*16467b97STreehugger Robot typedef typename StreamType::UnitType UnitType; 54*16467b97STreehugger Robot typedef typename ImplTraits::template ExceptionBaseType<StreamType> ExceptionBaseType; 55*16467b97STreehugger Robot typedef typename ImplTraits::BitsetType BitsetType; 56*16467b97STreehugger Robot typedef typename ImplTraits::BitsetListType BitsetListType; 57*16467b97STreehugger Robot typedef typename ImplTraits::StringType StringType; 58*16467b97STreehugger Robot typedef typename ImplTraits::template RecognizerSharedStateType<StreamType> RecognizerSharedStateType; 59*16467b97STreehugger Robot typedef typename ImplTraits::DebugEventListenerType DebugEventListenerType; 60*16467b97STreehugger Robot typedef typename ImplTraits::LexerType LexerType; 61*16467b97STreehugger Robot typedef typename ImplTraits::ParserType ParserType; 62*16467b97STreehugger Robot typedef typename ImplTraits::TreeParserType TreeParserType; 63*16467b97STreehugger Robot 64*16467b97STreehugger Robot typedef typename AllocPolicyType::template StackType<StringType> StringStackType; 65*16467b97STreehugger Robot typedef typename AllocPolicyType::template ListType<StringType> StringListType; 66*16467b97STreehugger Robot 67*16467b97STreehugger Robot private: 68*16467b97STreehugger Robot /// A pointer to the shared recognizer state, such that multiple 69*16467b97STreehugger Robot /// recognizers can use the same inputs streams and so on (in 70*16467b97STreehugger Robot /// the case of grammar inheritance for instance. 71*16467b97STreehugger Robot /// 72*16467b97STreehugger Robot RecognizerSharedStateType* m_state; 73*16467b97STreehugger Robot 74*16467b97STreehugger Robot /// If set to something other than NULL, then this structure is 75*16467b97STreehugger Robot /// points to an instance of the debugger interface. In general, the 76*16467b97STreehugger Robot /// debugger is only referenced internally in recovery/error operations 77*16467b97STreehugger Robot /// so that it does not cause overhead by having to check this pointer 78*16467b97STreehugger Robot /// in every function/method 79*16467b97STreehugger Robot /// 80*16467b97STreehugger Robot DebugEventListenerType* m_debugger; 81*16467b97STreehugger Robot 82*16467b97STreehugger Robot 83*16467b97STreehugger Robot public: 84*16467b97STreehugger Robot BaseRecognizer(ANTLR_UINT32 sizeHint, RecognizerSharedStateType* state); 85*16467b97STreehugger Robot 86*16467b97STreehugger Robot SuperType* get_super(); 87*16467b97STreehugger Robot RecognizerSharedStateType* get_state() const; 88*16467b97STreehugger Robot DebugEventListenerType* get_debugger() const; 89*16467b97STreehugger Robot void set_state( RecognizerSharedStateType* state ); 90*16467b97STreehugger Robot void set_debugger( DebugEventListenerType* debugger ); 91*16467b97STreehugger Robot 92*16467b97STreehugger Robot /// Match current input symbol against ttype. Upon error, do one token 93*16467b97STreehugger Robot /// insertion or deletion if possible. 94*16467b97STreehugger Robot /// To turn off single token insertion or deletion error 95*16467b97STreehugger Robot /// recovery, override mismatchRecover() and have it call 96*16467b97STreehugger Robot /// plain mismatch(), which does not recover. Then any error 97*16467b97STreehugger Robot /// in a rule will cause an exception and immediate exit from 98*16467b97STreehugger Robot /// rule. Rule would recover by resynchronizing to the set of 99*16467b97STreehugger Robot /// symbols that can follow rule ref. 100*16467b97STreehugger Robot /// 101*16467b97STreehugger Robot const UnitType* match(ANTLR_UINT32 ttype, BitsetListType* follow); 102*16467b97STreehugger Robot 103*16467b97STreehugger Robot /// Consumes the next token, whatever it is, and resets the recognizer state 104*16467b97STreehugger Robot /// so that it is not in error. 105*16467b97STreehugger Robot /// 106*16467b97STreehugger Robot /// \param recognizer 107*16467b97STreehugger Robot /// Recognizer context pointer 108*16467b97STreehugger Robot /// 109*16467b97STreehugger Robot void matchAny(); 110*16467b97STreehugger Robot 111*16467b97STreehugger Robot /// function that decides if the token ahead of the current one is the 112*16467b97STreehugger Robot /// one we were loking for, in which case the curernt one is very likely extraneous 113*16467b97STreehugger Robot /// and can be reported that way. 114*16467b97STreehugger Robot /// 115*16467b97STreehugger Robot bool mismatchIsUnwantedToken(IntStreamType* input, ANTLR_UINT32 ttype); 116*16467b97STreehugger Robot 117*16467b97STreehugger Robot /// function that decides if the current token is one that can logically 118*16467b97STreehugger Robot /// follow the one we were looking for, in which case the one we were looking for is 119*16467b97STreehugger Robot /// probably missing from the input. 120*16467b97STreehugger Robot /// 121*16467b97STreehugger Robot bool mismatchIsMissingToken(IntStreamType* input, BitsetListType* follow); 122*16467b97STreehugger Robot 123*16467b97STreehugger Robot /// Factor out what to do upon token mismatch so tree parsers can behave 124*16467b97STreehugger Robot /// differently. Override and call mismatchRecover(input, ttype, follow) 125*16467b97STreehugger Robot /// to get single token insertion and deletion. Use this to turn off 126*16467b97STreehugger Robot /// single token insertion and deletion. Override mismatchRecover 127*16467b97STreehugger Robot /// to call this instead. 128*16467b97STreehugger Robot /// 129*16467b97STreehugger Robot /// \remark mismatch only works for parsers and must be overridden for anything else. 130*16467b97STreehugger Robot /// 131*16467b97STreehugger Robot void mismatch(ANTLR_UINT32 ttype, BitsetListType* follow); 132*16467b97STreehugger Robot 133*16467b97STreehugger Robot /// Report a recognition problem. 134*16467b97STreehugger Robot /// 135*16467b97STreehugger Robot /// This method sets errorRecovery to indicate the parser is recovering 136*16467b97STreehugger Robot /// not parsing. Once in recovery mode, no errors are generated. 137*16467b97STreehugger Robot /// To get out of recovery mode, the parser must successfully match 138*16467b97STreehugger Robot /// a token (after a resync). So it will go: 139*16467b97STreehugger Robot /// 140*16467b97STreehugger Robot /// 1. error occurs 141*16467b97STreehugger Robot /// 2. enter recovery mode, report error 142*16467b97STreehugger Robot /// 3. consume until token found in resynch set 143*16467b97STreehugger Robot /// 4. try to resume parsing 144*16467b97STreehugger Robot /// 5. next match() will reset errorRecovery mode 145*16467b97STreehugger Robot /// 146*16467b97STreehugger Robot /// If you override, make sure to update errorCount if you care about that. 147*16467b97STreehugger Robot /// 148*16467b97STreehugger Robot void reportError(); 149*16467b97STreehugger Robot void reportError( ClassForwarder<LexerType> ); 150*16467b97STreehugger Robot template<typename CompType> 151*16467b97STreehugger Robot void reportError( ClassForwarder<CompType> ); 152*16467b97STreehugger Robot 153*16467b97STreehugger Robot /** Function that is called to display a recognition error message. You may 154*16467b97STreehugger Robot * override this function independently of (*reportError)() above as that function calls 155*16467b97STreehugger Robot * this one to do the actual exception printing. 156*16467b97STreehugger Robot */ 157*16467b97STreehugger Robot void displayRecognitionError(ANTLR_UINT8** tokenNames); 158*16467b97STreehugger Robot 159*16467b97STreehugger Robot /// Get number of recognition errors (lexer, parser, tree parser). Each 160*16467b97STreehugger Robot /// recognizer tracks its own number. So parser and lexer each have 161*16467b97STreehugger Robot /// separate count. Does not count the spurious errors found between 162*16467b97STreehugger Robot /// an error and next valid token match 163*16467b97STreehugger Robot /// 164*16467b97STreehugger Robot /// \see reportError() 165*16467b97STreehugger Robot /// 166*16467b97STreehugger Robot ANTLR_UINT32 getNumberOfSyntaxErrors(); 167*16467b97STreehugger Robot 168*16467b97STreehugger Robot /** Function that recovers from an error found in the input stream. 169*16467b97STreehugger Robot * Generally, this will be a #ANTLR3_EXCEPTION_NOVIABLE_ALT but it could also 170*16467b97STreehugger Robot * be from a mismatched token that the (*match)() could not recover from. 171*16467b97STreehugger Robot */ 172*16467b97STreehugger Robot void recover(); 173*16467b97STreehugger Robot 174*16467b97STreehugger Robot /** function that is a hook to listen to token consumption during error recovery. 175*16467b97STreehugger Robot * This is mainly used by the debug parser to send events to the listener. 176*16467b97STreehugger Robot */ 177*16467b97STreehugger Robot void beginResync(); 178*16467b97STreehugger Robot 179*16467b97STreehugger Robot /** function that is a hook to listen to token consumption during error recovery. 180*16467b97STreehugger Robot * This is mainly used by the debug parser to send events to the listener. 181*16467b97STreehugger Robot */ 182*16467b97STreehugger Robot void endResync(); 183*16467b97STreehugger Robot 184*16467b97STreehugger Robot /** function that is a hook to listen to token consumption during error recovery. 185*16467b97STreehugger Robot * This is mainly used by the debug parser to send events to the listener. 186*16467b97STreehugger Robot */ 187*16467b97STreehugger Robot void beginBacktrack(ANTLR_UINT32 level); 188*16467b97STreehugger Robot 189*16467b97STreehugger Robot /** function that is a hook to listen to token consumption during error recovery. 190*16467b97STreehugger Robot * This is mainly used by the debug parser to send events to the listener. 191*16467b97STreehugger Robot */ 192*16467b97STreehugger Robot void endBacktrack(ANTLR_UINT32 level, bool successful); 193*16467b97STreehugger Robot 194*16467b97STreehugger Robot /// Compute the error recovery set for the current rule. 195*16467b97STreehugger Robot /// Documentation below is from the Java implementation. 196*16467b97STreehugger Robot /// 197*16467b97STreehugger Robot /// During rule invocation, the parser pushes the set of tokens that can 198*16467b97STreehugger Robot /// follow that rule reference on the stack; this amounts to 199*16467b97STreehugger Robot /// computing FIRST of what follows the rule reference in the 200*16467b97STreehugger Robot /// enclosing rule. This local follow set only includes tokens 201*16467b97STreehugger Robot /// from within the rule; i.e., the FIRST computation done by 202*16467b97STreehugger Robot /// ANTLR stops at the end of a rule. 203*16467b97STreehugger Robot // 204*16467b97STreehugger Robot /// EXAMPLE 205*16467b97STreehugger Robot // 206*16467b97STreehugger Robot /// When you find a "no viable alt exception", the input is not 207*16467b97STreehugger Robot /// consistent with any of the alternatives for rule r. The best 208*16467b97STreehugger Robot /// thing to do is to consume tokens until you see something that 209*16467b97STreehugger Robot /// can legally follow a call to r *or* any rule that called r. 210*16467b97STreehugger Robot /// You don't want the exact set of viable next tokens because the 211*16467b97STreehugger Robot /// input might just be missing a token--you might consume the 212*16467b97STreehugger Robot /// rest of the input looking for one of the missing tokens. 213*16467b97STreehugger Robot /// 214*16467b97STreehugger Robot /// Consider grammar: 215*16467b97STreehugger Robot /// 216*16467b97STreehugger Robot /// a : '[' b ']' 217*16467b97STreehugger Robot /// | '(' b ')' 218*16467b97STreehugger Robot /// ; 219*16467b97STreehugger Robot /// b : c '^' INT ; 220*16467b97STreehugger Robot /// c : ID 221*16467b97STreehugger Robot /// | INT 222*16467b97STreehugger Robot /// ; 223*16467b97STreehugger Robot /// 224*16467b97STreehugger Robot /// At each rule invocation, the set of tokens that could follow 225*16467b97STreehugger Robot /// that rule is pushed on a stack. Here are the various "local" 226*16467b97STreehugger Robot /// follow sets: 227*16467b97STreehugger Robot /// 228*16467b97STreehugger Robot /// FOLLOW(b1_in_a) = FIRST(']') = ']' 229*16467b97STreehugger Robot /// FOLLOW(b2_in_a) = FIRST(')') = ')' 230*16467b97STreehugger Robot /// FOLLOW(c_in_b) = FIRST('^') = '^' 231*16467b97STreehugger Robot /// 232*16467b97STreehugger Robot /// Upon erroneous input "[]", the call chain is 233*16467b97STreehugger Robot /// 234*16467b97STreehugger Robot /// a -> b -> c 235*16467b97STreehugger Robot /// 236*16467b97STreehugger Robot /// and, hence, the follow context stack is: 237*16467b97STreehugger Robot /// 238*16467b97STreehugger Robot /// depth local follow set after call to rule 239*16467b97STreehugger Robot /// 0 <EOF> a (from main()) 240*16467b97STreehugger Robot /// 1 ']' b 241*16467b97STreehugger Robot /// 3 '^' c 242*16467b97STreehugger Robot /// 243*16467b97STreehugger Robot /// Notice that ')' is not included, because b would have to have 244*16467b97STreehugger Robot /// been called from a different context in rule a for ')' to be 245*16467b97STreehugger Robot /// included. 246*16467b97STreehugger Robot /// 247*16467b97STreehugger Robot /// For error recovery, we cannot consider FOLLOW(c) 248*16467b97STreehugger Robot /// (context-sensitive or otherwise). We need the combined set of 249*16467b97STreehugger Robot /// all context-sensitive FOLLOW sets--the set of all tokens that 250*16467b97STreehugger Robot /// could follow any reference in the call chain. We need to 251*16467b97STreehugger Robot /// resync to one of those tokens. Note that FOLLOW(c)='^' and if 252*16467b97STreehugger Robot /// we resync'd to that token, we'd consume until EOF. We need to 253*16467b97STreehugger Robot /// sync to context-sensitive FOLLOWs for a, b, and c: {']','^'}. 254*16467b97STreehugger Robot /// In this case, for input "[]", LA(1) is in this set so we would 255*16467b97STreehugger Robot /// not consume anything and after printing an error rule c would 256*16467b97STreehugger Robot /// return normally. It would not find the required '^' though. 257*16467b97STreehugger Robot /// At this point, it gets a mismatched token error and throws an 258*16467b97STreehugger Robot /// exception (since LA(1) is not in the viable following token 259*16467b97STreehugger Robot /// set). The rule exception handler tries to recover, but finds 260*16467b97STreehugger Robot /// the same recovery set and doesn't consume anything. Rule b 261*16467b97STreehugger Robot /// exits normally returning to rule a. Now it finds the ']' (and 262*16467b97STreehugger Robot /// with the successful match exits errorRecovery mode). 263*16467b97STreehugger Robot /// 264*16467b97STreehugger Robot /// So, you can see that the parser walks up call chain looking 265*16467b97STreehugger Robot /// for the token that was a member of the recovery set. 266*16467b97STreehugger Robot /// 267*16467b97STreehugger Robot /// Errors are not generated in errorRecovery mode. 268*16467b97STreehugger Robot /// 269*16467b97STreehugger Robot /// ANTLR's error recovery mechanism is based upon original ideas: 270*16467b97STreehugger Robot /// 271*16467b97STreehugger Robot /// "Algorithms + Data Structures = Programs" by Niklaus Wirth 272*16467b97STreehugger Robot /// 273*16467b97STreehugger Robot /// and 274*16467b97STreehugger Robot /// 275*16467b97STreehugger Robot /// "A note on error recovery in recursive descent parsers": 276*16467b97STreehugger Robot /// http://portal.acm.org/citation.cfm?id=947902.947905 277*16467b97STreehugger Robot /// 278*16467b97STreehugger Robot /// Later, Josef Grosch had some good ideas: 279*16467b97STreehugger Robot /// 280*16467b97STreehugger Robot /// "Efficient and Comfortable Error Recovery in Recursive Descent 281*16467b97STreehugger Robot /// Parsers": 282*16467b97STreehugger Robot /// ftp://www.cocolab.com/products/cocktail/doca4.ps/ell.ps.zip 283*16467b97STreehugger Robot /// 284*16467b97STreehugger Robot /// Like Grosch I implemented local FOLLOW sets that are combined 285*16467b97STreehugger Robot /// at run-time upon error to avoid overhead during parsing. 286*16467b97STreehugger Robot /// 287*16467b97STreehugger Robot BitsetType* computeErrorRecoverySet(); 288*16467b97STreehugger Robot 289*16467b97STreehugger Robot /// Compute the context-sensitive FOLLOW set for current rule. 290*16467b97STreehugger Robot /// Documentation below is from the Java runtime. 291*16467b97STreehugger Robot /// 292*16467b97STreehugger Robot /// This is the set of token types that can follow a specific rule 293*16467b97STreehugger Robot /// reference given a specific call chain. You get the set of 294*16467b97STreehugger Robot /// viable tokens that can possibly come next (look ahead depth 1) 295*16467b97STreehugger Robot /// given the current call chain. Contrast this with the 296*16467b97STreehugger Robot /// definition of plain FOLLOW for rule r: 297*16467b97STreehugger Robot /// 298*16467b97STreehugger Robot /// FOLLOW(r)={x | S=>*alpha r beta in G and x in FIRST(beta)} 299*16467b97STreehugger Robot /// 300*16467b97STreehugger Robot /// where x in T* and alpha, beta in V*; T is set of terminals and 301*16467b97STreehugger Robot /// V is the set of terminals and non terminals. In other words, 302*16467b97STreehugger Robot /// FOLLOW(r) is the set of all tokens that can possibly follow 303*16467b97STreehugger Robot /// references to r in///any* sentential form (context). At 304*16467b97STreehugger Robot /// runtime, however, we know precisely which context applies as 305*16467b97STreehugger Robot /// we have the call chain. We may compute the exact (rather 306*16467b97STreehugger Robot /// than covering superset) set of following tokens. 307*16467b97STreehugger Robot /// 308*16467b97STreehugger Robot /// For example, consider grammar: 309*16467b97STreehugger Robot /// 310*16467b97STreehugger Robot /// stat : ID '=' expr ';' // FOLLOW(stat)=={EOF} 311*16467b97STreehugger Robot /// | "return" expr '.' 312*16467b97STreehugger Robot /// ; 313*16467b97STreehugger Robot /// expr : atom ('+' atom)* ; // FOLLOW(expr)=={';','.',')'} 314*16467b97STreehugger Robot /// atom : INT // FOLLOW(atom)=={'+',')',';','.'} 315*16467b97STreehugger Robot /// | '(' expr ')' 316*16467b97STreehugger Robot /// ; 317*16467b97STreehugger Robot /// 318*16467b97STreehugger Robot /// The FOLLOW sets are all inclusive whereas context-sensitive 319*16467b97STreehugger Robot /// FOLLOW sets are precisely what could follow a rule reference. 320*16467b97STreehugger Robot /// For input input "i=(3);", here is the derivation: 321*16467b97STreehugger Robot /// 322*16467b97STreehugger Robot /// stat => ID '=' expr ';' 323*16467b97STreehugger Robot /// => ID '=' atom ('+' atom)* ';' 324*16467b97STreehugger Robot /// => ID '=' '(' expr ')' ('+' atom)* ';' 325*16467b97STreehugger Robot /// => ID '=' '(' atom ')' ('+' atom)* ';' 326*16467b97STreehugger Robot /// => ID '=' '(' INT ')' ('+' atom)* ';' 327*16467b97STreehugger Robot /// => ID '=' '(' INT ')' ';' 328*16467b97STreehugger Robot /// 329*16467b97STreehugger Robot /// At the "3" token, you'd have a call chain of 330*16467b97STreehugger Robot /// 331*16467b97STreehugger Robot /// stat -> expr -> atom -> expr -> atom 332*16467b97STreehugger Robot /// 333*16467b97STreehugger Robot /// What can follow that specific nested ref to atom? Exactly ')' 334*16467b97STreehugger Robot /// as you can see by looking at the derivation of this specific 335*16467b97STreehugger Robot /// input. Contrast this with the FOLLOW(atom)={'+',')',';','.'}. 336*16467b97STreehugger Robot /// 337*16467b97STreehugger Robot /// You want the exact viable token set when recovering from a 338*16467b97STreehugger Robot /// token mismatch. Upon token mismatch, if LA(1) is member of 339*16467b97STreehugger Robot /// the viable next token set, then you know there is most likely 340*16467b97STreehugger Robot /// a missing token in the input stream. "Insert" one by just not 341*16467b97STreehugger Robot /// throwing an exception. 342*16467b97STreehugger Robot /// 343*16467b97STreehugger Robot BitsetType* computeCSRuleFollow(); 344*16467b97STreehugger Robot 345*16467b97STreehugger Robot /// Compute the current followset for the input stream. 346*16467b97STreehugger Robot /// 347*16467b97STreehugger Robot BitsetType* combineFollows(bool exact); 348*16467b97STreehugger Robot 349*16467b97STreehugger Robot /// Attempt to recover from a single missing or extra token. 350*16467b97STreehugger Robot /// 351*16467b97STreehugger Robot /// EXTRA TOKEN 352*16467b97STreehugger Robot /// 353*16467b97STreehugger Robot /// LA(1) is not what we are looking for. If LA(2) has the right token, 354*16467b97STreehugger Robot /// however, then assume LA(1) is some extra spurious token. Delete it 355*16467b97STreehugger Robot /// and LA(2) as if we were doing a normal match(), which advances the 356*16467b97STreehugger Robot /// input. 357*16467b97STreehugger Robot /// 358*16467b97STreehugger Robot /// MISSING TOKEN 359*16467b97STreehugger Robot /// 360*16467b97STreehugger Robot /// If current token is consistent with what could come after 361*16467b97STreehugger Robot /// ttype then it is ok to "insert" the missing token, else throw 362*16467b97STreehugger Robot /// exception For example, Input "i=(3;" is clearly missing the 363*16467b97STreehugger Robot /// ')'. When the parser returns from the nested call to expr, it 364*16467b97STreehugger Robot /// will have call chain: 365*16467b97STreehugger Robot /// 366*16467b97STreehugger Robot /// stat -> expr -> atom 367*16467b97STreehugger Robot /// 368*16467b97STreehugger Robot /// and it will be trying to match the ')' at this point in the 369*16467b97STreehugger Robot /// derivation: 370*16467b97STreehugger Robot /// 371*16467b97STreehugger Robot /// => ID '=' '(' INT ')' ('+' atom)* ';' 372*16467b97STreehugger Robot /// ^ 373*16467b97STreehugger Robot /// match() will see that ';' doesn't match ')' and report a 374*16467b97STreehugger Robot /// mismatched token error. To recover, it sees that LA(1)==';' 375*16467b97STreehugger Robot /// is in the set of tokens that can follow the ')' token 376*16467b97STreehugger Robot /// reference in rule atom. It can assume that you forgot the ')'. 377*16467b97STreehugger Robot /// 378*16467b97STreehugger Robot /// The exception that was passed in, in the java implementation is 379*16467b97STreehugger Robot /// sorted in the recognizer exception stack in the C version. To 'throw' it we set the 380*16467b97STreehugger Robot /// error flag and rules cascade back when this is set. 381*16467b97STreehugger Robot /// 382*16467b97STreehugger Robot const UnitType* recoverFromMismatchedToken( ANTLR_UINT32 ttype, BitsetListType* follow); 383*16467b97STreehugger Robot 384*16467b97STreehugger Robot /** Function that recovers from a mismatched set in the token stream, in a similar manner 385*16467b97STreehugger Robot * to (*recoverFromMismatchedToken) 386*16467b97STreehugger Robot */ 387*16467b97STreehugger Robot const UnitType* recoverFromMismatchedSet(BitsetListType* follow); 388*16467b97STreehugger Robot 389*16467b97STreehugger Robot /** common routine to handle single token insertion for recovery functions. 390*16467b97STreehugger Robot */ 391*16467b97STreehugger Robot /// This code is factored out from mismatched token and mismatched set 392*16467b97STreehugger Robot /// recovery. It handles "single token insertion" error recovery for 393*16467b97STreehugger Robot /// both. No tokens are consumed to recover from insertions. Return 394*16467b97STreehugger Robot /// true if recovery was possible else return false. 395*16467b97STreehugger Robot /// 396*16467b97STreehugger Robot bool recoverFromMismatchedElement(BitsetListType* follow); 397*16467b97STreehugger Robot 398*16467b97STreehugger Robot /** function that consumes input until the next token matches 399*16467b97STreehugger Robot * the given token. 400*16467b97STreehugger Robot */ 401*16467b97STreehugger Robot void consumeUntil(ANTLR_UINT32 tokenType); 402*16467b97STreehugger Robot 403*16467b97STreehugger Robot /** function that consumes input until the next token matches 404*16467b97STreehugger Robot * one in the given set. 405*16467b97STreehugger Robot */ 406*16467b97STreehugger Robot void consumeUntilSet(BitsetType* set); 407*16467b97STreehugger Robot 408*16467b97STreehugger Robot /** function that returns an ANTLR3_LIST of the strings that identify 409*16467b97STreehugger Robot * the rules in the parser that got you to this point. Can be overridden by installing your 410*16467b97STreehugger Robot * own function set. 411*16467b97STreehugger Robot * 412*16467b97STreehugger Robot * \todo Document how to override invocation stack functions. 413*16467b97STreehugger Robot */ 414*16467b97STreehugger Robot StringStackType getRuleInvocationStack(); 415*16467b97STreehugger Robot StringStackType getRuleInvocationStackNamed(ANTLR_UINT8* name); 416*16467b97STreehugger Robot 417*16467b97STreehugger Robot /** function that converts an ANLR3_LIST of tokens to an ANTLR3_LIST of 418*16467b97STreehugger Robot * string token names. As this is mostly used in string template processing it may not be useful 419*16467b97STreehugger Robot * in the C runtime. 420*16467b97STreehugger Robot */ 421*16467b97STreehugger Robot StringListType toStrings( const StringListType& ); 422*16467b97STreehugger Robot 423*16467b97STreehugger Robot /** function to return whether the rule has parsed input starting at the supplied 424*16467b97STreehugger Robot * start index before. If the rule has not parsed input starting from the supplied start index, 425*16467b97STreehugger Robot * then it will return ANTLR3_MEMO_RULE_UNKNOWN. If it has parsed from the suppled start point 426*16467b97STreehugger Robot * then it will return the point where it last stopped parsing after that start point. 427*16467b97STreehugger Robot */ 428*16467b97STreehugger Robot ANTLR_MARKER getRuleMemoization( ANTLR_INTKEY ruleIndex, 429*16467b97STreehugger Robot ANTLR_MARKER ruleParseStart); 430*16467b97STreehugger Robot 431*16467b97STreehugger Robot /** function that determines whether the rule has parsed input at the current index 432*16467b97STreehugger Robot * in the input stream 433*16467b97STreehugger Robot */ 434*16467b97STreehugger Robot bool alreadyParsedRule(ANTLR_MARKER ruleIndex); 435*16467b97STreehugger Robot 436*16467b97STreehugger Robot /** Function that records whether the rule has parsed the input at a 437*16467b97STreehugger Robot * current position successfully or not. 438*16467b97STreehugger Robot */ 439*16467b97STreehugger Robot void memoize(ANTLR_MARKER ruleIndex, 440*16467b97STreehugger Robot ANTLR_MARKER ruleParseStart); 441*16467b97STreehugger Robot 442*16467b97STreehugger Robot /// Function that returns the current input symbol. 443*16467b97STreehugger Robot /// The is placed into any label for the associated token ref; e.g., x=ID. Token 444*16467b97STreehugger Robot /// and tree parsers need to return different objects. Rather than test 445*16467b97STreehugger Robot /// for input stream type or change the IntStream interface, I use 446*16467b97STreehugger Robot /// a simple method to ask the recognizer to tell me what the current 447*16467b97STreehugger Robot /// input symbol is. 448*16467b97STreehugger Robot /// 449*16467b97STreehugger Robot /// This is ignored for lexers and the lexer implementation of this 450*16467b97STreehugger Robot /// function should return NULL. 451*16467b97STreehugger Robot /// 452*16467b97STreehugger Robot const UnitType* getCurrentInputSymbol(IntStreamType* istream); 453*16467b97STreehugger Robot const UnitType* getCurrentInputSymbol(IntStreamType* istream, ClassForwarder<LexerType>); 454*16467b97STreehugger Robot const UnitType* getCurrentInputSymbol(IntStreamType* istream, ClassForwarder<ParserType>); 455*16467b97STreehugger Robot const UnitType* getCurrentInputSymbol(IntStreamType* istream, ClassForwarder<TreeParserType>); 456*16467b97STreehugger Robot 457*16467b97STreehugger Robot /// Conjure up a missing token during error recovery. 458*16467b97STreehugger Robot /// 459*16467b97STreehugger Robot /// The recognizer attempts to recover from single missing 460*16467b97STreehugger Robot /// symbols. But, actions might refer to that missing symbol. 461*16467b97STreehugger Robot /// For example, x=ID {f($x);}. The action clearly assumes 462*16467b97STreehugger Robot /// that there has been an identifier matched previously and that 463*16467b97STreehugger Robot /// $x points at that token. If that token is missing, but 464*16467b97STreehugger Robot /// the next token in the stream is what we want we assume that 465*16467b97STreehugger Robot /// this token is missing and we keep going. Because we 466*16467b97STreehugger Robot /// have to return some token to replace the missing token, 467*16467b97STreehugger Robot /// we have to conjure one up. This method gives the user control 468*16467b97STreehugger Robot /// over the tokens returned for missing tokens. Mostly, 469*16467b97STreehugger Robot /// you will want to create something special for identifier 470*16467b97STreehugger Robot /// tokens. For literals such as '{' and ',', the default 471*16467b97STreehugger Robot /// action in the parser or tree parser works. It simply creates 472*16467b97STreehugger Robot /// a CommonToken of the appropriate type. The text will be the token. 473*16467b97STreehugger Robot /// If you change what tokens must be created by the lexer, 474*16467b97STreehugger Robot /// override this method to create the appropriate tokens. 475*16467b97STreehugger Robot /// 476*16467b97STreehugger Robot UnitType* getMissingSymbol( IntStreamType* istream, ExceptionBaseType* e, 477*16467b97STreehugger Robot ANTLR_UINT32 expectedTokenType, 478*16467b97STreehugger Robot BitsetListType* follow); 479*16467b97STreehugger Robot 480*16467b97STreehugger Robot /** Function that returns whether the supplied grammar function 481*16467b97STreehugger Robot * will parse the current input stream or not. This is the way that syntactic 482*16467b97STreehugger Robot * predicates are evaluated. Unlike java, C is perfectly happy to invoke code 483*16467b97STreehugger Robot * via a pointer to a function (hence that's what all the ANTLR3 C interfaces 484*16467b97STreehugger Robot * do. 485*16467b97STreehugger Robot */ 486*16467b97STreehugger Robot template<typename Predicate> 487*16467b97STreehugger Robot bool synpred( ClassForwarder<Predicate> ); 488*16467b97STreehugger Robot 489*16467b97STreehugger Robot //In place of exConstruct, just directly instantiate the Exception Object 490*16467b97STreehugger Robot 491*16467b97STreehugger Robot /** Reset the recognizer 492*16467b97STreehugger Robot */ 493*16467b97STreehugger Robot void reset(); 494*16467b97STreehugger Robot void reset( ClassForwarder<LexerType> ); 495*16467b97STreehugger Robot template<typename CompType> 496*16467b97STreehugger Robot void reset( ClassForwarder<CompType> ); 497*16467b97STreehugger Robot 498*16467b97STreehugger Robot void exConstruct(); 499*16467b97STreehugger Robot 500*16467b97STreehugger Robot ~BaseRecognizer(); 501*16467b97STreehugger Robot 502*16467b97STreehugger Robot }; 503*16467b97STreehugger Robot 504*16467b97STreehugger Robot ANTLR_END_NAMESPACE() 505*16467b97STreehugger Robot 506*16467b97STreehugger Robot #include "antlr3baserecognizer.inl" 507*16467b97STreehugger Robot 508*16467b97STreehugger Robot /// @} 509*16467b97STreehugger Robot /// 510*16467b97STreehugger Robot 511*16467b97STreehugger Robot #endif /* _ANTLR3_BASERECOGNIZER_H */ 512*16467b97STreehugger Robot 513