/** \file
 * Base interface for any ANTLR3 lexer.
 *
 * An ANTLR3 lexer builds from two sets of components:
 *
 *  - The runtime components that provide common functionality such as
 *    traversing character streams, building tokens for output and so on.
 *  - The generated rules and structure of the actual lexer, which call upon the
 *    runtime components.
 *
 * A lexer class contains a character input stream, a base recognizer interface
 * (which it will normally implement) and a token source interface (which it also
 * implements). The Tokensource interface is called by a token consumer (such as
 * a parser, but in theory it can be anything that wants a set of abstract
 * tokens in place of a raw character stream).
 *
 * So then, we set up a lexer in a sequence akin to:
 *
 *  - Create a character stream (something which implements ANTLR3_INPUT_STREAM)
 *    and initialize it.
 *  - Create a lexer interface and tell it where its input stream is.
 *    This will cause the creation of a base recognizer class, which it will
 *    override with its own implementations of some methods. The lexer creator
 *    can also then in turn override anything it likes.
 *  - The lexer token source interface is then passed to some interface that
 *    knows how to use it, by calling for a next token.
 *  - When a next token is called, let ze lexing begin.
 *
 */
#ifndef _ANTLR3_LEXER_HPP
#define _ANTLR3_LEXER_HPP

// [The "BSD licence"]
// Copyright (c) 2005-2009 Gokulakannan Somasundaram, ElectronDB

//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
//    notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
//    notice, this list of conditions and the following disclaimer in the
//    documentation and/or other materials provided with the distribution.
// 3. The name of the author may not be used to endorse or promote products
//    derived from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
// IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
// NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
60*16467b97STreehugger Robot 61*16467b97STreehugger Robot /* Definitions 62*16467b97STreehugger Robot */ 63*16467b97STreehugger Robot #include "antlr3defs.hpp" 64*16467b97STreehugger Robot 65*16467b97STreehugger Robot ANTLR_BEGIN_NAMESPACE() 66*16467b97STreehugger Robot 67*16467b97STreehugger Robot static const ANTLR_UINT32 ANTLR_STRING_TERMINATOR = 0xFFFFFFFF; 68*16467b97STreehugger Robot 69*16467b97STreehugger Robot template<class ImplTraits> 70*16467b97STreehugger Robot class Lexer : public ImplTraits::template RecognizerType< typename ImplTraits::InputStreamType >, 71*16467b97STreehugger Robot public ImplTraits::TokenSourceType 72*16467b97STreehugger Robot { 73*16467b97STreehugger Robot public: 74*16467b97STreehugger Robot typedef typename ImplTraits::AllocPolicyType AllocPolicyType; 75*16467b97STreehugger Robot typedef typename ImplTraits::InputStreamType InputStreamType; 76*16467b97STreehugger Robot typedef InputStreamType StreamType; 77*16467b97STreehugger Robot typedef typename InputStreamType::IntStreamType IntStreamType; 78*16467b97STreehugger Robot typedef typename ImplTraits::CommonTokenType CommonTokenType; 79*16467b97STreehugger Robot typedef typename ImplTraits::StreamDataType TokenType; 80*16467b97STreehugger Robot typedef typename ImplTraits::StringType StringType; 81*16467b97STreehugger Robot typedef typename ImplTraits::StringStreamType StringStreamType; 82*16467b97STreehugger Robot typedef typename ImplTraits::template RecognizerType< InputStreamType > RecognizerType; 83*16467b97STreehugger Robot typedef typename RecognizerType::RecognizerSharedStateType RecognizerSharedStateType; 84*16467b97STreehugger Robot typedef typename ImplTraits::template ExceptionBaseType<InputStreamType> ExceptionBaseType; 85*16467b97STreehugger Robot typedef typename ImplTraits::BitsetListType BitsetListType; 86*16467b97STreehugger Robot typedef typename ImplTraits::TokenSourceType TokenSourceType; 87*16467b97STreehugger Robot 88*16467b97STreehugger Robot typedef 
typename RecognizerSharedStateType::RuleMemoType RuleMemoType; 89*16467b97STreehugger Robot typedef typename RecognizerType::DebugEventListenerType DebuggerType; 90*16467b97STreehugger Robot 91*16467b97STreehugger Robot private: 92*16467b97STreehugger Robot /** A pointer to the character stream whence this lexer is receiving 93*16467b97STreehugger Robot * characters. 94*16467b97STreehugger Robot * TODO: I may come back to this and implement charstream outside 95*16467b97STreehugger Robot * the input stream as per the java implementation. 96*16467b97STreehugger Robot */ 97*16467b97STreehugger Robot InputStreamType* m_input; 98*16467b97STreehugger Robot 99*16467b97STreehugger Robot public: 100*16467b97STreehugger Robot Lexer(ANTLR_UINT32 sizeHint, RecognizerSharedStateType* state); 101*16467b97STreehugger Robot Lexer(ANTLR_UINT32 sizeHint, InputStreamType* input, RecognizerSharedStateType* state); 102*16467b97STreehugger Robot 103*16467b97STreehugger Robot InputStreamType* get_input() const; 104*16467b97STreehugger Robot IntStreamType* get_istream() const; 105*16467b97STreehugger Robot RecognizerType* get_rec(); 106*16467b97STreehugger Robot const RecognizerType* get_rec() const; 107*16467b97STreehugger Robot TokenSourceType* get_tokSource(); 108*16467b97STreehugger Robot 109*16467b97STreehugger Robot //functions used in .stg file 110*16467b97STreehugger Robot const RecognizerType* get_recognizer() const; 111*16467b97STreehugger Robot RecognizerSharedStateType* get_lexstate() const; 112*16467b97STreehugger Robot void set_lexstate( RecognizerSharedStateType* lexstate ); 113*16467b97STreehugger Robot const TokenSourceType* get_tokSource() const; 114*16467b97STreehugger Robot CommonTokenType* get_ltoken() const; 115*16467b97STreehugger Robot void set_ltoken( const CommonTokenType* ltoken ); 116*16467b97STreehugger Robot bool hasFailed() const; 117*16467b97STreehugger Robot ANTLR_INT32 get_backtracking() const; 118*16467b97STreehugger Robot void inc_backtracking(); 
119*16467b97STreehugger Robot void dec_backtracking(); 120*16467b97STreehugger Robot bool get_failedflag() const; 121*16467b97STreehugger Robot void set_failedflag( bool failed ); 122*16467b97STreehugger Robot InputStreamType* get_strstream() const; 123*16467b97STreehugger Robot ANTLR_MARKER index() const; 124*16467b97STreehugger Robot void seek(ANTLR_MARKER index); 125*16467b97STreehugger Robot const CommonTokenType* EOF_Token() const; 126*16467b97STreehugger Robot bool hasException() const; 127*16467b97STreehugger Robot ExceptionBaseType* get_exception() const; 128*16467b97STreehugger Robot void constructEx(); 129*16467b97STreehugger Robot void lrecover(); 130*16467b97STreehugger Robot ANTLR_MARKER mark(); 131*16467b97STreehugger Robot void rewind(ANTLR_MARKER marker); 132*16467b97STreehugger Robot void rewindLast(); 133*16467b97STreehugger Robot void setText( const StringType& text ); 134*16467b97STreehugger Robot void skip(); 135*16467b97STreehugger Robot RuleMemoType* getRuleMemo() const; 136*16467b97STreehugger Robot DebuggerType* get_debugger() const; 137*16467b97STreehugger Robot void setRuleMemo(RuleMemoType* rulememo); 138*16467b97STreehugger Robot ANTLR_UINT32 LA(ANTLR_INT32 i); 139*16467b97STreehugger Robot void consume(); 140*16467b97STreehugger Robot void memoize(ANTLR_MARKER ruleIndex, ANTLR_MARKER ruleParseStart); 141*16467b97STreehugger Robot bool haveParsedRule(ANTLR_MARKER ruleIndex); 142*16467b97STreehugger Robot 143*16467b97STreehugger Robot /** Pointer to a function that sets the charstream source for the lexer and 144*16467b97STreehugger Robot * causes it to be reset. 145*16467b97STreehugger Robot */ 146*16467b97STreehugger Robot void setCharStream(InputStreamType* input); 147*16467b97STreehugger Robot 148*16467b97STreehugger Robot /*! 149*16467b97STreehugger Robot * \brief 150*16467b97STreehugger Robot * Change to a new input stream, remembering the old one. 
151*16467b97STreehugger Robot * 152*16467b97STreehugger Robot * \param lexer 153*16467b97STreehugger Robot * Pointer to the lexer instance to switch input streams for. 154*16467b97STreehugger Robot * 155*16467b97STreehugger Robot * \param input 156*16467b97STreehugger Robot * New input stream to install as the current one. 157*16467b97STreehugger Robot * 158*16467b97STreehugger Robot * Switches the current character input stream to 159*16467b97STreehugger Robot * a new one, saving the old one, which we will revert to at the end of this 160*16467b97STreehugger Robot * new one. 161*16467b97STreehugger Robot */ 162*16467b97STreehugger Robot void pushCharStream(InputStreamType* input); 163*16467b97STreehugger Robot 164*16467b97STreehugger Robot /*! 165*16467b97STreehugger Robot * \brief 166*16467b97STreehugger Robot * Stops using the current input stream and reverts to any prior 167*16467b97STreehugger Robot * input stream on the stack. 168*16467b97STreehugger Robot * 169*16467b97STreehugger Robot * \param lexer 170*16467b97STreehugger Robot * Description of parameter lexer. 171*16467b97STreehugger Robot * 172*16467b97STreehugger Robot * Pointer to a function that abandons the current input stream, whether it 173*16467b97STreehugger Robot * is empty or not and reverts to the previous stacked input stream. 174*16467b97STreehugger Robot * 175*16467b97STreehugger Robot * \remark 176*16467b97STreehugger Robot * The function fails silently if there are no prior input streams. 177*16467b97STreehugger Robot */ 178*16467b97STreehugger Robot void popCharStream(); 179*16467b97STreehugger Robot 180*16467b97STreehugger Robot /** Function that emits (a copy of ) the supplied token as the next token in 181*16467b97STreehugger Robot * the stream. 
182*16467b97STreehugger Robot */ 183*16467b97STreehugger Robot void emit(const CommonTokenType* token); 184*16467b97STreehugger Robot 185*16467b97STreehugger Robot /** Pointer to a function that constructs a new token from the lexer stored information 186*16467b97STreehugger Robot */ 187*16467b97STreehugger Robot CommonTokenType* emit(); 188*16467b97STreehugger Robot 189*16467b97STreehugger Robot /** Pointer to a function that attempts to match and consume the specified string from the input 190*16467b97STreehugger Robot * stream. Note that strings muse be passed as terminated arrays of ANTLR3_UCHAR. Strings are terminated 191*16467b97STreehugger Robot * with 0xFFFFFFFF, which is an invalid UTF32 character 192*16467b97STreehugger Robot */ 193*16467b97STreehugger Robot bool matchs(ANTLR_UCHAR* string); 194*16467b97STreehugger Robot 195*16467b97STreehugger Robot /** Pointer to a function that matches and consumes the specified character from the input stream. 196*16467b97STreehugger Robot * The input stream is required to provide characters via LA() as UTF32 characters. The default lexer 197*16467b97STreehugger Robot * implementation is source encoding agnostic and so input streams do not generally need to 198*16467b97STreehugger Robot * override the default implmentation. 199*16467b97STreehugger Robot */ 200*16467b97STreehugger Robot bool matchc(ANTLR_UCHAR c); 201*16467b97STreehugger Robot 202*16467b97STreehugger Robot /** Pointer to a function that matches any character in the supplied range (I suppose it could be a token range too 203*16467b97STreehugger Robot * but this would only be useful if the tokens were in tsome guaranteed order which is 204*16467b97STreehugger Robot * only going to happen with a hand crafted token set). 
205*16467b97STreehugger Robot */ 206*16467b97STreehugger Robot bool matchRange(ANTLR_UCHAR low, ANTLR_UCHAR high); 207*16467b97STreehugger Robot 208*16467b97STreehugger Robot /** Pointer to a function that matches the next token/char in the input stream 209*16467b97STreehugger Robot * regardless of what it actaully is. 210*16467b97STreehugger Robot */ 211*16467b97STreehugger Robot void matchAny(); 212*16467b97STreehugger Robot 213*16467b97STreehugger Robot /** Pointer to a function that recovers from an error found in the input stream. 214*16467b97STreehugger Robot * Generally, this will be a #ANTLR3_EXCEPTION_NOVIABLE_ALT but it could also 215*16467b97STreehugger Robot * be from a mismatched token that the (*match)() could not recover from. 216*16467b97STreehugger Robot */ 217*16467b97STreehugger Robot void recover(); 218*16467b97STreehugger Robot 219*16467b97STreehugger Robot /** Function to return the current line number in the input stream 220*16467b97STreehugger Robot */ 221*16467b97STreehugger Robot ANTLR_UINT32 getLine(); 222*16467b97STreehugger Robot ANTLR_MARKER getCharIndex(); 223*16467b97STreehugger Robot ANTLR_UINT32 getCharPositionInLine(); 224*16467b97STreehugger Robot 225*16467b97STreehugger Robot /** Function to return the text so far for the current token being generated 226*16467b97STreehugger Robot */ 227*16467b97STreehugger Robot StringType getText(); 228*16467b97STreehugger Robot 229*16467b97STreehugger Robot //Other utility functions 230*16467b97STreehugger Robot void fillExceptionData( ExceptionBaseType* ex ); 231*16467b97STreehugger Robot 232*16467b97STreehugger Robot /** Default lexer error handler (works for 8 bit streams only!!!) 
233*16467b97STreehugger Robot */ 234*16467b97STreehugger Robot void displayRecognitionError( ANTLR_UINT8** tokenNames, ExceptionBaseType* ex); 235*16467b97STreehugger Robot void exConstruct(); 236*16467b97STreehugger Robot TokenType* getMissingSymbol( IntStreamType* istream, ExceptionBaseType* e, 237*16467b97STreehugger Robot ANTLR_UINT32 expectedTokenType, BitsetListType* follow); 238*16467b97STreehugger Robot 239*16467b97STreehugger Robot /** Pointer to a function that knows how to free the resources of a lexer 240*16467b97STreehugger Robot */ 241*16467b97STreehugger Robot ~Lexer(); 242*16467b97STreehugger Robot }; 243*16467b97STreehugger Robot 244*16467b97STreehugger Robot ANTLR_END_NAMESPACE() 245*16467b97STreehugger Robot 246*16467b97STreehugger Robot #include "antlr3lexer.inl" 247*16467b97STreehugger Robot 248*16467b97STreehugger Robot #endif 249