1*16467b97STreehugger Robot /** \file 2*16467b97STreehugger Robot * Base interface for any ANTLR3 lexer. 3*16467b97STreehugger Robot * 4*16467b97STreehugger Robot * An ANLTR3 lexer builds from two sets of components: 5*16467b97STreehugger Robot * 6*16467b97STreehugger Robot * - The runtime components that provide common functionality such as 7*16467b97STreehugger Robot * traversing character streams, building tokens for output and so on. 8*16467b97STreehugger Robot * - The generated rules and struutre of the actual lexer, which call upon the 9*16467b97STreehugger Robot * runtime components. 10*16467b97STreehugger Robot * 11*16467b97STreehugger Robot * A lexer class contains a character input stream, a base recognizer interface 12*16467b97STreehugger Robot * (which it will normally implement) and a token source interface (which it also 13*16467b97STreehugger Robot * implements. The Tokensource interface is called by a token consumer (such as 14*16467b97STreehugger Robot * a parser, but in theory it can be anything that wants a set of abstract 15*16467b97STreehugger Robot * tokens in place of a raw character stream. 16*16467b97STreehugger Robot * 17*16467b97STreehugger Robot * So then, we set up a lexer in a sequence akin to: 18*16467b97STreehugger Robot * 19*16467b97STreehugger Robot * - Create a character stream (something which implements ANTLR3_INPUT_STREAM) 20*16467b97STreehugger Robot * and initialize it. 21*16467b97STreehugger Robot * - Create a lexer interface and tell it where it its input stream is. 22*16467b97STreehugger Robot * This will cause the creation of a base recognizer class, which it will 23*16467b97STreehugger Robot * override with its own implementations of some methods. The lexer creator 24*16467b97STreehugger Robot * can also then in turn override anything it likes. 25*16467b97STreehugger Robot * - The lexer token source interface is then passed to some interface that 26*16467b97STreehugger Robot * knows how to use it, byte calling for a next token. 27*16467b97STreehugger Robot * - When a next token is called, let ze lexing begin. 28*16467b97STreehugger Robot * 29*16467b97STreehugger Robot */ 30*16467b97STreehugger Robot #ifndef _ANTLR3_LEXER 31*16467b97STreehugger Robot #define _ANTLR3_LEXER 32*16467b97STreehugger Robot 33*16467b97STreehugger Robot // [The "BSD licence"] 34*16467b97STreehugger Robot // Copyright (c) 2005-2009 Jim Idle, Temporal Wave LLC 35*16467b97STreehugger Robot // http://www.temporal-wave.com 36*16467b97STreehugger Robot // http://www.linkedin.com/in/jimidle 37*16467b97STreehugger Robot // 38*16467b97STreehugger Robot // All rights reserved. 39*16467b97STreehugger Robot // 40*16467b97STreehugger Robot // Redistribution and use in source and binary forms, with or without 41*16467b97STreehugger Robot // modification, are permitted provided that the following conditions 42*16467b97STreehugger Robot // are met: 43*16467b97STreehugger Robot // 1. Redistributions of source code must retain the above copyright 44*16467b97STreehugger Robot // notice, this list of conditions and the following disclaimer. 45*16467b97STreehugger Robot // 2. Redistributions in binary form must reproduce the above copyright 46*16467b97STreehugger Robot // notice, this list of conditions and the following disclaimer in the 47*16467b97STreehugger Robot // documentation and/or other materials provided with the distribution. 48*16467b97STreehugger Robot // 3. The name of the author may not be used to endorse or promote products 49*16467b97STreehugger Robot // derived from this software without specific prior written permission. 50*16467b97STreehugger Robot // 51*16467b97STreehugger Robot // THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 52*16467b97STreehugger Robot // IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 53*16467b97STreehugger Robot // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 54*16467b97STreehugger Robot // IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 55*16467b97STreehugger Robot // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 56*16467b97STreehugger Robot // NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 57*16467b97STreehugger Robot // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 58*16467b97STreehugger Robot // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 59*16467b97STreehugger Robot // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 60*16467b97STreehugger Robot // THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 61*16467b97STreehugger Robot 62*16467b97STreehugger Robot /* Definitions 63*16467b97STreehugger Robot */ 64*16467b97STreehugger Robot #define ANTLR3_STRING_TERMINATOR 0xFFFFFFFF 65*16467b97STreehugger Robot 66*16467b97STreehugger Robot #include <antlr3defs.h> 67*16467b97STreehugger Robot #include <antlr3input.h> 68*16467b97STreehugger Robot #include <antlr3commontoken.h> 69*16467b97STreehugger Robot #include <antlr3tokenstream.h> 70*16467b97STreehugger Robot #include <antlr3baserecognizer.h> 71*16467b97STreehugger Robot 72*16467b97STreehugger Robot #ifdef __cplusplus 73*16467b97STreehugger Robot extern "C" { 74*16467b97STreehugger Robot #endif 75*16467b97STreehugger Robot 76*16467b97STreehugger Robot typedef struct ANTLR3_LEXER_struct 77*16467b97STreehugger Robot { 78*16467b97STreehugger Robot /** If there is a super structure that is implementing the 79*16467b97STreehugger Robot * lexer, then a pointer to it can be stored here in case 80*16467b97STreehugger Robot * implementing functions are overridden by this super structure. 81*16467b97STreehugger Robot */ 82*16467b97STreehugger Robot void * super; 83*16467b97STreehugger Robot 84*16467b97STreehugger Robot /** A generated lexer has an mTokens() function, which needs 85*16467b97STreehugger Robot * the context pointer of the generated lexer, not the base lexer interface 86*16467b97STreehugger Robot * this is stored here and initialized by the generated code (or manually 87*16467b97STreehugger Robot * if this is a manually built lexer. 88*16467b97STreehugger Robot */ 89*16467b97STreehugger Robot void * ctx; 90*16467b97STreehugger Robot 91*16467b97STreehugger Robot /** A pointer to the character stream whence this lexer is receiving 92*16467b97STreehugger Robot * characters. 93*16467b97STreehugger Robot * TODO: I may come back to this and implement charstream outside 94*16467b97STreehugger Robot * the input stream as per the java implementation. 95*16467b97STreehugger Robot */ 96*16467b97STreehugger Robot pANTLR3_INPUT_STREAM input; 97*16467b97STreehugger Robot 98*16467b97STreehugger Robot /** Pointer to the implementation of a base recognizer, which the lexer 99*16467b97STreehugger Robot * creates and then overrides with its own lexer oriented functions (the 100*16467b97STreehugger Robot * default implementation is parser oriented). This also contains a 101*16467b97STreehugger Robot * token source interface, which the lexer instance will provide to anything 102*16467b97STreehugger Robot * that needs it, which is anything else that implements a base recognizer, 103*16467b97STreehugger Robot * such as a parser. 104*16467b97STreehugger Robot */ 105*16467b97STreehugger Robot pANTLR3_BASE_RECOGNIZER rec; 106*16467b97STreehugger Robot 107*16467b97STreehugger Robot /** Pointer to a function that sets the charstream source for the lexer and 108*16467b97STreehugger Robot * causes it to be reset. 109*16467b97STreehugger Robot */ 110*16467b97STreehugger Robot void (*setCharStream) (struct ANTLR3_LEXER_struct * lexer, pANTLR3_INPUT_STREAM input); 111*16467b97STreehugger Robot 112*16467b97STreehugger Robot /** Pointer to a function that switches the current character input stream to 113*16467b97STreehugger Robot * a new one, saving the old one, which we will revert to at the end of this 114*16467b97STreehugger Robot * new one. 115*16467b97STreehugger Robot */ 116*16467b97STreehugger Robot void (*pushCharStream) (struct ANTLR3_LEXER_struct * lexer, pANTLR3_INPUT_STREAM input); 117*16467b97STreehugger Robot 118*16467b97STreehugger Robot /** Pointer to a function that abandons the current input stream, whether it 119*16467b97STreehugger Robot * is empty or not and reverts to the previous stacked input stream. 120*16467b97STreehugger Robot */ 121*16467b97STreehugger Robot void (*popCharStream) (struct ANTLR3_LEXER_struct * lexer); 122*16467b97STreehugger Robot 123*16467b97STreehugger Robot /** Pointer to a function that emits the supplied token as the next token in 124*16467b97STreehugger Robot * the stream. 125*16467b97STreehugger Robot */ 126*16467b97STreehugger Robot void (*emitNew) (struct ANTLR3_LEXER_struct * lexer, pANTLR3_COMMON_TOKEN token); 127*16467b97STreehugger Robot 128*16467b97STreehugger Robot /** Pointer to a function that constructs a new token from the lexer stored information 129*16467b97STreehugger Robot */ 130*16467b97STreehugger Robot pANTLR3_COMMON_TOKEN (*emit) (struct ANTLR3_LEXER_struct * lexer); 131*16467b97STreehugger Robot 132*16467b97STreehugger Robot /** Pointer to the user provided (either manually or through code generation 133*16467b97STreehugger Robot * function that causes the lexer rules to run the lexing rules and produce 134*16467b97STreehugger Robot * the next token if there iss one. This is called from nextToken() in the 135*16467b97STreehugger Robot * pANTLR3_TOKEN_SOURCE. Note that the input parameter for this funciton is 136*16467b97STreehugger Robot * the generated lexer context (stored in ctx in this interface) it is a generated 137*16467b97STreehugger Robot * function and expects the context to be the generated lexer. 138*16467b97STreehugger Robot */ 139*16467b97STreehugger Robot void (*mTokens) (void * ctx); 140*16467b97STreehugger Robot 141*16467b97STreehugger Robot /** Pointer to a function that attempts to match and consume the specified string from the input 142*16467b97STreehugger Robot * stream. Note that strings muse be passed as terminated arrays of ANTLR3_UCHAR. Strings are terminated 143*16467b97STreehugger Robot * with 0xFFFFFFFF, which is an invalid UTF32 character 144*16467b97STreehugger Robot */ 145*16467b97STreehugger Robot ANTLR3_BOOLEAN (*matchs) (struct ANTLR3_LEXER_struct * lexer, ANTLR3_UCHAR * string); 146*16467b97STreehugger Robot 147*16467b97STreehugger Robot /** Pointer to a function that matches and consumes the specified character from the input stream. 148*16467b97STreehugger Robot * The input stream is required to provide characters via LA() as UTF32 characters. The default lexer 149*16467b97STreehugger Robot * implementation is source encoding agnostic and so input streams do not generally need to 150*16467b97STreehugger Robot * override the default implmentation. 151*16467b97STreehugger Robot */ 152*16467b97STreehugger Robot ANTLR3_BOOLEAN (*matchc) (struct ANTLR3_LEXER_struct * lexer, ANTLR3_UCHAR c); 153*16467b97STreehugger Robot 154*16467b97STreehugger Robot /** Pointer to a function that matches any character in the supplied range (I suppose it could be a token range too 155*16467b97STreehugger Robot * but this would only be useful if the tokens were in tsome guaranteed order which is 156*16467b97STreehugger Robot * only going to happen with a hand crafted token set). 157*16467b97STreehugger Robot */ 158*16467b97STreehugger Robot ANTLR3_BOOLEAN (*matchRange) (struct ANTLR3_LEXER_struct * lexer, ANTLR3_UCHAR low, ANTLR3_UCHAR high); 159*16467b97STreehugger Robot 160*16467b97STreehugger Robot /** Pointer to a function that matches the next token/char in the input stream 161*16467b97STreehugger Robot * regardless of what it actaully is. 162*16467b97STreehugger Robot */ 163*16467b97STreehugger Robot void (*matchAny) (struct ANTLR3_LEXER_struct * lexer); 164*16467b97STreehugger Robot 165*16467b97STreehugger Robot /** Pointer to a function that recovers from an error found in the input stream. 166*16467b97STreehugger Robot * Generally, this will be a #ANTLR3_EXCEPTION_NOVIABLE_ALT but it could also 167*16467b97STreehugger Robot * be from a mismatched token that the (*match)() could not recover from. 168*16467b97STreehugger Robot */ 169*16467b97STreehugger Robot void (*recover) (struct ANTLR3_LEXER_struct * lexer); 170*16467b97STreehugger Robot 171*16467b97STreehugger Robot /** Pointer to function to return the current line number in the input stream 172*16467b97STreehugger Robot */ 173*16467b97STreehugger Robot ANTLR3_UINT32 (*getLine) (struct ANTLR3_LEXER_struct * lexer); 174*16467b97STreehugger Robot ANTLR3_MARKER (*getCharIndex) (struct ANTLR3_LEXER_struct * lexer); 175*16467b97STreehugger Robot ANTLR3_UINT32 (*getCharPositionInLine)(struct ANTLR3_LEXER_struct * lexer); 176*16467b97STreehugger Robot 177*16467b97STreehugger Robot /** Pointer to function to return the text so far for the current token being generated 178*16467b97STreehugger Robot */ 179*16467b97STreehugger Robot pANTLR3_STRING (*getText) (struct ANTLR3_LEXER_struct * lexer); 180*16467b97STreehugger Robot 181*16467b97STreehugger Robot 182*16467b97STreehugger Robot /** Pointer to a function that knows how to free the resources of a lexer 183*16467b97STreehugger Robot */ 184*16467b97STreehugger Robot void (*free) (struct ANTLR3_LEXER_struct * lexer); 185*16467b97STreehugger Robot 186*16467b97STreehugger Robot } 187*16467b97STreehugger Robot ANTLR3_LEXER; 188*16467b97STreehugger Robot 189*16467b97STreehugger Robot #ifdef __cplusplus 190*16467b97STreehugger Robot } 191*16467b97STreehugger Robot #endif 192*16467b97STreehugger Robot 193*16467b97STreehugger Robot #endif 194