1*16467b97STreehugger Robot /** \file 2*16467b97STreehugger Robot * Defines the basic structures used to manipulate character 3*16467b97STreehugger Robot * streams from any input source. Any character size and encoding 4*16467b97STreehugger Robot * can in theory be used, so long as a set of functinos is provided that 5*16467b97STreehugger Robot * can return a 32 bit Integer representation of their characters amd efficiently mark and revert 6*16467b97STreehugger Robot * to specific offsets into their input streams. 7*16467b97STreehugger Robot */ 8*16467b97STreehugger Robot #ifndef _ANTLR3_INPUT_H 9*16467b97STreehugger Robot #define _ANTLR3_INPUT_H 10*16467b97STreehugger Robot 11*16467b97STreehugger Robot // [The "BSD licence"] 12*16467b97STreehugger Robot // Copyright (c) 2005-2009 Jim Idle, Temporal Wave LLC 13*16467b97STreehugger Robot // http://www.temporal-wave.com 14*16467b97STreehugger Robot // http://www.linkedin.com/in/jimidle 15*16467b97STreehugger Robot // 16*16467b97STreehugger Robot // All rights reserved. 17*16467b97STreehugger Robot // 18*16467b97STreehugger Robot // Redistribution and use in source and binary forms, with or without 19*16467b97STreehugger Robot // modification, are permitted provided that the following conditions 20*16467b97STreehugger Robot // are met: 21*16467b97STreehugger Robot // 1. Redistributions of source code must retain the above copyright 22*16467b97STreehugger Robot // notice, this list of conditions and the following disclaimer. 23*16467b97STreehugger Robot // 2. Redistributions in binary form must reproduce the above copyright 24*16467b97STreehugger Robot // notice, this list of conditions and the following disclaimer in the 25*16467b97STreehugger Robot // documentation and/or other materials provided with the distribution. 26*16467b97STreehugger Robot // 3. The name of the author may not be used to endorse or promote products 27*16467b97STreehugger Robot // derived from this software without specific prior written permission. 28*16467b97STreehugger Robot // 29*16467b97STreehugger Robot // THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 30*16467b97STreehugger Robot // IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 31*16467b97STreehugger Robot // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 32*16467b97STreehugger Robot // IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 33*16467b97STreehugger Robot // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 34*16467b97STreehugger Robot // NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 35*16467b97STreehugger Robot // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 36*16467b97STreehugger Robot // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 37*16467b97STreehugger Robot // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 38*16467b97STreehugger Robot // THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 39*16467b97STreehugger Robot 40*16467b97STreehugger Robot #include <antlr3defs.h> 41*16467b97STreehugger Robot #include <antlr3string.h> 42*16467b97STreehugger Robot #include <antlr3commontoken.h> 43*16467b97STreehugger Robot #include <antlr3intstream.h> 44*16467b97STreehugger Robot #include <antlr3convertutf.h> 45*16467b97STreehugger Robot 46*16467b97STreehugger Robot #ifdef __cplusplus 47*16467b97STreehugger Robot extern "C" { 48*16467b97STreehugger Robot #endif 49*16467b97STreehugger Robot 50*16467b97STreehugger Robot 51*16467b97STreehugger Robot 52*16467b97STreehugger Robot /// Master context structure for an ANTLR3 C runtime based input stream. 53*16467b97STreehugger Robot /// \ingroup apistructures 54*16467b97STreehugger Robot /// 55*16467b97STreehugger Robot typedef struct ANTLR3_INPUT_STREAM_struct 56*16467b97STreehugger Robot { 57*16467b97STreehugger Robot /** Interfaces that provide streams must all provide 58*16467b97STreehugger Robot * a generic ANTLR3_INT_STREAM interface and an ANTLR3_INPUT_STREAM 59*16467b97STreehugger Robot * is no different. 60*16467b97STreehugger Robot */ 61*16467b97STreehugger Robot pANTLR3_INT_STREAM istream; 62*16467b97STreehugger Robot 63*16467b97STreehugger Robot /** Whatever super structure is providing the INPUT stream needs a pointer to itself 64*16467b97STreehugger Robot * so that this can be passed back to it whenever the api functions 65*16467b97STreehugger Robot * are called back from this interface. 66*16467b97STreehugger Robot */ 67*16467b97STreehugger Robot void * super; 68*16467b97STreehugger Robot 69*16467b97STreehugger Robot /** Pointer the start of the input string, characters may be 70*16467b97STreehugger Robot * taken as offsets from here and in original input format encoding. 71*16467b97STreehugger Robot */ 72*16467b97STreehugger Robot void * data; 73*16467b97STreehugger Robot 74*16467b97STreehugger Robot /** Indicates if the data pointer was allocated by us, and so should be freed 75*16467b97STreehugger Robot * when the stream dies. 76*16467b97STreehugger Robot */ 77*16467b97STreehugger Robot int isAllocated; 78*16467b97STreehugger Robot 79*16467b97STreehugger Robot /** String factory for this input stream 80*16467b97STreehugger Robot */ 81*16467b97STreehugger Robot pANTLR3_STRING_FACTORY strFactory; 82*16467b97STreehugger Robot 83*16467b97STreehugger Robot 84*16467b97STreehugger Robot /** Pointer to the next character to be consumed from the input data 85*16467b97STreehugger Robot * This is cast to point at the encoding of the original file that 86*16467b97STreehugger Robot * was read by the functions installed as pointer in this input stream 87*16467b97STreehugger Robot * context instance at file/string/whatever load time. 88*16467b97STreehugger Robot */ 89*16467b97STreehugger Robot void * nextChar; 90*16467b97STreehugger Robot 91*16467b97STreehugger Robot /** Number of characters that can be consumed at this point in time. 92*16467b97STreehugger Robot * Mostly this is just what is left in the pre-read buffer, but if the 93*16467b97STreehugger Robot * input source is a stream such as a socket or something then we may 94*16467b97STreehugger Robot * call special read code to wait for more input. 95*16467b97STreehugger Robot */ 96*16467b97STreehugger Robot ANTLR3_UINT32 sizeBuf; 97*16467b97STreehugger Robot 98*16467b97STreehugger Robot /** The line number we are traversing in the input file. This gets incremented 99*16467b97STreehugger Robot * by a newline() call in the lexer grammar actions. 100*16467b97STreehugger Robot */ 101*16467b97STreehugger Robot ANTLR3_UINT32 line; 102*16467b97STreehugger Robot 103*16467b97STreehugger Robot /** Pointer into the input buffer where the current line 104*16467b97STreehugger Robot * started. 105*16467b97STreehugger Robot */ 106*16467b97STreehugger Robot void * currentLine; 107*16467b97STreehugger Robot 108*16467b97STreehugger Robot /** The offset within the current line of the current character 109*16467b97STreehugger Robot */ 110*16467b97STreehugger Robot ANTLR3_INT32 charPositionInLine; 111*16467b97STreehugger Robot 112*16467b97STreehugger Robot /** Tracks how deep mark() calls are nested 113*16467b97STreehugger Robot */ 114*16467b97STreehugger Robot ANTLR3_UINT32 markDepth; 115*16467b97STreehugger Robot 116*16467b97STreehugger Robot /** List of mark() points in the input stream 117*16467b97STreehugger Robot */ 118*16467b97STreehugger Robot pANTLR3_VECTOR markers; 119*16467b97STreehugger Robot 120*16467b97STreehugger Robot /** File name string, set to pointer to memory if 121*16467b97STreehugger Robot * you set it manually as it will be free()d 122*16467b97STreehugger Robot */ 123*16467b97STreehugger Robot pANTLR3_STRING fileName; 124*16467b97STreehugger Robot 125*16467b97STreehugger Robot /** File number, needs to be set manually to some file index of your devising. 126*16467b97STreehugger Robot */ 127*16467b97STreehugger Robot ANTLR3_UINT32 fileNo; 128*16467b97STreehugger Robot 129*16467b97STreehugger Robot /* API */ 130*16467b97STreehugger Robot 131*16467b97STreehugger Robot 132*16467b97STreehugger Robot /** Pointer to function that closes the input stream 133*16467b97STreehugger Robot */ 134*16467b97STreehugger Robot void (*close) (struct ANTLR3_INPUT_STREAM_struct * input); 135*16467b97STreehugger Robot void (*free) (struct ANTLR3_INPUT_STREAM_struct * input); 136*16467b97STreehugger Robot 137*16467b97STreehugger Robot /** Pointer to function that resets the input stream 138*16467b97STreehugger Robot */ 139*16467b97STreehugger Robot void (*reset) (struct ANTLR3_INPUT_STREAM_struct * input); 140*16467b97STreehugger Robot 141*16467b97STreehugger Robot /** Pointer to a function that reuses and resets an input stream by 142*16467b97STreehugger Robot * supplying a new 'source' 143*16467b97STreehugger Robot */ 144*16467b97STreehugger Robot void (*reuse) (struct ANTLR3_INPUT_STREAM_struct * input, pANTLR3_UINT8 inString, ANTLR3_UINT32 size, pANTLR3_UINT8 name); 145*16467b97STreehugger Robot 146*16467b97STreehugger Robot /** 147*16467b97STreehugger Robot * Pointer to function that installs a version of LA that always 148*16467b97STreehugger Robot * returns upper case. Only valid for character streams and creates a case 149*16467b97STreehugger Robot * insensitive lexer if the lexer tokens are described in upper case. The 150*16467b97STreehugger Robot * tokens will preserve case in the token text. 151*16467b97STreehugger Robot */ 152*16467b97STreehugger Robot void (*setUcaseLA) (pANTLR3_INPUT_STREAM input, ANTLR3_BOOLEAN flag); 153*16467b97STreehugger Robot 154*16467b97STreehugger Robot /** Pointer to function to return input stream element at 1 based 155*16467b97STreehugger Robot * offset from nextChar. Same as _LA for char stream, but token 156*16467b97STreehugger Robot * streams etc. have one of these that does other stuff of course. 157*16467b97STreehugger Robot */ 158*16467b97STreehugger Robot void * (*_LT) (struct ANTLR3_INPUT_STREAM_struct * input, ANTLR3_INT32 lt); 159*16467b97STreehugger Robot 160*16467b97STreehugger Robot /** Pointer to function to return the total size of the input buffer. For streams 161*16467b97STreehugger Robot * this may be just the total we have available so far. This means of course that 162*16467b97STreehugger Robot * the input stream must be careful to accumulate enough input so that any backtracking 163*16467b97STreehugger Robot * can be satisfied. 164*16467b97STreehugger Robot */ 165*16467b97STreehugger Robot ANTLR3_UINT32 (*size) (struct ANTLR3_INPUT_STREAM_struct * input); 166*16467b97STreehugger Robot 167*16467b97STreehugger Robot /** Pointer to function to return a substring of the input stream. String is returned in allocated 168*16467b97STreehugger Robot * memory and is in same encoding as the input stream itself, NOT internal ANTLR3_UCHAR form. 169*16467b97STreehugger Robot */ 170*16467b97STreehugger Robot pANTLR3_STRING (*substr) (struct ANTLR3_INPUT_STREAM_struct * input, ANTLR3_MARKER start, ANTLR3_MARKER stop); 171*16467b97STreehugger Robot 172*16467b97STreehugger Robot /** Pointer to function to return the current line number in the input stream 173*16467b97STreehugger Robot */ 174*16467b97STreehugger Robot ANTLR3_UINT32 (*getLine) (struct ANTLR3_INPUT_STREAM_struct * input); 175*16467b97STreehugger Robot 176*16467b97STreehugger Robot /** Pointer to function to return the current line buffer in the input stream 177*16467b97STreehugger Robot * The pointer returned is directly into the input stream so you must copy 178*16467b97STreehugger Robot * it if you wish to manipulate it without damaging the input stream. Encoding 179*16467b97STreehugger Robot * is obviously in the same form as the input stream. 180*16467b97STreehugger Robot * \remark 181*16467b97STreehugger Robot * - Note taht this function wil lbe inaccurate if setLine is called as there 182*16467b97STreehugger Robot * is no way at the moment to position the input stream at a particular line 183*16467b97STreehugger Robot * number offset. 184*16467b97STreehugger Robot */ 185*16467b97STreehugger Robot void * (*getLineBuf) (struct ANTLR3_INPUT_STREAM_struct * input); 186*16467b97STreehugger Robot 187*16467b97STreehugger Robot /** Pointer to function to return the current offset in the current input stream line 188*16467b97STreehugger Robot */ 189*16467b97STreehugger Robot ANTLR3_UINT32 (*getCharPositionInLine) (struct ANTLR3_INPUT_STREAM_struct * input); 190*16467b97STreehugger Robot 191*16467b97STreehugger Robot /** Pointer to function to set the current line number in the input stream 192*16467b97STreehugger Robot */ 193*16467b97STreehugger Robot void (*setLine) (struct ANTLR3_INPUT_STREAM_struct * input, ANTLR3_UINT32 line); 194*16467b97STreehugger Robot 195*16467b97STreehugger Robot /** Pointer to function to set the current position in the current line. 196*16467b97STreehugger Robot */ 197*16467b97STreehugger Robot void (*setCharPositionInLine) (struct ANTLR3_INPUT_STREAM_struct * input, ANTLR3_UINT32 position); 198*16467b97STreehugger Robot 199*16467b97STreehugger Robot /** Pointer to function to override the default newline character that the input stream 200*16467b97STreehugger Robot * looks for to trigger the line/offset and line buffer recording information. 201*16467b97STreehugger Robot * \remark 202*16467b97STreehugger Robot * - By default the chracter '\n' will be installed as the newline trigger character. When this 203*16467b97STreehugger Robot * character is seen by the consume() function then the current line number is incremented and the 204*16467b97STreehugger Robot * current line offset is reset to 0. The Pointer for the line of input we are consuming 205*16467b97STreehugger Robot * is updated to point to the next character after this one in the input stream (which means it 206*16467b97STreehugger Robot * may become invalid if the last newline character in the file is seen (so watch out). 207*16467b97STreehugger Robot * - If for some reason you do not want the counters and pointers to be restee, you can set the 208*16467b97STreehugger Robot * chracter to some impossible character such as '\0' or whatever. 209*16467b97STreehugger Robot * - This is a single character only, so choose the last character in a sequence of two or more. 210*16467b97STreehugger Robot * - This is only a simple aid to error reporting - if you have a complicated binary input structure 211*16467b97STreehugger Robot * it may not be adequate, but you can always override every function in the input stream with your 212*16467b97STreehugger Robot * own of course, and can even write your own complete input stream set if you like. 213*16467b97STreehugger Robot * - It is your responsiblity to set a valid character for the input stream type. There is no point 214*16467b97STreehugger Robot * setting this to 0xFFFFFFFF if the input stream is 8 bit ASCII, as this will just be truncated and never 215*16467b97STreehugger Robot * trigger as the comparison will be (INT32)0xFF == (INT32)0xFFFFFFFF 216*16467b97STreehugger Robot */ 217*16467b97STreehugger Robot void (*SetNewLineChar) (struct ANTLR3_INPUT_STREAM_struct * input, ANTLR3_UINT32 newlineChar); 218*16467b97STreehugger Robot 219*16467b97STreehugger Robot /// Character that automatically causes an internal line count 220*16467b97STreehugger Robot /// increment. 221*16467b97STreehugger Robot /// 222*16467b97STreehugger Robot ANTLR3_UCHAR newlineChar; 223*16467b97STreehugger Robot 224*16467b97STreehugger Robot /// Indicates the size, in 8 bit units, of a single character. Note that 225*16467b97STreehugger Robot /// the C runtime does not deal with surrogates as this would be 226*16467b97STreehugger Robot /// slow and complicated. If this is a UTF-8 stream then this field 227*16467b97STreehugger Robot /// will be set to 0. Generally you are best working internally with 32 bit characters 228*16467b97STreehugger Robot /// as this is the most efficient. 229*16467b97STreehugger Robot /// 230*16467b97STreehugger Robot ANTLR3_UINT8 charByteSize; 231*16467b97STreehugger Robot 232*16467b97STreehugger Robot /// Indicates the encoding scheme used in this input stream 233*16467b97STreehugger Robot /// 234*16467b97STreehugger Robot ANTLR3_UINT32 encoding; 235*16467b97STreehugger Robot } 236*16467b97STreehugger Robot 237*16467b97STreehugger Robot ANTLR3_INPUT_STREAM; 238*16467b97STreehugger Robot 239*16467b97STreehugger Robot 240*16467b97STreehugger Robot /** \brief Structure for track lex input states as part of mark() 241*16467b97STreehugger Robot * and rewind() of lexer. 242*16467b97STreehugger Robot */ 243*16467b97STreehugger Robot typedef struct ANTLR3_LEX_STATE_struct 244*16467b97STreehugger Robot { 245*16467b97STreehugger Robot /** Pointer to the next character to be consumed from the input data 246*16467b97STreehugger Robot * This is cast to point at the encoding of the original file that 247*16467b97STreehugger Robot * was read by the functions installed as pointer in this input stream 248*16467b97STreehugger Robot * context instance at file/string/whatever load time. 249*16467b97STreehugger Robot */ 250*16467b97STreehugger Robot void * nextChar; 251*16467b97STreehugger Robot 252*16467b97STreehugger Robot /** The line number we are traversing in the input file. This gets incremented 253*16467b97STreehugger Robot * by a newline() call in the lexer grammer actions. 254*16467b97STreehugger Robot */ 255*16467b97STreehugger Robot ANTLR3_UINT32 line; 256*16467b97STreehugger Robot 257*16467b97STreehugger Robot /** Pointer into the input buffer where the current line 258*16467b97STreehugger Robot * started. 259*16467b97STreehugger Robot */ 260*16467b97STreehugger Robot void * currentLine; 261*16467b97STreehugger Robot 262*16467b97STreehugger Robot /** The offset within the current line of the current character 263*16467b97STreehugger Robot */ 264*16467b97STreehugger Robot ANTLR3_INT32 charPositionInLine; 265*16467b97STreehugger Robot 266*16467b97STreehugger Robot } 267*16467b97STreehugger Robot ANTLR3_LEX_STATE; 268*16467b97STreehugger Robot 269*16467b97STreehugger Robot /* Prototypes 270*16467b97STreehugger Robot */ 271*16467b97STreehugger Robot void antlr38BitSetupStream (pANTLR3_INPUT_STREAM input); 272*16467b97STreehugger Robot void antlr3UTF16SetupStream (pANTLR3_INPUT_STREAM input, ANTLR3_BOOLEAN machineBigEndian, ANTLR3_BOOLEAN inputBigEndian); 273*16467b97STreehugger Robot void antlr3UTF32SetupStream (pANTLR3_INPUT_STREAM input, ANTLR3_BOOLEAN machineBigEndian, ANTLR3_BOOLEAN inputBigEndian); 274*16467b97STreehugger Robot void antlr3UTF8SetupStream (pANTLR3_INPUT_STREAM input); 275*16467b97STreehugger Robot void antlr3EBCDICSetupStream (pANTLR3_INPUT_STREAM input); 276*16467b97STreehugger Robot void antlr3GenericSetupStream (pANTLR3_INPUT_STREAM input); 277*16467b97STreehugger Robot #ifdef __cplusplus 278*16467b97STreehugger Robot } 279*16467b97STreehugger Robot #endif 280*16467b97STreehugger Robot 281*16467b97STreehugger Robot #endif /* _ANTLR3_INPUT_H */ 282