1*16467b97STreehugger Robot /** \file 2*16467b97STreehugger Robot * Defines the basic structures used to manipulate character 3*16467b97STreehugger Robot * streams from any input source. Any character size and encoding 4*16467b97STreehugger Robot * can in theory be used, so long as a set of functinos is provided that 5*16467b97STreehugger Robot * can return a 32 bit Integer representation of their characters amd efficiently mark and revert 6*16467b97STreehugger Robot * to specific offsets into their input streams. 7*16467b97STreehugger Robot */ 8*16467b97STreehugger Robot #ifndef _ANTLR_INPUT_HPP 9*16467b97STreehugger Robot #define _ANTLR_INPUT_HPP 10*16467b97STreehugger Robot 11*16467b97STreehugger Robot // [The "BSD licence"] 12*16467b97STreehugger Robot // Copyright (c) 2005-2009 Gokulakannan Somasundaram, ElectronDB 13*16467b97STreehugger Robot 14*16467b97STreehugger Robot // 15*16467b97STreehugger Robot // All rights reserved. 16*16467b97STreehugger Robot // 17*16467b97STreehugger Robot // Redistribution and use in source and binary forms, with or without 18*16467b97STreehugger Robot // modification, are permitted provided that the following conditions 19*16467b97STreehugger Robot // are met: 20*16467b97STreehugger Robot // 1. Redistributions of source code must retain the above copyright 21*16467b97STreehugger Robot // notice, this list of conditions and the following disclaimer. 22*16467b97STreehugger Robot // 2. Redistributions in binary form must reproduce the above copyright 23*16467b97STreehugger Robot // notice, this list of conditions and the following disclaimer in the 24*16467b97STreehugger Robot // documentation and/or other materials provided with the distribution. 25*16467b97STreehugger Robot // 3. The name of the author may not be used to endorse or promote products 26*16467b97STreehugger Robot // derived from this software without specific prior written permission. 27*16467b97STreehugger Robot // 28*16467b97STreehugger Robot // THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 29*16467b97STreehugger Robot // IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 30*16467b97STreehugger Robot // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 31*16467b97STreehugger Robot // IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 32*16467b97STreehugger Robot // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 33*16467b97STreehugger Robot // NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 34*16467b97STreehugger Robot // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 35*16467b97STreehugger Robot // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 36*16467b97STreehugger Robot // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 37*16467b97STreehugger Robot // THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 38*16467b97STreehugger Robot 39*16467b97STreehugger Robot #include "antlr3defs.hpp" 40*16467b97STreehugger Robot 41*16467b97STreehugger Robot ANTLR_BEGIN_NAMESPACE() 42*16467b97STreehugger Robot 43*16467b97STreehugger Robot /// Master context structure for an ANTLR3 C runtime based input stream. 44*16467b97STreehugger Robot /// \ingroup apistructures. Calling _LT on this doesn't seem right. You would 45*16467b97STreehugger Robot /// call it only with parser / TreeParser, and their respective input streams 46*16467b97STreehugger Robot /// has that function. calling it from lexer will throw a compile time error 47*16467b97STreehugger Robot /// 48*16467b97STreehugger Robot 49*16467b97STreehugger Robot template<class ImplTraits> 50*16467b97STreehugger Robot class InputStream : public ImplTraits::template IntStreamType< typename ImplTraits::InputStreamType > 51*16467b97STreehugger Robot { 52*16467b97STreehugger Robot public: 53*16467b97STreehugger Robot typedef typename ImplTraits::AllocPolicyType AllocPolicyType; 54*16467b97STreehugger Robot typedef typename ImplTraits::LexStateType LexStateType; 55*16467b97STreehugger Robot typedef typename ImplTraits::template IntStreamType< typename ImplTraits::InputStreamType > IntStreamType; 56*16467b97STreehugger Robot typedef IntStreamType BaseType; 57*16467b97STreehugger Robot typedef typename ImplTraits::StreamDataType UnitType; 58*16467b97STreehugger Robot typedef UnitType DataType; 59*16467b97STreehugger Robot typedef UnitType TokenType; 60*16467b97STreehugger Robot typedef typename AllocPolicyType::template VectorType<LexStateType> MarkersType; 61*16467b97STreehugger Robot typedef typename ImplTraits::StringType StringType; 62*16467b97STreehugger Robot 63*16467b97STreehugger Robot private: 64*16467b97STreehugger Robot /** Pointer the start of the input string, characters may be 65*16467b97STreehugger Robot * taken as offsets from here and in original input format encoding. 66*16467b97STreehugger Robot */ 67*16467b97STreehugger Robot const DataType* m_data; 68*16467b97STreehugger Robot 69*16467b97STreehugger Robot /** Pointer to the next character to be consumed from the input data 70*16467b97STreehugger Robot * This is cast to point at the encoding of the original file that 71*16467b97STreehugger Robot * was read by the functions installed as pointer in this input stream 72*16467b97STreehugger Robot * context instance at file/string/whatever load time. 73*16467b97STreehugger Robot */ 74*16467b97STreehugger Robot const DataType* m_nextChar; 75*16467b97STreehugger Robot 76*16467b97STreehugger Robot /** Number of characters that can be consumed at this point in time. 77*16467b97STreehugger Robot * Mostly this is just what is left in the pre-read buffer, but if the 78*16467b97STreehugger Robot * input source is a stream such as a socket or something then we may 79*16467b97STreehugger Robot * call special read code to wait for more input. 80*16467b97STreehugger Robot */ 81*16467b97STreehugger Robot ANTLR_UINT32 m_sizeBuf; 82*16467b97STreehugger Robot 83*16467b97STreehugger Robot /** The line number we are traversing in the input file. This gets incremented 84*16467b97STreehugger Robot * by a newline() call in the lexer grammar actions. 85*16467b97STreehugger Robot */ 86*16467b97STreehugger Robot ANTLR_UINT32 m_line; 87*16467b97STreehugger Robot 88*16467b97STreehugger Robot /** Pointer into the input buffer where the current line 89*16467b97STreehugger Robot * started. 90*16467b97STreehugger Robot */ 91*16467b97STreehugger Robot const DataType* m_currentLine; 92*16467b97STreehugger Robot 93*16467b97STreehugger Robot /** The offset within the current line of the current character 94*16467b97STreehugger Robot */ 95*16467b97STreehugger Robot ANTLR_INT32 m_charPositionInLine; 96*16467b97STreehugger Robot 97*16467b97STreehugger Robot /** Tracks how deep mark() calls are nested 98*16467b97STreehugger Robot */ 99*16467b97STreehugger Robot ANTLR_UINT32 m_markDepth; 100*16467b97STreehugger Robot 101*16467b97STreehugger Robot /** List of mark() points in the input stream 102*16467b97STreehugger Robot */ 103*16467b97STreehugger Robot MarkersType m_markers; 104*16467b97STreehugger Robot 105*16467b97STreehugger Robot /** File name string, set to pointer to memory if 106*16467b97STreehugger Robot * you set it manually as it will be free()d 107*16467b97STreehugger Robot */ 108*16467b97STreehugger Robot StringType m_fileName; 109*16467b97STreehugger Robot 110*16467b97STreehugger Robot /** File number, needs to be set manually to some file index of your devising. 111*16467b97STreehugger Robot */ 112*16467b97STreehugger Robot ANTLR_UINT32 m_fileNo; 113*16467b97STreehugger Robot 114*16467b97STreehugger Robot /// Character that automatically causes an internal line count 115*16467b97STreehugger Robot /// increment. 116*16467b97STreehugger Robot /// 117*16467b97STreehugger Robot ANTLR_UCHAR m_newlineChar; 118*16467b97STreehugger Robot 119*16467b97STreehugger Robot /// Indicates the size, in 8 bit units, of a single character. Note that 120*16467b97STreehugger Robot /// the C runtime does not deal with surrogates as this would be 121*16467b97STreehugger Robot /// slow and complicated. If this is a UTF-8 stream then this field 122*16467b97STreehugger Robot /// will be set to 0. Generally you are best working internally with 32 bit characters 123*16467b97STreehugger Robot /// as this is the most efficient. 124*16467b97STreehugger Robot /// 125*16467b97STreehugger Robot ANTLR_UINT8 m_charByteSize; 126*16467b97STreehugger Robot 127*16467b97STreehugger Robot /** Indicates if the data pointer was allocated by us, and so should be freed 128*16467b97STreehugger Robot * when the stream dies. 129*16467b97STreehugger Robot */ 130*16467b97STreehugger Robot bool m_isAllocated; 131*16467b97STreehugger Robot 132*16467b97STreehugger Robot /// Indicates the encoding scheme used in this input stream 133*16467b97STreehugger Robot /// 134*16467b97STreehugger Robot ANTLR_UINT32 m_encoding; 135*16467b97STreehugger Robot 136*16467b97STreehugger Robot /* API */ 137*16467b97STreehugger Robot public: 138*16467b97STreehugger Robot InputStream(const ANTLR_UINT8* fileName, ANTLR_UINT32 encoding); 139*16467b97STreehugger Robot InputStream(const ANTLR_UINT8* data, ANTLR_UINT32 encoding, ANTLR_UINT32 size, ANTLR_UINT8* name); 140*16467b97STreehugger Robot ~InputStream(); 141*16467b97STreehugger Robot const DataType* get_data() const; 142*16467b97STreehugger Robot bool get_isAllocated() const; 143*16467b97STreehugger Robot const DataType* get_nextChar() const; 144*16467b97STreehugger Robot ANTLR_UINT32 get_sizeBuf() const; 145*16467b97STreehugger Robot ANTLR_UINT32 get_line() const; 146*16467b97STreehugger Robot const DataType* get_currentLine() const; 147*16467b97STreehugger Robot ANTLR_INT32 get_charPositionInLine() const; 148*16467b97STreehugger Robot ANTLR_UINT32 get_markDepth() const; 149*16467b97STreehugger Robot MarkersType& get_markers(); 150*16467b97STreehugger Robot const StringType& get_fileName() const; 151*16467b97STreehugger Robot ANTLR_UINT32 get_fileNo() const; 152*16467b97STreehugger Robot ANTLR_UCHAR get_newlineChar() const; 153*16467b97STreehugger Robot ANTLR_UINT8 get_charByteSize() const; 154*16467b97STreehugger Robot ANTLR_UINT32 get_encoding() const; 155*16467b97STreehugger Robot 156*16467b97STreehugger Robot void set_data( DataType* data ); 157*16467b97STreehugger Robot void set_isAllocated( bool isAllocated ); 158*16467b97STreehugger Robot void set_nextChar( const DataType* nextChar ); 159*16467b97STreehugger Robot void set_sizeBuf( ANTLR_UINT32 sizeBuf ); 160*16467b97STreehugger Robot void set_line( ANTLR_UINT32 line ); 161*16467b97STreehugger Robot void set_currentLine( const DataType* currentLine ); 162*16467b97STreehugger Robot void set_charPositionInLine( ANTLR_INT32 charPositionInLine ); 163*16467b97STreehugger Robot void set_markDepth( ANTLR_UINT32 markDepth ); 164*16467b97STreehugger Robot void set_markers( const MarkersType& markers ); 165*16467b97STreehugger Robot void set_fileName( const StringType& fileName ); 166*16467b97STreehugger Robot void set_fileNo( ANTLR_UINT32 fileNo ); 167*16467b97STreehugger Robot void set_newlineChar( ANTLR_UCHAR newlineChar ); 168*16467b97STreehugger Robot void set_charByteSize( ANTLR_UINT8 charByteSize ); 169*16467b97STreehugger Robot void set_encoding( ANTLR_UINT32 encoding ); 170*16467b97STreehugger Robot 171*16467b97STreehugger Robot void inc_charPositionInLine(); 172*16467b97STreehugger Robot void inc_line(); 173*16467b97STreehugger Robot void inc_markDepth(); 174*16467b97STreehugger Robot 175*16467b97STreehugger Robot IntStreamType* get_istream(); 176*16467b97STreehugger Robot 177*16467b97STreehugger Robot /** Function that resets the input stream 178*16467b97STreehugger Robot */ 179*16467b97STreehugger Robot void reset(); 180*16467b97STreehugger Robot 181*16467b97STreehugger Robot /** Pointer to a function that reuses and resets an input stream by 182*16467b97STreehugger Robot * supplying a new 'source' 183*16467b97STreehugger Robot */ 184*16467b97STreehugger Robot void reuse(ANTLR_UINT8* inString, ANTLR_UINT32 size, ANTLR_UINT8* name); 185*16467b97STreehugger Robot 186*16467b97STreehugger Robot 187*16467b97STreehugger Robot /** Function to return the total size of the input buffer. For streams 188*16467b97STreehugger Robot * this may be just the total we have available so far. This means of course that 189*16467b97STreehugger Robot * the input stream must be careful to accumulate enough input so that any backtracking 190*16467b97STreehugger Robot * can be satisfied. 191*16467b97STreehugger Robot */ 192*16467b97STreehugger Robot ANTLR_UINT32 size(); 193*16467b97STreehugger Robot 194*16467b97STreehugger Robot /** Function to return a substring of the input stream. String is returned in allocated 195*16467b97STreehugger Robot * memory and is in same encoding as the input stream itself, NOT internal ANTLR_UCHAR form. 196*16467b97STreehugger Robot */ 197*16467b97STreehugger Robot StringType substr(ANTLR_MARKER start, ANTLR_MARKER stop); 198*16467b97STreehugger Robot 199*16467b97STreehugger Robot /** Function to return the current line number in the input stream 200*16467b97STreehugger Robot */ 201*16467b97STreehugger Robot ANTLR_UINT32 get_line(); 202*16467b97STreehugger Robot 203*16467b97STreehugger Robot /** Function to return the current line buffer in the input stream 204*16467b97STreehugger Robot * The pointer returned is directly into the input stream so you must copy 205*16467b97STreehugger Robot * it if you wish to manipulate it without damaging the input stream. Encoding 206*16467b97STreehugger Robot * is obviously in the same form as the input stream. 207*16467b97STreehugger Robot * \remark 208*16467b97STreehugger Robot * - Note taht this function wil lbe inaccurate if setLine is called as there 209*16467b97STreehugger Robot * is no way at the moment to position the input stream at a particular line 210*16467b97STreehugger Robot * number offset. 211*16467b97STreehugger Robot */ 212*16467b97STreehugger Robot const DataType* getLineBuf(); 213*16467b97STreehugger Robot 214*16467b97STreehugger Robot /** Function to return the current offset in the current input stream line 215*16467b97STreehugger Robot */ 216*16467b97STreehugger Robot ANTLR_UINT32 get_charPositionInLine(); 217*16467b97STreehugger Robot 218*16467b97STreehugger Robot /** Function to set the current position in the current line. 219*16467b97STreehugger Robot */ 220*16467b97STreehugger Robot void set_charPositionInLine(ANTLR_UINT32 position); 221*16467b97STreehugger Robot 222*16467b97STreehugger Robot /** Function to override the default newline character that the input stream 223*16467b97STreehugger Robot * looks for to trigger the line/offset and line buffer recording information. 224*16467b97STreehugger Robot * \remark 225*16467b97STreehugger Robot * - By default the chracter '\n' will be installed as the newline trigger character. When this 226*16467b97STreehugger Robot * character is seen by the consume() function then the current line number is incremented and the 227*16467b97STreehugger Robot * current line offset is reset to 0. The Pointer for the line of input we are consuming 228*16467b97STreehugger Robot * is updated to point to the next character after this one in the input stream (which means it 229*16467b97STreehugger Robot * may become invalid if the last newline character in the file is seen (so watch out). 230*16467b97STreehugger Robot * - If for some reason you do not want the counters and pointers to be restee, you can set the 231*16467b97STreehugger Robot * chracter to some impossible character such as '\0' or whatever. 232*16467b97STreehugger Robot * - This is a single character only, so choose the last character in a sequence of two or more. 233*16467b97STreehugger Robot * - This is only a simple aid to error reporting - if you have a complicated binary input structure 234*16467b97STreehugger Robot * it may not be adequate, but you can always override every function in the input stream with your 235*16467b97STreehugger Robot * own of course, and can even write your own complete input stream set if you like. 236*16467b97STreehugger Robot * - It is your responsiblity to set a valid character for the input stream type. There is no point 237*16467b97STreehugger Robot * setting this to 0xFFFFFFFF if the input stream is 8 bit ASCII, as this will just be truncated and never 238*16467b97STreehugger Robot * trigger as the comparison will be (INT32)0xFF == (INT32)0xFFFFFFFF 239*16467b97STreehugger Robot */ 240*16467b97STreehugger Robot void set_newLineChar(ANTLR_UINT32 newlineChar); 241*16467b97STreehugger Robot 242*16467b97STreehugger Robot ANTLR_MARKER index_impl(); 243*16467b97STreehugger Robot 244*16467b97STreehugger Robot private: 245*16467b97STreehugger Robot /** \brief Use the contents of an operating system file as the input 246*16467b97STreehugger Robot * for an input stream. 247*16467b97STreehugger Robot * 248*16467b97STreehugger Robot * \param fileName Name of operating system file to read. 249*16467b97STreehugger Robot * \return 250*16467b97STreehugger Robot * - Pointer to new input stream context upon success 251*16467b97STreehugger Robot * - One of the ANTLR3_ERR_ defines on error. 252*16467b97STreehugger Robot */ 253*16467b97STreehugger Robot void createFileStream(const ANTLR_UINT8* fileName); 254*16467b97STreehugger Robot 255*16467b97STreehugger Robot /** \brief Use the supplied 'string' as input to the stream 256*16467b97STreehugger Robot * 257*16467b97STreehugger Robot * \param data Pointer to the input data 258*16467b97STreehugger Robot * \return 259*16467b97STreehugger Robot * - Pointer to new input stream context upon success 260*16467b97STreehugger Robot * - NULL defines on error. 261*16467b97STreehugger Robot */ 262*16467b97STreehugger Robot void createStringStream(const ANTLR_UINT8* data); 263*16467b97STreehugger Robot void genericSetupStream(); 264*16467b97STreehugger Robot 265*16467b97STreehugger Robot /// Determine endianess of the input stream and install the 266*16467b97STreehugger Robot /// API required for the encoding in that format. 267*16467b97STreehugger Robot /// 268*16467b97STreehugger Robot void setupInputStream(); 269*16467b97STreehugger Robot 270*16467b97STreehugger Robot }; 271*16467b97STreehugger Robot 272*16467b97STreehugger Robot /** \brief Structure for track lex input states as part of mark() 273*16467b97STreehugger Robot * and rewind() of lexer. 274*16467b97STreehugger Robot */ 275*16467b97STreehugger Robot template<class ImplTraits> 276*16467b97STreehugger Robot class LexState : public ImplTraits::AllocPolicyType 277*16467b97STreehugger Robot { 278*16467b97STreehugger Robot public: 279*16467b97STreehugger Robot typedef typename ImplTraits::StreamDataType DataType; 280*16467b97STreehugger Robot 281*16467b97STreehugger Robot private: 282*16467b97STreehugger Robot /** Pointer to the next character to be consumed from the input data 283*16467b97STreehugger Robot * This is cast to point at the encoding of the original file that 284*16467b97STreehugger Robot * was read by the functions installed as pointer in this input stream 285*16467b97STreehugger Robot * context instance at file/string/whatever load time. 286*16467b97STreehugger Robot */ 287*16467b97STreehugger Robot const DataType* m_nextChar; 288*16467b97STreehugger Robot 289*16467b97STreehugger Robot /** The line number we are traversing in the input file. This gets incremented 290*16467b97STreehugger Robot * by a newline() call in the lexer grammer actions. 291*16467b97STreehugger Robot */ 292*16467b97STreehugger Robot ANTLR_UINT32 m_line; 293*16467b97STreehugger Robot 294*16467b97STreehugger Robot /** Pointer into the input buffer where the current line 295*16467b97STreehugger Robot * started. 296*16467b97STreehugger Robot */ 297*16467b97STreehugger Robot const DataType* m_currentLine; 298*16467b97STreehugger Robot 299*16467b97STreehugger Robot /** The offset within the current line of the current character 300*16467b97STreehugger Robot */ 301*16467b97STreehugger Robot ANTLR_INT32 m_charPositionInLine; 302*16467b97STreehugger Robot 303*16467b97STreehugger Robot public: 304*16467b97STreehugger Robot LexState(); 305*16467b97STreehugger Robot const DataType* get_nextChar() const; 306*16467b97STreehugger Robot ANTLR_UINT32 get_line() const; 307*16467b97STreehugger Robot const DataType* get_currentLine() const; 308*16467b97STreehugger Robot ANTLR_INT32 get_charPositionInLine() const; 309*16467b97STreehugger Robot void set_nextChar( const DataType* nextChar ); 310*16467b97STreehugger Robot void set_line( ANTLR_UINT32 line ); 311*16467b97STreehugger Robot void set_currentLine( const DataType* currentLine ); 312*16467b97STreehugger Robot void set_charPositionInLine( ANTLR_INT32 charPositionInLine ); 313*16467b97STreehugger Robot }; 314*16467b97STreehugger Robot 315*16467b97STreehugger Robot class ParseNullStringException : public std::exception 316*16467b97STreehugger Robot { what() const317*16467b97STreehugger Robot virtual const char* what() const throw() 318*16467b97STreehugger Robot { 319*16467b97STreehugger Robot return "Null String"; 320*16467b97STreehugger Robot } 321*16467b97STreehugger Robot }; 322*16467b97STreehugger Robot 323*16467b97STreehugger Robot ANTLR_END_NAMESPACE() 324*16467b97STreehugger Robot 325*16467b97STreehugger Robot #include "antlr3input.inl" 326*16467b97STreehugger Robot 327*16467b97STreehugger Robot #endif /* _ANTLR_INPUT_H */ 328