1 /** \file 2 * Defines the the class interface for an antlr3 INTSTREAM. 3 * 4 * Certain functionality (such as DFAs for instance) abstract the stream of tokens 5 * or characters in to a steam of integers. Hence this structure should be included 6 * in any stream that is able to provide the output as a stream of integers (which is anything 7 * basically. 8 * 9 * There are no specific implementations of the methods in this interface in general. Though 10 * for purposes of casting and so on, it may be necesssary to implement a function with 11 * the signature in this interface which abstracts the base immplementation. In essence though 12 * the base stream provides a pointer to this interface, within which it installs its 13 * normal match() functions and so on. Interaces such as DFA are then passed the pANTLR3_INT_STREAM 14 * and can treat any input as an int stream. 15 * 16 * For instance, a lexer implements a pANTLR3_BASE_RECOGNIZER, within which there is a pANTLR3_INT_STREAM. 17 * However, a pANTLR3_INPUT_STREAM also provides a pANTLR3_INT_STREAM, which it has constructed from 18 * it's normal interface when it was created. This is then pointed at by the pANTLR_BASE_RECOGNIZER 19 * when it is intialized with a pANTLR3_INPUT_STREAM. 20 * 21 * Similarly if a pANTLR3_BASE_RECOGNIZER is initialized with a pANTLR3_TOKEN_STREAM, then the 22 * pANTLR3_INT_STREAM is taken from the pANTLR3_TOKEN_STREAM. 23 * 24 * If a pANTLR3_BASE_RECOGNIZER is initialized with a pANTLR3_TREENODE_STREAM, then guess where 25 * the pANTLR3_INT_STREAM comes from? 26 * 27 * Note that because the context pointer points to the actual interface structure that is providing 28 * the ANTLR3_INT_STREAM it is defined as a (void *) in this interface. There is no direct implementation 29 * of an ANTLR3_INT_STREAM (unless someone did not understand what I was doing here =;?P 30 */ 31 #ifndef _ANTLR3_INTSTREAM_HPP 32 #define _ANTLR3_INTSTREAM_HPP 33 34 // [The "BSD licence"] 35 // Copyright (c) 2005-2009 Gokulakannan Somasundaram, ElectronDB 36 37 // 38 // All rights reserved. 39 // 40 // Redistribution and use in source and binary forms, with or without 41 // modification, are permitted provided that the following conditions 42 // are met: 43 // 1. Redistributions of source code must retain the above copyright 44 // notice, this list of conditions and the following disclaimer. 45 // 2. Redistributions in binary form must reproduce the above copyright 46 // notice, this list of conditions and the following disclaimer in the 47 // documentation and/or other materials provided with the distribution. 48 // 3. The name of the author may not be used to endorse or promote products 49 // derived from this software without specific prior written permission. 50 // 51 // THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 52 // IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 53 // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 54 // IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 55 // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 56 // NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 57 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 58 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 59 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 60 // THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 61 62 #include <cassert> 63 64 #include "antlr3defs.hpp" 65 66 ANTLR_BEGIN_NAMESPACE() 67 68 enum STREAM_TYPE 69 { 70 /** Type indicator for a character stream 71 * \remark if a custom stream is created but it can be treated as 72 * a char stream, then you may OR in this value to your type indicator 73 */ 74 CHARSTREAM = 0x0001 75 76 /** Type indicator for a Token stream 77 * \remark if a custom stream is created but it can be treated as 78 * a token stream, then you may OR in this value to your type indicator 79 */ 80 , TOKENSTREAM = 0x0002 81 82 /** Type indicator for a common tree node stream 83 * \remark if a custom stream is created but it can be treated as 84 * a common tree node stream, then you may OR in this value to your type indicator 85 */ 86 , COMMONTREENODE = 0x0004 87 88 /** Type mask for input stream so we can switch in the above types 89 * \remark DO NOT USE 0x0000 as a stream type! 90 */ 91 , INPUT_MASK = 0x0007 92 }; 93 94 class RESOLVE_ENDIAN_AT_RUNTIME {}; 95 class BYTE_AGNOSTIC {}; 96 class ANTLR_LITTLE_ENDIAN {}; 97 class ANTLR_BIG_ENDIAN {}; 98 99 template<class ImplTraits, class SuperType> 100 class IntStream : public ImplTraits::AllocPolicyType 101 { 102 public: 103 typedef typename ImplTraits::StringType StringType; 104 105 protected: 106 /** Potentially useful in error reporting and so on, this string is 107 * an identification of the input source. It may be NULL, so anything 108 * attempting to access it needs to check this and substitute a sensible 109 * default. 110 */ 111 StringType m_streamName; 112 113 /** Last marker position allocated 114 */ 115 ANTLR_MARKER m_lastMarker; 116 117 bool m_upper_case; //if set, values should be returbed in upper case 118 119 /// Indicates whether we should implement endian-specific logic 120 /// 0 - Undefined 1 - Default(machine and input are both same), 2 - Little Endian, 3 - Big Endian 121 ANTLR_UINT8 m_endian_spec; 122 123 public: 124 IntStream(); 125 126 // Return a string that identifies the input source 127 // 128 StringType getSourceName(); 129 StringType& get_streamName(); 130 const StringType& get_streamName() const; 131 ANTLR_MARKER get_lastMarker() const; 132 133 SuperType* get_super(); 134 /** 135 * Function that installs a version of LA that always 136 * returns upper case. Only valid for character streams and creates a case 137 * insensitive lexer if the lexer tokens are described in upper case. The 138 * tokens will preserve case in the token text. 139 */ 140 void setUcaseLA(bool flag); 141 142 /** Consume the next 'ANTR3_UINT32' in the stream 143 */ 144 void consume(); 145 146 /** Get ANTLR3_UINT32 at current input pointer + i ahead where i=1 is next ANTLR3_UINT32 147 */ 148 ANTLR_UINT32 _LA( ANTLR_INT32 i); 149 150 /** Tell the stream to start buffering if it hasn't already. Return 151 * current input position, index(), or some other marker so that 152 * when passed to rewind() you get back to the same spot. 153 * rewind(mark()) should not affect the input cursor. 154 */ 155 ANTLR_MARKER mark(); 156 157 /** Return the current input symbol index 0..n where n indicates the 158 * last symbol has been read. 159 */ 160 ANTLR_MARKER index(); 161 162 /** Reset the stream so that next call to index would return marker. 163 * The marker will usually be index() but it doesn't have to be. It's 164 * just a marker to indicate what state the stream was in. This is 165 * essentially calling release() and seek(). If there are markers 166 * created after this marker argument, this routine must unroll them 167 * like a stack. Assume the state the stream was in when this marker 168 * was created. 169 */ 170 void rewind(ANTLR_MARKER marker); 171 172 /** Reset the stream to the last marker position, witouh destryoing the 173 * last marker position. 174 */ 175 void rewindLast(); 176 177 /** You may want to commit to a backtrack but don't want to force the 178 * stream to keep bookkeeping objects around for a marker that is 179 * no longer necessary. This will have the same behavior as 180 * rewind() except it releases resources without the backward seek. 181 */ 182 void release(ANTLR_MARKER mark); 183 184 /** Set the input cursor to the position indicated by index. This is 185 * normally used to seek ahead in the input stream. No buffering is 186 * required to do this unless you know your stream will use seek to 187 * move backwards such as when backtracking. 188 * 189 * This is different from rewind in its multi-directional 190 * requirement and in that its argument is strictly an input cursor (index). 191 * 192 * For char streams, seeking forward must update the stream state such 193 * as line number. For seeking backwards, you will be presumably 194 * backtracking using the mark/rewind mechanism that restores state and 195 * so this method does not need to update state when seeking backwards. 196 * 197 * Currently, this method is only used for efficient backtracking, but 198 * in the future it may be used for incremental parsing. 199 */ 200 void seek(ANTLR_MARKER index); 201 202 /// Debug only method to flag consumption of initial off-channel 203 /// tokens in the input stream 204 /// 205 void consumeInitialHiddenTokens(); 206 207 void rewindMark(ANTLR_MARKER marker); 208 ANTLR_MARKER tindex(); 209 210 /** Frees any resources that were allocated for the implementation of this 211 * interface. Usually this is just releasing the memory allocated 212 * for the structure itself, but it may of course do anything it need to 213 * so long as it does not stamp on anything else. 214 */ 215 ~IntStream(); 216 217 protected: 218 void setupIntStream(bool machineBigEndian, bool inputBigEndian); 219 void findout_endian_spec(bool machineBigEndian, bool inputBigEndian); 220 221 //If the user chooses this option, then we will be resolving stuffs at run-time 222 ANTLR_UINT32 _LA( ANTLR_INT32 i, ClassForwarder<RESOLVE_ENDIAN_AT_RUNTIME> ); 223 224 //resolve into one of the three categories below at runtime 225 void consume( ClassForwarder<RESOLVE_ENDIAN_AT_RUNTIME> ); 226 }; 227 228 template<class ImplTraits, class SuperType> 229 class EBCDIC_IntStream : public IntStream<ImplTraits, SuperType> 230 { 231 public: 232 ANTLR_UINT32 _LA( ANTLR_INT32 i); 233 234 protected: 235 void setupIntStream(); 236 }; 237 238 template<class ImplTraits, class SuperType> 239 class UTF8_IntStream : public IntStream<ImplTraits, SuperType> 240 { 241 public: 242 ANTLR_UINT32 _LA( ANTLR_INT32 i); 243 void consume(); 244 245 protected: 246 void setupIntStream(bool machineBigEndian, bool inputBigEndian); 247 248 private: 249 static const ANTLR_UINT32* TrailingBytesForUTF8(); 250 static const UTF32* OffsetsFromUTF8(); 251 }; 252 253 template<class ImplTraits, class SuperType> 254 class UTF16_IntStream : public IntStream<ImplTraits, SuperType> 255 { 256 public: 257 ANTLR_UINT32 _LA( ANTLR_INT32 i); 258 void consume(); 259 ANTLR_MARKER index(); 260 void seek(ANTLR_MARKER seekPoint); 261 262 protected: 263 void setupIntStream(bool machineBigEndian, bool inputBigEndian); 264 265 /// \brief Return the input element assuming an 8 bit ascii input 266 /// 267 /// \param[in] input Input stream context pointer 268 /// \param[in] la 1 based offset of next input stream element 269 /// 270 /// \return Next input character in internal ANTLR3 encoding (UTF32) 271 /// 272 ANTLR_UINT32 _LA( ANTLR_INT32 i, ClassForwarder<BYTE_AGNOSTIC> ); 273 274 /// \brief Return the input element assuming a UTF16 input when the input is Little Endian and the machine is not 275 /// 276 /// \param[in] input Input stream context pointer 277 /// \param[in] la 1 based offset of next input stream element 278 /// 279 /// \return Next input character in internal ANTLR3 encoding (UTF32) 280 /// 281 ANTLR_UINT32 _LA( ANTLR_INT32 i, ClassForwarder<ANTLR_LITTLE_ENDIAN> ); 282 283 /// \brief Return the input element assuming a UTF16 input when the input is Little Endian and the machine is not 284 /// 285 /// \param[in] input Input stream context pointer 286 /// \param[in] la 1 based offset of next input stream element 287 /// 288 /// \return Next input character in internal ANTLR3 encoding (UTF32) 289 /// 290 ANTLR_UINT32 _LA( ANTLR_INT32 i, ClassForwarder<ANTLR_BIG_ENDIAN> ); 291 292 /// \brief Consume the next character in a UTF16 input stream 293 /// 294 /// \param input Input stream context pointer 295 /// 296 void consume( ClassForwarder<BYTE_AGNOSTIC> ); 297 298 /// \brief Consume the next character in a UTF16 input stream when the input is Little Endian and the machine is not 299 /// Note that the UTF16 routines do not do any substantial verification of the input stream as for performance 300 /// sake, we assume it is validly encoded. So if a low surrogate is found at the curent input position then we 301 /// just consume it. Surrogate pairs should be seen as Hi, Lo. So if we have a Lo first, then the input stream 302 /// is fubar but we just ignore that. 303 /// 304 /// \param input Input stream context pointer 305 /// 306 void consume( ClassForwarder<ANTLR_LITTLE_ENDIAN> ); 307 308 /// \brief Consume the next character in a UTF16 input stream when the input is Big Endian and the machine is not 309 /// 310 /// \param input Input stream context pointer 311 /// 312 void consume( ClassForwarder<ANTLR_BIG_ENDIAN> ); 313 }; 314 315 316 317 template<class ImplTraits, class SuperType> 318 class UTF32_IntStream : public IntStream<ImplTraits, SuperType> 319 { 320 public: 321 ANTLR_UINT32 _LA( ANTLR_INT32 i); 322 void consume(); 323 324 /// \brief Calculate the current index in the output stream. 325 /// \param[in] input Input stream context pointer 326 /// 327 ANTLR_MARKER index(); 328 void seek(ANTLR_MARKER seekPoint); 329 330 protected: 331 void setupIntStream(bool machineBigEndian, bool inputBigEndian); 332 ANTLR_UINT32 _LA( ANTLR_INT32 i, ClassForwarder<RESOLVE_ENDIAN_AT_RUNTIME> ); 333 ANTLR_UINT32 _LA( ANTLR_INT32 i, ClassForwarder<BYTE_AGNOSTIC> ); 334 ANTLR_UINT32 _LA( ANTLR_INT32 i, ClassForwarder<ANTLR_LITTLE_ENDIAN> ); 335 ANTLR_UINT32 _LA( ANTLR_INT32 i, ClassForwarder<ANTLR_BIG_ENDIAN> ); 336 337 void consume( ClassForwarder<RESOLVE_ENDIAN_AT_RUNTIME> ); 338 void consume( ClassForwarder<BYTE_AGNOSTIC> ); 339 void consume( ClassForwarder<ANTLR_LITTLE_ENDIAN> ); 340 void consume( ClassForwarder<ANTLR_BIG_ENDIAN> ); 341 }; 342 343 template<class ImplTraits> 344 class TokenIntStream : public IntStream<ImplTraits, typename ImplTraits::TokenStreamType > 345 { 346 public: 347 typedef typename ImplTraits::CommonTokenType CommonTokenType; 348 typedef typename ImplTraits::StringType StringType; 349 typedef typename ImplTraits::TokenStreamType TokenStreamType; 350 typedef IntStream<ImplTraits, TokenStreamType > BaseType; 351 352 private: 353 /** Because the indirect call, though small in individual cases can 354 * mount up if there are thousands of tokens (very large input streams), callers 355 * of size can optionally use this cached size field. 356 */ 357 ANTLR_UINT32 m_cachedSize; 358 359 public: 360 TokenIntStream(); 361 ANTLR_UINT32 get_cachedSize() const; 362 void set_cachedSize( ANTLR_UINT32 cachedSize ); 363 364 void consume(); 365 void consumeInitialHiddenTokens(); 366 ANTLR_UINT32 _LA( ANTLR_INT32 i ); 367 ANTLR_MARKER mark(); 368 ANTLR_UINT32 size(); 369 void release(); 370 ANTLR_MARKER tindex(); 371 void rewindLast(); 372 void rewind(ANTLR_MARKER marker); 373 void seek(ANTLR_MARKER index); 374 StringType getSourceName(); 375 376 }; 377 378 template<class ImplTraits> 379 class TreeNodeIntStream : public IntStream<ImplTraits, typename ImplTraits::CommonTreeNodeStreamType> 380 { 381 public: 382 typedef typename ImplTraits::CommonTreeNodeStreamType CommonTreeNodeStreamType; 383 typedef IntStream<ImplTraits, CommonTreeNodeStreamType > BaseType; 384 typedef typename ImplTraits::TreeType TreeType; 385 typedef typename ImplTraits::CommonTokenType CommonTokenType; 386 387 public: 388 void consume(); 389 ANTLR_MARKER tindex(); 390 ANTLR_UINT32 _LA(ANTLR_INT32 i); 391 ANTLR_MARKER mark(); 392 void release(ANTLR_MARKER marker); 393 void rewindMark(ANTLR_MARKER marker); 394 void rewindLast(); 395 void seek(ANTLR_MARKER index); 396 ANTLR_UINT32 size(); 397 }; 398 399 ANTLR_END_NAMESPACE() 400 401 #include "antlr3intstream.inl" 402 403 #endif 404 405