xref: /aosp_15_r20/external/antlr/runtime/Cpp/include/antlr3intstream.hpp (revision 16467b971bd3e2009fad32dd79016f2c7e421deb)
1 /** \file
2  * Defines the class interface for an antlr3 INTSTREAM.
3  *
4  * Certain functionality (such as DFAs, for instance) abstracts the stream of tokens
5  * or characters into a stream of integers. Hence this structure should be included
6  * in any stream that is able to provide its output as a stream of integers (which is anything,
7  * basically).
8  *
9  * There are no specific implementations of the methods in this interface in general. Though
10  * for purposes of casting and so on, it may be necessary to implement a function with
11  * the signature in this interface which abstracts the base implementation. In essence though
12  * the base stream provides a pointer to this interface, within which it installs its
13  * normal match() functions and so on. Interfaces such as DFA are then passed the pANTLR3_INT_STREAM
14  * and can treat any input as an int stream.
15  *
16  * For instance, a lexer implements a pANTLR3_BASE_RECOGNIZER, within which there is a pANTLR3_INT_STREAM.
17  * However, a pANTLR3_INPUT_STREAM also provides a pANTLR3_INT_STREAM, which it has constructed from
18  * its normal interface when it was created. This is then pointed at by the pANTLR3_BASE_RECOGNIZER
19  * when it is initialized with a pANTLR3_INPUT_STREAM.
20  *
21  * Similarly if a pANTLR3_BASE_RECOGNIZER is initialized with a pANTLR3_TOKEN_STREAM, then the
22  * pANTLR3_INT_STREAM is taken from the pANTLR3_TOKEN_STREAM.
23  *
24  * If a pANTLR3_BASE_RECOGNIZER is initialized with a pANTLR3_TREENODE_STREAM, then guess where
25  * the pANTLR3_INT_STREAM comes from?
26  *
27  * Note that because the context pointer points to the actual interface structure that is providing
28  * the ANTLR3_INT_STREAM, it is defined as a (void *) in this interface. There is no direct implementation
29  * of an ANTLR3_INT_STREAM (unless someone did not understand what I was doing here =;?P).
30  */
31 #ifndef	_ANTLR3_INTSTREAM_HPP
32 #define	_ANTLR3_INTSTREAM_HPP
33 
34 // [The "BSD licence"]
35 // Copyright (c) 2005-2009 Gokulakannan Somasundaram, ElectronDB
36 
37 //
38 // All rights reserved.
39 //
40 // Redistribution and use in source and binary forms, with or without
41 // modification, are permitted provided that the following conditions
42 // are met:
43 // 1. Redistributions of source code must retain the above copyright
44 //    notice, this list of conditions and the following disclaimer.
45 // 2. Redistributions in binary form must reproduce the above copyright
46 //    notice, this list of conditions and the following disclaimer in the
47 //    documentation and/or other materials provided with the distribution.
48 // 3. The name of the author may not be used to endorse or promote products
49 //    derived from this software without specific prior written permission.
50 //
51 // THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
52 // IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
53 // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
54 // IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
55 // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
56 // NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
57 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
58 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
59 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
60 // THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
61 
62 #include  <cassert>
63 
64 #include    "antlr3defs.hpp"
65 
66 ANTLR_BEGIN_NAMESPACE()
67 
/** Bit flags describing what kind of input a stream supplies.
 *
 * \remark A custom stream that can be treated as one of the kinds below
 *         may OR the matching flag into its own type indicator.
 * \remark DO NOT USE 0x0000 as a stream type!
 */
enum STREAM_TYPE
{
	/** The stream can be treated as a character stream. */
	CHARSTREAM     = 0x0001,

	/** The stream can be treated as a token stream. */
	TOKENSTREAM    = 0x0002,

	/** The stream can be treated as a common tree node stream. */
	COMMONTREENODE = 0x0004,

	/** Mask covering the three input-type bits above, so that a type
	 *  indicator can be masked down and switched on.
	 */
	INPUT_MASK     = 0x0007
};
93 
// Empty tag types. In this header they are only ever used as the template
// argument of ClassForwarder<> to select between overloads of _LA() and
// consume(); they carry no state of their own.

/// Tag: the endian handling is to be resolved at run time.
class RESOLVE_ENDIAN_AT_RUNTIME
{
};

/// Tag: selects the overloads that need no byte-order specific handling.
class BYTE_AGNOSTIC
{
};

/// Tag: selects the overloads written for little-endian input.
class ANTLR_LITTLE_ENDIAN
{
};

/// Tag: selects the overloads written for big-endian input.
class ANTLR_BIG_ENDIAN
{
};
98 
99 template<class ImplTraits, class SuperType>
100 class IntStream : public ImplTraits::AllocPolicyType
101 {
102 public:
103 	typedef typename ImplTraits::StringType StringType;
104 
105 protected:
106     /** Potentially useful in error reporting and so on, this string is
107      *  an identification of the input source. It may be NULL, so anything
108      *  attempting to access it needs to check this and substitute a sensible
109      *  default.
110      */
111     StringType		m_streamName;
112 
113     /** Last marker position allocated
114      */
115     ANTLR_MARKER	m_lastMarker;
116 
117     bool		m_upper_case; //if set, values should be returbed in upper case
118 
119     /// Indicates whether we should implement endian-specific logic
120     /// 0 - Undefined 1 - Default(machine and input are both same), 2 - Little Endian, 3 - Big Endian
121     ANTLR_UINT8		m_endian_spec;
122 
123 public:
124 	IntStream();
125 
126 	// Return a string that identifies the input source
127 	//
128 	StringType		getSourceName();
129 	StringType& 	get_streamName();
130 	const StringType& 	get_streamName() const;
131 	ANTLR_MARKER get_lastMarker() const;
132 
133 	SuperType* get_super();
134 	/**
135      * Function that installs a version of LA that always
136      * returns upper case. Only valid for character streams and creates a case
137      * insensitive lexer if the lexer tokens are described in upper case. The
138      * tokens will preserve case in the token text.
139      */
140     void	setUcaseLA(bool flag);
141 
142     /** Consume the next 'ANTR3_UINT32' in the stream
143      */
144     void		    consume();
145 
146     /** Get ANTLR3_UINT32 at current input pointer + i ahead where i=1 is next ANTLR3_UINT32
147      */
148     ANTLR_UINT32	_LA( ANTLR_INT32 i);
149 
150     /** Tell the stream to start buffering if it hasn't already.  Return
151      *  current input position, index(), or some other marker so that
152      *  when passed to rewind() you get back to the same spot.
153      *  rewind(mark()) should not affect the input cursor.
154      */
155     ANTLR_MARKER	    mark();
156 
157     /** Return the current input symbol index 0..n where n indicates the
158      *  last symbol has been read.
159      */
160     ANTLR_MARKER	    index();
161 
162     /** Reset the stream so that next call to index would return marker.
163      *  The marker will usually be index() but it doesn't have to be.  It's
164      *  just a marker to indicate what state the stream was in.  This is
165      *  essentially calling release() and seek().  If there are markers
166      *  created after this marker argument, this routine must unroll them
167      *  like a stack.  Assume the state the stream was in when this marker
168      *  was created.
169      */
170     void	rewind(ANTLR_MARKER marker);
171 
172     /** Reset the stream to the last marker position, witouh destryoing the
173      *  last marker position.
174      */
175     void	rewindLast();
176 
177     /** You may want to commit to a backtrack but don't want to force the
178      *  stream to keep bookkeeping objects around for a marker that is
179      *  no longer necessary.  This will have the same behavior as
180      *  rewind() except it releases resources without the backward seek.
181      */
182     void	release(ANTLR_MARKER mark);
183 
184     /** Set the input cursor to the position indicated by index.  This is
185      *  normally used to seek ahead in the input stream.  No buffering is
186      *  required to do this unless you know your stream will use seek to
187      *  move backwards such as when backtracking.
188      *
189      *  This is different from rewind in its multi-directional
190      *  requirement and in that its argument is strictly an input cursor (index).
191      *
192      *  For char streams, seeking forward must update the stream state such
193      *  as line number.  For seeking backwards, you will be presumably
194      *  backtracking using the mark/rewind mechanism that restores state and
195      *  so this method does not need to update state when seeking backwards.
196      *
197      *  Currently, this method is only used for efficient backtracking, but
198      *  in the future it may be used for incremental parsing.
199      */
200     void	seek(ANTLR_MARKER index);
201 
202 	/// Debug only method to flag consumption of initial off-channel
203 	/// tokens in the input stream
204 	///
205 	void consumeInitialHiddenTokens();
206 
207 	void  rewindMark(ANTLR_MARKER marker);
208 	ANTLR_MARKER tindex();
209 
210     /** Frees any resources that were allocated for the implementation of this
211      *  interface. Usually this is just releasing the memory allocated
212      *  for the structure itself, but it may of course do anything it need to
213      *  so long as it does not stamp on anything else.
214      */
215 	~IntStream();
216 
217 protected:
218 	void setupIntStream(bool machineBigEndian, bool inputBigEndian);
219 	void findout_endian_spec(bool machineBigEndian, bool inputBigEndian);
220 
221 	//If the user chooses this option, then we will be resolving stuffs at run-time
222 	ANTLR_UINT32	_LA( ANTLR_INT32 i, ClassForwarder<RESOLVE_ENDIAN_AT_RUNTIME> );
223 
224 	//resolve into one of the three categories below at runtime
225 	void	consume( ClassForwarder<RESOLVE_ENDIAN_AT_RUNTIME> );
226 };
227 
228 template<class ImplTraits, class SuperType>
229 class EBCDIC_IntStream : public IntStream<ImplTraits, SuperType>
230 {
231 public:
232 	ANTLR_UINT32	_LA( ANTLR_INT32 i);
233 
234 protected:
235 	void setupIntStream();
236 };
237 
238 template<class ImplTraits, class SuperType>
239 class UTF8_IntStream : public IntStream<ImplTraits, SuperType>
240 {
241 public:
242 	ANTLR_UINT32	_LA( ANTLR_INT32 i);
243 	void consume();
244 
245 protected:
246 	void setupIntStream(bool machineBigEndian, bool inputBigEndian);
247 
248 private:
249 	static const ANTLR_UINT32* TrailingBytesForUTF8();
250 	static const UTF32* OffsetsFromUTF8();
251 };
252 
253 template<class ImplTraits, class SuperType>
254 class UTF16_IntStream : public IntStream<ImplTraits, SuperType>
255 {
256 public:
257 	ANTLR_UINT32	_LA( ANTLR_INT32 i);
258 	void		    consume();
259 	ANTLR_MARKER	index();
260 	void seek(ANTLR_MARKER seekPoint);
261 
262 protected:
263 	void setupIntStream(bool machineBigEndian, bool inputBigEndian);
264 
265 	/// \brief Return the input element assuming an 8 bit ascii input
266 	///
267 	/// \param[in] input Input stream context pointer
268 	/// \param[in] la 1 based offset of next input stream element
269 	///
270 	/// \return Next input character in internal ANTLR3 encoding (UTF32)
271 	///
272 	ANTLR_UINT32	_LA( ANTLR_INT32 i, ClassForwarder<BYTE_AGNOSTIC> );
273 
274 	/// \brief Return the input element assuming a UTF16 input when the input is Little Endian and the machine is not
275 	///
276 	/// \param[in] input Input stream context pointer
277 	/// \param[in] la 1 based offset of next input stream element
278 	///
279 	/// \return Next input character in internal ANTLR3 encoding (UTF32)
280 	///
281 	ANTLR_UINT32	_LA( ANTLR_INT32 i, ClassForwarder<ANTLR_LITTLE_ENDIAN> );
282 
283 	/// \brief Return the input element assuming a UTF16 input when the input is Little Endian and the machine is not
284 	///
285 	/// \param[in] input Input stream context pointer
286 	/// \param[in] la 1 based offset of next input stream element
287 	///
288 	/// \return Next input character in internal ANTLR3 encoding (UTF32)
289 	///
290 	ANTLR_UINT32	_LA( ANTLR_INT32 i, ClassForwarder<ANTLR_BIG_ENDIAN> );
291 
292 	/// \brief Consume the next character in a UTF16 input stream
293 	///
294 	/// \param input Input stream context pointer
295 	///
296 	void	consume( ClassForwarder<BYTE_AGNOSTIC> );
297 
298 	/// \brief Consume the next character in a UTF16 input stream when the input is Little Endian and the machine is not
299 	/// Note that the UTF16 routines do not do any substantial verification of the input stream as for performance
300 	/// sake, we assume it is validly encoded. So if a low surrogate is found at the curent input position then we
301 	/// just consume it. Surrogate pairs should be seen as Hi, Lo. So if we have a Lo first, then the input stream
302 	/// is fubar but we just ignore that.
303 	///
304 	/// \param input Input stream context pointer
305 	///
306 	void	consume( ClassForwarder<ANTLR_LITTLE_ENDIAN> );
307 
308 	/// \brief Consume the next character in a UTF16 input stream when the input is Big Endian and the machine is not
309 	///
310 	/// \param input Input stream context pointer
311 	///
312 	void	consume( ClassForwarder<ANTLR_BIG_ENDIAN> );
313 };
314 
315 
316 
317 template<class ImplTraits, class SuperType>
318 class UTF32_IntStream : public IntStream<ImplTraits, SuperType>
319 {
320 public:
321 	ANTLR_UINT32	_LA( ANTLR_INT32 i);
322 	void		    consume();
323 
324 	/// \brief Calculate the current index in the output stream.
325 	/// \param[in] input Input stream context pointer
326 	///
327 	ANTLR_MARKER	index();
328 	void seek(ANTLR_MARKER seekPoint);
329 
330 protected:
331 	void setupIntStream(bool machineBigEndian, bool inputBigEndian);
332 	ANTLR_UINT32	_LA( ANTLR_INT32 i, ClassForwarder<RESOLVE_ENDIAN_AT_RUNTIME> );
333 	ANTLR_UINT32	_LA( ANTLR_INT32 i, ClassForwarder<BYTE_AGNOSTIC> );
334 	ANTLR_UINT32	_LA( ANTLR_INT32 i, ClassForwarder<ANTLR_LITTLE_ENDIAN> );
335 	ANTLR_UINT32	_LA( ANTLR_INT32 i, ClassForwarder<ANTLR_BIG_ENDIAN> );
336 
337 	void	consume( ClassForwarder<RESOLVE_ENDIAN_AT_RUNTIME> );
338 	void	consume( ClassForwarder<BYTE_AGNOSTIC> );
339 	void	consume( ClassForwarder<ANTLR_LITTLE_ENDIAN> );
340 	void	consume( ClassForwarder<ANTLR_BIG_ENDIAN> );
341 };
342 
343 template<class ImplTraits>
344 class TokenIntStream : public IntStream<ImplTraits, typename ImplTraits::TokenStreamType >
345 {
346 public:
347 	typedef typename ImplTraits::CommonTokenType CommonTokenType;
348 	typedef typename ImplTraits::StringType StringType;
349 	typedef typename ImplTraits::TokenStreamType TokenStreamType;
350 	typedef IntStream<ImplTraits, TokenStreamType > BaseType;
351 
352 private:
353 	/** Because the indirect call, though small in individual cases can
354      *  mount up if there are thousands of tokens (very large input streams), callers
355      *  of size can optionally use this cached size field.
356      */
357     ANTLR_UINT32	    m_cachedSize;
358 
359 public:
360 	TokenIntStream();
361 	ANTLR_UINT32 get_cachedSize() const;
362 	void set_cachedSize( ANTLR_UINT32 cachedSize );
363 
364 	void consume();
365 	void  consumeInitialHiddenTokens();
366 	ANTLR_UINT32  _LA( ANTLR_INT32 i );
367 	ANTLR_MARKER  mark();
368 	ANTLR_UINT32  size();
369 	void release();
370 	ANTLR_MARKER  tindex();
371 	void rewindLast();
372 	void rewind(ANTLR_MARKER marker);
373 	void seek(ANTLR_MARKER index);
374 	StringType getSourceName();
375 
376 };
377 
378 template<class ImplTraits>
379 class TreeNodeIntStream : public IntStream<ImplTraits, typename ImplTraits::CommonTreeNodeStreamType>
380 {
381 public:
382 	typedef typename ImplTraits::CommonTreeNodeStreamType CommonTreeNodeStreamType;
383 	typedef IntStream<ImplTraits, CommonTreeNodeStreamType > BaseType;
384 	typedef typename ImplTraits::TreeType TreeType;
385 	typedef typename ImplTraits::CommonTokenType CommonTokenType;
386 
387 public:
388 	void				consume();
389 	ANTLR_MARKER		tindex();
390 	ANTLR_UINT32		_LA(ANTLR_INT32 i);
391 	ANTLR_MARKER		mark();
392 	void				release(ANTLR_MARKER marker);
393 	void				rewindMark(ANTLR_MARKER marker);
394 	void				rewindLast();
395 	void				seek(ANTLR_MARKER index);
396 	ANTLR_UINT32		size();
397 };
398 
399 ANTLR_END_NAMESPACE()
400 
401 #include "antlr3intstream.inl"
402 
403 #endif
404 
405