xref: /aosp_15_r20/external/antlr/runtime/Cpp/include/antlr3input.hpp (revision 16467b971bd3e2009fad32dd79016f2c7e421deb)
/** \file
 * Defines the basic structures used to manipulate character
 * streams from any input source. Any character size and encoding
 * can in theory be used, so long as a set of functions is provided that
 * can return a 32 bit integer representation of their characters and efficiently mark and revert
 * to specific offsets into their input streams.
 */
8*16467b97STreehugger Robot #ifndef	_ANTLR_INPUT_HPP
9*16467b97STreehugger Robot #define	_ANTLR_INPUT_HPP
10*16467b97STreehugger Robot 
11*16467b97STreehugger Robot // [The "BSD licence"]
12*16467b97STreehugger Robot // Copyright (c) 2005-2009 Gokulakannan Somasundaram, ElectronDB
13*16467b97STreehugger Robot 
14*16467b97STreehugger Robot //
15*16467b97STreehugger Robot // All rights reserved.
16*16467b97STreehugger Robot //
17*16467b97STreehugger Robot // Redistribution and use in source and binary forms, with or without
18*16467b97STreehugger Robot // modification, are permitted provided that the following conditions
19*16467b97STreehugger Robot // are met:
20*16467b97STreehugger Robot // 1. Redistributions of source code must retain the above copyright
21*16467b97STreehugger Robot //    notice, this list of conditions and the following disclaimer.
22*16467b97STreehugger Robot // 2. Redistributions in binary form must reproduce the above copyright
23*16467b97STreehugger Robot //    notice, this list of conditions and the following disclaimer in the
24*16467b97STreehugger Robot //    documentation and/or other materials provided with the distribution.
25*16467b97STreehugger Robot // 3. The name of the author may not be used to endorse or promote products
26*16467b97STreehugger Robot //    derived from this software without specific prior written permission.
27*16467b97STreehugger Robot //
28*16467b97STreehugger Robot // THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
29*16467b97STreehugger Robot // IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
30*16467b97STreehugger Robot // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
31*16467b97STreehugger Robot // IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
32*16467b97STreehugger Robot // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
33*16467b97STreehugger Robot // NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
34*16467b97STreehugger Robot // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
35*16467b97STreehugger Robot // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
36*16467b97STreehugger Robot // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
37*16467b97STreehugger Robot // THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38*16467b97STreehugger Robot 
39*16467b97STreehugger Robot #include    "antlr3defs.hpp"
40*16467b97STreehugger Robot 
41*16467b97STreehugger Robot ANTLR_BEGIN_NAMESPACE()
42*16467b97STreehugger Robot 
43*16467b97STreehugger Robot /// Master context structure for an ANTLR3 C runtime based input stream.
44*16467b97STreehugger Robot /// \ingroup apistructures. Calling _LT on this doesn't seem right. You would
45*16467b97STreehugger Robot /// call it only with parser / TreeParser, and their respective input streams
46*16467b97STreehugger Robot /// has that function. calling it from lexer will throw a compile time error
47*16467b97STreehugger Robot ///
48*16467b97STreehugger Robot 
49*16467b97STreehugger Robot template<class ImplTraits>
50*16467b97STreehugger Robot class	InputStream :   public ImplTraits::template IntStreamType< typename ImplTraits::InputStreamType >
51*16467b97STreehugger Robot {
52*16467b97STreehugger Robot public:
53*16467b97STreehugger Robot 	typedef typename ImplTraits::AllocPolicyType AllocPolicyType;
54*16467b97STreehugger Robot 	typedef typename ImplTraits::LexStateType LexStateType;
55*16467b97STreehugger Robot 	typedef typename ImplTraits::template IntStreamType< typename ImplTraits::InputStreamType > IntStreamType;
56*16467b97STreehugger Robot 	typedef IntStreamType BaseType;
57*16467b97STreehugger Robot 	typedef typename ImplTraits::StreamDataType UnitType;
58*16467b97STreehugger Robot 	typedef UnitType DataType;
59*16467b97STreehugger Robot 	typedef UnitType TokenType;
60*16467b97STreehugger Robot 	typedef typename AllocPolicyType::template VectorType<LexStateType> MarkersType;
61*16467b97STreehugger Robot 	typedef typename ImplTraits::StringType StringType;
62*16467b97STreehugger Robot 
63*16467b97STreehugger Robot private:
64*16467b97STreehugger Robot     /** Pointer the start of the input string, characters may be
65*16467b97STreehugger Robot      *  taken as offsets from here and in original input format encoding.
66*16467b97STreehugger Robot      */
67*16467b97STreehugger Robot     const DataType*		m_data;
68*16467b97STreehugger Robot 
69*16467b97STreehugger Robot     /** Pointer to the next character to be consumed from the input data
70*16467b97STreehugger Robot      *  This is cast to point at the encoding of the original file that
71*16467b97STreehugger Robot      *  was read by the functions installed as pointer in this input stream
72*16467b97STreehugger Robot      *  context instance at file/string/whatever load time.
73*16467b97STreehugger Robot      */
74*16467b97STreehugger Robot     const DataType*		m_nextChar;
75*16467b97STreehugger Robot 
76*16467b97STreehugger Robot     /** Number of characters that can be consumed at this point in time.
77*16467b97STreehugger Robot      *  Mostly this is just what is left in the pre-read buffer, but if the
78*16467b97STreehugger Robot      *  input source is a stream such as a socket or something then we may
79*16467b97STreehugger Robot      *  call special read code to wait for more input.
80*16467b97STreehugger Robot      */
81*16467b97STreehugger Robot     ANTLR_UINT32	m_sizeBuf;
82*16467b97STreehugger Robot 
83*16467b97STreehugger Robot     /** The line number we are traversing in the input file. This gets incremented
84*16467b97STreehugger Robot      *  by a newline() call in the lexer grammar actions.
85*16467b97STreehugger Robot      */
86*16467b97STreehugger Robot     ANTLR_UINT32	m_line;
87*16467b97STreehugger Robot 
88*16467b97STreehugger Robot     /** Pointer into the input buffer where the current line
89*16467b97STreehugger Robot      *  started.
90*16467b97STreehugger Robot      */
91*16467b97STreehugger Robot     const DataType*		m_currentLine;
92*16467b97STreehugger Robot 
93*16467b97STreehugger Robot     /** The offset within the current line of the current character
94*16467b97STreehugger Robot      */
95*16467b97STreehugger Robot     ANTLR_INT32		m_charPositionInLine;
96*16467b97STreehugger Robot 
97*16467b97STreehugger Robot     /** Tracks how deep mark() calls are nested
98*16467b97STreehugger Robot      */
99*16467b97STreehugger Robot     ANTLR_UINT32	m_markDepth;
100*16467b97STreehugger Robot 
101*16467b97STreehugger Robot     /** List of mark() points in the input stream
102*16467b97STreehugger Robot      */
103*16467b97STreehugger Robot     MarkersType		m_markers;
104*16467b97STreehugger Robot 
105*16467b97STreehugger Robot     /** File name string, set to pointer to memory if
106*16467b97STreehugger Robot      * you set it manually as it will be free()d
107*16467b97STreehugger Robot      */
108*16467b97STreehugger Robot     StringType		m_fileName;
109*16467b97STreehugger Robot 
110*16467b97STreehugger Robot     /** File number, needs to be set manually to some file index of your devising.
111*16467b97STreehugger Robot      */
112*16467b97STreehugger Robot     ANTLR_UINT32	m_fileNo;
113*16467b97STreehugger Robot 
114*16467b97STreehugger Robot 	/// Character that automatically causes an internal line count
115*16467b97STreehugger Robot     ///  increment.
116*16467b97STreehugger Robot     ///
117*16467b97STreehugger Robot     ANTLR_UCHAR		m_newlineChar;
118*16467b97STreehugger Robot 
119*16467b97STreehugger Robot     /// Indicates the size, in 8 bit units, of a single character. Note that
120*16467b97STreehugger Robot     /// the C runtime does not deal with surrogates as this would be
121*16467b97STreehugger Robot     /// slow and complicated. If this is a UTF-8 stream then this field
122*16467b97STreehugger Robot     /// will be set to 0. Generally you are best working internally with 32 bit characters
123*16467b97STreehugger Robot     /// as this is the most efficient.
124*16467b97STreehugger Robot     ///
125*16467b97STreehugger Robot     ANTLR_UINT8		m_charByteSize;
126*16467b97STreehugger Robot 
127*16467b97STreehugger Robot    /** Indicates if the data pointer was allocated by us, and so should be freed
128*16467b97STreehugger Robot      *  when the stream dies.
129*16467b97STreehugger Robot      */
130*16467b97STreehugger Robot     bool			m_isAllocated;
131*16467b97STreehugger Robot 
132*16467b97STreehugger Robot     /// Indicates the encoding scheme used in this input stream
133*16467b97STreehugger Robot     ///
134*16467b97STreehugger Robot     ANTLR_UINT32    m_encoding;
135*16467b97STreehugger Robot 
136*16467b97STreehugger Robot     /* API */
137*16467b97STreehugger Robot public:
138*16467b97STreehugger Robot 	InputStream(const ANTLR_UINT8* fileName, ANTLR_UINT32 encoding);
139*16467b97STreehugger Robot 	InputStream(const ANTLR_UINT8* data, ANTLR_UINT32 encoding, ANTLR_UINT32 size, ANTLR_UINT8* name);
140*16467b97STreehugger Robot 	~InputStream();
141*16467b97STreehugger Robot 	const DataType* get_data() const;
142*16467b97STreehugger Robot 	bool get_isAllocated() const;
143*16467b97STreehugger Robot 	const DataType* get_nextChar() const;
144*16467b97STreehugger Robot 	ANTLR_UINT32 get_sizeBuf() const;
145*16467b97STreehugger Robot 	ANTLR_UINT32 get_line() const;
146*16467b97STreehugger Robot 	const DataType* get_currentLine() const;
147*16467b97STreehugger Robot 	ANTLR_INT32 get_charPositionInLine() const;
148*16467b97STreehugger Robot 	ANTLR_UINT32 get_markDepth() const;
149*16467b97STreehugger Robot 	MarkersType& get_markers();
150*16467b97STreehugger Robot 	const StringType& get_fileName() const;
151*16467b97STreehugger Robot 	ANTLR_UINT32 get_fileNo() const;
152*16467b97STreehugger Robot 	ANTLR_UCHAR get_newlineChar() const;
153*16467b97STreehugger Robot 	ANTLR_UINT8 get_charByteSize() const;
154*16467b97STreehugger Robot 	ANTLR_UINT32 get_encoding() const;
155*16467b97STreehugger Robot 
156*16467b97STreehugger Robot 	void  set_data( DataType* data );
157*16467b97STreehugger Robot 	void  set_isAllocated( bool isAllocated );
158*16467b97STreehugger Robot 	void  set_nextChar( const DataType* nextChar );
159*16467b97STreehugger Robot 	void  set_sizeBuf( ANTLR_UINT32 sizeBuf );
160*16467b97STreehugger Robot 	void  set_line( ANTLR_UINT32 line );
161*16467b97STreehugger Robot 	void  set_currentLine( const DataType* currentLine );
162*16467b97STreehugger Robot 	void  set_charPositionInLine( ANTLR_INT32 charPositionInLine );
163*16467b97STreehugger Robot 	void  set_markDepth( ANTLR_UINT32 markDepth );
164*16467b97STreehugger Robot 	void  set_markers( const MarkersType& markers );
165*16467b97STreehugger Robot 	void  set_fileName( const StringType& fileName );
166*16467b97STreehugger Robot 	void  set_fileNo( ANTLR_UINT32 fileNo );
167*16467b97STreehugger Robot 	void  set_newlineChar( ANTLR_UCHAR newlineChar );
168*16467b97STreehugger Robot 	void  set_charByteSize( ANTLR_UINT8 charByteSize );
169*16467b97STreehugger Robot 	void  set_encoding( ANTLR_UINT32 encoding );
170*16467b97STreehugger Robot 
171*16467b97STreehugger Robot 	void inc_charPositionInLine();
172*16467b97STreehugger Robot 	void inc_line();
173*16467b97STreehugger Robot 	void inc_markDepth();
174*16467b97STreehugger Robot 
175*16467b97STreehugger Robot 	IntStreamType*	get_istream();
176*16467b97STreehugger Robot 
177*16467b97STreehugger Robot     /** Function that resets the input stream
178*16467b97STreehugger Robot      */
179*16467b97STreehugger Robot     void	reset();
180*16467b97STreehugger Robot 
181*16467b97STreehugger Robot     /** Pointer to a function that reuses and resets an input stream by
182*16467b97STreehugger Robot      *  supplying a new 'source'
183*16467b97STreehugger Robot      */
184*16467b97STreehugger Robot     void    reuse(ANTLR_UINT8* inString, ANTLR_UINT32 size, ANTLR_UINT8* name);
185*16467b97STreehugger Robot 
186*16467b97STreehugger Robot 
187*16467b97STreehugger Robot     /** Function to return the total size of the input buffer. For streams
188*16467b97STreehugger Robot      *  this may be just the total we have available so far. This means of course that
189*16467b97STreehugger Robot      *  the input stream must be careful to accumulate enough input so that any backtracking
190*16467b97STreehugger Robot      *  can be satisfied.
191*16467b97STreehugger Robot      */
192*16467b97STreehugger Robot     ANTLR_UINT32	size();
193*16467b97STreehugger Robot 
194*16467b97STreehugger Robot     /** Function to return a substring of the input stream. String is returned in allocated
195*16467b97STreehugger Robot      *  memory and is in same encoding as the input stream itself, NOT internal ANTLR_UCHAR form.
196*16467b97STreehugger Robot      */
197*16467b97STreehugger Robot     StringType	substr(ANTLR_MARKER start, ANTLR_MARKER stop);
198*16467b97STreehugger Robot 
199*16467b97STreehugger Robot     /** Function to return the current line number in the input stream
200*16467b97STreehugger Robot      */
201*16467b97STreehugger Robot     ANTLR_UINT32	get_line();
202*16467b97STreehugger Robot 
203*16467b97STreehugger Robot     /** Function to return the current line buffer in the input stream
204*16467b97STreehugger Robot      *  The pointer returned is directly into the input stream so you must copy
205*16467b97STreehugger Robot      *  it if you wish to manipulate it without damaging the input stream. Encoding
206*16467b97STreehugger Robot      *  is obviously in the same form as the input stream.
207*16467b97STreehugger Robot      *  \remark
208*16467b97STreehugger Robot      *    - Note taht this function wil lbe inaccurate if setLine is called as there
209*16467b97STreehugger Robot      *      is no way at the moment to position the input stream at a particular line
210*16467b97STreehugger Robot      *	    number offset.
211*16467b97STreehugger Robot      */
212*16467b97STreehugger Robot     const DataType*	getLineBuf();
213*16467b97STreehugger Robot 
214*16467b97STreehugger Robot     /** Function to return the current offset in the current input stream line
215*16467b97STreehugger Robot      */
216*16467b97STreehugger Robot     ANTLR_UINT32	get_charPositionInLine();
217*16467b97STreehugger Robot 
218*16467b97STreehugger Robot     /** Function to set the current position in the current line.
219*16467b97STreehugger Robot      */
220*16467b97STreehugger Robot     void	set_charPositionInLine(ANTLR_UINT32 position);
221*16467b97STreehugger Robot 
222*16467b97STreehugger Robot     /** Function to override the default newline character that the input stream
223*16467b97STreehugger Robot      *  looks for to trigger the line/offset and line buffer recording information.
224*16467b97STreehugger Robot      *  \remark
225*16467b97STreehugger Robot      *   - By default the chracter '\n' will be installed as the newline trigger character. When this
226*16467b97STreehugger Robot      *     character is seen by the consume() function then the current line number is incremented and the
227*16467b97STreehugger Robot      *     current line offset is reset to 0. The Pointer for the line of input we are consuming
228*16467b97STreehugger Robot      *     is updated to point to the next character after this one in the input stream (which means it
229*16467b97STreehugger Robot      *     may become invalid if the last newline character in the file is seen (so watch out).
230*16467b97STreehugger Robot      *   - If for some reason you do not want the counters and pointers to be restee, you can set the
231*16467b97STreehugger Robot      *     chracter to some impossible character such as '\0' or whatever.
232*16467b97STreehugger Robot      *   - This is a single character only, so choose the last character in a sequence of two or more.
233*16467b97STreehugger Robot      *   - This is only a simple aid to error reporting - if you have a complicated binary input structure
234*16467b97STreehugger Robot      *     it may not be adequate, but you can always override every function in the input stream with your
235*16467b97STreehugger Robot      *     own of course, and can even write your own complete input stream set if you like.
236*16467b97STreehugger Robot      *   - It is your responsiblity to set a valid character for the input stream type. There is no point
237*16467b97STreehugger Robot      *     setting this to 0xFFFFFFFF if the input stream is 8 bit ASCII, as this will just be truncated and never
238*16467b97STreehugger Robot      *	   trigger as the comparison will be (INT32)0xFF == (INT32)0xFFFFFFFF
239*16467b97STreehugger Robot      */
240*16467b97STreehugger Robot     void	set_newLineChar(ANTLR_UINT32 newlineChar);
241*16467b97STreehugger Robot 
242*16467b97STreehugger Robot 	ANTLR_MARKER index_impl();
243*16467b97STreehugger Robot 
244*16467b97STreehugger Robot private:
245*16467b97STreehugger Robot 	/** \brief Use the contents of an operating system file as the input
246*16467b97STreehugger Robot 	 *         for an input stream.
247*16467b97STreehugger Robot 	 *
248*16467b97STreehugger Robot 	 * \param fileName Name of operating system file to read.
249*16467b97STreehugger Robot 	 * \return
250*16467b97STreehugger Robot 	 *	- Pointer to new input stream context upon success
251*16467b97STreehugger Robot 	 *	- One of the ANTLR3_ERR_ defines on error.
252*16467b97STreehugger Robot 	 */
253*16467b97STreehugger Robot 	void createFileStream(const ANTLR_UINT8* fileName);
254*16467b97STreehugger Robot 
255*16467b97STreehugger Robot 	/** \brief Use the supplied 'string' as input to the stream
256*16467b97STreehugger Robot 	 *
257*16467b97STreehugger Robot 	 * \param data Pointer to the input data
258*16467b97STreehugger Robot 	 * \return
259*16467b97STreehugger Robot 	 *	- Pointer to new input stream context upon success
260*16467b97STreehugger Robot 	 *	- NULL defines on error.
261*16467b97STreehugger Robot 	 */
262*16467b97STreehugger Robot 	void createStringStream(const ANTLR_UINT8* data);
263*16467b97STreehugger Robot 	void genericSetupStream();
264*16467b97STreehugger Robot 
265*16467b97STreehugger Robot 	/// Determine endianess of the input stream and install the
266*16467b97STreehugger Robot 	/// API required for the encoding in that format.
267*16467b97STreehugger Robot 	///
268*16467b97STreehugger Robot 	void setupInputStream();
269*16467b97STreehugger Robot 
270*16467b97STreehugger Robot };
271*16467b97STreehugger Robot 
272*16467b97STreehugger Robot /** \brief Structure for track lex input states as part of mark()
273*16467b97STreehugger Robot  *  and rewind() of lexer.
274*16467b97STreehugger Robot  */
275*16467b97STreehugger Robot template<class ImplTraits>
276*16467b97STreehugger Robot class	LexState : public ImplTraits::AllocPolicyType
277*16467b97STreehugger Robot {
278*16467b97STreehugger Robot public:
279*16467b97STreehugger Robot 	typedef typename ImplTraits::StreamDataType DataType;
280*16467b97STreehugger Robot 
281*16467b97STreehugger Robot private:
282*16467b97STreehugger Robot         /** Pointer to the next character to be consumed from the input data
283*16467b97STreehugger Robot      *  This is cast to point at the encoding of the original file that
284*16467b97STreehugger Robot      *  was read by the functions installed as pointer in this input stream
285*16467b97STreehugger Robot      *  context instance at file/string/whatever load time.
286*16467b97STreehugger Robot      */
287*16467b97STreehugger Robot     const DataType*			m_nextChar;
288*16467b97STreehugger Robot 
289*16467b97STreehugger Robot     /** The line number we are traversing in the input file. This gets incremented
290*16467b97STreehugger Robot      *  by a newline() call in the lexer grammer actions.
291*16467b97STreehugger Robot      */
292*16467b97STreehugger Robot     ANTLR_UINT32	m_line;
293*16467b97STreehugger Robot 
294*16467b97STreehugger Robot     /** Pointer into the input buffer where the current line
295*16467b97STreehugger Robot      *  started.
296*16467b97STreehugger Robot      */
297*16467b97STreehugger Robot     const DataType*			m_currentLine;
298*16467b97STreehugger Robot 
299*16467b97STreehugger Robot     /** The offset within the current line of the current character
300*16467b97STreehugger Robot      */
301*16467b97STreehugger Robot     ANTLR_INT32		m_charPositionInLine;
302*16467b97STreehugger Robot 
303*16467b97STreehugger Robot public:
304*16467b97STreehugger Robot 	LexState();
305*16467b97STreehugger Robot 	const DataType* get_nextChar() const;
306*16467b97STreehugger Robot 	ANTLR_UINT32 get_line() const;
307*16467b97STreehugger Robot 	const DataType* get_currentLine() const;
308*16467b97STreehugger Robot 	ANTLR_INT32 get_charPositionInLine() const;
309*16467b97STreehugger Robot 	void  set_nextChar( const DataType* nextChar );
310*16467b97STreehugger Robot 	void  set_line( ANTLR_UINT32 line );
311*16467b97STreehugger Robot 	void  set_currentLine( const DataType* currentLine );
312*16467b97STreehugger Robot 	void  set_charPositionInLine( ANTLR_INT32 charPositionInLine );
313*16467b97STreehugger Robot };
314*16467b97STreehugger Robot 
/** \brief Exception whose what() text is "Null String".
 *
 *  NOTE(review): the throw sites are not part of this header; going by the
 *  name it presumably reports a null input string -- confirm against the
 *  code that throws it.
 */
class ParseNullStringException : public std::exception
{
public:
	/** Returns the fixed diagnostic text "Null String".
	 *
	 *  Declared public so the message can be read through the concrete type
	 *  as well as through a std::exception reference. Under the class-default
	 *  (private) access only the latter compiled.
	 */
	virtual const char* what() const throw()
	{
		return "Null String";
	}
};
322*16467b97STreehugger Robot 
323*16467b97STreehugger Robot ANTLR_END_NAMESPACE()
324*16467b97STreehugger Robot 
325*16467b97STreehugger Robot #include "antlr3input.inl"
326*16467b97STreehugger Robot 
#endif	/* _ANTLR_INPUT_HPP  */
328