xref: /aosp_15_r20/external/antlr/runtime/C/include/antlr3input.h (revision 16467b971bd3e2009fad32dd79016f2c7e421deb)
1*16467b97STreehugger Robot /** \file
2*16467b97STreehugger Robot  * Defines the basic structures used to manipulate character
3*16467b97STreehugger Robot  * streams from any input source. Any character size and encoding
4*16467b97STreehugger Robot  * can in theory be used, so long as a set of functions is provided that
5*16467b97STreehugger Robot  * can return a 32 bit Integer representation of their characters and efficiently mark and revert
6*16467b97STreehugger Robot  * to specific offsets into their input streams.
7*16467b97STreehugger Robot  */
8*16467b97STreehugger Robot #ifndef	_ANTLR3_INPUT_H
9*16467b97STreehugger Robot #define	_ANTLR3_INPUT_H
10*16467b97STreehugger Robot 
11*16467b97STreehugger Robot // [The "BSD licence"]
12*16467b97STreehugger Robot // Copyright (c) 2005-2009 Jim Idle, Temporal Wave LLC
13*16467b97STreehugger Robot // http://www.temporal-wave.com
14*16467b97STreehugger Robot // http://www.linkedin.com/in/jimidle
15*16467b97STreehugger Robot //
16*16467b97STreehugger Robot // All rights reserved.
17*16467b97STreehugger Robot //
18*16467b97STreehugger Robot // Redistribution and use in source and binary forms, with or without
19*16467b97STreehugger Robot // modification, are permitted provided that the following conditions
20*16467b97STreehugger Robot // are met:
21*16467b97STreehugger Robot // 1. Redistributions of source code must retain the above copyright
22*16467b97STreehugger Robot //    notice, this list of conditions and the following disclaimer.
23*16467b97STreehugger Robot // 2. Redistributions in binary form must reproduce the above copyright
24*16467b97STreehugger Robot //    notice, this list of conditions and the following disclaimer in the
25*16467b97STreehugger Robot //    documentation and/or other materials provided with the distribution.
26*16467b97STreehugger Robot // 3. The name of the author may not be used to endorse or promote products
27*16467b97STreehugger Robot //    derived from this software without specific prior written permission.
28*16467b97STreehugger Robot //
29*16467b97STreehugger Robot // THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
30*16467b97STreehugger Robot // IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
31*16467b97STreehugger Robot // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
32*16467b97STreehugger Robot // IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
33*16467b97STreehugger Robot // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
34*16467b97STreehugger Robot // NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
35*16467b97STreehugger Robot // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
36*16467b97STreehugger Robot // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
37*16467b97STreehugger Robot // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
38*16467b97STreehugger Robot // THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
39*16467b97STreehugger Robot 
40*16467b97STreehugger Robot #include    <antlr3defs.h>
41*16467b97STreehugger Robot #include    <antlr3string.h>
42*16467b97STreehugger Robot #include    <antlr3commontoken.h>
43*16467b97STreehugger Robot #include    <antlr3intstream.h>
44*16467b97STreehugger Robot #include    <antlr3convertutf.h>
45*16467b97STreehugger Robot 
46*16467b97STreehugger Robot #ifdef __cplusplus
47*16467b97STreehugger Robot extern "C" {
48*16467b97STreehugger Robot #endif
49*16467b97STreehugger Robot 
50*16467b97STreehugger Robot 
51*16467b97STreehugger Robot 
52*16467b97STreehugger Robot /// Master context structure for an ANTLR3 C runtime based input stream.
53*16467b97STreehugger Robot /// \ingroup apistructures
54*16467b97STreehugger Robot /// Layout: stream state fields first, then the API function pointers, then newline/encoding settings.
55*16467b97STreehugger Robot typedef	struct	ANTLR3_INPUT_STREAM_struct
56*16467b97STreehugger Robot {
57*16467b97STreehugger Robot     /** Interfaces that provide streams must all provide
58*16467b97STreehugger Robot      *  a generic ANTLR3_INT_STREAM interface and an ANTLR3_INPUT_STREAM
59*16467b97STreehugger Robot      *  is no different.
60*16467b97STreehugger Robot      */
61*16467b97STreehugger Robot     pANTLR3_INT_STREAM	istream;
62*16467b97STreehugger Robot 
63*16467b97STreehugger Robot     /** Whatever super structure is providing the INPUT stream needs a pointer to itself
64*16467b97STreehugger Robot      *  so that this can be passed back to it whenever the api functions
65*16467b97STreehugger Robot      *  are called back from this interface.
66*16467b97STreehugger Robot      */
67*16467b97STreehugger Robot     void	      * super;
68*16467b97STreehugger Robot 
69*16467b97STreehugger Robot     /** Pointer to the start of the input string; characters may be
70*16467b97STreehugger Robot      *  taken as offsets from here and are in the original input format encoding.
71*16467b97STreehugger Robot      */
72*16467b97STreehugger Robot     void	      *	data;
73*16467b97STreehugger Robot 
74*16467b97STreehugger Robot     /** Indicates if the data pointer was allocated by us, and so should be freed
75*16467b97STreehugger Robot      *  when the stream dies.
76*16467b97STreehugger Robot      */
77*16467b97STreehugger Robot     int			isAllocated;
78*16467b97STreehugger Robot 
79*16467b97STreehugger Robot     /** String factory for this input stream
80*16467b97STreehugger Robot      */
81*16467b97STreehugger Robot     pANTLR3_STRING_FACTORY  strFactory;
82*16467b97STreehugger Robot 
83*16467b97STreehugger Robot 
84*16467b97STreehugger Robot     /** Pointer to the next character to be consumed from the input data.
85*16467b97STreehugger Robot      *  This is cast to point at the encoding of the original file that
86*16467b97STreehugger Robot      *  was read by the functions installed as pointers in this input stream
87*16467b97STreehugger Robot      *  context instance at file/string/whatever load time.
88*16467b97STreehugger Robot      */
89*16467b97STreehugger Robot     void	      * nextChar;
90*16467b97STreehugger Robot 
91*16467b97STreehugger Robot     /** Number of characters that can be consumed at this point in time.
92*16467b97STreehugger Robot      *  Mostly this is just what is left in the pre-read buffer, but if the
93*16467b97STreehugger Robot      *  input source is a stream such as a socket or something then we may
94*16467b97STreehugger Robot      *  call special read code to wait for more input.
95*16467b97STreehugger Robot      */
96*16467b97STreehugger Robot     ANTLR3_UINT32	sizeBuf;
97*16467b97STreehugger Robot 
98*16467b97STreehugger Robot     /** The line number we are traversing in the input file. This gets incremented
99*16467b97STreehugger Robot      *  by a newline() call in the lexer grammar actions.
100*16467b97STreehugger Robot      */
101*16467b97STreehugger Robot     ANTLR3_UINT32	line;
102*16467b97STreehugger Robot 
103*16467b97STreehugger Robot     /** Pointer into the input buffer where the current line
104*16467b97STreehugger Robot      *  started.
105*16467b97STreehugger Robot      */
106*16467b97STreehugger Robot     void	      * currentLine;
107*16467b97STreehugger Robot 
108*16467b97STreehugger Robot     /** The offset within the current line of the current character
109*16467b97STreehugger Robot      */
110*16467b97STreehugger Robot     ANTLR3_INT32	charPositionInLine;
111*16467b97STreehugger Robot 
112*16467b97STreehugger Robot     /** Tracks how deep mark() calls are nested
113*16467b97STreehugger Robot      */
114*16467b97STreehugger Robot     ANTLR3_UINT32	markDepth;
115*16467b97STreehugger Robot 
116*16467b97STreehugger Robot     /** List of mark() points in the input stream
117*16467b97STreehugger Robot      */
118*16467b97STreehugger Robot     pANTLR3_VECTOR	markers;
119*16467b97STreehugger Robot 
120*16467b97STreehugger Robot     /** File name string; if you set it manually, set it to a pointer to
121*16467b97STreehugger Robot      * allocated memory, as it will be free()d
122*16467b97STreehugger Robot      */
123*16467b97STreehugger Robot     pANTLR3_STRING	fileName;
124*16467b97STreehugger Robot 
125*16467b97STreehugger Robot     /** File number, needs to be set manually to some file index of your devising.
126*16467b97STreehugger Robot      */
127*16467b97STreehugger Robot     ANTLR3_UINT32	fileNo;
128*16467b97STreehugger Robot 
129*16467b97STreehugger Robot     /* API */
130*16467b97STreehugger Robot 
131*16467b97STreehugger Robot 
132*16467b97STreehugger Robot    /** Pointer to function that closes the input stream. The free pointer declared after it has
133*16467b97STreehugger Robot      *  no description of its own in this header (presumably it releases the stream - TODO confirm). */
134*16467b97STreehugger Robot     void		(*close)	(struct	ANTLR3_INPUT_STREAM_struct * input);
135*16467b97STreehugger Robot     void		(*free)		(struct	ANTLR3_INPUT_STREAM_struct * input);
136*16467b97STreehugger Robot 
137*16467b97STreehugger Robot     /** Pointer to function that resets the input stream
138*16467b97STreehugger Robot      */
139*16467b97STreehugger Robot     void		(*reset)	(struct	ANTLR3_INPUT_STREAM_struct * input);
140*16467b97STreehugger Robot 
141*16467b97STreehugger Robot     /** Pointer to a function that reuses and resets an input stream by
142*16467b97STreehugger Robot      *  supplying a new 'source' (the inString buffer, its size, and a name)
143*16467b97STreehugger Robot      */
144*16467b97STreehugger Robot     void                (*reuse)        (struct	ANTLR3_INPUT_STREAM_struct * input, pANTLR3_UINT8 inString, ANTLR3_UINT32 size, pANTLR3_UINT8 name);
145*16467b97STreehugger Robot 
146*16467b97STreehugger Robot     /**
147*16467b97STreehugger Robot      * Pointer to function that installs a version of LA that always
148*16467b97STreehugger Robot      * returns upper case. Only valid for character streams and creates a case
149*16467b97STreehugger Robot      * insensitive lexer if the lexer tokens are described in upper case. The
150*16467b97STreehugger Robot      * tokens will preserve case in the token text.
151*16467b97STreehugger Robot      */
152*16467b97STreehugger Robot     void		(*setUcaseLA)		(pANTLR3_INPUT_STREAM input, ANTLR3_BOOLEAN flag);
153*16467b97STreehugger Robot 
154*16467b97STreehugger Robot     /** Pointer to function to return input stream element at 1 based
155*16467b97STreehugger Robot      *  offset from nextChar. Same as _LA for char stream, but token
156*16467b97STreehugger Robot      *  streams etc. have one of these that does other stuff of course.
157*16467b97STreehugger Robot      */
158*16467b97STreehugger Robot     void *		(*_LT)		(struct	ANTLR3_INPUT_STREAM_struct * input, ANTLR3_INT32 lt);
159*16467b97STreehugger Robot 
160*16467b97STreehugger Robot     /** Pointer to function to return the total size of the input buffer. For streams
161*16467b97STreehugger Robot      *  this may be just the total we have available so far. This means of course that
162*16467b97STreehugger Robot      *  the input stream must be careful to accumulate enough input so that any backtracking
163*16467b97STreehugger Robot      *  can be satisfied.
164*16467b97STreehugger Robot      */
165*16467b97STreehugger Robot     ANTLR3_UINT32	(*size)		(struct ANTLR3_INPUT_STREAM_struct * input);
166*16467b97STreehugger Robot 
167*16467b97STreehugger Robot     /** Pointer to function to return a substring of the input stream. String is returned in allocated
168*16467b97STreehugger Robot      *  memory and is in same encoding as the input stream itself, NOT internal ANTLR3_UCHAR form.
169*16467b97STreehugger Robot      */
170*16467b97STreehugger Robot     pANTLR3_STRING	(*substr)	(struct ANTLR3_INPUT_STREAM_struct * input, ANTLR3_MARKER start, ANTLR3_MARKER stop);
171*16467b97STreehugger Robot 
172*16467b97STreehugger Robot     /** Pointer to function to return the current line number in the input stream
173*16467b97STreehugger Robot      */
174*16467b97STreehugger Robot     ANTLR3_UINT32	(*getLine)	(struct ANTLR3_INPUT_STREAM_struct * input);
175*16467b97STreehugger Robot 
176*16467b97STreehugger Robot     /** Pointer to function to return the current line buffer in the input stream.
177*16467b97STreehugger Robot      *  The pointer returned is directly into the input stream so you must copy
178*16467b97STreehugger Robot      *  it if you wish to manipulate it without damaging the input stream. Encoding
179*16467b97STreehugger Robot      *  is obviously in the same form as the input stream.
180*16467b97STreehugger Robot      *  \remark
181*16467b97STreehugger Robot      *    - Note that this function will be inaccurate if setLine is called as there
182*16467b97STreehugger Robot      *      is no way at the moment to position the input stream at a particular line
183*16467b97STreehugger Robot      *	    number offset.
184*16467b97STreehugger Robot      */
185*16467b97STreehugger Robot     void	  *	(*getLineBuf)	(struct ANTLR3_INPUT_STREAM_struct * input);
186*16467b97STreehugger Robot 
187*16467b97STreehugger Robot     /** Pointer to function to return the current offset in the current input stream line
188*16467b97STreehugger Robot      */
189*16467b97STreehugger Robot     ANTLR3_UINT32	(*getCharPositionInLine)  (struct ANTLR3_INPUT_STREAM_struct * input);
190*16467b97STreehugger Robot 
191*16467b97STreehugger Robot     /** Pointer to function to set the current line number in the input stream
192*16467b97STreehugger Robot      */
193*16467b97STreehugger Robot     void		(*setLine)		  (struct ANTLR3_INPUT_STREAM_struct * input, ANTLR3_UINT32 line);
194*16467b97STreehugger Robot 
195*16467b97STreehugger Robot     /** Pointer to function to set the current position in the current line.
196*16467b97STreehugger Robot      */
197*16467b97STreehugger Robot     void		(*setCharPositionInLine)  (struct ANTLR3_INPUT_STREAM_struct * input, ANTLR3_UINT32 position);
198*16467b97STreehugger Robot 
199*16467b97STreehugger Robot     /** Pointer to function to override the default newline character that the input stream
200*16467b97STreehugger Robot      *  looks for to trigger the line/offset and line buffer recording information.
201*16467b97STreehugger Robot      *  \remark
202*16467b97STreehugger Robot      *   - By default the character '\n' will be installed as the newline trigger character. When this
203*16467b97STreehugger Robot      *     character is seen by the consume() function then the current line number is incremented and the
204*16467b97STreehugger Robot      *     current line offset is reset to 0. The Pointer for the line of input we are consuming
205*16467b97STreehugger Robot      *     is updated to point to the next character after this one in the input stream (which means it
206*16467b97STreehugger Robot      *     may become invalid if the last newline character in the file is seen, so watch out).
207*16467b97STreehugger Robot      *   - If for some reason you do not want the counters and pointers to be reset, you can set the
208*16467b97STreehugger Robot      *     character to some impossible character such as '\0' or whatever.
209*16467b97STreehugger Robot      *   - This is a single character only, so choose the last character in a sequence of two or more.
210*16467b97STreehugger Robot      *   - This is only a simple aid to error reporting - if you have a complicated binary input structure
211*16467b97STreehugger Robot      *     it may not be adequate, but you can always override every function in the input stream with your
212*16467b97STreehugger Robot      *     own of course, and can even write your own complete input stream set if you like.
213*16467b97STreehugger Robot      *   - It is your responsibility to set a valid character for the input stream type. There is no point
214*16467b97STreehugger Robot      *     setting this to 0xFFFFFFFF if the input stream is 8 bit ASCII, as this will just be truncated and never
215*16467b97STreehugger Robot      *	   trigger as the comparison will be (INT32)0xFF == (INT32)0xFFFFFFFF
216*16467b97STreehugger Robot      */
217*16467b97STreehugger Robot     void		(*SetNewLineChar)	    (struct ANTLR3_INPUT_STREAM_struct * input, ANTLR3_UINT32 newlineChar);
218*16467b97STreehugger Robot 
219*16467b97STreehugger Robot     /// Character that automatically causes an internal line count
220*16467b97STreehugger Robot     ///  increment.
221*16467b97STreehugger Robot     ///
222*16467b97STreehugger Robot     ANTLR3_UCHAR	newlineChar;
223*16467b97STreehugger Robot 
224*16467b97STreehugger Robot     /// Indicates the size, in 8 bit units, of a single character. Note that
225*16467b97STreehugger Robot     /// the C runtime does not deal with surrogates as this would be
226*16467b97STreehugger Robot     /// slow and complicated. If this is a UTF-8 stream then this field
227*16467b97STreehugger Robot     /// will be set to 0. Generally you are best working internally with 32 bit characters
228*16467b97STreehugger Robot     /// as this is the most efficient.
229*16467b97STreehugger Robot     ///
230*16467b97STreehugger Robot     ANTLR3_UINT8	charByteSize;
231*16467b97STreehugger Robot 
232*16467b97STreehugger Robot     /// Indicates the encoding scheme used in this input stream
233*16467b97STreehugger Robot     ///
234*16467b97STreehugger Robot     ANTLR3_UINT32       encoding;
235*16467b97STreehugger Robot }
236*16467b97STreehugger Robot 
237*16467b97STreehugger Robot     ANTLR3_INPUT_STREAM;
238*16467b97STreehugger Robot 
239*16467b97STreehugger Robot 
240*16467b97STreehugger Robot /** \brief Structure for tracking lexer input states as part of mark()
241*16467b97STreehugger Robot  *  and rewind() of lexer. Its four fields have the same names and types as fields of ANTLR3_INPUT_STREAM.
242*16467b97STreehugger Robot  */
243*16467b97STreehugger Robot typedef	struct	ANTLR3_LEX_STATE_struct
244*16467b97STreehugger Robot {
245*16467b97STreehugger Robot         /** Pointer to the next character to be consumed from the input data.
246*16467b97STreehugger Robot      *  This is cast to point at the encoding of the original file that
247*16467b97STreehugger Robot      *  was read by the functions installed as pointers in this input stream
248*16467b97STreehugger Robot      *  context instance at file/string/whatever load time.
249*16467b97STreehugger Robot      */
250*16467b97STreehugger Robot     void	      * nextChar;
251*16467b97STreehugger Robot 
252*16467b97STreehugger Robot     /** The line number we are traversing in the input file. This gets incremented
253*16467b97STreehugger Robot      *  by a newline() call in the lexer grammar actions.
254*16467b97STreehugger Robot      */
255*16467b97STreehugger Robot     ANTLR3_UINT32	line;
256*16467b97STreehugger Robot 
257*16467b97STreehugger Robot     /** Pointer into the input buffer where the current line
258*16467b97STreehugger Robot      *  started.
259*16467b97STreehugger Robot      */
260*16467b97STreehugger Robot     void	      * currentLine;
261*16467b97STreehugger Robot 
262*16467b97STreehugger Robot     /** The offset within the current line of the current character
263*16467b97STreehugger Robot      */
264*16467b97STreehugger Robot     ANTLR3_INT32	charPositionInLine;
265*16467b97STreehugger Robot 
266*16467b97STreehugger Robot }
267*16467b97STreehugger Robot     ANTLR3_LEX_STATE;
268*16467b97STreehugger Robot 
269*16467b97STreehugger Robot /* Prototypes for the stream setup functions. Each takes the input stream to set up; the UTF16 and UTF32
270*16467b97STreehugger Robot  * variants also take machineBigEndian and inputBigEndian flags. Implementations are not in this header. */
271*16467b97STreehugger Robot void	    antlr38BitSetupStream	(pANTLR3_INPUT_STREAM input);
272*16467b97STreehugger Robot void	    antlr3UTF16SetupStream	(pANTLR3_INPUT_STREAM input, ANTLR3_BOOLEAN machineBigEndian, ANTLR3_BOOLEAN inputBigEndian);
273*16467b97STreehugger Robot void	    antlr3UTF32SetupStream	(pANTLR3_INPUT_STREAM input, ANTLR3_BOOLEAN machineBigEndian, ANTLR3_BOOLEAN inputBigEndian);
274*16467b97STreehugger Robot void	    antlr3UTF8SetupStream	(pANTLR3_INPUT_STREAM input);
275*16467b97STreehugger Robot void	    antlr3EBCDICSetupStream	(pANTLR3_INPUT_STREAM input);
276*16467b97STreehugger Robot void        antlr3GenericSetupStream    (pANTLR3_INPUT_STREAM input);
277*16467b97STreehugger Robot #ifdef __cplusplus
278*16467b97STreehugger Robot }
279*16467b97STreehugger Robot #endif
280*16467b97STreehugger Robot 
281*16467b97STreehugger Robot #endif	/* _ANTLR3_INPUT_H  */
282