xref: /aosp_15_r20/external/antlr/runtime/C/include/antlr3lexer.h (revision 16467b971bd3e2009fad32dd79016f2c7e421deb)
1*16467b97STreehugger Robot /** \file
2*16467b97STreehugger Robot  * Base interface for any ANTLR3 lexer.
3*16467b97STreehugger Robot  *
4*16467b97STreehugger Robot  * An ANTLR3 lexer builds from two sets of components:
5*16467b97STreehugger Robot  *
6*16467b97STreehugger Robot  *  - The runtime components that provide common functionality such as
7*16467b97STreehugger Robot  *    traversing character streams, building tokens for output and so on.
8*16467b97STreehugger Robot  *  - The generated rules and structure of the actual lexer, which call upon the
9*16467b97STreehugger Robot  *    runtime components.
10*16467b97STreehugger Robot  *
11*16467b97STreehugger Robot  * A lexer class contains a character input stream, a base recognizer interface
12*16467b97STreehugger Robot  * (which it will normally implement) and a token source interface (which it also
13*16467b97STreehugger Robot  * implements). The token source interface is called by a token consumer (such as
14*16467b97STreehugger Robot  * a parser, but in theory it can be anything that wants a set of abstract
15*16467b97STreehugger Robot  * tokens in place of a raw character stream).
16*16467b97STreehugger Robot  *
17*16467b97STreehugger Robot  * So then, we set up a lexer in a sequence akin to:
18*16467b97STreehugger Robot  *
19*16467b97STreehugger Robot  *  - Create a character stream (something which implements ANTLR3_INPUT_STREAM)
20*16467b97STreehugger Robot  *    and initialize it.
21*16467b97STreehugger Robot  *  - Create a lexer interface and tell it where its input stream is.
22*16467b97STreehugger Robot  *    This will cause the creation of a base recognizer class, which it will
23*16467b97STreehugger Robot  *    override with its own implementations of some methods. The lexer creator
24*16467b97STreehugger Robot  *    can also then in turn override anything it likes.
25*16467b97STreehugger Robot  *  - The lexer token source interface is then passed to some interface that
26*16467b97STreehugger Robot  *    knows how to use it, by calling for a next token.
27*16467b97STreehugger Robot  *  - When a next token is called, let ze lexing begin.
28*16467b97STreehugger Robot  *
29*16467b97STreehugger Robot  */
30*16467b97STreehugger Robot #ifndef	_ANTLR3_LEXER
31*16467b97STreehugger Robot #define	_ANTLR3_LEXER
32*16467b97STreehugger Robot 
33*16467b97STreehugger Robot // [The "BSD licence"]
34*16467b97STreehugger Robot // Copyright (c) 2005-2009 Jim Idle, Temporal Wave LLC
35*16467b97STreehugger Robot // http://www.temporal-wave.com
36*16467b97STreehugger Robot // http://www.linkedin.com/in/jimidle
37*16467b97STreehugger Robot //
38*16467b97STreehugger Robot // All rights reserved.
39*16467b97STreehugger Robot //
40*16467b97STreehugger Robot // Redistribution and use in source and binary forms, with or without
41*16467b97STreehugger Robot // modification, are permitted provided that the following conditions
42*16467b97STreehugger Robot // are met:
43*16467b97STreehugger Robot // 1. Redistributions of source code must retain the above copyright
44*16467b97STreehugger Robot //    notice, this list of conditions and the following disclaimer.
45*16467b97STreehugger Robot // 2. Redistributions in binary form must reproduce the above copyright
46*16467b97STreehugger Robot //    notice, this list of conditions and the following disclaimer in the
47*16467b97STreehugger Robot //    documentation and/or other materials provided with the distribution.
48*16467b97STreehugger Robot // 3. The name of the author may not be used to endorse or promote products
49*16467b97STreehugger Robot //    derived from this software without specific prior written permission.
50*16467b97STreehugger Robot //
51*16467b97STreehugger Robot // THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
52*16467b97STreehugger Robot // IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
53*16467b97STreehugger Robot // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
54*16467b97STreehugger Robot // IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
55*16467b97STreehugger Robot // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
56*16467b97STreehugger Robot // NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
57*16467b97STreehugger Robot // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
58*16467b97STreehugger Robot // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
59*16467b97STreehugger Robot // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
60*16467b97STreehugger Robot // THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
61*16467b97STreehugger Robot 
62*16467b97STreehugger Robot /* Definitions: ANTLR3_STRING_TERMINATOR is 0xFFFFFFFF, the same value that the matchs documentation
63*16467b97STreehugger Robot  * in this file gives as the terminator of ANTLR3_UCHAR strings (it is an invalid UTF32 character). */
64*16467b97STreehugger Robot #define	ANTLR3_STRING_TERMINATOR	0xFFFFFFFF
65*16467b97STreehugger Robot 
66*16467b97STreehugger Robot #include    <antlr3defs.h>
67*16467b97STreehugger Robot #include    <antlr3input.h>
68*16467b97STreehugger Robot #include    <antlr3commontoken.h>
69*16467b97STreehugger Robot #include    <antlr3tokenstream.h>
70*16467b97STreehugger Robot #include    <antlr3baserecognizer.h>
71*16467b97STreehugger Robot 
72*16467b97STreehugger Robot #ifdef __cplusplus
73*16467b97STreehugger Robot extern "C" {
74*16467b97STreehugger Robot #endif
75*16467b97STreehugger Robot 
76*16467b97STreehugger Robot typedef	struct ANTLR3_LEXER_struct	/* Base lexer interface: data pointers followed by function pointers. */
77*16467b97STreehugger Robot {
78*16467b97STreehugger Robot     /** If there is a super structure that is implementing the
79*16467b97STreehugger Robot      *  lexer, then a pointer to it can be stored here in case
80*16467b97STreehugger Robot      *  implementing functions are overridden by this super structure.
81*16467b97STreehugger Robot      */
82*16467b97STreehugger Robot     void	* super;
83*16467b97STreehugger Robot 
84*16467b97STreehugger Robot     /** A generated lexer has an mTokens() function, which needs
85*16467b97STreehugger Robot      *  the context pointer of the generated lexer, not the base lexer interface.
86*16467b97STreehugger Robot      *  That pointer is stored here and initialized by the generated code (or manually
87*16467b97STreehugger Robot      *  if this is a manually built lexer).
88*16467b97STreehugger Robot      */
89*16467b97STreehugger Robot     void	* ctx;
90*16467b97STreehugger Robot 
91*16467b97STreehugger Robot     /** A pointer to the character stream whence this lexer is receiving
92*16467b97STreehugger Robot      *  characters.
93*16467b97STreehugger Robot      *  TODO: I may come back to this and implement charstream outside
94*16467b97STreehugger Robot      *  the input stream as per the Java implementation.
95*16467b97STreehugger Robot      */
96*16467b97STreehugger Robot     pANTLR3_INPUT_STREAM	input;
97*16467b97STreehugger Robot 
98*16467b97STreehugger Robot     /** Pointer to the implementation of a base recognizer, which the lexer
99*16467b97STreehugger Robot      *  creates and then overrides with its own lexer oriented functions (the
100*16467b97STreehugger Robot      *  default implementation is parser oriented). This also contains a
101*16467b97STreehugger Robot      *  token source interface, which the lexer instance will provide to anything
102*16467b97STreehugger Robot      *  that needs it, which is anything else that implements a base recognizer,
103*16467b97STreehugger Robot      *  such as a parser.
104*16467b97STreehugger Robot      */
105*16467b97STreehugger Robot     pANTLR3_BASE_RECOGNIZER	rec;
106*16467b97STreehugger Robot 
107*16467b97STreehugger Robot     /** Pointer to a function that sets the charstream source for the lexer and
108*16467b97STreehugger Robot      *  causes it to be reset.
109*16467b97STreehugger Robot      */
110*16467b97STreehugger Robot     void			(*setCharStream)    (struct ANTLR3_LEXER_struct * lexer, pANTLR3_INPUT_STREAM input);
111*16467b97STreehugger Robot 
112*16467b97STreehugger Robot     /** Pointer to a function that switches the current character input stream to
113*16467b97STreehugger Robot      *  a new one, saving the old one, which we will revert to at the end of this
114*16467b97STreehugger Robot      *  new one.
115*16467b97STreehugger Robot      */
116*16467b97STreehugger Robot     void			(*pushCharStream)   (struct ANTLR3_LEXER_struct * lexer, pANTLR3_INPUT_STREAM input);
117*16467b97STreehugger Robot 
118*16467b97STreehugger Robot     /** Pointer to a function that abandons the current input stream, whether it
119*16467b97STreehugger Robot      *  is empty or not, and reverts to the previous stacked input stream.
120*16467b97STreehugger Robot      */
121*16467b97STreehugger Robot     void			(*popCharStream)    (struct ANTLR3_LEXER_struct * lexer);
122*16467b97STreehugger Robot 
123*16467b97STreehugger Robot     /** Pointer to a function that emits the supplied token as the next token in
124*16467b97STreehugger Robot      *  the stream.
125*16467b97STreehugger Robot      */
126*16467b97STreehugger Robot     void			(*emitNew)	    (struct ANTLR3_LEXER_struct * lexer, pANTLR3_COMMON_TOKEN token);
127*16467b97STreehugger Robot 
128*16467b97STreehugger Robot     /** Pointer to a function that constructs a new token from the information stored in the lexer.
129*16467b97STreehugger Robot      */
130*16467b97STreehugger Robot     pANTLR3_COMMON_TOKEN	(*emit)		    (struct ANTLR3_LEXER_struct * lexer);
131*16467b97STreehugger Robot 
132*16467b97STreehugger Robot     /** Pointer to the user provided (either manually or through code generation)
133*16467b97STreehugger Robot      *  function that causes the lexer rules to run and produce
134*16467b97STreehugger Robot      *  the next token if there is one. This is called from nextToken() in the
135*16467b97STreehugger Robot      *  pANTLR3_TOKEN_SOURCE. Note that the input parameter for this function is
136*16467b97STreehugger Robot      *  the generated lexer context (stored in ctx in this interface); it is a generated
137*16467b97STreehugger Robot      *  function and expects the context to be the generated lexer.
138*16467b97STreehugger Robot      */
139*16467b97STreehugger Robot     void	        (*mTokens)		    (void * ctx);
140*16467b97STreehugger Robot 
141*16467b97STreehugger Robot     /** Pointer to a function that attempts to match and consume the specified string from the input
142*16467b97STreehugger Robot      *  stream. Note that strings must be passed as terminated arrays of ANTLR3_UCHAR. Strings are terminated
143*16467b97STreehugger Robot      *  with 0xFFFFFFFF, which is an invalid UTF32 character.
144*16467b97STreehugger Robot      */
145*16467b97STreehugger Robot     ANTLR3_BOOLEAN	(*matchs)	    (struct ANTLR3_LEXER_struct * lexer, ANTLR3_UCHAR * string);
146*16467b97STreehugger Robot 
147*16467b97STreehugger Robot     /** Pointer to a function that matches and consumes the specified character from the input stream.
148*16467b97STreehugger Robot      *  The input stream is required to provide characters via LA() as UTF32 characters. The default lexer
149*16467b97STreehugger Robot      *  implementation is source encoding agnostic and so input streams do not generally need to
150*16467b97STreehugger Robot      *  override the default implementation.
151*16467b97STreehugger Robot      */
152*16467b97STreehugger Robot     ANTLR3_BOOLEAN	(*matchc)	    (struct ANTLR3_LEXER_struct * lexer, ANTLR3_UCHAR c);
153*16467b97STreehugger Robot 
154*16467b97STreehugger Robot     /** Pointer to a function that matches any character in the supplied range (I suppose it could be a token range too
155*16467b97STreehugger Robot      *  but this would only be useful if the tokens were in some guaranteed order which is
156*16467b97STreehugger Robot      *  only going to happen with a hand crafted token set).
157*16467b97STreehugger Robot      */
158*16467b97STreehugger Robot     ANTLR3_BOOLEAN	(*matchRange)	    (struct ANTLR3_LEXER_struct * lexer, ANTLR3_UCHAR low, ANTLR3_UCHAR high);
159*16467b97STreehugger Robot 
160*16467b97STreehugger Robot     /** Pointer to a function that matches the next token/char in the input stream
161*16467b97STreehugger Robot      *  regardless of what it actually is.
162*16467b97STreehugger Robot      */
163*16467b97STreehugger Robot     void		(*matchAny)	    (struct ANTLR3_LEXER_struct * lexer);
164*16467b97STreehugger Robot 
165*16467b97STreehugger Robot     /** Pointer to a function that recovers from an error found in the input stream.
166*16467b97STreehugger Robot      *  Generally, this will be a #ANTLR3_EXCEPTION_NOVIABLE_ALT but it could also
167*16467b97STreehugger Robot      *  be from a mismatched token that the (*match)() could not recover from.
168*16467b97STreehugger Robot      */
169*16467b97STreehugger Robot     void		(*recover)	    (struct ANTLR3_LEXER_struct * lexer);
170*16467b97STreehugger Robot 
171*16467b97STreehugger Robot     /** Pointer to function to return the current line number in the input stream.
172*16467b97STreehugger Robot      *  NOTE(review): getCharIndex and getCharPositionInLine below are undocumented; by name they presumably return the current character index and the position within the line - confirm. */
173*16467b97STreehugger Robot     ANTLR3_UINT32	(*getLine)		(struct ANTLR3_LEXER_struct * lexer);
174*16467b97STreehugger Robot     ANTLR3_MARKER	(*getCharIndex)		(struct ANTLR3_LEXER_struct * lexer);
175*16467b97STreehugger Robot     ANTLR3_UINT32	(*getCharPositionInLine)(struct ANTLR3_LEXER_struct * lexer);
176*16467b97STreehugger Robot 
177*16467b97STreehugger Robot     /** Pointer to function to return the text so far for the current token being generated.
178*16467b97STreehugger Robot      */
179*16467b97STreehugger Robot     pANTLR3_STRING	(*getText)	    (struct ANTLR3_LEXER_struct * lexer);
180*16467b97STreehugger Robot 
181*16467b97STreehugger Robot 
182*16467b97STreehugger Robot     /** Pointer to a function that knows how to free the resources of a lexer.
183*16467b97STreehugger Robot      */
184*16467b97STreehugger Robot     void		(*free)		    (struct ANTLR3_LEXER_struct * lexer);
185*16467b97STreehugger Robot 
186*16467b97STreehugger Robot }
187*16467b97STreehugger Robot     ANTLR3_LEXER;
188*16467b97STreehugger Robot 
189*16467b97STreehugger Robot #ifdef __cplusplus
190*16467b97STreehugger Robot }
191*16467b97STreehugger Robot #endif
192*16467b97STreehugger Robot 
193*16467b97STreehugger Robot #endif
194