xref: /aosp_15_r20/external/antlr/runtime/Cpp/include/antlr3lexer.hpp (revision 16467b971bd3e2009fad32dd79016f2c7e421deb)
1*16467b97STreehugger Robot /** \file
2*16467b97STreehugger Robot  * Base interface for any ANTLR3 lexer.
3*16467b97STreehugger Robot  *
4*16467b97STreehugger Robot  * An ANTLR3 lexer builds from two sets of components:
5*16467b97STreehugger Robot  *
6*16467b97STreehugger Robot  *  - The runtime components that provide common functionality such as
7*16467b97STreehugger Robot  *    traversing character streams, building tokens for output and so on.
8*16467b97STreehugger Robot  *  - The generated rules and structure of the actual lexer, which call upon the
9*16467b97STreehugger Robot  *    runtime components.
10*16467b97STreehugger Robot  *
11*16467b97STreehugger Robot  * A lexer class contains a character input stream, a base recognizer interface
12*16467b97STreehugger Robot  * (which it will normally implement) and a token source interface (which it also
13*16467b97STreehugger Robot  * implements). The token source interface is called by a token consumer (such as
14*16467b97STreehugger Robot  * a parser, but in theory it can be anything that wants a set of abstract
15*16467b97STreehugger Robot  * tokens in place of a raw character stream).
16*16467b97STreehugger Robot  *
17*16467b97STreehugger Robot  * So then, we set up a lexer in a sequence akin to:
18*16467b97STreehugger Robot  *
19*16467b97STreehugger Robot  *  - Create a character stream (something which implements ANTLR3_INPUT_STREAM)
20*16467b97STreehugger Robot  *    and initialize it.
21*16467b97STreehugger Robot  *  - Create a lexer interface and tell it where its input stream is.
22*16467b97STreehugger Robot  *    This will cause the creation of a base recognizer class, which it will
23*16467b97STreehugger Robot  *    override with its own implementations of some methods. The lexer creator
24*16467b97STreehugger Robot  *    can also then in turn override anything it likes.
25*16467b97STreehugger Robot  *  - The lexer token source interface is then passed to some interface that
26*16467b97STreehugger Robot  *    knows how to use it, by calling for a next token.
27*16467b97STreehugger Robot  *  - When a next token is called, let ze lexing begin.
28*16467b97STreehugger Robot  *
29*16467b97STreehugger Robot  */
30*16467b97STreehugger Robot #ifndef	_ANTLR3_LEXER_HPP
31*16467b97STreehugger Robot #define	_ANTLR3_LEXER_HPP
32*16467b97STreehugger Robot 
33*16467b97STreehugger Robot // [The "BSD licence"]
34*16467b97STreehugger Robot // Copyright (c) 2005-2009 Gokulakannan Somasundaram, ElectronDB
35*16467b97STreehugger Robot 
36*16467b97STreehugger Robot //
37*16467b97STreehugger Robot // All rights reserved.
38*16467b97STreehugger Robot //
39*16467b97STreehugger Robot // Redistribution and use in source and binary forms, with or without
40*16467b97STreehugger Robot // modification, are permitted provided that the following conditions
41*16467b97STreehugger Robot // are met:
42*16467b97STreehugger Robot // 1. Redistributions of source code must retain the above copyright
43*16467b97STreehugger Robot //    notice, this list of conditions and the following disclaimer.
44*16467b97STreehugger Robot // 2. Redistributions in binary form must reproduce the above copyright
45*16467b97STreehugger Robot //    notice, this list of conditions and the following disclaimer in the
46*16467b97STreehugger Robot //    documentation and/or other materials provided with the distribution.
47*16467b97STreehugger Robot // 3. The name of the author may not be used to endorse or promote products
48*16467b97STreehugger Robot //    derived from this software without specific prior written permission.
49*16467b97STreehugger Robot //
50*16467b97STreehugger Robot // THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
51*16467b97STreehugger Robot // IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
52*16467b97STreehugger Robot // OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
53*16467b97STreehugger Robot // IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
54*16467b97STreehugger Robot // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
55*16467b97STreehugger Robot // NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
56*16467b97STreehugger Robot // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
57*16467b97STreehugger Robot // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
58*16467b97STreehugger Robot // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
59*16467b97STreehugger Robot // THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
60*16467b97STreehugger Robot 
61*16467b97STreehugger Robot /* Definitions
62*16467b97STreehugger Robot  */
63*16467b97STreehugger Robot #include    "antlr3defs.hpp"
64*16467b97STreehugger Robot 
65*16467b97STreehugger Robot ANTLR_BEGIN_NAMESPACE()
66*16467b97STreehugger Robot 
/** Sentinel with all 32 bits set (0xFFFFFFFF). The documentation of
 *  Lexer::matchs() in this header describes its string argument as being
 *  terminated with 0xFFFFFFFF, which it notes is an invalid UTF32 character.
 */
67*16467b97STreehugger Robot static const ANTLR_UINT32	ANTLR_STRING_TERMINATOR	= 0xFFFFFFFF;
68*16467b97STreehugger Robot 
/** Base lexer class template, parameterized on an ImplTraits traits class.
 *
 *  Derives publicly from ImplTraits::RecognizerType<InputStreamType> and from
 *  ImplTraits::TokenSourceType. Only declarations appear in this class body;
 *  the member definitions are presumably supplied by antlr3lexer.inl, which
 *  this header includes after the class -- TODO confirm.
 */
69*16467b97STreehugger Robot template<class ImplTraits>
70*16467b97STreehugger Robot class  Lexer : public ImplTraits::template RecognizerType< typename ImplTraits::InputStreamType >,
71*16467b97STreehugger Robot 			   public ImplTraits::TokenSourceType
72*16467b97STreehugger Robot {
73*16467b97STreehugger Robot public:
74*16467b97STreehugger Robot 	typedef typename ImplTraits::AllocPolicyType AllocPolicyType;
75*16467b97STreehugger Robot 	typedef typename ImplTraits::InputStreamType InputStreamType;
76*16467b97STreehugger Robot 	typedef InputStreamType StreamType;
77*16467b97STreehugger Robot 	typedef typename InputStreamType::IntStreamType IntStreamType;
78*16467b97STreehugger Robot 	typedef typename ImplTraits::CommonTokenType CommonTokenType;
79*16467b97STreehugger Robot 	typedef typename ImplTraits::StreamDataType TokenType;
80*16467b97STreehugger Robot 	typedef typename ImplTraits::StringType StringType;
81*16467b97STreehugger Robot 	typedef typename ImplTraits::StringStreamType StringStreamType;
82*16467b97STreehugger Robot 	typedef typename ImplTraits::template RecognizerType< InputStreamType > RecognizerType;
83*16467b97STreehugger Robot 	typedef typename RecognizerType::RecognizerSharedStateType RecognizerSharedStateType;
84*16467b97STreehugger Robot 	typedef typename ImplTraits::template ExceptionBaseType<InputStreamType> ExceptionBaseType;
85*16467b97STreehugger Robot 	typedef typename ImplTraits::BitsetListType BitsetListType;
86*16467b97STreehugger Robot 	typedef typename ImplTraits::TokenSourceType TokenSourceType;
87*16467b97STreehugger Robot 
88*16467b97STreehugger Robot 	typedef typename RecognizerSharedStateType::RuleMemoType RuleMemoType;
89*16467b97STreehugger Robot 	typedef typename RecognizerType::DebugEventListenerType DebuggerType;
90*16467b97STreehugger Robot 
91*16467b97STreehugger Robot private:
92*16467b97STreehugger Robot     /** A pointer to the character stream whence this lexer is receiving
93*16467b97STreehugger Robot      *  characters.
94*16467b97STreehugger Robot      *  TODO: I may come back to this and implement charstream outside
95*16467b97STreehugger Robot      *  the input stream as per the java implementation.
96*16467b97STreehugger Robot      */
97*16467b97STreehugger Robot     InputStreamType*		m_input;
98*16467b97STreehugger Robot 
99*16467b97STreehugger Robot public:
100*16467b97STreehugger Robot 	Lexer(ANTLR_UINT32 sizeHint, RecognizerSharedStateType* state);
101*16467b97STreehugger Robot 	Lexer(ANTLR_UINT32 sizeHint, InputStreamType* input, RecognizerSharedStateType* state);
102*16467b97STreehugger Robot 
103*16467b97STreehugger Robot 	InputStreamType* get_input() const;
104*16467b97STreehugger Robot 	IntStreamType* get_istream() const;
105*16467b97STreehugger Robot 	RecognizerType* get_rec();
106*16467b97STreehugger Robot 	const RecognizerType* get_rec() const;
107*16467b97STreehugger Robot 	TokenSourceType* get_tokSource();
108*16467b97STreehugger Robot 
109*16467b97STreehugger Robot 	//functions used in .stg file
110*16467b97STreehugger Robot 	const RecognizerType* get_recognizer() const;
111*16467b97STreehugger Robot 	RecognizerSharedStateType* get_lexstate() const;
112*16467b97STreehugger Robot 	void set_lexstate( RecognizerSharedStateType* lexstate );
113*16467b97STreehugger Robot 	const TokenSourceType* get_tokSource() const;
114*16467b97STreehugger Robot 	CommonTokenType* get_ltoken() const;
115*16467b97STreehugger Robot 	void set_ltoken( const CommonTokenType* ltoken );
116*16467b97STreehugger Robot 	bool hasFailed() const;
117*16467b97STreehugger Robot 	ANTLR_INT32 get_backtracking() const;
118*16467b97STreehugger Robot 	void inc_backtracking();
119*16467b97STreehugger Robot 	void dec_backtracking();
120*16467b97STreehugger Robot 	bool get_failedflag() const;
121*16467b97STreehugger Robot 	void set_failedflag( bool failed );
122*16467b97STreehugger Robot 	InputStreamType* get_strstream() const;
123*16467b97STreehugger Robot 	ANTLR_MARKER  index() const;
124*16467b97STreehugger Robot 	void	seek(ANTLR_MARKER index);
125*16467b97STreehugger Robot 	const CommonTokenType* EOF_Token() const;
126*16467b97STreehugger Robot 	bool hasException() const;
127*16467b97STreehugger Robot 	ExceptionBaseType* get_exception() const;
128*16467b97STreehugger Robot 	void constructEx();
129*16467b97STreehugger Robot 	void lrecover();
130*16467b97STreehugger Robot 	ANTLR_MARKER mark();
131*16467b97STreehugger Robot 	void rewind(ANTLR_MARKER marker);
132*16467b97STreehugger Robot 	void rewindLast();
133*16467b97STreehugger Robot 	void setText( const StringType& text );
134*16467b97STreehugger Robot 	void skip();
135*16467b97STreehugger Robot 	RuleMemoType* getRuleMemo() const;
136*16467b97STreehugger Robot 	DebuggerType* get_debugger() const;
137*16467b97STreehugger Robot 	void setRuleMemo(RuleMemoType* rulememo);
138*16467b97STreehugger Robot 	ANTLR_UINT32 LA(ANTLR_INT32 i);
139*16467b97STreehugger Robot 	void consume();
140*16467b97STreehugger Robot 	void memoize(ANTLR_MARKER	ruleIndex, ANTLR_MARKER	ruleParseStart);
141*16467b97STreehugger Robot 	bool haveParsedRule(ANTLR_MARKER	ruleIndex);
142*16467b97STreehugger Robot 
143*16467b97STreehugger Robot     /** Sets the charstream source for the lexer and
144*16467b97STreehugger Robot      *  causes it to be reset.
145*16467b97STreehugger Robot      */
146*16467b97STreehugger Robot     void	setCharStream(InputStreamType* input);
147*16467b97STreehugger Robot 
148*16467b97STreehugger Robot     /*!
149*16467b97STreehugger Robot 	 * \brief
150*16467b97STreehugger Robot 	 * Change to a new input stream, remembering the old one.
151*16467b97STreehugger Robot 	 *
155*16467b97STreehugger Robot 	 * \param input
156*16467b97STreehugger Robot 	 * New input stream to install as the current one.
157*16467b97STreehugger Robot 	 *
158*16467b97STreehugger Robot 	 * Switches the current character input stream to
159*16467b97STreehugger Robot 	 * a new one, saving the old one, which we will revert to at the end of this
160*16467b97STreehugger Robot 	 * new one.
161*16467b97STreehugger Robot 	 */
162*16467b97STreehugger Robot     void	pushCharStream(InputStreamType* input);
163*16467b97STreehugger Robot 
164*16467b97STreehugger Robot 	/*!
165*16467b97STreehugger Robot 	 * \brief
166*16467b97STreehugger Robot 	 * Stops using the current input stream and reverts to any prior
167*16467b97STreehugger Robot 	 * input stream on the stack.
168*16467b97STreehugger Robot 	 *
172*16467b97STreehugger Robot 	 * Abandons the current input stream, whether it
173*16467b97STreehugger Robot 	 * is empty or not, and reverts to the previous stacked input stream.
174*16467b97STreehugger Robot 	 *
175*16467b97STreehugger Robot 	 * \remark
176*16467b97STreehugger Robot 	 * The function fails silently if there are no prior input streams.
177*16467b97STreehugger Robot 	 */
178*16467b97STreehugger Robot     void	popCharStream();
179*16467b97STreehugger Robot 
180*16467b97STreehugger Robot     /** Emits (a copy of) the supplied token as the next token in
181*16467b97STreehugger Robot      *  the stream.
182*16467b97STreehugger Robot      */
183*16467b97STreehugger Robot     void	emit(const CommonTokenType* token);
184*16467b97STreehugger Robot 
185*16467b97STreehugger Robot     /** Constructs a new token from the lexer stored information.
186*16467b97STreehugger Robot      */
187*16467b97STreehugger Robot     CommonTokenType*	emit();
188*16467b97STreehugger Robot 
189*16467b97STreehugger Robot     /** Attempts to match and consume the specified string from the input
190*16467b97STreehugger Robot      *  stream. Note that strings must be passed as terminated arrays of ANTLR_UCHAR. Strings are terminated
191*16467b97STreehugger Robot      *  with 0xFFFFFFFF, which is an invalid UTF32 character.
192*16467b97STreehugger Robot      */
193*16467b97STreehugger Robot     bool	matchs(ANTLR_UCHAR* string);
194*16467b97STreehugger Robot 
195*16467b97STreehugger Robot     /** Matches and consumes the specified character from the input stream.
196*16467b97STreehugger Robot      *  The input stream is required to provide characters via LA() as UTF32 characters. The default lexer
197*16467b97STreehugger Robot      *  implementation is source encoding agnostic and so input streams do not generally need to
198*16467b97STreehugger Robot      *  override the default implementation.
199*16467b97STreehugger Robot      */
200*16467b97STreehugger Robot     bool	matchc(ANTLR_UCHAR c);
201*16467b97STreehugger Robot 
202*16467b97STreehugger Robot     /** Matches any character in the supplied range (I suppose it could be a token range too
203*16467b97STreehugger Robot      *  but this would only be useful if the tokens were in some guaranteed order which is
204*16467b97STreehugger Robot      *  only going to happen with a hand crafted token set).
205*16467b97STreehugger Robot      */
206*16467b97STreehugger Robot     bool	matchRange(ANTLR_UCHAR low, ANTLR_UCHAR high);
207*16467b97STreehugger Robot 
208*16467b97STreehugger Robot     /** Matches the next token/char in the input stream
209*16467b97STreehugger Robot      *  regardless of what it actually is.
210*16467b97STreehugger Robot      */
211*16467b97STreehugger Robot     void		matchAny();
212*16467b97STreehugger Robot 
213*16467b97STreehugger Robot     /** Recovers from an error found in the input stream.
214*16467b97STreehugger Robot      *  Generally, this will be a #ANTLR3_EXCEPTION_NOVIABLE_ALT but it could also
215*16467b97STreehugger Robot      *  be from a mismatched token that the (*match)() could not recover from.
216*16467b97STreehugger Robot      */
217*16467b97STreehugger Robot     void		recover();
218*16467b97STreehugger Robot 
219*16467b97STreehugger Robot     /** Returns the current line number in the input stream.
220*16467b97STreehugger Robot      */
221*16467b97STreehugger Robot     ANTLR_UINT32	getLine();
222*16467b97STreehugger Robot     ANTLR_MARKER	getCharIndex();
223*16467b97STreehugger Robot     ANTLR_UINT32	getCharPositionInLine();
224*16467b97STreehugger Robot 
225*16467b97STreehugger Robot     /** Returns the text so far for the current token being generated.
226*16467b97STreehugger Robot      */
227*16467b97STreehugger Robot     StringType 	getText();
228*16467b97STreehugger Robot 
229*16467b97STreehugger Robot 	//Other utility functions
230*16467b97STreehugger Robot 	void fillExceptionData( ExceptionBaseType* ex );
231*16467b97STreehugger Robot 
232*16467b97STreehugger Robot 	/** Default lexer error handler (works for 8 bit streams only!!!)
233*16467b97STreehugger Robot 	 */
234*16467b97STreehugger Robot 	void displayRecognitionError( ANTLR_UINT8** tokenNames, ExceptionBaseType* ex);
235*16467b97STreehugger Robot 	void exConstruct();
236*16467b97STreehugger Robot 	TokenType*	getMissingSymbol( IntStreamType* istream, ExceptionBaseType* e,
237*16467b97STreehugger Robot 								  ANTLR_UINT32	expectedTokenType, BitsetListType*	follow);
238*16467b97STreehugger Robot 
239*16467b97STreehugger Robot     /** Destructor; described by the original note as freeing the resources of a lexer.
240*16467b97STreehugger Robot      */
241*16467b97STreehugger Robot 	~Lexer();
242*16467b97STreehugger Robot };
243*16467b97STreehugger Robot 
244*16467b97STreehugger Robot ANTLR_END_NAMESPACE()
245*16467b97STreehugger Robot 
246*16467b97STreehugger Robot #include "antlr3lexer.inl"
247*16467b97STreehugger Robot 
248*16467b97STreehugger Robot #endif
249