1 //! Contains simple lexer for XML documents.
2 //!
//! This module is for internal use. Use the `xml::reader` module to do parsing.
4 
5 
6 use crate::reader::ErrorKind;
7 use crate::reader::error::SyntaxError;
8 use std::collections::VecDeque;
9 use std::fmt;
10 use std::io::Read;
11 use std::result;
12 use crate::common::{is_name_char, is_whitespace_char, Position, TextPosition, is_xml10_char, is_xml11_char};
13 use crate::reader::Error;
14 use crate::util::{CharReader, Encoding};
15 
16 use super::ParserConfig2;
17 
/// `Token` represents a single lexeme of an XML document. These lexemes
/// are used to perform actual parsing.
#[derive(Copy, Clone, PartialEq, Eq, Debug)]
pub(crate) enum Token {
    /// `<?`
    ProcessingInstructionStart,
    /// `?>`
    ProcessingInstructionEnd,
    /// `<!DOCTYPE`
    DoctypeStart,
    /// `<`
    OpeningTagStart,
    /// `</`
    ClosingTagStart,
    /// `>`
    TagEnd,
    /// `/>`
    EmptyTagEnd,
    /// `<!--`
    CommentStart,
    /// `-->`
    CommentEnd,
    /// Any non-special character except whitespace.
    Character(char),
    /// `=`
    EqualsSign,
    /// `'`
    SingleQuote,
    /// `"`
    DoubleQuote,
    /// `<![CDATA[`
    CDataStart,
    /// `]]>`
    CDataEnd,
    /// `&`
    ReferenceStart,
    /// `;`
    ReferenceEnd,
    /// `<!` of a markup declaration such as `ENTITY` (only inside a doctype)
    MarkupDeclarationStart,
}
59 
60 impl fmt::Display for Token {
61     #[cold]
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result62     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
63         match *self {
64             Token::Character(c) => c.fmt(f),
65             other => match other {
66                 Token::OpeningTagStart            => "<",
67                 Token::ProcessingInstructionStart => "<?",
68                 Token::DoctypeStart               => "<!DOCTYPE",
69                 Token::ClosingTagStart            => "</",
70                 Token::CommentStart               => "<!--",
71                 Token::CDataStart                 => "<![CDATA[",
72                 Token::TagEnd                     => ">",
73                 Token::EmptyTagEnd                => "/>",
74                 Token::ProcessingInstructionEnd   => "?>",
75                 Token::CommentEnd                 => "-->",
76                 Token::CDataEnd                   => "]]>",
77                 Token::ReferenceStart             => "&",
78                 Token::ReferenceEnd               => ";",
79                 Token::EqualsSign                 => "=",
80                 Token::SingleQuote                => "'",
81                 Token::DoubleQuote                => "\"",
82                 Token::MarkupDeclarationStart     => "<!",
83                 _                          => unreachable!()
84             }.fmt(f),
85         }
86     }
87 }
88 
89 impl Token {
as_static_str(&self) -> Option<&'static str>90     pub fn as_static_str(&self) -> Option<&'static str> {
91         match *self {
92             Token::OpeningTagStart            => Some("<"),
93             Token::ProcessingInstructionStart => Some("<?"),
94             Token::DoctypeStart               => Some("<!DOCTYPE"),
95             Token::ClosingTagStart            => Some("</"),
96             Token::CommentStart               => Some("<!--"),
97             Token::CDataStart                 => Some("<![CDATA["),
98             Token::TagEnd                     => Some(">"),
99             Token::EmptyTagEnd                => Some("/>"),
100             Token::ProcessingInstructionEnd   => Some("?>"),
101             Token::CommentEnd                 => Some("-->"),
102             Token::CDataEnd                   => Some("]]>"),
103             Token::ReferenceStart             => Some("&"),
104             Token::ReferenceEnd               => Some(";"),
105             Token::EqualsSign                 => Some("="),
106             Token::SingleQuote                => Some("'"),
107             Token::DoubleQuote                => Some("\""),
108             _                                 => None
109         }
110     }
111 
112     // using String.push_str(token.to_string()) is simply way too slow
push_to_string(&self, target: &mut String)113     pub fn push_to_string(&self, target: &mut String) {
114         match *self {
115             Token::Character(c) => {
116                 debug_assert!(is_xml10_char(c) || is_xml11_char(c));
117                 target.push(c)
118             },
119             _ => if let Some(s) = self.as_static_str() {
120                 target.push_str(s);
121             }
122         }
123     }
124 }
125 
#[derive(Copy, Clone)]
enum State {
    /// Default state
    Normal,
    /// Triggered on '<'
    TagStarted,
    /// Triggered on '<!'
    CommentOrCDataOrDoctypeStarted,
    /// Triggered on '<!-'
    CommentStarted,
    /// Triggered on '<!D' up to '<!DOCTYPE'
    DoctypeStarted(DoctypeStartedSubstate),
    /// Other items like `<!ELEMENT` in DTD
    InsideMarkupDeclaration,
    /// Triggered after DoctypeStarted to handle sub elements
    InsideDoctype,
    /// Triggered on '<![' up to '<![CDATA'
    CDataStarted(CDataStartedSubstate),
    /// Triggered on '?'
    ProcessingInstructionClosing,
    /// Triggered on '/'
    EmptyTagClosing,
    /// Triggered on '-' up to '--'
    CommentClosing(ClosingSubstate),
    /// Triggered on ']' up to ']]' inside CDATA
    CDataClosing(ClosingSubstate),
    /// Triggered on ']' up to ']]' outside CDATA
    InvalidCDataClosing(ClosingSubstate),
    /// After `<!--`
    InsideComment,
    /// After `<![CDATA[`
    InsideCdata,
    /// After `<?`
    InsideProcessingInstruction,
    /// `<!ENTITY "here">`
    InsideMarkupDeclarationQuotedString(QuoteStyle),
}
163 
/// Which quote character delimits the markup-declaration string being lexed.
#[derive(Copy, Clone, Eq, PartialEq)]
enum QuoteStyle {
    Single, Double
}
168 
/// Progress through a two-character closing sequence (`--` or `]]`).
#[derive(Copy, Clone)]
enum ClosingSubstate {
    First, Second
}
173 
/// How much of the literal `DOCTYPE` keyword has been matched after `<!`.
#[derive(Copy, Clone)]
enum DoctypeStartedSubstate {
    D, DO, DOC, DOCT, DOCTY, DOCTYP
}
178 
/// How much of `CDATA` has been matched after `<![` (`E` = nothing yet).
#[derive(Copy, Clone)]
enum CDataStartedSubstate {
    E, C, CD, CDA, CDAT, CDATA
}
183 
/// `Result` represents lexing result. It is either a token or an error message.
/// The default payload is `Option<Token>`, where `Ok(None)` means end of stream.
pub(crate) type Result<T = Option<Token>, E = Error> = result::Result<T, E>;
186 
/// Helps to set up a dispatch table for lexing large unambiguous tokens like
/// `<![CDATA[` or `<!DOCTYPE `.
///
/// Each `$st ; $stc ; $next_st ; $chunk` row means: while in substate `$st`,
/// character `$stc` advances to substate `$next_st`; any other character is
/// reported as an error after the already-consumed prefix `$chunk`. The final
/// row evaluates `$e` when the terminating character `$end_c` arrives.
macro_rules! dispatch_on_enum_state(
    ($_self:ident, $s:expr, $c:expr, $is:expr,
     $($st:ident; $stc:expr ; $next_st:ident ; $chunk:expr),+;
     $end_st:ident ; $end_c:expr ; $end_chunk:expr ; $e:expr) => (
        match $s {
            $(
            $st => match $c {
                $stc => $_self.move_to($is($next_st)),
                _  => $_self.handle_error($chunk, $c)
            },
            )+
            $end_st => match $c {
                $end_c => $e,
                _      => $_self.handle_error($end_chunk, $c)
            }
        }
    )
);
207 
/// `Lexer` is a lexer for XML documents, which implements pull API.
///
/// Main method is `next_token` which accepts an `std::io::Read` instance and
/// tries to read the next lexeme from it.
///
/// When the test-only `skip_errors` flag is set, an invalid lexeme is replayed
/// as plain `Character` tokens instead of failing (see `handle_error`).
/// When it is not set, errors are reported as `Err` objects with a
/// syntax-error message.
pub(crate) struct Lexer {
    /// Current state of the lexing state machine.
    st: State,
    /// Decoding reader raw input characters are pulled from.
    reader: CharReader,
    /// Position where the token currently being produced starts.
    pos: TextPosition,
    /// Position of the read head (just past the last consumed character).
    head_pos: TextPosition,
    /// Characters pushed back or injected by `reparse`; drained before new input.
    char_queue: VecDeque<char>,
    /// Default state to go back to after a tag end (may be `InsideDoctype`)
    normal_state: State,
    /// True while a token has been started but not yet completed.
    inside_token: bool,
    /// True once end of input has been reported to the caller.
    eof_handled: bool,
    /// Depth of nested entity re-parsing; bounded by `max_entity_expansion_depth`.
    reparse_depth: u8,
    #[cfg(test)]
    skip_errors: bool,

    // Entity-expansion limits copied from the parser configuration.
    max_entity_expansion_depth: u8,
    max_entity_expansion_length: usize,
}
234 
impl Position for Lexer {
    #[inline]
    /// Returns the position of the last token produced by the lexer,
    /// i.e. where the token most recently returned by `next_token` began.
    fn position(&self) -> TextPosition { self.pos }
}
240 
241 impl Lexer {
242     /// Returns a new lexer with default state.
new(config: &ParserConfig2) -> Lexer243     pub(crate) fn new(config: &ParserConfig2) -> Lexer {
244         Lexer {
245             reader: CharReader::new(),
246             pos: TextPosition::new(),
247             head_pos: TextPosition::new(),
248             char_queue: VecDeque::with_capacity(4),  // TODO: check size
249             st: State::Normal,
250             normal_state: State::Normal,
251             inside_token: false,
252             eof_handled: false,
253             reparse_depth: 0,
254             #[cfg(test)]
255             skip_errors: false,
256 
257             max_entity_expansion_depth: config.max_entity_expansion_depth,
258             max_entity_expansion_length: config.max_entity_expansion_length,
259         }
260     }
261 
encoding(&mut self) -> Encoding262     pub(crate) fn encoding(&mut self) -> Encoding {
263         self.reader.encoding
264     }
265 
    /// Overrides the encoding the underlying `CharReader` uses to decode input.
    pub(crate) fn set_encoding(&mut self, encoding: Encoding) {
        self.reader.encoding = encoding;
    }
269 
    /// Disables error handling so `next_token` will replay an invalid lexeme
    /// as individual `Character` tokens instead of returning `Err`.
    /// Test-only; see `handle_error` for the replay logic.
    #[cfg(test)] fn disable_errors(&mut self) { self.skip_errors = true; }
273 
    /// Reset the eof handled flag of the lexer, so that `next_token` will
    /// attempt to read again instead of immediately returning `Ok(None)`.
    #[inline]
    pub fn reset_eof_handled(&mut self) { self.eof_handled = false; }
277 
278     /// Tries to read the next token from the buffer.
279     ///
280     /// It is possible to pass different instaces of `BufReader` each time
281     /// this method is called, but the resulting behavior is undefined in this case.
282     ///
283     /// Return value:
284     /// * `Err(reason) where reason: reader::Error` - when an error occurs;
285     /// * `Ok(None)` - upon end of stream is reached;
286     /// * `Ok(Some(token)) where token: Token` - in case a complete-token has been read from the stream.
next_token<B: Read>(&mut self, b: &mut B) -> Result287     pub fn next_token<B: Read>(&mut self, b: &mut B) -> Result {
288         // Already reached end of buffer
289         if self.eof_handled {
290             return Ok(None);
291         }
292 
293         if !self.inside_token {
294             self.pos = self.head_pos;
295             self.inside_token = true;
296         }
297 
298         // Check if we have saved a char or two for ourselves
299         while let Some(c) = self.char_queue.pop_front() {
300             match self.dispatch_char(c)? {
301                 Some(t) => {
302                     self.inside_token = false;
303                     return Ok(Some(t));
304                 }
305                 None => {} // continue
306             }
307         }
308         // if char_queue is empty, all circular reparsing is done
309         self.reparse_depth = 0;
310         loop {
311             let c = match self.reader.next_char_from(b)? {
312                 Some(c) => c,  // got next char
313                 None => break, // nothing to read left
314             };
315 
316             if c == '\n' {
317                 self.head_pos.new_line();
318             } else {
319                 self.head_pos.advance(1);
320             }
321 
322             match self.dispatch_char(c)? {
323                 Some(t) => {
324                     self.inside_token = false;
325                     return Ok(Some(t));
326                 }
327                 None => {
328                     // continue
329                 }
330             }
331         }
332 
333         self.end_of_stream()
334     }
335 
    /// Decides what to report once the input is exhausted, based on the state
    /// the lexer stopped in.
    #[inline(never)]
    fn end_of_stream(&mut self) -> Result {
        // Handle end of stream
        self.eof_handled = true;
        self.pos = self.head_pos;
        match self.st {
            // EOF inside a CDATA section gets its own, more specific error.
            State::InsideCdata | State::CDataClosing(_) => Err(self.error(SyntaxError::UnclosedCdata)),
            // EOF in the middle of any other multi-character construct.
            State::TagStarted | State::CommentOrCDataOrDoctypeStarted |
            State::CommentStarted | State::CDataStarted(_)| State::DoctypeStarted(_) |
            State::CommentClosing(ClosingSubstate::Second) |
            State::InsideComment | State::InsideMarkupDeclaration |
            State::InsideProcessingInstruction | State::ProcessingInstructionClosing |
            State::InsideDoctype | State::InsideMarkupDeclarationQuotedString(_) =>
                Err(self.error(SyntaxError::UnexpectedEof)),
            // A lone trailing '/', '-' or ']' is just ordinary character data.
            State::EmptyTagClosing =>
                Ok(Some(Token::Character('/'))),
            State::CommentClosing(ClosingSubstate::First) =>
                Ok(Some(Token::Character('-'))),
            State::InvalidCDataClosing(ClosingSubstate::First) =>
                Ok(Some(Token::Character(']'))),
            // "]]" was pending: emit one ']' now, queue the second one, and
            // un-mark EOF so the next call re-examines the queued character.
            State::InvalidCDataClosing(ClosingSubstate::Second) => {
                self.eof_handled = false;
                self.move_to_with_unread(State::Normal, &[']'], Token::Character(']'))
            },
            State::Normal =>
                Ok(None),
        }
    }
364 
    #[cold]
    /// Builds a syntax error positioned at the start of the current token.
    fn error(&self, e: SyntaxError) -> Error {
        Error {
            pos: self.position(),
            kind: ErrorKind::Syntax(e.to_cow()),
        }
    }
372 
373 
    /// Routes a single character to the handler for the current state machine state.
    #[inline(never)]
    fn dispatch_char(&mut self, c: char) -> Result {
        match self.st {
            State::Normal                         => self.normal(c),
            State::TagStarted                     => self.tag_opened(c),
            State::EmptyTagClosing                => self.empty_element_closing(c),
            State::CommentOrCDataOrDoctypeStarted => self.comment_or_cdata_or_doctype_started(c),
            State::InsideCdata                    => self.inside_cdata(c),
            State::CDataStarted(s)                => self.cdata_started(c, s),
            State::InsideComment                  => self.inside_comment_state(c),
            State::CommentStarted                 => self.comment_started(c),
            State::InsideProcessingInstruction    => self.inside_processing_instruction(c),
            State::ProcessingInstructionClosing   => self.processing_instruction_closing(c),
            State::CommentClosing(s)              => self.comment_closing(c, s),
            State::CDataClosing(s)                => self.cdata_closing(c, s),
            State::InsideDoctype                  => self.inside_doctype(c),
            State::DoctypeStarted(s)              => self.doctype_started(c, s),
            State::InvalidCDataClosing(s)         => self.invalid_cdata_closing(c, s),
            State::InsideMarkupDeclaration        => self.markup_declaration(c),
            State::InsideMarkupDeclarationQuotedString(q) => self.markup_declaration_string(c, q),
        }
    }
396 
    /// Switches to state `st` without producing a token.
    #[inline]
    fn move_to(&mut self, st: State) -> Result {
        self.st = st;
        Ok(None)
    }
402 
    /// Switches to state `st` and emits `token`.
    #[inline]
    fn move_to_with(&mut self, st: State, token: Token) -> Result {
        self.st = st;
        Ok(Some(token))
    }
408 
    /// Like `move_to_with`, but also makes `st` the new base state that
    /// tag-end handlers fall back to (used when entering/leaving a doctype).
    #[inline]
    fn move_to_and_reset_normal(&mut self, st: State, token: Token) -> Result {
        self.normal_state = st;
        self.st = st;
        Ok(Some(token))
    }
415 
move_to_with_unread(&mut self, st: State, cs: &[char], token: Token) -> Result416     fn move_to_with_unread(&mut self, st: State, cs: &[char], token: Token) -> Result {
417         for c in cs.iter().rev().copied() {
418             self.char_queue.push_front(c);
419         }
420         self.move_to_with(st, token)
421     }
422 
    /// Injects `markup` (e.g. an expanded entity) so it is lexed before any
    /// remaining input, enforcing entity-expansion limits along the way.
    pub(crate) fn reparse(&mut self, markup: &str) -> Result<()> {
        if markup.is_empty() {
            return Ok(());
        }

        self.reparse_depth += 1;
        // NOTE(review): the length limit is checked against the queue *before*
        // the new markup is pushed, so a single expansion may momentarily
        // exceed `max_entity_expansion_length` — confirm this is intentional.
        if self.reparse_depth > self.max_entity_expansion_depth || self.char_queue.len() > self.max_entity_expansion_length {
            return Err(self.error(SyntaxError::EntityTooBig))
        }

        // Allow lexing to resume even if EOF had already been reported.
        self.eof_handled = false;
        self.char_queue.reserve(markup.len());
        // Push in reverse so the markup is read back in its original order.
        for c in markup.chars().rev() {
            self.char_queue.push_front(c);
        }

        Ok(())
    }
441 
    /// Reports the unexpected character `c` seen after the partial lexeme `chunk`.
    ///
    /// In tests with `skip_errors` set, the partial lexeme and `c` are replayed
    /// as plain `Character` tokens instead of failing.
    fn handle_error(&mut self, chunk: &'static str, c: char) -> Result {
        debug_assert!(!chunk.is_empty());

        #[cfg(test)]
        if self.skip_errors {
            // Emit the first char of the chunk now; queue the rest plus `c`.
            let mut chars = chunk.chars();
            let first = chars.next().unwrap_or('\0');
            self.char_queue.extend(chars);
            self.char_queue.push_back(c);
            return self.move_to_with(State::Normal, Token::Character(first));
        }
        Err(self.error(SyntaxError::UnexpectedTokenBefore(chunk, c)))
    }
455 
456     /// Encountered a char
normal(&mut self, c: char) -> Result457     fn normal(&mut self, c: char) -> Result {
458         match c {
459             '<'                        => self.move_to(State::TagStarted),
460             '>'                        => Ok(Some(Token::TagEnd)),
461             '/'                        => self.move_to(State::EmptyTagClosing),
462             '='                        => Ok(Some(Token::EqualsSign)),
463             '"'                        => Ok(Some(Token::DoubleQuote)),
464             '\''                       => Ok(Some(Token::SingleQuote)),
465             ']'                        => self.move_to(State::InvalidCDataClosing(ClosingSubstate::First)),
466             '&'                        => Ok(Some(Token::ReferenceStart)),
467             ';'                        => Ok(Some(Token::ReferenceEnd)),
468             _                          => Ok(Some(Token::Character(c)))
469         }
470     }
471 
inside_cdata(&mut self, c: char) -> Result472     fn inside_cdata(&mut self, c: char) -> Result {
473         match c {
474             ']'                        => self.move_to(State::CDataClosing(ClosingSubstate::First)),
475             _                          => Ok(Some(Token::Character(c)))
476         }
477     }
478 
    /// Encountered a character inside `<? ... ?>`
    fn inside_processing_instruction(&mut self, c: char) -> Result {
        // These tokens are used by `<?xml?>` parser
        match c {
            '?'                        => self.move_to(State::ProcessingInstructionClosing),
            '<'                        => Ok(Some(Token::OpeningTagStart)),
            '>'                        => Ok(Some(Token::TagEnd)),
            '/'                        => Ok(Some(Token::ClosingTagStart)),
            '='                        => Ok(Some(Token::EqualsSign)),
            '"'                        => Ok(Some(Token::DoubleQuote)),
            '\''                       => Ok(Some(Token::SingleQuote)),
            '&'                        => Ok(Some(Token::ReferenceStart)),
            ';'                        => Ok(Some(Token::ReferenceEnd)),
            _                          => Ok(Some(Token::Character(c)))
        }
    }
494 
inside_comment_state(&mut self, c: char) -> Result495     fn inside_comment_state(&mut self, c: char) -> Result {
496         match c {
497             '-'                        => self.move_to(State::CommentClosing(ClosingSubstate::First)),
498             _                          => Ok(Some(Token::Character(c)))
499         }
500     }
501 
    /// Encountered '<'
    fn tag_opened(&mut self, c: char) -> Result {
        match c {
            '?'                        => self.move_to_with(State::InsideProcessingInstruction, Token::ProcessingInstructionStart),
            '/'                        => self.move_to_with(self.normal_state, Token::ClosingTagStart),
            '!'                        => self.move_to(State::CommentOrCDataOrDoctypeStarted),
            // A name char or whitespace means this really was a tag start:
            // emit `OpeningTagStart` and push `c` back to be lexed again.
            _ if is_whitespace_char(c) => self.move_to_with_unread(self.normal_state, &[c], Token::OpeningTagStart),
            _ if is_name_char(c)       => self.move_to_with_unread(self.normal_state, &[c], Token::OpeningTagStart),
            _                          => self.handle_error("<", c)
        }
    }
513 
    /// Encountered '<!'
    fn comment_or_cdata_or_doctype_started(&mut self, c: char) -> Result {
        match c {
            '-' => self.move_to(State::CommentStarted),
            '[' => self.move_to(State::CDataStarted(CDataStartedSubstate::E)),
            'D' => self.move_to(State::DoctypeStarted(DoctypeStartedSubstate::D)),
            // Markup declarations (e.g. `<!ENTITY`, `<!ATTLIST`, `<!NOTATION`)
            // are only recognized while inside a `<!DOCTYPE ...>` block.
            'E' | 'A' | 'N' if matches!(self.normal_state, State::InsideDoctype) => {
                self.move_to_with_unread(State::InsideMarkupDeclaration, &[c], Token::MarkupDeclarationStart)
            },
            _ => self.handle_error("<!", c),
        }
    }
526 
527     /// Encountered '<!-'
comment_started(&mut self, c: char) -> Result528     fn comment_started(&mut self, c: char) -> Result {
529         match c {
530             '-' => self.move_to_with(State::InsideComment, Token::CommentStart),
531             _ => self.handle_error("<!-", c),
532         }
533     }
534 
    /// Encountered '<![' — matches the rest of `CDATA[` one character at a time
    fn cdata_started(&mut self, c: char, s: CDataStartedSubstate) -> Result {
        use self::CDataStartedSubstate::{C, CD, CDA, CDAT, CDATA, E};
        dispatch_on_enum_state!(self, s, c, State::CDataStarted,
            E     ; 'C' ; C     ; "<![",
            C     ; 'D' ; CD    ; "<![C",
            CD    ; 'A' ; CDA   ; "<![CD",
            CDA   ; 'T' ; CDAT  ; "<![CDA",
            CDAT  ; 'A' ; CDATA ; "<![CDAT";
            CDATA ; '[' ; "<![CDATA" ; self.move_to_with(State::InsideCdata, Token::CDataStart)
        )
    }
547 
    /// Encountered '<!…' that isn't DOCTYPE or CDATA
    fn markup_declaration(&mut self, c: char) -> Result {
        match c {
            // A nested '<' inside a markup declaration is malformed.
            '<'                        => self.handle_error("<!", c),
            '>'                        => self.move_to_with(self.normal_state, Token::TagEnd),
            '&'                        => Ok(Some(Token::ReferenceStart)),
            ';'                        => Ok(Some(Token::ReferenceEnd)),
            // Quotes open a literal in which special characters lose meaning.
            '"'                        => self.move_to_with(State::InsideMarkupDeclarationQuotedString(QuoteStyle::Double), Token::DoubleQuote),
            '\''                       => self.move_to_with(State::InsideMarkupDeclarationQuotedString(QuoteStyle::Single), Token::SingleQuote),
            _                          => Ok(Some(Token::Character(c))),
        }
    }
560 
markup_declaration_string(&mut self, c: char, q: QuoteStyle) -> Result561     fn markup_declaration_string(&mut self, c: char, q: QuoteStyle) -> Result {
562         match c {
563             '"' if q == QuoteStyle::Double  => self.move_to_with(State::InsideMarkupDeclaration, Token::DoubleQuote),
564             '\'' if q == QuoteStyle::Single => self.move_to_with(State::InsideMarkupDeclaration, Token::SingleQuote),
565             _                               => Ok(Some(Token::Character(c))),
566         }
567     }
568 
    /// Encountered '<!D' — matches the rest of `OCTYPE` one character at a time
    fn doctype_started(&mut self, c: char, s: DoctypeStartedSubstate) -> Result {
        use self::DoctypeStartedSubstate::{D, DO, DOC, DOCT, DOCTY, DOCTYP};
        dispatch_on_enum_state!(self, s, c, State::DoctypeStarted,
            D      ; 'O' ; DO     ; "<!D",
            DO     ; 'C' ; DOC    ; "<!DO",
            DOC    ; 'T' ; DOCT   ; "<!DOC",
            DOCT   ; 'Y' ; DOCTY  ; "<!DOCT",
            DOCTY  ; 'P' ; DOCTYP ; "<!DOCTY";
            // On success, `InsideDoctype` also becomes the new base state.
            DOCTYP ; 'E' ; "<!DOCTYP" ; self.move_to_and_reset_normal(State::InsideDoctype, Token::DoctypeStart)
        )
    }
581 
    /// State used while awaiting the closing bracket for the <!DOCTYPE tag
    fn inside_doctype(&mut self, c: char) -> Result {
        match c {
            // '>' closes the doctype and restores `Normal` as the base state.
            '>' => self.move_to_and_reset_normal(State::Normal, Token::TagEnd),
            '<'                        => self.move_to(State::TagStarted),
            '&'                        => Ok(Some(Token::ReferenceStart)),
            ';'                        => Ok(Some(Token::ReferenceEnd)),
            '"'                        => Ok(Some(Token::DoubleQuote)),
            '\''                       => Ok(Some(Token::SingleQuote)),
            _                          => Ok(Some(Token::Character(c))),
        }
    }
594 
595     /// Encountered '?'
processing_instruction_closing(&mut self, c: char) -> Result596     fn processing_instruction_closing(&mut self, c: char) -> Result {
597         match c {
598             '>' => self.move_to_with(self.normal_state, Token::ProcessingInstructionEnd),
599             _ => self.move_to_with_unread(State::InsideProcessingInstruction, &[c], Token::Character('?')),
600         }
601     }
602 
603     /// Encountered '/'
empty_element_closing(&mut self, c: char) -> Result604     fn empty_element_closing(&mut self, c: char) -> Result {
605         match c {
606             '>' => self.move_to_with(self.normal_state, Token::EmptyTagEnd),
607             _ => self.move_to_with_unread(self.normal_state, &[c], Token::Character('/')),
608         }
609     }
610 
    /// Encountered '-' inside a comment
    fn comment_closing(&mut self, c: char, s: ClosingSubstate) -> Result {
        match s {
            ClosingSubstate::First => match c {
                '-' => self.move_to(State::CommentClosing(ClosingSubstate::Second)),
                // Lone '-' was literal comment text; push `c` back.
                _ => self.move_to_with_unread(State::InsideComment, &[c], Token::Character('-')),
            },
            ClosingSubstate::Second => match c {
                '>' => self.move_to_with(self.normal_state, Token::CommentEnd),
                // double dash not followed by a greater-than is a hard error inside comment
                _ => self.handle_error("--", c),
            },
        }
    }
625 
    /// Encountered ']' inside CDATA
    fn cdata_closing(&mut self, c: char, s: ClosingSubstate) -> Result {
        match s {
            ClosingSubstate::First => match c {
                ']' => self.move_to(State::CDataClosing(ClosingSubstate::Second)),
                // Lone ']' was literal CDATA text; push `c` back.
                _ => self.move_to_with_unread(State::InsideCdata, &[c], Token::Character(']')),
            },
            ClosingSubstate::Second => match c {
                '>' => self.move_to_with(State::Normal, Token::CDataEnd),
                // "]]" without '>' stays inside CDATA; replay the second ']' and `c`.
                _ => self.move_to_with_unread(State::InsideCdata, &[']', c], Token::Character(']')),
            },
        }
    }
639 
    /// Encountered ']' outside CDATA
    fn invalid_cdata_closing(&mut self, c: char, s: ClosingSubstate) -> Result {
        match s {
            ClosingSubstate::First => match c {
                ']' => self.move_to(State::InvalidCDataClosing(ClosingSubstate::Second)),
                _ => self.move_to_with_unread(State::Normal, &[c], Token::Character(']')),
            },
            ClosingSubstate::Second => match c {
                // NOTE(review): emits `CDataEnd` even though no CDATA section is
                // open — presumably rejected downstream by the parser; confirm.
                '>' => self.move_to_with(self.normal_state, Token::CDataEnd),
                _ => self.move_to_with_unread(State::Normal, &[']', c], Token::Character(']')),
            },
        }
    }
653 }
654 
655 #[cfg(test)]
656 mod tests {
657     use crate::{common::Position, reader::ParserConfig2};
658     use std::io::{BufReader, Cursor};
659 
660     use super::{Lexer, Token};
661 
    // Asserts the lexer yields exactly the given sequence of tokens, in order.
    macro_rules! assert_oks(
        (for $lex:ident and $buf:ident ; $($e:expr)+) => ({
            $(
                assert_eq!(Ok(Some($e)), $lex.next_token(&mut $buf));
             )+
        })
    );
669 
    // Asserts the next token is an error at the given row/column.
    // NOTE(review): the `$s` (message) argument is accepted but never checked.
    macro_rules! assert_err(
        (for $lex:ident and $buf:ident expect row $r:expr ; $c:expr, $s:expr) => ({
            let err = $lex.next_token(&mut $buf);
            assert!(err.is_err());
            let err = err.unwrap_err();
            assert_eq!($r as u64, err.position().row);
            assert_eq!($c as u64, err.position().column);
        })
    );
679 
    // Asserts the lexer reports a clean end of stream.
    macro_rules! assert_none(
        (for $lex:ident and $buf:ident) => (
            assert_eq!(Ok(None), $lex.next_token(&mut $buf))
        )
    );
685 
    /// Builds a default-configured lexer plus a buffered reader over `s`.
    fn make_lex_and_buf(s: &str) -> (Lexer, BufReader<Cursor<Vec<u8>>>) {
        (Lexer::new(&ParserConfig2::default()), BufReader::new(Cursor::new(s.to_owned().into_bytes())))
    }
689 
    #[test]
    fn tricky_pi() {
        // Inside a PI, characters like '<', '!', '-', '&' must pass through
        // as ordinary tokens until the real '?>' terminator is reached.
        let (mut lex, mut buf) = make_lex_and_buf(r#"<?x<!-- &??><x>"#);

        assert_oks!(for lex and buf ;
            Token::ProcessingInstructionStart
            Token::Character('x')
            Token::OpeningTagStart // processing of <?xml?> relies on the extra tokens
            Token::Character('!')
            Token::Character('-')
            Token::Character('-')
            Token::Character(' ')
            Token::ReferenceStart
            Token::Character('?')
            Token::ProcessingInstructionEnd
            Token::OpeningTagStart
            Token::Character('x')
            Token::TagEnd
        );
        assert_none!(for lex and buf);
    }
711 
    #[test]
    fn reparser() {
        // Markup injected via `reparse` must be lexed even after the original
        // input was exhausted, as if it had appeared in the stream.
        let (mut lex, mut buf) = make_lex_and_buf(r#"&a;"#);

        assert_oks!(for lex and buf ;
            Token::ReferenceStart
            Token::Character('a')
            Token::ReferenceEnd
        );
        lex.reparse("<hi/>").unwrap();
        assert_oks!(for lex and buf ;
            Token::OpeningTagStart
            Token::Character('h')
            Token::Character('i')
            Token::EmptyTagEnd
        );
        assert_none!(for lex and buf);
    }
730 
    // Smoke test over one document exercising: opening/closing tags, an
    // attribute in single quotes and one in double quotes, text content
    // (including a literal tab), an empty-element tag, a processing
    // instruction, a comment, and an entity reference.
    #[test]
    fn simple_lexer_test() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r#"<a p='q'> x<b z="y">d	</b></a><p/> <?nm ?> <!-- a c --> &nbsp;"#
        );

        assert_oks!(for lex and buf ;
            Token::OpeningTagStart
            Token::Character('a')
            Token::Character(' ')
            Token::Character('p')
            Token::EqualsSign
            Token::SingleQuote
            Token::Character('q')
            Token::SingleQuote
            Token::TagEnd
            Token::Character(' ')
            Token::Character('x')
            Token::OpeningTagStart
            Token::Character('b')
            Token::Character(' ')
            Token::Character('z')
            Token::EqualsSign
            Token::DoubleQuote
            Token::Character('y')
            Token::DoubleQuote
            Token::TagEnd
            Token::Character('d')
            Token::Character('\t')
            Token::ClosingTagStart
            Token::Character('b')
            Token::TagEnd
            Token::ClosingTagStart
            Token::Character('a')
            Token::TagEnd
            Token::OpeningTagStart
            Token::Character('p')
            Token::EmptyTagEnd
            Token::Character(' ')
            Token::ProcessingInstructionStart
            Token::Character('n')
            Token::Character('m')
            Token::Character(' ')
            Token::ProcessingInstructionEnd
            Token::Character(' ')
            Token::CommentStart
            Token::Character(' ')
            Token::Character('a')
            Token::Character(' ')
            Token::Character('c')
            Token::Character(' ')
            Token::CommentEnd
            Token::Character(' ')
            Token::ReferenceStart
            Token::Character('n')
            Token::Character('b')
            Token::Character('s')
            Token::Character('p')
            Token::ReferenceEnd
        );
        assert_none!(for lex and buf);
    }
793 
    // Characters that can begin multi-character lexemes (`?`, `/`, `-`, `]`)
    // must degrade to plain `Character` tokens when the rest of the lexeme
    // (`?>`, `/>`, `-->`, `]]>`) does not follow.
    #[test]
    fn special_chars_test() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r#"?x!+ // -| ]z]]"#
        );

        assert_oks!(for lex and buf ;
            Token::Character('?')
            Token::Character('x')
            Token::Character('!')
            Token::Character('+')
            Token::Character(' ')
            Token::Character('/')
            Token::Character('/')
            Token::Character(' ')
            Token::Character('-')
            Token::Character('|')
            Token::Character(' ')
            Token::Character(']')
            Token::Character('z')
            Token::Character(']')
            Token::Character(']')
        );
        assert_none!(for lex and buf);
    }
819 
    // Inside `<![CDATA[ ... ]]>`, otherwise-special characters (here `?`)
    // pass through as plain characters until the `]]>` terminator.
    #[test]
    fn cdata_test() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r#"<a><![CDATA[x y ?]]> </a>"#
        );

        assert_oks!(for lex and buf ;
            Token::OpeningTagStart
            Token::Character('a')
            Token::TagEnd
            Token::CDataStart
            Token::Character('x')
            Token::Character(' ')
            Token::Character('y')
            Token::Character(' ')
            Token::Character('?')
            Token::CDataEnd
            Token::Character(' ')
            Token::ClosingTagStart
            Token::Character('a')
            Token::TagEnd
        );
        assert_none!(for lex and buf);
    }
844 
    // Partial closers inside CDATA (`] >`, `]>`) must not terminate the
    // section — only a full `]]>` emits `CDataEnd`. A trailing `]]` outside
    // any CDATA is just two plain characters.
    #[test]
    fn cdata_closers_test() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r#"<![CDATA[] > ]> ]]><!---->]]<a>"#
        );

        assert_oks!(for lex and buf ;
            Token::CDataStart
            Token::Character(']')
            Token::Character(' ')
            Token::Character('>')
            Token::Character(' ')
            Token::Character(']')
            Token::Character('>')
            Token::Character(' ')
            Token::CDataEnd
            Token::CommentStart
            Token::CommentEnd
            Token::Character(']')
            Token::Character(']')
            Token::OpeningTagStart
            Token::Character('a')
            Token::TagEnd
        );
        assert_none!(for lex and buf);
    }
871 
    // `<!DOCTYPE` is recognized as a single `DoctypeStart` token; the
    // declaration's contents are emitted as plain characters and a bare `>`
    // closes it with `TagEnd`.
    #[test]
    fn doctype_test() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r#"<a><!DOCTYPE ab xx z> "#
        );
        assert_oks!(for lex and buf ;
            Token::OpeningTagStart
            Token::Character('a')
            Token::TagEnd
            Token::DoctypeStart
            Token::Character(' ')
            Token::Character('a')
            Token::Character('b')
            Token::Character(' ')
            Token::Character('x')
            Token::Character('x')
            Token::Character(' ')
            Token::Character('z')
            Token::TagEnd
            Token::Character(' ')
        );
        assert_none!(for lex and buf);
    }
895 
    // `->` inside a comment does not terminate it — only a complete `-->`
    // produces `CommentEnd`.
    #[test]
    fn tricky_comments() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r#"<a><!-- C ->--></a>"#
        );
        assert_oks!(for lex and buf ;
            Token::OpeningTagStart
            Token::Character('a')
            Token::TagEnd
            Token::CommentStart
            Token::Character(' ')
            Token::Character('C')
            Token::Character(' ')
            Token::Character('-')
            Token::Character('>')
            Token::CommentEnd
            Token::ClosingTagStart
            Token::Character('a')
            Token::TagEnd
        );
        assert_none!(for lex and buf);
    }
918 
    // DOCTYPE with an internal subset: `<!` begins a markup declaration
    // (`MarkupDeclarationStart`), `>` characters inside the quoted string
    // are plain characters, and the final `>` after `]` ends the DTD.
    #[test]
    fn doctype_with_internal_subset_test() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r#"<a><!DOCTYPE ab[<!ELEMENT ba ">>>"> ]> "#
        );
        assert_oks!(for lex and buf ;
            Token::OpeningTagStart
            Token::Character('a')
            Token::TagEnd
            Token::DoctypeStart
            Token::Character(' ')
            Token::Character('a')
            Token::Character('b')
            Token::Character('[')
            Token::MarkupDeclarationStart
            Token::Character('E')
            Token::Character('L')
            Token::Character('E')
            Token::Character('M')
            Token::Character('E')
            Token::Character('N')
            Token::Character('T')
            Token::Character(' ')
            Token::Character('b')
            Token::Character('a')
            Token::Character(' ')
            Token::DoubleQuote
            Token::Character('>')
            Token::Character('>')
            Token::Character('>')
            Token::DoubleQuote
            Token::TagEnd
            Token::Character(' ')
            Token::Character(']')
            Token::TagEnd
            Token::Character(' ')
        );
        assert_none!(for lex and buf);
    }
958 
    // Comments and processing instructions nested inside a DOCTYPE internal
    // subset: a `<?..?>` appearing within a comment stays inert (plain
    // characters), and a bare `>` inside a PI is emitted as `TagEnd` (see
    // the "not really" note) without closing the surrounding DTD.
    #[test]
    fn doctype_internal_pi_comment() {
        let (mut lex, mut buf) = make_lex_and_buf(
            "<!DOCTYPE a [\n<!ELEMENT l ANY> <!-- <?non?>--> <?pi > ?> \n]>"
        );
        assert_oks!(for lex and buf ;
            Token::DoctypeStart
            Token::Character(' ')
            Token::Character('a')
            Token::Character(' ')
            Token::Character('[')
            Token::Character('\n')
            Token::MarkupDeclarationStart
            Token::Character('E')
            Token::Character('L')
            Token::Character('E')
            Token::Character('M')
            Token::Character('E')
            Token::Character('N')
            Token::Character('T')
            Token::Character(' ')
            Token::Character('l')
            Token::Character(' ')
            Token::Character('A')
            Token::Character('N')
            Token::Character('Y')
            Token::TagEnd
            Token::Character(' ')
            Token::CommentStart
            Token::Character(' ')
            Token::Character('<')
            Token::Character('?')
            Token::Character('n')
            Token::Character('o')
            Token::Character('n')
            Token::Character('?')
            Token::Character('>')
            Token::CommentEnd
            Token::Character(' ')
            Token::ProcessingInstructionStart
            Token::Character('p')
            Token::Character('i')
            Token::Character(' ')
            Token::TagEnd // not really
            Token::Character(' ')
            Token::ProcessingInstructionEnd
            Token::Character(' ')
            Token::Character('\n')
            Token::Character(']')
            Token::TagEnd // DTD
        );
        assert_none!(for lex and buf);
    }
1012 
1013     #[test]
end_of_stream_handling_ok()1014     fn end_of_stream_handling_ok() {
1015         macro_rules! eof_check(
1016             ($data:expr ; $token:expr) => ({
1017                 let (mut lex, mut buf) = make_lex_and_buf($data);
1018                 assert_oks!(for lex and buf ; $token);
1019                 assert_none!(for lex and buf);
1020             })
1021         );
1022         eof_check!("?"  ; Token::Character('?'));
1023         eof_check!("/"  ; Token::Character('/'));
1024         eof_check!("-"  ; Token::Character('-'));
1025         eof_check!("]"  ; Token::Character(']'));
1026         eof_check!("]"  ; Token::Character(']'));
1027         eof_check!("]"  ; Token::Character(']'));
1028     }
1029 
    // Input that ends in the middle of a multi-character lexeme (`<`, `<!`,
    // `<!-`, and every prefix of `<![CDATA`) must fail with "Unexpected end
    // of stream", reported at row 0 and the column just past the prefix.
    #[test]
    fn end_of_stream_handling_error() {
        macro_rules! eof_check(
            ($data:expr; $r:expr, $c:expr) => ({
                let (mut lex, mut buf) = make_lex_and_buf($data);
                assert_err!(for lex and buf expect row $r ; $c, "Unexpected end of stream");
                assert_none!(for lex and buf);
            })
        );
        eof_check!("<"        ; 0, 1);
        eof_check!("<!"       ; 0, 2);
        eof_check!("<!-"      ; 0, 3);
        eof_check!("<!["      ; 0, 3);
        eof_check!("<![C"     ; 0, 4);
        eof_check!("<![CD"    ; 0, 5);
        eof_check!("<![CDA"   ; 0, 6);
        eof_check!("<![CDAT"  ; 0, 7);
        eof_check!("<![CDATA" ; 0, 8);
    }
1049 
    // `<!` followed by a character that cannot continue any known construct
    // is an error in strict mode; with `disable_errors` the lexer instead
    // replays the prefix as plain characters.
    #[test]
    fn error_in_comment_or_cdata_prefix() {
        let (mut lex, mut buf) = make_lex_and_buf("<!x");
        assert_err!(for lex and buf expect row 0 ; 0,
            "Unexpected token '<!' before 'x'"
        );

        let (mut lex, mut buf) = make_lex_and_buf("<!x");
        lex.disable_errors();
        assert_oks!(for lex and buf ;
            Token::Character('<')
            Token::Character('!')
            Token::Character('x')
        );
        assert_none!(for lex and buf);
    }
1066 
    // A `<!-` prefix not completed to `<!--` is an error in strict mode;
    // with `disable_errors` the prefix and the offending character are
    // replayed as plain characters.
    #[test]
    fn error_in_comment_started() {
        let (mut lex, mut buf) = make_lex_and_buf("<!-\t");
        assert_err!(for lex and buf expect row 0 ; 0,
            "Unexpected token '<!-' before '\t'"
        );

        let (mut lex, mut buf) = make_lex_and_buf("<!-\t");
        lex.disable_errors();
        assert_oks!(for lex and buf ;
            Token::Character('<')
            Token::Character('!')
            Token::Character('-')
            Token::Character('\t')
        );
        assert_none!(for lex and buf);
    }
1084 
    // `--` is illegal inside a comment unless it closes it. Outside a
    // comment the same input is just three plain characters.
    #[test]
    fn error_in_comment_two_dashes_not_at_end() {
        let (mut lex, mut buf) = make_lex_and_buf("--x");
        // Force the lexer directly into the inside-a-comment state so the
        // `--x` input is interpreted as comment content.
        lex.st = super::State::InsideComment;
        assert_err!(for lex and buf expect row 0; 0,
            "Unexpected token '--' before 'x'"
        );

        // Same input from the default (outside-comment) state is harmless.
        let (mut lex, mut buf) = make_lex_and_buf("--x");
        assert_oks!(for lex and buf ;
            Token::Character('-')
            Token::Character('-')
            Token::Character('x')
        );
    }
1100 
    // Checks one malformed input in both lexer modes: strict lexing must
    // report error `$s` at row `$r`, column `$c`; with errors disabled the
    // rejected prefix `$chunk` is replayed as plain `Character` tokens,
    // followed by the offending character `$app`, then end of stream.
    macro_rules! check_case(
        ($chunk:expr, $app:expr; $data:expr; $r:expr, $c:expr, $s:expr) => ({
            let (mut lex, mut buf) = make_lex_and_buf($data);
            assert_err!(for lex and buf expect row $r ; $c, $s);

            let (mut lex, mut buf) = make_lex_and_buf($data);
            lex.disable_errors();
            for c in $chunk.chars() {
                assert_eq!(Ok(Some(Token::Character(c))), lex.next_token(&mut buf));
            }
            assert_oks!(for lex and buf ;
                Token::Character($app)
            );
            assert_none!(for lex and buf);
        })
    );
1117 
1118     #[test]
token_size()1119     fn token_size() {
1120         assert_eq!(4, std::mem::size_of::<Token>());
1121         assert_eq!(2, std::mem::size_of::<super::State>());
1122     }
1123 
    // Every proper prefix of `<![CDATA[` followed by a wrong character is an
    // error in strict mode; with errors disabled the prefix is replayed as
    // plain characters (exercised via `check_case!`).
    #[test]
    fn error_in_cdata_started() {
        check_case!("<![",      '['; "<![["      ; 0, 0, "Unexpected token '<![' before '['");
        check_case!("<![C",     '['; "<![C["     ; 0, 0, "Unexpected token '<![C' before '['");
        check_case!("<![CD",    '['; "<![CD["    ; 0, 0, "Unexpected token '<![CD' before '['");
        check_case!("<![CDA",   '['; "<![CDA["   ; 0, 0, "Unexpected token '<![CDA' before '['");
        check_case!("<![CDAT",  '['; "<![CDAT["  ; 0, 0, "Unexpected token '<![CDAT' before '['");
        check_case!("<![CDATA", '|'; "<![CDATA|" ; 0, 0, "Unexpected token '<![CDATA' before '|'");
    }
1133 
    // Every proper prefix of `<!DOCTYPE` followed by a wrong character is an
    // error in strict mode; with errors disabled the prefix is replayed as
    // plain characters (exercised via `check_case!`).
    #[test]
    fn error_in_doctype_started() {
        check_case!("<!D",      'a'; "<!Da"      ; 0, 0, "Unexpected token '<!D' before 'a'");
        check_case!("<!DO",     'b'; "<!DOb"     ; 0, 0, "Unexpected token '<!DO' before 'b'");
        check_case!("<!DOC",    'c'; "<!DOCc"    ; 0, 0, "Unexpected token '<!DOC' before 'c'");
        check_case!("<!DOCT",   'd'; "<!DOCTd"   ; 0, 0, "Unexpected token '<!DOCT' before 'd'");
        check_case!("<!DOCTY",  'e'; "<!DOCTYe"  ; 0, 0, "Unexpected token '<!DOCTY' before 'e'");
        check_case!("<!DOCTYP", 'f'; "<!DOCTYPf" ; 0, 0, "Unexpected token '<!DOCTYP' before 'f'");
    }
1143 
1144 
1145 
    // Regression test for issue #98: CDATA content ending with `]` right
    // before the `]]>` terminator — the trailing content bracket must be
    // emitted as a `Character` and not swallowed by the `CDataEnd` match.
    #[test]
    fn issue_98_cdata_ending_with_right_bracket() {
        let (mut lex, mut buf) = make_lex_and_buf(
            r#"<![CDATA[Foo [Bar]]]>"#
        );

        assert_oks!(for lex and buf ;
            Token::CDataStart
            Token::Character('F')
            Token::Character('o')
            Token::Character('o')
            Token::Character(' ')
            Token::Character('[')
            Token::Character('B')
            Token::Character('a')
            Token::Character('r')
            Token::Character(']')
            Token::CDataEnd
        );
        assert_none!(for lex and buf);
    }
1167 }
1168