1 //! Contains simple lexer for XML documents. 2 //! 3 //! This module is for internal use. Use `xml::pull` module to do parsing. 4 5 6 use crate::reader::ErrorKind; 7 use crate::reader::error::SyntaxError; 8 use std::collections::VecDeque; 9 use std::fmt; 10 use std::io::Read; 11 use std::result; 12 use crate::common::{is_name_char, is_whitespace_char, Position, TextPosition, is_xml10_char, is_xml11_char}; 13 use crate::reader::Error; 14 use crate::util::{CharReader, Encoding}; 15 16 use super::ParserConfig2; 17 18 /// `Token` represents a single lexeme of an XML document. These lexemes 19 /// are used to perform actual parsing. 20 #[derive(Copy, Clone, PartialEq, Eq, Debug)] 21 pub(crate) enum Token { 22 /// `<?` 23 ProcessingInstructionStart, 24 /// `?>` 25 ProcessingInstructionEnd, 26 /// `<!DOCTYPE 27 DoctypeStart, 28 /// `<` 29 OpeningTagStart, 30 /// `</` 31 ClosingTagStart, 32 /// `>` 33 TagEnd, 34 /// `/>` 35 EmptyTagEnd, 36 /// `<!--` 37 CommentStart, 38 /// `-->` 39 CommentEnd, 40 /// Any non-special character except whitespace. 
41 Character(char), 42 /// `=` 43 EqualsSign, 44 /// `'` 45 SingleQuote, 46 /// `"` 47 DoubleQuote, 48 /// `<![CDATA[` 49 CDataStart, 50 /// `]]>` 51 CDataEnd, 52 /// `&` 53 ReferenceStart, 54 /// `;` 55 ReferenceEnd, 56 /// `<!` of `ENTITY` 57 MarkupDeclarationStart, 58 } 59 60 impl fmt::Display for Token { 61 #[cold] fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result62 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 63 match *self { 64 Token::Character(c) => c.fmt(f), 65 other => match other { 66 Token::OpeningTagStart => "<", 67 Token::ProcessingInstructionStart => "<?", 68 Token::DoctypeStart => "<!DOCTYPE", 69 Token::ClosingTagStart => "</", 70 Token::CommentStart => "<!--", 71 Token::CDataStart => "<![CDATA[", 72 Token::TagEnd => ">", 73 Token::EmptyTagEnd => "/>", 74 Token::ProcessingInstructionEnd => "?>", 75 Token::CommentEnd => "-->", 76 Token::CDataEnd => "]]>", 77 Token::ReferenceStart => "&", 78 Token::ReferenceEnd => ";", 79 Token::EqualsSign => "=", 80 Token::SingleQuote => "'", 81 Token::DoubleQuote => "\"", 82 Token::MarkupDeclarationStart => "<!", 83 _ => unreachable!() 84 }.fmt(f), 85 } 86 } 87 } 88 89 impl Token { as_static_str(&self) -> Option<&'static str>90 pub fn as_static_str(&self) -> Option<&'static str> { 91 match *self { 92 Token::OpeningTagStart => Some("<"), 93 Token::ProcessingInstructionStart => Some("<?"), 94 Token::DoctypeStart => Some("<!DOCTYPE"), 95 Token::ClosingTagStart => Some("</"), 96 Token::CommentStart => Some("<!--"), 97 Token::CDataStart => Some("<![CDATA["), 98 Token::TagEnd => Some(">"), 99 Token::EmptyTagEnd => Some("/>"), 100 Token::ProcessingInstructionEnd => Some("?>"), 101 Token::CommentEnd => Some("-->"), 102 Token::CDataEnd => Some("]]>"), 103 Token::ReferenceStart => Some("&"), 104 Token::ReferenceEnd => Some(";"), 105 Token::EqualsSign => Some("="), 106 Token::SingleQuote => Some("'"), 107 Token::DoubleQuote => Some("\""), 108 _ => None 109 } 110 } 111 112 // using 
String.push_str(token.to_string()) is simply way too slow push_to_string(&self, target: &mut String)113 pub fn push_to_string(&self, target: &mut String) { 114 match *self { 115 Token::Character(c) => { 116 debug_assert!(is_xml10_char(c) || is_xml11_char(c)); 117 target.push(c) 118 }, 119 _ => if let Some(s) = self.as_static_str() { 120 target.push_str(s); 121 } 122 } 123 } 124 } 125 126 #[derive(Copy, Clone)] 127 enum State { 128 /// Default state 129 Normal, 130 /// Triggered on '<' 131 TagStarted, 132 /// Triggered on '<!' 133 CommentOrCDataOrDoctypeStarted, 134 /// Triggered on '<!-' 135 CommentStarted, 136 /// Triggered on '<!D' up to '<!DOCTYPE' 137 DoctypeStarted(DoctypeStartedSubstate), 138 /// Other items like `<!ELEMENT` in DTD 139 InsideMarkupDeclaration, 140 /// Triggered after DoctypeStarted to handle sub elements 141 InsideDoctype, 142 /// Triggered on '<![' up to '<![CDATA' 143 CDataStarted(CDataStartedSubstate), 144 /// Triggered on '?' 145 ProcessingInstructionClosing, 146 /// Triggered on '/' 147 EmptyTagClosing, 148 /// Triggered on '-' up to '--' 149 CommentClosing(ClosingSubstate), 150 /// Triggered on ']' up to ']]' inside CDATA 151 CDataClosing(ClosingSubstate), 152 /// Triggered on ']' up to ']]' outside CDATA 153 InvalidCDataClosing(ClosingSubstate), 154 /// After `<!--` 155 InsideComment, 156 /// After `<[[` 157 InsideCdata, 158 /// After `<?` 159 InsideProcessingInstruction, 160 /// `<!ENTITY "here">` 161 InsideMarkupDeclarationQuotedString(QuoteStyle), 162 } 163 164 #[derive(Copy, Clone, Eq, PartialEq)] 165 enum QuoteStyle { 166 Single, Double 167 } 168 169 #[derive(Copy, Clone)] 170 enum ClosingSubstate { 171 First, Second 172 } 173 174 #[derive(Copy, Clone)] 175 enum DoctypeStartedSubstate { 176 D, DO, DOC, DOCT, DOCTY, DOCTYP 177 } 178 179 #[derive(Copy, Clone)] 180 enum CDataStartedSubstate { 181 E, C, CD, CDA, CDAT, CDATA 182 } 183 184 /// `Result` represents lexing result. It is either a token or an error message. 
185 pub(crate) type Result<T = Option<Token>, E = Error> = result::Result<T, E>; 186 187 /// Helps to set up a dispatch table for lexing large unambigous tokens like 188 /// `<![CDATA[` or `<!DOCTYPE `. 189 macro_rules! dispatch_on_enum_state( 190 ($_self:ident, $s:expr, $c:expr, $is:expr, 191 $($st:ident; $stc:expr ; $next_st:ident ; $chunk:expr),+; 192 $end_st:ident ; $end_c:expr ; $end_chunk:expr ; $e:expr) => ( 193 match $s { 194 $( 195 $st => match $c { 196 $stc => $_self.move_to($is($next_st)), 197 _ => $_self.handle_error($chunk, $c) 198 }, 199 )+ 200 $end_st => match $c { 201 $end_c => $e, 202 _ => $_self.handle_error($end_chunk, $c) 203 } 204 } 205 ) 206 ); 207 208 /// `Lexer` is a lexer for XML documents, which implements pull API. 209 /// 210 /// Main method is `next_token` which accepts an `std::io::Read` instance and 211 /// tries to read the next lexeme from it. 212 /// 213 /// When `skip_errors` flag is set, invalid lexemes will be returned as `Chunk`s. 214 /// When it is not set, errors will be reported as `Err` objects with a string message. 215 /// By default this flag is not set. Use `enable_errors` and `disable_errors` methods 216 /// to toggle the behavior. 217 pub(crate) struct Lexer { 218 st: State, 219 reader: CharReader, 220 pos: TextPosition, 221 head_pos: TextPosition, 222 char_queue: VecDeque<char>, 223 /// Default state to go back to after a tag end (may be `InsideDoctype`) 224 normal_state: State, 225 inside_token: bool, 226 eof_handled: bool, 227 reparse_depth: u8, 228 #[cfg(test)] 229 skip_errors: bool, 230 231 max_entity_expansion_depth: u8, 232 max_entity_expansion_length: usize, 233 } 234 235 impl Position for Lexer { 236 #[inline] 237 /// Returns the position of the last token produced by the lexer position(&self) -> TextPosition238 fn position(&self) -> TextPosition { self.pos } 239 } 240 241 impl Lexer { 242 /// Returns a new lexer with default state. 
new(config: &ParserConfig2) -> Lexer243 pub(crate) fn new(config: &ParserConfig2) -> Lexer { 244 Lexer { 245 reader: CharReader::new(), 246 pos: TextPosition::new(), 247 head_pos: TextPosition::new(), 248 char_queue: VecDeque::with_capacity(4), // TODO: check size 249 st: State::Normal, 250 normal_state: State::Normal, 251 inside_token: false, 252 eof_handled: false, 253 reparse_depth: 0, 254 #[cfg(test)] 255 skip_errors: false, 256 257 max_entity_expansion_depth: config.max_entity_expansion_depth, 258 max_entity_expansion_length: config.max_entity_expansion_length, 259 } 260 } 261 encoding(&mut self) -> Encoding262 pub(crate) fn encoding(&mut self) -> Encoding { 263 self.reader.encoding 264 } 265 set_encoding(&mut self, encoding: Encoding)266 pub(crate) fn set_encoding(&mut self, encoding: Encoding) { 267 self.reader.encoding = encoding; 268 } 269 270 /// Disables error handling so `next_token` will return `Some(Chunk(..))` 271 /// upon invalid lexeme with this lexeme content. disable_errors(&mut self)272 #[cfg(test)] fn disable_errors(&mut self) { self.skip_errors = true; } 273 274 /// Reset the eof handled flag of the lexer. 275 #[inline] reset_eof_handled(&mut self)276 pub fn reset_eof_handled(&mut self) { self.eof_handled = false; } 277 278 /// Tries to read the next token from the buffer. 279 /// 280 /// It is possible to pass different instaces of `BufReader` each time 281 /// this method is called, but the resulting behavior is undefined in this case. 282 /// 283 /// Return value: 284 /// * `Err(reason) where reason: reader::Error` - when an error occurs; 285 /// * `Ok(None)` - upon end of stream is reached; 286 /// * `Ok(Some(token)) where token: Token` - in case a complete-token has been read from the stream. 
next_token<B: Read>(&mut self, b: &mut B) -> Result287 pub fn next_token<B: Read>(&mut self, b: &mut B) -> Result { 288 // Already reached end of buffer 289 if self.eof_handled { 290 return Ok(None); 291 } 292 293 if !self.inside_token { 294 self.pos = self.head_pos; 295 self.inside_token = true; 296 } 297 298 // Check if we have saved a char or two for ourselves 299 while let Some(c) = self.char_queue.pop_front() { 300 match self.dispatch_char(c)? { 301 Some(t) => { 302 self.inside_token = false; 303 return Ok(Some(t)); 304 } 305 None => {} // continue 306 } 307 } 308 // if char_queue is empty, all circular reparsing is done 309 self.reparse_depth = 0; 310 loop { 311 let c = match self.reader.next_char_from(b)? { 312 Some(c) => c, // got next char 313 None => break, // nothing to read left 314 }; 315 316 if c == '\n' { 317 self.head_pos.new_line(); 318 } else { 319 self.head_pos.advance(1); 320 } 321 322 match self.dispatch_char(c)? { 323 Some(t) => { 324 self.inside_token = false; 325 return Ok(Some(t)); 326 } 327 None => { 328 // continue 329 } 330 } 331 } 332 333 self.end_of_stream() 334 } 335 336 #[inline(never)] end_of_stream(&mut self) -> Result337 fn end_of_stream(&mut self) -> Result { 338 // Handle end of stream 339 self.eof_handled = true; 340 self.pos = self.head_pos; 341 match self.st { 342 State::InsideCdata | State::CDataClosing(_) => Err(self.error(SyntaxError::UnclosedCdata)), 343 State::TagStarted | State::CommentOrCDataOrDoctypeStarted | 344 State::CommentStarted | State::CDataStarted(_)| State::DoctypeStarted(_) | 345 State::CommentClosing(ClosingSubstate::Second) | 346 State::InsideComment | State::InsideMarkupDeclaration | 347 State::InsideProcessingInstruction | State::ProcessingInstructionClosing | 348 State::InsideDoctype | State::InsideMarkupDeclarationQuotedString(_) => 349 Err(self.error(SyntaxError::UnexpectedEof)), 350 State::EmptyTagClosing => 351 Ok(Some(Token::Character('/'))), 352 State::CommentClosing(ClosingSubstate::First) => 
353 Ok(Some(Token::Character('-'))), 354 State::InvalidCDataClosing(ClosingSubstate::First) => 355 Ok(Some(Token::Character(']'))), 356 State::InvalidCDataClosing(ClosingSubstate::Second) => { 357 self.eof_handled = false; 358 self.move_to_with_unread(State::Normal, &[']'], Token::Character(']')) 359 }, 360 State::Normal => 361 Ok(None), 362 } 363 } 364 365 #[cold] error(&self, e: SyntaxError) -> Error366 fn error(&self, e: SyntaxError) -> Error { 367 Error { 368 pos: self.position(), 369 kind: ErrorKind::Syntax(e.to_cow()), 370 } 371 } 372 373 374 #[inline(never)] dispatch_char(&mut self, c: char) -> Result375 fn dispatch_char(&mut self, c: char) -> Result { 376 match self.st { 377 State::Normal => self.normal(c), 378 State::TagStarted => self.tag_opened(c), 379 State::EmptyTagClosing => self.empty_element_closing(c), 380 State::CommentOrCDataOrDoctypeStarted => self.comment_or_cdata_or_doctype_started(c), 381 State::InsideCdata => self.inside_cdata(c), 382 State::CDataStarted(s) => self.cdata_started(c, s), 383 State::InsideComment => self.inside_comment_state(c), 384 State::CommentStarted => self.comment_started(c), 385 State::InsideProcessingInstruction => self.inside_processing_instruction(c), 386 State::ProcessingInstructionClosing => self.processing_instruction_closing(c), 387 State::CommentClosing(s) => self.comment_closing(c, s), 388 State::CDataClosing(s) => self.cdata_closing(c, s), 389 State::InsideDoctype => self.inside_doctype(c), 390 State::DoctypeStarted(s) => self.doctype_started(c, s), 391 State::InvalidCDataClosing(s) => self.invalid_cdata_closing(c, s), 392 State::InsideMarkupDeclaration => self.markup_declaration(c), 393 State::InsideMarkupDeclarationQuotedString(q) => self.markup_declaration_string(c, q), 394 } 395 } 396 397 #[inline] move_to(&mut self, st: State) -> Result398 fn move_to(&mut self, st: State) -> Result { 399 self.st = st; 400 Ok(None) 401 } 402 403 #[inline] move_to_with(&mut self, st: State, token: Token) -> Result404 fn 
move_to_with(&mut self, st: State, token: Token) -> Result { 405 self.st = st; 406 Ok(Some(token)) 407 } 408 409 #[inline] move_to_and_reset_normal(&mut self, st: State, token: Token) -> Result410 fn move_to_and_reset_normal(&mut self, st: State, token: Token) -> Result { 411 self.normal_state = st; 412 self.st = st; 413 Ok(Some(token)) 414 } 415 move_to_with_unread(&mut self, st: State, cs: &[char], token: Token) -> Result416 fn move_to_with_unread(&mut self, st: State, cs: &[char], token: Token) -> Result { 417 for c in cs.iter().rev().copied() { 418 self.char_queue.push_front(c); 419 } 420 self.move_to_with(st, token) 421 } 422 reparse(&mut self, markup: &str) -> Result<()>423 pub(crate) fn reparse(&mut self, markup: &str) -> Result<()> { 424 if markup.is_empty() { 425 return Ok(()); 426 } 427 428 self.reparse_depth += 1; 429 if self.reparse_depth > self.max_entity_expansion_depth || self.char_queue.len() > self.max_entity_expansion_length { 430 return Err(self.error(SyntaxError::EntityTooBig)) 431 } 432 433 self.eof_handled = false; 434 self.char_queue.reserve(markup.len()); 435 for c in markup.chars().rev() { 436 self.char_queue.push_front(c); 437 } 438 439 Ok(()) 440 } 441 handle_error(&mut self, chunk: &'static str, c: char) -> Result442 fn handle_error(&mut self, chunk: &'static str, c: char) -> Result { 443 debug_assert!(!chunk.is_empty()); 444 445 #[cfg(test)] 446 if self.skip_errors { 447 let mut chars = chunk.chars(); 448 let first = chars.next().unwrap_or('\0'); 449 self.char_queue.extend(chars); 450 self.char_queue.push_back(c); 451 return self.move_to_with(State::Normal, Token::Character(first)); 452 } 453 Err(self.error(SyntaxError::UnexpectedTokenBefore(chunk, c))) 454 } 455 456 /// Encountered a char normal(&mut self, c: char) -> Result457 fn normal(&mut self, c: char) -> Result { 458 match c { 459 '<' => self.move_to(State::TagStarted), 460 '>' => Ok(Some(Token::TagEnd)), 461 '/' => self.move_to(State::EmptyTagClosing), 462 '=' => 
Ok(Some(Token::EqualsSign)), 463 '"' => Ok(Some(Token::DoubleQuote)), 464 '\'' => Ok(Some(Token::SingleQuote)), 465 ']' => self.move_to(State::InvalidCDataClosing(ClosingSubstate::First)), 466 '&' => Ok(Some(Token::ReferenceStart)), 467 ';' => Ok(Some(Token::ReferenceEnd)), 468 _ => Ok(Some(Token::Character(c))) 469 } 470 } 471 inside_cdata(&mut self, c: char) -> Result472 fn inside_cdata(&mut self, c: char) -> Result { 473 match c { 474 ']' => self.move_to(State::CDataClosing(ClosingSubstate::First)), 475 _ => Ok(Some(Token::Character(c))) 476 } 477 } 478 inside_processing_instruction(&mut self, c: char) -> Result479 fn inside_processing_instruction(&mut self, c: char) -> Result { 480 // These tokens are used by `<?xml?>` parser 481 match c { 482 '?' => self.move_to(State::ProcessingInstructionClosing), 483 '<' => Ok(Some(Token::OpeningTagStart)), 484 '>' => Ok(Some(Token::TagEnd)), 485 '/' => Ok(Some(Token::ClosingTagStart)), 486 '=' => Ok(Some(Token::EqualsSign)), 487 '"' => Ok(Some(Token::DoubleQuote)), 488 '\'' => Ok(Some(Token::SingleQuote)), 489 '&' => Ok(Some(Token::ReferenceStart)), 490 ';' => Ok(Some(Token::ReferenceEnd)), 491 _ => Ok(Some(Token::Character(c))) 492 } 493 } 494 inside_comment_state(&mut self, c: char) -> Result495 fn inside_comment_state(&mut self, c: char) -> Result { 496 match c { 497 '-' => self.move_to(State::CommentClosing(ClosingSubstate::First)), 498 _ => Ok(Some(Token::Character(c))) 499 } 500 } 501 502 /// Encountered '<' tag_opened(&mut self, c: char) -> Result503 fn tag_opened(&mut self, c: char) -> Result { 504 match c { 505 '?' => self.move_to_with(State::InsideProcessingInstruction, Token::ProcessingInstructionStart), 506 '/' => self.move_to_with(self.normal_state, Token::ClosingTagStart), 507 '!' 
=> self.move_to(State::CommentOrCDataOrDoctypeStarted), 508 _ if is_whitespace_char(c) => self.move_to_with_unread(self.normal_state, &[c], Token::OpeningTagStart), 509 _ if is_name_char(c) => self.move_to_with_unread(self.normal_state, &[c], Token::OpeningTagStart), 510 _ => self.handle_error("<", c) 511 } 512 } 513 514 /// Encountered '<!' comment_or_cdata_or_doctype_started(&mut self, c: char) -> Result515 fn comment_or_cdata_or_doctype_started(&mut self, c: char) -> Result { 516 match c { 517 '-' => self.move_to(State::CommentStarted), 518 '[' => self.move_to(State::CDataStarted(CDataStartedSubstate::E)), 519 'D' => self.move_to(State::DoctypeStarted(DoctypeStartedSubstate::D)), 520 'E' | 'A' | 'N' if matches!(self.normal_state, State::InsideDoctype) => { 521 self.move_to_with_unread(State::InsideMarkupDeclaration, &[c], Token::MarkupDeclarationStart) 522 }, 523 _ => self.handle_error("<!", c), 524 } 525 } 526 527 /// Encountered '<!-' comment_started(&mut self, c: char) -> Result528 fn comment_started(&mut self, c: char) -> Result { 529 match c { 530 '-' => self.move_to_with(State::InsideComment, Token::CommentStart), 531 _ => self.handle_error("<!-", c), 532 } 533 } 534 535 /// Encountered '<![' cdata_started(&mut self, c: char, s: CDataStartedSubstate) -> Result536 fn cdata_started(&mut self, c: char, s: CDataStartedSubstate) -> Result { 537 use self::CDataStartedSubstate::{C, CD, CDA, CDAT, CDATA, E}; 538 dispatch_on_enum_state!(self, s, c, State::CDataStarted, 539 E ; 'C' ; C ; "<![", 540 C ; 'D' ; CD ; "<![C", 541 CD ; 'A' ; CDA ; "<![CD", 542 CDA ; 'T' ; CDAT ; "<![CDA", 543 CDAT ; 'A' ; CDATA ; "<![CDAT"; 544 CDATA ; '[' ; "<![CDATA" ; self.move_to_with(State::InsideCdata, Token::CDataStart) 545 ) 546 } 547 548 /// Encountered '<!…' that isn't DOCTYPE or CDATA markup_declaration(&mut self, c: char) -> Result549 fn markup_declaration(&mut self, c: char) -> Result { 550 match c { 551 '<' => self.handle_error("<!", c), 552 '>' => 
self.move_to_with(self.normal_state, Token::TagEnd), 553 '&' => Ok(Some(Token::ReferenceStart)), 554 ';' => Ok(Some(Token::ReferenceEnd)), 555 '"' => self.move_to_with(State::InsideMarkupDeclarationQuotedString(QuoteStyle::Double), Token::DoubleQuote), 556 '\'' => self.move_to_with(State::InsideMarkupDeclarationQuotedString(QuoteStyle::Single), Token::SingleQuote), 557 _ => Ok(Some(Token::Character(c))), 558 } 559 } 560 markup_declaration_string(&mut self, c: char, q: QuoteStyle) -> Result561 fn markup_declaration_string(&mut self, c: char, q: QuoteStyle) -> Result { 562 match c { 563 '"' if q == QuoteStyle::Double => self.move_to_with(State::InsideMarkupDeclaration, Token::DoubleQuote), 564 '\'' if q == QuoteStyle::Single => self.move_to_with(State::InsideMarkupDeclaration, Token::SingleQuote), 565 _ => Ok(Some(Token::Character(c))), 566 } 567 } 568 569 /// Encountered '<!D' doctype_started(&mut self, c: char, s: DoctypeStartedSubstate) -> Result570 fn doctype_started(&mut self, c: char, s: DoctypeStartedSubstate) -> Result { 571 use self::DoctypeStartedSubstate::{D, DO, DOC, DOCT, DOCTY, DOCTYP}; 572 dispatch_on_enum_state!(self, s, c, State::DoctypeStarted, 573 D ; 'O' ; DO ; "<!D", 574 DO ; 'C' ; DOC ; "<!DO", 575 DOC ; 'T' ; DOCT ; "<!DOC", 576 DOCT ; 'Y' ; DOCTY ; "<!DOCT", 577 DOCTY ; 'P' ; DOCTYP ; "<!DOCTY"; 578 DOCTYP ; 'E' ; "<!DOCTYP" ; self.move_to_and_reset_normal(State::InsideDoctype, Token::DoctypeStart) 579 ) 580 } 581 582 /// State used while awaiting the closing bracket for the <!DOCTYPE tag inside_doctype(&mut self, c: char) -> Result583 fn inside_doctype(&mut self, c: char) -> Result { 584 match c { 585 '>' => self.move_to_and_reset_normal(State::Normal, Token::TagEnd), 586 '<' => self.move_to(State::TagStarted), 587 '&' => Ok(Some(Token::ReferenceStart)), 588 ';' => Ok(Some(Token::ReferenceEnd)), 589 '"' => Ok(Some(Token::DoubleQuote)), 590 '\'' => Ok(Some(Token::SingleQuote)), 591 _ => Ok(Some(Token::Character(c))), 592 } 593 } 594 595 /// 
Encountered '?' processing_instruction_closing(&mut self, c: char) -> Result596 fn processing_instruction_closing(&mut self, c: char) -> Result { 597 match c { 598 '>' => self.move_to_with(self.normal_state, Token::ProcessingInstructionEnd), 599 _ => self.move_to_with_unread(State::InsideProcessingInstruction, &[c], Token::Character('?')), 600 } 601 } 602 603 /// Encountered '/' empty_element_closing(&mut self, c: char) -> Result604 fn empty_element_closing(&mut self, c: char) -> Result { 605 match c { 606 '>' => self.move_to_with(self.normal_state, Token::EmptyTagEnd), 607 _ => self.move_to_with_unread(self.normal_state, &[c], Token::Character('/')), 608 } 609 } 610 611 /// Encountered '-' comment_closing(&mut self, c: char, s: ClosingSubstate) -> Result612 fn comment_closing(&mut self, c: char, s: ClosingSubstate) -> Result { 613 match s { 614 ClosingSubstate::First => match c { 615 '-' => self.move_to(State::CommentClosing(ClosingSubstate::Second)), 616 _ => self.move_to_with_unread(State::InsideComment, &[c], Token::Character('-')), 617 }, 618 ClosingSubstate::Second => match c { 619 '>' => self.move_to_with(self.normal_state, Token::CommentEnd), 620 // double dash not followed by a greater-than is a hard error inside comment 621 _ => self.handle_error("--", c), 622 }, 623 } 624 } 625 626 /// Encountered ']' cdata_closing(&mut self, c: char, s: ClosingSubstate) -> Result627 fn cdata_closing(&mut self, c: char, s: ClosingSubstate) -> Result { 628 match s { 629 ClosingSubstate::First => match c { 630 ']' => self.move_to(State::CDataClosing(ClosingSubstate::Second)), 631 _ => self.move_to_with_unread(State::InsideCdata, &[c], Token::Character(']')), 632 }, 633 ClosingSubstate::Second => match c { 634 '>' => self.move_to_with(State::Normal, Token::CDataEnd), 635 _ => self.move_to_with_unread(State::InsideCdata, &[']', c], Token::Character(']')), 636 }, 637 } 638 } 639 640 /// Encountered ']' invalid_cdata_closing(&mut self, c: char, s: ClosingSubstate) -> Result641 
fn invalid_cdata_closing(&mut self, c: char, s: ClosingSubstate) -> Result {
    match s {
        ClosingSubstate::First => match c {
            ']' => self.move_to(State::InvalidCDataClosing(ClosingSubstate::Second)),
            // Lone ']' outside CDATA is just a character.
            _ => self.move_to_with_unread(State::Normal, &[c], Token::Character(']')),
        },
        ClosingSubstate::Second => match c {
            // ']]>' outside CDATA still produces a `CDataEnd` token; the
            // parser layer decides whether that is an error.
            '>' => self.move_to_with(self.normal_state, Token::CDataEnd),
            _ => self.move_to_with_unread(State::Normal, &[']', c], Token::Character(']')),
        },
    }
}
}

#[cfg(test)]
mod tests {
    use crate::{common::Position, reader::ParserConfig2};
    use std::io::{BufReader, Cursor};

    use super::{Lexer, Token};

    macro_rules! assert_oks(
        (for $lex:ident and $buf:ident ; $($e:expr)+) => ({
            $(
                assert_eq!(Ok(Some($e)), $lex.next_token(&mut $buf));
            )+
        })
    );

    macro_rules! assert_err(
        (for $lex:ident and $buf:ident expect row $r:expr ; $c:expr, $s:expr) => ({
            let err = $lex.next_token(&mut $buf);
            assert!(err.is_err());
            let err = err.unwrap_err();
            assert_eq!($r as u64, err.position().row);
            assert_eq!($c as u64, err.position().column);
        })
    );

    macro_rules!
assert_none( 681 (for $lex:ident and $buf:ident) => ( 682 assert_eq!(Ok(None), $lex.next_token(&mut $buf)) 683 ) 684 ); 685 make_lex_and_buf(s: &str) -> (Lexer, BufReader<Cursor<Vec<u8>>>)686 fn make_lex_and_buf(s: &str) -> (Lexer, BufReader<Cursor<Vec<u8>>>) { 687 (Lexer::new(&ParserConfig2::default()), BufReader::new(Cursor::new(s.to_owned().into_bytes()))) 688 } 689 690 #[test] tricky_pi()691 fn tricky_pi() { 692 let (mut lex, mut buf) = make_lex_and_buf(r#"<?x<!-- &??><x>"#); 693 694 assert_oks!(for lex and buf ; 695 Token::ProcessingInstructionStart 696 Token::Character('x') 697 Token::OpeningTagStart // processing of <?xml?> relies on the extra tokens 698 Token::Character('!') 699 Token::Character('-') 700 Token::Character('-') 701 Token::Character(' ') 702 Token::ReferenceStart 703 Token::Character('?') 704 Token::ProcessingInstructionEnd 705 Token::OpeningTagStart 706 Token::Character('x') 707 Token::TagEnd 708 ); 709 assert_none!(for lex and buf); 710 } 711 712 #[test] reparser()713 fn reparser() { 714 let (mut lex, mut buf) = make_lex_and_buf(r#"&a;"#); 715 716 assert_oks!(for lex and buf ; 717 Token::ReferenceStart 718 Token::Character('a') 719 Token::ReferenceEnd 720 ); 721 lex.reparse("<hi/>").unwrap(); 722 assert_oks!(for lex and buf ; 723 Token::OpeningTagStart 724 Token::Character('h') 725 Token::Character('i') 726 Token::EmptyTagEnd 727 ); 728 assert_none!(for lex and buf); 729 } 730 731 #[test] simple_lexer_test()732 fn simple_lexer_test() { 733 let (mut lex, mut buf) = make_lex_and_buf( 734 r#"<a p='q'> x<b z="y">d </b></a><p/> <?nm ?> <!-- a c --> "# 735 ); 736 737 assert_oks!(for lex and buf ; 738 Token::OpeningTagStart 739 Token::Character('a') 740 Token::Character(' ') 741 Token::Character('p') 742 Token::EqualsSign 743 Token::SingleQuote 744 Token::Character('q') 745 Token::SingleQuote 746 Token::TagEnd 747 Token::Character(' ') 748 Token::Character('x') 749 Token::OpeningTagStart 750 Token::Character('b') 751 Token::Character(' ') 752 
Token::Character('z') 753 Token::EqualsSign 754 Token::DoubleQuote 755 Token::Character('y') 756 Token::DoubleQuote 757 Token::TagEnd 758 Token::Character('d') 759 Token::Character('\t') 760 Token::ClosingTagStart 761 Token::Character('b') 762 Token::TagEnd 763 Token::ClosingTagStart 764 Token::Character('a') 765 Token::TagEnd 766 Token::OpeningTagStart 767 Token::Character('p') 768 Token::EmptyTagEnd 769 Token::Character(' ') 770 Token::ProcessingInstructionStart 771 Token::Character('n') 772 Token::Character('m') 773 Token::Character(' ') 774 Token::ProcessingInstructionEnd 775 Token::Character(' ') 776 Token::CommentStart 777 Token::Character(' ') 778 Token::Character('a') 779 Token::Character(' ') 780 Token::Character('c') 781 Token::Character(' ') 782 Token::CommentEnd 783 Token::Character(' ') 784 Token::ReferenceStart 785 Token::Character('n') 786 Token::Character('b') 787 Token::Character('s') 788 Token::Character('p') 789 Token::ReferenceEnd 790 ); 791 assert_none!(for lex and buf); 792 } 793 794 #[test] special_chars_test()795 fn special_chars_test() { 796 let (mut lex, mut buf) = make_lex_and_buf( 797 r#"?x!+ // -| ]z]]"# 798 ); 799 800 assert_oks!(for lex and buf ; 801 Token::Character('?') 802 Token::Character('x') 803 Token::Character('!') 804 Token::Character('+') 805 Token::Character(' ') 806 Token::Character('/') 807 Token::Character('/') 808 Token::Character(' ') 809 Token::Character('-') 810 Token::Character('|') 811 Token::Character(' ') 812 Token::Character(']') 813 Token::Character('z') 814 Token::Character(']') 815 Token::Character(']') 816 ); 817 assert_none!(for lex and buf); 818 } 819 820 #[test] cdata_test()821 fn cdata_test() { 822 let (mut lex, mut buf) = make_lex_and_buf( 823 r#"<a><![CDATA[x y ?]]> </a>"# 824 ); 825 826 assert_oks!(for lex and buf ; 827 Token::OpeningTagStart 828 Token::Character('a') 829 Token::TagEnd 830 Token::CDataStart 831 Token::Character('x') 832 Token::Character(' ') 833 Token::Character('y') 834 
Token::Character(' ') 835 Token::Character('?') 836 Token::CDataEnd 837 Token::Character(' ') 838 Token::ClosingTagStart 839 Token::Character('a') 840 Token::TagEnd 841 ); 842 assert_none!(for lex and buf); 843 } 844 845 #[test] cdata_closers_test()846 fn cdata_closers_test() { 847 let (mut lex, mut buf) = make_lex_and_buf( 848 r#"<![CDATA[] > ]> ]]><!---->]]<a>"# 849 ); 850 851 assert_oks!(for lex and buf ; 852 Token::CDataStart 853 Token::Character(']') 854 Token::Character(' ') 855 Token::Character('>') 856 Token::Character(' ') 857 Token::Character(']') 858 Token::Character('>') 859 Token::Character(' ') 860 Token::CDataEnd 861 Token::CommentStart 862 Token::CommentEnd 863 Token::Character(']') 864 Token::Character(']') 865 Token::OpeningTagStart 866 Token::Character('a') 867 Token::TagEnd 868 ); 869 assert_none!(for lex and buf); 870 } 871 872 #[test] doctype_test()873 fn doctype_test() { 874 let (mut lex, mut buf) = make_lex_and_buf( 875 r#"<a><!DOCTYPE ab xx z> "# 876 ); 877 assert_oks!(for lex and buf ; 878 Token::OpeningTagStart 879 Token::Character('a') 880 Token::TagEnd 881 Token::DoctypeStart 882 Token::Character(' ') 883 Token::Character('a') 884 Token::Character('b') 885 Token::Character(' ') 886 Token::Character('x') 887 Token::Character('x') 888 Token::Character(' ') 889 Token::Character('z') 890 Token::TagEnd 891 Token::Character(' ') 892 ); 893 assert_none!(for lex and buf); 894 } 895 896 #[test] tricky_comments()897 fn tricky_comments() { 898 let (mut lex, mut buf) = make_lex_and_buf( 899 r#"<a><!-- C ->--></a>"# 900 ); 901 assert_oks!(for lex and buf ; 902 Token::OpeningTagStart 903 Token::Character('a') 904 Token::TagEnd 905 Token::CommentStart 906 Token::Character(' ') 907 Token::Character('C') 908 Token::Character(' ') 909 Token::Character('-') 910 Token::Character('>') 911 Token::CommentEnd 912 Token::ClosingTagStart 913 Token::Character('a') 914 Token::TagEnd 915 ); 916 assert_none!(for lex and buf); 917 } 918 919 #[test] 
doctype_with_internal_subset_test()920 fn doctype_with_internal_subset_test() { 921 let (mut lex, mut buf) = make_lex_and_buf( 922 r#"<a><!DOCTYPE ab[<!ELEMENT ba ">>>"> ]> "# 923 ); 924 assert_oks!(for lex and buf ; 925 Token::OpeningTagStart 926 Token::Character('a') 927 Token::TagEnd 928 Token::DoctypeStart 929 Token::Character(' ') 930 Token::Character('a') 931 Token::Character('b') 932 Token::Character('[') 933 Token::MarkupDeclarationStart 934 Token::Character('E') 935 Token::Character('L') 936 Token::Character('E') 937 Token::Character('M') 938 Token::Character('E') 939 Token::Character('N') 940 Token::Character('T') 941 Token::Character(' ') 942 Token::Character('b') 943 Token::Character('a') 944 Token::Character(' ') 945 Token::DoubleQuote 946 Token::Character('>') 947 Token::Character('>') 948 Token::Character('>') 949 Token::DoubleQuote 950 Token::TagEnd 951 Token::Character(' ') 952 Token::Character(']') 953 Token::TagEnd 954 Token::Character(' ') 955 ); 956 assert_none!(for lex and buf); 957 } 958 959 #[test] doctype_internal_pi_comment()960 fn doctype_internal_pi_comment() { 961 let (mut lex, mut buf) = make_lex_and_buf( 962 "<!DOCTYPE a [\n<!ELEMENT l ANY> <!-- <?non?>--> <?pi > ?> \n]>" 963 ); 964 assert_oks!(for lex and buf ; 965 Token::DoctypeStart 966 Token::Character(' ') 967 Token::Character('a') 968 Token::Character(' ') 969 Token::Character('[') 970 Token::Character('\n') 971 Token::MarkupDeclarationStart 972 Token::Character('E') 973 Token::Character('L') 974 Token::Character('E') 975 Token::Character('M') 976 Token::Character('E') 977 Token::Character('N') 978 Token::Character('T') 979 Token::Character(' ') 980 Token::Character('l') 981 Token::Character(' ') 982 Token::Character('A') 983 Token::Character('N') 984 Token::Character('Y') 985 Token::TagEnd 986 Token::Character(' ') 987 Token::CommentStart 988 Token::Character(' ') 989 Token::Character('<') 990 Token::Character('?') 991 Token::Character('n') 992 Token::Character('o') 993 
Token::Character('n') 994 Token::Character('?') 995 Token::Character('>') 996 Token::CommentEnd 997 Token::Character(' ') 998 Token::ProcessingInstructionStart 999 Token::Character('p') 1000 Token::Character('i') 1001 Token::Character(' ') 1002 Token::TagEnd // not really 1003 Token::Character(' ') 1004 Token::ProcessingInstructionEnd 1005 Token::Character(' ') 1006 Token::Character('\n') 1007 Token::Character(']') 1008 Token::TagEnd // DTD 1009 ); 1010 assert_none!(for lex and buf); 1011 } 1012 1013 #[test] end_of_stream_handling_ok()1014 fn end_of_stream_handling_ok() { 1015 macro_rules! eof_check( 1016 ($data:expr ; $token:expr) => ({ 1017 let (mut lex, mut buf) = make_lex_and_buf($data); 1018 assert_oks!(for lex and buf ; $token); 1019 assert_none!(for lex and buf); 1020 }) 1021 ); 1022 eof_check!("?" ; Token::Character('?')); 1023 eof_check!("/" ; Token::Character('/')); 1024 eof_check!("-" ; Token::Character('-')); 1025 eof_check!("]" ; Token::Character(']')); 1026 eof_check!("]" ; Token::Character(']')); 1027 eof_check!("]" ; Token::Character(']')); 1028 } 1029 1030 #[test] end_of_stream_handling_error()1031 fn end_of_stream_handling_error() { 1032 macro_rules! eof_check( 1033 ($data:expr; $r:expr, $c:expr) => ({ 1034 let (mut lex, mut buf) = make_lex_and_buf($data); 1035 assert_err!(for lex and buf expect row $r ; $c, "Unexpected end of stream"); 1036 assert_none!(for lex and buf); 1037 }) 1038 ); 1039 eof_check!("<" ; 0, 1); 1040 eof_check!("<!" ; 0, 2); 1041 eof_check!("<!-" ; 0, 3); 1042 eof_check!("<![" ; 0, 3); 1043 eof_check!("<![C" ; 0, 4); 1044 eof_check!("<![CD" ; 0, 5); 1045 eof_check!("<![CDA" ; 0, 6); 1046 eof_check!("<![CDAT" ; 0, 7); 1047 eof_check!("<![CDATA" ; 0, 8); 1048 } 1049 1050 #[test] error_in_comment_or_cdata_prefix()1051 fn error_in_comment_or_cdata_prefix() { 1052 let (mut lex, mut buf) = make_lex_and_buf("<!x"); 1053 assert_err!(for lex and buf expect row 0 ; 0, 1054 "Unexpected token '<!' 
before 'x'" 1055 ); 1056 1057 let (mut lex, mut buf) = make_lex_and_buf("<!x"); 1058 lex.disable_errors(); 1059 assert_oks!(for lex and buf ; 1060 Token::Character('<') 1061 Token::Character('!') 1062 Token::Character('x') 1063 ); 1064 assert_none!(for lex and buf); 1065 } 1066 1067 #[test] error_in_comment_started()1068 fn error_in_comment_started() { 1069 let (mut lex, mut buf) = make_lex_and_buf("<!-\t"); 1070 assert_err!(for lex and buf expect row 0 ; 0, 1071 "Unexpected token '<!-' before '\t'" 1072 ); 1073 1074 let (mut lex, mut buf) = make_lex_and_buf("<!-\t"); 1075 lex.disable_errors(); 1076 assert_oks!(for lex and buf ; 1077 Token::Character('<') 1078 Token::Character('!') 1079 Token::Character('-') 1080 Token::Character('\t') 1081 ); 1082 assert_none!(for lex and buf); 1083 } 1084 1085 #[test] error_in_comment_two_dashes_not_at_end()1086 fn error_in_comment_two_dashes_not_at_end() { 1087 let (mut lex, mut buf) = make_lex_and_buf("--x"); 1088 lex.st = super::State::InsideComment; 1089 assert_err!(for lex and buf expect row 0; 0, 1090 "Unexpected token '--' before 'x'" 1091 ); 1092 1093 let (mut lex, mut buf) = make_lex_and_buf("--x"); 1094 assert_oks!(for lex and buf ; 1095 Token::Character('-') 1096 Token::Character('-') 1097 Token::Character('x') 1098 ); 1099 } 1100 1101 macro_rules! 
check_case( 1102 ($chunk:expr, $app:expr; $data:expr; $r:expr, $c:expr, $s:expr) => ({ 1103 let (mut lex, mut buf) = make_lex_and_buf($data); 1104 assert_err!(for lex and buf expect row $r ; $c, $s); 1105 1106 let (mut lex, mut buf) = make_lex_and_buf($data); 1107 lex.disable_errors(); 1108 for c in $chunk.chars() { 1109 assert_eq!(Ok(Some(Token::Character(c))), lex.next_token(&mut buf)); 1110 } 1111 assert_oks!(for lex and buf ; 1112 Token::Character($app) 1113 ); 1114 assert_none!(for lex and buf); 1115 }) 1116 ); 1117 1118 #[test] token_size()1119 fn token_size() { 1120 assert_eq!(4, std::mem::size_of::<Token>()); 1121 assert_eq!(2, std::mem::size_of::<super::State>()); 1122 } 1123 1124 #[test] error_in_cdata_started()1125 fn error_in_cdata_started() { 1126 check_case!("<![", '['; "<![[" ; 0, 0, "Unexpected token '<![' before '['"); 1127 check_case!("<![C", '['; "<![C[" ; 0, 0, "Unexpected token '<![C' before '['"); 1128 check_case!("<![CD", '['; "<![CD[" ; 0, 0, "Unexpected token '<![CD' before '['"); 1129 check_case!("<![CDA", '['; "<![CDA[" ; 0, 0, "Unexpected token '<![CDA' before '['"); 1130 check_case!("<![CDAT", '['; "<![CDAT[" ; 0, 0, "Unexpected token '<![CDAT' before '['"); 1131 check_case!("<![CDATA", '|'; "<![CDATA|" ; 0, 0, "Unexpected token '<![CDATA' before '|'"); 1132 } 1133 1134 #[test] error_in_doctype_started()1135 fn error_in_doctype_started() { 1136 check_case!("<!D", 'a'; "<!Da" ; 0, 0, "Unexpected token '<!D' before 'a'"); 1137 check_case!("<!DO", 'b'; "<!DOb" ; 0, 0, "Unexpected token '<!DO' before 'b'"); 1138 check_case!("<!DOC", 'c'; "<!DOCc" ; 0, 0, "Unexpected token '<!DOC' before 'c'"); 1139 check_case!("<!DOCT", 'd'; "<!DOCTd" ; 0, 0, "Unexpected token '<!DOCT' before 'd'"); 1140 check_case!("<!DOCTY", 'e'; "<!DOCTYe" ; 0, 0, "Unexpected token '<!DOCTY' before 'e'"); 1141 check_case!("<!DOCTYP", 'f'; "<!DOCTYPf" ; 0, 0, "Unexpected token '<!DOCTYP' before 'f'"); 1142 } 1143 1144 1145 1146 #[test] 
issue_98_cdata_ending_with_right_bracket()1147 fn issue_98_cdata_ending_with_right_bracket() { 1148 let (mut lex, mut buf) = make_lex_and_buf( 1149 r#"<![CDATA[Foo [Bar]]]>"# 1150 ); 1151 1152 assert_oks!(for lex and buf ; 1153 Token::CDataStart 1154 Token::Character('F') 1155 Token::Character('o') 1156 Token::Character('o') 1157 Token::Character(' ') 1158 Token::Character('[') 1159 Token::Character('B') 1160 Token::Character('a') 1161 Token::Character('r') 1162 Token::Character(']') 1163 Token::CDataEnd 1164 ); 1165 assert_none!(for lex and buf); 1166 } 1167 } 1168