package org.jsoup.parser;

import org.jsoup.UncheckedIOException;
import org.jsoup.helper.Validate;
import org.jspecify.annotations.Nullable;

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Locale;

/**
 CharacterReader consumes tokens off a string. Used internally by jsoup. API subject to changes.
 <p>
 The reader keeps a sliding window ({@code charBuf}) over the underlying {@link Reader}. {@code bufPos} is the cursor
 within that window and {@code readerPos} is the number of characters of the underlying reader that precede the
 window, so the absolute position is {@code readerPos + bufPos} (see {@link #pos()}).
 */
public final class CharacterReader {
    /** Sentinel returned by {@link #current()} and {@link #consume()} when no content is left; equal to {@code '\uFFFF'}. */
    static final char EOF = (char) -1;
    private static final int maxStringCacheLen = 12; // strings longer than this are never cached
    static final int maxBufferLen = 1024 * 32; // visible for testing
    static final int readAheadLimit = (int) (maxBufferLen * 0.75); // visible for testing
    private static final int minReadAheadLen = 1024; // the minimum mark length supported. No HTML entities can be larger than this.

    private char[] charBuf; // set to null by close()
    private Reader reader; // set to null by close()
    private int bufLength; // number of valid chars in charBuf
    private int bufSplitPoint; // once bufPos reaches this point, bufferUp() attempts a refill
    private int bufPos; // cursor within charBuf
    private int readerPos; // count of chars in the underlying reader that precede charBuf[0]
    private int bufMark = -1; // bufPos at the time of mark(), or -1 if no mark is set
    private static final int stringCacheSize = 512;
    private String[] stringCache = new String[stringCacheSize]; // holds reused strings in this doc, to lessen garbage

    @Nullable private ArrayList<Integer> newlinePositions = null; // optionally track the pos() position of newlines - scans during bufferUp()
    private int lineNumberOffset = 1; // line numbers start at 1; += newlinePosition[indexof(pos)]

    /**
     Creates a reader over the supplied input, with a buffer of {@code sz} chars (capped at {@code maxBufferLen}), and
     fills the buffer.
     @param input the content source; must not be null and must support {@link Reader#mark(int)}
     @param sz the requested buffer size
     */
    public CharacterReader(Reader input, int sz) {
        Validate.notNull(input);
        Validate.isTrue(input.markSupported());
        reader = input;
        charBuf = new char[Math.min(sz, maxBufferLen)];
        bufferUp();
    }

    /**
     Creates a reader over the supplied input, using the maximum buffer size.
     @param input the content source; must not be null and must support {@link Reader#mark(int)}
     */
    public CharacterReader(Reader input) {
        this(input, maxBufferLen);
    }

    /**
     Creates a reader over the supplied string, with the buffer sized to the string length (capped at
     {@code maxBufferLen}).
     @param input the content to read
     */
    public CharacterReader(String input) {
        this(new StringReader(input), input.length());
    }

    /**
     Closes the underlying reader and releases the buffer and the string cache. Calling this more than once is a no-op.
     */
    public void close() {
        if (reader == null)
            return;
        try {
            reader.close();
        } catch (IOException ignored) {
            // deliberately ignored: the references are released in the finally block regardless
        } finally {
            reader = null;
            charBuf = null;
            stringCache = null;
        }
    }

    private boolean readFully; // if the underlying stream has been completely read, no value in further buffering

    /**
     Refills the buffer from the underlying reader, if the cursor has reached the split point and the reader has not
     already been fully read. If a mark is set, the content from the mark onwards is retained so that a rewind remains
     possible. Also rescans for newlines (when tracking is enabled) and resets the containsIgnoreCase scan cache.
     @throws UncheckedIOException if the underlying reader throws an IOException
     */
    private void bufferUp() {
        if (readFully || bufPos < bufSplitPoint)
            return;

        final int pos; // how far to advance the underlying reader; becomes index 0 of the refilled buffer
        final int offset; // where the cursor lands within the refilled buffer
        if (bufMark != -1) {
            pos = bufMark;
            offset = bufPos - bufMark;
        } else {
            pos = bufPos;
            offset = 0;
        }

        try {
            final long skipped = reader.skip(pos);
            reader.mark(maxBufferLen);
            int read = 0;
            while (read <= minReadAheadLen) {
                int thisRead = reader.read(charBuf, read, charBuf.length - read);
                if (thisRead == -1)
                    readFully = true;
                if (thisRead <= 0)
                    break;
                read += thisRead;
            }
            // rewind the underlying reader to the start of the window; the next refill skips forward from here
            reader.reset();
            if (read > 0) {
                Validate.isTrue(skipped == pos); // Previously asserted that there is room in buf to skip, so this will be a WTF
                bufLength = read;
                readerPos += pos;
                bufPos = offset;
                if (bufMark != -1)
                    bufMark = 0;
                bufSplitPoint = Math.min(bufLength, readAheadLimit);
            }
        } catch (IOException e) {
            throw new UncheckedIOException(e);
        }
        scanBufferForNewlines(); // if enabled, we index newline positions for line number tracking
        lastIcSeq = null; // cache for last containsIgnoreCase(seq)
    }

    /**
     * Gets the position currently read to in the content. Starts at 0.
     * @return current position
     */
    public int pos() {
        return readerPos + bufPos;
    }

    /** Tests if the buffer has been fully read. */
    boolean readFully() {
        return readFully;
    }

    /**
     Enables or disables line number tracking. By default, will be <b>off</b>. Tracking line numbers improves the
     legibility of parser error messages, for example. Tracking should be enabled before any content is read to be of
     use.

     @param track set tracking on|off
     @since 1.14.3
     */
    public void trackNewlines(boolean track) {
        if (track && newlinePositions == null) {
            newlinePositions = new ArrayList<>(maxBufferLen / 80); // rough guess of likely count
            scanBufferForNewlines(); // first pass when enabled; subsequently called during bufferUp
        }
        else if (!track)
            newlinePositions = null;
    }

    /**
     Check if the tracking of newlines is enabled.
     @return the current newline tracking state
     @since 1.14.3
     */
    public boolean isTrackNewlines() {
        return newlinePositions != null;
    }

    /**
     Get the current line number (that the reader has consumed to). Starts at line #1.
     @return the current line number, or 1 if line tracking is not enabled.
     @since 1.14.3
     @see #trackNewlines(boolean)
     */
    public int lineNumber() {
        return lineNumber(pos());
    }

    /**
     Get the line number of the given position.
     @param pos an absolute position, as returned by {@link #pos()}
     @return the line number, or 1 if line tracking is not enabled
     */
    int lineNumber(int pos) {
        // note that this impl needs to be called before the next buffer up or line numberoffset will be wrong. if that
        // causes issues, can remove the reset of newlinepositions during buffer, at the cost of a larger tracking array
        if (!isTrackNewlines())
            return 1;

        int i = lineNumIndex(pos);
        if (i == -1)
            return lineNumberOffset; // first line
        return i + lineNumberOffset + 1;
    }

    /**
     Get the current column number (that the reader has consumed to). Starts at column #1.
     @return the current column number
     @since 1.14.3
     @see #trackNewlines(boolean)
     */
    public int columnNumber() {
        return columnNumber(pos());
    }

    /**
     Get the column number of the given position.
     @param pos an absolute position, as returned by {@link #pos()}
     @return the column number; {@code pos + 1} if line tracking is not enabled or no tracked newline precedes pos
     */
    int columnNumber(int pos) {
        if (!isTrackNewlines())
            return pos + 1;

        int i = lineNumIndex(pos);
        if (i == -1)
            return pos + 1;
        return pos - newlinePositions.get(i) + 1;
    }

    /**
     Get a formatted string representing the current line and column positions. E.g. <code>5:10</code> indicating line
     number 5 and column number 10.
     @return line:col position
     @since 1.14.3
     @see #trackNewlines(boolean)
     */
    String posLineCol() {
        return lineNumber() + ":" + columnNumber();
    }

    /**
     Finds the index within newlinePositions of the last tracked entry that is at or before pos.
     @return that index; -1 if pos is before the first tracked entry; 0 if tracking is not enabled
     */
    private int lineNumIndex(int pos) {
        if (!isTrackNewlines()) return 0;
        int i = Collections.binarySearch(newlinePositions, pos);
        // not found: binarySearch returns (-(insertion point) - 1), so map to the entry preceding the insertion point
        if (i < -1) i = Math.abs(i) - 2;
        return i;
    }

    /**
     Scans the buffer for newline position, and tracks their location in newlinePositions.
     */
    private void scanBufferForNewlines() {
        if (!isTrackNewlines())
            return;

        if (newlinePositions.size() > 0) {
            // work out the line number that we have read up to (as we have likely scanned past this point)
            int index = lineNumIndex(readerPos);
            if (index == -1) index = 0; // first line
            int linePos = newlinePositions.get(index);
            lineNumberOffset += index; // the num lines we've read up to
            newlinePositions.clear();
            newlinePositions.add(linePos); // roll the last read pos to first, for cursor num after buffer
        }

        for (int i = bufPos; i < bufLength; i++) {
            if (charBuf[i] == '\n')
                newlinePositions.add(1 + readerPos + i); // records the position just after the newline
        }
    }

    /**
     * Tests if all the content has been read.
     * @return true if nothing left to read.
     */
    public boolean isEmpty() {
        bufferUp();
        return bufPos >= bufLength;
    }

    /** Tests if the cursor is at or beyond the end of the current buffer, without attempting a refill. */
    private boolean isEmptyNoBufferUp() {
        return bufPos >= bufLength;
    }

    /**
     * Get the char at the current position.
     * @return char
     */
    public char current() {
        bufferUp();
        return isEmptyNoBufferUp() ? EOF : charBuf[bufPos];
    }

    /**
     Reads the char at the current position and advances by one. The position is advanced even when there is no
     content left.
     @return the char read, or {@link #EOF} if there is no content left
     */
    char consume() {
        bufferUp();
        char val = isEmptyNoBufferUp() ? EOF : charBuf[bufPos];
        bufPos++;
        return val;
    }

    /**
     Unconsume one character (bufPos--). MUST only be called directly after a consume(), and no chance of a bufferUp.
     */
    void unconsume() {
        if (bufPos < 1)
            throw new UncheckedIOException(new IOException("WTF: No buffer left to unconsume.")); // a bug if this fires, need to trace it.

        bufPos--;
    }

    /**
     * Moves the current position by one.
     */
    public void advance() {
        bufPos++;
    }

    /**
     Marks the current position, so that it can later be returned to with {@link #rewindToMark()}.
     */
    void mark() {
        // make sure there is enough look ahead capacity
        if (bufLength - bufPos < minReadAheadLen)
            bufSplitPoint = 0; // lets the following bufferUp() refill (unless the reader is already fully read)

        bufferUp();
        bufMark = bufPos;
    }

    /** Clears the mark, if any. */
    void unmark() {
        bufMark = -1;
    }

    /**
     Moves the current position back to the mark, and clears the mark.
     @throws UncheckedIOException if no mark is set
     */
    void rewindToMark() {
        if (bufMark == -1)
            throw new UncheckedIOException(new IOException("Mark invalid"));

        bufPos = bufMark;
        unmark();
    }

    /**
     * Returns the number of characters between the current position and the next instance of the input char
     * @param c scan target
     * @return offset between current position and next instance of target. -1 if not found.
     */
    int nextIndexOf(char c) {
        // doesn't handle scanning for surrogates
        bufferUp();
        for (int i = bufPos; i < bufLength; i++) {
            if (c == charBuf[i])
                return i - bufPos;
        }
        return -1;
    }

    /**
     * Returns the number of characters between the current position and the next instance of the input sequence
     *
     * @param seq scan target; must not be empty, as its first char is read unconditionally
     * @return offset between current position and next instance of target. -1 if not found.
     */
    int nextIndexOf(CharSequence seq) {
        bufferUp();
        // doesn't handle scanning for surrogates
        char startChar = seq.charAt(0);
        for (int offset = bufPos; offset < bufLength; offset++) {
            // scan to first instance of startchar:
            if (startChar != charBuf[offset])
                while(++offset < bufLength && startChar != charBuf[offset]) { /* empty */ }
            int i = offset + 1;
            int last = i + seq.length()-1;
            if (offset < bufLength && last <= bufLength) {
                for (int j = 1; i < last && seq.charAt(j) == charBuf[i]; i++, j++) { /* empty */ }
                if (i == last) // found full sequence
                    return offset - bufPos;
            }
        }
        return -1;
    }

    /**
     * Reads characters up to the specific char.
     * @param c the delimiter
     * @return the chars read
     */
    public String consumeTo(char c) {
        int offset = nextIndexOf(c);
        if (offset != -1) {
            String consumed = cacheString(charBuf, stringCache, bufPos, offset);
            bufPos += offset;
            return consumed;
        } else {
            return consumeToEnd();
        }
    }

    /**
     Reads characters up to (but not including) the specified sequence. If the sequence is not found within the
     current buffer, consumes as much of the buffer as cannot contain the start of the sequence.
     @param seq the delimiter sequence
     @return the chars read
     */
    String consumeTo(String seq) {
        int offset = nextIndexOf(seq);
        if (offset != -1) {
            String consumed = cacheString(charBuf, stringCache, bufPos, offset);
            bufPos += offset;
            return consumed;
        } else if (bufLength - bufPos < seq.length()) {
            // nextIndexOf() did a bufferUp(), so if the buffer is shorter than the search string, we must be at EOF
            return consumeToEnd();
        } else {
            // the string we're looking for may be straddling a buffer boundary, so keep (length - 1) characters
            // unread in case they contain the beginning of the search string
            int endPos = bufLength - seq.length() + 1;
            String consumed = cacheString(charBuf, stringCache, bufPos, endPos - bufPos);
            bufPos = endPos;
            return consumed;
        }
    }

    /**
     * Read characters until the first of any delimiters is found.
     * @param chars delimiters to scan for
     * @return characters read up to the matched delimiter.
     */
    public String consumeToAny(final char... chars) {
        bufferUp();
        int pos = bufPos;
        final int start = pos;
        final int remaining = bufLength;
        final char[] val = charBuf;
        final int charLen = chars.length;
        int i;

        OUTER: while (pos < remaining) {
            for (i = 0; i < charLen; i++) {
                if (val[pos] == chars[i])
                    break OUTER;
            }
            pos++;
        }

        bufPos = pos;
        return pos > start ? cacheString(charBuf, stringCache, start, pos -start) : "";
    }

    /**
     Read characters until the first of any delimiters is found. The delimiters are located with a binary search, so
     the supplied array must be sorted.
     @param chars delimiters to scan for, in sorted order
     @return characters read up to the matched delimiter.
     */
    String consumeToAnySorted(final char... chars) {
        bufferUp();
        int pos = bufPos;
        final int start = pos;
        final int remaining = bufLength;
        final char[] val = charBuf;

        while (pos < remaining) {
            if (Arrays.binarySearch(chars, val[pos]) >= 0)
                break;
            pos++;
        }
        bufPos = pos;
        return bufPos > start ? cacheString(charBuf, stringCache, start, pos -start) : "";
    }

    /**
     Reads characters from the current buffer up to the first of {@code &}, {@code <}, or the null char.
     @return the chars read; empty if the current char is one of those delimiters, or the buffer is exhausted
     */
    String consumeData() {
        // &, <, null
        //bufferUp(); // no need to bufferUp, just called consume()
        int pos = bufPos;
        final int start = pos;
        final int remaining = bufLength;
        final char[] val = charBuf;

        OUTER: while (pos < remaining) {
            switch (val[pos]) {
                case '&':
                case '<':
                case TokeniserState.nullChar:
                    break OUTER;
                default:
                    pos++;
            }
        }
        bufPos = pos;
        return pos > start ? cacheString(charBuf, stringCache, start, pos -start) : "";
    }

    /**
     Reads characters from the current buffer up to the first of {@code &}, the null char, or the closing quote.
     @param single if true the closing quote is {@code '}; otherwise it is {@code "}
     @return the chars read; empty if the current char is one of those delimiters, or the buffer is exhausted
     */
    String consumeAttributeQuoted(final boolean single) {
        // null, " or ', &
        //bufferUp(); // no need to bufferUp, just called consume()
        int pos = bufPos;
        final int start = pos;
        final int remaining = bufLength;
        final char[] val = charBuf;

        OUTER: while (pos < remaining) {
            switch (val[pos]) {
                case '&':
                case TokeniserState.nullChar:
                    break OUTER;
                case '\'':
                    if (single) break OUTER;
                    break;
                case '"':
                    if (!single) break OUTER;
                    break;
            }
            pos++;
        }
        bufPos = pos;
        return pos > start ? cacheString(charBuf, stringCache, start, pos -start) : "";
    }

    /**
     Reads characters from the current buffer up to the first of {@code <} or the null char.
     @return the chars read; empty if the current char is one of those delimiters, or the buffer is exhausted
     */
    String consumeRawData() {
        // <, null
        //bufferUp(); // no need to bufferUp, just called consume()
        int pos = bufPos;
        final int start = pos;
        final int remaining = bufLength;
        final char[] val = charBuf;

        OUTER: while (pos < remaining) {
            switch (val[pos]) {
                case '<':
                case TokeniserState.nullChar:
                    break OUTER;
                default:
                    pos++;
            }
        }
        bufPos = pos;
        return pos > start ? cacheString(charBuf, stringCache, start, pos -start) : "";
    }

    /**
     Reads characters up to the first tag name terminator: tab, newline, carriage return, form feed, space,
     {@code /}, {@code >}, or {@code <}.
     @return the chars read; empty if the current char is a terminator, or the buffer is exhausted
     */
    String consumeTagName() {
        // '\t', '\n', '\r', '\f', ' ', '/', '>'
        // NOTE: out of spec, added '<' to fix common author bugs; does not stop and append on nullChar but eats
        bufferUp();
        int pos = bufPos;
        final int start = pos;
        final int remaining = bufLength;
        final char[] val = charBuf;

        OUTER: while (pos < remaining) {
            switch (val[pos]) {
                case '\t':
                case '\n':
                case '\r':
                case '\f':
                case ' ':
                case '/':
                case '>':
                case '<':
                    break OUTER;
            }
            pos++;
        }

        bufPos = pos;
        return pos > start ? cacheString(charBuf, stringCache, start, pos -start) : "";
    }

    /**
     Reads all of the characters remaining in the current buffer.
     @return the chars read
     */
    String consumeToEnd() {
        bufferUp();
        String data = cacheString(charBuf, stringCache, bufPos, bufLength - bufPos);
        bufPos = bufLength;
        return data;
    }

    /**
     Reads a run of letters (ASCII A-Z / a-z, or any char for which {@link Character#isLetter(char)} is true).
     @return the chars read; empty if the current char is not a letter
     */
    String consumeLetterSequence() {
        bufferUp();
        int start = bufPos;
        while (bufPos < bufLength) {
            char c = charBuf[bufPos];
            if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || Character.isLetter(c))
                bufPos++;
            else
                break;
        }

        return cacheString(charBuf, stringCache, start, bufPos - start);
    }

    /**
     Reads a run of letters, followed by a run of ASCII digits (0-9). Either run may be empty.
     @return the chars read
     */
    String consumeLetterThenDigitSequence() {
        bufferUp();
        int start = bufPos;
        while (bufPos < bufLength) {
            char c = charBuf[bufPos];
            if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || Character.isLetter(c))
                bufPos++;
            else
                break;
        }
        while (!isEmptyNoBufferUp()) {
            char c = charBuf[bufPos];
            if (c >= '0' && c <= '9')
                bufPos++;
            else
                break;
        }

        return cacheString(charBuf, stringCache, start, bufPos - start);
    }

    /**
     Reads a run of hexadecimal digits (0-9, A-F, a-f).
     @return the chars read; empty if the current char is not a hex digit
     */
    String consumeHexSequence() {
        bufferUp();
        int start = bufPos;
        while (bufPos < bufLength) {
            char c = charBuf[bufPos];
            if ((c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f'))
                bufPos++;
            else
                break;
        }
        return cacheString(charBuf, stringCache, start, bufPos - start);
    }

    /**
     Reads a run of ASCII digits (0-9).
     @return the chars read; empty if the current char is not a digit
     */
    String consumeDigitSequence() {
        bufferUp();
        int start = bufPos;
        while (bufPos < bufLength) {
            char c = charBuf[bufPos];
            if (c >= '0' && c <= '9')
                bufPos++;
            else
                break;
        }
        return cacheString(charBuf, stringCache, start, bufPos - start);
    }

    /** Tests if the char at the current position is the supplied char. Does not consume. */
    boolean matches(char c) {
        return !isEmpty() && charBuf[bufPos] == c;

    }

    /** Tests if the content at the current position starts with the supplied sequence (case sensitive). Does not consume. */
    boolean matches(String seq) {
        bufferUp();
        int scanLength = seq.length();
        if (scanLength > bufLength - bufPos)
            return false;

        for (int offset = 0; offset < scanLength; offset++)
            if (seq.charAt(offset) != charBuf[bufPos +offset])
                return false;
        return true;
    }

    /**
     Tests if the content at the current position starts with the supplied sequence, comparing each char after
     {@link Character#toUpperCase(char)}. Does not consume.
     */
    boolean matchesIgnoreCase(String seq) {
        bufferUp();
        int scanLength = seq.length();
        if (scanLength > bufLength - bufPos)
            return false;

        for (int offset = 0; offset < scanLength; offset++) {
            char upScan = Character.toUpperCase(seq.charAt(offset));
            char upTarget = Character.toUpperCase(charBuf[bufPos + offset]);
            if (upScan != upTarget)
                return false;
        }
        return true;
    }

    /** Tests if the char at the current position is any of the supplied chars. Does not consume. */
    boolean matchesAny(char... seq) {
        if (isEmpty())
            return false;

        bufferUp();
        char c = charBuf[bufPos];
        for (char seek : seq) {
            if (seek == c)
                return true;
        }
        return false;
    }

    /**
     Tests if the char at the current position is any of the supplied chars, located by binary search. Does not consume.
     @param seq the candidate chars, in sorted order
     */
    boolean matchesAnySorted(char[] seq) {
        bufferUp();
        return !isEmpty() && Arrays.binarySearch(seq, charBuf[bufPos]) >= 0;
    }

    /**
     Tests if the char at the current position is a letter (ASCII A-Z / a-z, or any char for which
     {@link Character#isLetter(char)} is true). Does not consume.
     */
    boolean matchesLetter() {
        if (isEmpty())
            return false;
        char c = charBuf[bufPos];
        return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || Character.isLetter(c);
    }

    /**
     Checks if the current pos matches an ascii alpha (A-Z a-z) per https://infra.spec.whatwg.org/#ascii-alpha
     @return if it matches or not
     */
    boolean matchesAsciiAlpha() {
        if (isEmpty())
            return false;
        char c = charBuf[bufPos];
        return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z');
    }

    /** Tests if the char at the current position is an ASCII digit (0-9). Does not consume. */
    boolean matchesDigit() {
        if (isEmpty())
            return false;
        char c = charBuf[bufPos];
        return (c >= '0' && c <= '9');
    }

    /**
     If the content at the current position starts with the supplied sequence (case sensitive), consumes it.
     @return true if the sequence matched and was consumed
     */
    boolean matchConsume(String seq) {
        bufferUp();
        if (matches(seq)) {
            bufPos += seq.length();
            return true;
        } else {
            return false;
        }
    }

    /**
     If the content at the current position starts with the supplied sequence (ignoring case), consumes it.
     @return true if the sequence matched and was consumed
     */
    boolean matchConsumeIgnoreCase(String seq) {
        if (matchesIgnoreCase(seq)) {
            bufPos += seq.length();
            return true;
        } else {
            return false;
        }
    }

    // we maintain a cache of the previously scanned sequence, and return that if applicable on repeated scans.
    // that improves the situation where there is a sequence of <p<p<p<p<p<p<p...</title> and we're bashing on the <p
    // looking for the </title>. Resets in bufferUp()
    @Nullable private String lastIcSeq; // scan cache
    private int lastIcIndex; // nearest found indexOf

    /** Used to check presence of &lt;/title&gt;, &lt;/style&gt; when we're in RCData and see a &lt;xxx. Only finds consistent case. */
    boolean containsIgnoreCase(String seq) {
        if (seq.equals(lastIcSeq)) {
            if (lastIcIndex == -1) return false;
            if (lastIcIndex >= bufPos) return true;
        }
        lastIcSeq = seq;

        String loScan = seq.toLowerCase(Locale.ENGLISH);
        int lo = nextIndexOf(loScan);
        if (lo > -1) {
            lastIcIndex = bufPos + lo; return true;
        }

        String hiScan = seq.toUpperCase(Locale.ENGLISH);
        int hi = nextIndexOf(hiScan);
        boolean found = hi > -1;
        lastIcIndex = found ? bufPos + hi : -1; // we don't care about finding the nearest, just that buf contains
        return found;
    }

    /**
     Gets the unconsumed content of the current buffer.
     @return the remaining buffered chars; an empty string if the position is beyond the end of the buffer, or if this
     reader has been closed
     */
    @Override
    public String toString() {
        // close() nulls charBuf; guard so that toString (e.g. from a debugger or log statement) never throws an NPE
        if (charBuf == null || bufLength - bufPos < 0)
            return "";
        return new String(charBuf, bufPos, bufLength - bufPos);
    }

    /**
     * Caches short strings, as a flyweight pattern, to reduce GC load. Just for this doc, to prevent leaks.
     * <p>
     * Simplistic, and on hash collisions just falls back to creating a new string, vs a full HashMap with Entry list.
     * That saves both having to create objects as hash keys, and running through the entry list, at the expense of
     * some more duplicates.
     */
    private static String cacheString(final char[] charBuf, final String[] stringCache, final int start, final int count) {
        // limit (no cache):
        if (count > maxStringCacheLen)
            return new String(charBuf, start, count);
        if (count < 1)
            return "";

        // calculate hash:
        int hash = 0;
        for (int i = 0; i < count; i++) {
            hash = 31 * hash + charBuf[start + i];
        }

        // get from cache
        final int index = hash & stringCacheSize - 1; // i.e. hash & (stringCacheSize - 1): '-' binds tighter than '&'
        String cached = stringCache[index];

        if (cached != null && rangeEquals(charBuf, start, count, cached)) // positive hit
            return cached;
        else {
            cached = new String(charBuf, start, count);
            stringCache[index] = cached; // add or replace, assuming most recently used are most likely to recur next
        }

        return cached;
    }

    /**
     * Check if the value of the provided range equals the string.
     */
    static boolean rangeEquals(final char[] charBuf, final int start, int count, final String cached) {
        if (count == cached.length()) {
            int i = start;
            int j = 0;
            while (count-- != 0) {
                if (charBuf[i++] != cached.charAt(j++))
                    return false;
            }
            return true;
        }
        return false;
    }

    // just used for testing
    boolean rangeEquals(final int start, final int count, final String cached) {
        return rangeEquals(charBuf, start, count, cached);
    }
}