1# This set of tests is for UTF-8 support and Unicode property support, with 2# relevance only for the 8-bit library. 3 4#newline_default lf any anycrlf 5 6# The next 5 patterns have UTF-8 errors 7 8/[�]/utf 9 10/�/utf 11 12/���xxx/utf 13 14/Â��������/utf 15 16/Â��������/match_invalid_utf 17 18# Now test subjects 19 20/badutf/utf 21\= Expect UTF-8 errors 22 X\xdf 23 XX\xef 24 XXX\xef\x80 25 X\xf7 26 XX\xf7\x80 27 XXX\xf7\x80\x80 28 \xfb 29 \xfb\x80 30 \xfb\x80\x80 31 \xfb\x80\x80\x80 32 \xfd 33 \xfd\x80 34 \xfd\x80\x80 35 \xfd\x80\x80\x80 36 \xfd\x80\x80\x80\x80 37 \xdf\x7f 38 \xef\x7f\x80 39 \xef\x80\x7f 40 \xf7\x7f\x80\x80 41 \xf7\x80\x7f\x80 42 \xf7\x80\x80\x7f 43 \xfb\x7f\x80\x80\x80 44 \xfb\x80\x7f\x80\x80 45 \xfb\x80\x80\x7f\x80 46 \xfb\x80\x80\x80\x7f 47 \xfd\x7f\x80\x80\x80\x80 48 \xfd\x80\x7f\x80\x80\x80 49 \xfd\x80\x80\x7f\x80\x80 50 \xfd\x80\x80\x80\x7f\x80 51 \xfd\x80\x80\x80\x80\x7f 52 \xed\xa0\x80 53 \xc0\x8f 54 \xe0\x80\x8f 55 \xf0\x80\x80\x8f 56 \xf8\x80\x80\x80\x8f 57 \xfc\x80\x80\x80\x80\x8f 58 \x80 59 \xfe 60 \xff 61 62/badutf/utf 63\= Expect UTF-8 errors 64 XX\xfb\x80\x80\x80\x80 65 XX\xfd\x80\x80\x80\x80\x80 66 XX\xf7\xbf\xbf\xbf 67 68/shortutf/utf 69\= Expect UTF-8 errors 70 XX\xdf\=ph 71 XX\xef\=ph 72 XX\xef\x80\=ph 73 \xf7\=ph 74 \xf7\x80\=ph 75 \xf7\x80\x80\=ph 76 \xfb\=ph 77 \xfb\x80\=ph 78 \xfb\x80\x80\=ph 79 \xfb\x80\x80\x80\=ph 80 \xfd\=ph 81 \xfd\x80\=ph 82 \xfd\x80\x80\=ph 83 \xfd\x80\x80\x80\=ph 84 \xfd\x80\x80\x80\x80\=ph 85 86/anything/utf 87\= Expect UTF-8 errors 88 X\xc0\x80 89 XX\xc1\x8f 90 XXX\xe0\x9f\x80 91 \xf0\x8f\x80\x80 92 \xf8\x87\x80\x80\x80 93 \xfc\x83\x80\x80\x80\x80 94 \xfe\x80\x80\x80\x80\x80 95 \xff\x80\x80\x80\x80\x80 96 \xf8\x88\x80\x80\x80 97 \xf9\x87\x80\x80\x80 98 \xfc\x84\x80\x80\x80\x80 99 \xfd\x83\x80\x80\x80\x80 100\= Expect no match 101 \xc3\x8f 102 \xe0\xaf\x80 103 \xe1\x80\x80 104 \xf0\x9f\x80\x80 105 \xf1\x8f\x80\x80 106 \xf8\x88\x80\x80\x80\=no_utf_check 107 \xf9\x87\x80\x80\x80\=no_utf_check 108 \xfc\x84\x80\x80\x80\x80\=no_utf_check 109 \xfd\x83\x80\x80\x80\x80\=no_utf_check 110 111# Similar tests with offsets 112 113/badutf/utf 114\= Expect UTF-8 errors 115 X\xdfabcd 116 X\xdfabcd\=offset=1 117\= Expect no match 118 X\xdfabcd\=offset=2 119 120/(?<=x)badutf/utf 121\= Expect UTF-8 errors 122 X\xdfabcd 123 X\xdfabcd\=offset=1 124 X\xdfabcd\=offset=2 125 X\xdfabcd\xdf\=offset=3 126\= Expect no match 127 X\xdfabcd\=offset=3 128 129/(?<=xx)badutf/utf 130\= Expect UTF-8 errors 131 X\xdfabcd 132 X\xdfabcd\=offset=1 133 X\xdfabcd\=offset=2 134 X\xdfabcd\=offset=3 135 136/(?<=xxxx)badutf/utf 137\= Expect UTF-8 errors 138 X\xdfabcd 139 X\xdfabcd\=offset=1 140 X\xdfabcd\=offset=2 141 X\xdfabcd\=offset=3 142 X\xdfabc\xdf\=offset=6 143 X\xdfabc\xdf\=offset=7 144\= Expect no match 145 X\xdfabcd\=offset=6 146 147/\x{100}/IB,utf 148 149/\x{1000}/IB,utf 150 151/\x{10000}/IB,utf 152 153/\x{100000}/IB,utf 154 155/\x{10ffff}/IB,utf 156 157/[\x{ff}]/IB,utf 158 159/[\x{100}]/IB,utf 160 161/\x80/IB,utf 162 163/\xff/IB,utf 164 165/\x{D55c}\x{ad6d}\x{C5B4}/IB,utf 166 \x{D55c}\x{ad6d}\x{C5B4} 167 168/\x{65e5}\x{672c}\x{8a9e}/IB,utf 169 \x{65e5}\x{672c}\x{8a9e} 170 171/\x{80}/IB,utf 172 173/\x{084}/IB,utf 174 175/\x{104}/IB,utf 176 177/\x{861}/IB,utf 178 179/\x{212ab}/IB,utf 180 181/[^ab\xC0-\xF0]/IB,utf 182 \x{f1} 183 \x{bf} 184 \x{100} 185 \x{1000} 186\= Expect no match 187 \x{c0} 188 \x{f0} 189 190/Ā{3,4}/IB,utf 191 \x{100}\x{100}\x{100}\x{100\x{100} 192 193/(\x{100}+|x)/IB,utf 194 195/(\x{100}*a|x)/IB,utf 196 197/(\x{100}{0,2}a|x)/IB,utf 198 199/(\x{100}{1,2}a|x)/IB,utf 200 201/\x{100}/IB,utf 202 203/a\x{100}\x{101}*/IB,utf 204 205/a\x{100}\x{101}+/IB,utf 206 207/[^\x{c4}]/IB 208 209/[\x{100}]/IB,utf 210 \x{100} 211 Z\x{100} 212 \x{100}Z 213 214/[\xff]/IB,utf 215 >\x{ff}< 216 217/[^\xff]/IB,utf 218 219/\x{100}abc(xyz(?1))/IB,utf 220 221/\777/I,utf 222 \x{1ff} 223 \777 224 225/\x{100}+\x{200}/IB,utf 226 227/\x{100}+X/IB,utf 228 229/^[\QĀ\E-\QŐ\E/B,utf 230 231# This tests the stricter UTF-8 check according to RFC 3629. 232 233/X/utf 234\= Expect UTF-8 errors 235 \x{d800} 236 \x{da00} 237 \x{dfff} 238 \x{110000} 239 \x{2000000} 240 \x{7fffffff} 241\= Expect no match 242 \x{d800}\=no_utf_check 243 \x{da00}\=no_utf_check 244 \x{dfff}\=no_utf_check 245 \x{110000}\=no_utf_check 246 \x{2000000}\=no_utf_check 247 \x{7fffffff}\=no_utf_check 248 249/(*UTF8)\x{1234}/ 250 abcd\x{1234}pqr 251 252/(*CRLF)(*UTF)(*BSR_UNICODE)a\Rb/I 253 254/\h/I,utf 255 ABC\x{09} 256 ABC\x{20} 257 ABC\x{a0} 258 ABC\x{1680} 259 ABC\x{180e} 260 ABC\x{2000} 261 ABC\x{202f} 262 ABC\x{205f} 263 ABC\x{3000} 264 265/\v/I,utf 266 ABC\x{0a} 267 ABC\x{0b} 268 ABC\x{0c} 269 ABC\x{0d} 270 ABC\x{85} 271 ABC\x{2028} 272 273/\h*A/I,utf 274 CDBABC 275 276/\v+A/I,utf 277 278/\s?xxx\s/I,utf 279 280/\sxxx\s/I,utf,tables=2 281 AB\x{85}xxx\x{a0}XYZ 282 AB\x{a0}xxx\x{85}XYZ 283 284/\S \S/I,utf,tables=2 285 \x{a2} \x{84} 286 A Z 287 288/a+/utf 289 a\x{123}aa\=offset=1 290 a\x{123}aa\=offset=3 291 a\x{123}aa\=offset=4 292\= Expect bad offset value 293 a\x{123}aa\=offset=6 294\= Expect bad UTF-8 offset 295 a\x{123}aa\=offset=2 296\= Expect no match 297 a\x{123}aa\=offset=5 298 299/\x{1234}+/Ii,utf 300 301/\x{1234}+?/Ii,utf 302 303/\x{1234}++/Ii,utf 304 305/\x{1234}{2}/Ii,utf 306 307/[^\x{c4}]/IB,utf 308 309/X+\x{200}/IB,utf 310 311/\R/I,utf 312 313/\777/IB,utf 314 315/\w+\x{C4}/B,utf 316 a\x{C4}\x{C4} 317 318/\w+\x{C4}/B,utf,tables=2 319 a\x{C4}\x{C4} 320 321/\W+\x{C4}/B,utf 322 !\x{C4} 323 324/\W+\x{C4}/B,utf,tables=2 325 !\x{C4} 326 327/\W+\x{A1}/B,utf 328 !\x{A1} 329 330/\W+\x{A1}/B,utf,tables=2 331 !\x{A1} 332 333/X\s+\x{A0}/B,utf 334 X\x20\x{A0}\x{A0} 335 336/X\s+\x{A0}/B,utf,tables=2 337 X\x20\x{A0}\x{A0} 338 339/\S+\x{A0}/B,utf 340 X\x{A0}\x{A0} 341 342/\S+\x{A0}/B,utf,tables=2 343 X\x{A0}\x{A0} 344 345/\x{a0}+\s!/B,utf 346 \x{a0}\x20! 347 348/\x{a0}+\s!/B,utf,tables=2 349 \x{a0}\x20! 350 351/A/utf 352 \x{ff000041} 353 \x{7f000041} 354 355/(*UTF8)abc/never_utf 356 357/abc/utf,never_utf 358 359/A\x{391}\x{10427}\x{ff3a}\x{1fb0}/IBi,utf 360 361/A\x{391}\x{10427}\x{ff3a}\x{1fb0}/IB,utf 362 363/AB\x{1fb0}/IB,utf 364 365/AB\x{1fb0}/IBi,utf 366 367/\x{401}\x{420}\x{421}\x{422}\x{423}\x{424}\x{425}\x{426}\x{427}\x{428}\x{429}\x{42a}\x{42b}\x{42c}\x{42d}\x{42e}\x{42f}/Ii,utf 368 \x{401}\x{420}\x{421}\x{422}\x{423}\x{424}\x{425}\x{426}\x{427}\x{428}\x{429}\x{42a}\x{42b}\x{42c}\x{42d}\x{42e}\x{42f} 369 \x{451}\x{440}\x{441}\x{442}\x{443}\x{444}\x{445}\x{446}\x{447}\x{448}\x{449}\x{44a}\x{44b}\x{44c}\x{44d}\x{44e}\x{44f} 370 371/[ⱥ]/Bi,utf 372 373/[^ⱥ]/Bi,utf 374 375/\h/I 376 377/\v/I 378 379/\R/I 380 381/[[:blank:]]/B,ucp 382 383/\x{212a}+/Ii,utf 384 KKkk\x{212a} 385 386/s+/Ii,utf 387 SSss\x{17f} 388 389/\x{100}*A/IB,utf 390 A 391 392/\x{100}*\d(?R)/IB,utf 393 394/[Z\x{100}]/IB,utf 395 Z\x{100} 396 \x{100} 397 \x{100}Z 398 399/[z-\x{100}]/IB,utf 400 401/[z\Qa-d]Ā\E]/IB,utf 402 \x{100} 403 Ā 404 405/[ab\x{100}]abc(xyz(?1))/IB,utf 406 407/\x{100}*\s/IB,utf 408 409/\x{100}*\d/IB,utf 410 411/\x{100}*\w/IB,utf 412 413/\x{100}*\D/IB,utf 414 415/\x{100}*\S/IB,utf 416 417/\x{100}*\W/IB,utf 418 419/[\x{105}-\x{109}]/IBi,utf 420 \x{104} 421 \x{105} 422 \x{109} 423\= Expect no match 424 \x{100} 425 \x{10a} 426 427/[z-\x{100}]/IBi,utf 428 Z 429 z 430 \x{39c} 431 \x{178} 432 | 433 \x{80} 434 \x{ff} 435 \x{100} 436 \x{101} 437\= Expect no match 438 \x{102} 439 Y 440 y 441 442/[z-\x{100}]/IBi,utf 443 444/\x{3a3}B/IBi,utf 445 446/abc/utf,replace=� 447 abc 448 449/(?<=(a)(?-1))x/I,utf 450 a\x80zx\=offset=3 451 452/[\W\p{Any}]/B 453 abc 454 123 455 456/[\W\pL]/B 457 abc 458\= Expect no match 459 123 460 461/(*:*++++++++++++''''''''''''''''''''+''+++'+++x+++++++++++++++++++++++++++++++++++(++++++++++++++++++++:++++++%++:''''''''''''''''''''''''+++++++++++++++++++++++++++++++++++++++++++++++++++++-++++++++k+++++++''''+++'+++++++++++++++++++++++''''++++++++++++':ƿ)/utf 462 463/[\s[:^ascii:]]/B,ucp 464 465# A special extra option allows excaped surrogate code points in 8-bit mode, 466# but subjects containing them must not be UTF-checked. 467 468/\x{d800}/I,utf,allow_surrogate_escapes 469 \x{d800}\=no_utf_check 470 471/\udfff\o{157401}/utf,alt_bsux,allow_surrogate_escapes 472 \x{dfff}\x{df01}\=no_utf_check 473 474# This has different starting code units in 8-bit mode. 475 476/^[^ab]/IB,utf 477 c 478 \x{ff} 479 \x{100} 480\= Expect no match 481 aaa 482 483# Offsets are different in 8-bit mode. 484 485/(?<=abc)(|def)/g,utf,replace=<$0>,substitute_callout 486 123abcáyzabcdef789abcሴqr 487 488# Check name length with non-ASCII characters 489 490/(?'ABáC678901234567890123456789012012345678901234567890123456789AB012345678901234567890123456789AB012345678901234567890123456789AB'...)/utf 491 492/(?'ABáC6789012345678901234567890123012345678901234567890123456789AB012345678901234567890123456789AB012345678901234567890123456789AB'...)/utf 493 494/(?'ABZC6789012345678901234567890123012345678901234567890123456789AB012345678901234567890123456789AB012345678901234567890123456789AB'...)/utf 495 496/(?(n/utf 497 498/(?(á/utf 499 500# Invalid UTF-8 tests 501 502/.../g,match_invalid_utf 503 abcd\x80wxzy\x80pqrs 504 abcd\x{80}wxzy\x80pqrs 505 506/abc/match_invalid_utf 507 ab\x80ab\=ph 508\= Expect no match 509 ab\x80cdef\=ph 510 511/.a/match_invalid_utf 512 ab\=ph 513 ab\=ps 514 b\xf0\x91\x88b\=ph 515 b\xf0\x91\x88b\=ps 516 b\xf0\x91\x88\xb4a 517\= Expect no match 518 b\x80\=ph 519 b\x80\=ps 520 b\xf0\x91\x88\=ph 521 b\xf0\x91\x88\=ps 522 523/.a$/match_invalid_utf 524 ab\=ph 525 ab\=ps 526\= Expect no match 527 b\xf0\x91\x98\=ph 528 b\xf0\x91\x98\=ps 529 530/ab$/match_invalid_utf 531 ab\x80cdeab 532\= Expect no match 533 ab\x80cde 534 535/.../g,match_invalid_utf 536 abcd\x{80}wxzy\x80pqrs 537 538/(?<=x)../g,match_invalid_utf 539 abcd\x{80}wxzy\x80pqrs 540 abcd\x{80}wxzy\x80xpqrs 541 542/X$/match_invalid_utf 543\= Expect no match 544 X\xc4 545 546/(?<=..)X/match_invalid_utf,aftertext 547 AB\x80AQXYZ 548 AB\x80AQXYZ\=offset=5 549 AB\x80\x80AXYZXC\=offset=5 550\= Expect no match 551 AB\x80XYZ 552 AB\x80XYZ\=offset=3 553 AB\xfeXYZ 554 AB\xffXYZ\=offset=3 555 AB\x80AXYZ 556 AB\x80AXYZ\=offset=4 557 AB\x80\x80AXYZ\=offset=5 558 559/.../match_invalid_utf 560 AB\xc4CCC 561\= Expect no match 562 A\x{d800}B 563 A\x{110000}B 564 A\xc4B 565 566/\bX/match_invalid_utf 567 A\x80X 568 569/\BX/match_invalid_utf 570\= Expect no match 571 A\x80X 572 573/(?<=...)X/match_invalid_utf 574 AAA\x80BBBXYZ 575\= Expect no match 576 AAA\x80BXYZ 577 AAA\x80BBXYZ 578 579# ------------------------------------- 580 581/(*UTF)(?=\x{123})/I 582 583/[\x{c1}\x{e1}]X[\x{145}\x{146}]/I,utf 584 585/[,]/BI,utf 586 587/[\x{fff4}-\x{ffff8}]/I,utf 588 589/[\x{fff4}-\x{afff8}\x{10ffff}]/I,utf 590 591/[\xff\x{ffff}]/I,utf 592 593/[\xff\x{ff}]/I,utf 594 abc\x{ff}def 595 596/[\xff\x{ff}]/I 597 abc\x{ff}def 598 599/[Ss]/I 600 601/[Ss]/I,utf 602 603/(?:\x{ff}|\x{3000})/I,utf 604 605/x/utf 606 abxyz 607 \x80\=startchar 608 abc\x80\=startchar 609 abc\x80\=startchar,offset=3 610 611/\x{c1}+\x{e1}/iIB,ucp 612 \x{c1}\x{c1}\x{c1} 613 \x{e1}\x{e1}\x{e1} 614 615/a|\x{c1}/iI,ucp 616 \x{e1}xxx 617 618/a|\x{c1}/iI,utf 619 \x{e1}xxx 620 621/\x{c1}|\x{e1}/iI,ucp 622 623/X(\x{e1})Y/ucp,replace=>\U$1<,substitute_extended 624 X\x{e1}Y 625 626/X(\x{e1})Y/i,ucp,replace=>\L$1<,substitute_extended 627 X\x{c1}Y 628 629# Without UTF or UCP characters > 127 have only one case in the default locale. 630 631/X(\x{e1})Y/replace=>\U$1<,substitute_extended 632 X\x{e1}Y 633 634/A/utf,match_invalid_utf,caseless 635 \xe5A 636 637/\bch\b/utf,match_invalid_utf 638 qchq\=ph 639 qchq\=ps 640 641/line1\nbreak/firstline,utf,match_invalid_utf 642 line1\nbreak 643 line0\nline1\nbreak 644 645/A\z/utf,match_invalid_utf 646 A\x80\x42\n 647 648# End of testinput10 649