xref: /aosp_15_r20/external/pcre/testdata/testinput10 (revision 22dc650d8ae982c6770746019a6f94af92b0f024)
1# This set of tests is for UTF-8 support and Unicode property support, with
2# relevance only for the 8-bit library.
3
4#newline_default lf any anycrlf
5
6# The next 5 patterns have UTF-8 errors
7
8/[�]/utf
9
10/�/utf
11
12/���xxx/utf
13
14/��������/utf
15
16/��������/match_invalid_utf
17
18# Now test subjects
19
20/badutf/utf
21\= Expect UTF-8 errors
22    X\xdf
23    XX\xef
24    XXX\xef\x80
25    X\xf7
26    XX\xf7\x80
27    XXX\xf7\x80\x80
28    \xfb
29    \xfb\x80
30    \xfb\x80\x80
31    \xfb\x80\x80\x80
32    \xfd
33    \xfd\x80
34    \xfd\x80\x80
35    \xfd\x80\x80\x80
36    \xfd\x80\x80\x80\x80
37    \xdf\x7f
38    \xef\x7f\x80
39    \xef\x80\x7f
40    \xf7\x7f\x80\x80
41    \xf7\x80\x7f\x80
42    \xf7\x80\x80\x7f
43    \xfb\x7f\x80\x80\x80
44    \xfb\x80\x7f\x80\x80
45    \xfb\x80\x80\x7f\x80
46    \xfb\x80\x80\x80\x7f
47    \xfd\x7f\x80\x80\x80\x80
48    \xfd\x80\x7f\x80\x80\x80
49    \xfd\x80\x80\x7f\x80\x80
50    \xfd\x80\x80\x80\x7f\x80
51    \xfd\x80\x80\x80\x80\x7f
52    \xed\xa0\x80
53    \xc0\x8f
54    \xe0\x80\x8f
55    \xf0\x80\x80\x8f
56    \xf8\x80\x80\x80\x8f
57    \xfc\x80\x80\x80\x80\x8f
58    \x80
59    \xfe
60    \xff
61
62/badutf/utf
63\= Expect UTF-8 errors
64    XX\xfb\x80\x80\x80\x80
65    XX\xfd\x80\x80\x80\x80\x80
66    XX\xf7\xbf\xbf\xbf
67
68/shortutf/utf
69\= Expect UTF-8 errors
70    XX\xdf\=ph
71    XX\xef\=ph
72    XX\xef\x80\=ph
73    \xf7\=ph
74    \xf7\x80\=ph
75    \xf7\x80\x80\=ph
76    \xfb\=ph
77    \xfb\x80\=ph
78    \xfb\x80\x80\=ph
79    \xfb\x80\x80\x80\=ph
80    \xfd\=ph
81    \xfd\x80\=ph
82    \xfd\x80\x80\=ph
83    \xfd\x80\x80\x80\=ph
84    \xfd\x80\x80\x80\x80\=ph
85
86/anything/utf
87\= Expect UTF-8 errors
88    X\xc0\x80
89    XX\xc1\x8f
90    XXX\xe0\x9f\x80
91    \xf0\x8f\x80\x80
92    \xf8\x87\x80\x80\x80
93    \xfc\x83\x80\x80\x80\x80
94    \xfe\x80\x80\x80\x80\x80
95    \xff\x80\x80\x80\x80\x80
96    \xf8\x88\x80\x80\x80
97    \xf9\x87\x80\x80\x80
98    \xfc\x84\x80\x80\x80\x80
99    \xfd\x83\x80\x80\x80\x80
100\= Expect no match
101    \xc3\x8f
102    \xe0\xaf\x80
103    \xe1\x80\x80
104    \xf0\x9f\x80\x80
105    \xf1\x8f\x80\x80
106    \xf8\x88\x80\x80\x80\=no_utf_check
107    \xf9\x87\x80\x80\x80\=no_utf_check
108    \xfc\x84\x80\x80\x80\x80\=no_utf_check
109    \xfd\x83\x80\x80\x80\x80\=no_utf_check
110
111# Similar tests with offsets
112
113/badutf/utf
114\= Expect UTF-8 errors
115    X\xdfabcd
116    X\xdfabcd\=offset=1
117\= Expect no match
118    X\xdfabcd\=offset=2
119
120/(?<=x)badutf/utf
121\= Expect UTF-8 errors
122    X\xdfabcd
123    X\xdfabcd\=offset=1
124    X\xdfabcd\=offset=2
125    X\xdfabcd\xdf\=offset=3
126\= Expect no match
127    X\xdfabcd\=offset=3
128
129/(?<=xx)badutf/utf
130\= Expect UTF-8 errors
131    X\xdfabcd
132    X\xdfabcd\=offset=1
133    X\xdfabcd\=offset=2
134    X\xdfabcd\=offset=3
135
136/(?<=xxxx)badutf/utf
137\= Expect UTF-8 errors
138    X\xdfabcd
139    X\xdfabcd\=offset=1
140    X\xdfabcd\=offset=2
141    X\xdfabcd\=offset=3
142    X\xdfabc\xdf\=offset=6
143    X\xdfabc\xdf\=offset=7
144\= Expect no match
145    X\xdfabcd\=offset=6
146
147/\x{100}/IB,utf
148
149/\x{1000}/IB,utf
150
151/\x{10000}/IB,utf
152
153/\x{100000}/IB,utf
154
155/\x{10ffff}/IB,utf
156
157/[\x{ff}]/IB,utf
158
159/[\x{100}]/IB,utf
160
161/\x80/IB,utf
162
163/\xff/IB,utf
164
165/\x{D55c}\x{ad6d}\x{C5B4}/IB,utf
166    \x{D55c}\x{ad6d}\x{C5B4}
167
168/\x{65e5}\x{672c}\x{8a9e}/IB,utf
169    \x{65e5}\x{672c}\x{8a9e}
170
171/\x{80}/IB,utf
172
173/\x{084}/IB,utf
174
175/\x{104}/IB,utf
176
177/\x{861}/IB,utf
178
179/\x{212ab}/IB,utf
180
181/[^ab\xC0-\xF0]/IB,utf
182    \x{f1}
183    \x{bf}
184    \x{100}
185    \x{1000}
186\= Expect no match
187    \x{c0}
188    \x{f0}
189
190/Ā{3,4}/IB,utf
191  \x{100}\x{100}\x{100}\x{100\x{100}
192
193/(\x{100}+|x)/IB,utf
194
195/(\x{100}*a|x)/IB,utf
196
197/(\x{100}{0,2}a|x)/IB,utf
198
199/(\x{100}{1,2}a|x)/IB,utf
200
201/\x{100}/IB,utf
202
203/a\x{100}\x{101}*/IB,utf
204
205/a\x{100}\x{101}+/IB,utf
206
207/[^\x{c4}]/IB
208
209/[\x{100}]/IB,utf
210    \x{100}
211    Z\x{100}
212    \x{100}Z
213
214/[\xff]/IB,utf
215    >\x{ff}<
216
217/[^\xff]/IB,utf
218
219/\x{100}abc(xyz(?1))/IB,utf
220
221/\777/I,utf
222  \x{1ff}
223  \777
224
225/\x{100}+\x{200}/IB,utf
226
227/\x{100}+X/IB,utf
228
229/^[\QĀ\E-\QŐ\E/B,utf
230
231# This tests the stricter UTF-8 check according to RFC 3629.
232
233/X/utf
234\= Expect UTF-8 errors
235    \x{d800}
236    \x{da00}
237    \x{dfff}
238    \x{110000}
239    \x{2000000}
240    \x{7fffffff}
241\= Expect no match
242    \x{d800}\=no_utf_check
243    \x{da00}\=no_utf_check
244    \x{dfff}\=no_utf_check
245    \x{110000}\=no_utf_check
246    \x{2000000}\=no_utf_check
247    \x{7fffffff}\=no_utf_check
248
249/(*UTF8)\x{1234}/
250    abcd\x{1234}pqr
251
252/(*CRLF)(*UTF)(*BSR_UNICODE)a\Rb/I
253
254/\h/I,utf
255    ABC\x{09}
256    ABC\x{20}
257    ABC\x{a0}
258    ABC\x{1680}
259    ABC\x{180e}
260    ABC\x{2000}
261    ABC\x{202f}
262    ABC\x{205f}
263    ABC\x{3000}
264
265/\v/I,utf
266    ABC\x{0a}
267    ABC\x{0b}
268    ABC\x{0c}
269    ABC\x{0d}
270    ABC\x{85}
271    ABC\x{2028}
272
273/\h*A/I,utf
274    CDBABC
275
276/\v+A/I,utf
277
278/\s?xxx\s/I,utf
279
280/\sxxx\s/I,utf,tables=2
281    AB\x{85}xxx\x{a0}XYZ
282    AB\x{a0}xxx\x{85}XYZ
283
284/\S \S/I,utf,tables=2
285    \x{a2} \x{84}
286    A Z
287
288/a+/utf
289    a\x{123}aa\=offset=1
290    a\x{123}aa\=offset=3
291    a\x{123}aa\=offset=4
292\= Expect bad offset value
293    a\x{123}aa\=offset=6
294\= Expect bad UTF-8 offset
295    a\x{123}aa\=offset=2
296\= Expect no match
297    a\x{123}aa\=offset=5
298
299/\x{1234}+/Ii,utf
300
301/\x{1234}+?/Ii,utf
302
303/\x{1234}++/Ii,utf
304
305/\x{1234}{2}/Ii,utf
306
307/[^\x{c4}]/IB,utf
308
309/X+\x{200}/IB,utf
310
311/\R/I,utf
312
313/\777/IB,utf
314
315/\w+\x{C4}/B,utf
316    a\x{C4}\x{C4}
317
318/\w+\x{C4}/B,utf,tables=2
319    a\x{C4}\x{C4}
320
321/\W+\x{C4}/B,utf
322    !\x{C4}
323
324/\W+\x{C4}/B,utf,tables=2
325    !\x{C4}
326
327/\W+\x{A1}/B,utf
328    !\x{A1}
329
330/\W+\x{A1}/B,utf,tables=2
331    !\x{A1}
332
333/X\s+\x{A0}/B,utf
334    X\x20\x{A0}\x{A0}
335
336/X\s+\x{A0}/B,utf,tables=2
337    X\x20\x{A0}\x{A0}
338
339/\S+\x{A0}/B,utf
340    X\x{A0}\x{A0}
341
342/\S+\x{A0}/B,utf,tables=2
343    X\x{A0}\x{A0}
344
345/\x{a0}+\s!/B,utf
346    \x{a0}\x20!
347
348/\x{a0}+\s!/B,utf,tables=2
349    \x{a0}\x20!
350
351/A/utf
352  \x{ff000041}
353  \x{7f000041}
354
355/(*UTF8)abc/never_utf
356
357/abc/utf,never_utf
358
359/A\x{391}\x{10427}\x{ff3a}\x{1fb0}/IBi,utf
360
361/A\x{391}\x{10427}\x{ff3a}\x{1fb0}/IB,utf
362
363/AB\x{1fb0}/IB,utf
364
365/AB\x{1fb0}/IBi,utf
366
367/\x{401}\x{420}\x{421}\x{422}\x{423}\x{424}\x{425}\x{426}\x{427}\x{428}\x{429}\x{42a}\x{42b}\x{42c}\x{42d}\x{42e}\x{42f}/Ii,utf
368    \x{401}\x{420}\x{421}\x{422}\x{423}\x{424}\x{425}\x{426}\x{427}\x{428}\x{429}\x{42a}\x{42b}\x{42c}\x{42d}\x{42e}\x{42f}
369    \x{451}\x{440}\x{441}\x{442}\x{443}\x{444}\x{445}\x{446}\x{447}\x{448}\x{449}\x{44a}\x{44b}\x{44c}\x{44d}\x{44e}\x{44f}
370
371/[ⱥ]/Bi,utf
372
373/[^ⱥ]/Bi,utf
374
375/\h/I
376
377/\v/I
378
379/\R/I
380
381/[[:blank:]]/B,ucp
382
383/\x{212a}+/Ii,utf
384    KKkk\x{212a}
385
386/s+/Ii,utf
387    SSss\x{17f}
388
389/\x{100}*A/IB,utf
390    A
391
392/\x{100}*\d(?R)/IB,utf
393
394/[Z\x{100}]/IB,utf
395    Z\x{100}
396    \x{100}
397    \x{100}Z
398
399/[z-\x{100}]/IB,utf
400
401/[z\Qa-d]Ā\E]/IB,utf
402    \x{100}
403    Ā
404
405/[ab\x{100}]abc(xyz(?1))/IB,utf
406
407/\x{100}*\s/IB,utf
408
409/\x{100}*\d/IB,utf
410
411/\x{100}*\w/IB,utf
412
413/\x{100}*\D/IB,utf
414
415/\x{100}*\S/IB,utf
416
417/\x{100}*\W/IB,utf
418
419/[\x{105}-\x{109}]/IBi,utf
420    \x{104}
421    \x{105}
422    \x{109}
423\= Expect no match
424    \x{100}
425    \x{10a}
426
427/[z-\x{100}]/IBi,utf
428    Z
429    z
430    \x{39c}
431    \x{178}
432    |
433    \x{80}
434    \x{ff}
435    \x{100}
436    \x{101}
437\= Expect no match
438    \x{102}
439    Y
440    y
441
442/[z-\x{100}]/IBi,utf
443
444/\x{3a3}B/IBi,utf
445
446/abc/utf,replace=�
447    abc
448
449/(?<=(a)(?-1))x/I,utf
450    a\x80zx\=offset=3
451
452/[\W\p{Any}]/B
453    abc
454    123
455
456/[\W\pL]/B
457    abc
458\= Expect no match
459    123
460
461/(*:*++++++++++++''''''''''''''''''''+''+++'+++x+++++++++++++++++++++++++++++++++++(++++++++++++++++++++:++++++%++:''''''''''''''''''''''''+++++++++++++++++++++++++++++++++++++++++++++++++++++-++++++++k+++++++''''+++'+++++++++++++++++++++++''''++++++++++++':ƿ)/utf
462
463/[\s[:^ascii:]]/B,ucp
464
465# A special extra option allows escaped surrogate code points in 8-bit mode,
466# but subjects containing them must not be UTF-checked.
467
468/\x{d800}/I,utf,allow_surrogate_escapes
469    \x{d800}\=no_utf_check
470
471/\udfff\o{157401}/utf,alt_bsux,allow_surrogate_escapes
472    \x{dfff}\x{df01}\=no_utf_check
473
474# This has different starting code units in 8-bit mode.
475
476/^[^ab]/IB,utf
477    c
478    \x{ff}
479    \x{100}
480\= Expect no match
481    aaa
482
483# Offsets are different in 8-bit mode.
484
485/(?<=abc)(|def)/g,utf,replace=<$0>,substitute_callout
486    123abcáyzabcdef789abcሴqr
487
488# Check name length with non-ASCII characters
489
490/(?'ABáC678901234567890123456789012012345678901234567890123456789AB012345678901234567890123456789AB012345678901234567890123456789AB'...)/utf
491
492/(?'ABáC6789012345678901234567890123012345678901234567890123456789AB012345678901234567890123456789AB012345678901234567890123456789AB'...)/utf
493
494/(?'ABZC6789012345678901234567890123012345678901234567890123456789AB012345678901234567890123456789AB012345678901234567890123456789AB'...)/utf
495
496/(?(n/utf
497
498/(?(á/utf
499
500# Invalid UTF-8 tests
501
502/.../g,match_invalid_utf
503    abcd\x80wxzy\x80pqrs
504    abcd\x{80}wxzy\x80pqrs
505
506/abc/match_invalid_utf
507    ab\x80ab\=ph
508\= Expect no match
509    ab\x80cdef\=ph
510
511/.a/match_invalid_utf
512    ab\=ph
513    ab\=ps
514    b\xf0\x91\x88b\=ph
515    b\xf0\x91\x88b\=ps
516    b\xf0\x91\x88\xb4a
517\= Expect no match
518    b\x80\=ph
519    b\x80\=ps
520    b\xf0\x91\x88\=ph
521    b\xf0\x91\x88\=ps
522
523/.a$/match_invalid_utf
524    ab\=ph
525    ab\=ps
526\= Expect no match
527    b\xf0\x91\x98\=ph
528    b\xf0\x91\x98\=ps
529
530/ab$/match_invalid_utf
531    ab\x80cdeab
532\= Expect no match
533    ab\x80cde
534
535/.../g,match_invalid_utf
536    abcd\x{80}wxzy\x80pqrs
537
538/(?<=x)../g,match_invalid_utf
539    abcd\x{80}wxzy\x80pqrs
540    abcd\x{80}wxzy\x80xpqrs
541
542/X$/match_invalid_utf
543\= Expect no match
544    X\xc4
545
546/(?<=..)X/match_invalid_utf,aftertext
547    AB\x80AQXYZ
548    AB\x80AQXYZ\=offset=5
549    AB\x80\x80AXYZXC\=offset=5
550\= Expect no match
551    AB\x80XYZ
552    AB\x80XYZ\=offset=3
553    AB\xfeXYZ
554    AB\xffXYZ\=offset=3
555    AB\x80AXYZ
556    AB\x80AXYZ\=offset=4
557    AB\x80\x80AXYZ\=offset=5
558
559/.../match_invalid_utf
560    AB\xc4CCC
561\= Expect no match
562    A\x{d800}B
563    A\x{110000}B
564    A\xc4B
565
566/\bX/match_invalid_utf
567    A\x80X
568
569/\BX/match_invalid_utf
570\= Expect no match
571    A\x80X
572
573/(?<=...)X/match_invalid_utf
574    AAA\x80BBBXYZ
575\= Expect no match
576    AAA\x80BXYZ
577    AAA\x80BBXYZ
578
579# -------------------------------------
580
581/(*UTF)(?=\x{123})/I
582
583/[\x{c1}\x{e1}]X[\x{145}\x{146}]/I,utf
584
585/[��,]/BI,utf
586
587/[\x{fff4}-\x{ffff8}]/I,utf
588
589/[\x{fff4}-\x{afff8}\x{10ffff}]/I,utf
590
591/[\xff\x{ffff}]/I,utf
592
593/[\xff\x{ff}]/I,utf
594    abc\x{ff}def
595
596/[\xff\x{ff}]/I
597    abc\x{ff}def
598
599/[Ss]/I
600
601/[Ss]/I,utf
602
603/(?:\x{ff}|\x{3000})/I,utf
604
605/x/utf
606    abxyz
607    \x80\=startchar
608    abc\x80\=startchar
609    abc\x80\=startchar,offset=3
610
611/\x{c1}+\x{e1}/iIB,ucp
612    \x{c1}\x{c1}\x{c1}
613    \x{e1}\x{e1}\x{e1}
614
615/a|\x{c1}/iI,ucp
616    \x{e1}xxx
617
618/a|\x{c1}/iI,utf
619    \x{e1}xxx
620
621/\x{c1}|\x{e1}/iI,ucp
622
623/X(\x{e1})Y/ucp,replace=>\U$1<,substitute_extended
624    X\x{e1}Y
625
626/X(\x{e1})Y/i,ucp,replace=>\L$1<,substitute_extended
627    X\x{c1}Y
628
629# Without UTF or UCP, characters > 127 have only one case in the default locale.
630
631/X(\x{e1})Y/replace=>\U$1<,substitute_extended
632    X\x{e1}Y
633
634/A/utf,match_invalid_utf,caseless
635    \xe5A
636
637/\bch\b/utf,match_invalid_utf
638    qchq\=ph
639    qchq\=ps
640
641/line1\nbreak/firstline,utf,match_invalid_utf
642    line1\nbreak
643    line0\nline1\nbreak
644
645/A\z/utf,match_invalid_utf
646    A\x80\x42\n
647
648# End of testinput10
649