1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Original API code Copyright (c) 1997-2012 University of Cambridge
10 New API code Copyright (c) 2016 University of Cambridge
11
12 -----------------------------------------------------------------------------
13 Redistribution and use in source and binary forms, with or without
14 modification, are permitted provided that the following conditions are met:
15
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
18
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
26
27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37 POSSIBILITY OF SUCH DAMAGE.
38 -----------------------------------------------------------------------------
39 */
40
41 #ifdef HAVE_CONFIG_H
42 #include "config.h"
43 #endif
44
45 #include <stdio.h>
46 #include <string.h>
47
48 #define PCRE2_CODE_UNIT_WIDTH 0
49 #include "pcre2.h"
50
51 /*
52 Letter characters:
53 \xe6\x92\xad = 0x64ad = 25773 (kanji)
54 Non-letter characters:
55 \xc2\xa1 = 0xa1 = (Inverted Exclamation Mark)
56 \xf3\xa9\xb7\x80 = 0xe9dc0 = 957888
57 \xed\xa0\x80 = 55296 = 0xd800 (Invalid UTF character)
58 \xed\xb0\x80 = 56320 = 0xdc00 (Invalid UTF character)
59 Newlines:
60 \xc2\x85 = 0x85 = 133 (NExt Line = NEL)
61 \xe2\x80\xa8 = 0x2028 = 8232 (Line Separator)
62 Othercase pairs:
63 \xc3\xa9 = 0xe9 = 233 (e')
64 \xc3\x89 = 0xc9 = 201 (E')
65 \xc3\xa1 = 0xe1 = 225 (a')
66 \xc3\x81 = 0xc1 = 193 (A')
67 \x53 = 0x53 = S
68 \x73 = 0x73 = s
69 \xc5\xbf = 0x17f = 383 (long S)
70 \xc8\xba = 0x23a = 570
71 \xe2\xb1\xa5 = 0x2c65 = 11365
72 \xe1\xbd\xb8 = 0x1f78 = 8056
73 \xe1\xbf\xb8 = 0x1ff8 = 8184
74 \xf0\x90\x90\x80 = 0x10400 = 66560
75 \xf0\x90\x90\xa8 = 0x10428 = 66600
76 \xc7\x84 = 0x1c4 = 452
77 \xc7\x85 = 0x1c5 = 453
78 \xc7\x86 = 0x1c6 = 454
79 Caseless sets:
80 ucp_Armenian - \x{531}-\x{556} -> \x{561}-\x{586}
81 ucp_Coptic - \x{2c80}-\x{2ce3} -> caseless: XOR 0x1
82 ucp_Latin - \x{ff21}-\x{ff3a} -> \x{ff41}-\x{ff5a}
83
84 Mark property:
85 \xcc\x8d = 0x30d = 781
86 Special:
87 \xc2\x80 = 0x80 = 128 (lowest 2 byte character)
88 \xdf\xbf = 0x7ff = 2047 (highest 2 byte character)
89 \xe0\xa0\x80 = 0x800 = 2048 (lowest 3 byte character)
90 \xef\xbf\xbf = 0xffff = 65535 (highest 3 byte character)
91 \xf0\x90\x80\x80 = 0x10000 = 65536 (lowest 4 byte character)
92 \xf4\x8f\xbf\xbf = 0x10ffff = 1114111 (highest allowed utf character)
93 */
94
/* Forward declarations of the test suites driven by main(). Each one takes
no arguments and returns an int status that main() folds into the process
exit status. */
static int regression_tests(void);
static int invalid_utf8_regression_tests(void);
static int invalid_utf16_regression_tests(void);
static int invalid_utf32_regression_tests(void);

/* Program entry point.

Asks the library (through whichever code unit width is enabled first: 8, 16,
then 32) whether JIT support is available. If it is not, a message is printed
and 1 is returned without running any test.

Otherwise all four test suites are run, always in the order listed below, and
the bitwise OR of their return values is returned. A bitwise OR (rather than
a logical one) is used so that a non-zero result from an earlier suite never
prevents the later suites from running. */
int main(void)
{
	/* PCRE2_CONFIG_JIT stores its answer in a uint32_t, so the output
	variable must have exactly that type. It stays 0 if the query fails. */
	uint32_t jit = 0;
	int status = 0;

#if defined SUPPORT_PCRE2_8
	pcre2_config_8(PCRE2_CONFIG_JIT, &jit);
#elif defined SUPPORT_PCRE2_16
	pcre2_config_16(PCRE2_CONFIG_JIT, &jit);
#elif defined SUPPORT_PCRE2_32
	pcre2_config_32(PCRE2_CONFIG_JIT, &jit);
#endif

	if (!jit) {
		printf("JIT must be enabled to run pcre2_jit_test\n");
		return 1;
	}

	/* Separate statements give the suites a well-defined execution order;
	the operands of a single a() | b() expression may be evaluated in any
	order. */
	status |= regression_tests();
	status |= invalid_utf8_regression_tests();
	status |= invalid_utf16_regression_tests();
	status |= invalid_utf32_regression_tests();

	return status;
}
119
120 /* --------------------------------------------------------------------------------------- */
121
122 #if !(defined SUPPORT_PCRE2_8) && !(defined SUPPORT_PCRE2_16) && !(defined SUPPORT_PCRE2_32)
123 #error SUPPORT_PCRE2_8 or SUPPORT_PCRE2_16 or SUPPORT_PCRE2_32 must be defined
124 #endif
125
/* Short names for the compile option combinations used in the first column
of the test table. The letters stand for the options that are combined:
C = PCRE2_CASELESS, M = PCRE2_MULTILINE, U = PCRE2_UTF, P = PCRE2_UCP. */
#define M	(PCRE2_MULTILINE)
#define U	(PCRE2_UTF)
#define MU	(PCRE2_MULTILINE | PCRE2_UTF)
#define MP	(PCRE2_MULTILINE | PCRE2_UCP)
#define MUP	(PCRE2_MULTILINE | PCRE2_UTF | PCRE2_UCP)
#define CM	(PCRE2_CASELESS | PCRE2_MULTILINE)
#define CMU	(PCRE2_CASELESS | PCRE2_MULTILINE | PCRE2_UTF)
#define CMUP	(PCRE2_CASELESS | PCRE2_MULTILINE | PCRE2_UTF | PCRE2_UCP)

/* The "newline" column of the test table packs two values into one int:
the newline convention in the low 16 bits and a \R (BSR) setting in the bits
above them. BSR() moves a BSR value into position; GET_NEWLINE() and
GET_BSR() split a packed value again. A is the newline convention used by
most table entries. */
#define A		PCRE2_NEWLINE_ANYCRLF
#define BSR(x)		((x) << 16)
#define GET_NEWLINE(x)	((x) & 0xffff)
#define GET_BSR(x)	((x) >> 16)

/* The "start_offset" column of the test table has F_* flag bits ORed into
it (for example "1 | F_NOMATCH"). OFFSET_MASK covers the low 16 bits, which
lie below every F_* flag; presumably it extracts the plain start offset.
The flag meanings are not visible in this part of the file; going by their
names, F_NO8/F_NO16/F_NO32 presumably exclude a code unit width and
F_NOMATCH presumably marks an expected non-match - TODO confirm against
the code that consumes them.
NOTE(review): F_NO32 has the same value as F_NO16, so the two cannot be told
apart - confirm that this is intentional. */
#define OFFSET_MASK	0x00ffff
#define F_NO8		0x010000
#define F_NO16		0x020000
#define F_NO32		0x020000
#define F_NOMATCH	0x040000
#define F_DIFF		0x080000
#define F_FORCECONV	0x100000
#define F_PROPERTY	0x200000
149
/* One row of the regression test table. The table is filled in with
positional initialisers, so the order of the members below must not be
changed. */
struct regression_test_case {
	/* Compile option bits; the table uses raw PCRE2_* options or the
	   M/U/MU/CMU/... shorthand macros here. */
	uint32_t compile_options;

	/* Newline convention, optionally ORed with a BSR(...) value that
	   occupies the bits above the low 16. */
	int newline;

	/* Match-time option bits (the table uses values such as
	   PCRE2_NOTBOL, PCRE2_NOTEOL and PCRE2_NO_UTF_CHECK, or 0). */
	int match_options;

	/* Start offset within the input, with F_* flag bits ORed in above
	   the low 16 bits (for example "1 | F_NOMATCH"). */
	int start_offset;

	/* Pattern, written as a C string literal in the table. */
	const char *pattern;

	/* Subject string the pattern is paired with in the table. */
	const char *input;
};
158
159 static struct regression_test_case regression_test_cases[] = {
160 /* Constant strings. */
161 { MU, A, 0, 0, "AbC", "AbAbC" },
162 { MU, A, 0, 0, "ACCEPT", "AACACCACCEACCEPACCEPTACCEPTT" },
163 { CMU, A, 0, 0, "aA#\xc3\xa9\xc3\x81", "aA#Aa#\xc3\x89\xc3\xa1" },
164 { M, A, 0, 0, "[^a]", "aAbB" },
165 { CM, A, 0, 0, "[^m]", "mMnN" },
166 { M, A, 0, 0, "a[^b][^#]", "abacd" },
167 { CM, A, 0, 0, "A[^B][^E]", "abacd" },
168 { CMU, A, 0, 0, "[^x][^#]", "XxBll" },
169 { MU, A, 0, 0, "[^a]", "aaa\xc3\xa1#Ab" },
170 { CMU, A, 0, 0, "[^A]", "aA\xe6\x92\xad" },
171 { MU, A, 0, 0, "\\W(\\W)?\\w", "\r\n+bc" },
172 { MU, A, 0, 0, "\\W(\\W)?\\w", "\n\r+bc" },
173 { MU, A, 0, 0, "\\W(\\W)?\\w", "\r\r+bc" },
174 { MU, A, 0, 0, "\\W(\\W)?\\w", "\n\n+bc" },
175 { MU, A, 0, 0, "[axd]", "sAXd" },
176 { CMU, A, 0, 0, "[axd]", "sAXd" },
177 { CMU, A, 0, 0 | F_NOMATCH, "[^axd]", "DxA" },
178 { MU, A, 0, 0, "[a-dA-C]", "\xe6\x92\xad\xc3\xa9.B" },
179 { MU, A, 0, 0, "[^a-dA-C]", "\xe6\x92\xad\xc3\xa9" },
180 { CMU, A, 0, 0, "[^\xc3\xa9]", "\xc3\xa9\xc3\x89." },
181 { MU, A, 0, 0, "[^\xc3\xa9]", "\xc3\xa9\xc3\x89." },
182 { MU, A, 0, 0, "[^a]", "\xc2\x80[]" },
183 { CMU, A, 0, 0, "\xf0\x90\x90\xa7", "\xf0\x90\x91\x8f" },
184 { CM, A, 0, 0, "1a2b3c4", "1a2B3c51A2B3C4" },
185 { PCRE2_CASELESS, 0, 0, 0, "\xff#a", "\xff#\xff\xfe##\xff#A" },
186 { PCRE2_CASELESS, 0, 0, 0, "\xfe", "\xff\xfc#\xfe\xfe" },
187 { PCRE2_CASELESS, 0, 0, 0, "a1", "Aa1" },
188 #ifndef NEVER_BACKSLASH_C
189 { M, A, 0, 0, "\\Ca", "cda" },
190 { CM, A, 0, 0, "\\Ca", "CDA" },
191 { M, A, 0, 0 | F_NOMATCH, "\\Cx", "cda" },
192 { CM, A, 0, 0 | F_NOMATCH, "\\Cx", "CDA" },
193 #endif /* !NEVER_BACKSLASH_C */
194 { CMUP, A, 0, 0, "\xf0\x90\x90\x80\xf0\x90\x90\xa8", "\xf0\x90\x90\xa8\xf0\x90\x90\x80" },
195 { CMUP, A, 0, 0, "\xf0\x90\x90\x80{2}", "\xf0\x90\x90\x80#\xf0\x90\x90\xa8\xf0\x90\x90\x80" },
196 { CMUP, A, 0, 0, "\xf0\x90\x90\xa8{2}", "\xf0\x90\x90\x80#\xf0\x90\x90\xa8\xf0\x90\x90\x80" },
197 { CMUP, A, 0, 0, "\xe1\xbd\xb8\xe1\xbf\xb8", "\xe1\xbf\xb8\xe1\xbd\xb8" },
198 { M, A, 0, 0, "[3-57-9]", "5" },
199 { PCRE2_AUTO_CALLOUT, A, 0, 0, "12345678901234567890123456789012345678901234567890123456789012345678901234567890",
200 "12345678901234567890123456789012345678901234567890123456789012345678901234567890" },
201 { 0, A, 0, 0, "..a.......b", "bbbbbbbbbbbbbbbbbbbbbabbbbbbbb" },
202 { 0, A, 0, 0, "..a.....b", "bbbbbbbbbbbbbbbbbbbbbabbbbbbbb" },
203
204 /* Assertions. */
205 { MU, A, 0, 0, "\\b[^A]", "A_B#" },
206 { M, A, 0, 0 | F_NOMATCH, "\\b\\W", "\n*" },
207 { MU, A, 0, 0, "\\B[^,]\\b[^s]\\b", "#X" },
208 { MP, A, 0, 0, "\\B", "_\xa1" },
209 { MP, A, 0, 0 | F_PROPERTY, "\\b_\\b[,A]\\B", "_," },
210 { MUP, A, 0, 0, "\\b", "\xe6\x92\xad!" },
211 { MUP, A, 0, 0, "\\B", "_\xc2\xa1\xc3\xa1\xc2\x85" },
212 { MUP, A, 0, 0, "\\b[^A]\\B[^c]\\b[^_]\\B", "_\xc3\xa1\xe2\x80\xa8" },
213 { MUP, A, 0, 0, "\\b\\w+\\B", "\xc3\x89\xc2\xa1\xe6\x92\xad\xc3\x81\xc3\xa1" },
214 { MU, A, 0, 0 | F_NOMATCH, "\\b.", "\xcd\xbe" },
215 { CMUP, A, 0, 0, "\\By", "\xf0\x90\x90\xa8y" },
216 { M, A, 0, 0 | F_NOMATCH, "\\R^", "\n" },
217 { M, A, 0, 1 | F_NOMATCH, "^", "\n" },
218 { 0, 0, 0, 0, "^ab", "ab" },
219 { 0, 0, 0, 0 | F_NOMATCH, "^ab", "aab" },
220 { M, PCRE2_NEWLINE_CRLF, 0, 0, "^a", "\r\raa\n\naa\r\naa" },
221 { MU, A, 0, 0, "^-", "\xe2\x80\xa8--\xc2\x85-\r\n-" },
222 { M, PCRE2_NEWLINE_ANY, 0, 0, "^-", "a--b--\x85--" },
223 { MU, PCRE2_NEWLINE_ANY, 0, 0, "^-", "a--\xe2\x80\xa8--" },
224 { MU, PCRE2_NEWLINE_ANY, 0, 0, "^-", "a--\xc2\x85--" },
225 { 0, 0, 0, 0, "ab$", "ab" },
226 { 0, 0, 0, 0 | F_NOMATCH, "ab$", "abab\n\n" },
227 { PCRE2_DOLLAR_ENDONLY, 0, 0, 0 | F_NOMATCH, "ab$", "abab\r\n" },
228 { M, PCRE2_NEWLINE_CRLF, 0, 0, "a$", "\r\raa\n\naa\r\naa" },
229 { M, PCRE2_NEWLINE_ANY, 0, 0, "a$", "aaa" },
230 { MU, PCRE2_NEWLINE_ANYCRLF, 0, 0, "#$", "#\xc2\x85###\r#" },
231 { MU, PCRE2_NEWLINE_ANY, 0, 0, "#$", "#\xe2\x80\xa9" },
232 { 0, PCRE2_NEWLINE_ANY, PCRE2_NOTBOL, 0 | F_NOMATCH, "^a", "aa\naa" },
233 { M, PCRE2_NEWLINE_ANY, PCRE2_NOTBOL, 0, "^a", "aa\naa" },
234 { 0, PCRE2_NEWLINE_ANY, PCRE2_NOTEOL, 0 | F_NOMATCH, "a$", "aa\naa" },
235 { 0, PCRE2_NEWLINE_ANY, PCRE2_NOTEOL, 0 | F_NOMATCH, "a$", "aa\r\n" },
236 { U | PCRE2_DOLLAR_ENDONLY, PCRE2_NEWLINE_ANY, 0, 0 | F_PROPERTY, "\\p{Any}{2,}$", "aa\r\n" },
237 { M, PCRE2_NEWLINE_ANY, PCRE2_NOTEOL, 0, "a$", "aa\naa" },
238 { 0, PCRE2_NEWLINE_CR, 0, 0, ".\\Z", "aaa" },
239 { U, PCRE2_NEWLINE_CR, 0, 0, "a\\Z", "aaa\r" },
240 { 0, PCRE2_NEWLINE_CR, 0, 0, ".\\Z", "aaa\n" },
241 { 0, PCRE2_NEWLINE_CRLF, 0, 0, ".\\Z", "aaa\r" },
242 { U, PCRE2_NEWLINE_CRLF, 0, 0, ".\\Z", "aaa\n" },
243 { 0, PCRE2_NEWLINE_CRLF, 0, 0, ".\\Z", "aaa\r\n" },
244 { U, PCRE2_NEWLINE_ANYCRLF, 0, 0, ".\\Z", "aaa" },
245 { U, PCRE2_NEWLINE_ANYCRLF, 0, 0, ".\\Z", "aaa\r" },
246 { U, PCRE2_NEWLINE_ANYCRLF, 0, 0, ".\\Z", "aaa\n" },
247 { U, PCRE2_NEWLINE_ANYCRLF, 0, 0, ".\\Z", "aaa\r\n" },
248 { U, PCRE2_NEWLINE_ANYCRLF, 0, 0, ".\\Z", "aaa\xe2\x80\xa8" },
249 { U, PCRE2_NEWLINE_ANYCRLF, 0, 0, ".\\Z", "aaa" },
250 { U, PCRE2_NEWLINE_ANYCRLF, 0, 0, ".\\Z", "aaa\r" },
251 { U, PCRE2_NEWLINE_ANYCRLF, 0, 0, ".\\Z", "aaa\n" },
252 { U, PCRE2_NEWLINE_ANYCRLF, 0, 0, ".\\Z", "aaa\r\n" },
253 { U, PCRE2_NEWLINE_ANY, 0, 0, ".\\Z", "aaa\xc2\x85" },
254 { U, PCRE2_NEWLINE_ANY, 0, 0, ".\\Z", "aaa\xe2\x80\xa8" },
255 { M, A, 0, 0, "\\Aa", "aaa" },
256 { M, A, 0, 1 | F_NOMATCH, "\\Aa", "aaa" },
257 { M, A, 0, 1, "\\Ga", "aaa" },
258 { M, A, 0, 1 | F_NOMATCH, "\\Ga", "aba" },
259 { M, A, 0, 0, "a\\z", "aaa" },
260 { M, A, 0, 0 | F_NOMATCH, "a\\z", "aab" },
261
262 /* Brackets and alternatives. */
263 { MU, A, 0, 0, "(ab|bb|cd)", "bacde" },
264 { MU, A, 0, 0, "(?:ab|a)(bc|c)", "ababc" },
265 { MU, A, 0, 0, "((ab|(cc))|(bb)|(?:cd|efg))", "abac" },
266 { CMU, A, 0, 0, "((aB|(Cc))|(bB)|(?:cd|EFg))", "AcCe" },
267 { MU, A, 0, 0, "((ab|(cc))|(bb)|(?:cd|ebg))", "acebebg" },
268 { MU, A, 0, 0, "(?:(a)|(?:b))(cc|(?:d|e))(a|b)k", "accabdbbccbk" },
269 { MU, A, 0, 0, "\xc7\x82|\xc6\x82", "\xf1\x83\x82\x82\xc7\x82\xc7\x83" },
270 { MU, A, 0, 0, "=\xc7\x82|#\xc6\x82", "\xf1\x83\x82\x82=\xc7\x82\xc7\x83" },
271 { MU, A, 0, 0, "\xc7\x82\xc7\x83|\xc6\x82\xc6\x82", "\xf1\x83\x82\x82\xc7\x82\xc7\x83" },
272 { MU, A, 0, 0, "\xc6\x82\xc6\x82|\xc7\x83\xc7\x83|\xc8\x84\xc8\x84", "\xf1\x83\x82\x82\xc8\x84\xc8\x84" },
273 { U, A, 0, 0, "\xe1\x81\x80|\xe2\x82\x80|\xe4\x84\x80", "\xdf\xbf\xc2\x80\xe4\x84\x80" },
274 { U, A, 0, 0, "(?:\xe1\x81\x80|\xe2\x82\x80|\xe4\x84\x80)#", "\xdf\xbf\xc2\x80#\xe4\x84\x80#" },
275 { CM, A, 0, 0, "ab|cd", "CD" },
276 { CM, A, 0, 0, "a1277|a1377|bX487", "bx487" },
277 { CM, A, 0, 0, "a1277|a1377|bx487", "bX487" },
278 { 0, A, 0, 0, "(a|)b*+a", "a" },
279 { 0, A, 0, 0 | F_NOMATCH, "(.|.|.|.|.)(|.|.|.|.)(.||.|.|.)(.|.||.|.)(.|.|.||.)(.|.|.|.|)(A|.|.|.|.)(.|A|.|.|.)(.|.|A|.|.)(.|.|.|A|.)(.|.|.|.|A)(B|.|.|.|.)(.|B|.|.|.)(.|.|B|.|.)(.|.|.|B|.)(.|.|.|.|B)xa", "1234567890123456ax" },
280
281 /* Greedy and non-greedy ? operators. */
282 { MU, A, 0, 0, "(?:a)?a", "laab" },
283 { CMU, A, 0, 0, "(A)?A", "llaab" },
284 { MU, A, 0, 0, "(a)?\?a", "aab" }, /* ?? is the prefix of trygraphs in GCC. */
285 { MU, A, 0, 0, "(a)?a", "manm" },
286 { CMU, A, 0, 0, "(a|b)?\?d((?:e)?)", "ABABdx" },
287 { MU, A, 0, 0, "(a|b)?\?d((?:e)?)", "abcde" },
288 { MU, A, 0, 0, "((?:ab)?\?g|b(?:g(nn|d)?\?)?)?\?(?:n)?m", "abgnbgnnbgdnmm" },
289
290 /* Greedy and non-greedy + operators */
291 { MU, A, 0, 0, "(aa)+aa", "aaaaaaa" },
292 { MU, A, 0, 0, "(aa)+?aa", "aaaaaaa" },
293 { MU, A, 0, 0, "(?:aba|ab|a)+l", "ababamababal" },
294 { MU, A, 0, 0, "(?:aba|ab|a)+?l", "ababamababal" },
295 { MU, A, 0, 0, "(a(?:bc|cb|b|c)+?|ss)+e", "accssabccbcacbccbbXaccssabccbcacbccbbe" },
296 { MU, A, 0, 0, "(a(?:bc|cb|b|c)+|ss)+?e", "accssabccbcacbccbbXaccssabccbcacbccbbe" },
297 { MU, A, 0, 0, "(?:(b(c)+?)+)?\?(?:(bc)+|(cb)+)+(?:m)+", "bccbcccbcbccbcbPbccbcccbcbccbcbmmn" },
298 { MU, A, 0, 0, "(aa|bb){8,1000}", "abaabbaabbaabbaab_aabbaabbaabbaabbaabbaabb_" },
299
300 /* Greedy and non-greedy * operators */
301 { CMU, A, 0, 0, "(?:AA)*AB", "aaaaaaamaaaaaaab" },
302 { MU, A, 0, 0, "(?:aa)*?ab", "aaaaaaamaaaaaaab" },
303 { MU, A, 0, 0, "(aa|ab)*ab", "aaabaaab" },
304 { CMU, A, 0, 0, "(aa|Ab)*?aB", "aaabaaab" },
305 { MU, A, 0, 0, "(a|b)*(?:a)*(?:b)*m", "abbbaaababanabbbaaababamm" },
306 { MU, A, 0, 0, "(a|b)*?(?:a)*?(?:b)*?m", "abbbaaababanabbbaaababamm" },
307 { M, A, 0, 0, "a(a(\\1*)a|(b)b+){0}a", "aa" },
308 { M, A, 0, 0, "((?:a|)*){0}a", "a" },
309
310 /* Combining ? + * operators */
311 { MU, A, 0, 0, "((bm)+)?\?(?:a)*(bm)+n|((am)+?)?(?:a)+(am)*n", "bmbmabmamaaamambmaman" },
312 { MU, A, 0, 0, "(((ab)?cd)*ef)+g", "abcdcdefcdefefmabcdcdefcdefefgg" },
313 { MU, A, 0, 0, "(((ab)?\?cd)*?ef)+?g", "abcdcdefcdefefmabcdcdefcdefefgg" },
314 { MU, A, 0, 0, "(?:(ab)?c|(?:ab)+?d)*g", "ababcdccababddg" },
315 { MU, A, 0, 0, "(?:(?:ab)?\?c|(ab)+d)*?g", "ababcdccababddg" },
316
317 /* Single character iterators. */
318 { MU, A, 0, 0, "(a+aab)+aaaab", "aaaabcaaaabaabcaabcaaabaaaab" },
319 { MU, A, 0, 0, "(a*a*aab)+x", "aaaaabaabaaabmaabx" },
320 { MU, A, 0, 0, "(a*?(b|ab)a*?)+x", "aaaabcxbbaabaacbaaabaabax" },
321 { MU, A, 0, 0, "(a+(ab|ad)a+)+x", "aaabaaaadaabaaabaaaadaaax" },
322 { MU, A, 0, 0, "(a?(a)a?)+(aaa)", "abaaabaaaaaaaa" },
323 { MU, A, 0, 0, "(a?\?(a)a?\?)+(b)", "aaaacaaacaacacbaaab" },
324 { MU, A, 0, 0, "(a{0,4}(b))+d", "aaaaaabaabcaaaaabaaaaabd" },
325 { MU, A, 0, 0, "(a{0,4}?[^b])+d+(a{0,4}[^b])d+", "aaaaadaaaacaadddaaddd" },
326 { MU, A, 0, 0, "(ba{2})+c", "baabaaabacbaabaac" },
327 { MU, A, 0, 0, "(a*+bc++)+", "aaabbcaaabcccab" },
328 { MU, A, 0, 0, "(a?+[^b])+", "babaacacb" },
329 { MU, A, 0, 0, "(a{0,3}+b)(a{0,3}+b)(a{0,3}+)[^c]", "abaabaaacbaabaaaac" },
330 { CMU, A, 0, 0, "([a-c]+[d-f]+?)+?g", "aBdacdehAbDaFgA" },
331 { CMU, A, 0, 0, "[c-f]+k", "DemmFke" },
332 { MU, A, 0, 0, "([DGH]{0,4}M)+", "GGDGHDGMMHMDHHGHM" },
333 { MU, A, 0, 0, "([a-c]{4,}s)+", "abasabbasbbaabsbba" },
334 { CMU, A, 0, 0, "[ace]{3,7}", "AcbDAcEEcEd" },
335 { CMU, A, 0, 0, "[ace]{3,7}?", "AcbDAcEEcEd" },
336 { CMU, A, 0, 0, "[ace]{3,}", "AcbDAcEEcEd" },
337 { CMU, A, 0, 0, "[ace]{3,}?", "AcbDAcEEcEd" },
338 { MU, A, 0, 0, "[ckl]{2,}?g", "cdkkmlglglkcg" },
339 { CMU, A, 0, 0, "[ace]{5}?", "AcCebDAcEEcEd" },
340 { MU, A, 0, 0, "([AbC]{3,5}?d)+", "BACaAbbAEAACCbdCCbdCCAAbb" },
341 { MU, A, 0, 0, "([^ab]{0,}s){2}", "abaabcdsABamsDDs" },
342 { MU, A, 0, 0, "\\b\\w+\\B", "x,a_cd" },
343 { MUP, A, 0, 0, "\\b[^\xc2\xa1]+\\B", "\xc3\x89\xc2\xa1\xe6\x92\xad\xc3\x81\xc3\xa1" },
344 { CMU, A, 0, 0, "[^b]+(a*)([^c]?d{3})", "aaaaddd" },
345 { CMUP, A, 0, 0, "\xe1\xbd\xb8{2}", "\xe1\xbf\xb8#\xe1\xbf\xb8\xe1\xbd\xb8" },
346 { CMU, A, 0, 0, "[^\xf0\x90\x90\x80]{2,4}@", "\xf0\x90\x90\xa8\xf0\x90\x90\x80###\xf0\x90\x90\x80@@@" },
347 { CMU, A, 0, 0, "[^\xe1\xbd\xb8][^\xc3\xa9]", "\xe1\xbd\xb8\xe1\xbf\xb8\xc3\xa9\xc3\x89#" },
348 { MU, A, 0, 0, "[^\xe1\xbd\xb8][^\xc3\xa9]", "\xe1\xbd\xb8\xe1\xbf\xb8\xc3\xa9\xc3\x89#" },
349 { MU, A, 0, 0, "[^\xe1\xbd\xb8]{3,}?", "##\xe1\xbd\xb8#\xe1\xbd\xb8#\xc3\x89#\xe1\xbd\xb8" },
350 { MU, A, 0, 0, "\\d+123", "987654321,01234" },
351 { MU, A, 0, 0, "abcd*|\\w+xy", "aaaaa,abxyz" },
352 { MU, A, 0, 0, "(?:abc|((?:amc|\\b\\w*xy)))", "aaaaa,abxyz" },
353 { MU, A, 0, 0, "a(?R)|([a-z]++)#", ".abcd.abcd#."},
354 { MU, A, 0, 0, "a(?R)|([a-z]++)#", ".abcd.mbcd#."},
355 { MU, A, 0, 0, ".[ab]*.", "xx" },
356 { MU, A, 0, 0, ".[ab]*a", "xxa" },
357 { MU, A, 0, 0, ".[ab]?.", "xx" },
358 { MU, A, 0, 0, "_[ab]+_*a", "_aa" },
359 { MU, A, 0, 0, "#(A+)#\\d+", "#A#A#0" },
360 { MU, A, 0, 0, "(?P<size>\\d+)m|M", "4M" },
361 { M, PCRE2_NEWLINE_CRLF, 0, 0, "\\n?.+#", "\n,\n,#" },
362 { 0, A, 0, 0, "<(\\w+)[\\s\\w]+id>", "<br><div id>" },
363
364 /* Bracket repeats with limit. */
365 { MU, A, 0, 0, "(?:(ab){2}){5}M", "abababababababababababM" },
366 { MU, A, 0, 0, "(?:ab|abab){1,5}M", "abababababababababababM" },
367 { MU, A, 0, 0, "(?>ab|abab){1,5}M", "abababababababababababM" },
368 { MU, A, 0, 0, "(?:ab|abab){1,5}?M", "abababababababababababM" },
369 { MU, A, 0, 0, "(?>ab|abab){1,5}?M", "abababababababababababM" },
370 { MU, A, 0, 0, "(?:(ab){1,4}?){1,3}?M", "abababababababababababababM" },
371 { MU, A, 0, 0, "(?:(ab){1,4}){1,3}abababababababababababM", "ababababababababababababM" },
372 { MU, A, 0, 0 | F_NOMATCH, "(?:(ab){1,4}){1,3}abababababababababababM", "abababababababababababM" },
373 { MU, A, 0, 0, "(ab){4,6}?M", "abababababababM" },
374
375 /* Basic character sets. */
376 { MU, A, 0, 0, "(?:\\s)+(?:\\S)+", "ab \t\xc3\xa9\xe6\x92\xad " },
377 { MU, A, 0, 0, "(\\w)*(k)(\\W)?\?", "abcdef abck11" },
378 { MU, A, 0, 0, "\\((\\d)+\\)\\D", "a() (83 (8)2 (9)ab" },
379 { MU, A, 0, 0, "\\w(\\s|(?:\\d)*,)+\\w\\wb", "a 5, 4,, bb 5, 4,, aab" },
380 { MU, A, 0, 0, "(\\v+)(\\V+)", "\x0e\xc2\x85\xe2\x80\xa8\x0b\x09\xe2\x80\xa9" },
381 { MU, A, 0, 0, "(\\h+)(\\H+)", "\xe2\x80\xa8\xe2\x80\x80\x20\xe2\x80\x8a\xe2\x81\x9f\xe3\x80\x80\x09\x20\xc2\xa0\x0a" },
382 { MU, A, 0, 0, "x[bcef]+", "xaxdxecbfg" },
383 { MU, A, 0, 0, "x[bcdghij]+", "xaxexfxdgbjk" },
384 { MU, A, 0, 0, "x[^befg]+", "xbxexacdhg" },
385 { MU, A, 0, 0, "x[^bcdl]+", "xlxbxaekmd" },
386 { MU, A, 0, 0, "x[^bcdghi]+", "xbxdxgxaefji" },
387 { MU, A, 0, 0, "x[B-Fb-f]+", "xaxAxgxbfBFG" },
388 { CMU, A, 0, 0, "\\x{e9}+", "#\xf0\x90\x90\xa8\xc3\xa8\xc3\xa9\xc3\x89\xc3\x88" },
389 { CMU, A, 0, 0, "[^\\x{e9}]+", "\xc3\xa9#\xf0\x90\x90\xa8\xc3\xa8\xc3\x88\xc3\x89" },
390 { MU, A, 0, 0, "[\\x02\\x7e]+", "\xc3\x81\xe1\xbf\xb8\xf0\x90\x90\xa8\x01\x02\x7e\x7f" },
391 { MU, A, 0, 0, "[^\\x02\\x7e]+", "\x02\xc3\x81\xe1\xbf\xb8\xf0\x90\x90\xa8\x01\x7f\x7e" },
392 { MU, A, 0, 0, "[\\x{81}-\\x{7fe}]+", "#\xe1\xbf\xb8\xf0\x90\x90\xa8\xc2\x80\xc2\x81\xdf\xbe\xdf\xbf" },
393 { MU, A, 0, 0, "[^\\x{81}-\\x{7fe}]+", "\xc2\x81#\xe1\xbf\xb8\xf0\x90\x90\xa8\xc2\x80\xdf\xbf\xdf\xbe" },
394 { MU, A, 0, 0, "[\\x{801}-\\x{fffe}]+", "#\xc3\xa9\xf0\x90\x90\x80\xe0\xa0\x80\xe0\xa0\x81\xef\xbf\xbe\xef\xbf\xbf" },
395 { MU, A, 0, 0, "[^\\x{801}-\\x{fffe}]+", "\xe0\xa0\x81#\xc3\xa9\xf0\x90\x90\x80\xe0\xa0\x80\xef\xbf\xbf\xef\xbf\xbe" },
396 { MU, A, 0, 0, "[\\x{10001}-\\x{10fffe}]+", "#\xc3\xa9\xe2\xb1\xa5\xf0\x90\x80\x80\xf0\x90\x80\x81\xf4\x8f\xbf\xbe\xf4\x8f\xbf\xbf" },
397 { MU, A, 0, 0, "[^\\x{10001}-\\x{10fffe}]+", "\xf0\x90\x80\x81#\xc3\xa9\xe2\xb1\xa5\xf0\x90\x80\x80\xf4\x8f\xbf\xbf\xf4\x8f\xbf\xbe" },
398 { CMU, A, 0, 0 | F_NOMATCH | F_PROPERTY, "^[\\x{100}-\\x{17f}]", " " },
399 { M, A, 0, 0 | F_NOMATCH, "[^\\S\\W]{6}", "abcdefghijk" },
400
401 /* Unicode properties. */
402 { MUP, A, 0, 0, "[1-5\xc3\xa9\\w]", "\xc3\xa1_" },
403 { MUP, A, 0, 0 | F_PROPERTY, "[\xc3\x81\\p{Ll}]", "A_\xc3\x89\xc3\xa1" },
404 { MUP, A, 0, 0, "[\\Wd-h_x-z]+", "a\xc2\xa1#_yhzdxi" },
405 { MUP, A, 0, 0 | F_NOMATCH | F_PROPERTY, "[\\P{Any}]", "abc" },
406 { MUP, A, 0, 0 | F_NOMATCH | F_PROPERTY, "[^\\p{Any}]", "abc" },
407 { MUP, A, 0, 0 | F_NOMATCH | F_PROPERTY, "[\\P{Any}\xc3\xa1-\xc3\xa8]", "abc" },
408 { MUP, A, 0, 0 | F_NOMATCH | F_PROPERTY, "[^\\p{Any}\xc3\xa1-\xc3\xa8]", "abc" },
409 { MUP, A, 0, 0 | F_NOMATCH | F_PROPERTY, "[\xc3\xa1-\xc3\xa8\\P{Any}]", "abc" },
410 { MUP, A, 0, 0 | F_NOMATCH | F_PROPERTY, "[^\xc3\xa1-\xc3\xa8\\p{Any}]", "abc" },
411 { MUP, A, 0, 0 | F_PROPERTY, "[\xc3\xa1-\xc3\xa8\\p{Any}]", "abc" },
412 { MUP, A, 0, 0 | F_PROPERTY, "[^\xc3\xa1-\xc3\xa8\\P{Any}]", "abc" },
413 { MUP, A, 0, 0, "[b-\xc3\xa9\\s]", "a\xc\xe6\x92\xad" },
414 { CMUP, A, 0, 0, "[\xc2\x85-\xc2\x89\xc3\x89]", "\xc2\x84\xc3\xa9" },
415 { MUP, A, 0, 0, "[^b-d^&\\s]{3,}", "db^ !a\xe2\x80\xa8_ae" },
416 { MUP, A, 0, 0 | F_PROPERTY, "[^\\S\\P{Any}][\\sN]{1,3}[\\P{N}]{4}", "\xe2\x80\xaa\xa N\x9\xc3\xa9_0" },
417 { MU, A, 0, 0 | F_PROPERTY, "[^\\P{L}\x9!D-F\xa]{2,3}", "\x9,.DF\xa.CG\xc3\x81" },
418 { CMUP, A, 0, 0, "[\xc3\xa1-\xc3\xa9_\xe2\x80\xa0-\xe2\x80\xaf]{1,5}[^\xe2\x80\xa0-\xe2\x80\xaf]", "\xc2\xa1\xc3\x89\xc3\x89\xe2\x80\xaf_\xe2\x80\xa0" },
419 { MUP, A, 0, 0 | F_PROPERTY, "[\xc3\xa2-\xc3\xa6\xc3\x81-\xc3\x84\xe2\x80\xa8-\xe2\x80\xa9\xe6\x92\xad\\p{Zs}]{2,}", "\xe2\x80\xa7\xe2\x80\xa9\xe6\x92\xad \xe6\x92\xae" },
420 { MUP, A, 0, 0 | F_PROPERTY, "[\\P{L&}]{2}[^\xc2\x85-\xc2\x89\\p{Ll}\\p{Lu}]{2}", "\xc3\xa9\xe6\x92\xad.a\xe6\x92\xad|\xc2\x8a#" },
421 { PCRE2_UCP, 0, 0, 0 | F_PROPERTY, "[a-b\\s]{2,5}[^a]", "AB baaa" },
422 { MUP, 0, 0, 0 | F_NOMATCH | F_PROPERTY, "[^\\p{Hangul}\\p{Z}]", " " },
423 { MUP, 0, 0, 0, "[\\p{Lu}\\P{Latin}]+", "c\xEA\xA4\xAE,A,b" },
424 { MUP, 0, 0, 0, "[\\x{a92e}\\p{Lu}\\P{Latin}]+", "c\xEA\xA4\xAE,A,b" },
425 { CMUP, 0, 0, 0, "[^S]\\B", "\xe2\x80\x8a" },
426 { MUP, 0, 0, 0 | F_NOMATCH, "[^[:print:]\\x{f6f6}]", "\xef\x9b\xb6" },
427 { MUP, 0, 0, 0, "[[:xdigit:]\\x{6500}]#", "\xe6\x94\x80#" },
428 { MUP, 0, 0, 0 | F_PROPERTY, "[\\pC\\PC]#", "A#" },
429
430 /* Possible empty brackets. */
431 { MU, A, 0, 0, "(?:|ab||bc|a)+d", "abcxabcabd" },
432 { MU, A, 0, 0, "(|ab||bc|a)+d", "abcxabcabd" },
433 { MU, A, 0, 0, "(?:|ab||bc|a)*d", "abcxabcabd" },
434 { MU, A, 0, 0, "(|ab||bc|a)*d", "abcxabcabd" },
435 { MU, A, 0, 0, "(?:|ab||bc|a)+?d", "abcxabcabd" },
436 { MU, A, 0, 0, "(|ab||bc|a)+?d", "abcxabcabd" },
437 { MU, A, 0, 0, "(?:|ab||bc|a)*?d", "abcxabcabd" },
438 { MU, A, 0, 0, "(|ab||bc|a)*?d", "abcxabcabd" },
439 { MU, A, 0, 0, "(((a)*?|(?:ba)+)+?|(?:|c|ca)*)*m", "abaacaccabacabalabaacaccabacabamm" },
440 { MU, A, 0, 0, "(?:((?:a)*|(ba)+?)+|(|c|ca)*?)*?m", "abaacaccabacabalabaacaccabacabamm" },
441
442 /* Start offset. */
443 { MU, A, 0, 3, "(\\d|(?:\\w)*\\w)+", "0ac01Hb" },
444 { MU, A, 0, 4 | F_NOMATCH, "(\\w\\W\\w)+", "ab#d" },
445 { MU, A, 0, 2 | F_NOMATCH, "(\\w\\W\\w)+", "ab#d" },
446 { MU, A, 0, 1, "(\\w\\W\\w)+", "ab#d" },
447
448 /* Newline. */
449 { M, PCRE2_NEWLINE_CRLF, 0, 0, "\\W{0,2}[^#]{3}", "\r\n#....." },
450 { M, PCRE2_NEWLINE_CR, 0, 0, "\\W{0,2}[^#]{3}", "\r\n#....." },
451 { M, PCRE2_NEWLINE_CRLF, 0, 0, "\\W{1,3}[^#]", "\r\n##...." },
452 { MU, A, PCRE2_NO_UTF_CHECK, 1, "^.a", "\n\x80\nxa" },
453 { MU, A, 0, 1, "^", "\r\n" },
454 { M, PCRE2_NEWLINE_CRLF, 0, 1 | F_NOMATCH, "^", "\r\n" },
455 { M, PCRE2_NEWLINE_CRLF, 0, 1, "^", "\r\na" },
456
457 /* Any character except newline or any newline. */
458 { 0, PCRE2_NEWLINE_CRLF, 0, 0, ".", "\r" },
459 { U, PCRE2_NEWLINE_CRLF, 0, 0, ".(.).", "a\xc3\xa1\r\n\n\r\r" },
460 { 0, PCRE2_NEWLINE_ANYCRLF, 0, 0, ".(.)", "a\rb\nc\r\n\xc2\x85\xe2\x80\xa8" },
461 { U, PCRE2_NEWLINE_ANYCRLF, 0, 0, ".(.)", "a\rb\nc\r\n\xc2\x85\xe2\x80\xa8" },
462 { U, PCRE2_NEWLINE_ANY, 0, 0, "(.).", "a\rb\nc\r\n\xc2\x85\xe2\x80\xa9$de" },
463 { U, PCRE2_NEWLINE_ANYCRLF, 0, 0 | F_NOMATCH, ".(.).", "\xe2\x80\xa8\nb\r" },
464 { 0, PCRE2_NEWLINE_ANY, 0, 0, "(.)(.)", "#\x85#\r#\n#\r\n#\x84" },
465 { U, PCRE2_NEWLINE_ANY, 0, 0, "(.+)#", "#\rMn\xc2\x85#\n###" },
466 { 0, BSR(PCRE2_BSR_ANYCRLF), 0, 0, "\\R", "\r" },
467 { 0, BSR(PCRE2_BSR_ANYCRLF), 0, 0, "\\R", "\x85#\r\n#" },
468 { U, BSR(PCRE2_BSR_UNICODE), 0, 0, "\\R", "ab\xe2\x80\xa8#c" },
469 { U, BSR(PCRE2_BSR_UNICODE), 0, 0, "\\R", "ab\r\nc" },
470 { U, PCRE2_NEWLINE_CRLF | BSR(PCRE2_BSR_UNICODE), 0, 0, "(\\R.)+", "\xc2\x85\r\n#\xe2\x80\xa8\n\r\n\r" },
471 { MU, A, 0, 0 | F_NOMATCH, "\\R+", "ab" },
472 { MU, A, 0, 0, "\\R+", "ab\r\n\r" },
473 { MU, A, 0, 0, "\\R*", "ab\r\n\r" },
474 { MU, A, 0, 0, "\\R*", "\r\n\r" },
475 { MU, A, 0, 0, "\\R{2,4}", "\r\nab\r\r" },
476 { MU, A, 0, 0, "\\R{2,4}", "\r\nab\n\n\n\r\r\r" },
477 { MU, A, 0, 0, "\\R{2,}", "\r\nab\n\n\n\r\r\r" },
478 { MU, A, 0, 0, "\\R{0,3}", "\r\n\r\n\r\n\r\n\r\n" },
479 { MU, A, 0, 0 | F_NOMATCH, "\\R+\\R\\R", "\r\n\r\n" },
480 { MU, A, 0, 0, "\\R+\\R\\R", "\r\r\r" },
481 { MU, A, 0, 0, "\\R*\\R\\R", "\n\r" },
482 { MU, A, 0, 0 | F_NOMATCH, "\\R{2,4}\\R\\R", "\r\r\r" },
483 { MU, A, 0, 0, "\\R{2,4}\\R\\R", "\r\r\r\r" },
484
485 /* Atomic groups (no fallback from "next" direction). */
486 { MU, A, 0, 0 | F_NOMATCH, "(?>ab)ab", "bab" },
487 { MU, A, 0, 0 | F_NOMATCH, "(?>(ab))ab", "bab" },
488 { MU, A, 0, 0, "(?>ab)+abc(?>de)*def(?>gh)?ghe(?>ij)+?k(?>lm)*?n(?>op)?\?op",
489 "bababcdedefgheijijklmlmnop" },
490 { MU, A, 0, 0, "(?>a(b)+a|(ab)?\?(b))an", "abban" },
491 { MU, A, 0, 0, "(?>ab+a|(?:ab)?\?b)an", "abban" },
492 { MU, A, 0, 0, "((?>ab|ad|)*?)(?>|c)*abad", "abababcababad" },
493 { MU, A, 0, 0, "(?>(aa|b|)*+(?>(##)|###)*d|(aa)(?>(baa)?)m)", "aabaa#####da" },
494 { MU, A, 0, 0, "((?>a|)+?)b", "aaacaaab" },
495 { MU, A, 0, 0, "(?>x|)*$", "aaa" },
496 { MU, A, 0, 0, "(?>(x)|)*$", "aaa" },
497 { MU, A, 0, 0, "(?>x|())*$", "aaa" },
498 { MU, A, 0, 0, "((?>[cxy]a|[a-d])*?)b", "aaa+ aaab" },
499 { MU, A, 0, 0, "((?>[cxy](a)|[a-d])*?)b", "aaa+ aaab" },
500 { MU, A, 0, 0, "(?>((?>(a+))))bab|(?>((?>(a+))))bb", "aaaabaaabaabab" },
501 { MU, A, 0, 0, "(?>(?>a+))bab|(?>(?>a+))bb", "aaaabaaabaabab" },
502 { MU, A, 0, 0, "(?>(a)c|(?>(c)|(a))a)b*?bab", "aaaabaaabaabab" },
503 { MU, A, 0, 0, "(?>ac|(?>c|a)a)b*?bab", "aaaabaaabaabab" },
504 { MU, A, 0, 0, "(?>(b)b|(a))*b(?>(c)|d)?x", "ababcaaabdbx" },
505 { MU, A, 0, 0, "(?>bb|a)*b(?>c|d)?x", "ababcaaabdbx" },
506 { MU, A, 0, 0, "(?>(bb)|a)*b(?>c|(d))?x", "ababcaaabdbx" },
507 { MU, A, 0, 0, "(?>(a))*?(?>(a))+?(?>(a))??x", "aaaaaacccaaaaabax" },
508 { MU, A, 0, 0, "(?>a)*?(?>a)+?(?>a)??x", "aaaaaacccaaaaabax" },
509 { MU, A, 0, 0, "(?>(a)|)*?(?>(a)|)+?(?>(a)|)??x", "aaaaaacccaaaaabax" },
510 { MU, A, 0, 0, "(?>a|)*?(?>a|)+?(?>a|)??x", "aaaaaacccaaaaabax" },
511 { MU, A, 0, 0, "(?>a(?>(a{0,2}))*?b|aac)+b", "aaaaaaacaaaabaaaaacaaaabaacaaabb" },
512 { CM, A, 0, 0, "(?>((?>a{32}|b+|(a*))?(?>c+|d*)?\?)+e)+?f", "aaccebbdde bbdaaaccebbdee bbdaaaccebbdeef" },
513 { MU, A, 0, 0, "(?>(?:(?>aa|a||x)+?b|(?>aa|a||(x))+?c)?(?>[ad]{0,2})*?d)+d", "aaacdbaabdcabdbaaacd aacaabdbdcdcaaaadaabcbaadd" },
514 { MU, A, 0, 0, "(?>(?:(?>aa|a||(x))+?b|(?>aa|a||x)+?c)?(?>[ad]{0,2})*?d)+d", "aaacdbaabdcabdbaaacd aacaabdbdcdcaaaadaabcbaadd" },
515 { MU, A, 0, 0 | F_PROPERTY, "\\X", "\xcc\x8d\xcc\x8d" },
516 { MU, A, 0, 0 | F_PROPERTY, "\\X", "\xcc\x8d\xcc\x8d#\xcc\x8d\xcc\x8d" },
517 { MU, A, 0, 0 | F_PROPERTY, "\\X+..", "\xcc\x8d#\xcc\x8d#\xcc\x8d\xcc\x8d" },
518 { MU, A, 0, 0 | F_PROPERTY, "\\X{2,4}", "abcdef" },
519 { MU, A, 0, 0 | F_PROPERTY, "\\X{2,4}?", "abcdef" },
520 { MU, A, 0, 0 | F_NOMATCH | F_PROPERTY, "\\X{2,4}..", "#\xcc\x8d##" },
521 { MU, A, 0, 0 | F_PROPERTY, "\\X{2,4}..", "#\xcc\x8d#\xcc\x8d##" },
522 { MU, A, 0, 0, "(c(ab)?+ab)+", "cabcababcab" },
523 { MU, A, 0, 0, "(?>(a+)b)+aabab", "aaaabaaabaabab" },
524 { MU, A, 0, 0 | F_NOMATCH, "(?>a*|)a", "aaa" },
525
526 /* Possessive quantifiers. */
527 { MU, A, 0, 0, "(?:a|b)++m", "mababbaaxababbaam" },
528 { MU, A, 0, 0, "(?:a|b)*+m", "mababbaaxababbaam" },
529 { MU, A, 0, 0, "(?:a|b)*+m", "ababbaaxababbaam" },
530 { MU, A, 0, 0, "(a|b)++m", "mababbaaxababbaam" },
531 { MU, A, 0, 0, "(a|b)*+m", "mababbaaxababbaam" },
532 { MU, A, 0, 0, "(a|b)*+m", "ababbaaxababbaam" },
533 { MU, A, 0, 0, "(a|b(*ACCEPT))++m", "maaxab" },
534 { MU, A, 0, 0, "(?:b*)++m", "bxbbxbbbxm" },
535 { MU, A, 0, 0, "(?:b*)++m", "bxbbxbbbxbbm" },
536 { MU, A, 0, 0, "(?:b*)*+m", "bxbbxbbbxm" },
537 { MU, A, 0, 0, "(?:b*)*+m", "bxbbxbbbxbbm" },
538 { MU, A, 0, 0, "(b*)++m", "bxbbxbbbxm" },
539 { MU, A, 0, 0, "(b*)++m", "bxbbxbbbxbbm" },
540 { MU, A, 0, 0, "(b*)*+m", "bxbbxbbbxm" },
541 { MU, A, 0, 0, "(b*)*+m", "bxbbxbbbxbbm" },
542 { MU, A, 0, 0, "(?:a|(b))++m", "mababbaaxababbaam" },
543 { MU, A, 0, 0, "(?:(a)|b)*+m", "mababbaaxababbaam" },
544 { MU, A, 0, 0, "(?:(a)|(b))*+m", "ababbaaxababbaam" },
545 { MU, A, 0, 0, "(a|(b))++m", "mababbaaxababbaam" },
546 { MU, A, 0, 0, "((a)|b)*+m", "mababbaaxababbaam" },
547 { MU, A, 0, 0, "((a)|(b))*+m", "ababbaaxababbaam" },
548 { MU, A, 0, 0, "(a|(b)(*ACCEPT))++m", "maaxab" },
549 { MU, A, 0, 0, "(?:(b*))++m", "bxbbxbbbxm" },
550 { MU, A, 0, 0, "(?:(b*))++m", "bxbbxbbbxbbm" },
551 { MU, A, 0, 0, "(?:(b*))*+m", "bxbbxbbbxm" },
552 { MU, A, 0, 0, "(?:(b*))*+m", "bxbbxbbbxbbm" },
553 { MU, A, 0, 0, "((b*))++m", "bxbbxbbbxm" },
554 { MU, A, 0, 0, "((b*))++m", "bxbbxbbbxbbm" },
555 { MU, A, 0, 0, "((b*))*+m", "bxbbxbbbxm" },
556 { MU, A, 0, 0, "((b*))*+m", "bxbbxbbbxbbm" },
557 { MU, A, 0, 0, "(A)*+$", "ABC" },
558 { MU, A, 0, 0 | F_NOMATCH, "(?>(b{2,4}))(?:(?:(aa|c))++m|(?:(aa|c))+n)", "bbaacaaccaaaacxbbbmbn" },
559 { MU, A, 0, 0, "((?:b)++a)+(cd)*+m", "bbababbacdcdnbbababbacdcdm" },
560 { MU, A, 0, 0, "((?:(b))++a)+((c)d)*+m", "bbababbacdcdnbbababbacdcdm" },
561 { MU, A, 0, 0, "(?:(?:(?:ab)*+k)++(?:n(?:cd)++)*+)*+m", "ababkkXababkkabkncXababkkabkncdcdncdXababkkabkncdcdncdkkabkncdXababkkabkncdcdncdkkabkncdm" },
562 { MU, A, 0, 0, "(?:((ab)*+(k))++(n(?:c(d))++)*+)*+m", "ababkkXababkkabkncXababkkabkncdcdncdXababkkabkncdcdncdkkabkncdXababkkabkncdcdncdkkabkncdm" },
563
564 /* Back references. */
565 { MU, A, 0, 0, "(aa|bb)(\\1*)(ll|)(\\3*)bbbbbbc", "aaaaaabbbbbbbbc" },
566 { CMU, A, 0, 0, "(aa|bb)(\\1+)(ll|)(\\3+)bbbbbbc", "bBbbBbCbBbbbBbbcbbBbbbBBbbC" },
567 { CM, A, 0, 0, "(a{2,4})\\1", "AaAaaAaA" },
568 { MU, A, 0, 0, "(aa|bb)(\\1?)aa(\\1?)(ll|)(\\4+)bbc", "aaaaaaaabbaabbbbaabbbbc" },
569 { MU, A, 0, 0, "(aa|bb)(\\1{0,5})(ll|)(\\3{0,5})cc", "bbxxbbbbxxaaaaaaaaaaaaaaaacc" },
570 { MU, A, 0, 0, "(aa|bb)(\\1{3,5})(ll|)(\\3{3,5})cc", "bbbbbbbbbbbbaaaaaaccbbbbbbbbbbbbbbcc" },
571 { MU, A, 0, 0, "(aa|bb)(\\1{3,})(ll|)(\\3{3,})cc", "bbbbbbbbbbbbaaaaaaccbbbbbbbbbbbbbbcc" },
572 { MU, A, 0, 0, "(\\w+)b(\\1+)c", "GabGaGaDbGaDGaDc" },
573 { MU, A, 0, 0, "(?:(aa)|b)\\1?b", "bb" },
574 { CMU, A, 0, 0, "(aa|bb)(\\1*?)aa(\\1+?)", "bBBbaaAAaaAAaa" },
575 { MU, A, 0, 0, "(aa|bb)(\\1*?)(dd|)cc(\\3+?)", "aaaaaccdd" },
576 { CMU, A, 0, 0, "(?:(aa|bb)(\\1?\?)cc){2}(\\1?\?)", "aAaABBbbAAaAcCaAcCaA" },
577 { MU, A, 0, 0, "(?:(aa|bb)(\\1{3,5}?)){2}(dd|)(\\3{3,5}?)", "aaaaaabbbbbbbbbbaaaaaaaaaaaaaa" },
578 { CM, A, 0, 0, "(?:(aa|bb)(\\1{3,}?)){2}(dd|)(\\3{3,}?)", "aaaaaabbbbbbbbbbaaaaaaaaaaaaaa" },
579 { MU, A, 0, 0, "(?:(aa|bb)(\\1{0,3}?)){2}(dd|)(\\3{0,3}?)b(\\1{0,3}?)(\\1{0,3})", "aaaaaaaaaaaaaaabaaaaa" },
580 { MU, A, 0, 0, "(a(?:\\1|)a){3}b", "aaaaaaaaaaab" },
581 { M, A, 0, 0, "(a?)b(\\1\\1*\\1+\\1?\\1*?\\1+?\\1??\\1*+\\1++\\1?+\\1{4}\\1{3,5}\\1{4,}\\1{0,5}\\1{3,5}?\\1{4,}?\\1{0,5}?\\1{3,5}+\\1{4,}+\\1{0,5}+#){2}d", "bb#b##d" },
582 { MUP, A, 0, 0 | F_PROPERTY, "(\\P{N})\\1{2,}", ".www." },
583 { MUP, A, 0, 0 | F_PROPERTY, "(\\P{N})\\1{0,2}", "wwwww." },
584 { MUP, A, 0, 0 | F_PROPERTY, "(\\P{N})\\1{1,2}ww", "wwww" },
585 { MUP, A, 0, 0 | F_PROPERTY, "(\\P{N})\\1{1,2}ww", "wwwww" },
586 { PCRE2_UCP, 0, 0, 0 | F_PROPERTY, "(\\P{N})\\1{2,}", ".www." },
587 { CMUP, A, 0, 0, "(\xf0\x90\x90\x80)\\1", "\xf0\x90\x90\xa8\xf0\x90\x90\xa8" },
588 { MU | PCRE2_DUPNAMES, A, 0, 0 | F_NOMATCH, "\\k<A>{1,3}(?<A>aa)(?<A>bb)", "aabb" },
589 { MU | PCRE2_DUPNAMES | PCRE2_MATCH_UNSET_BACKREF, A, 0, 0, "\\k<A>{1,3}(?<A>aa)(?<A>bb)", "aabb" },
590 { MU | PCRE2_DUPNAMES | PCRE2_MATCH_UNSET_BACKREF, A, 0, 0, "\\k<A>*(?<A>aa)(?<A>bb)", "aabb" },
591 { MU | PCRE2_DUPNAMES, A, 0, 0, "(?<A>aa)(?<A>bb)\\k<A>{0,3}aaaaaa", "aabbaaaaaa" },
592 { MU | PCRE2_DUPNAMES, A, 0, 0, "(?<A>aa)(?<A>bb)\\k<A>{2,5}bb", "aabbaaaabb" },
593 { MU | PCRE2_DUPNAMES, A, 0, 0, "(?:(?<A>aa)|(?<A>bb))\\k<A>{0,3}m", "aaaaaaaabbbbaabbbbm" },
594 { MU | PCRE2_DUPNAMES, A, 0, 0 | F_NOMATCH, "\\k<A>{1,3}?(?<A>aa)(?<A>bb)", "aabb" },
595 { MU | PCRE2_DUPNAMES | PCRE2_MATCH_UNSET_BACKREF, A, 0, 0, "\\k<A>{1,3}?(?<A>aa)(?<A>bb)", "aabb" },
596 { MU | PCRE2_DUPNAMES, A, 0, 0, "\\k<A>*?(?<A>aa)(?<A>bb)", "aabb" },
597 { MU | PCRE2_DUPNAMES, A, 0, 0, "(?:(?<A>aa)|(?<A>bb))\\k<A>{0,3}?m", "aaaaaabbbbbbaabbbbbbbbbbm" },
598 { MU | PCRE2_DUPNAMES, A, 0, 0, "(?:(?<A>aa)|(?<A>bb))\\k<A>*?m", "aaaaaabbbbbbaabbbbbbbbbbm" },
599 { MU | PCRE2_DUPNAMES, A, 0, 0, "(?:(?<A>aa)|(?<A>bb))\\k<A>{2,3}?", "aaaabbbbaaaabbbbbbbbbb" },
600 { CMU | PCRE2_DUPNAMES, A, 0, 0, "(?:(?<A>AA)|(?<A>BB))\\k<A>{0,3}M", "aaaaaaaabbbbaabbbbm" },
601 { CMU | PCRE2_DUPNAMES, A, 0, 0, "(?:(?<A>AA)|(?<A>BB))\\k<A>{1,3}M", "aaaaaaaabbbbaabbbbm" },
602 { CMU | PCRE2_DUPNAMES, A, 0, 0, "(?:(?<A>AA)|(?<A>BB))\\k<A>{0,3}?M", "aaaaaabbbbbbaabbbbbbbbbbm" },
603 { CMU | PCRE2_DUPNAMES, A, 0, 0, "(?:(?<A>AA)|(?<A>BB))\\k<A>{2,3}?", "aaaabbbbaaaabbbbbbbbbb" },
604 { MU | PCRE2_MATCH_UNSET_BACKREF, A, 0, 0, "(a)|\\1+c", "xxc" },
605 { MU | PCRE2_MATCH_UNSET_BACKREF, A, 0, 0, "\\1+?()", "" },
606
607 /* Assertions. */
608 { MU, A, 0, 0, "(?=xx|yy|zz)\\w{4}", "abczzdefg" },
609 { MU, A, 0, 0, "(?=((\\w+)b){3}|ab)", "dbbbb ab" },
610 { MU, A, 0, 0, "(?!ab|bc|cd)[a-z]{2}", "Xabcdef" },
611 { MU, A, 0, 0, "(?<=aaa|aa|a)a", "aaa" },
612 { MU, A, 0, 2, "(?<=aaa|aa|a)a", "aaa" },
613 { M, A, 0, 0, "(?<=aaa|aa|a)a", "aaa" },
614 { M, A, 0, 2, "(?<=aaa|aa|a)a", "aaa" },
615 { MU, A, 0, 0, "(\\d{2})(?!\\w+c|(((\\w?)m){2}n)+|\\1)", "x5656" },
616 { MU, A, 0, 0, "((?=((\\d{2,6}\\w){2,}))\\w{5,20}K){2,}", "567v09708K12l00M00 567v09708K12l00M00K45K" },
617 { MU, A, 0, 0, "(?=(?:(?=\\S+a)\\w*(b)){3})\\w+\\d", "bba bbab nbbkba nbbkba0kl" },
618 { MU, A, 0, 0, "(?>a(?>(b+))a(?=(..)))*?k", "acabbcabbaabacabaabbakk" },
619 { MU, A, 0, 0, "((?(?=(a))a)+k)", "bbak" },
620 { MU, A, 0, 0, "((?(?=a)a)+k)", "bbak" },
621 { MU, A, 0, 0 | F_NOMATCH, "(?=(?>(a))m)amk", "a k" },
622 { MU, A, 0, 0 | F_NOMATCH, "(?!(?>(a))m)amk", "a k" },
623 { MU, A, 0, 0 | F_NOMATCH, "(?>(?=(a))am)amk", "a k" },
624 { MU, A, 0, 0, "(?=(?>a|(?=(?>(b+))a|c)[a-c]+)*?m)[a-cm]+k", "aaam bbam baaambaam abbabba baaambaamk" },
625 { MU, A, 0, 0, "(?> ?\?\\b(?(?=\\w{1,4}(a))m)\\w{0,8}bc){2,}?", "bca ssbc mabd ssbc mabc" },
626 { MU, A, 0, 0, "(?:(?=ab)?[^n][^n])+m", "ababcdabcdcdabnababcdabcdcdabm" },
627 { MU, A, 0, 0, "(?:(?=a(b))?[^n][^n])+m", "ababcdabcdcdabnababcdabcdcdabm" },
628 { MU, A, 0, 0, "(?:(?=.(.))??\\1.)+m", "aabbbcbacccanaabbbcbacccam" },
629 { MU, A, 0, 0, "(?:(?=.)??[a-c])+m", "abacdcbacacdcaccam" },
630 { MU, A, 0, 0, "((?!a)?(?!([^a]))?)+$", "acbab" },
631 { MU, A, 0, 0, "((?!a)?\?(?!([^a]))?\?)+$", "acbab" },
632 { MU, A, 0, 0, "a(?=(?C)\\B(?C`x`))b", "ab" },
633 { MU, A, 0, 0, "a(?!(?C)\\B(?C`x`))bb|ab", "abb" },
634 { MU, A, 0, 0, "a(?=\\b|(?C)\\B(?C`x`))b", "ab" },
635 { MU, A, 0, 0, "a(?!\\b|(?C)\\B(?C`x`))bb|ab", "abb" },
636 { MU, A, 0, 0, "c(?(?=(?C)\\B(?C`x`))ab|a)", "cab" },
637 { MU, A, 0, 0, "c(?(?!(?C)\\B(?C`x`))ab|a)", "cab" },
638 { MU, A, 0, 0, "c(?(?=\\b|(?C)\\B(?C`x`))ab|a)", "cab" },
639 { MU, A, 0, 0, "c(?(?!\\b|(?C)\\B(?C`x`))ab|a)", "cab" },
640 { MU, A, 0, 0, "a(?=)b", "ab" },
641 { MU, A, 0, 0 | F_NOMATCH, "a(?!)b", "ab" },
642 { MU, A, 0, 0, "(?(?<!|(|a)))", "a" },
643
644 /* Not empty, ACCEPT, FAIL */
645 { MU, A, PCRE2_NOTEMPTY, 0 | F_NOMATCH, "a*", "bcx" },
646 { MU, A, PCRE2_NOTEMPTY, 0, "a*", "bcaad" },
647 { MU, A, PCRE2_NOTEMPTY, 0, "a*?", "bcaad" },
648 { MU, A, PCRE2_NOTEMPTY_ATSTART, 0, "a*", "bcaad" },
649 { MU, A, 0, 0, "a(*ACCEPT)b", "ab" },
650 { MU, A, PCRE2_NOTEMPTY, 0 | F_NOMATCH, "a*(*ACCEPT)b", "bcx" },
651 { MU, A, PCRE2_NOTEMPTY, 0, "a*(*ACCEPT)b", "bcaad" },
652 { MU, A, PCRE2_NOTEMPTY, 0, "a*?(*ACCEPT)b", "bcaad" },
653 { MU, A, PCRE2_NOTEMPTY, 0 | F_NOMATCH, "(?:z|a*(*ACCEPT)b)", "bcx" },
654 { MU, A, PCRE2_NOTEMPTY, 0, "(?:z|a*(*ACCEPT)b)", "bcaad" },
655 { MU, A, PCRE2_NOTEMPTY, 0, "(?:z|a*?(*ACCEPT)b)", "bcaad" },
656 { MU, A, PCRE2_NOTEMPTY_ATSTART, 0, "a*(*ACCEPT)b", "bcx" },
657 { MU, A, PCRE2_NOTEMPTY_ATSTART, 0 | F_NOMATCH, "a*(*ACCEPT)b", "" },
658 { MU, A, 0, 0, "((a(*ACCEPT)b))", "ab" },
659 { MU, A, 0, 0, "(a(*FAIL)a|a)", "aaa" },
660 { MU, A, 0, 0, "(?=ab(*ACCEPT)b)a", "ab" },
661 { MU, A, 0, 0, "(?=(?:x|ab(*ACCEPT)b))", "ab" },
662 { MU, A, 0, 0, "(?=(a(b(*ACCEPT)b)))a", "ab" },
663 { MU, A, PCRE2_NOTEMPTY, 0, "(?=a*(*ACCEPT))c", "c" },
664 { MU, A, PCRE2_NOTEMPTY, 0 | F_NOMATCH, "(?=A)", "AB" },
665 { MU | PCRE2_ENDANCHORED, A, 0, 0, "aa(*ACCEPT)aa", "aaa" },
666
667 /* Conditional blocks. */
668 { MU, A, 0, 0, "(?(?=(a))a|b)+k", "ababbalbbadabak" },
669 { MU, A, 0, 0, "(?(?!(b))a|b)+k", "ababbalbbadabak" },
670 { MU, A, 0, 0, "(?(?=a)a|b)+k", "ababbalbbadabak" },
671 { MU, A, 0, 0, "(?(?!b)a|b)+k", "ababbalbbadabak" },
672 { MU, A, 0, 0, "(?(?=(a))a*|b*)+k", "ababbalbbadabak" },
673 { MU, A, 0, 0, "(?(?!(b))a*|b*)+k", "ababbalbbadabak" },
674 { MU, A, 0, 0, "(?(?!(b))(?:aaaaaa|a)|(?:bbbbbb|b))+aaaak", "aaaaaaaaaaaaaa bbbbbbbbbbbbbbb aaaaaaak" },
675 { MU, A, 0, 0, "(?(?!b)(?:aaaaaa|a)|(?:bbbbbb|b))+aaaak", "aaaaaaaaaaaaaa bbbbbbbbbbbbbbb aaaaaaak" },
676 { MU, A, 0, 0 | F_DIFF, "(?(?!(b))(?:aaaaaa|a)|(?:bbbbbb|b))+bbbbk", "aaaaaaaaaaaaaa bbbbbbbbbbbbbbb bbbbbbbk" },
677 { MU, A, 0, 0, "(?(?!b)(?:aaaaaa|a)|(?:bbbbbb|b))+bbbbk", "aaaaaaaaaaaaaa bbbbbbbbbbbbbbb bbbbbbbk" },
678 { MU, A, 0, 0, "(?(?=a)a*|b*)+k", "ababbalbbadabak" },
679 { MU, A, 0, 0, "(?(?!b)a*|b*)+k", "ababbalbbadabak" },
680 { MU, A, 0, 0, "(?(?=a)ab)", "a" },
681 { MU, A, 0, 0, "(?(?<!b)c)", "b" },
682 { MU, A, 0, 0, "(?(DEFINE)a(b))", "a" },
683 { MU, A, 0, 0, "a(?(DEFINE)(?:b|(?:c?)+)*)", "a" },
684 { MU, A, 0, 0, "(?(?=.[a-c])[k-l]|[A-D])", "kdB" },
685 { MU, A, 0, 0, "(?(?!.{0,4}[cd])(aa|bb)|(cc|dd))+", "aabbccddaa" },
686 { MU, A, 0, 0, "(?(?=[^#@]*@)(aaab|aa|aba)|(aba|aab)){3,}", "aaabaaaba#aaabaaaba#aaabaaaba@" },
687 { MU, A, 0, 0, "((?=\\w{5})\\w(?(?=\\w*k)\\d|[a-f_])*\\w\\s)+", "mol m10kk m088k _f_a_ mbkkl" },
688 { MU, A, 0, 0, "(c)?\?(?(1)a|b)", "cdcaa" },
689 { MU, A, 0, 0, "(c)?\?(?(1)a|b)", "cbb" },
690 { MU, A, 0, 0 | F_DIFF, "(?(?=(a))(aaaa|a?))+aak", "aaaaab aaaaak" },
691 { MU, A, 0, 0, "(?(?=a)(aaaa|a?))+aak", "aaaaab aaaaak" },
692 { MU, A, 0, 0, "(?(?!(b))(aaaa|a?))+aak", "aaaaab aaaaak" },
693 { MU, A, 0, 0, "(?(?!b)(aaaa|a?))+aak", "aaaaab aaaaak" },
694 { MU, A, 0, 0 | F_DIFF, "(?(?=(a))a*)+aak", "aaaaab aaaaak" },
695 { MU, A, 0, 0, "(?(?=a)a*)+aak", "aaaaab aaaaak" },
696 { MU, A, 0, 0, "(?(?!(b))a*)+aak", "aaaaab aaaaak" },
697 { MU, A, 0, 0, "(?(?!b)a*)+aak", "aaaaab aaaaak" },
698 { MU, A, 0, 0, "(?(?=(?=(?!(x))a)aa)aaa|(?(?=(?!y)bb)bbb))*k", "abaabbaaabbbaaabbb abaabbaaabbbaaabbbk" },
699 { MU, A, 0, 0, "(?P<Name>a)?(?P<Name2>b)?(?(Name)c|d)*l", "bc ddd abccabccl" },
700 { MU, A, 0, 0, "(?P<Name>a)?(?P<Name2>b)?(?(Name)c|d)+?dd", "bcabcacdb bdddd" },
701 { MU, A, 0, 0, "(?P<Name>a)?(?P<Name2>b)?(?(Name)c|d)+l", "ababccddabdbccd abcccl" },
702 { MU, A, 0, 0, "((?:a|aa)(?(1)aaa))x", "aax" },
703 { MU, A, 0, 0, "(?(?!)a|b)", "ab" },
704 { MU, A, 0, 0, "(?(?!)a)", "ab" },
705 { MU, A, 0, 0 | F_NOMATCH, "(?(?!)a|b)", "ac" },
706
707 /* Set start of match. */
708 { MU, A, 0, 0, "(?:\\Ka)*aaaab", "aaaaaaaa aaaaaaabb" },
709 { MU, A, 0, 0, "(?>\\Ka\\Ka)*aaaab", "aaaaaaaa aaaaaaaaaabb" },
710 { MU, A, 0, 0, "a+\\K(?<=\\Gaa)a", "aaaaaa" },
711 { MU, A, PCRE2_NOTEMPTY, 0 | F_NOMATCH, "a\\K(*ACCEPT)b", "aa" },
712 { MU, A, PCRE2_NOTEMPTY_ATSTART, 0, "a\\K(*ACCEPT)b", "aa" },
713
714 /* First line. */
715 { MU | PCRE2_FIRSTLINE, A, 0, 0 | F_PROPERTY, "\\p{Any}a", "bb\naaa" },
716 { MU | PCRE2_FIRSTLINE, A, 0, 0 | F_NOMATCH | F_PROPERTY, "\\p{Any}a", "bb\r\naaa" },
717 { MU | PCRE2_FIRSTLINE, A, 0, 0, "(?<=a)", "a" },
718 { MU | PCRE2_FIRSTLINE, A, 0, 0 | F_NOMATCH, "[^a][^b]", "ab" },
719 { MU | PCRE2_FIRSTLINE, A, 0, 0 | F_NOMATCH, "a", "\na" },
720 { MU | PCRE2_FIRSTLINE, A, 0, 0 | F_NOMATCH, "[abc]", "\na" },
721 { MU | PCRE2_FIRSTLINE, A, 0, 0 | F_NOMATCH, "^a", "\na" },
722 { MU | PCRE2_FIRSTLINE, A, 0, 0 | F_NOMATCH, "^(?<=\n)", "\na" },
723 { MU | PCRE2_FIRSTLINE, A, 0, 0, "\xf0\x90\x90\x80", "\xf0\x90\x90\x80" },
724 { MU | PCRE2_FIRSTLINE, PCRE2_NEWLINE_ANY, 0, 0 | F_NOMATCH, "#", "\xc2\x85#" },
725 { M | PCRE2_FIRSTLINE, PCRE2_NEWLINE_ANY, 0, 0 | F_NOMATCH, "#", "\x85#" },
726 { MU | PCRE2_FIRSTLINE, PCRE2_NEWLINE_ANY, 0, 0 | F_NOMATCH, "^#", "\xe2\x80\xa8#" },
727 { MU | PCRE2_FIRSTLINE, PCRE2_NEWLINE_CRLF, 0, 0 | F_PROPERTY, "\\p{Any}", "\r\na" },
728 { MU | PCRE2_FIRSTLINE, PCRE2_NEWLINE_CRLF, 0, 0, ".", "\r" },
729 { MU | PCRE2_FIRSTLINE, PCRE2_NEWLINE_CRLF, 0, 0, "a", "\ra" },
730 { MU | PCRE2_FIRSTLINE, PCRE2_NEWLINE_CRLF, 0, 0 | F_NOMATCH, "ba", "bbb\r\nba" },
731 { MU | PCRE2_FIRSTLINE, PCRE2_NEWLINE_CRLF, 0, 0 | F_NOMATCH | F_PROPERTY, "\\p{Any}{4}|a", "\r\na" },
732 { MU | PCRE2_FIRSTLINE, PCRE2_NEWLINE_CRLF, 0, 1, ".", "\r\n" },
733 { PCRE2_FIRSTLINE | PCRE2_DOTALL, PCRE2_NEWLINE_LF, 0, 0 | F_NOMATCH, "ab.", "ab" },
734 { MU | PCRE2_FIRSTLINE, A, 0, 1 | F_NOMATCH, "^[a-d0-9]", "\nxx\nd" },
735 { PCRE2_FIRSTLINE | PCRE2_DOTALL, PCRE2_NEWLINE_ANY, 0, 0, "....a", "012\n0a" },
736 { MU | PCRE2_FIRSTLINE, A, 0, 0, "[aC]", "a" },
737
738 /* Recurse. */
739 { MU, A, 0, 0, "(a)(?1)", "aa" },
740 { MU, A, 0, 0, "((a))(?1)", "aa" },
741 { MU, A, 0, 0, "(b|a)(?1)", "aa" },
742 { MU, A, 0, 0, "(b|(a))(?1)", "aa" },
743 { MU, A, 0, 0 | F_NOMATCH, "((a)(b)(?:a*))(?1)", "aba" },
744 { MU, A, 0, 0, "((a)(b)(?:a*))(?1)", "abab" },
745 { MU, A, 0, 0, "((a+)c(?2))b(?1)", "aacaabaca" },
746 { MU, A, 0, 0, "((?2)b|(a)){2}(?1)", "aabab" },
747 { MU, A, 0, 0, "(?1)(a)*+(?2)(b(?1))", "aababa" },
748 { MU, A, 0, 0, "(?1)(((a(*ACCEPT)))b)", "axaa" },
749 { MU, A, 0, 0, "(?1)(?(DEFINE) (((ac(*ACCEPT)))b) )", "akaac" },
750 { MU, A, 0, 0, "(a+)b(?1)b\\1", "abaaabaaaaa" },
751 { MU, A, 0, 0, "(?(DEFINE)(aa|a))(?1)ab", "aab" },
752 { MU, A, 0, 0, "(?(DEFINE)(a\\Kb))(?1)+ababc", "abababxabababc" },
753 { MU, A, 0, 0, "(a\\Kb)(?1)+ababc", "abababxababababc" },
754 { MU, A, 0, 0 | F_NOMATCH, "(a\\Kb)(?1)+ababc", "abababxababababxc" },
755 { MU, A, 0, 0, "b|<(?R)*>", "<<b>" },
756 { MU, A, 0, 0, "(a\\K){0}(?:(?1)b|ac)", "ac" },
757 { MU, A, 0, 0, "(?(DEFINE)(a(?2)|b)(b(?1)|(a)))(?:(?1)|(?2))m", "ababababnababababaam" },
758 { MU, A, 0, 0, "(a)((?(R)a|b))(?2)", "aabbabaa" },
759 { MU, A, 0, 0, "(a)((?(R2)a|b))(?2)", "aabbabaa" },
760 { MU, A, 0, 0, "(a)((?(R1)a|b))(?2)", "ababba" },
761 { MU, A, 0, 0, "(?(R0)aa|bb(?R))", "abba aabb bbaa" },
762 { MU, A, 0, 0, "((?(R)(?:aaaa|a)|(?:(aaaa)|(a)))+)(?1)$", "aaaaaaaaaa aaaa" },
763 { MU, A, 0, 0, "(?P<Name>a(?(R&Name)a|b))(?1)", "aab abb abaa" },
764 { MU, A, 0, 0, "((?(R)a|(?1)){3})", "XaaaaaaaaaX" },
765 { MU, A, 0, 0, "((?:(?(R)a|(?1))){3})", "XaaaaaaaaaX" },
766 { MU, A, 0, 0, "((?(R)a|(?1)){1,3})aaaaaa", "aaaaaaaaXaaaaaaaaa" },
767 { MU, A, 0, 0, "((?(R)a|(?1)){1,3}?)M", "aaaM" },
768 { MU, A, 0, 0, "((.)(?:.|\\2(?1))){0}#(?1)#", "#aabbccdde# #aabbccddee#" },
769 { MU, A, 0, 0, "((.)(?:\\2|\\2{4}b)){0}#(?:(?1))+#", "#aaaab# #aaaaab#" },
770 { MU, A, 0, 0 | F_NOMATCH, "(?1)$((.|\\2xx){1,2})", "abc" },
771
772 /* 16 bit specific tests. */
773 { CM, A, 0, 0 | F_FORCECONV, "\xc3\xa1", "\xc3\x81\xc3\xa1" },
774 { CM, A, 0, 0 | F_FORCECONV, "\xe1\xbd\xb8", "\xe1\xbf\xb8\xe1\xbd\xb8" },
775 { CM, A, 0, 0 | F_FORCECONV, "[\xc3\xa1]", "\xc3\x81\xc3\xa1" },
776 { CM, A, 0, 0 | F_FORCECONV, "[\xe1\xbd\xb8]", "\xe1\xbf\xb8\xe1\xbd\xb8" },
777 { CM, A, 0, 0 | F_FORCECONV, "[a-\xed\xb0\x80]", "A" },
778 { CM, A, 0, 0 | F_NO8 | F_FORCECONV, "[a-\\x{dc00}]", "B" },
779 { CM, A, 0, 0 | F_NO8 | F_NOMATCH | F_FORCECONV, "[b-\\x{dc00}]", "a" },
780 { CM, A, 0, 0 | F_NO8 | F_FORCECONV, "\xed\xa0\x80\\x{d800}\xed\xb0\x80\\x{dc00}", "\xed\xa0\x80\xed\xa0\x80\xed\xb0\x80\xed\xb0\x80" },
781 { CM, A, 0, 0 | F_NO8 | F_FORCECONV, "[\xed\xa0\x80\\x{d800}]{1,2}?[\xed\xb0\x80\\x{dc00}]{1,2}?#", "\xed\xa0\x80\xed\xa0\x80\xed\xb0\x80\xed\xb0\x80#" },
782 { CM, A, 0, 0 | F_FORCECONV, "[\xed\xa0\x80\xed\xb0\x80#]{0,3}(?<=\xed\xb0\x80.)", "\xed\xa0\x80#\xed\xa0\x80##\xed\xb0\x80\xed\xa0\x80" },
783 { CM, A, 0, 0 | F_FORCECONV, "[\xed\xa0\x80-\xed\xb3\xbf]", "\xed\x9f\xbf\xed\xa0\x83" },
784 { CM, A, 0, 0 | F_FORCECONV, "[\xed\xa0\x80-\xed\xb3\xbf]", "\xed\xb4\x80\xed\xb3\xb0" },
785 { CM, A, 0, 0 | F_NO8 | F_FORCECONV, "[\\x{d800}-\\x{dcff}]", "\xed\x9f\xbf\xed\xa0\x83" },
786 { CM, A, 0, 0 | F_NO8 | F_FORCECONV, "[\\x{d800}-\\x{dcff}]", "\xed\xb4\x80\xed\xb3\xb0" },
787 { CM, A, 0, 0 | F_FORCECONV, "[\xed\xa0\x80-\xef\xbf\xbf]+[\x1-\xed\xb0\x80]+#", "\xed\xa0\x85\xc3\x81\xed\xa0\x85\xef\xbf\xb0\xc2\x85\xed\xa9\x89#" },
788 { CM, A, 0, 0 | F_FORCECONV, "[\xed\xa0\x80][\xed\xb0\x80]{2,}", "\xed\xa0\x80\xed\xb0\x80\xed\xa0\x80\xed\xb0\x80\xed\xb0\x80\xed\xb0\x80" },
789 { M, A, 0, 0 | F_FORCECONV, "[^\xed\xb0\x80]{3,}?", "##\xed\xb0\x80#\xed\xb0\x80#\xc3\x89#\xed\xb0\x80" },
790 { M, A, 0, 0 | F_NO8 | F_FORCECONV, "[^\\x{dc00}]{3,}?", "##\xed\xb0\x80#\xed\xb0\x80#\xc3\x89#\xed\xb0\x80" },
791 { CM, A, 0, 0 | F_FORCECONV, ".\\B.", "\xed\xa0\x80\xed\xb0\x80" },
792 { CM, A, 0, 0 | F_FORCECONV, "\\D+(?:\\d+|.)\\S+(?:\\s+|.)\\W+(?:\\w+|.)\xed\xa0\x80\xed\xa0\x80", "\xed\xa0\x80\xed\xa0\x80\xed\xa0\x80\xed\xa0\x80\xed\xa0\x80\xed\xa0\x80\xed\xa0\x80\xed\xa0\x80" },
793 { CM, A, 0, 0 | F_FORCECONV, "\\d*\\s*\\w*\xed\xa0\x80\xed\xa0\x80", "\xed\xa0\x80\xed\xa0\x80" },
794 { CM, A, 0, 0 | F_FORCECONV | F_NOMATCH, "\\d*?\\D*?\\s*?\\S*?\\w*?\\W*?##", "\xed\xa0\x80\xed\xa0\x80\xed\xa0\x80\xed\xa0\x80#" },
795 { CM | PCRE2_EXTENDED, A, 0, 0 | F_FORCECONV, "\xed\xa0\x80 \xed\xb0\x80 !", "\xed\xa0\x80\xed\xb0\x80!" },
796 { CM, A, 0, 0 | F_FORCECONV, "\xed\xa0\x80+#[^#]+\xed\xa0\x80", "\xed\xa0\x80#a\xed\xa0\x80" },
797 { CM, A, 0, 0 | F_FORCECONV, "(\xed\xa0\x80+)#\\1", "\xed\xa0\x80\xed\xa0\x80#\xed\xa0\x80\xed\xa0\x80" },
798 { M, PCRE2_NEWLINE_ANY, 0, 0 | F_NO8 | F_FORCECONV, "^-", "a--\xe2\x80\xa8--" },
799 { 0, BSR(PCRE2_BSR_UNICODE), 0, 0 | F_NO8 | F_FORCECONV, "\\R", "ab\xe2\x80\xa8" },
800 { 0, 0, 0, 0 | F_NO8 | F_FORCECONV, "\\v", "ab\xe2\x80\xa9" },
801 { 0, 0, 0, 0 | F_NO8 | F_FORCECONV, "\\h", "ab\xe1\xa0\x8e" },
802 { 0, 0, 0, 0 | F_NO8 | F_FORCECONV, "\\v+?\\V+?#", "\xe2\x80\xa9\xe2\x80\xa9\xef\xbf\xbf\xef\xbf\xbf#" },
803 { 0, 0, 0, 0 | F_NO8 | F_FORCECONV, "\\h+?\\H+?#", "\xe1\xa0\x8e\xe1\xa0\x8e\xef\xbf\xbf\xef\xbf\xbf#" },
804
805 /* Partial matching. */
806 { MU, A, PCRE2_PARTIAL_SOFT, 0, "ab", "a" },
807 { MU, A, PCRE2_PARTIAL_SOFT, 0, "ab|a", "a" },
808 { MU, A, PCRE2_PARTIAL_HARD, 0, "ab|a", "a" },
809 { MU, A, PCRE2_PARTIAL_SOFT, 0, "\\b#", "a" },
810 { MU, A, PCRE2_PARTIAL_SOFT, 0, "(?<=a)b", "a" },
811 { MU, A, PCRE2_PARTIAL_SOFT, 0, "abc|(?<=xxa)bc", "xxab" },
812 { MU, A, PCRE2_PARTIAL_SOFT, 0, "a\\B", "a" },
813 { MU, A, PCRE2_PARTIAL_HARD, 0, "a\\b", "a" },
814
815 /* (*MARK) verb. */
816 { MU, A, 0, 0, "a(*MARK:aa)a", "ababaa" },
817 { MU, A, 0, 0 | F_NOMATCH, "a(*:aa)a", "abab" },
818 { MU, A, 0, 0, "a(*:aa)(b(*:bb)b|bc)", "abc" },
819 { MU, A, 0, 0 | F_NOMATCH, "a(*:1)x|b(*:2)y", "abc" },
820 { MU, A, 0, 0, "(?>a(*:aa))b|ac", "ac" },
821 { MU, A, 0, 0, "(?(DEFINE)(a(*:aa)))(?1)", "a" },
822 { MU, A, 0, 0 | F_NOMATCH, "(?(DEFINE)((a)(*:aa)))(?1)b", "aa" },
823 { MU, A, 0, 0, "(?(DEFINE)(a(*:aa)))a(?1)b|aac", "aac" },
824 { MU, A, 0, 0, "(a(*:aa)){0}(?:b(?1)b|c)+c", "babbab cc" },
825 { MU, A, 0, 0, "(a(*:aa)){0}(?:b(?1)b)+", "babba" },
826 { MU, A, 0, 0 | F_NOMATCH, "(a(*:aa)){0}(?:b(?1)b)+", "ba" },
827 { MU, A, 0, 0, "(a\\K(*:aa)){0}(?:b(?1)b|c)+c", "babbab cc" },
828 { MU, A, 0, 0, "(a\\K(*:aa)){0}(?:b(?1)b)+", "babba" },
829 { MU, A, 0, 0 | F_NOMATCH, "(a\\K(*:aa)){0}(?:b(?1)b)+", "ba" },
830 { MU, A, 0, 0 | F_NOMATCH, "(*:mark)m", "a" },
831
832 /* (*COMMIT) verb. */
833 { MU, A, 0, 0 | F_NOMATCH, "a(*COMMIT)b", "ac" },
834 { MU, A, 0, 0, "aa(*COMMIT)b", "xaxaab" },
835 { MU, A, 0, 0 | F_NOMATCH, "a(*COMMIT)(*:msg)b|ac", "ac" },
836 { MU, A, 0, 0 | F_NOMATCH, "(a(*COMMIT)b)++", "abac" },
837 { MU, A, 0, 0 | F_NOMATCH, "((a)(*COMMIT)b)++", "abac" },
838 { MU, A, 0, 0 | F_NOMATCH, "(?=a(*COMMIT)b)ab|ad", "ad" },
839
840 /* (*PRUNE) verb. */
841 { MU, A, 0, 0, "aa\\K(*PRUNE)b", "aaab" },
842 { MU, A, 0, 0, "aa(*PRUNE:bb)b|a", "aa" },
843 { MU, A, 0, 0, "(a)(a)(*PRUNE)b|(a)", "aa" },
844 { MU, A, 0, 0, "(a)(a)(a)(a)(a)(a)(a)(a)(*PRUNE)b|(a)", "aaaaaaaa" },
845 { MU, A, PCRE2_PARTIAL_SOFT, 0, "a(*PRUNE)a|", "a" },
846 { MU, A, PCRE2_PARTIAL_SOFT, 0, "a(*PRUNE)a|m", "a" },
847 { MU, A, 0, 0 | F_NOMATCH, "(?=a(*PRUNE)b)ab|ad", "ad" },
848 { MU, A, 0, 0, "a(*COMMIT)(*PRUNE)d|bc", "abc" },
849 { MU, A, 0, 0, "(?=a(*COMMIT)b)a(*PRUNE)c|bc", "abc" },
850 { MU, A, 0, 0 | F_NOMATCH, "(*COMMIT)(?=a(*COMMIT)b)a(*PRUNE)c|bc", "abc" },
851 { MU, A, 0, 0, "(?=(a)(*COMMIT)b)a(*PRUNE)c|bc", "abc" },
852 { MU, A, 0, 0 | F_NOMATCH, "(*COMMIT)(?=(a)(*COMMIT)b)a(*PRUNE)c|bc", "abc" },
853 { MU, A, 0, 0, "(a(*COMMIT)b){0}a(?1)(*PRUNE)c|bc", "abc" },
854 { MU, A, 0, 0 | F_NOMATCH, "(a(*COMMIT)b){0}a(*COMMIT)(?1)(*PRUNE)c|bc", "abc" },
855 { MU, A, 0, 0, "(a(*COMMIT)b)++(*PRUNE)d|c", "ababc" },
856 { MU, A, 0, 0 | F_NOMATCH, "(*COMMIT)(a(*COMMIT)b)++(*PRUNE)d|c", "ababc" },
857 { MU, A, 0, 0, "((a)(*COMMIT)b)++(*PRUNE)d|c", "ababc" },
858 { MU, A, 0, 0 | F_NOMATCH, "(*COMMIT)((a)(*COMMIT)b)++(*PRUNE)d|c", "ababc" },
859 { MU, A, 0, 0, "(?>a(*COMMIT)b)*abab(*PRUNE)d|ba", "ababab" },
860 { MU, A, 0, 0 | F_NOMATCH, "(*COMMIT)(?>a(*COMMIT)b)*abab(*PRUNE)d|ba", "ababab" },
861 { MU, A, 0, 0, "(?>a(*COMMIT)b)+abab(*PRUNE)d|ba", "ababab" },
862 { MU, A, 0, 0 | F_NOMATCH, "(*COMMIT)(?>a(*COMMIT)b)+abab(*PRUNE)d|ba", "ababab" },
863 { MU, A, 0, 0, "(?>a(*COMMIT)b)?ab(*PRUNE)d|ba", "aba" },
864 { MU, A, 0, 0 | F_NOMATCH, "(*COMMIT)(?>a(*COMMIT)b)?ab(*PRUNE)d|ba", "aba" },
865 { MU, A, 0, 0, "(?>a(*COMMIT)b)*?n(*PRUNE)d|ba", "abababn" },
866 { MU, A, 0, 0 | F_NOMATCH, "(*COMMIT)(?>a(*COMMIT)b)*?n(*PRUNE)d|ba", "abababn" },
867 { MU, A, 0, 0, "(?>a(*COMMIT)b)+?n(*PRUNE)d|ba", "abababn" },
868 { MU, A, 0, 0 | F_NOMATCH, "(*COMMIT)(?>a(*COMMIT)b)+?n(*PRUNE)d|ba", "abababn" },
869 { MU, A, 0, 0, "(?>a(*COMMIT)b)??n(*PRUNE)d|bn", "abn" },
870 { MU, A, 0, 0 | F_NOMATCH, "(*COMMIT)(?>a(*COMMIT)b)??n(*PRUNE)d|bn", "abn" },
871
872 /* (*SKIP) verb. */
873 { MU, A, 0, 0 | F_NOMATCH, "(?=a(*SKIP)b)ab|ad", "ad" },
874 { MU, A, 0, 0, "(\\w+(*SKIP)#)", "abcd,xyz#," },
875 { MU, A, 0, 0, "\\w+(*SKIP)#|mm", "abcd,xyz#," },
876 { MU, A, 0, 0 | F_NOMATCH, "b+(?<=(*SKIP)#c)|b+", "#bbb" },
877
878 /* (*THEN) verb. */
879 { MU, A, 0, 0, "((?:a(*THEN)|aab)(*THEN)c|a+)+m", "aabcaabcaabcaabcnacm" },
880 { MU, A, 0, 0 | F_NOMATCH, "((?:a(*THEN)|aab)(*THEN)c|a+)+m", "aabcm" },
881 { MU, A, 0, 0, "((?:a(*THEN)|aab)c|a+)+m", "aabcaabcnmaabcaabcm" },
882 { MU, A, 0, 0, "((?:a|aab)(*THEN)c|a+)+m", "aam" },
883 { MU, A, 0, 0, "((?:a(*COMMIT)|aab)(*THEN)c|a+)+m", "aam" },
884 { MU, A, 0, 0, "(?(?=a(*THEN)b)ab|ad)", "ad" },
885 { MU, A, 0, 0, "(?(?!a(*THEN)b)ad|add)", "add" },
886 { MU, A, 0, 0 | F_NOMATCH, "(?(?=a)a(*THEN)b|ad)", "ad" },
887 { MU, A, 0, 0, "(?!(?(?=a)ab|b(*THEN)d))bn|bnn", "bnn" },
888 { MU, A, 0, 0, "(?=(*THEN: ))* ", " " },
889 { MU, A, 0, 0, "a(*THEN)(?R) |", "a" },
890 { MU, A, 0, 0 | F_NOMATCH, "(?<!(*THEN)a|(*THEN)b|(*THEN)ab?|(*THEN)ba?|)", "c" },
891
892 /* Recurse and control verbs. */
893 { MU, A, 0, 0, "(a(*ACCEPT)b){0}a(?1)b", "aacaabb" },
894 { MU, A, 0, 0, "((a)\\2(*ACCEPT)b){0}a(?1)b", "aaacaaabb" },
895 { MU, A, 0, 0, "((ab|a(*ACCEPT)x)+|ababababax){0}_(?1)_", "_ababababax_ _ababababa_" },
896 { MU, A, 0, 0, "((.)(?:A(*ACCEPT)|(?1)\\2)){0}_(?1)_", "_bcdaAdcb_bcdaAdcb_" },
897 { MU, A, 0, 0, "((*MARK:m)(?:a|a(*COMMIT)b|aa)){0}_(?1)_", "_ab_" },
898 { MU, A, 0, 0, "((*MARK:m)(?:a|a(*COMMIT)b|aa)){0}_(?1)_|(_aa_)", "_aa_" },
899 { MU, A, 0, 0, "(a(*COMMIT)(?:b|bb)|c(*ACCEPT)d|dd){0}_(?1)+_", "_ax_ _cd_ _abbb_ _abcd_ _abbcdd_" },
900 { MU, A, 0, 0, "((.)(?:.|(*COMMIT)\\2{3}(*ACCEPT).*|.*)){0}_(?1){0,4}_", "_aaaabbbbccccddd_ _aaaabbbbccccdddd_" },
901
902 #ifdef SUPPORT_UNICODE
903 /* Script runs and iterations. */
904 { MU, A, 0, 0, "!(*sr:\\w\\w|\\w\\w\\w)*#", "!abcdefghijklmno!abcdefghijklmno!abcdef#" },
905 { MU, A, 0, 0, "!(*sr:\\w\\w|\\w\\w\\w)+#", "!abcdefghijklmno!abcdefghijklmno!abcdef#" },
906 { MU, A, 0, 0, "!(*sr:\\w\\w|\\w\\w\\w)*?#", "!abcdefghijklmno!abcdefghijklmno!abcdef#" },
907 { MU, A, 0, 0, "!(*sr:\\w\\w|\\w\\w\\w)+?#", "!abcdefghijklmno!abcdefghijklmno!abcdef#" },
908 { MU, A, 0, 0, "!(*sr:\\w\\w|\\w\\w\\w)*+#", "!abcdefghijklmno!abcdefghijklmno!abcdef#" },
909 { MU, A, 0, 0, "!(*sr:\\w\\w|\\w\\w\\w)++#", "!abcdefghijklmno!abcdefghijklmno!abcdef#" },
910 { MU, A, 0, 0, "!(*sr:\\w\\w|\\w\\w\\w)?#", "!ab!abc!ab!ab#" },
911 { MU, A, 0, 0, "!(*sr:\\w\\w|\\w\\w\\w)??#", "!ab!abc!ab!ab#" },
912 #endif /* SUPPORT_UNICODE */
913
914 /* Deep recursion. */
915 { MU, A, 0, 0, "((((?:(?:(?:\\w)+)?)*|(?>\\w)+?)+|(?>\\w)?\?)*)?\\s", "aaaaa+ " },
916 { MU, A, 0, 0, "(?:((?:(?:(?:\\w*?)+)??|(?>\\w)?|\\w*+)*)+)+?\\s", "aa+ " },
917 { MU, A, 0, 0, "((a?)+)+b", "aaaaaaaaaaaa b" },
918
919 /* Deep recursion: Stack limit reached. */
920 { M, A, 0, 0 | F_NOMATCH, "a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?a?aaaaaaaaaaaaaaaaaaaaaaa", "aaaaaaaaaaaaaaaaaaaaaaa" },
921 { M, A, 0, 0 | F_NOMATCH, "(?:a+)+b", "aaaaaaaaaaaaaaaaaaaaaaaa b" },
922 { M, A, 0, 0 | F_NOMATCH, "(?:a+?)+?b", "aaaaaaaaaaaaaaaaaaaaaaaa b" },
923 { M, A, 0, 0 | F_NOMATCH, "(?:a*)*b", "aaaaaaaaaaaaaaaaaaaaaaaa b" },
924 { M, A, 0, 0 | F_NOMATCH, "(?:a*?)*?b", "aaaaaaaaaaaaaaaaaaaaaaaa b" },
925
926 { 0, 0, 0, 0, NULL, NULL }
927 };
928
929 #ifdef SUPPORT_PCRE2_8
callback8(void * arg)930 static pcre2_jit_stack_8* callback8(void *arg)
931 {
932 return (pcre2_jit_stack_8 *)arg;
933 }
934 #endif
935
936 #ifdef SUPPORT_PCRE2_16
callback16(void * arg)937 static pcre2_jit_stack_16* callback16(void *arg)
938 {
939 return (pcre2_jit_stack_16 *)arg;
940 }
941 #endif
942
943 #ifdef SUPPORT_PCRE2_32
callback32(void * arg)944 static pcre2_jit_stack_32* callback32(void *arg)
945 {
946 return (pcre2_jit_stack_32 *)arg;
947 }
948 #endif
949
950 #ifdef SUPPORT_PCRE2_8
951 static pcre2_jit_stack_8 *stack8;
952
getstack8(void)953 static pcre2_jit_stack_8 *getstack8(void)
954 {
955 if (!stack8)
956 stack8 = pcre2_jit_stack_create_8(1, 1024 * 1024, NULL);
957 return stack8;
958 }
959
/* With a non-NULL match context, registers callback8 together with the
shared 8 bit JIT stack on that context. With a NULL match context, frees the
shared stack (if one exists) and resets the pointer. */
static void setstack8(pcre2_match_context_8 *mcontext)
{
	if (mcontext != NULL) {
		pcre2_jit_stack_assign_8(mcontext, callback8, getstack8());
		return;
	}

	/* Tear-down request. */
	if (stack8 != NULL)
		pcre2_jit_stack_free_8(stack8);
	stack8 = NULL;
}
971 #endif /* SUPPORT_PCRE2_8 */
972
973 #ifdef SUPPORT_PCRE2_16
974 static pcre2_jit_stack_16 *stack16;
975
getstack16(void)976 static pcre2_jit_stack_16 *getstack16(void)
977 {
978 if (!stack16)
979 stack16 = pcre2_jit_stack_create_16(1, 1024 * 1024, NULL);
980 return stack16;
981 }
982
/* With a non-NULL match context, registers callback16 together with the
shared 16 bit JIT stack on that context. With a NULL match context, frees the
shared stack (if one exists) and resets the pointer. */
static void setstack16(pcre2_match_context_16 *mcontext)
{
	if (mcontext != NULL) {
		pcre2_jit_stack_assign_16(mcontext, callback16, getstack16());
		return;
	}

	/* Tear-down request. */
	if (stack16 != NULL)
		pcre2_jit_stack_free_16(stack16);
	stack16 = NULL;
}
994 #endif /* SUPPORT_PCRE2_16 */
995
996 #ifdef SUPPORT_PCRE2_32
997 static pcre2_jit_stack_32 *stack32;
998
getstack32(void)999 static pcre2_jit_stack_32 *getstack32(void)
1000 {
1001 if (!stack32)
1002 stack32 = pcre2_jit_stack_create_32(1, 1024 * 1024, NULL);
1003 return stack32;
1004 }
1005
/* With a non-NULL match context, registers callback32 together with the
shared 32 bit JIT stack on that context. With a NULL match context, frees the
shared stack (if one exists) and resets the pointer. */
static void setstack32(pcre2_match_context_32 *mcontext)
{
	if (mcontext != NULL) {
		pcre2_jit_stack_assign_32(mcontext, callback32, getstack32());
		return;
	}

	/* Tear-down request. */
	if (stack32 != NULL)
		pcre2_jit_stack_free_32(stack32);
	stack32 = NULL;
}
1017 #endif /* SUPPORT_PCRE2_32 */
1018
1019 #ifdef SUPPORT_PCRE2_16
1020
/* Converts the zero-terminated UTF-8 string "input" to UTF-16 code units.

Arguments:
  input       zero-terminated source bytes
  output      destination; receives at most max_length code units, the last
              one being the terminating zero
  offsetmap   if not NULL, entry [k] receives the byte offset in "input" of
              the character that produced output unit k; for a surrogate
              pair only the entry of the first unit is written, the entry of
              the second unit is skipped. One extra entry after the last
              character holds the offset at which conversion stopped.
  max_length  capacity of "output" in code units (nothing is written and 0
              is returned when it is 0)

Returns:      number of code units stored, not counting the terminator

The continuation bytes are not validated beyond being present, so encoded
surrogates and similar test data are passed through. Lead bytes 0xf8-0xff
and sequences cut short by the terminating zero are copied as a single raw
byte: this guarantees that the input pointer always advances and never moves
past the terminator. */
static int convert_utf8_to_utf16(PCRE2_SPTR8 input, PCRE2_UCHAR16 *output, int *offsetmap, int max_length)
{
	PCRE2_SPTR8 iptr = input;
	PCRE2_UCHAR16 *optr = output;
	unsigned int c;
	int extra;
	int i;

	if (max_length == 0)
		return 0;

	while (*iptr && max_length > 1) {
		if (offsetmap)
			*offsetmap++ = (int)(iptr - input);

		/* Number of continuation bytes announced by the lead byte. */
		if (*iptr < 0xc0)
			extra = 0;
		else if (!(*iptr & 0x20))
			extra = 1;
		else if (!(*iptr & 0x10))
			extra = 2;
		else if (!(*iptr & 0x08))
			extra = 3;
		else
			extra = 0; /* 0xf8-0xff is not a lead byte: copy it as is. */

		/* Do not consume a sequence that runs into the terminating zero.
		The bytes are inspected in order, so nothing after the terminator
		is ever read. */
		for (i = 1; i <= extra; i++) {
			if (!iptr[i]) {
				extra = 0;
				break;
			}
		}

		switch (extra) {
		case 1:
			c = ((iptr[0] & 0x1f) << 6) | (iptr[1] & 0x3f);
			break;
		case 2:
			c = ((iptr[0] & 0x0f) << 12) | ((iptr[1] & 0x3f) << 6) | (iptr[2] & 0x3f);
			break;
		case 3:
			c = ((iptr[0] & 0x07) << 18) | ((iptr[1] & 0x3f) << 12) | ((iptr[2] & 0x3f) << 6) | (iptr[3] & 0x3f);
			break;
		default:
			c = iptr[0];
			break;
		}
		iptr += extra + 1;

		if (c < 65536) {
			*optr++ = (PCRE2_UCHAR16)c;
			max_length--;
		} else if (max_length <= 2) {
			/* No room for a surrogate pair plus the terminator. The
			offsetmap entry written above already marks where the
			conversion stopped. */
			*optr = '\0';
			return (int)(optr - output);
		} else {
			c -= 0x10000;
			*optr++ = (PCRE2_UCHAR16)(0xd800 | ((c >> 10) & 0x3ff));
			*optr++ = (PCRE2_UCHAR16)(0xdc00 | (c & 0x3ff));
			max_length -= 2;
			/* Skip the map entry that belongs to the low surrogate. */
			if (offsetmap)
				offsetmap++;
		}
	}
	if (offsetmap)
		*offsetmap = (int)(iptr - input);
	*optr = '\0';
	return (int)(optr - output);
}
1068
/* Widens the zero-terminated byte string "input" to 16 bit code units, one
unit per byte and without any decoding. At most max_length - 1 units are
copied and a terminating zero is appended. Returns the number of units copied
(the terminator is not counted); when max_length is 0 nothing is written and
0 is returned. */
static int copy_char8_to_char16(PCRE2_SPTR8 input, PCRE2_UCHAR16 *output, int max_length)
{
	int n;

	if (max_length == 0)
		return 0;

	for (n = 0; input[n] != 0 && max_length - n > 1; n++)
		output[n] = input[n];

	output[n] = 0;
	return n;
}
1084
/* Capacity, in 16 bit code units, of the static conversion buffer below; it
is also the element count of the offset map. */
#define REGTEST_MAX_LENGTH16 4096
/* Destination buffer for test strings converted to 16 bit code units. */
static PCRE2_UCHAR16 regtest_buf16[REGTEST_MAX_LENGTH16];
/* One int per buffer slot; presumably passed as the "offsetmap" argument of
convert_utf8_to_utf16() - its use lies outside this part of the file. */
static int regtest_offsetmap16[REGTEST_MAX_LENGTH16];
1088
1089 #endif /* SUPPORT_PCRE2_16 */
1090
1091 #ifdef SUPPORT_PCRE2_32
1092
/* Converts the zero-terminated UTF-8 string "input" to UTF-32 code units.

Arguments:
  input       zero-terminated source bytes
  output      destination; receives at most max_length code units, the last
              one being the terminating zero
  offsetmap   if not NULL, entry [k] receives the byte offset in "input" of
              the character that produced output unit k. One extra entry
              after the last character holds the offset at which conversion
              stopped.
  max_length  capacity of "output" in code units (nothing is written and 0
              is returned when it is 0)

Returns:      number of code units stored, not counting the terminator

The continuation bytes are not validated beyond being present, so encoded
surrogates and similar test data are passed through. Lead bytes 0xf8-0xff
and sequences cut short by the terminating zero are copied as a single raw
byte: this guarantees that the input pointer always advances and never moves
past the terminator. */
static int convert_utf8_to_utf32(PCRE2_SPTR8 input, PCRE2_UCHAR32 *output, int *offsetmap, int max_length)
{
	PCRE2_SPTR8 iptr = input;
	PCRE2_UCHAR32 *optr = output;
	unsigned int c;
	int extra;
	int i;

	if (max_length == 0)
		return 0;

	while (*iptr && max_length > 1) {
		if (offsetmap)
			*offsetmap++ = (int)(iptr - input);

		/* Number of continuation bytes announced by the lead byte. */
		if (*iptr < 0xc0)
			extra = 0;
		else if (!(*iptr & 0x20))
			extra = 1;
		else if (!(*iptr & 0x10))
			extra = 2;
		else if (!(*iptr & 0x08))
			extra = 3;
		else
			extra = 0; /* 0xf8-0xff is not a lead byte: copy it as is. */

		/* Do not consume a sequence that runs into the terminating zero.
		The bytes are inspected in order, so nothing after the terminator
		is ever read. */
		for (i = 1; i <= extra; i++) {
			if (!iptr[i]) {
				extra = 0;
				break;
			}
		}

		switch (extra) {
		case 1:
			c = ((iptr[0] & 0x1f) << 6) | (iptr[1] & 0x3f);
			break;
		case 2:
			c = ((iptr[0] & 0x0f) << 12) | ((iptr[1] & 0x3f) << 6) | (iptr[2] & 0x3f);
			break;
		case 3:
			c = ((iptr[0] & 0x07) << 18) | ((iptr[1] & 0x3f) << 12) | ((iptr[2] & 0x3f) << 6) | (iptr[3] & 0x3f);
			break;
		default:
			c = iptr[0];
			break;
		}
		iptr += extra + 1;

		*optr++ = c;
		max_length--;
	}
	if (offsetmap)
		*offsetmap = (int)(iptr - input);
	*optr = 0;
	return (int)(optr - output);
}
1128
/* Widens the zero-terminated byte string "input" to 32 bit code units, one
unit per byte and without any decoding. At most max_length - 1 units are
copied and a terminating zero is appended. Returns the number of units copied
(the terminator is not counted); when max_length is 0 nothing is written and
0 is returned. */
static int copy_char8_to_char32(PCRE2_SPTR8 input, PCRE2_UCHAR32 *output, int max_length)
{
	int n;

	if (max_length == 0)
		return 0;

	for (n = 0; input[n] != 0 && max_length - n > 1; n++)
		output[n] = input[n];

	output[n] = 0;
	return n;
}
1144
/* Element count shared by the two static arrays below. */
#define REGTEST_MAX_LENGTH32 4096
/* Buffer of 32 bit code units; presumably the destination for test strings
converted by convert_utf8_to_utf32() / copy_char8_to_char32() - its use lies
outside this part of the file. */
static PCRE2_UCHAR32 regtest_buf32[REGTEST_MAX_LENGTH32];
/* One int per buffer slot; presumably passed as the "offsetmap" argument of
convert_utf8_to_utf32() - its use lies outside this part of the file. */
static int regtest_offsetmap32[REGTEST_MAX_LENGTH32];
1148
1149 #endif /* SUPPORT_PCRE2_32 */
1150
/* Returns 1 when every byte of the zero-terminated string "input" is in the
range 0-127 (an empty string qualifies), and 0 as soon as a byte above 127 is
found. The bytes are examined as unsigned values so the result does not
depend on the signedness of plain char. */
static int check_ascii(const char *input)
{
	const unsigned char *p;

	for (p = (const unsigned char *)input; *p != 0; p++) {
		if (*p >= 128)
			return 0;
	}
	return 1;
}
1161
1162 #define OVECTOR_SIZE 15
1163
regression_tests(void)1164 static int regression_tests(void)
1165 {
1166 struct regression_test_case *current = regression_test_cases;
1167 int error;
1168 PCRE2_SIZE err_offs;
1169 int is_successful;
1170 int is_ascii;
1171 int total = 0;
1172 int successful = 0;
1173 int successful_row = 0;
1174 int counter = 0;
1175 int jit_compile_mode;
1176 int utf = 0;
1177 uint32_t disabled_options = 0;
1178 int i;
1179 #ifdef SUPPORT_PCRE2_8
1180 pcre2_code_8 *re8;
1181 pcre2_compile_context_8 *ccontext8;
1182 pcre2_match_data_8 *mdata8_1;
1183 pcre2_match_data_8 *mdata8_2;
1184 pcre2_match_context_8 *mcontext8;
1185 PCRE2_SIZE *ovector8_1 = NULL;
1186 PCRE2_SIZE *ovector8_2 = NULL;
1187 int return_value8[2];
1188 #endif
1189 #ifdef SUPPORT_PCRE2_16
1190 pcre2_code_16 *re16;
1191 pcre2_compile_context_16 *ccontext16;
1192 pcre2_match_data_16 *mdata16_1;
1193 pcre2_match_data_16 *mdata16_2;
1194 pcre2_match_context_16 *mcontext16;
1195 PCRE2_SIZE *ovector16_1 = NULL;
1196 PCRE2_SIZE *ovector16_2 = NULL;
1197 int return_value16[2];
1198 int length16;
1199 #endif
1200 #ifdef SUPPORT_PCRE2_32
1201 pcre2_code_32 *re32;
1202 pcre2_compile_context_32 *ccontext32;
1203 pcre2_match_data_32 *mdata32_1;
1204 pcre2_match_data_32 *mdata32_2;
1205 pcre2_match_context_32 *mcontext32;
1206 PCRE2_SIZE *ovector32_1 = NULL;
1207 PCRE2_SIZE *ovector32_2 = NULL;
1208 int return_value32[2];
1209 int length32;
1210 #endif
1211
1212 #if defined SUPPORT_PCRE2_8
1213 PCRE2_UCHAR8 cpu_info[128];
1214 #elif defined SUPPORT_PCRE2_16
1215 PCRE2_UCHAR16 cpu_info[128];
1216 #elif defined SUPPORT_PCRE2_32
1217 PCRE2_UCHAR32 cpu_info[128];
1218 #endif
1219 #if defined SUPPORT_UNICODE && ((defined(SUPPORT_PCRE2_8) + defined(SUPPORT_PCRE2_16) + defined(SUPPORT_PCRE2_32)) >= 2)
1220 int return_value;
1221 #endif
1222
1223 /* This test compares the behaviour of interpreter and JIT. Although disabling
1224 utf or ucp may make tests fail, if the pcre2_match result is the SAME, it is
1225 still considered successful from pcre2_jit_test point of view. */
1226
1227 #if defined SUPPORT_PCRE2_8
1228 pcre2_config_8(PCRE2_CONFIG_JITTARGET, &cpu_info);
1229 #elif defined SUPPORT_PCRE2_16
1230 pcre2_config_16(PCRE2_CONFIG_JITTARGET, &cpu_info);
1231 #elif defined SUPPORT_PCRE2_32
1232 pcre2_config_32(PCRE2_CONFIG_JITTARGET, &cpu_info);
1233 #endif
1234
1235 printf("Running JIT regression tests\n");
1236 printf(" target CPU of SLJIT compiler: ");
1237 for (i = 0; cpu_info[i]; i++)
1238 printf("%c", (char)(cpu_info[i]));
1239 printf("\n");
1240
1241 #if defined SUPPORT_PCRE2_8
1242 pcre2_config_8(PCRE2_CONFIG_UNICODE, &utf);
1243 #elif defined SUPPORT_PCRE2_16
1244 pcre2_config_16(PCRE2_CONFIG_UNICODE, &utf);
1245 #elif defined SUPPORT_PCRE2_32
1246 pcre2_config_32(PCRE2_CONFIG_UNICODE, &utf);
1247 #endif
1248
1249 if (!utf)
1250 disabled_options |= PCRE2_UTF;
1251 #ifdef SUPPORT_PCRE2_8
1252 printf(" in 8 bit mode with UTF-8 %s:\n", utf ? "enabled" : "disabled");
1253 #endif
1254 #ifdef SUPPORT_PCRE2_16
1255 printf(" in 16 bit mode with UTF-16 %s:\n", utf ? "enabled" : "disabled");
1256 #endif
1257 #ifdef SUPPORT_PCRE2_32
1258 printf(" in 32 bit mode with UTF-32 %s:\n", utf ? "enabled" : "disabled");
1259 #endif
1260
1261 while (current->pattern) {
1262 /* printf("\nPattern: %s :\n", current->pattern); */
1263 total++;
1264 is_ascii = 0;
1265 if (!(current->start_offset & F_PROPERTY))
1266 is_ascii = check_ascii(current->pattern) && check_ascii(current->input);
1267
1268 if (current->match_options & PCRE2_PARTIAL_SOFT)
1269 jit_compile_mode = PCRE2_JIT_PARTIAL_SOFT;
1270 else if (current->match_options & PCRE2_PARTIAL_HARD)
1271 jit_compile_mode = PCRE2_JIT_PARTIAL_HARD;
1272 else
1273 jit_compile_mode = PCRE2_JIT_COMPLETE;
1274 error = 0;
1275 #ifdef SUPPORT_PCRE2_8
1276 re8 = NULL;
1277 ccontext8 = pcre2_compile_context_create_8(NULL);
1278 if (ccontext8) {
1279 if (GET_NEWLINE(current->newline))
1280 pcre2_set_newline_8(ccontext8, GET_NEWLINE(current->newline));
1281 if (GET_BSR(current->newline))
1282 pcre2_set_bsr_8(ccontext8, GET_BSR(current->newline));
1283
1284 if (!(current->start_offset & F_NO8)) {
1285 re8 = pcre2_compile_8((PCRE2_SPTR8)current->pattern, PCRE2_ZERO_TERMINATED,
1286 current->compile_options & ~disabled_options,
1287 &error, &err_offs, ccontext8);
1288
1289 if (!re8 && (utf || is_ascii))
1290 printf("\n8 bit: Cannot compile pattern \"%s\": %d\n", current->pattern, error);
1291 }
1292 pcre2_compile_context_free_8(ccontext8);
1293 }
1294 else
1295 printf("\n8 bit: Cannot allocate compile context\n");
1296 #endif
1297 #ifdef SUPPORT_PCRE2_16
1298 if ((current->compile_options & PCRE2_UTF) || (current->start_offset & F_FORCECONV))
1299 convert_utf8_to_utf16((PCRE2_SPTR8)current->pattern, regtest_buf16, NULL, REGTEST_MAX_LENGTH16);
1300 else
1301 copy_char8_to_char16((PCRE2_SPTR8)current->pattern, regtest_buf16, REGTEST_MAX_LENGTH16);
1302
1303 re16 = NULL;
1304 ccontext16 = pcre2_compile_context_create_16(NULL);
1305 if (ccontext16) {
1306 if (GET_NEWLINE(current->newline))
1307 pcre2_set_newline_16(ccontext16, GET_NEWLINE(current->newline));
1308 if (GET_BSR(current->newline))
1309 pcre2_set_bsr_16(ccontext16, GET_BSR(current->newline));
1310
1311 if (!(current->start_offset & F_NO16)) {
1312 re16 = pcre2_compile_16(regtest_buf16, PCRE2_ZERO_TERMINATED,
1313 current->compile_options & ~disabled_options,
1314 &error, &err_offs, ccontext16);
1315
1316 if (!re16 && (utf || is_ascii))
1317 printf("\n16 bit: Cannot compile pattern \"%s\": %d\n", current->pattern, error);
1318 }
1319 pcre2_compile_context_free_16(ccontext16);
1320 }
1321 else
1322 printf("\n16 bit: Cannot allocate compile context\n");
1323 #endif
1324 #ifdef SUPPORT_PCRE2_32
1325 if ((current->compile_options & PCRE2_UTF) || (current->start_offset & F_FORCECONV))
1326 convert_utf8_to_utf32((PCRE2_SPTR8)current->pattern, regtest_buf32, NULL, REGTEST_MAX_LENGTH32);
1327 else
1328 copy_char8_to_char32((PCRE2_SPTR8)current->pattern, regtest_buf32, REGTEST_MAX_LENGTH32);
1329
1330 re32 = NULL;
1331 ccontext32 = pcre2_compile_context_create_32(NULL);
1332 if (ccontext32) {
1333 if (GET_NEWLINE(current->newline))
1334 pcre2_set_newline_32(ccontext32, GET_NEWLINE(current->newline));
1335 if (GET_BSR(current->newline))
1336 pcre2_set_bsr_32(ccontext32, GET_BSR(current->newline));
1337
1338 if (!(current->start_offset & F_NO32)) {
1339 re32 = pcre2_compile_32(regtest_buf32, PCRE2_ZERO_TERMINATED,
1340 current->compile_options & ~disabled_options,
1341 &error, &err_offs, ccontext32);
1342
1343 if (!re32 && (utf || is_ascii))
1344 printf("\n32 bit: Cannot compile pattern \"%s\": %d\n", current->pattern, error);
1345 }
1346 pcre2_compile_context_free_32(ccontext32);
1347 }
1348 else
1349 printf("\n32 bit: Cannot allocate compile context\n");
1350 #endif
1351
1352 counter++;
1353 if ((counter & 0x3) != 0) {
1354 #ifdef SUPPORT_PCRE2_8
1355 setstack8(NULL);
1356 #endif
1357 #ifdef SUPPORT_PCRE2_16
1358 setstack16(NULL);
1359 #endif
1360 #ifdef SUPPORT_PCRE2_32
1361 setstack32(NULL);
1362 #endif
1363 }
1364
1365 #ifdef SUPPORT_PCRE2_8
1366 return_value8[0] = -1000;
1367 return_value8[1] = -1000;
1368 mdata8_1 = pcre2_match_data_create_8(OVECTOR_SIZE, NULL);
1369 mdata8_2 = pcre2_match_data_create_8(OVECTOR_SIZE, NULL);
1370 mcontext8 = pcre2_match_context_create_8(NULL);
1371 if (!mdata8_1 || !mdata8_2 || !mcontext8) {
1372 printf("\n8 bit: Cannot allocate match data\n");
1373 pcre2_match_data_free_8(mdata8_1);
1374 pcre2_match_data_free_8(mdata8_2);
1375 pcre2_match_context_free_8(mcontext8);
1376 pcre2_code_free_8(re8);
1377 re8 = NULL;
1378 } else {
1379 ovector8_1 = pcre2_get_ovector_pointer_8(mdata8_1);
1380 ovector8_2 = pcre2_get_ovector_pointer_8(mdata8_2);
1381 for (i = 0; i < OVECTOR_SIZE * 2; ++i)
1382 ovector8_1[i] = (PCRE2_SIZE)(-2);
1383 for (i = 0; i < OVECTOR_SIZE * 2; ++i)
1384 ovector8_2[i] = (PCRE2_SIZE)(-2);
1385 pcre2_set_match_limit_8(mcontext8, 10000000);
1386 }
1387 if (re8) {
1388 return_value8[1] = pcre2_match_8(re8, (PCRE2_SPTR8)current->input, strlen(current->input),
1389 current->start_offset & OFFSET_MASK, current->match_options, mdata8_2, mcontext8);
1390
1391 if (pcre2_jit_compile_8(re8, jit_compile_mode)) {
1392 printf("\n8 bit: JIT compiler does not support \"%s\"\n", current->pattern);
1393 } else if ((counter & 0x1) != 0) {
1394 setstack8(mcontext8);
1395 return_value8[0] = pcre2_match_8(re8, (PCRE2_SPTR8)current->input, strlen(current->input),
1396 current->start_offset & OFFSET_MASK, current->match_options, mdata8_1, mcontext8);
1397 } else {
1398 pcre2_jit_stack_assign_8(mcontext8, NULL, getstack8());
1399 return_value8[0] = pcre2_jit_match_8(re8, (PCRE2_SPTR8)current->input, strlen(current->input),
1400 current->start_offset & OFFSET_MASK, current->match_options, mdata8_1, mcontext8);
1401 }
1402 }
1403 #endif
1404
1405 #ifdef SUPPORT_PCRE2_16
1406 return_value16[0] = -1000;
1407 return_value16[1] = -1000;
1408 mdata16_1 = pcre2_match_data_create_16(OVECTOR_SIZE, NULL);
1409 mdata16_2 = pcre2_match_data_create_16(OVECTOR_SIZE, NULL);
1410 mcontext16 = pcre2_match_context_create_16(NULL);
1411 if (!mdata16_1 || !mdata16_2 || !mcontext16) {
1412 printf("\n16 bit: Cannot allocate match data\n");
1413 pcre2_match_data_free_16(mdata16_1);
1414 pcre2_match_data_free_16(mdata16_2);
1415 pcre2_match_context_free_16(mcontext16);
1416 pcre2_code_free_16(re16);
1417 re16 = NULL;
1418 } else {
1419 ovector16_1 = pcre2_get_ovector_pointer_16(mdata16_1);
1420 ovector16_2 = pcre2_get_ovector_pointer_16(mdata16_2);
1421 for (i = 0; i < OVECTOR_SIZE * 2; ++i)
1422 ovector16_1[i] = (PCRE2_SIZE)(-2);
1423 for (i = 0; i < OVECTOR_SIZE * 2; ++i)
1424 ovector16_2[i] = (PCRE2_SIZE)(-2);
1425 pcre2_set_match_limit_16(mcontext16, 10000000);
1426 }
1427 if (re16) {
1428 if ((current->compile_options & PCRE2_UTF) || (current->start_offset & F_FORCECONV))
1429 length16 = convert_utf8_to_utf16((PCRE2_SPTR8)current->input, regtest_buf16, regtest_offsetmap16, REGTEST_MAX_LENGTH16);
1430 else
1431 length16 = copy_char8_to_char16((PCRE2_SPTR8)current->input, regtest_buf16, REGTEST_MAX_LENGTH16);
1432
1433 return_value16[1] = pcre2_match_16(re16, regtest_buf16, length16,
1434 current->start_offset & OFFSET_MASK, current->match_options, mdata16_2, mcontext16);
1435
1436 if (pcre2_jit_compile_16(re16, jit_compile_mode)) {
1437 printf("\n16 bit: JIT compiler does not support \"%s\"\n", current->pattern);
1438 } else if ((counter & 0x1) != 0) {
1439 setstack16(mcontext16);
1440 return_value16[0] = pcre2_match_16(re16, regtest_buf16, length16,
1441 current->start_offset & OFFSET_MASK, current->match_options, mdata16_1, mcontext16);
1442 } else {
1443 pcre2_jit_stack_assign_16(mcontext16, NULL, getstack16());
1444 return_value16[0] = pcre2_jit_match_16(re16, regtest_buf16, length16,
1445 current->start_offset & OFFSET_MASK, current->match_options, mdata16_1, mcontext16);
1446 }
1447 }
1448 #endif
1449
1450 #ifdef SUPPORT_PCRE2_32
1451 return_value32[0] = -1000;
1452 return_value32[1] = -1000;
1453 mdata32_1 = pcre2_match_data_create_32(OVECTOR_SIZE, NULL);
1454 mdata32_2 = pcre2_match_data_create_32(OVECTOR_SIZE, NULL);
1455 mcontext32 = pcre2_match_context_create_32(NULL);
1456 if (!mdata32_1 || !mdata32_2 || !mcontext32) {
1457 printf("\n32 bit: Cannot allocate match data\n");
1458 pcre2_match_data_free_32(mdata32_1);
1459 pcre2_match_data_free_32(mdata32_2);
1460 pcre2_match_context_free_32(mcontext32);
1461 pcre2_code_free_32(re32);
1462 re32 = NULL;
1463 } else {
1464 ovector32_1 = pcre2_get_ovector_pointer_32(mdata32_1);
1465 ovector32_2 = pcre2_get_ovector_pointer_32(mdata32_2);
1466 for (i = 0; i < OVECTOR_SIZE * 2; ++i)
1467 ovector32_1[i] = (PCRE2_SIZE)(-2);
1468 for (i = 0; i < OVECTOR_SIZE * 2; ++i)
1469 ovector32_2[i] = (PCRE2_SIZE)(-2);
1470 pcre2_set_match_limit_32(mcontext32, 10000000);
1471 }
1472 if (re32) {
1473 if ((current->compile_options & PCRE2_UTF) || (current->start_offset & F_FORCECONV))
1474 length32 = convert_utf8_to_utf32((PCRE2_SPTR8)current->input, regtest_buf32, regtest_offsetmap32, REGTEST_MAX_LENGTH32);
1475 else
1476 length32 = copy_char8_to_char32((PCRE2_SPTR8)current->input, regtest_buf32, REGTEST_MAX_LENGTH32);
1477
1478 return_value32[1] = pcre2_match_32(re32, regtest_buf32, length32,
1479 current->start_offset & OFFSET_MASK, current->match_options, mdata32_2, mcontext32);
1480
1481 if (pcre2_jit_compile_32(re32, jit_compile_mode)) {
1482 printf("\n32 bit: JIT compiler does not support \"%s\"\n", current->pattern);
1483 } else if ((counter & 0x1) != 0) {
1484 setstack32(mcontext32);
1485 return_value32[0] = pcre2_match_32(re32, regtest_buf32, length32,
1486 current->start_offset & OFFSET_MASK, current->match_options, mdata32_1, mcontext32);
1487 } else {
1488 pcre2_jit_stack_assign_32(mcontext32, NULL, getstack32());
1489 return_value32[0] = pcre2_jit_match_32(re32, regtest_buf32, length32,
1490 current->start_offset & OFFSET_MASK, current->match_options, mdata32_1, mcontext32);
1491 }
1492 }
1493 #endif
1494
1495 /* printf("[%d-%d-%d|%d-%d|%d-%d|%d-%d]%s",
1496 return_value8[0], return_value16[0], return_value32[0],
1497 (int)ovector8_1[0], (int)ovector8_1[1],
1498 (int)ovector16_1[0], (int)ovector16_1[1],
1499 (int)ovector32_1[0], (int)ovector32_1[1],
1500 (current->compile_options & PCRE2_CASELESS) ? "C" : ""); */
1501
1502 /* If F_DIFF is set, just run the test, but do not compare the results.
1503 Segfaults can still be captured. */
1504
1505 is_successful = 1;
1506 if (!(current->start_offset & F_DIFF)) {
1507 #if defined SUPPORT_UNICODE && ((defined(SUPPORT_PCRE2_8) + defined(SUPPORT_PCRE2_16) + defined(SUPPORT_PCRE2_32)) >= 2)
1508 if (!(current->start_offset & F_FORCECONV)) {
1509
1510 /* All results must be the same. */
1511 #ifdef SUPPORT_PCRE2_8
1512 if ((return_value = return_value8[0]) != return_value8[1]) {
1513 printf("\n8 bit: Return value differs(J8:%d,I8:%d): [%d] '%s' @ '%s'\n",
1514 return_value8[0], return_value8[1], total, current->pattern, current->input);
1515 is_successful = 0;
1516 } else
1517 #endif
1518 #ifdef SUPPORT_PCRE2_16
1519 if ((return_value = return_value16[0]) != return_value16[1]) {
1520 printf("\n16 bit: Return value differs(J16:%d,I16:%d): [%d] '%s' @ '%s'\n",
1521 return_value16[0], return_value16[1], total, current->pattern, current->input);
1522 is_successful = 0;
1523 } else
1524 #endif
1525 #ifdef SUPPORT_PCRE2_32
1526 if ((return_value = return_value32[0]) != return_value32[1]) {
1527 printf("\n32 bit: Return value differs(J32:%d,I32:%d): [%d] '%s' @ '%s'\n",
1528 return_value32[0], return_value32[1], total, current->pattern, current->input);
1529 is_successful = 0;
1530 } else
1531 #endif
1532 #if defined SUPPORT_PCRE2_8 && defined SUPPORT_PCRE2_16
1533 if (return_value8[0] != return_value16[0]) {
1534 printf("\n8 and 16 bit: Return value differs(J8:%d,J16:%d): [%d] '%s' @ '%s'\n",
1535 return_value8[0], return_value16[0],
1536 total, current->pattern, current->input);
1537 is_successful = 0;
1538 } else
1539 #endif
1540 #if defined SUPPORT_PCRE2_8 && defined SUPPORT_PCRE2_32
1541 if (return_value8[0] != return_value32[0]) {
1542 printf("\n8 and 32 bit: Return value differs(J8:%d,J32:%d): [%d] '%s' @ '%s'\n",
1543 return_value8[0], return_value32[0],
1544 total, current->pattern, current->input);
1545 is_successful = 0;
1546 } else
1547 #endif
1548 #if defined SUPPORT_PCRE2_16 && defined SUPPORT_PCRE2_32
1549 if (return_value16[0] != return_value32[0]) {
1550 printf("\n16 and 32 bit: Return value differs(J16:%d,J32:%d): [%d] '%s' @ '%s'\n",
1551 return_value16[0], return_value32[0],
1552 total, current->pattern, current->input);
1553 is_successful = 0;
1554 } else
1555 #endif
1556 if (return_value >= 0 || return_value == PCRE2_ERROR_PARTIAL) {
1557 if (return_value == PCRE2_ERROR_PARTIAL) {
1558 return_value = 2;
1559 } else {
1560 return_value *= 2;
1561 }
1562 #ifdef SUPPORT_PCRE2_8
1563 return_value8[0] = return_value;
1564 #endif
1565 #ifdef SUPPORT_PCRE2_16
1566 return_value16[0] = return_value;
1567 #endif
1568 #ifdef SUPPORT_PCRE2_32
1569 return_value32[0] = return_value;
1570 #endif
1571 /* Transform back the results. */
1572 if (current->compile_options & PCRE2_UTF) {
1573 #ifdef SUPPORT_PCRE2_16
1574 for (i = 0; i < return_value; ++i) {
1575 if (ovector16_1[i] != PCRE2_UNSET)
1576 ovector16_1[i] = regtest_offsetmap16[ovector16_1[i]];
1577 if (ovector16_2[i] != PCRE2_UNSET)
1578 ovector16_2[i] = regtest_offsetmap16[ovector16_2[i]];
1579 }
1580 #endif
1581 #ifdef SUPPORT_PCRE2_32
1582 for (i = 0; i < return_value; ++i) {
1583 if (ovector32_1[i] != PCRE2_UNSET)
1584 ovector32_1[i] = regtest_offsetmap32[ovector32_1[i]];
1585 if (ovector32_2[i] != PCRE2_UNSET)
1586 ovector32_2[i] = regtest_offsetmap32[ovector32_2[i]];
1587 }
1588 #endif
1589 }
1590
1591 for (i = 0; i < return_value; ++i) {
1592 #if defined SUPPORT_PCRE2_8 && defined SUPPORT_PCRE2_16
1593 if (ovector8_1[i] != ovector8_2[i] || ovector8_1[i] != ovector16_1[i] || ovector8_1[i] != ovector16_2[i]) {
1594 printf("\n8 and 16 bit: Ovector[%d] value differs(J8:%d,I8:%d,J16:%d,I16:%d): [%d] '%s' @ '%s' \n",
1595 i, (int)ovector8_1[i], (int)ovector8_2[i], (int)ovector16_1[i], (int)ovector16_2[i],
1596 total, current->pattern, current->input);
1597 is_successful = 0;
1598 }
1599 #endif
1600 #if defined SUPPORT_PCRE2_8 && defined SUPPORT_PCRE2_32
1601 if (ovector8_1[i] != ovector8_2[i] || ovector8_1[i] != ovector32_1[i] || ovector8_1[i] != ovector32_2[i]) {
1602 printf("\n8 and 32 bit: Ovector[%d] value differs(J8:%d,I8:%d,J32:%d,I32:%d): [%d] '%s' @ '%s' \n",
1603 i, (int)ovector8_1[i], (int)ovector8_2[i], (int)ovector32_1[i], (int)ovector32_2[i],
1604 total, current->pattern, current->input);
1605 is_successful = 0;
1606 }
1607 #endif
1608 #if defined SUPPORT_PCRE2_16 && defined SUPPORT_PCRE2_32
1609 if (ovector16_1[i] != ovector16_2[i] || ovector16_1[i] != ovector32_1[i] || ovector16_1[i] != ovector32_2[i]) {
1610 printf("\n16 and 32 bit: Ovector[%d] value differs(J16:%d,I16:%d,J32:%d,I32:%d): [%d] '%s' @ '%s' \n",
1611 i, (int)ovector16_1[i], (int)ovector16_2[i], (int)ovector32_1[i], (int)ovector32_2[i],
1612 total, current->pattern, current->input);
1613 is_successful = 0;
1614 }
1615 #endif
1616 }
1617 }
1618 } else
1619 #endif /* more than one of SUPPORT_PCRE2_8, SUPPORT_PCRE2_16 and SUPPORT_PCRE2_32 */
1620 {
1621 #ifdef SUPPORT_PCRE2_8
1622 if (return_value8[0] != return_value8[1]) {
1623 printf("\n8 bit: Return value differs(%d:%d): [%d] '%s' @ '%s'\n",
1624 return_value8[0], return_value8[1], total, current->pattern, current->input);
1625 is_successful = 0;
1626 } else if (return_value8[0] >= 0 || return_value8[0] == PCRE2_ERROR_PARTIAL) {
1627 if (return_value8[0] == PCRE2_ERROR_PARTIAL)
1628 return_value8[0] = 2;
1629 else
1630 return_value8[0] *= 2;
1631
1632 for (i = 0; i < return_value8[0]; ++i)
1633 if (ovector8_1[i] != ovector8_2[i]) {
1634 printf("\n8 bit: Ovector[%d] value differs(%d:%d): [%d] '%s' @ '%s'\n",
1635 i, (int)ovector8_1[i], (int)ovector8_2[i], total, current->pattern, current->input);
1636 is_successful = 0;
1637 }
1638 }
1639 #endif
1640
1641 #ifdef SUPPORT_PCRE2_16
1642 if (return_value16[0] != return_value16[1]) {
1643 printf("\n16 bit: Return value differs(%d:%d): [%d] '%s' @ '%s'\n",
1644 return_value16[0], return_value16[1], total, current->pattern, current->input);
1645 is_successful = 0;
1646 } else if (return_value16[0] >= 0 || return_value16[0] == PCRE2_ERROR_PARTIAL) {
1647 if (return_value16[0] == PCRE2_ERROR_PARTIAL)
1648 return_value16[0] = 2;
1649 else
1650 return_value16[0] *= 2;
1651
1652 for (i = 0; i < return_value16[0]; ++i)
1653 if (ovector16_1[i] != ovector16_2[i]) {
1654 printf("\n16 bit: Ovector[%d] value differs(%d:%d): [%d] '%s' @ '%s'\n",
1655 i, (int)ovector16_1[i], (int)ovector16_2[i], total, current->pattern, current->input);
1656 is_successful = 0;
1657 }
1658 }
1659 #endif
1660
1661 #ifdef SUPPORT_PCRE2_32
1662 if (return_value32[0] != return_value32[1]) {
1663 printf("\n32 bit: Return value differs(%d:%d): [%d] '%s' @ '%s'\n",
1664 return_value32[0], return_value32[1], total, current->pattern, current->input);
1665 is_successful = 0;
1666 } else if (return_value32[0] >= 0 || return_value32[0] == PCRE2_ERROR_PARTIAL) {
1667 if (return_value32[0] == PCRE2_ERROR_PARTIAL)
1668 return_value32[0] = 2;
1669 else
1670 return_value32[0] *= 2;
1671
1672 for (i = 0; i < return_value32[0]; ++i)
1673 if (ovector32_1[i] != ovector32_2[i]) {
1674 printf("\n32 bit: Ovector[%d] value differs(%d:%d): [%d] '%s' @ '%s'\n",
1675 i, (int)ovector32_1[i], (int)ovector32_2[i], total, current->pattern, current->input);
1676 is_successful = 0;
1677 }
1678 }
1679 #endif
1680 }
1681 }
1682
1683 if (is_successful) {
1684 #ifdef SUPPORT_PCRE2_8
1685 if (!(current->start_offset & F_NO8) && (utf || is_ascii)) {
1686 if (return_value8[0] < 0 && !(current->start_offset & F_NOMATCH)) {
1687 printf("8 bit: Test should match: [%d] '%s' @ '%s'\n",
1688 total, current->pattern, current->input);
1689 is_successful = 0;
1690 }
1691
1692 if (return_value8[0] >= 0 && (current->start_offset & F_NOMATCH)) {
1693 printf("8 bit: Test should not match: [%d] '%s' @ '%s'\n",
1694 total, current->pattern, current->input);
1695 is_successful = 0;
1696 }
1697 }
1698 #endif
1699 #ifdef SUPPORT_PCRE2_16
1700 if (!(current->start_offset & F_NO16) && (utf || is_ascii)) {
1701 if (return_value16[0] < 0 && !(current->start_offset & F_NOMATCH)) {
1702 printf("16 bit: Test should match: [%d] '%s' @ '%s'\n",
1703 total, current->pattern, current->input);
1704 is_successful = 0;
1705 }
1706
1707 if (return_value16[0] >= 0 && (current->start_offset & F_NOMATCH)) {
1708 printf("16 bit: Test should not match: [%d] '%s' @ '%s'\n",
1709 total, current->pattern, current->input);
1710 is_successful = 0;
1711 }
1712 }
1713 #endif
1714 #ifdef SUPPORT_PCRE2_32
1715 if (!(current->start_offset & F_NO32) && (utf || is_ascii)) {
1716 if (return_value32[0] < 0 && !(current->start_offset & F_NOMATCH)) {
1717 printf("32 bit: Test should match: [%d] '%s' @ '%s'\n",
1718 total, current->pattern, current->input);
1719 is_successful = 0;
1720 }
1721
1722 if (return_value32[0] >= 0 && (current->start_offset & F_NOMATCH)) {
1723 printf("32 bit: Test should not match: [%d] '%s' @ '%s'\n",
1724 total, current->pattern, current->input);
1725 is_successful = 0;
1726 }
1727 }
1728 #endif
1729 }
1730
1731 if (is_successful) {
1732 #ifdef SUPPORT_PCRE2_8
1733 if (re8 && !(current->start_offset & F_NO8) && pcre2_get_mark_8(mdata8_1) != pcre2_get_mark_8(mdata8_2)) {
1734 printf("8 bit: Mark value mismatch: [%d] '%s' @ '%s'\n",
1735 total, current->pattern, current->input);
1736 is_successful = 0;
1737 }
1738 #endif
1739 #ifdef SUPPORT_PCRE2_16
1740 if (re16 && !(current->start_offset & F_NO16) && pcre2_get_mark_16(mdata16_1) != pcre2_get_mark_16(mdata16_2)) {
1741 printf("16 bit: Mark value mismatch: [%d] '%s' @ '%s'\n",
1742 total, current->pattern, current->input);
1743 is_successful = 0;
1744 }
1745 #endif
1746 #ifdef SUPPORT_PCRE2_32
1747 if (re32 && !(current->start_offset & F_NO32) && pcre2_get_mark_32(mdata32_1) != pcre2_get_mark_32(mdata32_2)) {
1748 printf("32 bit: Mark value mismatch: [%d] '%s' @ '%s'\n",
1749 total, current->pattern, current->input);
1750 is_successful = 0;
1751 }
1752 #endif
1753 }
1754
1755 #ifdef SUPPORT_PCRE2_8
1756 pcre2_code_free_8(re8);
1757 pcre2_match_data_free_8(mdata8_1);
1758 pcre2_match_data_free_8(mdata8_2);
1759 pcre2_match_context_free_8(mcontext8);
1760 #endif
1761 #ifdef SUPPORT_PCRE2_16
1762 pcre2_code_free_16(re16);
1763 pcre2_match_data_free_16(mdata16_1);
1764 pcre2_match_data_free_16(mdata16_2);
1765 pcre2_match_context_free_16(mcontext16);
1766 #endif
1767 #ifdef SUPPORT_PCRE2_32
1768 pcre2_code_free_32(re32);
1769 pcre2_match_data_free_32(mdata32_1);
1770 pcre2_match_data_free_32(mdata32_2);
1771 pcre2_match_context_free_32(mcontext32);
1772 #endif
1773
1774 if (is_successful) {
1775 successful++;
1776 successful_row++;
1777 printf(".");
1778 if (successful_row >= 60) {
1779 successful_row = 0;
1780 printf("\n");
1781 }
1782 } else
1783 successful_row = 0;
1784
1785 fflush(stdout);
1786 current++;
1787 }
1788 #ifdef SUPPORT_PCRE2_8
1789 setstack8(NULL);
1790 #endif
1791 #ifdef SUPPORT_PCRE2_16
1792 setstack16(NULL);
1793 #endif
1794 #ifdef SUPPORT_PCRE2_32
1795 setstack32(NULL);
1796 #endif
1797
1798 if (total == successful) {
1799 printf("\nAll JIT regression tests are successfully passed.\n");
1800 return 0;
1801 } else {
1802 printf("\nSuccessful test ratio: %d%% (%d failed)\n", successful * 100 / total, total - successful);
1803 return 1;
1804 }
1805 }
1806
1807 #if defined SUPPORT_UNICODE
1808
/* Checks the outcome of one match attempt against the expected values and
prints a message describing the first discrepancy found.

Arguments:
  pattern_index  number printed in the messages to identify the test case
  type           text printed in the messages after the pattern number
  result         result code being checked
  match_start    expected value of ovector[0]; when negative, result has to
                 be -1 (the value pcre2.h gives PCRE2_ERROR_NOMATCH) and
                 ovector is not read at all
  match_end      expected value of ovector[1]
  ovector        array whose first two elements are compared with
                 match_start and match_end; read only when match_start is
                 non-negative and result is greater than zero

Returns:         0 when everything is as expected, 1 after a message has
                 been printed */

static int check_invalid_utf_result(int pattern_index, const char *type, int result,
	int match_start, int match_end, PCRE2_SIZE *ovector)
{
	int mismatch = 1;

	if (match_start < 0) {
		/* A negative expected start means that result must be -1. */
		if (result == -1)
			mismatch = 0;
		else
			printf("Pattern[%d] %s result is not -1.\n", pattern_index, type);
	} else if (result <= 0) {
		printf("Pattern[%d] %s result (%d) is not greater than 0.\n", pattern_index, type, result);
	} else if (ovector[0] != (PCRE2_SIZE)match_start) {
		printf("Pattern[%d] %s ovector[0] is unexpected (%d instead of %d)\n",
			pattern_index, type, (int)ovector[0], match_start);
	} else if (ovector[1] != (PCRE2_SIZE)match_end) {
		printf("Pattern[%d] %s ovector[1] is unexpected (%d instead of %d)\n",
			pattern_index, type, (int)ovector[1], match_end);
	} else {
		mismatch = 0;
	}

	return mismatch;
}
1839
1840 #endif /* SUPPORT_UNICODE */
1841
1842 #if defined SUPPORT_UNICODE && defined SUPPORT_PCRE2_8
1843
/* Shorthands for option combinations that are repeated in the test table
that follows; they are removed again with #undef after the table.
  UDA  compile options: UTF, dotall and anchored
  CI   JIT options: complete matching with invalid UTF support
  CPI  JIT options: as CI, plus soft partial matching */

#define UDA (PCRE2_UTF | PCRE2_DOTALL | PCRE2_ANCHORED)
#define CI (PCRE2_JIT_COMPLETE | PCRE2_JIT_INVALID_UTF)
#define CPI (PCRE2_JIT_COMPLETE | PCRE2_JIT_PARTIAL_SOFT | PCRE2_JIT_INVALID_UTF)
1847
/* One entry of the invalid UTF-8 test table. The table fills the fields with
positional initializers, so the order of the members must not be changed. */

struct invalid_utf8_regression_test_case {
	/* PCRE2 compile option bits (the table uses PCRE2_UTF, PCRE2_DOTALL,
	PCRE2_ANCHORED, PCRE2_CASELESS, PCRE2_MULTILINE and others). */
	uint32_t compile_options;
	/* PCRE2_JIT_* option bits (PCRE2_JIT_COMPLETE, optionally combined with
	PCRE2_JIT_PARTIAL_SOFT and PCRE2_JIT_INVALID_UTF in the table). */
	int jit_compile_options;
	/* NOTE(review): presumably the offset in input at which matching is
	started -- confirm against the code that runs the table. */
	int start_offset;
	/* NOTE(review): presumably the number of code units cut off from the
	beginning / end of input before matching -- confirm against the code
	that runs the table. */
	int skip_left;
	int skip_right;
	/* Expected start and end of the match. Several table entries store -1 in
	both. NOTE(review): presumably these are forwarded to
	check_invalid_utf_result(), where a negative match_start means that the
	result code must be -1 -- confirm. */
	int match_start;
	int match_end;
	/* Up to two patterns; the second slot is NULL in many table entries. */
	const char *pattern[2];
	/* Subject string, written with hexadecimal escapes for non-ASCII bytes. */
	const char *input;
};
1859
/* Only the address of this object is used: one table entry stores
&invalid_utf8_newline_cr in its second pattern slot instead of a pattern
string. NOTE(review): presumably the code that runs the table recognises
this address and selects CR as the newline convention -- confirm. */

static const char invalid_utf8_newline_cr;
1861
/* Test table for matching subjects that contain invalid UTF-8. Every entry
lists, in the order of the structure members: compile_options,
jit_compile_options, start_offset, skip_left, skip_right, match_start,
match_end, { pattern[0], pattern[1] }, input. The list is terminated by an
entry whose patterns and input are NULL. */

static const struct invalid_utf8_regression_test_case invalid_utf8_regression_test_cases[] = {
	/* Pattern "." (UTF, dotall, anchored) on subjects that begin with 4-, 3-,
	2- and 1-byte sequences, both well formed and malformed. */
	{ UDA, CI, 0, 0, 0, 0, 4, { ".", NULL }, "\xf4\x8f\xbf\xbf" },
	{ UDA, CI, 0, 0, 0, 0, 4, { ".", NULL }, "\xf0\x90\x80\x80" },
	{ UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xf4\x90\x80\x80" },
	{ UDA, CI, 0, 0, 1, -1, -1, { ".", NULL }, "\xf4\x8f\xbf\xbf" },
	{ UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xf0\x90\x80\x7f" },
	{ UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xf0\x90\x80\xc0" },
	{ UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xf0\x8f\xbf\xbf" },
	{ UDA, CI, 0, 0, 0, 0, 3, { ".", NULL }, "\xef\xbf\xbf#" },
	{ UDA, CI, 0, 0, 0, 0, 3, { ".", NULL }, "\xef\xbf\xbf" },
	{ UDA, CI, 0, 0, 0, 0, 3, { ".", NULL }, "\xe0\xa0\x80#" },
	{ UDA, CI, 0, 0, 0, 0, 3, { ".", NULL }, "\xe0\xa0\x80" },
	{ UDA, CI, 0, 0, 2, -1, -1, { ".", NULL }, "\xef\xbf\xbf#" },
	{ UDA, CI, 0, 0, 1, -1, -1, { ".", NULL }, "\xef\xbf\xbf" },
	{ UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xef\xbf\x7f#" },
	{ UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xef\xbf\xc0" },
	{ UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xe0\x9f\xbf#" },
	{ UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xe0\x9f\xbf" },
	{ UDA, CI, 0, 0, 0, 0, 3, { ".", NULL }, "\xed\x9f\xbf#" },
	{ UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xed\xa0\x80#" },
	{ UDA, CI, 0, 0, 0, 0, 3, { ".", NULL }, "\xee\x80\x80#" },
	{ UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xed\xbf\xbf#" },
	{ UDA, CI, 0, 0, 0, 0, 2, { ".", NULL }, "\xdf\xbf##" },
	{ UDA, CI, 0, 0, 0, 0, 2, { ".", NULL }, "\xdf\xbf#" },
	{ UDA, CI, 0, 0, 0, 0, 2, { ".", NULL }, "\xdf\xbf" },
	{ UDA, CI, 0, 0, 0, 0, 2, { ".", NULL }, "\xc2\x80##" },
	{ UDA, CI, 0, 0, 0, 0, 2, { ".", NULL }, "\xc2\x80#" },
	{ UDA, CI, 0, 0, 0, 0, 2, { ".", NULL }, "\xc2\x80" },
	{ UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xe0\x80##" },
	{ UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xdf\xc0##" },
	{ UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xe0\x80" },
	{ UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xdf\xc0" },
	{ UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xc1\xbf##" },
	{ UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xc1\xbf" },
	{ UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\x80###" },
	{ UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\x80" },
	{ UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xf8###" },
	{ UDA, CI, 0, 0, 0, -1, -1, { ".", NULL }, "\xf8" },
	{ UDA, CI, 0, 0, 0, 0, 1, { ".", NULL }, "\x7f" },

	/* Word boundary assertions with start_offset 4. */
	{ UDA, CPI, 4, 0, 0, 4, 4, { "\\B", NULL }, "\xf4\x8f\xbf\xbf#" },
	{ UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\xf4\xa0\x80\x80\xf4\xa0\x80\x80" },
	{ UDA, CPI, 4, 1, 1, -1, -1, { "\\B", "\\b" }, "\xf4\x8f\xbf\xbf\xf4\x8f\xbf\xbf" },
	{ UDA, CPI, 4, 0, 0, 4, 4, { "\\B", NULL }, "#\xef\xbf\xbf#" },
	{ UDA, CPI, 4, 0, 0, 4, 4, { "\\B", NULL }, "#\xe0\xa0\x80#" },
	{ UDA, CPI, 4, 0, 0, 4, 4, { "\\B", NULL }, "\xf0\x90\x80\x80#" },
	{ UDA, CPI, 4, 0, 0, 4, 4, { "\\B", NULL }, "\xf3\xbf\xbf\xbf#" },
	{ UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\xf0\x8f\xbf\xbf\xf0\x8f\xbf\xbf" },
	{ UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\xf5\x80\x80\x80\xf5\x80\x80\x80" },
	{ UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\xf4\x90\x80\x80\xf4\x90\x80\x80" },
	{ UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\xf4\x8f\xbf\xff\xf4\x8f\xbf\xff" },
	{ UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\xf4\x8f\xff\xbf\xf4\x8f\xff\xbf" },
	{ UDA, CPI, 4, 0, 1, -1, -1, { "\\B", "\\b" }, "\xef\x80\x80\x80\xef\x80\x80" },
	{ UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "\x80\x80\x80\x80\x80\x80\x80\x80" },
	{ UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "#\xe0\x9f\xbf\xe0\x9f\xbf#" },
	{ UDA, CPI, 4, 2, 2, -1, -1, { "\\B", "\\b" }, "#\xe0\xa0\x80\xe0\xa0\x80#" },
	{ UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "#\xf0\x80\x80\xf0\x80\x80#" },
	{ UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "#\xed\xa0\x80\xed\xa0\x80#" },
	{ UDA, CPI, 4, 0, 0, 4, 4, { "\\B", NULL }, "##\xdf\xbf#" },
	{ UDA, CPI, 4, 2, 0, 2, 2, { "\\B", NULL }, "##\xdf\xbf#" },
	{ UDA, CPI, 4, 0, 0, 4, 4, { "\\B", NULL }, "##\xc2\x80#" },
	{ UDA, CPI, 4, 2, 0, 2, 2, { "\\B", NULL }, "##\xc2\x80#" },
	{ UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "##\xc1\xbf\xc1\xbf##" },
	{ UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "##\xdf\xc0\xdf\xc0##" },
	{ UDA, CPI, 4, 0, 0, -1, -1, { "\\B", "\\b" }, "##\xe0\x80\xe0\x80##" },

	/* Word boundary assertions with start_offset 3. */
	{ UDA, CPI, 3, 0, 0, 3, 3, { "\\B", NULL }, "\xef\xbf\xbf#" },
	{ UDA, CPI, 3, 0, 0, 3, 3, { "\\B", NULL }, "\xe0\xa0\x80#" },
	{ UDA, CPI, 3, 0, 0, -1, -1, { "\\B", "\\b" }, "\xe0\x9f\xbf\xe0\x9f\xbf" },
	{ UDA, CPI, 3, 1, 1, -1, -1, { "\\B", "\\b" }, "\xef\xbf\xbf\xef\xbf\xbf" },
	{ UDA, CPI, 3, 0, 1, -1, -1, { "\\B", "\\b" }, "\xdf\x80\x80\xdf\x80" },
	{ UDA, CPI, 3, 0, 0, -1, -1, { "\\B", "\\b" }, "\xef\xbf\xff\xef\xbf\xff" },
	{ UDA, CPI, 3, 0, 0, -1, -1, { "\\B", "\\b" }, "\xef\xff\xbf\xef\xff\xbf" },
	{ UDA, CPI, 3, 0, 0, -1, -1, { "\\B", "\\b" }, "\xed\xbf\xbf\xed\xbf\xbf" },

	/* Word boundary assertions with start_offset 2. */
	{ UDA, CPI, 2, 0, 0, 2, 2, { "\\B", NULL }, "\xdf\xbf#" },
	{ UDA, CPI, 2, 0, 0, 2, 2, { "\\B", NULL }, "\xc2\x80#" },
	{ UDA, CPI, 2, 1, 1, -1, -1, { "\\B", "\\b" }, "\xdf\xbf\xdf\xbf" },
	{ UDA, CPI, 2, 0, 0, -1, -1, { "\\B", "\\b" }, "\xc1\xbf\xc1\xbf" },
	{ UDA, CPI, 2, 0, 0, -1, -1, { "\\B", "\\b" }, "\xe0\x80\xe0\x80" },
	{ UDA, CPI, 2, 0, 0, -1, -1, { "\\B", "\\b" }, "\xdf\xff\xdf\xff" },
	{ UDA, CPI, 2, 0, 0, -1, -1, { "\\B", "\\b" }, "\xff\xbf\xff\xbf" },

	/* Word boundary assertions with start_offset 1. */
	{ UDA, CPI, 1, 0, 0, 1, 1, { "\\B", NULL }, "\x7f#" },
	{ UDA, CPI, 1, 0, 0, 1, 1, { "\\B", NULL }, "\x01#" },
	{ UDA, CPI, 1, 0, 0, -1, -1, { "\\B", "\\b" }, "\x80\x80" },
	{ UDA, CPI, 1, 0, 0, -1, -1, { "\\B", "\\b" }, "\xb0\xb0" },

	/* Caseless back reference. */
	{ UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 2, { "(.)\\1", NULL }, "aA" },
	{ UDA | PCRE2_CASELESS, CPI, 0, 0, 0, -1, -1, { "(.)\\1", NULL }, "a\xff" },
	{ UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 4, { "(.)\\1", NULL }, "\xc3\xa1\xc3\x81" },
	{ UDA | PCRE2_CASELESS, CPI, 0, 0, 1, -1, -1, { "(.)\\1", NULL }, "\xc3\xa1\xc3\x81" },
	{ UDA | PCRE2_CASELESS, CPI, 0, 0, 0, -1, -1, { "(.)\\1", NULL }, "\xc2\x80\x80" },
	{ UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 6, { "(.)\\1", NULL }, "\xe1\xbd\xb8\xe1\xbf\xb8" },
	{ UDA | PCRE2_CASELESS, CPI, 0, 0, 1, -1, -1, { "(.)\\1", NULL }, "\xe1\xbd\xb8\xe1\xbf\xb8" },
	{ UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 8, { "(.)\\1", NULL }, "\xf0\x90\x90\x80\xf0\x90\x90\xa8" },
	{ UDA | PCRE2_CASELESS, CPI, 0, 0, 1, -1, -1, { "(.)\\1", NULL }, "\xf0\x90\x90\x80\xf0\x90\x90\xa8" },

	/* The X escape. */
	{ UDA, CPI, 0, 0, 0, 0, 1, { "\\X", NULL }, "A" },
	{ UDA, CPI, 0, 0, 0, -1, -1, { "\\X", NULL }, "\xff" },
	{ UDA, CPI, 0, 0, 0, 0, 2, { "\\X", NULL }, "\xc3\xa1" },
	{ UDA, CPI, 0, 0, 1, -1, -1, { "\\X", NULL }, "\xc3\xa1" },
	{ UDA, CPI, 0, 0, 0, -1, -1, { "\\X", NULL }, "\xc3\x7f" },
	{ UDA, CPI, 0, 0, 0, 0, 3, { "\\X", NULL }, "\xe1\xbd\xb8" },
	{ UDA, CPI, 0, 0, 1, -1, -1, { "\\X", NULL }, "\xe1\xbd\xb8" },
	{ UDA, CPI, 0, 0, 0, 0, 4, { "\\X", NULL }, "\xf0\x90\x90\x80" },
	{ UDA, CPI, 0, 0, 1, -1, -1, { "\\X", NULL }, "\xf0\x90\x90\x80" },

	/* Negated character class. */
	{ UDA, CPI, 0, 0, 0, -1, -1, { "[^#]", NULL }, "#" },
	{ UDA, CPI, 0, 0, 0, 0, 4, { "[^#]", NULL }, "\xf4\x8f\xbf\xbf" },
	{ UDA, CPI, 0, 0, 0, -1, -1, { "[^#]", NULL }, "\xf4\x90\x80\x80" },
	{ UDA, CPI, 0, 0, 0, -1, -1, { "[^#]", NULL }, "\xc1\x80" },

	/* Circumflex in multiline mode with start_offset 1. */
	{ PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 2, 3, { "^\\W", NULL }, " \x0a#"},
	{ PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 14, 15, { "^\\W", NULL }, " \xc0\x8a#\xe0\x80\x8a#\xf0\x80\x80\x8a#\x0a#"},
	{ PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 3, 4, { "^\\W", NULL }, " \xf8\x0a#"},
	{ PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 3, 4, { "^\\W", NULL }, " \xc3\x0a#"},
	{ PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 3, 4, { "^\\W", NULL }, " \xf1\x0a#"},
	{ PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 4, 5, { "^\\W", NULL }, " \xf2\xbf\x0a#"},
	{ PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 5, 6, { "^\\W", NULL }, " \xf2\xbf\xbf\x0a#"},
	{ PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 3, 4, { "^\\W", NULL }, " \xef\x0a#"},
	{ PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 4, 5, { "^\\W", NULL }, " \xef\xbf\x0a#"},
	{ PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 5, 6, { "^\\W", NULL }, " \x85#\xc2\x85#"},
	{ PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 7, 8, { "^\\W", NULL }, " \xe2\x80\xf8\xe2\x80\xa8#"},

	/* PCRE2_FIRSTLINE. */
	{ PCRE2_UTF | PCRE2_FIRSTLINE, CI, 0, 0, 0, -1, -1, { "#", NULL }, "\xe2\x80\xf8\xe2\x80\xa8#"},
	{ PCRE2_UTF | PCRE2_FIRSTLINE, CI, 0, 0, 0, 3, 4, { "#", NULL }, "\xe2\x80\xf8#\xe2\x80\xa8#"},
	{ PCRE2_UTF | PCRE2_FIRSTLINE, CI, 0, 0, 0, -1, -1, { "#", NULL }, "abcd\xc2\x85#"},
	{ PCRE2_UTF | PCRE2_FIRSTLINE, CI, 0, 0, 0, 1, 2, { "#", NULL }, "\x85#\xc2\x85#"},
	{ PCRE2_UTF | PCRE2_FIRSTLINE, CI, 0, 0, 0, 5, 6, { "#", NULL }, "\xef,\x80,\xf8#\x0a"},
	{ PCRE2_UTF | PCRE2_FIRSTLINE, CI, 0, 0, 0, -1, -1, { "#", NULL }, "\xef,\x80,\xf8\x0a#"},

	/* Literal pattern that contains a 2-byte character, with and without
	PCRE2_NO_START_OPTIMIZE. */
	{ PCRE2_UTF | PCRE2_NO_START_OPTIMIZE, CI, 0, 0, 0, 4, 8, { "#\xc7\x85#", NULL }, "\x80\x80#\xc7#\xc7\x85#" },
	{ PCRE2_UTF | PCRE2_NO_START_OPTIMIZE, CI, 0, 0, 0, 7, 11, { "#\xc7\x85#", NULL }, "\x80\x80#\xc7\x80\x80\x80#\xc7\x85#" },
	{ PCRE2_UTF, CI, 0, 0, 0, 4, 8, { "#\xc7\x85#", NULL }, "\x80\x80#\xc7#\xc7\x85#" },
	{ PCRE2_UTF, CI, 0, 0, 0, 7, 11, { "#\xc7\x85#", NULL }, "\x80\x80#\xc7\x80\x80\x80#\xc7\x85#" },

	/* Character type escapes, inside and outside of classes. */
	{ PCRE2_UTF | PCRE2_UCP, CI, 0, 0, 0, -1, -1, { "[\\s]", NULL }, "\xed\xa0\x80" },
	{ PCRE2_UTF, CI, 0, 0, 0, 0, 3, { "[\\D]", NULL }, "\xe0\xab\xaa@" },
	{ PCRE2_UTF, CI, 0, 0, 0, 0, 3, { "\\D+", NULL }, "n\xc3\xb1" },
	{ PCRE2_UTF, CI, 0, 0, 0, 0, 5, { "\\W+", NULL }, "@\xf0\x9d\x84\x9e" },

	/* These two are not invalid UTF tests, but this infrastructure fits better for them. */
	{ 0, PCRE2_JIT_COMPLETE, 0, 0, 1, -1, -1, { "\\X{2}", NULL }, "\r\n\n" },
	{ 0, PCRE2_JIT_COMPLETE, 0, 0, 1, -1, -1, { "\\R{2}", NULL }, "\r\n\n" },

	/* The only entry whose second pattern slot holds the address of
	invalid_utf8_newline_cr instead of a pattern string. */
	{ PCRE2_UTF | PCRE2_MULTILINE, CI, 0, 0, 0, -1, -1, { "^.a", &invalid_utf8_newline_cr }, "\xc3\xa7#a" },

	/* End of list marker. */
	{ 0, 0, 0, 0, 0, 0, 0, { NULL, NULL }, NULL }
};

/* The option shorthands are not needed after the table. */

#undef UDA
#undef CI
#undef CPI
2016
/* Compiles pattern slot i of one invalid UTF-8 test case, JIT compiles it,
runs the JIT matches selected by the case's JIT options and validates each
outcome with check_invalid_utf_result().

Arguments:
  current        the test case to run
  pattern_index  index of the test case, reported in diagnostics and passed
                 to check_invalid_utf_result()
  i              index into current->pattern selecting the pattern to run
  ccontext       compile context passed to pcre2_compile_8()
  mdata          match data block used for every match

Returns:         1 if the pattern slot is NULL (nothing to run) or every
                 executed match passed the check; 0 if the pattern cannot be
                 compiled, cannot be JIT compiled, or a check failed
*/
static int run_invalid_utf8_test(const struct invalid_utf8_regression_test_case *current,
	int pattern_index, int i, pcre2_compile_context_8 *ccontext, pcre2_match_data_8 *mdata)
{
	pcre2_code_8 *code;
	int result, errorcode;
	int passed = 1;
	PCRE2_SIZE length, start, erroroffset;
	const PCRE2_UCHAR8 *subject;
	PCRE2_SIZE *ovector = pcre2_get_ovector_pointer_8(mdata);

	if (current->pattern[i] == NULL)
		return 1;

	code = pcre2_compile_8((const PCRE2_UCHAR8 *)current->pattern[i], PCRE2_ZERO_TERMINATED,
		current->compile_options, &errorcode, &erroroffset, ccontext);

	if (!code) {
		/* Report the real pattern slot, not a hard-coded 0. */
		printf("Pattern[%d:%d] cannot be compiled. Error offset: %d\n", pattern_index, i, (int)erroroffset);
		return 0;
	}

	if (pcre2_jit_compile_8(code, current->jit_compile_options) != 0) {
		printf("Pattern[%d:%d] cannot be compiled by the JIT compiler.\n", pattern_index, i);
		pcre2_code_free_8(code);
		return 0;
	}

	/* The subject is the input with skip_left bytes removed from its start
	and skip_right bytes removed from its end; the start offset is given
	relative to the full input, so it is shifted by skip_left as well. */
	subject = (const PCRE2_UCHAR8 *)(current->input + current->skip_left);
	length = (PCRE2_SIZE)(strlen(current->input) - current->skip_left - current->skip_right);
	start = (PCRE2_SIZE)(current->start_offset - current->skip_left);

	if (current->jit_compile_options & PCRE2_JIT_COMPLETE) {
		result = pcre2_jit_match_8(code, subject, length, start, 0, mdata, NULL);

		if (check_invalid_utf_result(pattern_index, "match", result, current->match_start, current->match_end, ovector))
			passed = 0;
	}

	/* The partial match is only attempted when the complete match passed. */
	if (passed && (current->jit_compile_options & PCRE2_JIT_PARTIAL_SOFT)) {
		result = pcre2_jit_match_8(code, subject, length, start, PCRE2_PARTIAL_SOFT, mdata, NULL);

		if (check_invalid_utf_result(pattern_index, "partial match", result, current->match_start, current->match_end, ovector))
			passed = 0;
	}

	pcre2_code_free_8(code);
	return passed;
}
2067
invalid_utf8_regression_tests(void)2068 static int invalid_utf8_regression_tests(void)
2069 {
2070 const struct invalid_utf8_regression_test_case *current;
2071 pcre2_compile_context_8 *ccontext;
2072 pcre2_match_data_8 *mdata;
2073 int total = 0, successful = 0;
2074 int result;
2075
2076 printf("\nRunning invalid-utf8 JIT regression tests\n");
2077
2078 ccontext = pcre2_compile_context_create_8(NULL);
2079 pcre2_set_newline_8(ccontext, PCRE2_NEWLINE_ANY);
2080 mdata = pcre2_match_data_create_8(4, NULL);
2081
2082 for (current = invalid_utf8_regression_test_cases; current->pattern[0]; current++) {
2083 /* printf("\nPattern: %s :\n", current->pattern); */
2084 total++;
2085
2086 result = 1;
2087 if (current->pattern[1] != &invalid_utf8_newline_cr)
2088 {
2089 if (!run_invalid_utf8_test(current, total - 1, 0, ccontext, mdata))
2090 result = 0;
2091 if (!run_invalid_utf8_test(current, total - 1, 1, ccontext, mdata))
2092 result = 0;
2093 } else {
2094 pcre2_set_newline_8(ccontext, PCRE2_NEWLINE_CR);
2095 if (!run_invalid_utf8_test(current, total - 1, 0, ccontext, mdata))
2096 result = 0;
2097 pcre2_set_newline_8(ccontext, PCRE2_NEWLINE_ANY);
2098 }
2099
2100 if (result) {
2101 successful++;
2102 }
2103
2104 printf(".");
2105 if ((total % 60) == 0)
2106 printf("\n");
2107 }
2108
2109 if ((total % 60) != 0)
2110 printf("\n");
2111
2112 pcre2_match_data_free_8(mdata);
2113 pcre2_compile_context_free_8(ccontext);
2114
2115 if (total == successful) {
2116 printf("\nAll invalid UTF8 JIT regression tests are successfully passed.\n");
2117 return 0;
2118 } else {
2119 printf("\nInvalid UTF8 successful test ratio: %d%% (%d failed)\n", successful * 100 / total, total - successful);
2120 return 1;
2121 }
2122 }
2123
2124 #else /* !SUPPORT_UNICODE || !SUPPORT_PCRE2_8 */
2125
/* Placeholder used when Unicode support or the 8-bit library is not compiled
in: there is nothing to run, so success (0) is reported. */
static int invalid_utf8_regression_tests(void)
{
	return 0;
}
2130
2131 #endif /* SUPPORT_UNICODE && SUPPORT_PCRE2_8 */
2132
2133 #if defined SUPPORT_UNICODE && defined SUPPORT_PCRE2_16
2134
/* Shorthands for the invalid UTF-16 test table:
  UDA  compile options: PCRE2_UTF, PCRE2_DOTALL and PCRE2_ANCHORED
  CI   JIT options: complete matching with invalid UTF support
  CPI  JIT options: as CI, plus soft partial matching */
#define UDA (PCRE2_UTF | PCRE2_DOTALL | PCRE2_ANCHORED)
#define CI (PCRE2_JIT_COMPLETE | PCRE2_JIT_INVALID_UTF)
#define CPI (PCRE2_JIT_COMPLETE | PCRE2_JIT_PARTIAL_SOFT | PCRE2_JIT_INVALID_UTF)
2138
/* One entry of the invalid UTF-16 regression test table. */
struct invalid_utf16_regression_test_case {
	/* Options passed to pcre2_compile_16(). */
	uint32_t compile_options;
	/* Options passed to pcre2_jit_compile_16(); the PCRE2_JIT_COMPLETE and
	PCRE2_JIT_PARTIAL_SOFT bits also select which matches are executed. */
	int jit_compile_options;
	/* Start offset, in code units, relative to the full input; skip_left is
	subtracted from it before matching. */
	int start_offset;
	/* Number of code units removed from the start of the input. */
	int skip_left;
	/* Number of code units removed from the end of the input. */
	int skip_right;
	/* Expected values handed to check_invalid_utf_result(); presumably the
	expected match bounds, with -1 meaning no match (confirm against that
	helper). */
	int match_start;
	int match_end;
	/* Zero-terminated patterns; a NULL slot is skipped by the runner. */
	const PCRE2_UCHAR16 *pattern[2];
	/* Zero-terminated subject. */
	const PCRE2_UCHAR16 *input;
};
2150
2151 static PCRE2_UCHAR16 allany16[] = { '.', 0 };
2152 static PCRE2_UCHAR16 non_word_boundary16[] = { '\\', 'B', 0 };
2153 static PCRE2_UCHAR16 word_boundary16[] = { '\\', 'b', 0 };
2154 static PCRE2_UCHAR16 backreference16[] = { '(', '.', ')', '\\', '1', 0 };
2155 static PCRE2_UCHAR16 grapheme16[] = { '\\', 'X', 0 };
2156 static PCRE2_UCHAR16 nothashmark16[] = { '[', '^', '#', ']', 0 };
2157 static PCRE2_UCHAR16 afternl16[] = { '^', '\\', 'W', 0 };
2158 static PCRE2_UCHAR16 generic16[] = { '#', 0xd800, 0xdc00, '#', 0 };
2159 static PCRE2_UCHAR16 test16_1[] = { 0xd7ff, 0xe000, 0xffff, 0x01, '#', 0 };
2160 static PCRE2_UCHAR16 test16_2[] = { 0xd800, 0xdc00, 0xd800, 0xdc00, 0 };
2161 static PCRE2_UCHAR16 test16_3[] = { 0xdbff, 0xdfff, 0xdbff, 0xdfff, 0 };
2162 static PCRE2_UCHAR16 test16_4[] = { 0xd800, 0xdbff, 0xd800, 0xdbff, 0 };
2163 static PCRE2_UCHAR16 test16_5[] = { '#', 0xd800, 0xdc00, '#', 0 };
2164 static PCRE2_UCHAR16 test16_6[] = { 'a', 'A', 0xdc28, 0 };
2165 static PCRE2_UCHAR16 test16_7[] = { 0xd801, 0xdc00, 0xd801, 0xdc28, 0 };
2166 static PCRE2_UCHAR16 test16_8[] = { '#', 0xd800, 0xdc00, 0 };
2167 static PCRE2_UCHAR16 test16_9[] = { ' ', 0x2028, '#', 0 };
2168 static PCRE2_UCHAR16 test16_10[] = { ' ', 0xdc00, 0xd800, 0x2028, '#', 0 };
2169 static PCRE2_UCHAR16 test16_11[] = { 0xdc00, 0xdc00, 0xd800, 0xdc00, 0xdc00, '#', 0xd800, 0xdc00, '#', 0 };
2170 static PCRE2_UCHAR16 test16_12[] = { '#', 0xd800, 0xdc00, 0xd800, '#', 0xd800, 0xdc00, 0xdc00, 0xdc00, '#', 0xd800, 0xdc00, '#', 0 };
2171
/* Invalid UTF-16 regression test table. Columns, in order:
  compile options, JIT compile options, start offset, skip left, skip right,
  match start, match end, { pattern 0, pattern 1 }, input.
Offsets and skips are in 16-bit code units. The table ends with an entry
whose first pattern is NULL. */
static const struct invalid_utf16_regression_test_case invalid_utf16_regression_test_cases[] = {
	{ UDA, CI, 0, 0, 0, 0, 1, { allany16, NULL }, test16_1 },
	{ UDA, CI, 1, 0, 0, 1, 2, { allany16, NULL }, test16_1 },
	{ UDA, CI, 2, 0, 0, 2, 3, { allany16, NULL }, test16_1 },
	{ UDA, CI, 3, 0, 0, 3, 4, { allany16, NULL }, test16_1 },
	{ UDA, CI, 0, 0, 0, 0, 2, { allany16, NULL }, test16_2 },
	{ UDA, CI, 0, 0, 3, -1, -1, { allany16, NULL }, test16_2 },
	{ UDA, CI, 1, 0, 0, -1, -1, { allany16, NULL }, test16_2 },
	{ UDA, CI, 0, 0, 0, 0, 2, { allany16, NULL }, test16_3 },
	{ UDA, CI, 0, 0, 3, -1, -1, { allany16, NULL }, test16_3 },
	{ UDA, CI, 1, 0, 0, -1, -1, { allany16, NULL }, test16_3 },

	{ UDA, CPI, 1, 0, 0, 1, 1, { non_word_boundary16, NULL }, test16_1 },
	{ UDA, CPI, 2, 0, 0, 2, 2, { non_word_boundary16, NULL }, test16_1 },
	{ UDA, CPI, 3, 0, 0, 3, 3, { non_word_boundary16, NULL }, test16_1 },
	{ UDA, CPI, 4, 0, 0, 4, 4, { non_word_boundary16, NULL }, test16_1 },
	{ UDA, CPI, 2, 0, 0, 2, 2, { non_word_boundary16, NULL }, test16_2 },
	{ UDA, CPI, 2, 0, 0, 2, 2, { non_word_boundary16, NULL }, test16_3 },
	{ UDA, CPI, 2, 1, 1, -1, -1, { non_word_boundary16, word_boundary16 }, test16_2 },
	{ UDA, CPI, 2, 1, 1, -1, -1, { non_word_boundary16, word_boundary16 }, test16_3 },
	{ UDA, CPI, 2, 0, 0, -1, -1, { non_word_boundary16, word_boundary16 }, test16_4 },
	{ UDA, CPI, 2, 0, 0, -1, -1, { non_word_boundary16, word_boundary16 }, test16_5 },

	{ UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 2, { backreference16, NULL }, test16_6 },
	{ UDA | PCRE2_CASELESS, CPI, 1, 0, 0, -1, -1, { backreference16, NULL }, test16_6 },
	{ UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 4, { backreference16, NULL }, test16_7 },
	{ UDA | PCRE2_CASELESS, CPI, 0, 0, 1, -1, -1, { backreference16, NULL }, test16_7 },

	{ UDA, CPI, 0, 0, 0, 0, 1, { grapheme16, NULL }, test16_6 },
	{ UDA, CPI, 1, 0, 0, 1, 2, { grapheme16, NULL }, test16_6 },
	{ UDA, CPI, 2, 0, 0, -1, -1, { grapheme16, NULL }, test16_6 },
	{ UDA, CPI, 0, 0, 0, 0, 2, { grapheme16, NULL }, test16_7 },
	{ UDA, CPI, 2, 0, 0, 2, 4, { grapheme16, NULL }, test16_7 },
	{ UDA, CPI, 1, 0, 0, -1, -1, { grapheme16, NULL }, test16_7 },

	{ UDA, CPI, 0, 0, 0, -1, -1, { nothashmark16, NULL }, test16_8 },
	{ UDA, CPI, 1, 0, 0, 1, 3, { nothashmark16, NULL }, test16_8 },
	{ UDA, CPI, 2, 0, 0, -1, -1, { nothashmark16, NULL }, test16_8 },

	{ PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 2, 3, { afternl16, NULL }, test16_9 },
	{ PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 4, 5, { afternl16, NULL }, test16_10 },

	{ PCRE2_UTF | PCRE2_NO_START_OPTIMIZE, CI, 0, 0, 0, 5, 9, { generic16, NULL }, test16_11 },
	{ PCRE2_UTF | PCRE2_NO_START_OPTIMIZE, CI, 0, 0, 0, 9, 13, { generic16, NULL }, test16_12 },
	{ PCRE2_UTF, CI, 0, 0, 0, 5, 9, { generic16, NULL }, test16_11 },
	{ PCRE2_UTF, CI, 0, 0, 0, 9, 13, { generic16, NULL }, test16_12 },

	/* Terminator: a NULL first pattern stops the test loop. */
	{ 0, 0, 0, 0, 0, 0, 0, { NULL, NULL }, NULL }
};
2221
2222 #undef UDA
2223 #undef CI
2224 #undef CPI
2225
/* Compiles pattern slot i of one invalid UTF-16 test case, JIT compiles it,
runs the JIT matches selected by the case's JIT options and validates each
outcome with check_invalid_utf_result().

Arguments:
  current        the test case to run
  pattern_index  index of the test case, reported in diagnostics and passed
                 to check_invalid_utf_result()
  i              index into current->pattern selecting the pattern to run
  ccontext       compile context passed to pcre2_compile_16()
  mdata          match data block used for every match

Returns:         1 if the pattern slot is NULL (nothing to run) or every
                 executed match passed the check; 0 if the pattern cannot be
                 compiled, cannot be JIT compiled, or a check failed
*/
static int run_invalid_utf16_test(const struct invalid_utf16_regression_test_case *current,
	int pattern_index, int i, pcre2_compile_context_16 *ccontext, pcre2_match_data_16 *mdata)
{
	pcre2_code_16 *code;
	int result, errorcode;
	int passed = 1;
	PCRE2_SIZE length, start, erroroffset;
	const PCRE2_UCHAR16 *input;
	PCRE2_SIZE *ovector = pcre2_get_ovector_pointer_16(mdata);

	if (current->pattern[i] == NULL)
		return 1;

	code = pcre2_compile_16(current->pattern[i], PCRE2_ZERO_TERMINATED,
		current->compile_options, &errorcode, &erroroffset, ccontext);

	if (!code) {
		/* Report the real pattern slot, not a hard-coded 0. */
		printf("Pattern[%d:%d] cannot be compiled. Error offset: %d\n", pattern_index, i, (int)erroroffset);
		return 0;
	}

	if (pcre2_jit_compile_16(code, current->jit_compile_options) != 0) {
		printf("Pattern[%d:%d] cannot be compiled by the JIT compiler.\n", pattern_index, i);
		pcre2_code_free_16(code);
		return 0;
	}

	/* Count the code units of the input up to its terminating zero. */
	length = 0;
	for (input = current->input; *input != 0; input++)
		length++;

	/* The subject is the input with skip_left code units removed from its
	start and skip_right removed from its end; the start offset is given
	relative to the full input, so it is shifted by skip_left as well. */
	length -= current->skip_left + current->skip_right;
	input = current->input + current->skip_left;
	start = (PCRE2_SIZE)(current->start_offset - current->skip_left);

	if (current->jit_compile_options & PCRE2_JIT_COMPLETE) {
		result = pcre2_jit_match_16(code, input, length, start, 0, mdata, NULL);

		if (check_invalid_utf_result(pattern_index, "match", result, current->match_start, current->match_end, ovector))
			passed = 0;
	}

	/* The partial match is only attempted when the complete match passed. */
	if (passed && (current->jit_compile_options & PCRE2_JIT_PARTIAL_SOFT)) {
		result = pcre2_jit_match_16(code, input, length, start, PCRE2_PARTIAL_SOFT, mdata, NULL);

		if (check_invalid_utf_result(pattern_index, "partial match", result, current->match_start, current->match_end, ovector))
			passed = 0;
	}

	pcre2_code_free_16(code);
	return passed;
}
2283
invalid_utf16_regression_tests(void)2284 static int invalid_utf16_regression_tests(void)
2285 {
2286 const struct invalid_utf16_regression_test_case *current;
2287 pcre2_compile_context_16 *ccontext;
2288 pcre2_match_data_16 *mdata;
2289 int total = 0, successful = 0;
2290 int result;
2291
2292 printf("\nRunning invalid-utf16 JIT regression tests\n");
2293
2294 ccontext = pcre2_compile_context_create_16(NULL);
2295 pcre2_set_newline_16(ccontext, PCRE2_NEWLINE_ANY);
2296 mdata = pcre2_match_data_create_16(4, NULL);
2297
2298 for (current = invalid_utf16_regression_test_cases; current->pattern[0]; current++) {
2299 /* printf("\nPattern: %s :\n", current->pattern); */
2300 total++;
2301
2302 result = 1;
2303 if (!run_invalid_utf16_test(current, total - 1, 0, ccontext, mdata))
2304 result = 0;
2305 if (!run_invalid_utf16_test(current, total - 1, 1, ccontext, mdata))
2306 result = 0;
2307
2308 if (result) {
2309 successful++;
2310 }
2311
2312 printf(".");
2313 if ((total % 60) == 0)
2314 printf("\n");
2315 }
2316
2317 if ((total % 60) != 0)
2318 printf("\n");
2319
2320 pcre2_match_data_free_16(mdata);
2321 pcre2_compile_context_free_16(ccontext);
2322
2323 if (total == successful) {
2324 printf("\nAll invalid UTF16 JIT regression tests are successfully passed.\n");
2325 return 0;
2326 } else {
2327 printf("\nInvalid UTF16 successful test ratio: %d%% (%d failed)\n", successful * 100 / total, total - successful);
2328 return 1;
2329 }
2330 }
2331
2332 #else /* !SUPPORT_UNICODE || !SUPPORT_PCRE2_16 */
2333
/* Placeholder used when Unicode support or the 16-bit library is not compiled
in: there is nothing to run, so success (0) is reported. */
static int invalid_utf16_regression_tests(void)
{
	return 0;
}
2338
2339 #endif /* SUPPORT_UNICODE && SUPPORT_PCRE2_16 */
2340
2341 #if defined SUPPORT_UNICODE && defined SUPPORT_PCRE2_32
2342
/* Shorthands for the invalid UTF-32 test table:
  UDA  compile options: PCRE2_UTF, PCRE2_DOTALL and PCRE2_ANCHORED
  CI   JIT options: complete matching with invalid UTF support
  CPI  JIT options: as CI, plus soft partial matching */
#define UDA (PCRE2_UTF | PCRE2_DOTALL | PCRE2_ANCHORED)
#define CI (PCRE2_JIT_COMPLETE | PCRE2_JIT_INVALID_UTF)
#define CPI (PCRE2_JIT_COMPLETE | PCRE2_JIT_PARTIAL_SOFT | PCRE2_JIT_INVALID_UTF)
2346
/* One entry of the invalid UTF-32 regression test table. */
struct invalid_utf32_regression_test_case {
	/* Options passed to pcre2_compile_32(). */
	uint32_t compile_options;
	/* Options passed to pcre2_jit_compile_32(); the PCRE2_JIT_COMPLETE and
	PCRE2_JIT_PARTIAL_SOFT bits also select which matches are executed. */
	int jit_compile_options;
	/* Start offset, in code units, relative to the full input; skip_left is
	subtracted from it before matching. */
	int start_offset;
	/* Number of code units removed from the start of the input. */
	int skip_left;
	/* Number of code units removed from the end of the input. */
	int skip_right;
	/* Expected values handed to check_invalid_utf_result(); presumably the
	expected match bounds, with -1 meaning no match (confirm against that
	helper). */
	int match_start;
	int match_end;
	/* Zero-terminated patterns; a NULL slot is skipped by the runner. */
	const PCRE2_UCHAR32 *pattern[2];
	/* Zero-terminated subject. */
	const PCRE2_UCHAR32 *input;
};
2358
2359 static PCRE2_UCHAR32 allany32[] = { '.', 0 };
2360 static PCRE2_UCHAR32 non_word_boundary32[] = { '\\', 'B', 0 };
2361 static PCRE2_UCHAR32 word_boundary32[] = { '\\', 'b', 0 };
2362 static PCRE2_UCHAR32 backreference32[] = { '(', '.', ')', '\\', '1', 0 };
2363 static PCRE2_UCHAR32 grapheme32[] = { '\\', 'X', 0 };
2364 static PCRE2_UCHAR32 nothashmark32[] = { '[', '^', '#', ']', 0 };
2365 static PCRE2_UCHAR32 afternl32[] = { '^', '\\', 'W', 0 };
2366 static PCRE2_UCHAR32 test32_1[] = { 0x10ffff, 0x10ffff, 0x110000, 0x110000, 0x10ffff, 0 };
2367 static PCRE2_UCHAR32 test32_2[] = { 0xd7ff, 0xe000, 0xd800, 0xdfff, 0xe000, 0xdfff, 0xd800, 0 };
2368 static PCRE2_UCHAR32 test32_3[] = { 'a', 'A', 0x110000, 0 };
2369 static PCRE2_UCHAR32 test32_4[] = { '#', 0x10ffff, 0x110000, 0 };
2370 static PCRE2_UCHAR32 test32_5[] = { ' ', 0x2028, '#', 0 };
2371 static PCRE2_UCHAR32 test32_6[] = { ' ', 0x110000, 0x2028, '#', 0 };
2372
/* Invalid UTF-32 regression test table. Columns, in order:
  compile options, JIT compile options, start offset, skip left, skip right,
  match start, match end, { pattern 0, pattern 1 }, input.
Offsets and skips are in 32-bit code units. The table ends with an entry
whose first pattern is NULL. */
static const struct invalid_utf32_regression_test_case invalid_utf32_regression_test_cases[] = {
	{ UDA, CI, 0, 0, 0, 0, 1, { allany32, NULL }, test32_1 },
	{ UDA, CI, 2, 0, 0, -1, -1, { allany32, NULL }, test32_1 },
	{ UDA, CI, 0, 0, 0, 0, 1, { allany32, NULL }, test32_2 },
	{ UDA, CI, 1, 0, 0, 1, 2, { allany32, NULL }, test32_2 },
	{ UDA, CI, 2, 0, 0, -1, -1, { allany32, NULL }, test32_2 },
	{ UDA, CI, 3, 0, 0, -1, -1, { allany32, NULL }, test32_2 },

	{ UDA, CPI, 1, 0, 0, 1, 1, { non_word_boundary32, NULL }, test32_1 },
	{ UDA, CPI, 3, 0, 0, -1, -1, { non_word_boundary32, word_boundary32 }, test32_1 },
	{ UDA, CPI, 1, 0, 0, 1, 1, { non_word_boundary32, NULL }, test32_2 },
	{ UDA, CPI, 3, 0, 0, -1, -1, { non_word_boundary32, word_boundary32 }, test32_2 },
	{ UDA, CPI, 6, 0, 0, -1, -1, { non_word_boundary32, word_boundary32 }, test32_2 },

	{ UDA | PCRE2_CASELESS, CPI, 0, 0, 0, 0, 2, { backreference32, NULL }, test32_3 },
	{ UDA | PCRE2_CASELESS, CPI, 1, 0, 0, -1, -1, { backreference32, NULL }, test32_3 },

	{ UDA, CPI, 0, 0, 0, 0, 1, { grapheme32, NULL }, test32_1 },
	{ UDA, CPI, 2, 0, 0, -1, -1, { grapheme32, NULL }, test32_1 },
	{ UDA, CPI, 1, 0, 0, 1, 2, { grapheme32, NULL }, test32_2 },
	{ UDA, CPI, 2, 0, 0, -1, -1, { grapheme32, NULL }, test32_2 },
	{ UDA, CPI, 3, 0, 0, -1, -1, { grapheme32, NULL }, test32_2 },
	{ UDA, CPI, 4, 0, 0, 4, 5, { grapheme32, NULL }, test32_2 },

	{ UDA, CPI, 0, 0, 0, -1, -1, { nothashmark32, NULL }, test32_4 },
	{ UDA, CPI, 1, 0, 0, 1, 2, { nothashmark32, NULL }, test32_4 },
	{ UDA, CPI, 2, 0, 0, -1, -1, { nothashmark32, NULL }, test32_4 },
	{ UDA, CPI, 1, 0, 0, 1, 2, { nothashmark32, NULL }, test32_2 },
	{ UDA, CPI, 2, 0, 0, -1, -1, { nothashmark32, NULL }, test32_2 },

	{ PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 2, 3, { afternl32, NULL }, test32_5 },
	{ PCRE2_UTF | PCRE2_MULTILINE, CI, 1, 0, 0, 3, 4, { afternl32, NULL }, test32_6 },

	/* Terminator: a NULL first pattern stops the test loop. */
	{ 0, 0, 0, 0, 0, 0, 0, { NULL, NULL }, NULL }
};
2408
2409 #undef UDA
2410 #undef CI
2411 #undef CPI
2412
/* Compiles pattern slot i of one invalid UTF-32 test case, JIT compiles it,
runs the JIT matches selected by the case's JIT options and validates each
outcome with check_invalid_utf_result().

Arguments:
  current        the test case to run
  pattern_index  index of the test case, reported in diagnostics and passed
                 to check_invalid_utf_result()
  i              index into current->pattern selecting the pattern to run
  ccontext       compile context passed to pcre2_compile_32()
  mdata          match data block used for every match

Returns:         1 if the pattern slot is NULL (nothing to run) or every
                 executed match passed the check; 0 if the pattern cannot be
                 compiled, cannot be JIT compiled, or a check failed
*/
static int run_invalid_utf32_test(const struct invalid_utf32_regression_test_case *current,
	int pattern_index, int i, pcre2_compile_context_32 *ccontext, pcre2_match_data_32 *mdata)
{
	pcre2_code_32 *code;
	int result, errorcode;
	int passed = 1;
	PCRE2_SIZE length, start, erroroffset;
	const PCRE2_UCHAR32 *input;
	PCRE2_SIZE *ovector = pcre2_get_ovector_pointer_32(mdata);

	if (current->pattern[i] == NULL)
		return 1;

	code = pcre2_compile_32(current->pattern[i], PCRE2_ZERO_TERMINATED,
		current->compile_options, &errorcode, &erroroffset, ccontext);

	if (!code) {
		/* Report the real pattern slot, not a hard-coded 0. */
		printf("Pattern[%d:%d] cannot be compiled. Error offset: %d\n", pattern_index, i, (int)erroroffset);
		return 0;
	}

	if (pcre2_jit_compile_32(code, current->jit_compile_options) != 0) {
		printf("Pattern[%d:%d] cannot be compiled by the JIT compiler.\n", pattern_index, i);
		pcre2_code_free_32(code);
		return 0;
	}

	/* Count the code units of the input up to its terminating zero. */
	length = 0;
	for (input = current->input; *input != 0; input++)
		length++;

	/* The subject is the input with skip_left code units removed from its
	start and skip_right removed from its end; the start offset is given
	relative to the full input, so it is shifted by skip_left as well. */
	length -= current->skip_left + current->skip_right;
	input = current->input + current->skip_left;
	start = (PCRE2_SIZE)(current->start_offset - current->skip_left);

	if (current->jit_compile_options & PCRE2_JIT_COMPLETE) {
		result = pcre2_jit_match_32(code, input, length, start, 0, mdata, NULL);

		if (check_invalid_utf_result(pattern_index, "match", result, current->match_start, current->match_end, ovector))
			passed = 0;
	}

	/* The partial match is only attempted when the complete match passed. */
	if (passed && (current->jit_compile_options & PCRE2_JIT_PARTIAL_SOFT)) {
		result = pcre2_jit_match_32(code, input, length, start, PCRE2_PARTIAL_SOFT, mdata, NULL);

		if (check_invalid_utf_result(pattern_index, "partial match", result, current->match_start, current->match_end, ovector))
			passed = 0;
	}

	pcre2_code_free_32(code);
	return passed;
}
2470
invalid_utf32_regression_tests(void)2471 static int invalid_utf32_regression_tests(void)
2472 {
2473 const struct invalid_utf32_regression_test_case *current;
2474 pcre2_compile_context_32 *ccontext;
2475 pcre2_match_data_32 *mdata;
2476 int total = 0, successful = 0;
2477 int result;
2478
2479 printf("\nRunning invalid-utf32 JIT regression tests\n");
2480
2481 ccontext = pcre2_compile_context_create_32(NULL);
2482 pcre2_set_newline_32(ccontext, PCRE2_NEWLINE_ANY);
2483 mdata = pcre2_match_data_create_32(4, NULL);
2484
2485 for (current = invalid_utf32_regression_test_cases; current->pattern[0]; current++) {
2486 /* printf("\nPattern: %s :\n", current->pattern); */
2487 total++;
2488
2489 result = 1;
2490 if (!run_invalid_utf32_test(current, total - 1, 0, ccontext, mdata))
2491 result = 0;
2492 if (!run_invalid_utf32_test(current, total - 1, 1, ccontext, mdata))
2493 result = 0;
2494
2495 if (result) {
2496 successful++;
2497 }
2498
2499 printf(".");
2500 if ((total % 60) == 0)
2501 printf("\n");
2502 }
2503
2504 if ((total % 60) != 0)
2505 printf("\n");
2506
2507 pcre2_match_data_free_32(mdata);
2508 pcre2_compile_context_free_32(ccontext);
2509
2510 if (total == successful) {
2511 printf("\nAll invalid UTF32 JIT regression tests are successfully passed.\n");
2512 return 0;
2513 } else {
2514 printf("\nInvalid UTF32 successful test ratio: %d%% (%d failed)\n", successful * 100 / total, total - successful);
2515 return 1;
2516 }
2517 }
2518
2519 #else /* !SUPPORT_UNICODE || !SUPPORT_PCRE2_32 */
2520
/* Placeholder used when Unicode support or the 32-bit library is not compiled
in: there is nothing to run, so success (0) is reported. */
static int invalid_utf32_regression_tests(void)
{
	return 0;
}
2525
2526 #endif /* SUPPORT_UNICODE && SUPPORT_PCRE2_32 */
2527
2528 /* End of pcre2_jit_test.c */
2529