1*22dc650dSSadaf Ebrahimi /*************************************************
2*22dc650dSSadaf Ebrahimi * PCRE2 DEMONSTRATION PROGRAM *
3*22dc650dSSadaf Ebrahimi *************************************************/
4*22dc650dSSadaf Ebrahimi
5*22dc650dSSadaf Ebrahimi /* This is a demonstration program to illustrate a straightforward way of
6*22dc650dSSadaf Ebrahimi using the PCRE2 regular expression library from a C program. See the
7*22dc650dSSadaf Ebrahimi pcre2sample documentation for a short discussion ("man pcre2sample" if you have
8*22dc650dSSadaf Ebrahimi the PCRE2 man pages installed). PCRE2 is a revised API for the library, and is
9*22dc650dSSadaf Ebrahimi incompatible with the original PCRE API.
10*22dc650dSSadaf Ebrahimi
11*22dc650dSSadaf Ebrahimi There are actually three libraries, each supporting a different code unit
12*22dc650dSSadaf Ebrahimi width. This demonstration program uses the 8-bit library. The default is to
13*22dc650dSSadaf Ebrahimi process each code unit as a separate character, but if the pattern begins with
14*22dc650dSSadaf Ebrahimi "(*UTF)", both it and the subject are treated as UTF-8 strings, where
15*22dc650dSSadaf Ebrahimi characters may occupy multiple code units.
16*22dc650dSSadaf Ebrahimi
17*22dc650dSSadaf Ebrahimi In Unix-like environments, if PCRE2 is installed in your standard system
18*22dc650dSSadaf Ebrahimi libraries, you should be able to compile this program using this command:
19*22dc650dSSadaf Ebrahimi
20*22dc650dSSadaf Ebrahimi cc -Wall pcre2demo.c -lpcre2-8 -o pcre2demo
21*22dc650dSSadaf Ebrahimi
22*22dc650dSSadaf Ebrahimi If PCRE2 is not installed in a standard place, it is likely to be installed
23*22dc650dSSadaf Ebrahimi with support for the pkg-config mechanism. If you have pkg-config, you can
24*22dc650dSSadaf Ebrahimi compile this program using this command:
25*22dc650dSSadaf Ebrahimi
26*22dc650dSSadaf Ebrahimi cc -Wall pcre2demo.c `pkg-config --cflags --libs libpcre2-8` -o pcre2demo
27*22dc650dSSadaf Ebrahimi
28*22dc650dSSadaf Ebrahimi If you do not have pkg-config, you may have to use something like this:
29*22dc650dSSadaf Ebrahimi
30*22dc650dSSadaf Ebrahimi cc -Wall pcre2demo.c -I/usr/local/include -L/usr/local/lib \
31*22dc650dSSadaf Ebrahimi -R/usr/local/lib -lpcre2-8 -o pcre2demo
32*22dc650dSSadaf Ebrahimi
33*22dc650dSSadaf Ebrahimi Replace "/usr/local/include" and "/usr/local/lib" with wherever the include and
34*22dc650dSSadaf Ebrahimi library files for PCRE2 are installed on your system. Only some operating
35*22dc650dSSadaf Ebrahimi systems (Solaris is one) use the -R option.
36*22dc650dSSadaf Ebrahimi
37*22dc650dSSadaf Ebrahimi Building under Windows:
38*22dc650dSSadaf Ebrahimi
39*22dc650dSSadaf Ebrahimi If you want to statically link this program against a non-dll .a file, you must
40*22dc650dSSadaf Ebrahimi define PCRE2_STATIC before including pcre2.h, so in this environment, uncomment
41*22dc650dSSadaf Ebrahimi the following line. */
42*22dc650dSSadaf Ebrahimi
43*22dc650dSSadaf Ebrahimi /* #define PCRE2_STATIC */
44*22dc650dSSadaf Ebrahimi
45*22dc650dSSadaf Ebrahimi /* The PCRE2_CODE_UNIT_WIDTH macro must be defined before including pcre2.h.
46*22dc650dSSadaf Ebrahimi For a program that uses only one code unit width, setting it to 8, 16, or 32
47*22dc650dSSadaf Ebrahimi makes it possible to use generic function names such as pcre2_compile(). Note
48*22dc650dSSadaf Ebrahimi that just changing 8 to 16 (for example) is not sufficient to convert this
49*22dc650dSSadaf Ebrahimi program to process 16-bit characters. Even in a fully 16-bit environment, where
50*22dc650dSSadaf Ebrahimi string-handling functions such as strcmp() and printf() work with 16-bit
51*22dc650dSSadaf Ebrahimi characters, the code for handling the table of named substrings will still need
52*22dc650dSSadaf Ebrahimi to be modified. */
53*22dc650dSSadaf Ebrahimi
54*22dc650dSSadaf Ebrahimi #define PCRE2_CODE_UNIT_WIDTH 8
55*22dc650dSSadaf Ebrahimi
56*22dc650dSSadaf Ebrahimi #include <stdio.h>
57*22dc650dSSadaf Ebrahimi #include <string.h>
58*22dc650dSSadaf Ebrahimi #include <pcre2.h>
59*22dc650dSSadaf Ebrahimi
60*22dc650dSSadaf Ebrahimi
61*22dc650dSSadaf Ebrahimi /**************************************************************************
62*22dc650dSSadaf Ebrahimi * Here is the program. The API includes the concept of "contexts" for *
63*22dc650dSSadaf Ebrahimi * setting up unusual interface requirements for compiling and matching, *
64*22dc650dSSadaf Ebrahimi * such as custom memory managers and non-standard newline definitions. *
65*22dc650dSSadaf Ebrahimi * This program does not do any of this, so it makes no use of contexts, *
66*22dc650dSSadaf Ebrahimi * always passing NULL where a context could be given. *
67*22dc650dSSadaf Ebrahimi **************************************************************************/
68*22dc650dSSadaf Ebrahimi
main(int argc,char ** argv)69*22dc650dSSadaf Ebrahimi int main(int argc, char **argv)
70*22dc650dSSadaf Ebrahimi {
71*22dc650dSSadaf Ebrahimi pcre2_code *re;
72*22dc650dSSadaf Ebrahimi PCRE2_SPTR pattern; /* PCRE2_SPTR is a pointer to unsigned code units of */
73*22dc650dSSadaf Ebrahimi PCRE2_SPTR subject; /* the appropriate width (in this case, 8 bits). */
74*22dc650dSSadaf Ebrahimi PCRE2_SPTR name_table;
75*22dc650dSSadaf Ebrahimi
76*22dc650dSSadaf Ebrahimi int crlf_is_newline;
77*22dc650dSSadaf Ebrahimi int errornumber;
78*22dc650dSSadaf Ebrahimi int find_all;
79*22dc650dSSadaf Ebrahimi int i;
80*22dc650dSSadaf Ebrahimi int rc;
81*22dc650dSSadaf Ebrahimi int utf8;
82*22dc650dSSadaf Ebrahimi
83*22dc650dSSadaf Ebrahimi uint32_t option_bits;
84*22dc650dSSadaf Ebrahimi uint32_t namecount;
85*22dc650dSSadaf Ebrahimi uint32_t name_entry_size;
86*22dc650dSSadaf Ebrahimi uint32_t newline;
87*22dc650dSSadaf Ebrahimi
88*22dc650dSSadaf Ebrahimi PCRE2_SIZE erroroffset;
89*22dc650dSSadaf Ebrahimi PCRE2_SIZE *ovector;
90*22dc650dSSadaf Ebrahimi PCRE2_SIZE subject_length;
91*22dc650dSSadaf Ebrahimi
92*22dc650dSSadaf Ebrahimi pcre2_match_data *match_data;
93*22dc650dSSadaf Ebrahimi
94*22dc650dSSadaf Ebrahimi
95*22dc650dSSadaf Ebrahimi /**************************************************************************
96*22dc650dSSadaf Ebrahimi * First, sort out the command line. There is only one possible option at *
97*22dc650dSSadaf Ebrahimi * the moment, "-g" to request repeated matching to find all occurrences, *
98*22dc650dSSadaf Ebrahimi * like Perl's /g option. We set the variable find_all to a non-zero value *
99*22dc650dSSadaf Ebrahimi * if the -g option is present. *
100*22dc650dSSadaf Ebrahimi **************************************************************************/
101*22dc650dSSadaf Ebrahimi
102*22dc650dSSadaf Ebrahimi find_all = 0;
103*22dc650dSSadaf Ebrahimi for (i = 1; i < argc; i++)
104*22dc650dSSadaf Ebrahimi {
105*22dc650dSSadaf Ebrahimi if (strcmp(argv[i], "-g") == 0) find_all = 1;
106*22dc650dSSadaf Ebrahimi else if (argv[i][0] == '-')
107*22dc650dSSadaf Ebrahimi {
108*22dc650dSSadaf Ebrahimi printf("Unrecognised option %s\n", argv[i]);
109*22dc650dSSadaf Ebrahimi return 1;
110*22dc650dSSadaf Ebrahimi }
111*22dc650dSSadaf Ebrahimi else break;
112*22dc650dSSadaf Ebrahimi }
113*22dc650dSSadaf Ebrahimi
114*22dc650dSSadaf Ebrahimi /* After the options, we require exactly two arguments, which are the pattern,
115*22dc650dSSadaf Ebrahimi and the subject string. */
116*22dc650dSSadaf Ebrahimi
117*22dc650dSSadaf Ebrahimi if (argc - i != 2)
118*22dc650dSSadaf Ebrahimi {
119*22dc650dSSadaf Ebrahimi printf("Exactly two arguments required: a regex and a subject string\n");
120*22dc650dSSadaf Ebrahimi return 1;
121*22dc650dSSadaf Ebrahimi }
122*22dc650dSSadaf Ebrahimi
123*22dc650dSSadaf Ebrahimi /* Pattern and subject are char arguments, so they can be straightforwardly
124*22dc650dSSadaf Ebrahimi cast to PCRE2_SPTR because we are working in 8-bit code units. The subject
125*22dc650dSSadaf Ebrahimi length is cast to PCRE2_SIZE for completeness, though PCRE2_SIZE is in fact
126*22dc650dSSadaf Ebrahimi defined to be size_t. */
127*22dc650dSSadaf Ebrahimi
128*22dc650dSSadaf Ebrahimi pattern = (PCRE2_SPTR)argv[i];
129*22dc650dSSadaf Ebrahimi subject = (PCRE2_SPTR)argv[i+1];
130*22dc650dSSadaf Ebrahimi subject_length = (PCRE2_SIZE)strlen((char *)subject);
131*22dc650dSSadaf Ebrahimi
132*22dc650dSSadaf Ebrahimi
133*22dc650dSSadaf Ebrahimi /*************************************************************************
134*22dc650dSSadaf Ebrahimi * Now we are going to compile the regular expression pattern, and handle *
135*22dc650dSSadaf Ebrahimi * any errors that are detected. *
136*22dc650dSSadaf Ebrahimi *************************************************************************/
137*22dc650dSSadaf Ebrahimi
138*22dc650dSSadaf Ebrahimi re = pcre2_compile(
139*22dc650dSSadaf Ebrahimi pattern, /* the pattern */
140*22dc650dSSadaf Ebrahimi PCRE2_ZERO_TERMINATED, /* indicates pattern is zero-terminated */
141*22dc650dSSadaf Ebrahimi 0, /* default options */
142*22dc650dSSadaf Ebrahimi &errornumber, /* for error number */
143*22dc650dSSadaf Ebrahimi &erroroffset, /* for error offset */
144*22dc650dSSadaf Ebrahimi NULL); /* use default compile context */
145*22dc650dSSadaf Ebrahimi
146*22dc650dSSadaf Ebrahimi /* Compilation failed: print the error message and exit. */
147*22dc650dSSadaf Ebrahimi
148*22dc650dSSadaf Ebrahimi if (re == NULL)
149*22dc650dSSadaf Ebrahimi {
150*22dc650dSSadaf Ebrahimi PCRE2_UCHAR buffer[256];
151*22dc650dSSadaf Ebrahimi pcre2_get_error_message(errornumber, buffer, sizeof(buffer));
152*22dc650dSSadaf Ebrahimi printf("PCRE2 compilation failed at offset %d: %s\n", (int)erroroffset,
153*22dc650dSSadaf Ebrahimi buffer);
154*22dc650dSSadaf Ebrahimi return 1;
155*22dc650dSSadaf Ebrahimi }
156*22dc650dSSadaf Ebrahimi
157*22dc650dSSadaf Ebrahimi
158*22dc650dSSadaf Ebrahimi /*************************************************************************
159*22dc650dSSadaf Ebrahimi * If the compilation succeeded, we call PCRE2 again, in order to do a *
160*22dc650dSSadaf Ebrahimi * pattern match against the subject string. This does just ONE match. If *
161*22dc650dSSadaf Ebrahimi * further matching is needed, it will be done below. Before running the *
162*22dc650dSSadaf Ebrahimi * match we must set up a match_data block for holding the result. Using *
163*22dc650dSSadaf Ebrahimi * pcre2_match_data_create_from_pattern() ensures that the block is *
164*22dc650dSSadaf Ebrahimi * exactly the right size for the number of capturing parentheses in the *
165*22dc650dSSadaf Ebrahimi * pattern. If you need to know the actual size of a match_data block as *
166*22dc650dSSadaf Ebrahimi * a number of bytes, you can find it like this: *
167*22dc650dSSadaf Ebrahimi * *
168*22dc650dSSadaf Ebrahimi * PCRE2_SIZE match_data_size = pcre2_get_match_data_size(match_data); *
169*22dc650dSSadaf Ebrahimi *************************************************************************/
170*22dc650dSSadaf Ebrahimi
171*22dc650dSSadaf Ebrahimi match_data = pcre2_match_data_create_from_pattern(re, NULL);
172*22dc650dSSadaf Ebrahimi
173*22dc650dSSadaf Ebrahimi /* Now run the match. */
174*22dc650dSSadaf Ebrahimi
175*22dc650dSSadaf Ebrahimi rc = pcre2_match(
176*22dc650dSSadaf Ebrahimi re, /* the compiled pattern */
177*22dc650dSSadaf Ebrahimi subject, /* the subject string */
178*22dc650dSSadaf Ebrahimi subject_length, /* the length of the subject */
179*22dc650dSSadaf Ebrahimi 0, /* start at offset 0 in the subject */
180*22dc650dSSadaf Ebrahimi 0, /* default options */
181*22dc650dSSadaf Ebrahimi match_data, /* block for storing the result */
182*22dc650dSSadaf Ebrahimi NULL); /* use default match context */
183*22dc650dSSadaf Ebrahimi
184*22dc650dSSadaf Ebrahimi /* Matching failed: handle error cases */
185*22dc650dSSadaf Ebrahimi
186*22dc650dSSadaf Ebrahimi if (rc < 0)
187*22dc650dSSadaf Ebrahimi {
188*22dc650dSSadaf Ebrahimi switch(rc)
189*22dc650dSSadaf Ebrahimi {
190*22dc650dSSadaf Ebrahimi case PCRE2_ERROR_NOMATCH: printf("No match\n"); break;
191*22dc650dSSadaf Ebrahimi /*
192*22dc650dSSadaf Ebrahimi Handle other special cases if you like
193*22dc650dSSadaf Ebrahimi */
194*22dc650dSSadaf Ebrahimi default: printf("Matching error %d\n", rc); break;
195*22dc650dSSadaf Ebrahimi }
196*22dc650dSSadaf Ebrahimi pcre2_match_data_free(match_data); /* Release memory used for the match */
197*22dc650dSSadaf Ebrahimi pcre2_code_free(re); /* data and the compiled pattern. */
198*22dc650dSSadaf Ebrahimi return 1;
199*22dc650dSSadaf Ebrahimi }
200*22dc650dSSadaf Ebrahimi
201*22dc650dSSadaf Ebrahimi /* Match succeeded. Get a pointer to the output vector, where string offsets
202*22dc650dSSadaf Ebrahimi are stored. */
203*22dc650dSSadaf Ebrahimi
204*22dc650dSSadaf Ebrahimi ovector = pcre2_get_ovector_pointer(match_data);
205*22dc650dSSadaf Ebrahimi printf("Match succeeded at offset %d\n", (int)ovector[0]);
206*22dc650dSSadaf Ebrahimi
207*22dc650dSSadaf Ebrahimi
208*22dc650dSSadaf Ebrahimi /*************************************************************************
209*22dc650dSSadaf Ebrahimi * We have found the first match within the subject string. If the output *
210*22dc650dSSadaf Ebrahimi * vector wasn't big enough, say so. Then output any substrings that were *
211*22dc650dSSadaf Ebrahimi * captured. *
212*22dc650dSSadaf Ebrahimi *************************************************************************/
213*22dc650dSSadaf Ebrahimi
214*22dc650dSSadaf Ebrahimi /* The output vector wasn't big enough. This should not happen, because we used
215*22dc650dSSadaf Ebrahimi pcre2_match_data_create_from_pattern() above. */
216*22dc650dSSadaf Ebrahimi
217*22dc650dSSadaf Ebrahimi if (rc == 0)
218*22dc650dSSadaf Ebrahimi printf("ovector was not big enough for all the captured substrings\n");
219*22dc650dSSadaf Ebrahimi
220*22dc650dSSadaf Ebrahimi /* Since release 10.38 PCRE2 has locked out the use of \K in lookaround
221*22dc650dSSadaf Ebrahimi assertions. However, there is an option to re-enable the old behaviour. If that
222*22dc650dSSadaf Ebrahimi is set, it is possible to run patterns such as /(?=.\K)/ that use \K in an
223*22dc650dSSadaf Ebrahimi assertion to set the start of a match later than its end. In this demonstration
224*22dc650dSSadaf Ebrahimi program, we show how to detect this case, but it shouldn't arise because the
225*22dc650dSSadaf Ebrahimi option is never set. */
226*22dc650dSSadaf Ebrahimi
227*22dc650dSSadaf Ebrahimi if (ovector[0] > ovector[1])
228*22dc650dSSadaf Ebrahimi {
229*22dc650dSSadaf Ebrahimi printf("\\K was used in an assertion to set the match start after its end.\n"
230*22dc650dSSadaf Ebrahimi "From end to start the match was: %.*s\n", (int)(ovector[0] - ovector[1]),
231*22dc650dSSadaf Ebrahimi (char *)(subject + ovector[1]));
232*22dc650dSSadaf Ebrahimi printf("Run abandoned\n");
233*22dc650dSSadaf Ebrahimi pcre2_match_data_free(match_data);
234*22dc650dSSadaf Ebrahimi pcre2_code_free(re);
235*22dc650dSSadaf Ebrahimi return 1;
236*22dc650dSSadaf Ebrahimi }
237*22dc650dSSadaf Ebrahimi
238*22dc650dSSadaf Ebrahimi /* Show substrings stored in the output vector by number. Obviously, in a real
239*22dc650dSSadaf Ebrahimi application you might want to do things other than print them. */
240*22dc650dSSadaf Ebrahimi
241*22dc650dSSadaf Ebrahimi for (i = 0; i < rc; i++)
242*22dc650dSSadaf Ebrahimi {
243*22dc650dSSadaf Ebrahimi PCRE2_SPTR substring_start = subject + ovector[2*i];
244*22dc650dSSadaf Ebrahimi PCRE2_SIZE substring_length = ovector[2*i+1] - ovector[2*i];
245*22dc650dSSadaf Ebrahimi printf("%2d: %.*s\n", i, (int)substring_length, (char *)substring_start);
246*22dc650dSSadaf Ebrahimi }
247*22dc650dSSadaf Ebrahimi
248*22dc650dSSadaf Ebrahimi
249*22dc650dSSadaf Ebrahimi /**************************************************************************
250*22dc650dSSadaf Ebrahimi * That concludes the basic part of this demonstration program. We have *
251*22dc650dSSadaf Ebrahimi * compiled a pattern, and performed a single match. The code that follows *
252*22dc650dSSadaf Ebrahimi * shows first how to access named substrings, and then how to code for *
253*22dc650dSSadaf Ebrahimi * repeated matches on the same subject. *
254*22dc650dSSadaf Ebrahimi **************************************************************************/
255*22dc650dSSadaf Ebrahimi
256*22dc650dSSadaf Ebrahimi /* See if there are any named substrings, and if so, show them by name. First
257*22dc650dSSadaf Ebrahimi we have to extract the count of named parentheses from the pattern. */
258*22dc650dSSadaf Ebrahimi
259*22dc650dSSadaf Ebrahimi (void)pcre2_pattern_info(
260*22dc650dSSadaf Ebrahimi re, /* the compiled pattern */
261*22dc650dSSadaf Ebrahimi PCRE2_INFO_NAMECOUNT, /* get the number of named substrings */
262*22dc650dSSadaf Ebrahimi &namecount); /* where to put the answer */
263*22dc650dSSadaf Ebrahimi
264*22dc650dSSadaf Ebrahimi if (namecount == 0) printf("No named substrings\n"); else
265*22dc650dSSadaf Ebrahimi {
266*22dc650dSSadaf Ebrahimi PCRE2_SPTR tabptr;
267*22dc650dSSadaf Ebrahimi printf("Named substrings\n");
268*22dc650dSSadaf Ebrahimi
269*22dc650dSSadaf Ebrahimi /* Before we can access the substrings, we must extract the table for
270*22dc650dSSadaf Ebrahimi translating names to numbers, and the size of each entry in the table. */
271*22dc650dSSadaf Ebrahimi
272*22dc650dSSadaf Ebrahimi (void)pcre2_pattern_info(
273*22dc650dSSadaf Ebrahimi re, /* the compiled pattern */
274*22dc650dSSadaf Ebrahimi PCRE2_INFO_NAMETABLE, /* address of the table */
275*22dc650dSSadaf Ebrahimi &name_table); /* where to put the answer */
276*22dc650dSSadaf Ebrahimi
277*22dc650dSSadaf Ebrahimi (void)pcre2_pattern_info(
278*22dc650dSSadaf Ebrahimi re, /* the compiled pattern */
279*22dc650dSSadaf Ebrahimi PCRE2_INFO_NAMEENTRYSIZE, /* size of each entry in the table */
280*22dc650dSSadaf Ebrahimi &name_entry_size); /* where to put the answer */
281*22dc650dSSadaf Ebrahimi
282*22dc650dSSadaf Ebrahimi /* Now we can scan the table and, for each entry, print the number, the name,
283*22dc650dSSadaf Ebrahimi and the substring itself. In the 8-bit library the number is held in two
284*22dc650dSSadaf Ebrahimi bytes, most significant first. */
285*22dc650dSSadaf Ebrahimi
286*22dc650dSSadaf Ebrahimi tabptr = name_table;
287*22dc650dSSadaf Ebrahimi for (i = 0; i < namecount; i++)
288*22dc650dSSadaf Ebrahimi {
289*22dc650dSSadaf Ebrahimi int n = (tabptr[0] << 8) | tabptr[1];
290*22dc650dSSadaf Ebrahimi printf("(%d) %*s: %.*s\n", n, name_entry_size - 3, tabptr + 2,
291*22dc650dSSadaf Ebrahimi (int)(ovector[2*n+1] - ovector[2*n]), subject + ovector[2*n]);
292*22dc650dSSadaf Ebrahimi tabptr += name_entry_size;
293*22dc650dSSadaf Ebrahimi }
294*22dc650dSSadaf Ebrahimi }
295*22dc650dSSadaf Ebrahimi
296*22dc650dSSadaf Ebrahimi
297*22dc650dSSadaf Ebrahimi /*************************************************************************
298*22dc650dSSadaf Ebrahimi * If the "-g" option was given on the command line, we want to continue *
299*22dc650dSSadaf Ebrahimi * to search for additional matches in the subject string, in a similar *
300*22dc650dSSadaf Ebrahimi * way to the /g option in Perl. This turns out to be trickier than you *
301*22dc650dSSadaf Ebrahimi * might think because of the possibility of matching an empty string. *
302*22dc650dSSadaf Ebrahimi * What happens is as follows: *
303*22dc650dSSadaf Ebrahimi * *
304*22dc650dSSadaf Ebrahimi * If the previous match was NOT for an empty string, we can just start *
305*22dc650dSSadaf Ebrahimi * the next match at the end of the previous one. *
306*22dc650dSSadaf Ebrahimi * *
307*22dc650dSSadaf Ebrahimi * If the previous match WAS for an empty string, we can't do that, as it *
308*22dc650dSSadaf Ebrahimi * would lead to an infinite loop. Instead, a call of pcre2_match() is *
309*22dc650dSSadaf Ebrahimi * made with the PCRE2_NOTEMPTY_ATSTART and PCRE2_ANCHORED flags set. The *
310*22dc650dSSadaf Ebrahimi * first of these tells PCRE2 that an empty string at the start of the *
311*22dc650dSSadaf Ebrahimi * subject is not a valid match; other possibilities must be tried. The *
312*22dc650dSSadaf Ebrahimi * second flag restricts PCRE2 to one match attempt at the initial string *
313*22dc650dSSadaf Ebrahimi * position. If this match succeeds, an alternative to the empty string *
314*22dc650dSSadaf Ebrahimi * match has been found, and we can print it and proceed round the loop, *
315*22dc650dSSadaf Ebrahimi * advancing by the length of whatever was found. If this match does not *
316*22dc650dSSadaf Ebrahimi * succeed, we still stay in the loop, advancing by just one character. *
317*22dc650dSSadaf Ebrahimi * In UTF-8 mode, which can be set by (*UTF) in the pattern, this may be *
318*22dc650dSSadaf Ebrahimi * more than one byte. *
319*22dc650dSSadaf Ebrahimi * *
320*22dc650dSSadaf Ebrahimi * However, there is a complication concerned with newlines. When the *
321*22dc650dSSadaf Ebrahimi * newline convention is such that CRLF is a valid newline, we must *
322*22dc650dSSadaf Ebrahimi * advance by two characters rather than one. The newline convention can *
323*22dc650dSSadaf Ebrahimi * be set in the regex by (*CR), etc.; if not, we must find the default. *
324*22dc650dSSadaf Ebrahimi *************************************************************************/
325*22dc650dSSadaf Ebrahimi
326*22dc650dSSadaf Ebrahimi if (!find_all) /* Check for -g */
327*22dc650dSSadaf Ebrahimi {
328*22dc650dSSadaf Ebrahimi pcre2_match_data_free(match_data); /* Release the memory that was used */
329*22dc650dSSadaf Ebrahimi pcre2_code_free(re); /* for the match data and the pattern. */
330*22dc650dSSadaf Ebrahimi return 0; /* Exit the program. */
331*22dc650dSSadaf Ebrahimi }
332*22dc650dSSadaf Ebrahimi
333*22dc650dSSadaf Ebrahimi /* Before running the loop, check for UTF-8 and whether CRLF is a valid newline
334*22dc650dSSadaf Ebrahimi sequence. First, find the options with which the regex was compiled and extract
335*22dc650dSSadaf Ebrahimi the UTF state. */
336*22dc650dSSadaf Ebrahimi
337*22dc650dSSadaf Ebrahimi (void)pcre2_pattern_info(re, PCRE2_INFO_ALLOPTIONS, &option_bits);
338*22dc650dSSadaf Ebrahimi utf8 = (option_bits & PCRE2_UTF) != 0;
339*22dc650dSSadaf Ebrahimi
340*22dc650dSSadaf Ebrahimi /* Now find the newline convention and see whether CRLF is a valid newline
341*22dc650dSSadaf Ebrahimi sequence. */
342*22dc650dSSadaf Ebrahimi
343*22dc650dSSadaf Ebrahimi (void)pcre2_pattern_info(re, PCRE2_INFO_NEWLINE, &newline);
344*22dc650dSSadaf Ebrahimi crlf_is_newline = newline == PCRE2_NEWLINE_ANY ||
345*22dc650dSSadaf Ebrahimi newline == PCRE2_NEWLINE_CRLF ||
346*22dc650dSSadaf Ebrahimi newline == PCRE2_NEWLINE_ANYCRLF;
347*22dc650dSSadaf Ebrahimi
348*22dc650dSSadaf Ebrahimi /* Loop for second and subsequent matches */
349*22dc650dSSadaf Ebrahimi
350*22dc650dSSadaf Ebrahimi for (;;)
351*22dc650dSSadaf Ebrahimi {
352*22dc650dSSadaf Ebrahimi uint32_t options = 0; /* Normally no options */
353*22dc650dSSadaf Ebrahimi PCRE2_SIZE start_offset = ovector[1]; /* Start at end of previous match */
354*22dc650dSSadaf Ebrahimi
355*22dc650dSSadaf Ebrahimi /* If the previous match was for an empty string, we are finished if we are
356*22dc650dSSadaf Ebrahimi at the end of the subject. Otherwise, arrange to run another match at the
357*22dc650dSSadaf Ebrahimi same point to see if a non-empty match can be found. */
358*22dc650dSSadaf Ebrahimi
359*22dc650dSSadaf Ebrahimi if (ovector[0] == ovector[1])
360*22dc650dSSadaf Ebrahimi {
361*22dc650dSSadaf Ebrahimi if (ovector[0] == subject_length) break;
362*22dc650dSSadaf Ebrahimi options = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED;
363*22dc650dSSadaf Ebrahimi }
364*22dc650dSSadaf Ebrahimi
365*22dc650dSSadaf Ebrahimi /* If the previous match was not an empty string, there is one tricky case to
366*22dc650dSSadaf Ebrahimi consider. If a pattern contains \K within a lookbehind assertion at the
367*22dc650dSSadaf Ebrahimi start, the end of the matched string can be at the offset where the match
368*22dc650dSSadaf Ebrahimi started. Without special action, this leads to a loop that keeps on matching
369*22dc650dSSadaf Ebrahimi the same substring. We must detect this case and arrange to move the start on
370*22dc650dSSadaf Ebrahimi by one character. The pcre2_get_startchar() function returns the starting
371*22dc650dSSadaf Ebrahimi offset that was passed to pcre2_match(). */
372*22dc650dSSadaf Ebrahimi
373*22dc650dSSadaf Ebrahimi else
374*22dc650dSSadaf Ebrahimi {
375*22dc650dSSadaf Ebrahimi PCRE2_SIZE startchar = pcre2_get_startchar(match_data);
376*22dc650dSSadaf Ebrahimi if (start_offset <= startchar)
377*22dc650dSSadaf Ebrahimi {
378*22dc650dSSadaf Ebrahimi if (startchar >= subject_length) break; /* Reached end of subject. */
379*22dc650dSSadaf Ebrahimi start_offset = startchar + 1; /* Advance by one character. */
380*22dc650dSSadaf Ebrahimi if (utf8) /* If UTF-8, it may be more */
381*22dc650dSSadaf Ebrahimi { /* than one code unit. */
382*22dc650dSSadaf Ebrahimi for (; start_offset < subject_length; start_offset++)
383*22dc650dSSadaf Ebrahimi if ((subject[start_offset] & 0xc0) != 0x80) break;
384*22dc650dSSadaf Ebrahimi }
385*22dc650dSSadaf Ebrahimi }
386*22dc650dSSadaf Ebrahimi }
387*22dc650dSSadaf Ebrahimi
388*22dc650dSSadaf Ebrahimi /* Run the next matching operation */
389*22dc650dSSadaf Ebrahimi
390*22dc650dSSadaf Ebrahimi rc = pcre2_match(
391*22dc650dSSadaf Ebrahimi re, /* the compiled pattern */
392*22dc650dSSadaf Ebrahimi subject, /* the subject string */
393*22dc650dSSadaf Ebrahimi subject_length, /* the length of the subject */
394*22dc650dSSadaf Ebrahimi start_offset, /* starting offset in the subject */
395*22dc650dSSadaf Ebrahimi options, /* options */
396*22dc650dSSadaf Ebrahimi match_data, /* block for storing the result */
397*22dc650dSSadaf Ebrahimi NULL); /* use default match context */
398*22dc650dSSadaf Ebrahimi
399*22dc650dSSadaf Ebrahimi /* This time, a result of NOMATCH isn't an error. If the value in "options"
400*22dc650dSSadaf Ebrahimi is zero, it just means we have found all possible matches, so the loop ends.
401*22dc650dSSadaf Ebrahimi Otherwise, it means we have failed to find a non-empty-string match at a
402*22dc650dSSadaf Ebrahimi point where there was a previous empty-string match. In this case, we do what
403*22dc650dSSadaf Ebrahimi Perl does: advance the matching position by one character, and continue. We
404*22dc650dSSadaf Ebrahimi do this by setting the "end of previous match" offset, because that is picked
405*22dc650dSSadaf Ebrahimi up at the top of the loop as the point at which to start again.
406*22dc650dSSadaf Ebrahimi
407*22dc650dSSadaf Ebrahimi There are two complications: (a) When CRLF is a valid newline sequence, and
408*22dc650dSSadaf Ebrahimi the current position is just before it, advance by an extra byte. (b)
409*22dc650dSSadaf Ebrahimi Otherwise we must ensure that we skip an entire UTF character if we are in
410*22dc650dSSadaf Ebrahimi UTF mode. */
411*22dc650dSSadaf Ebrahimi
412*22dc650dSSadaf Ebrahimi if (rc == PCRE2_ERROR_NOMATCH)
413*22dc650dSSadaf Ebrahimi {
414*22dc650dSSadaf Ebrahimi if (options == 0) break; /* All matches found */
415*22dc650dSSadaf Ebrahimi ovector[1] = start_offset + 1; /* Advance one code unit */
416*22dc650dSSadaf Ebrahimi if (crlf_is_newline && /* If CRLF is a newline & */
417*22dc650dSSadaf Ebrahimi start_offset < subject_length - 1 && /* we are at CRLF, */
418*22dc650dSSadaf Ebrahimi subject[start_offset] == '\r' &&
419*22dc650dSSadaf Ebrahimi subject[start_offset + 1] == '\n')
420*22dc650dSSadaf Ebrahimi ovector[1] += 1; /* Advance by one more. */
421*22dc650dSSadaf Ebrahimi else if (utf8) /* Otherwise, ensure we */
422*22dc650dSSadaf Ebrahimi { /* advance a whole UTF-8 */
423*22dc650dSSadaf Ebrahimi while (ovector[1] < subject_length) /* character. */
424*22dc650dSSadaf Ebrahimi {
425*22dc650dSSadaf Ebrahimi if ((subject[ovector[1]] & 0xc0) != 0x80) break;
426*22dc650dSSadaf Ebrahimi ovector[1] += 1;
427*22dc650dSSadaf Ebrahimi }
428*22dc650dSSadaf Ebrahimi }
429*22dc650dSSadaf Ebrahimi continue; /* Go round the loop again */
430*22dc650dSSadaf Ebrahimi }
431*22dc650dSSadaf Ebrahimi
432*22dc650dSSadaf Ebrahimi /* Other matching errors are not recoverable. */
433*22dc650dSSadaf Ebrahimi
434*22dc650dSSadaf Ebrahimi if (rc < 0)
435*22dc650dSSadaf Ebrahimi {
436*22dc650dSSadaf Ebrahimi printf("Matching error %d\n", rc);
437*22dc650dSSadaf Ebrahimi pcre2_match_data_free(match_data);
438*22dc650dSSadaf Ebrahimi pcre2_code_free(re);
439*22dc650dSSadaf Ebrahimi return 1;
440*22dc650dSSadaf Ebrahimi }
441*22dc650dSSadaf Ebrahimi
442*22dc650dSSadaf Ebrahimi /* Match succeeded */
443*22dc650dSSadaf Ebrahimi
444*22dc650dSSadaf Ebrahimi printf("\nMatch succeeded again at offset %d\n", (int)ovector[0]);
445*22dc650dSSadaf Ebrahimi
446*22dc650dSSadaf Ebrahimi /* The match succeeded, but the output vector wasn't big enough. This
447*22dc650dSSadaf Ebrahimi should not happen. */
448*22dc650dSSadaf Ebrahimi
449*22dc650dSSadaf Ebrahimi if (rc == 0)
450*22dc650dSSadaf Ebrahimi printf("ovector was not big enough for all the captured substrings\n");
451*22dc650dSSadaf Ebrahimi
452*22dc650dSSadaf Ebrahimi /* We must guard against patterns such as /(?=.\K)/ that use \K in an
453*22dc650dSSadaf Ebrahimi assertion to set the start of a match later than its end. In this
454*22dc650dSSadaf Ebrahimi demonstration program, we just detect this case and give up. */
455*22dc650dSSadaf Ebrahimi
456*22dc650dSSadaf Ebrahimi if (ovector[0] > ovector[1])
457*22dc650dSSadaf Ebrahimi {
458*22dc650dSSadaf Ebrahimi printf("\\K was used in an assertion to set the match start after its end.\n"
459*22dc650dSSadaf Ebrahimi "From end to start the match was: %.*s\n", (int)(ovector[0] - ovector[1]),
460*22dc650dSSadaf Ebrahimi (char *)(subject + ovector[1]));
461*22dc650dSSadaf Ebrahimi printf("Run abandoned\n");
462*22dc650dSSadaf Ebrahimi pcre2_match_data_free(match_data);
463*22dc650dSSadaf Ebrahimi pcre2_code_free(re);
464*22dc650dSSadaf Ebrahimi return 1;
465*22dc650dSSadaf Ebrahimi }
466*22dc650dSSadaf Ebrahimi
467*22dc650dSSadaf Ebrahimi /* As before, show substrings stored in the output vector by number, and then
468*22dc650dSSadaf Ebrahimi also any named substrings. */
469*22dc650dSSadaf Ebrahimi
470*22dc650dSSadaf Ebrahimi for (i = 0; i < rc; i++)
471*22dc650dSSadaf Ebrahimi {
472*22dc650dSSadaf Ebrahimi PCRE2_SPTR substring_start = subject + ovector[2*i];
473*22dc650dSSadaf Ebrahimi size_t substring_length = ovector[2*i+1] - ovector[2*i];
474*22dc650dSSadaf Ebrahimi printf("%2d: %.*s\n", i, (int)substring_length, (char *)substring_start);
475*22dc650dSSadaf Ebrahimi }
476*22dc650dSSadaf Ebrahimi
477*22dc650dSSadaf Ebrahimi if (namecount == 0) printf("No named substrings\n"); else
478*22dc650dSSadaf Ebrahimi {
479*22dc650dSSadaf Ebrahimi PCRE2_SPTR tabptr = name_table;
480*22dc650dSSadaf Ebrahimi printf("Named substrings\n");
481*22dc650dSSadaf Ebrahimi for (i = 0; i < namecount; i++)
482*22dc650dSSadaf Ebrahimi {
483*22dc650dSSadaf Ebrahimi int n = (tabptr[0] << 8) | tabptr[1];
484*22dc650dSSadaf Ebrahimi printf("(%d) %*s: %.*s\n", n, name_entry_size - 3, tabptr + 2,
485*22dc650dSSadaf Ebrahimi (int)(ovector[2*n+1] - ovector[2*n]), subject + ovector[2*n]);
486*22dc650dSSadaf Ebrahimi tabptr += name_entry_size;
487*22dc650dSSadaf Ebrahimi }
488*22dc650dSSadaf Ebrahimi }
489*22dc650dSSadaf Ebrahimi } /* End of loop to find second and subsequent matches */
490*22dc650dSSadaf Ebrahimi
491*22dc650dSSadaf Ebrahimi printf("\n");
492*22dc650dSSadaf Ebrahimi pcre2_match_data_free(match_data);
493*22dc650dSSadaf Ebrahimi pcre2_code_free(re);
494*22dc650dSSadaf Ebrahimi return 0;
495*22dc650dSSadaf Ebrahimi }
496*22dc650dSSadaf Ebrahimi
497*22dc650dSSadaf Ebrahimi /* End of pcre2demo.c */
498