xref: /aosp_15_r20/external/pcre/src/pcre2demo.c (revision 22dc650d8ae982c6770746019a6f94af92b0f024)
/*************************************************
*           PCRE2 DEMONSTRATION PROGRAM          *
*************************************************/

/* This is a demonstration program to illustrate a straightforward way of
using the PCRE2 regular expression library from a C program. See the
pcre2sample documentation for a short discussion ("man pcre2sample" if you have
the PCRE2 man pages installed). PCRE2 is a revised API for the library, and is
incompatible with the original PCRE API.

There are actually three libraries, each supporting a different code unit
width. This demonstration program uses the 8-bit library. The default is to
process each code unit as a separate character, but if the pattern begins with
"(*UTF)", both it and the subject are treated as UTF-8 strings, where
characters may occupy multiple code units.

In Unix-like environments, if PCRE2 is installed in your standard system
libraries, you should be able to compile this program using this command:

cc -Wall pcre2demo.c -lpcre2-8 -o pcre2demo

If PCRE2 is not installed in a standard place, it is likely to be installed
with support for the pkg-config mechanism. If you have pkg-config, you can
compile this program using this command:

cc -Wall pcre2demo.c `pkg-config --cflags --libs libpcre2-8` -o pcre2demo

If you do not have pkg-config, you may have to use something like this:

cc -Wall pcre2demo.c -I/usr/local/include -L/usr/local/lib \
  -R/usr/local/lib -lpcre2-8 -o pcre2demo

Replace "/usr/local/include" and "/usr/local/lib" with wherever the include and
library files for PCRE2 are installed on your system. Only some operating
systems (Solaris is one) use the -R option.

Building under Windows:

If you want to statically link this program against a non-dll .a file, you must
define PCRE2_STATIC before including pcre2.h, so in this environment, uncomment
the following line. */
42*22dc650dSSadaf Ebrahimi 
43*22dc650dSSadaf Ebrahimi /* #define PCRE2_STATIC */
44*22dc650dSSadaf Ebrahimi 
/* The PCRE2_CODE_UNIT_WIDTH macro must be defined before including pcre2.h.
For a program that uses only one code unit width, setting it to 8, 16, or 32
makes it possible to use generic function names such as pcre2_compile(). Note
that just changing 8 to 16 (for example) is not sufficient to convert this
program to process 16-bit characters. Even in a fully 16-bit environment, where
string-handling functions such as strcmp() and printf() work with 16-bit
characters, the code for handling the table of named substrings will still need
to be modified. */
53*22dc650dSSadaf Ebrahimi 
54*22dc650dSSadaf Ebrahimi #define PCRE2_CODE_UNIT_WIDTH 8
55*22dc650dSSadaf Ebrahimi 
56*22dc650dSSadaf Ebrahimi #include <stdio.h>
57*22dc650dSSadaf Ebrahimi #include <string.h>
58*22dc650dSSadaf Ebrahimi #include <pcre2.h>
59*22dc650dSSadaf Ebrahimi 
60*22dc650dSSadaf Ebrahimi 
/**************************************************************************
* Here is the program. The API includes the concept of "contexts" for     *
* setting up unusual interface requirements for compiling and matching,   *
* such as custom memory managers and non-standard newline definitions.    *
* This program does not do any of this, so it makes no use of contexts,   *
* always passing NULL where a context could be given.                     *
**************************************************************************/
68*22dc650dSSadaf Ebrahimi 
main(int argc,char ** argv)69*22dc650dSSadaf Ebrahimi int main(int argc, char **argv)
70*22dc650dSSadaf Ebrahimi {
71*22dc650dSSadaf Ebrahimi pcre2_code *re;
72*22dc650dSSadaf Ebrahimi PCRE2_SPTR pattern;     /* PCRE2_SPTR is a pointer to unsigned code units of */
73*22dc650dSSadaf Ebrahimi PCRE2_SPTR subject;     /* the appropriate width (in this case, 8 bits). */
74*22dc650dSSadaf Ebrahimi PCRE2_SPTR name_table;
75*22dc650dSSadaf Ebrahimi 
76*22dc650dSSadaf Ebrahimi int crlf_is_newline;
77*22dc650dSSadaf Ebrahimi int errornumber;
78*22dc650dSSadaf Ebrahimi int find_all;
79*22dc650dSSadaf Ebrahimi int i;
80*22dc650dSSadaf Ebrahimi int rc;
81*22dc650dSSadaf Ebrahimi int utf8;
82*22dc650dSSadaf Ebrahimi 
83*22dc650dSSadaf Ebrahimi uint32_t option_bits;
84*22dc650dSSadaf Ebrahimi uint32_t namecount;
85*22dc650dSSadaf Ebrahimi uint32_t name_entry_size;
86*22dc650dSSadaf Ebrahimi uint32_t newline;
87*22dc650dSSadaf Ebrahimi 
88*22dc650dSSadaf Ebrahimi PCRE2_SIZE erroroffset;
89*22dc650dSSadaf Ebrahimi PCRE2_SIZE *ovector;
90*22dc650dSSadaf Ebrahimi PCRE2_SIZE subject_length;
91*22dc650dSSadaf Ebrahimi 
92*22dc650dSSadaf Ebrahimi pcre2_match_data *match_data;
93*22dc650dSSadaf Ebrahimi 
94*22dc650dSSadaf Ebrahimi 
95*22dc650dSSadaf Ebrahimi /**************************************************************************
96*22dc650dSSadaf Ebrahimi * First, sort out the command line. There is only one possible option at  *
97*22dc650dSSadaf Ebrahimi * the moment, "-g" to request repeated matching to find all occurrences,  *
98*22dc650dSSadaf Ebrahimi * like Perl's /g option. We set the variable find_all to a non-zero value *
99*22dc650dSSadaf Ebrahimi * if the -g option is present.                                            *
100*22dc650dSSadaf Ebrahimi **************************************************************************/
101*22dc650dSSadaf Ebrahimi 
102*22dc650dSSadaf Ebrahimi find_all = 0;
103*22dc650dSSadaf Ebrahimi for (i = 1; i < argc; i++)
104*22dc650dSSadaf Ebrahimi   {
105*22dc650dSSadaf Ebrahimi   if (strcmp(argv[i], "-g") == 0) find_all = 1;
106*22dc650dSSadaf Ebrahimi   else if (argv[i][0] == '-')
107*22dc650dSSadaf Ebrahimi     {
108*22dc650dSSadaf Ebrahimi     printf("Unrecognised option %s\n", argv[i]);
109*22dc650dSSadaf Ebrahimi     return 1;
110*22dc650dSSadaf Ebrahimi     }
111*22dc650dSSadaf Ebrahimi   else break;
112*22dc650dSSadaf Ebrahimi   }
113*22dc650dSSadaf Ebrahimi 
114*22dc650dSSadaf Ebrahimi /* After the options, we require exactly two arguments, which are the pattern,
115*22dc650dSSadaf Ebrahimi and the subject string. */
116*22dc650dSSadaf Ebrahimi 
117*22dc650dSSadaf Ebrahimi if (argc - i != 2)
118*22dc650dSSadaf Ebrahimi   {
119*22dc650dSSadaf Ebrahimi   printf("Exactly two arguments required: a regex and a subject string\n");
120*22dc650dSSadaf Ebrahimi   return 1;
121*22dc650dSSadaf Ebrahimi   }
122*22dc650dSSadaf Ebrahimi 
123*22dc650dSSadaf Ebrahimi /* Pattern and subject are char arguments, so they can be straightforwardly
124*22dc650dSSadaf Ebrahimi cast to PCRE2_SPTR because we are working in 8-bit code units. The subject
125*22dc650dSSadaf Ebrahimi length is cast to PCRE2_SIZE for completeness, though PCRE2_SIZE is in fact
126*22dc650dSSadaf Ebrahimi defined to be size_t. */
127*22dc650dSSadaf Ebrahimi 
128*22dc650dSSadaf Ebrahimi pattern = (PCRE2_SPTR)argv[i];
129*22dc650dSSadaf Ebrahimi subject = (PCRE2_SPTR)argv[i+1];
130*22dc650dSSadaf Ebrahimi subject_length = (PCRE2_SIZE)strlen((char *)subject);
131*22dc650dSSadaf Ebrahimi 
132*22dc650dSSadaf Ebrahimi 
133*22dc650dSSadaf Ebrahimi /*************************************************************************
134*22dc650dSSadaf Ebrahimi * Now we are going to compile the regular expression pattern, and handle *
135*22dc650dSSadaf Ebrahimi * any errors that are detected.                                          *
136*22dc650dSSadaf Ebrahimi *************************************************************************/
137*22dc650dSSadaf Ebrahimi 
138*22dc650dSSadaf Ebrahimi re = pcre2_compile(
139*22dc650dSSadaf Ebrahimi   pattern,               /* the pattern */
140*22dc650dSSadaf Ebrahimi   PCRE2_ZERO_TERMINATED, /* indicates pattern is zero-terminated */
141*22dc650dSSadaf Ebrahimi   0,                     /* default options */
142*22dc650dSSadaf Ebrahimi   &errornumber,          /* for error number */
143*22dc650dSSadaf Ebrahimi   &erroroffset,          /* for error offset */
144*22dc650dSSadaf Ebrahimi   NULL);                 /* use default compile context */
145*22dc650dSSadaf Ebrahimi 
146*22dc650dSSadaf Ebrahimi /* Compilation failed: print the error message and exit. */
147*22dc650dSSadaf Ebrahimi 
148*22dc650dSSadaf Ebrahimi if (re == NULL)
149*22dc650dSSadaf Ebrahimi   {
150*22dc650dSSadaf Ebrahimi   PCRE2_UCHAR buffer[256];
151*22dc650dSSadaf Ebrahimi   pcre2_get_error_message(errornumber, buffer, sizeof(buffer));
152*22dc650dSSadaf Ebrahimi   printf("PCRE2 compilation failed at offset %d: %s\n", (int)erroroffset,
153*22dc650dSSadaf Ebrahimi     buffer);
154*22dc650dSSadaf Ebrahimi   return 1;
155*22dc650dSSadaf Ebrahimi   }
156*22dc650dSSadaf Ebrahimi 
157*22dc650dSSadaf Ebrahimi 
158*22dc650dSSadaf Ebrahimi /*************************************************************************
159*22dc650dSSadaf Ebrahimi * If the compilation succeeded, we call PCRE2 again, in order to do a    *
160*22dc650dSSadaf Ebrahimi * pattern match against the subject string. This does just ONE match. If *
161*22dc650dSSadaf Ebrahimi * further matching is needed, it will be done below. Before running the  *
162*22dc650dSSadaf Ebrahimi * match we must set up a match_data block for holding the result. Using  *
163*22dc650dSSadaf Ebrahimi * pcre2_match_data_create_from_pattern() ensures that the block is       *
164*22dc650dSSadaf Ebrahimi * exactly the right size for the number of capturing parentheses in the  *
165*22dc650dSSadaf Ebrahimi * pattern. If you need to know the actual size of a match_data block as  *
166*22dc650dSSadaf Ebrahimi * a number of bytes, you can find it like this:                          *
167*22dc650dSSadaf Ebrahimi *                                                                        *
168*22dc650dSSadaf Ebrahimi * PCRE2_SIZE match_data_size = pcre2_get_match_data_size(match_data);    *
169*22dc650dSSadaf Ebrahimi *************************************************************************/
170*22dc650dSSadaf Ebrahimi 
171*22dc650dSSadaf Ebrahimi match_data = pcre2_match_data_create_from_pattern(re, NULL);
172*22dc650dSSadaf Ebrahimi 
173*22dc650dSSadaf Ebrahimi /* Now run the match. */
174*22dc650dSSadaf Ebrahimi 
175*22dc650dSSadaf Ebrahimi rc = pcre2_match(
176*22dc650dSSadaf Ebrahimi   re,                   /* the compiled pattern */
177*22dc650dSSadaf Ebrahimi   subject,              /* the subject string */
178*22dc650dSSadaf Ebrahimi   subject_length,       /* the length of the subject */
179*22dc650dSSadaf Ebrahimi   0,                    /* start at offset 0 in the subject */
180*22dc650dSSadaf Ebrahimi   0,                    /* default options */
181*22dc650dSSadaf Ebrahimi   match_data,           /* block for storing the result */
182*22dc650dSSadaf Ebrahimi   NULL);                /* use default match context */
183*22dc650dSSadaf Ebrahimi 
184*22dc650dSSadaf Ebrahimi /* Matching failed: handle error cases */
185*22dc650dSSadaf Ebrahimi 
186*22dc650dSSadaf Ebrahimi if (rc < 0)
187*22dc650dSSadaf Ebrahimi   {
188*22dc650dSSadaf Ebrahimi   switch(rc)
189*22dc650dSSadaf Ebrahimi     {
190*22dc650dSSadaf Ebrahimi     case PCRE2_ERROR_NOMATCH: printf("No match\n"); break;
191*22dc650dSSadaf Ebrahimi     /*
192*22dc650dSSadaf Ebrahimi     Handle other special cases if you like
193*22dc650dSSadaf Ebrahimi     */
194*22dc650dSSadaf Ebrahimi     default: printf("Matching error %d\n", rc); break;
195*22dc650dSSadaf Ebrahimi     }
196*22dc650dSSadaf Ebrahimi   pcre2_match_data_free(match_data);   /* Release memory used for the match */
197*22dc650dSSadaf Ebrahimi   pcre2_code_free(re);                 /*   data and the compiled pattern. */
198*22dc650dSSadaf Ebrahimi   return 1;
199*22dc650dSSadaf Ebrahimi   }
200*22dc650dSSadaf Ebrahimi 
201*22dc650dSSadaf Ebrahimi /* Match succeeded. Get a pointer to the output vector, where string offsets
202*22dc650dSSadaf Ebrahimi are stored. */
203*22dc650dSSadaf Ebrahimi 
204*22dc650dSSadaf Ebrahimi ovector = pcre2_get_ovector_pointer(match_data);
205*22dc650dSSadaf Ebrahimi printf("Match succeeded at offset %d\n", (int)ovector[0]);
206*22dc650dSSadaf Ebrahimi 
207*22dc650dSSadaf Ebrahimi 
208*22dc650dSSadaf Ebrahimi /*************************************************************************
209*22dc650dSSadaf Ebrahimi * We have found the first match within the subject string. If the output *
210*22dc650dSSadaf Ebrahimi * vector wasn't big enough, say so. Then output any substrings that were *
211*22dc650dSSadaf Ebrahimi * captured.                                                              *
212*22dc650dSSadaf Ebrahimi *************************************************************************/
213*22dc650dSSadaf Ebrahimi 
214*22dc650dSSadaf Ebrahimi /* The output vector wasn't big enough. This should not happen, because we used
215*22dc650dSSadaf Ebrahimi pcre2_match_data_create_from_pattern() above. */
216*22dc650dSSadaf Ebrahimi 
217*22dc650dSSadaf Ebrahimi if (rc == 0)
218*22dc650dSSadaf Ebrahimi   printf("ovector was not big enough for all the captured substrings\n");
219*22dc650dSSadaf Ebrahimi 
220*22dc650dSSadaf Ebrahimi /* Since release 10.38 PCRE2 has locked out the use of \K in lookaround
221*22dc650dSSadaf Ebrahimi assertions. However, there is an option to re-enable the old behaviour. If that
222*22dc650dSSadaf Ebrahimi is set, it is possible to run patterns such as /(?=.\K)/ that use \K in an
223*22dc650dSSadaf Ebrahimi assertion to set the start of a match later than its end. In this demonstration
224*22dc650dSSadaf Ebrahimi program, we show how to detect this case, but it shouldn't arise because the
225*22dc650dSSadaf Ebrahimi option is never set. */
226*22dc650dSSadaf Ebrahimi 
227*22dc650dSSadaf Ebrahimi if (ovector[0] > ovector[1])
228*22dc650dSSadaf Ebrahimi   {
229*22dc650dSSadaf Ebrahimi   printf("\\K was used in an assertion to set the match start after its end.\n"
230*22dc650dSSadaf Ebrahimi     "From end to start the match was: %.*s\n", (int)(ovector[0] - ovector[1]),
231*22dc650dSSadaf Ebrahimi       (char *)(subject + ovector[1]));
232*22dc650dSSadaf Ebrahimi   printf("Run abandoned\n");
233*22dc650dSSadaf Ebrahimi   pcre2_match_data_free(match_data);
234*22dc650dSSadaf Ebrahimi   pcre2_code_free(re);
235*22dc650dSSadaf Ebrahimi   return 1;
236*22dc650dSSadaf Ebrahimi   }
237*22dc650dSSadaf Ebrahimi 
238*22dc650dSSadaf Ebrahimi /* Show substrings stored in the output vector by number. Obviously, in a real
239*22dc650dSSadaf Ebrahimi application you might want to do things other than print them. */
240*22dc650dSSadaf Ebrahimi 
241*22dc650dSSadaf Ebrahimi for (i = 0; i < rc; i++)
242*22dc650dSSadaf Ebrahimi   {
243*22dc650dSSadaf Ebrahimi   PCRE2_SPTR substring_start = subject + ovector[2*i];
244*22dc650dSSadaf Ebrahimi   PCRE2_SIZE substring_length = ovector[2*i+1] - ovector[2*i];
245*22dc650dSSadaf Ebrahimi   printf("%2d: %.*s\n", i, (int)substring_length, (char *)substring_start);
246*22dc650dSSadaf Ebrahimi   }
247*22dc650dSSadaf Ebrahimi 
248*22dc650dSSadaf Ebrahimi 
249*22dc650dSSadaf Ebrahimi /**************************************************************************
250*22dc650dSSadaf Ebrahimi * That concludes the basic part of this demonstration program. We have    *
251*22dc650dSSadaf Ebrahimi * compiled a pattern, and performed a single match. The code that follows *
252*22dc650dSSadaf Ebrahimi * shows first how to access named substrings, and then how to code for    *
253*22dc650dSSadaf Ebrahimi * repeated matches on the same subject.                                   *
254*22dc650dSSadaf Ebrahimi **************************************************************************/
255*22dc650dSSadaf Ebrahimi 
256*22dc650dSSadaf Ebrahimi /* See if there are any named substrings, and if so, show them by name. First
257*22dc650dSSadaf Ebrahimi we have to extract the count of named parentheses from the pattern. */
258*22dc650dSSadaf Ebrahimi 
259*22dc650dSSadaf Ebrahimi (void)pcre2_pattern_info(
260*22dc650dSSadaf Ebrahimi   re,                   /* the compiled pattern */
261*22dc650dSSadaf Ebrahimi   PCRE2_INFO_NAMECOUNT, /* get the number of named substrings */
262*22dc650dSSadaf Ebrahimi   &namecount);          /* where to put the answer */
263*22dc650dSSadaf Ebrahimi 
264*22dc650dSSadaf Ebrahimi if (namecount == 0) printf("No named substrings\n"); else
265*22dc650dSSadaf Ebrahimi   {
266*22dc650dSSadaf Ebrahimi   PCRE2_SPTR tabptr;
267*22dc650dSSadaf Ebrahimi   printf("Named substrings\n");
268*22dc650dSSadaf Ebrahimi 
269*22dc650dSSadaf Ebrahimi   /* Before we can access the substrings, we must extract the table for
270*22dc650dSSadaf Ebrahimi   translating names to numbers, and the size of each entry in the table. */
271*22dc650dSSadaf Ebrahimi 
272*22dc650dSSadaf Ebrahimi   (void)pcre2_pattern_info(
273*22dc650dSSadaf Ebrahimi     re,                       /* the compiled pattern */
274*22dc650dSSadaf Ebrahimi     PCRE2_INFO_NAMETABLE,     /* address of the table */
275*22dc650dSSadaf Ebrahimi     &name_table);             /* where to put the answer */
276*22dc650dSSadaf Ebrahimi 
277*22dc650dSSadaf Ebrahimi   (void)pcre2_pattern_info(
278*22dc650dSSadaf Ebrahimi     re,                       /* the compiled pattern */
279*22dc650dSSadaf Ebrahimi     PCRE2_INFO_NAMEENTRYSIZE, /* size of each entry in the table */
280*22dc650dSSadaf Ebrahimi     &name_entry_size);        /* where to put the answer */
281*22dc650dSSadaf Ebrahimi 
282*22dc650dSSadaf Ebrahimi   /* Now we can scan the table and, for each entry, print the number, the name,
283*22dc650dSSadaf Ebrahimi   and the substring itself. In the 8-bit library the number is held in two
284*22dc650dSSadaf Ebrahimi   bytes, most significant first. */
285*22dc650dSSadaf Ebrahimi 
286*22dc650dSSadaf Ebrahimi   tabptr = name_table;
287*22dc650dSSadaf Ebrahimi   for (i = 0; i < namecount; i++)
288*22dc650dSSadaf Ebrahimi     {
289*22dc650dSSadaf Ebrahimi     int n = (tabptr[0] << 8) | tabptr[1];
290*22dc650dSSadaf Ebrahimi     printf("(%d) %*s: %.*s\n", n, name_entry_size - 3, tabptr + 2,
291*22dc650dSSadaf Ebrahimi       (int)(ovector[2*n+1] - ovector[2*n]), subject + ovector[2*n]);
292*22dc650dSSadaf Ebrahimi     tabptr += name_entry_size;
293*22dc650dSSadaf Ebrahimi     }
294*22dc650dSSadaf Ebrahimi   }
295*22dc650dSSadaf Ebrahimi 
296*22dc650dSSadaf Ebrahimi 
297*22dc650dSSadaf Ebrahimi /*************************************************************************
298*22dc650dSSadaf Ebrahimi * If the "-g" option was given on the command line, we want to continue  *
299*22dc650dSSadaf Ebrahimi * to search for additional matches in the subject string, in a similar   *
300*22dc650dSSadaf Ebrahimi * way to the /g option in Perl. This turns out to be trickier than you   *
301*22dc650dSSadaf Ebrahimi * might think because of the possibility of matching an empty string.    *
302*22dc650dSSadaf Ebrahimi * What happens is as follows:                                            *
303*22dc650dSSadaf Ebrahimi *                                                                        *
304*22dc650dSSadaf Ebrahimi * If the previous match was NOT for an empty string, we can just start   *
305*22dc650dSSadaf Ebrahimi * the next match at the end of the previous one.                         *
306*22dc650dSSadaf Ebrahimi *                                                                        *
307*22dc650dSSadaf Ebrahimi * If the previous match WAS for an empty string, we can't do that, as it *
308*22dc650dSSadaf Ebrahimi * would lead to an infinite loop. Instead, a call of pcre2_match() is    *
309*22dc650dSSadaf Ebrahimi * made with the PCRE2_NOTEMPTY_ATSTART and PCRE2_ANCHORED flags set. The *
310*22dc650dSSadaf Ebrahimi * first of these tells PCRE2 that an empty string at the start of the    *
311*22dc650dSSadaf Ebrahimi * subject is not a valid match; other possibilities must be tried. The   *
312*22dc650dSSadaf Ebrahimi * second flag restricts PCRE2 to one match attempt at the initial string *
313*22dc650dSSadaf Ebrahimi * position. If this match succeeds, an alternative to the empty string   *
314*22dc650dSSadaf Ebrahimi * match has been found, and we can print it and proceed round the loop,  *
315*22dc650dSSadaf Ebrahimi * advancing by the length of whatever was found. If this match does not  *
316*22dc650dSSadaf Ebrahimi * succeed, we still stay in the loop, advancing by just one character.   *
317*22dc650dSSadaf Ebrahimi * In UTF-8 mode, which can be set by (*UTF) in the pattern, this may be  *
318*22dc650dSSadaf Ebrahimi * more than one byte.                                                    *
319*22dc650dSSadaf Ebrahimi *                                                                        *
320*22dc650dSSadaf Ebrahimi * However, there is a complication concerned with newlines. When the     *
321*22dc650dSSadaf Ebrahimi * newline convention is such that CRLF is a valid newline, we must       *
322*22dc650dSSadaf Ebrahimi * advance by two characters rather than one. The newline convention can  *
323*22dc650dSSadaf Ebrahimi * be set in the regex by (*CR), etc.; if not, we must find the default.  *
324*22dc650dSSadaf Ebrahimi *************************************************************************/
325*22dc650dSSadaf Ebrahimi 
326*22dc650dSSadaf Ebrahimi if (!find_all)     /* Check for -g */
327*22dc650dSSadaf Ebrahimi   {
328*22dc650dSSadaf Ebrahimi   pcre2_match_data_free(match_data);  /* Release the memory that was used */
329*22dc650dSSadaf Ebrahimi   pcre2_code_free(re);                /* for the match data and the pattern. */
330*22dc650dSSadaf Ebrahimi   return 0;                           /* Exit the program. */
331*22dc650dSSadaf Ebrahimi   }
332*22dc650dSSadaf Ebrahimi 
333*22dc650dSSadaf Ebrahimi /* Before running the loop, check for UTF-8 and whether CRLF is a valid newline
334*22dc650dSSadaf Ebrahimi sequence. First, find the options with which the regex was compiled and extract
335*22dc650dSSadaf Ebrahimi the UTF state. */
336*22dc650dSSadaf Ebrahimi 
337*22dc650dSSadaf Ebrahimi (void)pcre2_pattern_info(re, PCRE2_INFO_ALLOPTIONS, &option_bits);
338*22dc650dSSadaf Ebrahimi utf8 = (option_bits & PCRE2_UTF) != 0;
339*22dc650dSSadaf Ebrahimi 
340*22dc650dSSadaf Ebrahimi /* Now find the newline convention and see whether CRLF is a valid newline
341*22dc650dSSadaf Ebrahimi sequence. */
342*22dc650dSSadaf Ebrahimi 
343*22dc650dSSadaf Ebrahimi (void)pcre2_pattern_info(re, PCRE2_INFO_NEWLINE, &newline);
344*22dc650dSSadaf Ebrahimi crlf_is_newline = newline == PCRE2_NEWLINE_ANY ||
345*22dc650dSSadaf Ebrahimi                   newline == PCRE2_NEWLINE_CRLF ||
346*22dc650dSSadaf Ebrahimi                   newline == PCRE2_NEWLINE_ANYCRLF;
347*22dc650dSSadaf Ebrahimi 
348*22dc650dSSadaf Ebrahimi /* Loop for second and subsequent matches */
349*22dc650dSSadaf Ebrahimi 
350*22dc650dSSadaf Ebrahimi for (;;)
351*22dc650dSSadaf Ebrahimi   {
352*22dc650dSSadaf Ebrahimi   uint32_t options = 0;                   /* Normally no options */
353*22dc650dSSadaf Ebrahimi   PCRE2_SIZE start_offset = ovector[1];   /* Start at end of previous match */
354*22dc650dSSadaf Ebrahimi 
355*22dc650dSSadaf Ebrahimi   /* If the previous match was for an empty string, we are finished if we are
356*22dc650dSSadaf Ebrahimi   at the end of the subject. Otherwise, arrange to run another match at the
357*22dc650dSSadaf Ebrahimi   same point to see if a non-empty match can be found. */
358*22dc650dSSadaf Ebrahimi 
359*22dc650dSSadaf Ebrahimi   if (ovector[0] == ovector[1])
360*22dc650dSSadaf Ebrahimi     {
361*22dc650dSSadaf Ebrahimi     if (ovector[0] == subject_length) break;
362*22dc650dSSadaf Ebrahimi     options = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED;
363*22dc650dSSadaf Ebrahimi     }
364*22dc650dSSadaf Ebrahimi 
365*22dc650dSSadaf Ebrahimi   /* If the previous match was not an empty string, there is one tricky case to
366*22dc650dSSadaf Ebrahimi   consider. If a pattern contains \K within a lookbehind assertion at the
367*22dc650dSSadaf Ebrahimi   start, the end of the matched string can be at the offset where the match
368*22dc650dSSadaf Ebrahimi   started. Without special action, this leads to a loop that keeps on matching
369*22dc650dSSadaf Ebrahimi   the same substring. We must detect this case and arrange to move the start on
370*22dc650dSSadaf Ebrahimi   by one character. The pcre2_get_startchar() function returns the starting
371*22dc650dSSadaf Ebrahimi   offset that was passed to pcre2_match(). */
372*22dc650dSSadaf Ebrahimi 
373*22dc650dSSadaf Ebrahimi   else
374*22dc650dSSadaf Ebrahimi     {
375*22dc650dSSadaf Ebrahimi     PCRE2_SIZE startchar = pcre2_get_startchar(match_data);
376*22dc650dSSadaf Ebrahimi     if (start_offset <= startchar)
377*22dc650dSSadaf Ebrahimi       {
378*22dc650dSSadaf Ebrahimi       if (startchar >= subject_length) break;   /* Reached end of subject.   */
379*22dc650dSSadaf Ebrahimi       start_offset = startchar + 1;             /* Advance by one character. */
380*22dc650dSSadaf Ebrahimi       if (utf8)                                 /* If UTF-8, it may be more  */
381*22dc650dSSadaf Ebrahimi         {                                       /*   than one code unit.     */
382*22dc650dSSadaf Ebrahimi         for (; start_offset < subject_length; start_offset++)
383*22dc650dSSadaf Ebrahimi           if ((subject[start_offset] & 0xc0) != 0x80) break;
384*22dc650dSSadaf Ebrahimi         }
385*22dc650dSSadaf Ebrahimi       }
386*22dc650dSSadaf Ebrahimi     }
387*22dc650dSSadaf Ebrahimi 
388*22dc650dSSadaf Ebrahimi   /* Run the next matching operation */
389*22dc650dSSadaf Ebrahimi 
390*22dc650dSSadaf Ebrahimi   rc = pcre2_match(
391*22dc650dSSadaf Ebrahimi     re,                   /* the compiled pattern */
392*22dc650dSSadaf Ebrahimi     subject,              /* the subject string */
393*22dc650dSSadaf Ebrahimi     subject_length,       /* the length of the subject */
394*22dc650dSSadaf Ebrahimi     start_offset,         /* starting offset in the subject */
395*22dc650dSSadaf Ebrahimi     options,              /* options */
396*22dc650dSSadaf Ebrahimi     match_data,           /* block for storing the result */
397*22dc650dSSadaf Ebrahimi     NULL);                /* use default match context */
398*22dc650dSSadaf Ebrahimi 
399*22dc650dSSadaf Ebrahimi   /* This time, a result of NOMATCH isn't an error. If the value in "options"
400*22dc650dSSadaf Ebrahimi   is zero, it just means we have found all possible matches, so the loop ends.
401*22dc650dSSadaf Ebrahimi   Otherwise, it means we have failed to find a non-empty-string match at a
402*22dc650dSSadaf Ebrahimi   point where there was a previous empty-string match. In this case, we do what
403*22dc650dSSadaf Ebrahimi   Perl does: advance the matching position by one character, and continue. We
404*22dc650dSSadaf Ebrahimi   do this by setting the "end of previous match" offset, because that is picked
405*22dc650dSSadaf Ebrahimi   up at the top of the loop as the point at which to start again.
406*22dc650dSSadaf Ebrahimi 
407*22dc650dSSadaf Ebrahimi   There are two complications: (a) When CRLF is a valid newline sequence, and
408*22dc650dSSadaf Ebrahimi   the current position is just before it, advance by an extra byte. (b)
409*22dc650dSSadaf Ebrahimi   Otherwise we must ensure that we skip an entire UTF character if we are in
410*22dc650dSSadaf Ebrahimi   UTF mode. */
411*22dc650dSSadaf Ebrahimi 
412*22dc650dSSadaf Ebrahimi   if (rc == PCRE2_ERROR_NOMATCH)
413*22dc650dSSadaf Ebrahimi     {
414*22dc650dSSadaf Ebrahimi     if (options == 0) break;                    /* All matches found */
415*22dc650dSSadaf Ebrahimi     ovector[1] = start_offset + 1;              /* Advance one code unit */
416*22dc650dSSadaf Ebrahimi     if (crlf_is_newline &&                      /* If CRLF is a newline & */
417*22dc650dSSadaf Ebrahimi         start_offset < subject_length - 1 &&    /* we are at CRLF, */
418*22dc650dSSadaf Ebrahimi         subject[start_offset] == '\r' &&
419*22dc650dSSadaf Ebrahimi         subject[start_offset + 1] == '\n')
420*22dc650dSSadaf Ebrahimi       ovector[1] += 1;                          /* Advance by one more. */
421*22dc650dSSadaf Ebrahimi     else if (utf8)                              /* Otherwise, ensure we */
422*22dc650dSSadaf Ebrahimi       {                                         /* advance a whole UTF-8 */
423*22dc650dSSadaf Ebrahimi       while (ovector[1] < subject_length)       /* character. */
424*22dc650dSSadaf Ebrahimi         {
425*22dc650dSSadaf Ebrahimi         if ((subject[ovector[1]] & 0xc0) != 0x80) break;
426*22dc650dSSadaf Ebrahimi         ovector[1] += 1;
427*22dc650dSSadaf Ebrahimi         }
428*22dc650dSSadaf Ebrahimi       }
429*22dc650dSSadaf Ebrahimi     continue;    /* Go round the loop again */
430*22dc650dSSadaf Ebrahimi     }
431*22dc650dSSadaf Ebrahimi 
432*22dc650dSSadaf Ebrahimi   /* Other matching errors are not recoverable. */
433*22dc650dSSadaf Ebrahimi 
434*22dc650dSSadaf Ebrahimi   if (rc < 0)
435*22dc650dSSadaf Ebrahimi     {
436*22dc650dSSadaf Ebrahimi     printf("Matching error %d\n", rc);
437*22dc650dSSadaf Ebrahimi     pcre2_match_data_free(match_data);
438*22dc650dSSadaf Ebrahimi     pcre2_code_free(re);
439*22dc650dSSadaf Ebrahimi     return 1;
440*22dc650dSSadaf Ebrahimi     }
441*22dc650dSSadaf Ebrahimi 
442*22dc650dSSadaf Ebrahimi   /* Match succeeded */
443*22dc650dSSadaf Ebrahimi 
444*22dc650dSSadaf Ebrahimi   printf("\nMatch succeeded again at offset %d\n", (int)ovector[0]);
445*22dc650dSSadaf Ebrahimi 
446*22dc650dSSadaf Ebrahimi   /* The match succeeded, but the output vector wasn't big enough. This
447*22dc650dSSadaf Ebrahimi   should not happen. */
448*22dc650dSSadaf Ebrahimi 
449*22dc650dSSadaf Ebrahimi   if (rc == 0)
450*22dc650dSSadaf Ebrahimi     printf("ovector was not big enough for all the captured substrings\n");
451*22dc650dSSadaf Ebrahimi 
452*22dc650dSSadaf Ebrahimi   /* We must guard against patterns such as /(?=.\K)/ that use \K in an
453*22dc650dSSadaf Ebrahimi   assertion to set the start of a match later than its end. In this
454*22dc650dSSadaf Ebrahimi   demonstration program, we just detect this case and give up. */
455*22dc650dSSadaf Ebrahimi 
456*22dc650dSSadaf Ebrahimi   if (ovector[0] > ovector[1])
457*22dc650dSSadaf Ebrahimi     {
458*22dc650dSSadaf Ebrahimi     printf("\\K was used in an assertion to set the match start after its end.\n"
459*22dc650dSSadaf Ebrahimi       "From end to start the match was: %.*s\n", (int)(ovector[0] - ovector[1]),
460*22dc650dSSadaf Ebrahimi         (char *)(subject + ovector[1]));
461*22dc650dSSadaf Ebrahimi     printf("Run abandoned\n");
462*22dc650dSSadaf Ebrahimi     pcre2_match_data_free(match_data);
463*22dc650dSSadaf Ebrahimi     pcre2_code_free(re);
464*22dc650dSSadaf Ebrahimi     return 1;
465*22dc650dSSadaf Ebrahimi     }
466*22dc650dSSadaf Ebrahimi 
467*22dc650dSSadaf Ebrahimi   /* As before, show substrings stored in the output vector by number, and then
468*22dc650dSSadaf Ebrahimi   also any named substrings. */
469*22dc650dSSadaf Ebrahimi 
470*22dc650dSSadaf Ebrahimi   for (i = 0; i < rc; i++)
471*22dc650dSSadaf Ebrahimi     {
472*22dc650dSSadaf Ebrahimi     PCRE2_SPTR substring_start = subject + ovector[2*i];
473*22dc650dSSadaf Ebrahimi     size_t substring_length = ovector[2*i+1] - ovector[2*i];
474*22dc650dSSadaf Ebrahimi     printf("%2d: %.*s\n", i, (int)substring_length, (char *)substring_start);
475*22dc650dSSadaf Ebrahimi     }
476*22dc650dSSadaf Ebrahimi 
477*22dc650dSSadaf Ebrahimi   if (namecount == 0) printf("No named substrings\n"); else
478*22dc650dSSadaf Ebrahimi     {
479*22dc650dSSadaf Ebrahimi     PCRE2_SPTR tabptr = name_table;
480*22dc650dSSadaf Ebrahimi     printf("Named substrings\n");
481*22dc650dSSadaf Ebrahimi     for (i = 0; i < namecount; i++)
482*22dc650dSSadaf Ebrahimi       {
483*22dc650dSSadaf Ebrahimi       int n = (tabptr[0] << 8) | tabptr[1];
484*22dc650dSSadaf Ebrahimi       printf("(%d) %*s: %.*s\n", n, name_entry_size - 3, tabptr + 2,
485*22dc650dSSadaf Ebrahimi         (int)(ovector[2*n+1] - ovector[2*n]), subject + ovector[2*n]);
486*22dc650dSSadaf Ebrahimi       tabptr += name_entry_size;
487*22dc650dSSadaf Ebrahimi       }
488*22dc650dSSadaf Ebrahimi     }
489*22dc650dSSadaf Ebrahimi   }      /* End of loop to find second and subsequent matches */
490*22dc650dSSadaf Ebrahimi 
491*22dc650dSSadaf Ebrahimi printf("\n");
492*22dc650dSSadaf Ebrahimi pcre2_match_data_free(match_data);
493*22dc650dSSadaf Ebrahimi pcre2_code_free(re);
494*22dc650dSSadaf Ebrahimi return 0;
495*22dc650dSSadaf Ebrahimi }
496*22dc650dSSadaf Ebrahimi 
497*22dc650dSSadaf Ebrahimi /* End of pcre2demo.c */
498