xref: /aosp_15_r20/external/pcre/doc/html/pcre2demo.html (revision 22dc650d8ae982c6770746019a6f94af92b0f024)
1*22dc650dSSadaf Ebrahimi<html>
2*22dc650dSSadaf Ebrahimi<head>
3*22dc650dSSadaf Ebrahimi<title>pcre2demo specification</title>
4*22dc650dSSadaf Ebrahimi</head>
5*22dc650dSSadaf Ebrahimi<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
6*22dc650dSSadaf Ebrahimi<h1>pcre2demo man page</h1>
7*22dc650dSSadaf Ebrahimi<p>
8*22dc650dSSadaf EbrahimiReturn to the <a href="index.html">PCRE2 index page</a>.
9*22dc650dSSadaf Ebrahimi</p>
10*22dc650dSSadaf Ebrahimi<p>
11*22dc650dSSadaf EbrahimiThis page is part of the PCRE2 HTML documentation. It was generated
12*22dc650dSSadaf Ebrahimiautomatically from the original man page. If there is any nonsense in it,
13*22dc650dSSadaf Ebrahimiplease consult the man page, in case the conversion went wrong.
14*22dc650dSSadaf Ebrahimi<br>
15*22dc650dSSadaf Ebrahimi<br><b>
16*22dc650dSSadaf EbrahimiSOURCE CODE
17*22dc650dSSadaf Ebrahimi</b><br>
18*22dc650dSSadaf Ebrahimi<PRE>
19*22dc650dSSadaf Ebrahimi/*************************************************
20*22dc650dSSadaf Ebrahimi*           PCRE2 DEMONSTRATION PROGRAM          *
21*22dc650dSSadaf Ebrahimi*************************************************/
22*22dc650dSSadaf Ebrahimi
23*22dc650dSSadaf Ebrahimi/* This is a demonstration program to illustrate a straightforward way of
24*22dc650dSSadaf Ebrahimiusing the PCRE2 regular expression library from a C program. See the
25*22dc650dSSadaf Ebrahimipcre2sample documentation for a short discussion ("man pcre2sample" if you have
26*22dc650dSSadaf Ebrahimithe PCRE2 man pages installed). PCRE2 is a revised API for the library, and is
27*22dc650dSSadaf Ebrahimiincompatible with the original PCRE API.
28*22dc650dSSadaf Ebrahimi
29*22dc650dSSadaf EbrahimiThere are actually three libraries, each supporting a different code unit
30*22dc650dSSadaf Ebrahimiwidth. This demonstration program uses the 8-bit library. The default is to
31*22dc650dSSadaf Ebrahimiprocess each code unit as a separate character, but if the pattern begins with
32*22dc650dSSadaf Ebrahimi"(*UTF)", both it and the subject are treated as UTF-8 strings, where
33*22dc650dSSadaf Ebrahimicharacters may occupy multiple code units.
34*22dc650dSSadaf Ebrahimi
35*22dc650dSSadaf EbrahimiIn Unix-like environments, if PCRE2 is installed in your standard system
36*22dc650dSSadaf Ebrahimilibraries, you should be able to compile this program using this command:
37*22dc650dSSadaf Ebrahimi
38*22dc650dSSadaf Ebrahimicc -Wall pcre2demo.c -lpcre2-8 -o pcre2demo
39*22dc650dSSadaf Ebrahimi
40*22dc650dSSadaf EbrahimiIf PCRE2 is not installed in a standard place, it is likely to be installed
41*22dc650dSSadaf Ebrahimiwith support for the pkg-config mechanism. If you have pkg-config, you can
42*22dc650dSSadaf Ebrahimicompile this program using this command:
43*22dc650dSSadaf Ebrahimi
44*22dc650dSSadaf Ebrahimicc -Wall pcre2demo.c `pkg-config --cflags --libs libpcre2-8` -o pcre2demo
45*22dc650dSSadaf Ebrahimi
46*22dc650dSSadaf EbrahimiIf you do not have pkg-config, you may have to use something like this:
47*22dc650dSSadaf Ebrahimi
48*22dc650dSSadaf Ebrahimicc -Wall pcre2demo.c -I/usr/local/include -L/usr/local/lib \
49*22dc650dSSadaf Ebrahimi  -R/usr/local/lib -lpcre2-8 -o pcre2demo
50*22dc650dSSadaf Ebrahimi
51*22dc650dSSadaf EbrahimiReplace "/usr/local/include" and "/usr/local/lib" with wherever the include and
52*22dc650dSSadaf Ebrahimilibrary files for PCRE2 are installed on your system. Only some operating
53*22dc650dSSadaf Ebrahimisystems (Solaris is one) use the -R option.
54*22dc650dSSadaf Ebrahimi
55*22dc650dSSadaf EbrahimiBuilding under Windows:
56*22dc650dSSadaf Ebrahimi
57*22dc650dSSadaf EbrahimiIf you want to statically link this program against a non-dll .a file, you must
58*22dc650dSSadaf Ebrahimidefine PCRE2_STATIC before including pcre2.h, so in this environment, uncomment
59*22dc650dSSadaf Ebrahimithe following line. */
60*22dc650dSSadaf Ebrahimi
61*22dc650dSSadaf Ebrahimi/* #define PCRE2_STATIC */
62*22dc650dSSadaf Ebrahimi
63*22dc650dSSadaf Ebrahimi/* The PCRE2_CODE_UNIT_WIDTH macro must be defined before including pcre2.h.
64*22dc650dSSadaf EbrahimiFor a program that uses only one code unit width, setting it to 8, 16, or 32
65*22dc650dSSadaf Ebrahimimakes it possible to use generic function names such as pcre2_compile(). Note
66*22dc650dSSadaf Ebrahimithat just changing 8 to 16 (for example) is not sufficient to convert this
67*22dc650dSSadaf Ebrahimiprogram to process 16-bit characters. Even in a fully 16-bit environment, where
68*22dc650dSSadaf Ebrahimistring-handling functions such as strcmp() and printf() work with 16-bit
69*22dc650dSSadaf Ebrahimicharacters, the code for handling the table of named substrings will still need
70*22dc650dSSadaf Ebrahimito be modified. */
71*22dc650dSSadaf Ebrahimi
72*22dc650dSSadaf Ebrahimi#define PCRE2_CODE_UNIT_WIDTH 8
73*22dc650dSSadaf Ebrahimi
74*22dc650dSSadaf Ebrahimi#include &lt;stdio.h&gt;
75*22dc650dSSadaf Ebrahimi#include &lt;string.h&gt;
76*22dc650dSSadaf Ebrahimi#include &lt;pcre2.h&gt;
77*22dc650dSSadaf Ebrahimi
78*22dc650dSSadaf Ebrahimi
79*22dc650dSSadaf Ebrahimi/**************************************************************************
80*22dc650dSSadaf Ebrahimi* Here is the program. The API includes the concept of "contexts" for     *
81*22dc650dSSadaf Ebrahimi* setting up unusual interface requirements for compiling and matching,   *
82*22dc650dSSadaf Ebrahimi* such as custom memory managers and non-standard newline definitions.    *
83*22dc650dSSadaf Ebrahimi* This program does not do any of this, so it makes no use of contexts,   *
84*22dc650dSSadaf Ebrahimi* always passing NULL where a context could be given.                     *
85*22dc650dSSadaf Ebrahimi**************************************************************************/
86*22dc650dSSadaf Ebrahimi
87*22dc650dSSadaf Ebrahimiint main(int argc, char **argv)
88*22dc650dSSadaf Ebrahimi{
89*22dc650dSSadaf Ebrahimipcre2_code *re;
90*22dc650dSSadaf EbrahimiPCRE2_SPTR pattern;     /* PCRE2_SPTR is a pointer to unsigned code units of */
91*22dc650dSSadaf EbrahimiPCRE2_SPTR subject;     /* the appropriate width (in this case, 8 bits). */
92*22dc650dSSadaf EbrahimiPCRE2_SPTR name_table;
93*22dc650dSSadaf Ebrahimi
94*22dc650dSSadaf Ebrahimiint crlf_is_newline;
95*22dc650dSSadaf Ebrahimiint errornumber;
96*22dc650dSSadaf Ebrahimiint find_all;
97*22dc650dSSadaf Ebrahimiint i;
98*22dc650dSSadaf Ebrahimiint rc;
99*22dc650dSSadaf Ebrahimiint utf8;
100*22dc650dSSadaf Ebrahimi
101*22dc650dSSadaf Ebrahimiuint32_t option_bits;
102*22dc650dSSadaf Ebrahimiuint32_t namecount;
103*22dc650dSSadaf Ebrahimiuint32_t name_entry_size;
104*22dc650dSSadaf Ebrahimiuint32_t newline;
105*22dc650dSSadaf Ebrahimi
106*22dc650dSSadaf EbrahimiPCRE2_SIZE erroroffset;
107*22dc650dSSadaf EbrahimiPCRE2_SIZE *ovector;
108*22dc650dSSadaf EbrahimiPCRE2_SIZE subject_length;
109*22dc650dSSadaf Ebrahimi
110*22dc650dSSadaf Ebrahimipcre2_match_data *match_data;
111*22dc650dSSadaf Ebrahimi
112*22dc650dSSadaf Ebrahimi
113*22dc650dSSadaf Ebrahimi/**************************************************************************
114*22dc650dSSadaf Ebrahimi* First, sort out the command line. There is only one possible option at  *
115*22dc650dSSadaf Ebrahimi* the moment, "-g" to request repeated matching to find all occurrences,  *
116*22dc650dSSadaf Ebrahimi* like Perl's /g option. We set the variable find_all to a non-zero value *
117*22dc650dSSadaf Ebrahimi* if the -g option is present.                                            *
118*22dc650dSSadaf Ebrahimi**************************************************************************/
119*22dc650dSSadaf Ebrahimi
120*22dc650dSSadaf Ebrahimifind_all = 0;
121*22dc650dSSadaf Ebrahimifor (i = 1; i &lt; argc; i++)
122*22dc650dSSadaf Ebrahimi  {
123*22dc650dSSadaf Ebrahimi  if (strcmp(argv[i], "-g") == 0) find_all = 1;
124*22dc650dSSadaf Ebrahimi  else if (argv[i][0] == '-')
125*22dc650dSSadaf Ebrahimi    {
126*22dc650dSSadaf Ebrahimi    printf("Unrecognised option %s\n", argv[i]);
127*22dc650dSSadaf Ebrahimi    return 1;
128*22dc650dSSadaf Ebrahimi    }
129*22dc650dSSadaf Ebrahimi  else break;
130*22dc650dSSadaf Ebrahimi  }
131*22dc650dSSadaf Ebrahimi
132*22dc650dSSadaf Ebrahimi/* After the options, we require exactly two arguments, which are the pattern,
133*22dc650dSSadaf Ebrahimiand the subject string. */
134*22dc650dSSadaf Ebrahimi
135*22dc650dSSadaf Ebrahimiif (argc - i != 2)
136*22dc650dSSadaf Ebrahimi  {
137*22dc650dSSadaf Ebrahimi  printf("Exactly two arguments required: a regex and a subject string\n");
138*22dc650dSSadaf Ebrahimi  return 1;
139*22dc650dSSadaf Ebrahimi  }
140*22dc650dSSadaf Ebrahimi
141*22dc650dSSadaf Ebrahimi/* Pattern and subject are char arguments, so they can be straightforwardly
142*22dc650dSSadaf Ebrahimicast to PCRE2_SPTR because we are working in 8-bit code units. The subject
143*22dc650dSSadaf Ebrahimilength is cast to PCRE2_SIZE for completeness, though PCRE2_SIZE is in fact
144*22dc650dSSadaf Ebrahimidefined to be size_t. */
145*22dc650dSSadaf Ebrahimi
146*22dc650dSSadaf Ebrahimipattern = (PCRE2_SPTR)argv[i];
147*22dc650dSSadaf Ebrahimisubject = (PCRE2_SPTR)argv[i+1];
148*22dc650dSSadaf Ebrahimisubject_length = (PCRE2_SIZE)strlen((char *)subject);
149*22dc650dSSadaf Ebrahimi
150*22dc650dSSadaf Ebrahimi
151*22dc650dSSadaf Ebrahimi/*************************************************************************
152*22dc650dSSadaf Ebrahimi* Now we are going to compile the regular expression pattern, and handle *
153*22dc650dSSadaf Ebrahimi* any errors that are detected.                                          *
154*22dc650dSSadaf Ebrahimi*************************************************************************/
155*22dc650dSSadaf Ebrahimi
156*22dc650dSSadaf Ebrahimire = pcre2_compile(
157*22dc650dSSadaf Ebrahimi  pattern,               /* the pattern */
158*22dc650dSSadaf Ebrahimi  PCRE2_ZERO_TERMINATED, /* indicates pattern is zero-terminated */
159*22dc650dSSadaf Ebrahimi  0,                     /* default options */
160*22dc650dSSadaf Ebrahimi  &amp;errornumber,          /* for error number */
161*22dc650dSSadaf Ebrahimi  &amp;erroroffset,          /* for error offset */
162*22dc650dSSadaf Ebrahimi  NULL);                 /* use default compile context */
163*22dc650dSSadaf Ebrahimi
164*22dc650dSSadaf Ebrahimi/* Compilation failed: print the error message and exit. */
165*22dc650dSSadaf Ebrahimi
166*22dc650dSSadaf Ebrahimiif (re == NULL)
167*22dc650dSSadaf Ebrahimi  {
168*22dc650dSSadaf Ebrahimi  PCRE2_UCHAR buffer[256];
169*22dc650dSSadaf Ebrahimi  pcre2_get_error_message(errornumber, buffer, sizeof(buffer));
170*22dc650dSSadaf Ebrahimi  printf("PCRE2 compilation failed at offset %d: %s\n", (int)erroroffset,
171*22dc650dSSadaf Ebrahimi    buffer);
172*22dc650dSSadaf Ebrahimi  return 1;
173*22dc650dSSadaf Ebrahimi  }
174*22dc650dSSadaf Ebrahimi
175*22dc650dSSadaf Ebrahimi
176*22dc650dSSadaf Ebrahimi/*************************************************************************
177*22dc650dSSadaf Ebrahimi* If the compilation succeeded, we call PCRE2 again, in order to do a    *
178*22dc650dSSadaf Ebrahimi* pattern match against the subject string. This does just ONE match. If *
179*22dc650dSSadaf Ebrahimi* further matching is needed, it will be done below. Before running the  *
180*22dc650dSSadaf Ebrahimi* match we must set up a match_data block for holding the result. Using  *
181*22dc650dSSadaf Ebrahimi* pcre2_match_data_create_from_pattern() ensures that the block is       *
182*22dc650dSSadaf Ebrahimi* exactly the right size for the number of capturing parentheses in the  *
183*22dc650dSSadaf Ebrahimi* pattern. If you need to know the actual size of a match_data block as  *
184*22dc650dSSadaf Ebrahimi* a number of bytes, you can find it like this:                          *
185*22dc650dSSadaf Ebrahimi*                                                                        *
186*22dc650dSSadaf Ebrahimi* PCRE2_SIZE match_data_size = pcre2_get_match_data_size(match_data);    *
187*22dc650dSSadaf Ebrahimi*************************************************************************/
188*22dc650dSSadaf Ebrahimi
189*22dc650dSSadaf Ebrahimimatch_data = pcre2_match_data_create_from_pattern(re, NULL);
190*22dc650dSSadaf Ebrahimi
191*22dc650dSSadaf Ebrahimi/* Now run the match. */
192*22dc650dSSadaf Ebrahimi
193*22dc650dSSadaf Ebrahimirc = pcre2_match(
194*22dc650dSSadaf Ebrahimi  re,                   /* the compiled pattern */
195*22dc650dSSadaf Ebrahimi  subject,              /* the subject string */
196*22dc650dSSadaf Ebrahimi  subject_length,       /* the length of the subject */
197*22dc650dSSadaf Ebrahimi  0,                    /* start at offset 0 in the subject */
198*22dc650dSSadaf Ebrahimi  0,                    /* default options */
199*22dc650dSSadaf Ebrahimi  match_data,           /* block for storing the result */
200*22dc650dSSadaf Ebrahimi  NULL);                /* use default match context */
201*22dc650dSSadaf Ebrahimi
202*22dc650dSSadaf Ebrahimi/* Matching failed: handle error cases */
203*22dc650dSSadaf Ebrahimi
204*22dc650dSSadaf Ebrahimiif (rc &lt; 0)
205*22dc650dSSadaf Ebrahimi  {
206*22dc650dSSadaf Ebrahimi  switch(rc)
207*22dc650dSSadaf Ebrahimi    {
208*22dc650dSSadaf Ebrahimi    case PCRE2_ERROR_NOMATCH: printf("No match\n"); break;
209*22dc650dSSadaf Ebrahimi    /*
210*22dc650dSSadaf Ebrahimi    Handle other special cases if you like
211*22dc650dSSadaf Ebrahimi    */
212*22dc650dSSadaf Ebrahimi    default: printf("Matching error %d\n", rc); break;
213*22dc650dSSadaf Ebrahimi    }
214*22dc650dSSadaf Ebrahimi  pcre2_match_data_free(match_data);   /* Release memory used for the match */
215*22dc650dSSadaf Ebrahimi  pcre2_code_free(re);                 /*   data and the compiled pattern. */
216*22dc650dSSadaf Ebrahimi  return 1;
217*22dc650dSSadaf Ebrahimi  }
218*22dc650dSSadaf Ebrahimi
219*22dc650dSSadaf Ebrahimi/* Match succeeded. Get a pointer to the output vector, where string offsets
220*22dc650dSSadaf Ebrahimiare stored. */
221*22dc650dSSadaf Ebrahimi
222*22dc650dSSadaf Ebrahimiovector = pcre2_get_ovector_pointer(match_data);
223*22dc650dSSadaf Ebrahimiprintf("Match succeeded at offset %d\n", (int)ovector[0]);
224*22dc650dSSadaf Ebrahimi
225*22dc650dSSadaf Ebrahimi
226*22dc650dSSadaf Ebrahimi/*************************************************************************
227*22dc650dSSadaf Ebrahimi* We have found the first match within the subject string. If the output *
228*22dc650dSSadaf Ebrahimi* vector wasn't big enough, say so. Then output any substrings that were *
229*22dc650dSSadaf Ebrahimi* captured.                                                              *
230*22dc650dSSadaf Ebrahimi*************************************************************************/
231*22dc650dSSadaf Ebrahimi
232*22dc650dSSadaf Ebrahimi/* The output vector wasn't big enough. This should not happen, because we used
233*22dc650dSSadaf Ebrahimipcre2_match_data_create_from_pattern() above. */
234*22dc650dSSadaf Ebrahimi
235*22dc650dSSadaf Ebrahimiif (rc == 0)
236*22dc650dSSadaf Ebrahimi  printf("ovector was not big enough for all the captured substrings\n");
237*22dc650dSSadaf Ebrahimi
238*22dc650dSSadaf Ebrahimi/* Since release 10.38 PCRE2 has locked out the use of \K in lookaround
239*22dc650dSSadaf Ebrahimiassertions. However, there is an option to re-enable the old behaviour. If that
240*22dc650dSSadaf Ebrahimiis set, it is possible to run patterns such as /(?=.\K)/ that use \K in an
241*22dc650dSSadaf Ebrahimiassertion to set the start of a match later than its end. In this demonstration
242*22dc650dSSadaf Ebrahimiprogram, we show how to detect this case, but it shouldn't arise because the
243*22dc650dSSadaf Ebrahimioption is never set. */
244*22dc650dSSadaf Ebrahimi
245*22dc650dSSadaf Ebrahimiif (ovector[0] &gt; ovector[1])
246*22dc650dSSadaf Ebrahimi  {
247*22dc650dSSadaf Ebrahimi  printf("\\K was used in an assertion to set the match start after its end.\n"
248*22dc650dSSadaf Ebrahimi    "From end to start the match was: %.*s\n", (int)(ovector[0] - ovector[1]),
249*22dc650dSSadaf Ebrahimi      (char *)(subject + ovector[1]));
250*22dc650dSSadaf Ebrahimi  printf("Run abandoned\n");
251*22dc650dSSadaf Ebrahimi  pcre2_match_data_free(match_data);
252*22dc650dSSadaf Ebrahimi  pcre2_code_free(re);
253*22dc650dSSadaf Ebrahimi  return 1;
254*22dc650dSSadaf Ebrahimi  }
255*22dc650dSSadaf Ebrahimi
256*22dc650dSSadaf Ebrahimi/* Show substrings stored in the output vector by number. Obviously, in a real
257*22dc650dSSadaf Ebrahimiapplication you might want to do things other than print them. */
258*22dc650dSSadaf Ebrahimi
259*22dc650dSSadaf Ebrahimifor (i = 0; i &lt; rc; i++)
260*22dc650dSSadaf Ebrahimi  {
261*22dc650dSSadaf Ebrahimi  PCRE2_SPTR substring_start = subject + ovector[2*i];
262*22dc650dSSadaf Ebrahimi  PCRE2_SIZE substring_length = ovector[2*i+1] - ovector[2*i];
263*22dc650dSSadaf Ebrahimi  printf("%2d: %.*s\n", i, (int)substring_length, (char *)substring_start);
264*22dc650dSSadaf Ebrahimi  }
265*22dc650dSSadaf Ebrahimi
266*22dc650dSSadaf Ebrahimi
267*22dc650dSSadaf Ebrahimi/**************************************************************************
268*22dc650dSSadaf Ebrahimi* That concludes the basic part of this demonstration program. We have    *
269*22dc650dSSadaf Ebrahimi* compiled a pattern, and performed a single match. The code that follows *
270*22dc650dSSadaf Ebrahimi* shows first how to access named substrings, and then how to code for    *
271*22dc650dSSadaf Ebrahimi* repeated matches on the same subject.                                   *
272*22dc650dSSadaf Ebrahimi**************************************************************************/
273*22dc650dSSadaf Ebrahimi
274*22dc650dSSadaf Ebrahimi/* See if there are any named substrings, and if so, show them by name. First
275*22dc650dSSadaf Ebrahimiwe have to extract the count of named parentheses from the pattern. */
276*22dc650dSSadaf Ebrahimi
277*22dc650dSSadaf Ebrahimi(void)pcre2_pattern_info(
278*22dc650dSSadaf Ebrahimi  re,                   /* the compiled pattern */
279*22dc650dSSadaf Ebrahimi  PCRE2_INFO_NAMECOUNT, /* get the number of named substrings */
280*22dc650dSSadaf Ebrahimi  &amp;namecount);          /* where to put the answer */
281*22dc650dSSadaf Ebrahimi
282*22dc650dSSadaf Ebrahimiif (namecount == 0) printf("No named substrings\n"); else
283*22dc650dSSadaf Ebrahimi  {
284*22dc650dSSadaf Ebrahimi  PCRE2_SPTR tabptr;
285*22dc650dSSadaf Ebrahimi  printf("Named substrings\n");
286*22dc650dSSadaf Ebrahimi
287*22dc650dSSadaf Ebrahimi  /* Before we can access the substrings, we must extract the table for
288*22dc650dSSadaf Ebrahimi  translating names to numbers, and the size of each entry in the table. */
289*22dc650dSSadaf Ebrahimi
290*22dc650dSSadaf Ebrahimi  (void)pcre2_pattern_info(
291*22dc650dSSadaf Ebrahimi    re,                       /* the compiled pattern */
292*22dc650dSSadaf Ebrahimi    PCRE2_INFO_NAMETABLE,     /* address of the table */
293*22dc650dSSadaf Ebrahimi    &amp;name_table);             /* where to put the answer */
294*22dc650dSSadaf Ebrahimi
295*22dc650dSSadaf Ebrahimi  (void)pcre2_pattern_info(
296*22dc650dSSadaf Ebrahimi    re,                       /* the compiled pattern */
297*22dc650dSSadaf Ebrahimi    PCRE2_INFO_NAMEENTRYSIZE, /* size of each entry in the table */
298*22dc650dSSadaf Ebrahimi    &amp;name_entry_size);        /* where to put the answer */
299*22dc650dSSadaf Ebrahimi
300*22dc650dSSadaf Ebrahimi  /* Now we can scan the table and, for each entry, print the number, the name,
301*22dc650dSSadaf Ebrahimi  and the substring itself. In the 8-bit library the number is held in two
302*22dc650dSSadaf Ebrahimi  bytes, most significant first. The "*" field width must be passed as int. */
303*22dc650dSSadaf Ebrahimi
304*22dc650dSSadaf Ebrahimi  tabptr = name_table;
305*22dc650dSSadaf Ebrahimi  for (i = 0; i &lt; namecount; i++)
306*22dc650dSSadaf Ebrahimi    {
307*22dc650dSSadaf Ebrahimi    int n = (tabptr[0] &lt;&lt; 8) | tabptr[1];
308*22dc650dSSadaf Ebrahimi    printf("(%d) %*s: %.*s\n", n, (int)(name_entry_size - 3), (char *)(tabptr + 2),
309*22dc650dSSadaf Ebrahimi      (int)(ovector[2*n+1] - ovector[2*n]), (char *)(subject + ovector[2*n]));
310*22dc650dSSadaf Ebrahimi    tabptr += name_entry_size;
311*22dc650dSSadaf Ebrahimi    }
312*22dc650dSSadaf Ebrahimi  }
313*22dc650dSSadaf Ebrahimi
314*22dc650dSSadaf Ebrahimi
315*22dc650dSSadaf Ebrahimi/*************************************************************************
316*22dc650dSSadaf Ebrahimi* If the "-g" option was given on the command line, we want to continue  *
317*22dc650dSSadaf Ebrahimi* to search for additional matches in the subject string, in a similar   *
318*22dc650dSSadaf Ebrahimi* way to the /g option in Perl. This turns out to be trickier than you   *
319*22dc650dSSadaf Ebrahimi* might think because of the possibility of matching an empty string.    *
320*22dc650dSSadaf Ebrahimi* What happens is as follows:                                            *
321*22dc650dSSadaf Ebrahimi*                                                                        *
322*22dc650dSSadaf Ebrahimi* If the previous match was NOT for an empty string, we can just start   *
323*22dc650dSSadaf Ebrahimi* the next match at the end of the previous one.                         *
324*22dc650dSSadaf Ebrahimi*                                                                        *
325*22dc650dSSadaf Ebrahimi* If the previous match WAS for an empty string, we can't do that, as it *
326*22dc650dSSadaf Ebrahimi* would lead to an infinite loop. Instead, a call of pcre2_match() is    *
327*22dc650dSSadaf Ebrahimi* made with the PCRE2_NOTEMPTY_ATSTART and PCRE2_ANCHORED flags set. The *
328*22dc650dSSadaf Ebrahimi* first of these tells PCRE2 that an empty string at the start of the    *
329*22dc650dSSadaf Ebrahimi* subject is not a valid match; other possibilities must be tried. The   *
330*22dc650dSSadaf Ebrahimi* second flag restricts PCRE2 to one match attempt at the initial string *
331*22dc650dSSadaf Ebrahimi* position. If this match succeeds, an alternative to the empty string   *
332*22dc650dSSadaf Ebrahimi* match has been found, and we can print it and proceed round the loop,  *
333*22dc650dSSadaf Ebrahimi* advancing by the length of whatever was found. If this match does not  *
334*22dc650dSSadaf Ebrahimi* succeed, we still stay in the loop, advancing by just one character.   *
335*22dc650dSSadaf Ebrahimi* In UTF-8 mode, which can be set by (*UTF) in the pattern, this may be  *
336*22dc650dSSadaf Ebrahimi* more than one byte.                                                    *
337*22dc650dSSadaf Ebrahimi*                                                                        *
338*22dc650dSSadaf Ebrahimi* However, there is a complication concerned with newlines. When the     *
339*22dc650dSSadaf Ebrahimi* newline convention is such that CRLF is a valid newline, we must       *
340*22dc650dSSadaf Ebrahimi* advance by two characters rather than one. The newline convention can  *
341*22dc650dSSadaf Ebrahimi* be set in the regex by (*CR), etc.; if not, we must find the default.  *
342*22dc650dSSadaf Ebrahimi*************************************************************************/
343*22dc650dSSadaf Ebrahimi
344*22dc650dSSadaf Ebrahimiif (!find_all)     /* Check for -g */
345*22dc650dSSadaf Ebrahimi  {
346*22dc650dSSadaf Ebrahimi  pcre2_match_data_free(match_data);  /* Release the memory that was used */
347*22dc650dSSadaf Ebrahimi  pcre2_code_free(re);                /* for the match data and the pattern. */
348*22dc650dSSadaf Ebrahimi  return 0;                           /* Exit the program. */
349*22dc650dSSadaf Ebrahimi  }
350*22dc650dSSadaf Ebrahimi
351*22dc650dSSadaf Ebrahimi/* Before running the loop, check for UTF-8 and whether CRLF is a valid newline
352*22dc650dSSadaf Ebrahimisequence. First, find the options with which the regex was compiled and extract
353*22dc650dSSadaf Ebrahimithe UTF state. */
354*22dc650dSSadaf Ebrahimi
355*22dc650dSSadaf Ebrahimi(void)pcre2_pattern_info(re, PCRE2_INFO_ALLOPTIONS, &amp;option_bits);
356*22dc650dSSadaf Ebrahimiutf8 = (option_bits &amp; PCRE2_UTF) != 0;
357*22dc650dSSadaf Ebrahimi
358*22dc650dSSadaf Ebrahimi/* Now find the newline convention and see whether CRLF is a valid newline
359*22dc650dSSadaf Ebrahimisequence. */
360*22dc650dSSadaf Ebrahimi
361*22dc650dSSadaf Ebrahimi(void)pcre2_pattern_info(re, PCRE2_INFO_NEWLINE, &amp;newline);
362*22dc650dSSadaf Ebrahimicrlf_is_newline = newline == PCRE2_NEWLINE_ANY ||
363*22dc650dSSadaf Ebrahimi                  newline == PCRE2_NEWLINE_CRLF ||
364*22dc650dSSadaf Ebrahimi                  newline == PCRE2_NEWLINE_ANYCRLF;
365*22dc650dSSadaf Ebrahimi
366*22dc650dSSadaf Ebrahimi/* Loop for second and subsequent matches */
367*22dc650dSSadaf Ebrahimi
368*22dc650dSSadaf Ebrahimifor (;;)
369*22dc650dSSadaf Ebrahimi  {
370*22dc650dSSadaf Ebrahimi  uint32_t options = 0;                   /* Normally no options */
371*22dc650dSSadaf Ebrahimi  PCRE2_SIZE start_offset = ovector[1];   /* Start at end of previous match */
372*22dc650dSSadaf Ebrahimi
373*22dc650dSSadaf Ebrahimi  /* If the previous match was for an empty string, we are finished if we are
374*22dc650dSSadaf Ebrahimi  at the end of the subject. Otherwise, arrange to run another match at the
375*22dc650dSSadaf Ebrahimi  same point to see if a non-empty match can be found. */
376*22dc650dSSadaf Ebrahimi
377*22dc650dSSadaf Ebrahimi  if (ovector[0] == ovector[1])
378*22dc650dSSadaf Ebrahimi    {
379*22dc650dSSadaf Ebrahimi    if (ovector[0] == subject_length) break;
380*22dc650dSSadaf Ebrahimi    options = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED;
381*22dc650dSSadaf Ebrahimi    }
382*22dc650dSSadaf Ebrahimi
383*22dc650dSSadaf Ebrahimi  /* If the previous match was not an empty string, there is one tricky case to
384*22dc650dSSadaf Ebrahimi  consider. If a pattern contains \K within a lookbehind assertion at the
385*22dc650dSSadaf Ebrahimi  start, the end of the matched string can be at the offset where the match
386*22dc650dSSadaf Ebrahimi  started. Without special action, this leads to a loop that keeps on matching
387*22dc650dSSadaf Ebrahimi  the same substring. We must detect this case and arrange to move the start on
388*22dc650dSSadaf Ebrahimi  by one character. The pcre2_get_startchar() function returns the starting
389*22dc650dSSadaf Ebrahimi  offset that was passed to pcre2_match(). */
390*22dc650dSSadaf Ebrahimi
391*22dc650dSSadaf Ebrahimi  else
392*22dc650dSSadaf Ebrahimi    {
393*22dc650dSSadaf Ebrahimi    PCRE2_SIZE startchar = pcre2_get_startchar(match_data);
394*22dc650dSSadaf Ebrahimi    if (start_offset &lt;= startchar)
395*22dc650dSSadaf Ebrahimi      {
396*22dc650dSSadaf Ebrahimi      if (startchar &gt;= subject_length) break;   /* Reached end of subject.   */
397*22dc650dSSadaf Ebrahimi      start_offset = startchar + 1;             /* Advance by one character. */
398*22dc650dSSadaf Ebrahimi      if (utf8)                                 /* If UTF-8, it may be more  */
399*22dc650dSSadaf Ebrahimi        {                                       /*   than one code unit.     */
400*22dc650dSSadaf Ebrahimi        for (; start_offset &lt; subject_length; start_offset++)
401*22dc650dSSadaf Ebrahimi          if ((subject[start_offset] &amp; 0xc0) != 0x80) break;
402*22dc650dSSadaf Ebrahimi        }
403*22dc650dSSadaf Ebrahimi      }
404*22dc650dSSadaf Ebrahimi    }
405*22dc650dSSadaf Ebrahimi
406*22dc650dSSadaf Ebrahimi  /* Run the next matching operation */
407*22dc650dSSadaf Ebrahimi
408*22dc650dSSadaf Ebrahimi  rc = pcre2_match(
409*22dc650dSSadaf Ebrahimi    re,                   /* the compiled pattern */
410*22dc650dSSadaf Ebrahimi    subject,              /* the subject string */
411*22dc650dSSadaf Ebrahimi    subject_length,       /* the length of the subject */
412*22dc650dSSadaf Ebrahimi    start_offset,         /* starting offset in the subject */
413*22dc650dSSadaf Ebrahimi    options,              /* options */
414*22dc650dSSadaf Ebrahimi    match_data,           /* block for storing the result */
415*22dc650dSSadaf Ebrahimi    NULL);                /* use default match context */
416*22dc650dSSadaf Ebrahimi
417*22dc650dSSadaf Ebrahimi  /* This time, a result of NOMATCH isn't an error. If the value in "options"
418*22dc650dSSadaf Ebrahimi  is zero, it just means we have found all possible matches, so the loop ends.
419*22dc650dSSadaf Ebrahimi  Otherwise, it means we have failed to find a non-empty-string match at a
420*22dc650dSSadaf Ebrahimi  point where there was a previous empty-string match. In this case, we do what
421*22dc650dSSadaf Ebrahimi  Perl does: advance the matching position by one character, and continue. We
422*22dc650dSSadaf Ebrahimi  do this by setting the "end of previous match" offset, because that is picked
423*22dc650dSSadaf Ebrahimi  up at the top of the loop as the point at which to start again.
424*22dc650dSSadaf Ebrahimi
425*22dc650dSSadaf Ebrahimi  There are two complications: (a) When CRLF is a valid newline sequence, and
426*22dc650dSSadaf Ebrahimi  the current position is just before it, advance by an extra byte. (b)
427*22dc650dSSadaf Ebrahimi  Otherwise we must ensure that we skip an entire UTF character if we are in
428*22dc650dSSadaf Ebrahimi  UTF mode. */
429*22dc650dSSadaf Ebrahimi
430*22dc650dSSadaf Ebrahimi  if (rc == PCRE2_ERROR_NOMATCH)
431*22dc650dSSadaf Ebrahimi    {
432*22dc650dSSadaf Ebrahimi    if (options == 0) break;                    /* All matches found */
433*22dc650dSSadaf Ebrahimi    ovector[1] = start_offset + 1;              /* Advance one code unit */
434*22dc650dSSadaf Ebrahimi    if (crlf_is_newline &amp;&amp;                      /* If CRLF is a newline &amp; */
435*22dc650dSSadaf Ebrahimi        start_offset &lt; subject_length - 1 &amp;&amp;    /* we are at CRLF, */
436*22dc650dSSadaf Ebrahimi        subject[start_offset] == '\r' &amp;&amp;
437*22dc650dSSadaf Ebrahimi        subject[start_offset + 1] == '\n')
438*22dc650dSSadaf Ebrahimi      ovector[1] += 1;                          /* Advance by one more. */
439*22dc650dSSadaf Ebrahimi    else if (utf8)                              /* Otherwise, ensure we */
440*22dc650dSSadaf Ebrahimi      {                                         /* advance a whole UTF-8 */
441*22dc650dSSadaf Ebrahimi      while (ovector[1] &lt; subject_length)       /* character. */
442*22dc650dSSadaf Ebrahimi        {
443*22dc650dSSadaf Ebrahimi        if ((subject[ovector[1]] &amp; 0xc0) != 0x80) break;
444*22dc650dSSadaf Ebrahimi        ovector[1] += 1;
445*22dc650dSSadaf Ebrahimi        }
446*22dc650dSSadaf Ebrahimi      }
447*22dc650dSSadaf Ebrahimi    continue;    /* Go round the loop again */
448*22dc650dSSadaf Ebrahimi    }
449*22dc650dSSadaf Ebrahimi
450*22dc650dSSadaf Ebrahimi  /* Other matching errors are not recoverable. */
451*22dc650dSSadaf Ebrahimi
452*22dc650dSSadaf Ebrahimi  if (rc &lt; 0)
453*22dc650dSSadaf Ebrahimi    {
454*22dc650dSSadaf Ebrahimi    printf("Matching error %d\n", rc);
455*22dc650dSSadaf Ebrahimi    pcre2_match_data_free(match_data);
456*22dc650dSSadaf Ebrahimi    pcre2_code_free(re);
457*22dc650dSSadaf Ebrahimi    return 1;
458*22dc650dSSadaf Ebrahimi    }
459*22dc650dSSadaf Ebrahimi
460*22dc650dSSadaf Ebrahimi  /* Match succeeded */
461*22dc650dSSadaf Ebrahimi
462*22dc650dSSadaf Ebrahimi  printf("\nMatch succeeded again at offset %d\n", (int)ovector[0]);
463*22dc650dSSadaf Ebrahimi
464*22dc650dSSadaf Ebrahimi  /* The match succeeded, but the output vector wasn't big enough. This
465*22dc650dSSadaf Ebrahimi  should not happen. */
466*22dc650dSSadaf Ebrahimi
467*22dc650dSSadaf Ebrahimi  if (rc == 0)
468*22dc650dSSadaf Ebrahimi    printf("ovector was not big enough for all the captured substrings\n");
469*22dc650dSSadaf Ebrahimi
470*22dc650dSSadaf Ebrahimi  /* We must guard against patterns such as /(?=.\K)/ that use \K in an
471*22dc650dSSadaf Ebrahimi  assertion to set the start of a match later than its end. In this
472*22dc650dSSadaf Ebrahimi  demonstration program, we just detect this case and give up. */
473*22dc650dSSadaf Ebrahimi
474*22dc650dSSadaf Ebrahimi  if (ovector[0] &gt; ovector[1])
475*22dc650dSSadaf Ebrahimi    {
476*22dc650dSSadaf Ebrahimi    printf("\\K was used in an assertion to set the match start after its end.\n"
477*22dc650dSSadaf Ebrahimi      "From end to start the match was: %.*s\n", (int)(ovector[0] - ovector[1]),
478*22dc650dSSadaf Ebrahimi        (char *)(subject + ovector[1]));
479*22dc650dSSadaf Ebrahimi    printf("Run abandoned\n");
480*22dc650dSSadaf Ebrahimi    pcre2_match_data_free(match_data);
481*22dc650dSSadaf Ebrahimi    pcre2_code_free(re);
482*22dc650dSSadaf Ebrahimi    return 1;
483*22dc650dSSadaf Ebrahimi    }
484*22dc650dSSadaf Ebrahimi
485*22dc650dSSadaf Ebrahimi  /* As before, show substrings stored in the output vector by number, and then
486*22dc650dSSadaf Ebrahimi  also any named substrings. */
487*22dc650dSSadaf Ebrahimi
488*22dc650dSSadaf Ebrahimi  for (i = 0; i &lt; rc; i++)
489*22dc650dSSadaf Ebrahimi    {
490*22dc650dSSadaf Ebrahimi    PCRE2_SPTR substring_start = subject + ovector[2*i];
491*22dc650dSSadaf Ebrahimi    size_t substring_length = ovector[2*i+1] - ovector[2*i];
492*22dc650dSSadaf Ebrahimi    printf("%2d: %.*s\n", i, (int)substring_length, (char *)substring_start);
493*22dc650dSSadaf Ebrahimi    }
494*22dc650dSSadaf Ebrahimi
495*22dc650dSSadaf Ebrahimi  if (namecount == 0) printf("No named substrings\n"); else
496*22dc650dSSadaf Ebrahimi    {
497*22dc650dSSadaf Ebrahimi    PCRE2_SPTR tabptr = name_table;
498*22dc650dSSadaf Ebrahimi    printf("Named substrings\n");
499*22dc650dSSadaf Ebrahimi    for (i = 0; i &lt; namecount; i++)
500*22dc650dSSadaf Ebrahimi      {
501*22dc650dSSadaf Ebrahimi      int n = (tabptr[0] &lt;&lt; 8) | tabptr[1];
502*22dc650dSSadaf Ebrahimi      printf("(%d) %*s: %.*s\n", n, (int)(name_entry_size - 3), (char *)(tabptr + 2),
503*22dc650dSSadaf Ebrahimi        (int)(ovector[2*n+1] - ovector[2*n]), (char *)(subject + ovector[2*n]));
504*22dc650dSSadaf Ebrahimi      tabptr += name_entry_size;
505*22dc650dSSadaf Ebrahimi      }
506*22dc650dSSadaf Ebrahimi    }
507*22dc650dSSadaf Ebrahimi  }      /* End of loop to find second and subsequent matches */
508*22dc650dSSadaf Ebrahimi
509*22dc650dSSadaf Ebrahimiprintf("\n");
510*22dc650dSSadaf Ebrahimipcre2_match_data_free(match_data);
511*22dc650dSSadaf Ebrahimipcre2_code_free(re);
512*22dc650dSSadaf Ebrahimireturn 0;
513*22dc650dSSadaf Ebrahimi}
514*22dc650dSSadaf Ebrahimi
515*22dc650dSSadaf Ebrahimi/* End of pcre2demo.c */
516*22dc650dSSadaf Ebrahimi</PRE><p>
517*22dc650dSSadaf EbrahimiReturn to the <a href="index.html">PCRE2 index page</a>.
518*22dc650dSSadaf Ebrahimi</p>
519