1*22dc650dSSadaf Ebrahimi<html> 2*22dc650dSSadaf Ebrahimi<head> 3*22dc650dSSadaf Ebrahimi<title>pcre2demo specification</title> 4*22dc650dSSadaf Ebrahimi</head> 5*22dc650dSSadaf Ebrahimi<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB"> 6*22dc650dSSadaf Ebrahimi<h1>pcre2demo man page</h1> 7*22dc650dSSadaf Ebrahimi<p> 8*22dc650dSSadaf EbrahimiReturn to the <a href="index.html">PCRE2 index page</a>. 9*22dc650dSSadaf Ebrahimi</p> 10*22dc650dSSadaf Ebrahimi<p> 11*22dc650dSSadaf EbrahimiThis page is part of the PCRE2 HTML documentation. It was generated 12*22dc650dSSadaf Ebrahimiautomatically from the original man page. If there is any nonsense in it, 13*22dc650dSSadaf Ebrahimiplease consult the man page, in case the conversion went wrong. 14*22dc650dSSadaf Ebrahimi<br> 15*22dc650dSSadaf Ebrahimi<br><b> 16*22dc650dSSadaf EbrahimiSOURCE CODE 17*22dc650dSSadaf Ebrahimi</b><br> 18*22dc650dSSadaf Ebrahimi<PRE> 19*22dc650dSSadaf Ebrahimi/************************************************* 20*22dc650dSSadaf Ebrahimi* PCRE2 DEMONSTRATION PROGRAM * 21*22dc650dSSadaf Ebrahimi*************************************************/ 22*22dc650dSSadaf Ebrahimi 23*22dc650dSSadaf Ebrahimi/* This is a demonstration program to illustrate a straightforward way of 24*22dc650dSSadaf Ebrahimiusing the PCRE2 regular expression library from a C program. See the 25*22dc650dSSadaf Ebrahimipcre2sample documentation for a short discussion ("man pcre2sample" if you have 26*22dc650dSSadaf Ebrahimithe PCRE2 man pages installed). PCRE2 is a revised API for the library, and is 27*22dc650dSSadaf Ebrahimiincompatible with the original PCRE API. 28*22dc650dSSadaf Ebrahimi 29*22dc650dSSadaf EbrahimiThere are actually three libraries, each supporting a different code unit 30*22dc650dSSadaf Ebrahimiwidth. This demonstration program uses the 8-bit library. The default is to 31*22dc650dSSadaf Ebrahimiprocess each code unit as a separate character, but if the pattern begins with 32*22dc650dSSadaf Ebrahimi"(*UTF)", both it and the subject are treated as UTF-8 strings, where 33*22dc650dSSadaf Ebrahimicharacters may occupy multiple code units. 34*22dc650dSSadaf Ebrahimi 35*22dc650dSSadaf EbrahimiIn Unix-like environments, if PCRE2 is installed in your standard system 36*22dc650dSSadaf Ebrahimilibraries, you should be able to compile this program using this command: 37*22dc650dSSadaf Ebrahimi 38*22dc650dSSadaf Ebrahimicc -Wall pcre2demo.c -lpcre2-8 -o pcre2demo 39*22dc650dSSadaf Ebrahimi 40*22dc650dSSadaf EbrahimiIf PCRE2 is not installed in a standard place, it is likely to be installed 41*22dc650dSSadaf Ebrahimiwith support for the pkg-config mechanism. If you have pkg-config, you can 42*22dc650dSSadaf Ebrahimicompile this program using this command: 43*22dc650dSSadaf Ebrahimi 44*22dc650dSSadaf Ebrahimicc -Wall pcre2demo.c `pkg-config --cflags --libs libpcre2-8` -o pcre2demo 45*22dc650dSSadaf Ebrahimi 46*22dc650dSSadaf EbrahimiIf you do not have pkg-config, you may have to use something like this: 47*22dc650dSSadaf Ebrahimi 48*22dc650dSSadaf Ebrahimicc -Wall pcre2demo.c -I/usr/local/include -L/usr/local/lib \ 49*22dc650dSSadaf Ebrahimi -R/usr/local/lib -lpcre2-8 -o pcre2demo 50*22dc650dSSadaf Ebrahimi 51*22dc650dSSadaf EbrahimiReplace "/usr/local/include" and "/usr/local/lib" with wherever the include and 52*22dc650dSSadaf Ebrahimilibrary files for PCRE2 are installed on your system. Only some operating 53*22dc650dSSadaf Ebrahimisystems (Solaris is one) use the -R option. 54*22dc650dSSadaf Ebrahimi 55*22dc650dSSadaf EbrahimiBuilding under Windows: 56*22dc650dSSadaf Ebrahimi 57*22dc650dSSadaf EbrahimiIf you want to statically link this program against a non-dll .a file, you must 58*22dc650dSSadaf Ebrahimidefine PCRE2_STATIC before including pcre2.h, so in this environment, uncomment 59*22dc650dSSadaf Ebrahimithe following line. */ 60*22dc650dSSadaf Ebrahimi 61*22dc650dSSadaf Ebrahimi/* #define PCRE2_STATIC */ 62*22dc650dSSadaf Ebrahimi 63*22dc650dSSadaf Ebrahimi/* The PCRE2_CODE_UNIT_WIDTH macro must be defined before including pcre2.h. 64*22dc650dSSadaf EbrahimiFor a program that uses only one code unit width, setting it to 8, 16, or 32 65*22dc650dSSadaf Ebrahimimakes it possible to use generic function names such as pcre2_compile(). Note 66*22dc650dSSadaf Ebrahimithat just changing 8 to 16 (for example) is not sufficient to convert this 67*22dc650dSSadaf Ebrahimiprogram to process 16-bit characters. Even in a fully 16-bit environment, where 68*22dc650dSSadaf Ebrahimistring-handling functions such as strcmp() and printf() work with 16-bit 69*22dc650dSSadaf Ebrahimicharacters, the code for handling the table of named substrings will still need 70*22dc650dSSadaf Ebrahimito be modified. */ 71*22dc650dSSadaf Ebrahimi 72*22dc650dSSadaf Ebrahimi#define PCRE2_CODE_UNIT_WIDTH 8 73*22dc650dSSadaf Ebrahimi 74*22dc650dSSadaf Ebrahimi#include <stdio.h> 75*22dc650dSSadaf Ebrahimi#include <string.h> 76*22dc650dSSadaf Ebrahimi#include <pcre2.h> 77*22dc650dSSadaf Ebrahimi 78*22dc650dSSadaf Ebrahimi 79*22dc650dSSadaf Ebrahimi/************************************************************************** 80*22dc650dSSadaf Ebrahimi* Here is the program. The API includes the concept of "contexts" for * 81*22dc650dSSadaf Ebrahimi* setting up unusual interface requirements for compiling and matching, * 82*22dc650dSSadaf Ebrahimi* such as custom memory managers and non-standard newline definitions. * 83*22dc650dSSadaf Ebrahimi* This program does not do any of this, so it makes no use of contexts, * 84*22dc650dSSadaf Ebrahimi* always passing NULL where a context could be given. * 85*22dc650dSSadaf Ebrahimi**************************************************************************/ 86*22dc650dSSadaf Ebrahimi 87*22dc650dSSadaf Ebrahimiint main(int argc, char **argv) 88*22dc650dSSadaf Ebrahimi{ 89*22dc650dSSadaf Ebrahimipcre2_code *re; 90*22dc650dSSadaf EbrahimiPCRE2_SPTR pattern; /* PCRE2_SPTR is a pointer to unsigned code units of */ 91*22dc650dSSadaf EbrahimiPCRE2_SPTR subject; /* the appropriate width (in this case, 8 bits). */ 92*22dc650dSSadaf EbrahimiPCRE2_SPTR name_table; 93*22dc650dSSadaf Ebrahimi 94*22dc650dSSadaf Ebrahimiint crlf_is_newline; 95*22dc650dSSadaf Ebrahimiint errornumber; 96*22dc650dSSadaf Ebrahimiint find_all; 97*22dc650dSSadaf Ebrahimiint i; 98*22dc650dSSadaf Ebrahimiint rc; 99*22dc650dSSadaf Ebrahimiint utf8; 100*22dc650dSSadaf Ebrahimi 101*22dc650dSSadaf Ebrahimiuint32_t option_bits; 102*22dc650dSSadaf Ebrahimiuint32_t namecount; 103*22dc650dSSadaf Ebrahimiuint32_t name_entry_size; 104*22dc650dSSadaf Ebrahimiuint32_t newline; 105*22dc650dSSadaf Ebrahimi 106*22dc650dSSadaf EbrahimiPCRE2_SIZE erroroffset; 107*22dc650dSSadaf EbrahimiPCRE2_SIZE *ovector; 108*22dc650dSSadaf EbrahimiPCRE2_SIZE subject_length; 109*22dc650dSSadaf Ebrahimi 110*22dc650dSSadaf Ebrahimipcre2_match_data *match_data; 111*22dc650dSSadaf Ebrahimi 112*22dc650dSSadaf Ebrahimi 113*22dc650dSSadaf Ebrahimi/************************************************************************** 114*22dc650dSSadaf Ebrahimi* First, sort out the command line. There is only one possible option at * 115*22dc650dSSadaf Ebrahimi* the moment, "-g" to request repeated matching to find all occurrences, * 116*22dc650dSSadaf Ebrahimi* like Perl's /g option. We set the variable find_all to a non-zero value * 117*22dc650dSSadaf Ebrahimi* if the -g option is present. * 118*22dc650dSSadaf Ebrahimi**************************************************************************/ 119*22dc650dSSadaf Ebrahimi 120*22dc650dSSadaf Ebrahimifind_all = 0; 121*22dc650dSSadaf Ebrahimifor (i = 1; i < argc; i++) 122*22dc650dSSadaf Ebrahimi { 123*22dc650dSSadaf Ebrahimi if (strcmp(argv[i], "-g") == 0) find_all = 1; 124*22dc650dSSadaf Ebrahimi else if (argv[i][0] == '-') 125*22dc650dSSadaf Ebrahimi { 126*22dc650dSSadaf Ebrahimi printf("Unrecognised option %s\n", argv[i]); 127*22dc650dSSadaf Ebrahimi return 1; 128*22dc650dSSadaf Ebrahimi } 129*22dc650dSSadaf Ebrahimi else break; 130*22dc650dSSadaf Ebrahimi } 131*22dc650dSSadaf Ebrahimi 132*22dc650dSSadaf Ebrahimi/* After the options, we require exactly two arguments, which are the pattern, 133*22dc650dSSadaf Ebrahimiand the subject string. */ 134*22dc650dSSadaf Ebrahimi 135*22dc650dSSadaf Ebrahimiif (argc - i != 2) 136*22dc650dSSadaf Ebrahimi { 137*22dc650dSSadaf Ebrahimi printf("Exactly two arguments required: a regex and a subject string\n"); 138*22dc650dSSadaf Ebrahimi return 1; 139*22dc650dSSadaf Ebrahimi } 140*22dc650dSSadaf Ebrahimi 141*22dc650dSSadaf Ebrahimi/* Pattern and subject are char arguments, so they can be straightforwardly 142*22dc650dSSadaf Ebrahimicast to PCRE2_SPTR because we are working in 8-bit code units. The subject 143*22dc650dSSadaf Ebrahimilength is cast to PCRE2_SIZE for completeness, though PCRE2_SIZE is in fact 144*22dc650dSSadaf Ebrahimidefined to be size_t. */ 145*22dc650dSSadaf Ebrahimi 146*22dc650dSSadaf Ebrahimipattern = (PCRE2_SPTR)argv[i]; 147*22dc650dSSadaf Ebrahimisubject = (PCRE2_SPTR)argv[i+1]; 148*22dc650dSSadaf Ebrahimisubject_length = (PCRE2_SIZE)strlen((char *)subject); 149*22dc650dSSadaf Ebrahimi 150*22dc650dSSadaf Ebrahimi 151*22dc650dSSadaf Ebrahimi/************************************************************************* 152*22dc650dSSadaf Ebrahimi* Now we are going to compile the regular expression pattern, and handle * 153*22dc650dSSadaf Ebrahimi* any errors that are detected. * 154*22dc650dSSadaf Ebrahimi*************************************************************************/ 155*22dc650dSSadaf Ebrahimi 156*22dc650dSSadaf Ebrahimire = pcre2_compile( 157*22dc650dSSadaf Ebrahimi pattern, /* the pattern */ 158*22dc650dSSadaf Ebrahimi PCRE2_ZERO_TERMINATED, /* indicates pattern is zero-terminated */ 159*22dc650dSSadaf Ebrahimi 0, /* default options */ 160*22dc650dSSadaf Ebrahimi &errornumber, /* for error number */ 161*22dc650dSSadaf Ebrahimi &erroroffset, /* for error offset */ 162*22dc650dSSadaf Ebrahimi NULL); /* use default compile context */ 163*22dc650dSSadaf Ebrahimi 164*22dc650dSSadaf Ebrahimi/* Compilation failed: print the error message and exit. */ 165*22dc650dSSadaf Ebrahimi 166*22dc650dSSadaf Ebrahimiif (re == NULL) 167*22dc650dSSadaf Ebrahimi { 168*22dc650dSSadaf Ebrahimi PCRE2_UCHAR buffer[256]; 169*22dc650dSSadaf Ebrahimi pcre2_get_error_message(errornumber, buffer, sizeof(buffer)); 170*22dc650dSSadaf Ebrahimi printf("PCRE2 compilation failed at offset %d: %s\n", (int)erroroffset, 171*22dc650dSSadaf Ebrahimi buffer); 172*22dc650dSSadaf Ebrahimi return 1; 173*22dc650dSSadaf Ebrahimi } 174*22dc650dSSadaf Ebrahimi 175*22dc650dSSadaf Ebrahimi 176*22dc650dSSadaf Ebrahimi/************************************************************************* 177*22dc650dSSadaf Ebrahimi* If the compilation succeeded, we call PCRE2 again, in order to do a * 178*22dc650dSSadaf Ebrahimi* pattern match against the subject string. This does just ONE match. If * 179*22dc650dSSadaf Ebrahimi* further matching is needed, it will be done below. Before running the * 180*22dc650dSSadaf Ebrahimi* match we must set up a match_data block for holding the result. Using * 181*22dc650dSSadaf Ebrahimi* pcre2_match_data_create_from_pattern() ensures that the block is * 182*22dc650dSSadaf Ebrahimi* exactly the right size for the number of capturing parentheses in the * 183*22dc650dSSadaf Ebrahimi* pattern. If you need to know the actual size of a match_data block as * 184*22dc650dSSadaf Ebrahimi* a number of bytes, you can find it like this: * 185*22dc650dSSadaf Ebrahimi* * 186*22dc650dSSadaf Ebrahimi* PCRE2_SIZE match_data_size = pcre2_get_match_data_size(match_data); * 187*22dc650dSSadaf Ebrahimi*************************************************************************/ 188*22dc650dSSadaf Ebrahimi 189*22dc650dSSadaf Ebrahimimatch_data = pcre2_match_data_create_from_pattern(re, NULL); 190*22dc650dSSadaf Ebrahimi 191*22dc650dSSadaf Ebrahimi/* Now run the match. */ 192*22dc650dSSadaf Ebrahimi 193*22dc650dSSadaf Ebrahimirc = pcre2_match( 194*22dc650dSSadaf Ebrahimi re, /* the compiled pattern */ 195*22dc650dSSadaf Ebrahimi subject, /* the subject string */ 196*22dc650dSSadaf Ebrahimi subject_length, /* the length of the subject */ 197*22dc650dSSadaf Ebrahimi 0, /* start at offset 0 in the subject */ 198*22dc650dSSadaf Ebrahimi 0, /* default options */ 199*22dc650dSSadaf Ebrahimi match_data, /* block for storing the result */ 200*22dc650dSSadaf Ebrahimi NULL); /* use default match context */ 201*22dc650dSSadaf Ebrahimi 202*22dc650dSSadaf Ebrahimi/* Matching failed: handle error cases */ 203*22dc650dSSadaf Ebrahimi 204*22dc650dSSadaf Ebrahimiif (rc < 0) 205*22dc650dSSadaf Ebrahimi { 206*22dc650dSSadaf Ebrahimi switch(rc) 207*22dc650dSSadaf Ebrahimi { 208*22dc650dSSadaf Ebrahimi case PCRE2_ERROR_NOMATCH: printf("No match\n"); break; 209*22dc650dSSadaf Ebrahimi /* 210*22dc650dSSadaf Ebrahimi Handle other special cases if you like 211*22dc650dSSadaf Ebrahimi */ 212*22dc650dSSadaf Ebrahimi default: printf("Matching error %d\n", rc); break; 213*22dc650dSSadaf Ebrahimi } 214*22dc650dSSadaf Ebrahimi pcre2_match_data_free(match_data); /* Release memory used for the match */ 215*22dc650dSSadaf Ebrahimi pcre2_code_free(re); /* data and the compiled pattern. */ 216*22dc650dSSadaf Ebrahimi return 1; 217*22dc650dSSadaf Ebrahimi } 218*22dc650dSSadaf Ebrahimi 219*22dc650dSSadaf Ebrahimi/* Match succeeded. Get a pointer to the output vector, where string offsets 220*22dc650dSSadaf Ebrahimiare stored. */ 221*22dc650dSSadaf Ebrahimi 222*22dc650dSSadaf Ebrahimiovector = pcre2_get_ovector_pointer(match_data); 223*22dc650dSSadaf Ebrahimiprintf("Match succeeded at offset %d\n", (int)ovector[0]); 224*22dc650dSSadaf Ebrahimi 225*22dc650dSSadaf Ebrahimi 226*22dc650dSSadaf Ebrahimi/************************************************************************* 227*22dc650dSSadaf Ebrahimi* We have found the first match within the subject string. If the output * 228*22dc650dSSadaf Ebrahimi* vector wasn't big enough, say so. Then output any substrings that were * 229*22dc650dSSadaf Ebrahimi* captured. * 230*22dc650dSSadaf Ebrahimi*************************************************************************/ 231*22dc650dSSadaf Ebrahimi 232*22dc650dSSadaf Ebrahimi/* The output vector wasn't big enough. This should not happen, because we used 233*22dc650dSSadaf Ebrahimipcre2_match_data_create_from_pattern() above. */ 234*22dc650dSSadaf Ebrahimi 235*22dc650dSSadaf Ebrahimiif (rc == 0) 236*22dc650dSSadaf Ebrahimi printf("ovector was not big enough for all the captured substrings\n"); 237*22dc650dSSadaf Ebrahimi 238*22dc650dSSadaf Ebrahimi/* Since release 10.38 PCRE2 has locked out the use of \K in lookaround 239*22dc650dSSadaf Ebrahimiassertions. However, there is an option to re-enable the old behaviour. If that 240*22dc650dSSadaf Ebrahimiis set, it is possible to run patterns such as /(?=.\K)/ that use \K in an 241*22dc650dSSadaf Ebrahimiassertion to set the start of a match later than its end. In this demonstration 242*22dc650dSSadaf Ebrahimiprogram, we show how to detect this case, but it shouldn't arise because the 243*22dc650dSSadaf Ebrahimioption is never set. */ 244*22dc650dSSadaf Ebrahimi 245*22dc650dSSadaf Ebrahimiif (ovector[0] > ovector[1]) 246*22dc650dSSadaf Ebrahimi { 247*22dc650dSSadaf Ebrahimi printf("\\K was used in an assertion to set the match start after its end.\n" 248*22dc650dSSadaf Ebrahimi "From end to start the match was: %.*s\n", (int)(ovector[0] - ovector[1]), 249*22dc650dSSadaf Ebrahimi (char *)(subject + ovector[1])); 250*22dc650dSSadaf Ebrahimi printf("Run abandoned\n"); 251*22dc650dSSadaf Ebrahimi pcre2_match_data_free(match_data); 252*22dc650dSSadaf Ebrahimi pcre2_code_free(re); 253*22dc650dSSadaf Ebrahimi return 1; 254*22dc650dSSadaf Ebrahimi } 255*22dc650dSSadaf Ebrahimi 256*22dc650dSSadaf Ebrahimi/* Show substrings stored in the output vector by number. Obviously, in a real 257*22dc650dSSadaf Ebrahimiapplication you might want to do things other than print them. */ 258*22dc650dSSadaf Ebrahimi 259*22dc650dSSadaf Ebrahimifor (i = 0; i < rc; i++) 260*22dc650dSSadaf Ebrahimi { 261*22dc650dSSadaf Ebrahimi PCRE2_SPTR substring_start = subject + ovector[2*i]; 262*22dc650dSSadaf Ebrahimi PCRE2_SIZE substring_length = ovector[2*i+1] - ovector[2*i]; 263*22dc650dSSadaf Ebrahimi printf("%2d: %.*s\n", i, (int)substring_length, (char *)substring_start); 264*22dc650dSSadaf Ebrahimi } 265*22dc650dSSadaf Ebrahimi 266*22dc650dSSadaf Ebrahimi 267*22dc650dSSadaf Ebrahimi/************************************************************************** 268*22dc650dSSadaf Ebrahimi* That concludes the basic part of this demonstration program. We have * 269*22dc650dSSadaf Ebrahimi* compiled a pattern, and performed a single match. The code that follows * 270*22dc650dSSadaf Ebrahimi* shows first how to access named substrings, and then how to code for * 271*22dc650dSSadaf Ebrahimi* repeated matches on the same subject. * 272*22dc650dSSadaf Ebrahimi**************************************************************************/ 273*22dc650dSSadaf Ebrahimi 274*22dc650dSSadaf Ebrahimi/* See if there are any named substrings, and if so, show them by name. First 275*22dc650dSSadaf Ebrahimiwe have to extract the count of named parentheses from the pattern. */ 276*22dc650dSSadaf Ebrahimi 277*22dc650dSSadaf Ebrahimi(void)pcre2_pattern_info( 278*22dc650dSSadaf Ebrahimi re, /* the compiled pattern */ 279*22dc650dSSadaf Ebrahimi PCRE2_INFO_NAMECOUNT, /* get the number of named substrings */ 280*22dc650dSSadaf Ebrahimi &namecount); /* where to put the answer */ 281*22dc650dSSadaf Ebrahimi 282*22dc650dSSadaf Ebrahimiif (namecount == 0) printf("No named substrings\n"); else 283*22dc650dSSadaf Ebrahimi { 284*22dc650dSSadaf Ebrahimi PCRE2_SPTR tabptr; 285*22dc650dSSadaf Ebrahimi printf("Named substrings\n"); 286*22dc650dSSadaf Ebrahimi 287*22dc650dSSadaf Ebrahimi /* Before we can access the substrings, we must extract the table for 288*22dc650dSSadaf Ebrahimi translating names to numbers, and the size of each entry in the table. */ 289*22dc650dSSadaf Ebrahimi 290*22dc650dSSadaf Ebrahimi (void)pcre2_pattern_info( 291*22dc650dSSadaf Ebrahimi re, /* the compiled pattern */ 292*22dc650dSSadaf Ebrahimi PCRE2_INFO_NAMETABLE, /* address of the table */ 293*22dc650dSSadaf Ebrahimi &name_table); /* where to put the answer */ 294*22dc650dSSadaf Ebrahimi 295*22dc650dSSadaf Ebrahimi (void)pcre2_pattern_info( 296*22dc650dSSadaf Ebrahimi re, /* the compiled pattern */ 297*22dc650dSSadaf Ebrahimi PCRE2_INFO_NAMEENTRYSIZE, /* size of each entry in the table */ 298*22dc650dSSadaf Ebrahimi &name_entry_size); /* where to put the answer */ 299*22dc650dSSadaf Ebrahimi 300*22dc650dSSadaf Ebrahimi /* Now we can scan the table and, for each entry, print the number, the name, 301*22dc650dSSadaf Ebrahimi and the substring itself. In the 8-bit library the number is held in two 302*22dc650dSSadaf Ebrahimi bytes, most significant first. */ 303*22dc650dSSadaf Ebrahimi 304*22dc650dSSadaf Ebrahimi tabptr = name_table; 305*22dc650dSSadaf Ebrahimi for (i = 0; i < namecount; i++) 306*22dc650dSSadaf Ebrahimi { 307*22dc650dSSadaf Ebrahimi int n = (tabptr[0] << 8) | tabptr[1]; 308*22dc650dSSadaf Ebrahimi printf("(%d) %*s: %.*s\n", n, name_entry_size - 3, tabptr + 2, 309*22dc650dSSadaf Ebrahimi (int)(ovector[2*n+1] - ovector[2*n]), subject + ovector[2*n]); 310*22dc650dSSadaf Ebrahimi tabptr += name_entry_size; 311*22dc650dSSadaf Ebrahimi } 312*22dc650dSSadaf Ebrahimi } 313*22dc650dSSadaf Ebrahimi 314*22dc650dSSadaf Ebrahimi 315*22dc650dSSadaf Ebrahimi/************************************************************************* 316*22dc650dSSadaf Ebrahimi* If the "-g" option was given on the command line, we want to continue * 317*22dc650dSSadaf Ebrahimi* to search for additional matches in the subject string, in a similar * 318*22dc650dSSadaf Ebrahimi* way to the /g option in Perl. This turns out to be trickier than you * 319*22dc650dSSadaf Ebrahimi* might think because of the possibility of matching an empty string. * 320*22dc650dSSadaf Ebrahimi* What happens is as follows: * 321*22dc650dSSadaf Ebrahimi* * 322*22dc650dSSadaf Ebrahimi* If the previous match was NOT for an empty string, we can just start * 323*22dc650dSSadaf Ebrahimi* the next match at the end of the previous one. * 324*22dc650dSSadaf Ebrahimi* * 325*22dc650dSSadaf Ebrahimi* If the previous match WAS for an empty string, we can't do that, as it * 326*22dc650dSSadaf Ebrahimi* would lead to an infinite loop. Instead, a call of pcre2_match() is * 327*22dc650dSSadaf Ebrahimi* made with the PCRE2_NOTEMPTY_ATSTART and PCRE2_ANCHORED flags set. The * 328*22dc650dSSadaf Ebrahimi* first of these tells PCRE2 that an empty string at the start of the * 329*22dc650dSSadaf Ebrahimi* subject is not a valid match; other possibilities must be tried. The * 330*22dc650dSSadaf Ebrahimi* second flag restricts PCRE2 to one match attempt at the initial string * 331*22dc650dSSadaf Ebrahimi* position. If this match succeeds, an alternative to the empty string * 332*22dc650dSSadaf Ebrahimi* match has been found, and we can print it and proceed round the loop, * 333*22dc650dSSadaf Ebrahimi* advancing by the length of whatever was found. If this match does not * 334*22dc650dSSadaf Ebrahimi* succeed, we still stay in the loop, advancing by just one character. * 335*22dc650dSSadaf Ebrahimi* In UTF-8 mode, which can be set by (*UTF) in the pattern, this may be * 336*22dc650dSSadaf Ebrahimi* more than one byte. * 337*22dc650dSSadaf Ebrahimi* * 338*22dc650dSSadaf Ebrahimi* However, there is a complication concerned with newlines. When the * 339*22dc650dSSadaf Ebrahimi* newline convention is such that CRLF is a valid newline, we must * 340*22dc650dSSadaf Ebrahimi* advance by two characters rather than one. The newline convention can * 341*22dc650dSSadaf Ebrahimi* be set in the regex by (*CR), etc.; if not, we must find the default. * 342*22dc650dSSadaf Ebrahimi*************************************************************************/ 343*22dc650dSSadaf Ebrahimi 344*22dc650dSSadaf Ebrahimiif (!find_all) /* Check for -g */ 345*22dc650dSSadaf Ebrahimi { 346*22dc650dSSadaf Ebrahimi pcre2_match_data_free(match_data); /* Release the memory that was used */ 347*22dc650dSSadaf Ebrahimi pcre2_code_free(re); /* for the match data and the pattern. */ 348*22dc650dSSadaf Ebrahimi return 0; /* Exit the program. */ 349*22dc650dSSadaf Ebrahimi } 350*22dc650dSSadaf Ebrahimi 351*22dc650dSSadaf Ebrahimi/* Before running the loop, check for UTF-8 and whether CRLF is a valid newline 352*22dc650dSSadaf Ebrahimisequence. First, find the options with which the regex was compiled and extract 353*22dc650dSSadaf Ebrahimithe UTF state. */ 354*22dc650dSSadaf Ebrahimi 355*22dc650dSSadaf Ebrahimi(void)pcre2_pattern_info(re, PCRE2_INFO_ALLOPTIONS, &option_bits); 356*22dc650dSSadaf Ebrahimiutf8 = (option_bits & PCRE2_UTF) != 0; 357*22dc650dSSadaf Ebrahimi 358*22dc650dSSadaf Ebrahimi/* Now find the newline convention and see whether CRLF is a valid newline 359*22dc650dSSadaf Ebrahimisequence. */ 360*22dc650dSSadaf Ebrahimi 361*22dc650dSSadaf Ebrahimi(void)pcre2_pattern_info(re, PCRE2_INFO_NEWLINE, &newline); 362*22dc650dSSadaf Ebrahimicrlf_is_newline = newline == PCRE2_NEWLINE_ANY || 363*22dc650dSSadaf Ebrahimi newline == PCRE2_NEWLINE_CRLF || 364*22dc650dSSadaf Ebrahimi newline == PCRE2_NEWLINE_ANYCRLF; 365*22dc650dSSadaf Ebrahimi 366*22dc650dSSadaf Ebrahimi/* Loop for second and subsequent matches */ 367*22dc650dSSadaf Ebrahimi 368*22dc650dSSadaf Ebrahimifor (;;) 369*22dc650dSSadaf Ebrahimi { 370*22dc650dSSadaf Ebrahimi uint32_t options = 0; /* Normally no options */ 371*22dc650dSSadaf Ebrahimi PCRE2_SIZE start_offset = ovector[1]; /* Start at end of previous match */ 372*22dc650dSSadaf Ebrahimi 373*22dc650dSSadaf Ebrahimi /* If the previous match was for an empty string, we are finished if we are 374*22dc650dSSadaf Ebrahimi at the end of the subject. Otherwise, arrange to run another match at the 375*22dc650dSSadaf Ebrahimi same point to see if a non-empty match can be found. */ 376*22dc650dSSadaf Ebrahimi 377*22dc650dSSadaf Ebrahimi if (ovector[0] == ovector[1]) 378*22dc650dSSadaf Ebrahimi { 379*22dc650dSSadaf Ebrahimi if (ovector[0] == subject_length) break; 380*22dc650dSSadaf Ebrahimi options = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED; 381*22dc650dSSadaf Ebrahimi } 382*22dc650dSSadaf Ebrahimi 383*22dc650dSSadaf Ebrahimi /* If the previous match was not an empty string, there is one tricky case to 384*22dc650dSSadaf Ebrahimi consider. If a pattern contains \K within a lookbehind assertion at the 385*22dc650dSSadaf Ebrahimi start, the end of the matched string can be at the offset where the match 386*22dc650dSSadaf Ebrahimi started. Without special action, this leads to a loop that keeps on matching 387*22dc650dSSadaf Ebrahimi the same substring. We must detect this case and arrange to move the start on 388*22dc650dSSadaf Ebrahimi by one character. The pcre2_get_startchar() function returns the starting 389*22dc650dSSadaf Ebrahimi offset that was passed to pcre2_match(). */ 390*22dc650dSSadaf Ebrahimi 391*22dc650dSSadaf Ebrahimi else 392*22dc650dSSadaf Ebrahimi { 393*22dc650dSSadaf Ebrahimi PCRE2_SIZE startchar = pcre2_get_startchar(match_data); 394*22dc650dSSadaf Ebrahimi if (start_offset <= startchar) 395*22dc650dSSadaf Ebrahimi { 396*22dc650dSSadaf Ebrahimi if (startchar >= subject_length) break; /* Reached end of subject. */ 397*22dc650dSSadaf Ebrahimi start_offset = startchar + 1; /* Advance by one character. */ 398*22dc650dSSadaf Ebrahimi if (utf8) /* If UTF-8, it may be more */ 399*22dc650dSSadaf Ebrahimi { /* than one code unit. */ 400*22dc650dSSadaf Ebrahimi for (; start_offset < subject_length; start_offset++) 401*22dc650dSSadaf Ebrahimi if ((subject[start_offset] & 0xc0) != 0x80) break; 402*22dc650dSSadaf Ebrahimi } 403*22dc650dSSadaf Ebrahimi } 404*22dc650dSSadaf Ebrahimi } 405*22dc650dSSadaf Ebrahimi 406*22dc650dSSadaf Ebrahimi /* Run the next matching operation */ 407*22dc650dSSadaf Ebrahimi 408*22dc650dSSadaf Ebrahimi rc = pcre2_match( 409*22dc650dSSadaf Ebrahimi re, /* the compiled pattern */ 410*22dc650dSSadaf Ebrahimi subject, /* the subject string */ 411*22dc650dSSadaf Ebrahimi subject_length, /* the length of the subject */ 412*22dc650dSSadaf Ebrahimi start_offset, /* starting offset in the subject */ 413*22dc650dSSadaf Ebrahimi options, /* options */ 414*22dc650dSSadaf Ebrahimi match_data, /* block for storing the result */ 415*22dc650dSSadaf Ebrahimi NULL); /* use default match context */ 416*22dc650dSSadaf Ebrahimi 417*22dc650dSSadaf Ebrahimi /* This time, a result of NOMATCH isn't an error. If the value in "options" 418*22dc650dSSadaf Ebrahimi is zero, it just means we have found all possible matches, so the loop ends. 419*22dc650dSSadaf Ebrahimi Otherwise, it means we have failed to find a non-empty-string match at a 420*22dc650dSSadaf Ebrahimi point where there was a previous empty-string match. In this case, we do what 421*22dc650dSSadaf Ebrahimi Perl does: advance the matching position by one character, and continue. We 422*22dc650dSSadaf Ebrahimi do this by setting the "end of previous match" offset, because that is picked 423*22dc650dSSadaf Ebrahimi up at the top of the loop as the point at which to start again. 424*22dc650dSSadaf Ebrahimi 425*22dc650dSSadaf Ebrahimi There are two complications: (a) When CRLF is a valid newline sequence, and 426*22dc650dSSadaf Ebrahimi the current position is just before it, advance by an extra byte. (b) 427*22dc650dSSadaf Ebrahimi Otherwise we must ensure that we skip an entire UTF character if we are in 428*22dc650dSSadaf Ebrahimi UTF mode. */ 429*22dc650dSSadaf Ebrahimi 430*22dc650dSSadaf Ebrahimi if (rc == PCRE2_ERROR_NOMATCH) 431*22dc650dSSadaf Ebrahimi { 432*22dc650dSSadaf Ebrahimi if (options == 0) break; /* All matches found */ 433*22dc650dSSadaf Ebrahimi ovector[1] = start_offset + 1; /* Advance one code unit */ 434*22dc650dSSadaf Ebrahimi if (crlf_is_newline && /* If CRLF is a newline & */ 435*22dc650dSSadaf Ebrahimi start_offset < subject_length - 1 && /* we are at CRLF, */ 436*22dc650dSSadaf Ebrahimi subject[start_offset] == '\r' && 437*22dc650dSSadaf Ebrahimi subject[start_offset + 1] == '\n') 438*22dc650dSSadaf Ebrahimi ovector[1] += 1; /* Advance by one more. */ 439*22dc650dSSadaf Ebrahimi else if (utf8) /* Otherwise, ensure we */ 440*22dc650dSSadaf Ebrahimi { /* advance a whole UTF-8 */ 441*22dc650dSSadaf Ebrahimi while (ovector[1] < subject_length) /* character. */ 442*22dc650dSSadaf Ebrahimi { 443*22dc650dSSadaf Ebrahimi if ((subject[ovector[1]] & 0xc0) != 0x80) break; 444*22dc650dSSadaf Ebrahimi ovector[1] += 1; 445*22dc650dSSadaf Ebrahimi } 446*22dc650dSSadaf Ebrahimi } 447*22dc650dSSadaf Ebrahimi continue; /* Go round the loop again */ 448*22dc650dSSadaf Ebrahimi } 449*22dc650dSSadaf Ebrahimi 450*22dc650dSSadaf Ebrahimi /* Other matching errors are not recoverable. */ 451*22dc650dSSadaf Ebrahimi 452*22dc650dSSadaf Ebrahimi if (rc < 0) 453*22dc650dSSadaf Ebrahimi { 454*22dc650dSSadaf Ebrahimi printf("Matching error %d\n", rc); 455*22dc650dSSadaf Ebrahimi pcre2_match_data_free(match_data); 456*22dc650dSSadaf Ebrahimi pcre2_code_free(re); 457*22dc650dSSadaf Ebrahimi return 1; 458*22dc650dSSadaf Ebrahimi } 459*22dc650dSSadaf Ebrahimi 460*22dc650dSSadaf Ebrahimi /* Match succeeded */ 461*22dc650dSSadaf Ebrahimi 462*22dc650dSSadaf Ebrahimi printf("\nMatch succeeded again at offset %d\n", (int)ovector[0]); 463*22dc650dSSadaf Ebrahimi 464*22dc650dSSadaf Ebrahimi /* The match succeeded, but the output vector wasn't big enough. This 465*22dc650dSSadaf Ebrahimi should not happen. */ 466*22dc650dSSadaf Ebrahimi 467*22dc650dSSadaf Ebrahimi if (rc == 0) 468*22dc650dSSadaf Ebrahimi printf("ovector was not big enough for all the captured substrings\n"); 469*22dc650dSSadaf Ebrahimi 470*22dc650dSSadaf Ebrahimi /* We must guard against patterns such as /(?=.\K)/ that use \K in an 471*22dc650dSSadaf Ebrahimi assertion to set the start of a match later than its end. In this 472*22dc650dSSadaf Ebrahimi demonstration program, we just detect this case and give up. */ 473*22dc650dSSadaf Ebrahimi 474*22dc650dSSadaf Ebrahimi if (ovector[0] > ovector[1]) 475*22dc650dSSadaf Ebrahimi { 476*22dc650dSSadaf Ebrahimi printf("\\K was used in an assertion to set the match start after its end.\n" 477*22dc650dSSadaf Ebrahimi "From end to start the match was: %.*s\n", (int)(ovector[0] - ovector[1]), 478*22dc650dSSadaf Ebrahimi (char *)(subject + ovector[1])); 479*22dc650dSSadaf Ebrahimi printf("Run abandoned\n"); 480*22dc650dSSadaf Ebrahimi pcre2_match_data_free(match_data); 481*22dc650dSSadaf Ebrahimi pcre2_code_free(re); 482*22dc650dSSadaf Ebrahimi return 1; 483*22dc650dSSadaf Ebrahimi } 484*22dc650dSSadaf Ebrahimi 485*22dc650dSSadaf Ebrahimi /* As before, show substrings stored in the output vector by number, and then 486*22dc650dSSadaf Ebrahimi also any named substrings. */ 487*22dc650dSSadaf Ebrahimi 488*22dc650dSSadaf Ebrahimi for (i = 0; i < rc; i++) 489*22dc650dSSadaf Ebrahimi { 490*22dc650dSSadaf Ebrahimi PCRE2_SPTR substring_start = subject + ovector[2*i]; 491*22dc650dSSadaf Ebrahimi size_t substring_length = ovector[2*i+1] - ovector[2*i]; 492*22dc650dSSadaf Ebrahimi printf("%2d: %.*s\n", i, (int)substring_length, (char *)substring_start); 493*22dc650dSSadaf Ebrahimi } 494*22dc650dSSadaf Ebrahimi 495*22dc650dSSadaf Ebrahimi if (namecount == 0) printf("No named substrings\n"); else 496*22dc650dSSadaf Ebrahimi { 497*22dc650dSSadaf Ebrahimi PCRE2_SPTR tabptr = name_table; 498*22dc650dSSadaf Ebrahimi printf("Named substrings\n"); 499*22dc650dSSadaf Ebrahimi for (i = 0; i < namecount; i++) 500*22dc650dSSadaf Ebrahimi { 501*22dc650dSSadaf Ebrahimi int n = (tabptr[0] << 8) | tabptr[1]; 502*22dc650dSSadaf Ebrahimi printf("(%d) %*s: %.*s\n", n, name_entry_size - 3, tabptr + 2, 503*22dc650dSSadaf Ebrahimi (int)(ovector[2*n+1] - ovector[2*n]), subject + ovector[2*n]); 504*22dc650dSSadaf Ebrahimi tabptr += name_entry_size; 505*22dc650dSSadaf Ebrahimi } 506*22dc650dSSadaf Ebrahimi } 507*22dc650dSSadaf Ebrahimi } /* End of loop to find second and subsequent matches */ 508*22dc650dSSadaf Ebrahimi 509*22dc650dSSadaf Ebrahimiprintf("\n"); 510*22dc650dSSadaf Ebrahimipcre2_match_data_free(match_data); 511*22dc650dSSadaf Ebrahimipcre2_code_free(re); 512*22dc650dSSadaf Ebrahimireturn 0; 513*22dc650dSSadaf Ebrahimi} 514*22dc650dSSadaf Ebrahimi 515*22dc650dSSadaf Ebrahimi/* End of pcre2demo.c */ 516*22dc650dSSadaf Ebrahimi<p> 517*22dc650dSSadaf EbrahimiReturn to the <a href="index.html">PCRE2 index page</a>. 518*22dc650dSSadaf Ebrahimi</p> 519