1 /*************************************************
2 * pcre2grep program *
3 *************************************************/
4
5 /* This is a grep program that uses the 8-bit PCRE regular expression library
6 via the PCRE2 updated API to do its pattern matching. On Unix-like, Windows,
7 and native z/OS systems it can recurse into directories, and in z/OS it can
8 handle PDS files.
9
10 Note that for native z/OS, in addition to defining the NATIVE_ZOS macro, an
11 additional header is required. That header is not included in the main PCRE2
12 distribution because other apparatus is needed to compile pcre2grep for z/OS.
13 The header can be found in the special z/OS distribution, which is available
14 from www.zaconsultants.net or from www.cbttape.org.
15
16 Copyright (c) 1997-2023 University of Cambridge
17
18 -----------------------------------------------------------------------------
19 Redistribution and use in source and binary forms, with or without
20 modification, are permitted provided that the following conditions are met:
21
22 * Redistributions of source code must retain the above copyright notice,
23 this list of conditions and the following disclaimer.
24
25 * Redistributions in binary form must reproduce the above copyright
26 notice, this list of conditions and the following disclaimer in the
27 documentation and/or other materials provided with the distribution.
28
29 * Neither the name of the University of Cambridge nor the names of its
30 contributors may be used to endorse or promote products derived from
31 this software without specific prior written permission.
32
33 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
34 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
35 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
36 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
37 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
38 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
39 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
40 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
41 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
42 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
43 POSSIBILITY OF SUCH DAMAGE.
44 -----------------------------------------------------------------------------
45 */
46
47 #ifdef HAVE_CONFIG_H
48 #include "config.h"
49 #endif
50
51 #include <ctype.h>
52 #include <locale.h>
53 #include <stdio.h>
54 #include <string.h>
55 #include <stdlib.h>
56 #include <errno.h>
57
58 #include <sys/types.h>
59 #include <sys/stat.h>
60
61 #if (defined _WIN32 || (defined HAVE_WINDOWS_H && HAVE_WINDOWS_H)) \
62 && !defined WIN32 && !defined(__CYGWIN__)
63 #define WIN32
64 #endif
65
/* Some versions of CMake still define WIN32 under Cygwin; undo that here. */
67 #if defined(__CYGWIN__) && defined(WIN32)
68 #undef WIN32
69 #endif
70
71 #ifdef __VMS
72 #include clidef
73 #include descrip
74 #include lib$routines
75 #endif
76
77 #ifdef WIN32
78 #include <io.h> /* For _setmode() */
79 #include <fcntl.h> /* For _O_BINARY */
80 #endif
81
82 #if defined(SUPPORT_PCRE2GREP_CALLOUT) && defined(SUPPORT_PCRE2GREP_CALLOUT_FORK)
83 #ifdef WIN32
84 #include <process.h>
85 #else
86 #include <sys/wait.h>
87 #endif
88 #endif
89
90 #ifdef HAVE_UNISTD_H
91 #include <unistd.h>
92 #endif
93
94 #ifdef SUPPORT_LIBZ
95 #include <zlib.h>
96 #endif
97
98 #ifdef SUPPORT_LIBBZ2
99 #include <bzlib.h>
100 #endif
101
102 #define PCRE2_CODE_UNIT_WIDTH 8
103 #include "pcre2.h"
104
105 /* Older versions of MSVC lack snprintf(). This define allows for
106 warning/error-free compilation and testing with MSVC compilers back to at least
107 MSVC 10/2010. Except for VC6 (which is missing some fundamentals and fails). */
108
109 #if defined(_MSC_VER) && (_MSC_VER < 1900)
110 #define snprintf _snprintf
111 #endif
112
/* Old versions of MSVC and other pre-C99 compilers do not support %td or %zu,
and even some compilers that claim to be C99 do not support them (hence
DISABLE_PERCENT_ZT). */
115
116 #if defined(DISABLE_PERCENT_ZT) || (defined(_MSC_VER) && (_MSC_VER < 1800)) || \
117 (!defined(_MSC_VER) && (!defined(__STDC_VERSION__) || __STDC_VERSION__ < 199901L))
118 #ifdef _WIN64
119 #define SIZ_FORM "llu"
120 #else
121 #define SIZ_FORM "lu"
122 #endif
123 #else
124 #define SIZ_FORM "zu"
125 #endif
126
127 #define FALSE 0
128 #define TRUE 1
129
130 typedef int BOOL;
131
132 #define DEFAULT_CAPTURE_MAX 50
133
134 #if BUFSIZ > 8192
135 #define MAXPATLEN BUFSIZ
136 #else
137 #define MAXPATLEN 8192
138 #endif
139
140 #define FNBUFSIZ 2048
141 #define ERRBUFSIZ 256
142
143 /* Values for the "filenames" variable, which specifies options for file name
144 output. The order is important; it is assumed that a file name is wanted for
145 all values greater than FN_DEFAULT. */
146
147 enum { FN_NONE, FN_DEFAULT, FN_MATCH_ONLY, FN_NOMATCH_ONLY, FN_FORCE };
148
149 /* File reading styles */
150
151 enum { FR_PLAIN, FR_LIBZ, FR_LIBBZ2 };
152
153 /* Actions for the -d and -D options */
154
155 enum { dee_READ, dee_SKIP, dee_RECURSE };
156 enum { DEE_READ, DEE_SKIP };
157
158 /* Actions for special processing options (flag bits) */
159
160 #define PO_WORD_MATCH 0x0001
161 #define PO_LINE_MATCH 0x0002
162 #define PO_FIXED_STRINGS 0x0004
163
164 /* Binary file options */
165
166 enum { BIN_BINARY, BIN_NOMATCH, BIN_TEXT };
167
168 /* Return values from decode_dollar_escape() */
169
170 enum { DDE_ERROR, DDE_CAPTURE, DDE_CHAR };
171
172 /* In newer versions of gcc, with FORTIFY_SOURCE set (the default in some
173 environments), a warning is issued if the value of fwrite() is ignored.
174 Unfortunately, casting to (void) does not suppress the warning. To get round
175 this, we use a macro that compiles a fudge. Oddly, this does not also seem to
176 apply to fprintf(). */
177
178 #define FWRITE_IGNORE(a,b,c,d) if (fwrite(a,b,c,d)) {}
179
180 /* Under Windows, we have to set stdout to be binary, so that it does not
181 convert \r\n at the ends of output lines to \r\r\n. However, that means that
182 any messages written to stdout must have \r\n as their line terminator. This is
183 handled by using STDOUT_NL as the newline string. We also use a normal double
184 quote for the example, as single quotes aren't usually available. */
185
186 #ifdef WIN32
187 #define STDOUT_NL "\r\n"
188 #define STDOUT_NL_LEN 2
189 #define QUOT "\""
190 #else
191 #define STDOUT_NL "\n"
192 #define STDOUT_NL_LEN 1
193 #define QUOT "'"
194 #endif
195
196 /* This code is returned from decode_dollar_escape() when $n is encountered,
197 and used to mean "output STDOUT_NL". It is, of course, not a valid Unicode code
198 point. */
199
200 #define STDOUT_NL_CODE 0x7fffffffu
201
202
203
204 /*************************************************
205 * Global variables *
206 *************************************************/
207
208 static const char *colour_string = "1;31";
209 static const char *colour_option = NULL;
210 static const char *dee_option = NULL;
211 static const char *DEE_option = NULL;
212 static const char *locale = NULL;
213 static const char *newline_arg = NULL;
214 static const char *group_separator = "--";
215 static const char *om_separator = NULL;
216 static const char *stdin_name = "(standard input)";
217 static const char *output_text = NULL;
218
219 static char *main_buffer = NULL;
220
221 static const char *printname_nl = STDOUT_NL; /* Changed to NULL for -Z */
222 static int printname_colon = ':'; /* Changed to 0 for -Z */
223 static int printname_hyphen = '-'; /* Changed to 0 for -Z */
224
225 static int after_context = 0;
226 static int before_context = 0;
227 static int binary_files = BIN_BINARY;
228 static int both_context = 0;
229 static int endlinetype;
230
231 static int count_limit = -1; /* Not long, so that it works with OP_NUMBER */
232 static unsigned long int counts_printed = 0;
233 static unsigned long int total_count = 0;
234
235 static PCRE2_SIZE bufthird = PCRE2GREP_BUFSIZE;
236 static PCRE2_SIZE max_bufthird = PCRE2GREP_MAX_BUFSIZE;
237 static PCRE2_SIZE bufsize = 3*PCRE2GREP_BUFSIZE;
238
239 #ifdef WIN32
240 static int dee_action = dee_SKIP;
241 #else
242 static int dee_action = dee_READ;
243 #endif
244
245 static int DEE_action = DEE_READ;
246 static int error_count = 0;
247 static int filenames = FN_DEFAULT;
248
249 #ifdef SUPPORT_PCRE2GREP_JIT
250 static BOOL use_jit = TRUE;
251 #else
252 static BOOL use_jit = FALSE;
253 #endif
254
255 static const uint8_t *character_tables = NULL;
256
257 static uint32_t pcre2_options = 0;
258 static uint32_t extra_options = 0;
259 static PCRE2_SIZE heap_limit = PCRE2_UNSET;
260 static uint32_t match_limit = 0;
261 static uint32_t depth_limit = 0;
262
263 static pcre2_compile_context *compile_context;
264 static pcre2_match_context *match_context;
265 static pcre2_match_data *match_data, *match_data_pair[2];
266 static PCRE2_SIZE *offsets, *offsets_pair[2];
267 static int match_data_toggle;
268 static uint32_t offset_size;
269 static uint32_t capture_max = DEFAULT_CAPTURE_MAX;
270
271 static BOOL all_matches = FALSE;
272 static BOOL case_restrict = FALSE;
273 static BOOL count_only = FALSE;
274 static BOOL do_colour = FALSE;
275 #ifdef WIN32
276 static BOOL do_ansi = FALSE;
277 #endif
278 static BOOL file_offsets = FALSE;
279 static BOOL hyphenpending = FALSE;
280 static BOOL invert = FALSE;
281 static BOOL line_buffered = FALSE;
282 static BOOL line_offsets = FALSE;
283 static BOOL multiline = FALSE;
284 static BOOL no_ucp = FALSE;
285 static BOOL number = FALSE;
286 static BOOL omit_zero_count = FALSE;
287 static BOOL resource_error = FALSE;
288 static BOOL quiet = FALSE;
289 static BOOL show_total_count = FALSE;
290 static BOOL silent = FALSE;
291 static BOOL utf = FALSE;
292 static BOOL posix_digit = FALSE;
293
294 static uint8_t utf8_buffer[8];
295
296
297 /* Structure for list of --only-matching capturing numbers. */
298
typedef struct omstr {
  struct omstr *next;   /* Next item in the chain */
  int groupnum;         /* A capturing group number for --only-matching */
} omstr;
303
304 static omstr *only_matching = NULL;
305 static omstr *only_matching_last = NULL;
306 static int only_matching_count;
307
308 /* Structure for holding the two variables that describe a number chain. */
309
310 typedef struct omdatastr {
311 omstr **anchor;
312 omstr **lastptr;
313 } omdatastr;
314
315 static omdatastr only_matching_data = { &only_matching, &only_matching_last };
316
317 /* Structure for list of file names (for -f and --{in,ex}clude-from) */
318
typedef struct fnstr {
  struct fnstr *next;   /* Next item in the chain */
  char *name;           /* The file name; free_file_chain() frees only the
                           chain items, not this string */
} fnstr;
323
324 static fnstr *exclude_from = NULL;
325 static fnstr *exclude_from_last = NULL;
326 static fnstr *include_from = NULL;
327 static fnstr *include_from_last = NULL;
328
329 static fnstr *file_lists = NULL;
330 static fnstr *file_lists_last = NULL;
331 static fnstr *pattern_files = NULL;
332 static fnstr *pattern_files_last = NULL;
333
334 /* Structure for holding the two variables that describe a file name chain. */
335
336 typedef struct fndatastr {
337 fnstr **anchor;
338 fnstr **lastptr;
339 } fndatastr;
340
341 static fndatastr exclude_from_data = { &exclude_from, &exclude_from_last };
342 static fndatastr include_from_data = { &include_from, &include_from_last };
343 static fndatastr file_lists_data = { &file_lists, &file_lists_last };
344 static fndatastr pattern_files_data = { &pattern_files, &pattern_files_last };
345
346 /* Structure for pattern and its compiled form; used for matching patterns and
347 also for include/exclude patterns. */
348
typedef struct patstr {
  struct patstr *next;    /* Next pattern in the chain, or NULL */
  char *string;           /* Pattern text; add_pattern() stores the caller's
                             pointer without copying it */
  PCRE2_SIZE length;      /* Length of the pattern text */
  pcre2_code *compiled;   /* Compiled form; add_pattern() sets this to NULL,
                             and free_pattern_chain() releases it if set */
} patstr;
355
356 static patstr *patterns = NULL;
357 static patstr *patterns_last = NULL;
358 static patstr *include_patterns = NULL;
359 static patstr *include_patterns_last = NULL;
360 static patstr *exclude_patterns = NULL;
361 static patstr *exclude_patterns_last = NULL;
362 static patstr *include_dir_patterns = NULL;
363 static patstr *include_dir_patterns_last = NULL;
364 static patstr *exclude_dir_patterns = NULL;
365 static patstr *exclude_dir_patterns_last = NULL;
366
367 /* Structure holding the two variables that describe a pattern chain. A pointer
368 to such structures is used for each appropriate option. */
369
370 typedef struct patdatastr {
371 patstr **anchor;
372 patstr **lastptr;
373 } patdatastr;
374
375 static patdatastr match_patdata = { &patterns, &patterns_last };
376 static patdatastr include_patdata = { &include_patterns, &include_patterns_last };
377 static patdatastr exclude_patdata = { &exclude_patterns, &exclude_patterns_last };
378 static patdatastr include_dir_patdata = { &include_dir_patterns, &include_dir_patterns_last };
379 static patdatastr exclude_dir_patdata = { &exclude_dir_patterns, &exclude_dir_patterns_last };
380
381 static patstr **incexlist[4] = { &include_patterns, &exclude_patterns,
382 &include_dir_patterns, &exclude_dir_patterns };
383
384 static const char *incexname[4] = { "--include", "--exclude",
385 "--include-dir", "--exclude-dir" };
386
387 /* Structure for options and list of them */
388
389 enum { OP_NODATA, OP_STRING, OP_OP_STRING, OP_NUMBER, OP_U32NUMBER, OP_SIZE,
390 OP_OP_NUMBER, OP_OP_NUMBERS, OP_PATLIST, OP_FILELIST, OP_BINFILES };
391
typedef struct option_item {
  int type;                /* One of the OP_xxx values above */
  int one_char;            /* Single-letter form of the option, or a negative
                              N_xxx value when there is no single letter */
  void *dataptr;           /* Address of the variable or chain descriptor that
                              receives the option's data; NULL if none */
  const char *long_name;   /* Long option name, optionally followed by "=" and
                              a description of its data; NULL in the final
                              entry of optionlist */
  const char *help_text;   /* One-line description of the option */
} option_item;
399
400 /* Options without a single-letter equivalent get a negative value. This can be
401 used to identify them. */
402
403 #define N_COLOUR (-1)
404 #define N_EXCLUDE (-2)
405 #define N_EXCLUDE_DIR (-3)
406 #define N_HELP (-4)
407 #define N_INCLUDE (-5)
408 #define N_INCLUDE_DIR (-6)
409 #define N_LABEL (-7)
410 #define N_LOCALE (-8)
411 #define N_NULL (-9)
412 #define N_LOFFSETS (-10)
413 #define N_FOFFSETS (-11)
414 #define N_LBUFFER (-12)
415 #define N_H_LIMIT (-13)
416 #define N_M_LIMIT (-14)
417 #define N_M_LIMIT_DEP (-15)
418 #define N_BUFSIZE (-16)
419 #define N_NOJIT (-17)
420 #define N_FILE_LIST (-18)
421 #define N_BINARY_FILES (-19)
422 #define N_EXCLUDE_FROM (-20)
423 #define N_INCLUDE_FROM (-21)
424 #define N_OM_SEPARATOR (-22)
425 #define N_MAX_BUFSIZE (-23)
426 #define N_OM_CAPTURE (-24)
427 #define N_ALLABSK (-25)
428 #define N_POSIX_DIGIT (-26)
429 #define N_GROUP_SEPARATOR (-27)
430 #define N_NO_GROUP_SEPARATOR (-28)
431
432 static option_item optionlist[] = {
433 { OP_NODATA, N_NULL, NULL, "", "terminate options" },
434 { OP_NODATA, N_HELP, NULL, "help", "display this help and exit" },
435 { OP_NUMBER, 'A', &after_context, "after-context=number", "set number of following context lines" },
436 { OP_NODATA, 'a', NULL, "text", "treat binary files as text" },
437 { OP_NUMBER, 'B', &before_context, "before-context=number", "set number of prior context lines" },
438 { OP_BINFILES, N_BINARY_FILES, NULL, "binary-files=word", "set treatment of binary files" },
439 { OP_SIZE, N_BUFSIZE,&bufthird, "buffer-size=number", "set processing buffer starting size" },
440 { OP_SIZE, N_MAX_BUFSIZE,&max_bufthird, "max-buffer-size=number", "set processing buffer maximum size" },
441 { OP_OP_STRING, N_COLOUR, &colour_option, "color=option", "matched text color option" },
442 { OP_OP_STRING, N_COLOUR, &colour_option, "colour=option", "matched text colour option" },
443 { OP_NUMBER, 'C', &both_context, "context=number", "set number of context lines, before & after" },
444 { OP_NODATA, 'c', NULL, "count", "print only a count of matching lines per FILE" },
445 { OP_STRING, 'D', &DEE_option, "devices=action","how to handle devices, FIFOs, and sockets" },
446 { OP_STRING, 'd', &dee_option, "directories=action", "how to handle directories" },
447 { OP_NODATA, N_POSIX_DIGIT, NULL, "posix-digit", "\\d always matches [0-9], even in UTF/UCP mode" },
448 { OP_NODATA, 'E', NULL, "case-restrict", "restrict case matching (no mix ASCII/non-ASCII)" },
449 { OP_PATLIST, 'e', &match_patdata, "regex(p)=pattern", "specify pattern (may be used more than once)" },
450 { OP_NODATA, 'F', NULL, "fixed-strings", "patterns are sets of newline-separated strings" },
451 { OP_FILELIST, 'f', &pattern_files_data, "file=path", "read patterns from file" },
452 { OP_FILELIST, N_FILE_LIST, &file_lists_data, "file-list=path","read files to search from file" },
453 { OP_NODATA, N_FOFFSETS, NULL, "file-offsets", "output file offsets, not text" },
454 { OP_STRING, N_GROUP_SEPARATOR, &group_separator, "group-separator=text", "set separator between groups of lines" },
455 { OP_NODATA, 'H', NULL, "with-filename", "force the prefixing filename on output" },
456 { OP_NODATA, 'h', NULL, "no-filename", "suppress the prefixing filename on output" },
457 { OP_NODATA, 'I', NULL, "", "treat binary files as not matching (ignore)" },
458 { OP_NODATA, 'i', NULL, "ignore-case", "ignore case distinctions" },
459 { OP_NODATA, 'l', NULL, "files-with-matches", "print only FILE names containing matches" },
460 { OP_NODATA, 'L', NULL, "files-without-match","print only FILE names not containing matches" },
461 { OP_STRING, N_LABEL, &stdin_name, "label=name", "set name for standard input" },
462 { OP_NODATA, N_LBUFFER, NULL, "line-buffered", "use line buffering" },
463 { OP_NODATA, N_LOFFSETS, NULL, "line-offsets", "output line numbers and offsets, not text" },
464 { OP_STRING, N_LOCALE, &locale, "locale=locale", "use the named locale" },
465 { OP_SIZE, N_H_LIMIT, &heap_limit, "heap-limit=number", "set PCRE2 heap limit option (kibibytes)" },
466 { OP_U32NUMBER, N_M_LIMIT, &match_limit, "match-limit=number", "set PCRE2 match limit option" },
467 { OP_U32NUMBER, N_M_LIMIT_DEP, &depth_limit, "depth-limit=number", "set PCRE2 depth limit option" },
468 { OP_U32NUMBER, N_M_LIMIT_DEP, &depth_limit, "recursion-limit=number", "obsolete synonym for depth-limit" },
469 { OP_NODATA, 'M', NULL, "multiline", "run in multiline mode" },
470 { OP_NUMBER, 'm', &count_limit, "max-count=number", "stop after <number> matched lines" },
471 { OP_STRING, 'N', &newline_arg, "newline=type", "set newline type (CR, LF, CRLF, ANYCRLF, ANY, or NUL)" },
472 { OP_NODATA, 'n', NULL, "line-number", "print line number with output lines" },
473 #ifdef SUPPORT_PCRE2GREP_JIT
474 { OP_NODATA, N_NOJIT, NULL, "no-jit", "do not use just-in-time compiler optimization" },
475 #else
476 { OP_NODATA, N_NOJIT, NULL, "no-jit", "ignored: this pcre2grep does not support JIT" },
477 #endif
478 { OP_NODATA, N_NO_GROUP_SEPARATOR, NULL, "no-group-separator", "suppress separators between groups of lines" },
479 { OP_STRING, 'O', &output_text, "output=text", "show only this text (possibly expanded)" },
480 { OP_OP_NUMBERS, 'o', &only_matching_data, "only-matching=n", "show only the part of the line that matched" },
481 { OP_STRING, N_OM_SEPARATOR, &om_separator, "om-separator=text", "set separator for multiple -o output" },
482 { OP_U32NUMBER, N_OM_CAPTURE, &capture_max, "om-capture=n", "set capture count for --only-matching" },
483 { OP_NODATA, 'P', NULL, "no-ucp", "do not enable UCP mode with Unicode" },
484 { OP_NODATA, 'q', NULL, "quiet", "suppress output, just set return code" },
485 { OP_NODATA, 'r', NULL, "recursive", "recursively scan sub-directories" },
486 { OP_PATLIST, N_EXCLUDE,&exclude_patdata, "exclude=pattern","exclude matching files when recursing" },
487 { OP_PATLIST, N_INCLUDE,&include_patdata, "include=pattern","include matching files when recursing" },
488 { OP_PATLIST, N_EXCLUDE_DIR,&exclude_dir_patdata, "exclude-dir=pattern","exclude matching directories when recursing" },
489 { OP_PATLIST, N_INCLUDE_DIR,&include_dir_patdata, "include-dir=pattern","include matching directories when recursing" },
490 { OP_FILELIST, N_EXCLUDE_FROM,&exclude_from_data, "exclude-from=path", "read exclude list from file" },
491 { OP_FILELIST, N_INCLUDE_FROM,&include_from_data, "include-from=path", "read include list from file" },
492 { OP_NODATA, 's', NULL, "no-messages", "suppress error messages" },
493 { OP_NODATA, 't', NULL, "total-count", "print total count of matching lines" },
494 { OP_NODATA, 'u', NULL, "utf", "use UTF/Unicode" },
495 { OP_NODATA, 'U', NULL, "utf-allow-invalid", "use UTF/Unicode, allow for invalid code units" },
496 { OP_NODATA, 'V', NULL, "version", "print version information and exit" },
497 { OP_NODATA, 'v', NULL, "invert-match", "select non-matching lines" },
498 { OP_NODATA, 'w', NULL, "word-regex(p)", "force patterns to match only as words" },
499 { OP_NODATA, 'x', NULL, "line-regex(p)", "force patterns to match only whole lines" },
500 { OP_NODATA, N_ALLABSK, NULL, "allow-lookaround-bsk", "allow \\K in lookarounds" },
501 { OP_NODATA, 'Z', NULL, "null", "output 0 byte after file names" },
502 { OP_NODATA, 0, NULL, NULL, NULL }
503 };
504
505 /* Table of names for newline types. Must be kept in step with the definitions
506 of PCRE2_NEWLINE_xx in pcre2.h. */
507
508 static const char *newlines[] = {
509 "DEFAULT", "CR", "LF", "CRLF", "ANY", "ANYCRLF", "NUL" };
510
511 /* UTF-8 tables */
512
513 const int utf8_table1[] =
514 { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};
515 const int utf8_table1_size = sizeof(utf8_table1) / sizeof(int);
516
517 const int utf8_table2[] = { 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
518 const int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
519
520 const char utf8_table4[] = {
521 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
522 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
523 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
524 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
525
526
527 #if !defined(VPCOMPAT) && !defined(HAVE_MEMMOVE)
528 /*************************************************
529 * Emulated memmove() for systems without it *
530 *************************************************/
531
/* Copy n bytes from s to d, allowing the two regions to overlap. bcopy() is
used when HAVE_BCOPY is defined. Otherwise the bytes are moved one at a time,
because some non-Unix environments lack both memmove() and bcopy(). The value
returned is always d. */

static void *
emulated_memmove(void *d, const void *s, size_t n)
{
#ifdef HAVE_BCOPY
bcopy(s, d, n);
return d;
#else
unsigned char *to = (unsigned char *)d;
const unsigned char *from = (const unsigned char *)s;

if (to > from)
  {
  /* The destination lies above the source, so work downwards from the ends
  of both regions; each byte is then read before it can be overwritten. */
  to += n;
  from += n;
  while (n-- > 0) *(--to) = *(--from);
  }
else
  {
  while (n-- > 0) *to++ = *from++;
  }
return d;
#endif   /* not HAVE_BCOPY */
}
560 #undef memmove
561 #define memmove(d,s,n) emulated_memmove(d,s,n)
562 #endif /* not VPCOMPAT && not HAVE_MEMMOVE */
563
564
565
566 /*************************************************
567 * Convert code point to UTF-8 *
568 *************************************************/
569
570 /* A static buffer is used. Returns the number of bytes. */
571
572 static int
ord2utf8(uint32_t value)573 ord2utf8(uint32_t value)
574 {
575 int i, j;
576 uint8_t *utf8bytes = utf8_buffer;
577 for (i = 0; i < utf8_table1_size; i++)
578 if (value <= (uint32_t)utf8_table1[i]) break;
579 utf8bytes += i;
580 for (j = i; j > 0; j--)
581 {
582 *utf8bytes-- = 0x80 | (value & 0x3f);
583 value >>= 6;
584 }
585 *utf8bytes = utf8_table2[i] | value;
586 return i + 1;
587 }
588
589
590
591 /*************************************************
592 * Case-independent string compare *
593 *************************************************/
594
/* Compare two zero-terminated strings, ignoring case as defined by tolower().

Arguments:
  str1      first string
  str2      second string

Returns:    0 if the strings are equal apart from case
            -1 if str1 sorts before str2
            +1 if str1 sorts after str2
*/

static int
strcmpic(const char *str1, const char *str2)
{
unsigned int c1, c2;

/* The loop continues while either string has characters left. If only one
has ended, its terminating zero differs from the other string's character, so
the function returns before reading beyond the terminator. */

while (*str1 != '\0' || *str2 != '\0')
  {
  /* The <ctype.h> functions require an argument that is representable as an
  unsigned char (or is EOF); passing a negative plain char is undefined
  behaviour, so convert first. */

  c1 = (unsigned int)tolower((unsigned char)*str1++);
  c2 = (unsigned int)tolower((unsigned char)*str2++);
  if (c1 != c2) return ((c1 > c2) << 1) - 1;
  }
return 0;
}
607
608
609 /*************************************************
610 * Parse GREP_COLORS *
611 *************************************************/
612
/* Pick out the value of the "ms=" setting from a GREP_COLORS string, or of
"mt=" when there is no "ms=". The value runs up to the next colon or the end
of the string and is copied, truncated to 15 characters if necessary, into a
static buffer that is overwritten by each call.

Argument:   the string, possibly NULL
Returns:    the value of ms or mt, or NULL if neither present
*/

static char *
parse_grep_colors(const char *gc)
{
static char seq[16];
const char *value;
size_t used = 0;

if (gc == NULL) return NULL;

value = strstr(gc, "ms=");
if (value == NULL) value = strstr(gc, "mt=");
if (value == NULL) return NULL;

/* Step over the three-character "ms=" or "mt=" prefix, then copy. */

for (value += 3; used < sizeof(seq) - 1; value++)
  {
  if (*value == ':' || *value == 0) break;
  seq[used++] = *value;
  }
seq[used] = 0;
return seq;
}
636
637
638 /*************************************************
639 * Exit from the program *
640 *************************************************/
641
642 /* If there has been a resource error, give a suitable message.
643
644 Argument: the return code
645 Returns: does not return
646 */
647
648 static void
pcre2grep_exit(int rc)649 pcre2grep_exit(int rc)
650 {
651 /* VMS does exit codes differently: both exit(1) and exit(0) return with a
652 status of 1, which is not helpful. To help with this problem, define a symbol
653 (akin to an environment variable) called "PCRE2GREP_RC" and put the exit code
654 therein. */
655
656 #ifdef __VMS
657 char val_buf[4];
658 $DESCRIPTOR(sym_nam, "PCRE2GREP_RC");
659 $DESCRIPTOR(sym_val, val_buf);
660 sprintf(val_buf, "%d", rc);
661 sym_val.dsc$w_length = strlen(val_buf);
662 lib$set_symbol(&sym_nam, &sym_val);
663 #endif
664
665 if (resource_error)
666 {
667 fprintf(stderr, "pcre2grep: Error %d, %d, %d or %d means that a resource "
668 "limit was exceeded.\n", PCRE2_ERROR_JIT_STACKLIMIT, PCRE2_ERROR_MATCHLIMIT,
669 PCRE2_ERROR_DEPTHLIMIT, PCRE2_ERROR_HEAPLIMIT);
670 fprintf(stderr, "pcre2grep: Check your regex for nested unlimited loops.\n");
671 }
672 exit(rc);
673 }
674
675
676 /*************************************************
677 * Add item to chain of patterns *
678 *************************************************/
679
680 /* Used to add an item onto a chain, or just return an unconnected item if the
681 "after" argument is NULL.
682
683 Arguments:
684 s pattern string to add
685 patlen length of pattern
686 after if not NULL points to item to insert after
687
688 Returns: new pattern block or NULL on error
689 */
690
691 static patstr *
add_pattern(char * s,PCRE2_SIZE patlen,patstr * after)692 add_pattern(char *s, PCRE2_SIZE patlen, patstr *after)
693 {
694 patstr *p = (patstr *)malloc(sizeof(patstr));
695
696 /* LCOV_EXCL_START - These won't be hit in normal testing. */
697
698 if (p == NULL)
699 {
700 fprintf(stderr, "pcre2grep: malloc failed\n");
701 pcre2grep_exit(2);
702 }
703 if (patlen > MAXPATLEN)
704 {
705 fprintf(stderr, "pcre2grep: pattern is too long (limit is %d bytes)\n",
706 MAXPATLEN);
707 free(p);
708 return NULL;
709 }
710
711 /* LCOV_EXCL_STOP */
712
713 p->next = NULL;
714 p->string = s;
715 p->length = patlen;
716 p->compiled = NULL;
717
718 if (after != NULL)
719 {
720 p->next = after->next;
721 after->next = p;
722 }
723 return p;
724 }
725
726
727 /*************************************************
728 * Free chain of patterns *
729 *************************************************/
730
731 /* Used for several chains of patterns.
732
733 Argument: pointer to start of chain
734 Returns: nothing
735 */
736
737 static void
free_pattern_chain(patstr * pc)738 free_pattern_chain(patstr *pc)
739 {
740 while (pc != NULL)
741 {
742 patstr *p = pc;
743 pc = p->next;
744 if (p->compiled != NULL) pcre2_code_free(p->compiled);
745 free(p);
746 }
747 }
748
749
750 /*************************************************
751 * Free chain of file names *
752 *************************************************/
753
754 /*
755 Argument: pointer to start of chain
756 Returns: nothing
757 */
758
759 static void
free_file_chain(fnstr * fn)760 free_file_chain(fnstr *fn)
761 {
762 while (fn != NULL)
763 {
764 fnstr *f = fn;
765 fn = f->next;
766 free(f);
767 }
768 }
769
770
771 /*************************************************
772 * OS-specific functions *
773 *************************************************/
774
775 /* These definitions are needed in all Windows environments, even those where
776 Unix-style directory scanning can be used (see below). */
777
778 #ifdef WIN32
779
780 #ifndef STRICT
781 # define STRICT
782 #endif
783 #ifndef WIN32_LEAN_AND_MEAN
784 # define WIN32_LEAN_AND_MEAN
785 #endif
786
787 #include <windows.h>
788
789 #define iswild(name) (strpbrk(name, "*?") != NULL)
790
791 /* Convert ANSI BGR format to RGB used by Windows */
792 #define BGR_RGB(x) (((x) & 1 ? 4 : 0) | ((x) & 2) | ((x) & 4 ? 1 : 0))
793
794 static HANDLE hstdout;
795 static CONSOLE_SCREEN_BUFFER_INFO csbi;
796 static WORD match_colour;
797
798 static WORD
decode_ANSI_colour(const char * cs)799 decode_ANSI_colour(const char *cs)
800 {
801 WORD result = csbi.wAttributes;
802 while (*cs)
803 {
804 if (isdigit((unsigned char)(*cs)))
805 {
806 int code = atoi(cs);
807 if (code == 1) result |= 0x08;
808 else if (code == 4) result |= 0x8000;
809 else if (code == 5) result |= 0x80;
810 else if (code >= 30 && code <= 37) result = (result & 0xF8) | BGR_RGB(code - 30);
811 else if (code == 39) result = (result & 0xF0) | (csbi.wAttributes & 0x0F);
812 else if (code >= 40 && code <= 47) result = (result & 0x8F) | (BGR_RGB(code - 40) << 4);
813 else if (code == 49) result = (result & 0x0F) | (csbi.wAttributes & 0xF0);
814 /* aixterm high intensity colour codes */
815 else if (code >= 90 && code <= 97) result = (result & 0xF0) | BGR_RGB(code - 90) | 0x08;
816 else if (code >= 100 && code <= 107) result = (result & 0x0F) | (BGR_RGB(code - 100) << 4) | 0x80;
817
818 while (isdigit((unsigned char)(*cs))) cs++;
819 }
820 if (*cs) cs++;
821 }
822 return result;
823 }
824
825
826 static void
init_colour_output()827 init_colour_output()
828 {
829 if (do_colour)
830 {
831 hstdout = GetStdHandle(STD_OUTPUT_HANDLE);
832 /* This fails when redirected to con; try again if so. */
833 if (!GetConsoleScreenBufferInfo(hstdout, &csbi) && !do_ansi)
834 {
835 HANDLE hcon = CreateFile("CONOUT$", GENERIC_READ | GENERIC_WRITE,
836 FILE_SHARE_WRITE, NULL, OPEN_EXISTING, 0, NULL);
837 GetConsoleScreenBufferInfo(hcon, &csbi);
838 CloseHandle(hcon);
839 }
840 match_colour = decode_ANSI_colour(colour_string);
841 /* No valid colour found - turn off colouring */
842 if (!match_colour) do_colour = FALSE;
843 }
844 }
845
846 #endif /* WIN32 */
847
848
849 /* The following sets of functions are defined so that they can be made system
850 specific. At present there are versions for Unix-style environments, Windows,
851 native z/OS, and "no support". */
852
853
854 /************* Directory scanning Unix-style and z/OS ***********/
855
856 #if (defined HAVE_SYS_STAT_H && defined HAVE_DIRENT_H && defined HAVE_SYS_TYPES_H) || defined NATIVE_ZOS
857 #include <sys/types.h>
858 #include <sys/stat.h>
859 #include <dirent.h>
860
861 #if defined NATIVE_ZOS
862 /************* Directory and PDS/E scanning for z/OS ***********/
863 /************* z/OS looks mostly like Unix with USS ************/
864 /* However, z/OS needs the #include statements in this header */
865 #include "pcrzosfs.h"
866 /* That header is not included in the main PCRE distribution because
867 other apparatus is needed to compile pcre2grep for z/OS. The header
868 can be found in the special z/OS distribution, which is available
869 from www.zaconsultants.net or from www.cbttape.org. */
870 #endif
871
872 typedef DIR directory_type;
873 #define FILESEP '/'
874
/* Test whether a path names a directory, using stat(). Returns non-zero for a
directory and zero otherwise. If stat() fails the result is zero, in the
expectation that opening the path as a file will then fail. */

static int
isdirectory(char *filename)
{
struct stat info;
return (stat(filename, &info) >= 0)? S_ISDIR(info.st_mode) : 0;
}
883
884 static directory_type *
opendirectory(char * filename)885 opendirectory(char *filename)
886 {
887 return opendir(filename);
888 }
889
890 static char *
readdirectory(directory_type * dir)891 readdirectory(directory_type *dir)
892 {
893 for (;;)
894 {
895 struct dirent *dent = readdir(dir);
896 if (dent == NULL) return NULL;
897 if (strcmp(dent->d_name, ".") != 0 && strcmp(dent->d_name, "..") != 0)
898 return dent->d_name;
899 }
900 /* Control never reaches here */
901 }
902
903 static void
closedirectory(directory_type * dir)904 closedirectory(directory_type *dir)
905 {
906 closedir(dir);
907 }
908
909
910 /************* Test for regular file, Unix-style **********/
911
/* Report whether a path names a regular file. If stat() fails the path is
reported as regular, so that the caller goes on to open it and the open
failure produces the diagnostic. */

static int
isregfile(char *filename)
{
struct stat sb;
if (stat(filename, &sb) >= 0) return S_ISREG(sb.st_mode);
return 1;
}
920
921
922 #if defined NATIVE_ZOS
923 /************* Test for a terminal in z/OS **********/
924 /* isatty() does not work in a TSO environment, so always give FALSE.*/
925
/* z/OS version: stdout is never reported as a terminal. */

static BOOL
is_stdout_tty(void)
{
return FALSE;
}
931
932 static BOOL
is_file_tty(FILE * f)933 is_file_tty(FILE *f)
934 {
935 return FALSE;
936 }
937
938
939 /************* Test for a terminal, Unix-style **********/
940
941 #else
942 static BOOL
is_stdout_tty(void)943 is_stdout_tty(void)
944 {
945 return isatty(fileno(stdout));
946 }
947
948 static BOOL
is_file_tty(FILE * f)949 is_file_tty(FILE *f)
950 {
951 return isatty(fileno(f));
952 }
953 #endif
954
955
956 /************* Print optionally coloured match Unix-style and z/OS **********/
957
958 static void
print_match(const void * buf,int length)959 print_match(const void *buf, int length)
960 {
961 if (length == 0) return;
962 if (do_colour) fprintf(stdout, "%c[%sm", 0x1b, colour_string);
963 FWRITE_IGNORE(buf, 1, length, stdout);
964 if (do_colour) fprintf(stdout, "%c[0m", 0x1b);
965 }
966
967 /* End of Unix-style or native z/OS environment functions. */
968
969
970 /************* Directory scanning in Windows ***********/
971
972 /* I (Philip Hazel) have no means of testing this code. It was contributed by
973 Lionel Fourquaux. David Burgess added a patch to define INVALID_FILE_ATTRIBUTES
974 when it did not exist. David Byron added a patch that moved the #include of
975 <windows.h> to before the INVALID_FILE_ATTRIBUTES definition rather than after.
976 */
977
978 #elif defined WIN32
979
980 #ifndef INVALID_FILE_ATTRIBUTES
981 #define INVALID_FILE_ATTRIBUTES 0xFFFFFFFF
982 #endif
983
/* State for one directory scan made with FindFirstFile()/FindNextFile(). */

typedef struct directory_type
{
HANDLE handle;          /* Search handle obtained from FindFirstFile() */
BOOL first;             /* TRUE until the entry fetched by FindFirstFile() has been handed out */
WIN32_FIND_DATA data;   /* Most recently fetched entry; its cFileName is what readdirectory() returns */
} directory_type;
990
991 #define FILESEP '/'
992
993 int
isdirectory(char * filename)994 isdirectory(char *filename)
995 {
996 DWORD attr = GetFileAttributes(filename);
997 if (attr == INVALID_FILE_ATTRIBUTES)
998 return 0;
999 return (attr & FILE_ATTRIBUTE_DIRECTORY) != 0;
1000 }
1001
1002 directory_type *
opendirectory(char * filename)1003 opendirectory(char *filename)
1004 {
1005 size_t len;
1006 char *pattern;
1007 directory_type *dir;
1008 DWORD err;
1009 len = strlen(filename);
1010 pattern = (char *)malloc(len + 3);
1011 dir = (directory_type *)malloc(sizeof(*dir));
1012 if ((pattern == NULL) || (dir == NULL))
1013 {
1014 fprintf(stderr, "pcre2grep: malloc failed\n");
1015 pcre2grep_exit(2);
1016 }
1017 memcpy(pattern, filename, len);
1018 if (iswild(filename))
1019 pattern[len] = 0;
1020 else
1021 memcpy(&(pattern[len]), "\\*", 3);
1022 dir->handle = FindFirstFile(pattern, &(dir->data));
1023 if (dir->handle != INVALID_HANDLE_VALUE)
1024 {
1025 free(pattern);
1026 dir->first = TRUE;
1027 return dir;
1028 }
1029 err = GetLastError();
1030 free(pattern);
1031 free(dir);
1032 errno = (err == ERROR_ACCESS_DENIED) ? EACCES : ENOENT;
1033 return NULL;
1034 }
1035
1036 char *
readdirectory(directory_type * dir)1037 readdirectory(directory_type *dir)
1038 {
1039 for (;;)
1040 {
1041 if (!dir->first)
1042 {
1043 if (!FindNextFile(dir->handle, &(dir->data)))
1044 return NULL;
1045 }
1046 else
1047 {
1048 dir->first = FALSE;
1049 }
1050 if (strcmp(dir->data.cFileName, ".") != 0 && strcmp(dir->data.cFileName, "..") != 0)
1051 return dir->data.cFileName;
1052 }
1053 #ifndef _MSC_VER
1054 return NULL; /* Keep compiler happy; never executed */
1055 #endif
1056 }
1057
1058 void
closedirectory(directory_type * dir)1059 closedirectory(directory_type *dir)
1060 {
1061 FindClose(dir->handle);
1062 free(dir);
1063 }
1064
1065
1066 /************* Test for regular file in Windows **********/
1067
1068 /* I don't know how to do this, or if it can be done; assume all paths are
1069 regular if they are not directories. */
1070
/* Windows version: anything that is not a directory counts as regular. */

int
isregfile(char *filename)
{
return isdirectory(filename) == 0;
}
1075
1076
1077 /************* Test for a terminal in Windows **********/
1078
1079 static BOOL
is_stdout_tty(void)1080 is_stdout_tty(void)
1081 {
1082 return _isatty(_fileno(stdout));
1083 }
1084
1085 static BOOL
is_file_tty(FILE * f)1086 is_file_tty(FILE *f)
1087 {
1088 return _isatty(_fileno(f));
1089 }
1090
1091
1092 /************* Print optionally coloured match in Windows **********/
1093
1094 static void
print_match(const void * buf,int length)1095 print_match(const void *buf, int length)
1096 {
1097 if (length == 0) return;
1098 if (do_colour)
1099 {
1100 if (do_ansi) fprintf(stdout, "%c[%sm", 0x1b, colour_string);
1101 else SetConsoleTextAttribute(hstdout, match_colour);
1102 }
1103 FWRITE_IGNORE(buf, 1, length, stdout);
1104 if (do_colour)
1105 {
1106 if (do_ansi) fprintf(stdout, "%c[0m", 0x1b);
1107 else SetConsoleTextAttribute(hstdout, csbi.wAttributes);
1108 }
1109 }
1110
1111 /* End of Windows functions */
1112
1113
1114 /************* Directory scanning when we can't do it ***********/
1115
1116 /* The type is void, and apart from isdirectory(), the functions do nothing. */
1117
1118 #else
1119
1120 #define FILESEP 0
1121 typedef void directory_type;
1122
/* Without directory support nothing is ever reported as a directory. */
int
isdirectory(char *filename)
{
(void)filename;
return 0;
}
opendirectory(char * filename)1124 directory_type * opendirectory(char *filename) { return (directory_type*)0;}
/* Without directory support there are never any entries to read. */
char *
readdirectory(directory_type *dir)
{
(void)dir;
return (char *)0;
}
/* Without directory support there is nothing to close. */
void
closedirectory(directory_type *dir)
{
(void)dir;
}
1127
1128
1129 /************* Test for regular file when we can't do it **********/
1130
1131 /* Assume all files are regular. */
1132
int
isregfile(char *filename)
{
(void)filename;    /* Every path is taken to be a regular file */
return 1;
}
1134
1135
1136 /************* Test for a terminal when we can't do it **********/
1137
/* No-support version: stdout is never reported as a terminal. */

static BOOL
is_stdout_tty(void)
{
return FALSE;
}
1143
1144 static BOOL
is_file_tty(FILE * f)1145 is_file_tty(FILE *f)
1146 {
1147 return FALSE;
1148 }
1149
1150
1151 /************* Print optionally coloured match when we can't do it **********/
1152
/* No-support version: matched text is written to stdout without any
colouring. Empty matches produce no output at all. */

static void
print_match(const void *buf, int length)
{
if (length != 0)
  {
  FWRITE_IGNORE(buf, 1, length, stdout);
  }
}
1159
1160 #endif /* End of system-specific functions */
1161
1162
1163
1164 #ifndef HAVE_STRERROR
1165 /*************************************************
1166 * Provide strerror() for non-ANSI libraries *
1167 *************************************************/
1168
1169 /* Some old-fashioned systems still around (e.g. SunOS4) don't have strerror()
1170 in their libraries, but can provide the same facility by this simple
1171 alternative function. */
1172
1173 extern int sys_nerr;
1174 extern char *sys_errlist[];
1175
1176 char *
strerror(int n)1177 strerror(int n)
1178 {
1179 if (n < 0 || n >= sys_nerr) return "unknown error number";
1180 return sys_errlist[n];
1181 }
1182 #endif /* HAVE_STRERROR */
1183
1184
1185
1186 /*************************************************
1187 * Usage function *
1188 *************************************************/
1189
1190 static int
usage(int rc)1191 usage(int rc)
1192 {
1193 option_item *op;
1194 fprintf(stderr, "Usage: pcre2grep [-");
1195 for (op = optionlist; op->one_char != 0; op++)
1196 {
1197 if (op->one_char > 0) fprintf(stderr, "%c", op->one_char);
1198 }
1199 fprintf(stderr, "] [long options] [pattern] [files]\n");
1200 fprintf(stderr, "Type \"pcre2grep --help\" for more information and the long "
1201 "options.\n");
1202 return rc;
1203 }
1204
1205
1206
1207 /*************************************************
1208 * Help function *
1209 *************************************************/
1210
1211 static void
help(void)1212 help(void)
1213 {
1214 option_item *op;
1215
1216 printf("Usage: pcre2grep [OPTION]... [PATTERN] [FILE1 FILE2 ...]" STDOUT_NL);
1217 printf("Search for PATTERN in each FILE or standard input." STDOUT_NL);
1218 printf("PATTERN must be present if neither -e nor -f is used." STDOUT_NL);
1219
1220 #ifdef SUPPORT_PCRE2GREP_CALLOUT
1221 #ifdef SUPPORT_PCRE2GREP_CALLOUT_FORK
1222 printf("All callout scripts in patterns are supported." STDOUT_NL);
1223 #else
1224 printf("Non-fork callout scripts in patterns are supported." STDOUT_NL);
1225 #endif
1226 #else
1227 printf("Callout scripts are not supported in this pcre2grep." STDOUT_NL);
1228 #endif
1229
1230 printf("\"-\" can be used as a file name to mean STDIN." STDOUT_NL);
1231
1232 #ifdef SUPPORT_LIBZ
1233 printf("Files whose names end in .gz are read using zlib." STDOUT_NL);
1234 #endif
1235
1236 #ifdef SUPPORT_LIBBZ2
1237 printf("Files whose names end in .bz2 are read using bzlib2." STDOUT_NL);
1238 #endif
1239
1240 #if defined SUPPORT_LIBZ || defined SUPPORT_LIBBZ2
1241 printf("Other files and the standard input are read as plain files." STDOUT_NL STDOUT_NL);
1242 #else
1243 printf("All files are read as plain files, without any interpretation." STDOUT_NL STDOUT_NL);
1244 #endif
1245
1246 printf("Example: pcre2grep -i " QUOT "hello.*world" QUOT " menu.h main.c" STDOUT_NL STDOUT_NL);
1247 printf("Options:" STDOUT_NL);
1248
1249 for (op = optionlist; op->one_char != 0; op++)
1250 {
1251 int n;
1252 char s[4];
1253
1254 if (op->one_char > 0 && (op->long_name)[0] == 0)
1255 n = 31 - printf(" -%c", op->one_char);
1256 else
1257 {
1258 if (op->one_char > 0) sprintf(s, "-%c,", op->one_char);
1259 else strcpy(s, " ");
1260 n = 31 - printf(" %s --%s", s, op->long_name);
1261 }
1262
1263 if (n < 1) n = 1;
1264 printf("%.*s%s" STDOUT_NL, n, " ", op->help_text);
1265 }
1266
1267 printf(STDOUT_NL "Numbers may be followed by K or M, e.g. --max-buffer-size=100K." STDOUT_NL);
1268 printf("The default value for --buffer-size is %d." STDOUT_NL, PCRE2GREP_BUFSIZE);
1269 printf("The default value for --max-buffer-size is %d." STDOUT_NL, PCRE2GREP_MAX_BUFSIZE);
1270 printf("When reading patterns or file names from a file, trailing white" STDOUT_NL);
1271 printf("space is removed and blank lines are ignored." STDOUT_NL);
1272 printf("The maximum size of any pattern is %d bytes." STDOUT_NL, MAXPATLEN);
1273
1274 printf(STDOUT_NL "With no FILEs, read standard input. If fewer than two FILEs given, assume -h." STDOUT_NL);
1275 printf("Exit status is 0 if any matches, 1 if no matches, and 2 if trouble." STDOUT_NL);
1276 }
1277
1278
1279
1280 /*************************************************
1281 * Test exclude/includes *
1282 *************************************************/
1283
1284 /* If any exclude pattern matches, the path is excluded. Otherwise, unless
1285 there are no includes, the path must match an include pattern.
1286
1287 Arguments:
1288 path the path to be matched
1289 ip the chain of include patterns
1290 ep the chain of exclude patterns
1291
1292 Returns: TRUE if the path is not excluded
1293 */
1294
1295 static BOOL
test_incexc(char * path,patstr * ip,patstr * ep)1296 test_incexc(char *path, patstr *ip, patstr *ep)
1297 {
1298 int plen = strlen((const char *)path);
1299
1300 for (; ep != NULL; ep = ep->next)
1301 {
1302 if (pcre2_match(ep->compiled, (PCRE2_SPTR)path, plen, 0, 0, match_data, NULL) >= 0)
1303 return FALSE;
1304 }
1305
1306 if (ip == NULL) return TRUE;
1307
1308 for (; ip != NULL; ip = ip->next)
1309 {
1310 if (pcre2_match(ip->compiled, (PCRE2_SPTR)path, plen, 0, 0, match_data, NULL) >= 0)
1311 return TRUE;
1312 }
1313
1314 return FALSE;
1315 }
1316
1317
1318
1319 /*************************************************
1320 * Decode integer argument value *
1321 *************************************************/
1322
1323 /* Integer arguments can be followed by K or M. Avoid the use of strtoul()
1324 because SunOS4 doesn't have it. This is used only for unpicking arguments, so
1325 just keep it simple.
1326
1327 Arguments:
1328 option_data the option data string
1329 op the option item (for error messages)
1330 longop TRUE if option given in long form
1331
1332 Returns: a long integer
1333 */
1334
1335 static long int
decode_number(char * option_data,option_item * op,BOOL longop)1336 decode_number(char *option_data, option_item *op, BOOL longop)
1337 {
1338 unsigned long int n = 0;
1339 char *endptr = option_data;
1340 while (*endptr != 0 && isspace((unsigned char)(*endptr))) endptr++;
1341 while (isdigit((unsigned char)(*endptr)))
1342 n = n * 10 + (int)(*endptr++ - '0');
1343 if (toupper(*endptr) == 'K')
1344 {
1345 n *= 1024;
1346 endptr++;
1347 }
1348 else if (toupper(*endptr) == 'M')
1349 {
1350 n *= 1024*1024;
1351 endptr++;
1352 }
1353
1354 if (*endptr != 0) /* Error */
1355 {
1356 if (longop)
1357 {
1358 char *equals = strchr(op->long_name, '=');
1359 int nlen = (equals == NULL)? (int)strlen(op->long_name) :
1360 (int)(equals - op->long_name);
1361 fprintf(stderr, "pcre2grep: Malformed number \"%s\" after --%.*s\n",
1362 option_data, nlen, op->long_name);
1363 }
1364 else
1365 fprintf(stderr, "pcre2grep: Malformed number \"%s\" after -%c\n",
1366 option_data, op->one_char);
1367 pcre2grep_exit(usage(2));
1368 }
1369
1370 return n;
1371 }
1372
1373
1374
1375 /*************************************************
1376 * Add item to a chain of numbers *
1377 *************************************************/
1378
1379 /* Used to add an item onto a chain, or just return an unconnected item if the
1380 "after" argument is NULL.
1381
1382 Arguments:
1383 n the number to add
1384 after if not NULL points to item to insert after
1385
1386 Returns: new number block
1387 */
1388
1389 static omstr *
add_number(int n,omstr * after)1390 add_number(int n, omstr *after)
1391 {
1392 omstr *om = (omstr *)malloc(sizeof(omstr));
1393
1394 /* LCOV_EXCL_START - These lines won't be hit in normal testing. */
1395
1396 if (om == NULL)
1397 {
1398 fprintf(stderr, "pcre2grep: malloc failed\n");
1399 pcre2grep_exit(2);
1400 }
1401
1402 /* LCOV_EXCL_STOP */
1403
1404 om->next = NULL;
1405 om->groupnum = n;
1406
1407 if (after != NULL)
1408 {
1409 om->next = after->next;
1410 after->next = om;
1411 }
1412 return om;
1413 }
1414
1415
1416
1417 /*************************************************
1418 * Read one line of input *
1419 *************************************************/
1420
1421 /* Normally, input that is to be scanned is read using fread() (or gzread, or
1422 BZ2_read) into a large buffer, so many lines may be read at once. However,
1423 doing this for tty input means that no output appears until a lot of input has
1424 been typed. Instead, tty input is handled line by line. We cannot use fgets()
1425 for this, because it does not stop at a binary zero, and therefore there is no
1426 way of telling how many characters it has read, because there may be binary
1427 zeros embedded in the data. This function is also used for reading patterns
1428 from files (the -f option).
1429
1430 Arguments:
1431 buffer the buffer to read into
1432 length the maximum number of characters to read
1433 f the file
1434
1435 Returns: the number of characters read, zero at end of file
1436 */
1437
1438 static PCRE2_SIZE
read_one_line(char * buffer,PCRE2_SIZE length,FILE * f)1439 read_one_line(char *buffer, PCRE2_SIZE length, FILE *f)
1440 {
1441 int c;
1442 PCRE2_SIZE yield = 0;
1443 while ((c = fgetc(f)) != EOF)
1444 {
1445 buffer[yield++] = c;
1446 if (c == '\n' || yield >= length) break;
1447 }
1448 return yield;
1449 }
1450
1451
1452
1453 /*************************************************
1454 * Find end of line *
1455 *************************************************/
1456
1457 /* The length of the endline sequence that is found is set via lenptr. This may
1458 be zero at the very end of the file if there is no line-ending sequence there.
1459
1460 Arguments:
1461 p current position in line
1462 endptr end of available data
1463 lenptr where to put the length of the eol sequence
1464
1465 Returns: pointer after the last byte of the line,
1466 including the newline byte(s)
1467 */
1468
1469 static char *
end_of_line(char * p,char * endptr,int * lenptr)1470 end_of_line(char *p, char *endptr, int *lenptr)
1471 {
1472 switch(endlinetype)
1473 {
1474 default: /* Just in case */
1475 case PCRE2_NEWLINE_LF:
1476 while (p < endptr && *p != '\n') p++;
1477 if (p < endptr)
1478 {
1479 *lenptr = 1;
1480 return p + 1;
1481 }
1482 *lenptr = 0;
1483 return endptr;
1484
1485 case PCRE2_NEWLINE_CR:
1486 while (p < endptr && *p != '\r') p++;
1487 if (p < endptr)
1488 {
1489 *lenptr = 1;
1490 return p + 1;
1491 }
1492 *lenptr = 0;
1493 return endptr;
1494
1495 case PCRE2_NEWLINE_NUL:
1496 while (p < endptr && *p != '\0') p++;
1497 if (p < endptr)
1498 {
1499 *lenptr = 1;
1500 return p + 1;
1501 }
1502 *lenptr = 0;
1503 return endptr;
1504
1505 case PCRE2_NEWLINE_CRLF:
1506 for (;;)
1507 {
1508 while (p < endptr && *p != '\r') p++;
1509 if (++p >= endptr)
1510 {
1511 *lenptr = 0;
1512 return endptr;
1513 }
1514 if (*p == '\n')
1515 {
1516 *lenptr = 2;
1517 return p + 1;
1518 }
1519 }
1520 break;
1521
1522 case PCRE2_NEWLINE_ANYCRLF:
1523 while (p < endptr)
1524 {
1525 int extra = 0;
1526 int c = *((unsigned char *)p);
1527
1528 if (utf && c >= 0xc0)
1529 {
1530 int gcii, gcss;
1531 extra = utf8_table4[c & 0x3f]; /* Number of additional bytes */
1532 gcss = 6*extra;
1533 c = (c & utf8_table3[extra]) << gcss;
1534 for (gcii = 1; gcii <= extra; gcii++)
1535 {
1536 gcss -= 6;
1537 c |= (p[gcii] & 0x3f) << gcss;
1538 }
1539 }
1540
1541 p += 1 + extra;
1542
1543 switch (c)
1544 {
1545 case '\n':
1546 *lenptr = 1;
1547 return p;
1548
1549 case '\r':
1550 if (p < endptr && *p == '\n')
1551 {
1552 *lenptr = 2;
1553 p++;
1554 }
1555 else *lenptr = 1;
1556 return p;
1557
1558 default:
1559 break;
1560 }
1561 } /* End of loop for ANYCRLF case */
1562
1563 *lenptr = 0; /* Must have hit the end */
1564 return endptr;
1565
1566 case PCRE2_NEWLINE_ANY:
1567 while (p < endptr)
1568 {
1569 int extra = 0;
1570 int c = *((unsigned char *)p);
1571
1572 if (utf && c >= 0xc0)
1573 {
1574 int gcii, gcss;
1575 extra = utf8_table4[c & 0x3f]; /* Number of additional bytes */
1576 gcss = 6*extra;
1577 c = (c & utf8_table3[extra]) << gcss;
1578 for (gcii = 1; gcii <= extra; gcii++)
1579 {
1580 gcss -= 6;
1581 c |= (p[gcii] & 0x3f) << gcss;
1582 }
1583 }
1584
1585 p += 1 + extra;
1586
1587 switch (c)
1588 {
1589 case '\n': /* LF */
1590 case '\v': /* VT */
1591 case '\f': /* FF */
1592 *lenptr = 1;
1593 return p;
1594
1595 case '\r': /* CR */
1596 if (p < endptr && *p == '\n')
1597 {
1598 *lenptr = 2;
1599 p++;
1600 }
1601 else *lenptr = 1;
1602 return p;
1603
1604 #ifndef EBCDIC
1605 case 0x85: /* Unicode NEL */
1606 *lenptr = utf? 2 : 1;
1607 return p;
1608
1609 case 0x2028: /* Unicode LS */
1610 case 0x2029: /* Unicode PS */
1611 *lenptr = 3;
1612 return p;
1613 #endif /* Not EBCDIC */
1614
1615 default:
1616 break;
1617 }
1618 } /* End of loop for ANY case */
1619
1620 *lenptr = 0; /* Must have hit the end */
1621 return endptr;
1622 } /* End of overall switch */
1623 }
1624
1625
1626
1627 /*************************************************
1628 * Find start of previous line *
1629 *************************************************/
1630
1631 /* This is called when looking back for before lines to print.
1632
1633 Arguments:
1634 p start of the subsequent line
1635 startptr start of available data
1636
1637 Returns: pointer to the start of the previous line
1638 */
1639
1640 static char *
previous_line(char * p,char * startptr)1641 previous_line(char *p, char *startptr)
1642 {
1643 switch(endlinetype)
1644 {
1645 default: /* Just in case */
1646 case PCRE2_NEWLINE_LF:
1647 p--;
1648 while (p > startptr && p[-1] != '\n') p--;
1649 return p;
1650
1651 case PCRE2_NEWLINE_CR:
1652 p--;
1653 while (p > startptr && p[-1] != '\n') p--;
1654 return p;
1655
1656 case PCRE2_NEWLINE_NUL:
1657 p--;
1658 while (p > startptr && p[-1] != '\0') p--;
1659 return p;
1660
1661 case PCRE2_NEWLINE_CRLF:
1662 for (;;)
1663 {
1664 p -= 2;
1665 while (p > startptr && p[-1] != '\n') p--;
1666 if (p <= startptr + 1 || p[-2] == '\r') return p;
1667 }
1668 /* Control can never get here */
1669
1670 case PCRE2_NEWLINE_ANY:
1671 case PCRE2_NEWLINE_ANYCRLF:
1672 if (*(--p) == '\n' && p > startptr && p[-1] == '\r') p--;
1673 if (utf) while ((*p & 0xc0) == 0x80) p--;
1674
1675 while (p > startptr)
1676 {
1677 unsigned int c;
1678 char *pp = p - 1;
1679
1680 if (utf)
1681 {
1682 int extra = 0;
1683 while ((*pp & 0xc0) == 0x80) pp--;
1684 c = *((unsigned char *)pp);
1685 if (c >= 0xc0)
1686 {
1687 int gcii, gcss;
1688 extra = utf8_table4[c & 0x3f]; /* Number of additional bytes */
1689 gcss = 6*extra;
1690 c = (c & utf8_table3[extra]) << gcss;
1691 for (gcii = 1; gcii <= extra; gcii++)
1692 {
1693 gcss -= 6;
1694 c |= (pp[gcii] & 0x3f) << gcss;
1695 }
1696 }
1697 }
1698 else c = *((unsigned char *)pp);
1699
1700 if (endlinetype == PCRE2_NEWLINE_ANYCRLF) switch (c)
1701 {
1702 case '\n': /* LF */
1703 case '\r': /* CR */
1704 return p;
1705
1706 default:
1707 break;
1708 }
1709
1710 else switch (c)
1711 {
1712 case '\n': /* LF */
1713 case '\v': /* VT */
1714 case '\f': /* FF */
1715 case '\r': /* CR */
1716 #ifndef EBCDIC
1717 case 0x85: /* Unicode NEL */
1718 case 0x2028: /* Unicode LS */
1719 case 0x2029: /* Unicode PS */
1720 #endif /* Not EBCDIC */
1721 return p;
1722
1723 default:
1724 break;
1725 }
1726
1727 p = pp; /* Back one character */
1728 } /* End of loop for ANY case */
1729
1730 return startptr; /* Hit start of data */
1731 } /* End of overall switch */
1732 }
1733
1734
1735
1736 /*************************************************
1737 * Output newline at end *
1738 *************************************************/
1739
1740 /* This function is called if the final line of a file has been written to
1741 stdout, but it does not have a terminating newline.
1742
1743 Arguments: none
1744 Returns: nothing
1745 */
1746
1747 static void
write_final_newline(void)1748 write_final_newline(void)
1749 {
1750 switch(endlinetype)
1751 {
1752 default: /* Just in case */
1753 case PCRE2_NEWLINE_LF:
1754 case PCRE2_NEWLINE_ANY:
1755 case PCRE2_NEWLINE_ANYCRLF:
1756 fprintf(stdout, "\n");
1757 break;
1758
1759 case PCRE2_NEWLINE_CR:
1760 fprintf(stdout, "\r");
1761 break;
1762
1763 case PCRE2_NEWLINE_CRLF:
1764 fprintf(stdout, "\r\n");
1765 break;
1766
1767 case PCRE2_NEWLINE_NUL:
1768 fprintf(stdout, "%c", 0);
1769 break;
1770 }
1771 }
1772
1773
1774 /*************************************************
1775 * Print the previous "after" lines *
1776 *************************************************/
1777
1778 /* This is called if we are about to lose said lines because of buffer filling,
1779 and at the end of the file. The data in the line is written using fwrite() so
1780 that a binary zero does not terminate it.
1781
1782 Arguments:
1783 lastmatchnumber the number of the last matching line, plus one
1784 lastmatchrestart where we restarted after the last match
1785 endptr end of available data
1786 printname filename for printing
1787
1788 Returns: nothing
1789 */
1790
1791 static void
do_after_lines(unsigned long int lastmatchnumber,char * lastmatchrestart,char * endptr,const char * printname)1792 do_after_lines(unsigned long int lastmatchnumber, char *lastmatchrestart,
1793 char *endptr, const char *printname)
1794 {
1795 if (after_context > 0 && lastmatchnumber > 0)
1796 {
1797 int count = 0;
1798 int ellength = 0;
1799 while (lastmatchrestart < endptr && count < after_context)
1800 {
1801 char *pp = end_of_line(lastmatchrestart, endptr, &ellength);
1802 if (ellength == 0 && pp == main_buffer + bufsize) break;
1803 if (printname != NULL) fprintf(stdout, "%s%c", printname, printname_hyphen);
1804 if (number) fprintf(stdout, "%lu-", lastmatchnumber++);
1805 FWRITE_IGNORE(lastmatchrestart, 1, pp - lastmatchrestart, stdout);
1806 lastmatchrestart = pp;
1807 count++;
1808 }
1809
1810 /* If we have printed any lines, arrange for a hyphen separator if anything
1811 else follows. Also, if the last line is the final line in the file and it had
1812 no newline, add one. */
1813
1814 if (count > 0)
1815 {
1816 hyphenpending = TRUE;
1817 if (ellength == 0 && lastmatchrestart >= endptr)
1818 write_final_newline();
1819 }
1820 }
1821 }
1822
1823
1824
1825 /*************************************************
1826 * Apply patterns to subject till one matches *
1827 *************************************************/
1828
1829 /* This function is called to run through all the patterns, looking for a
1830 match. When all possible matches are required, for example, for colouring, it
1831 checks all patterns for matching, and returns the earliest match. Otherwise, it
1832 returns the first pattern that has matched.
1833
1834 Arguments:
1835 matchptr the start of the subject
1836 length the length of the subject to match
1837 options options for pcre2_match
1838 startoffset where to start matching
1839 mrc address of where to put the result of pcre2_match()
1840
1841 Returns: TRUE if there was a match, match_data and offsets are set
1842 FALSE if there was no match (but no errors)
1843 invert if there was a non-fatal error
1844 */
1845
1846 static BOOL
match_patterns(char * matchptr,PCRE2_SIZE length,unsigned int options,PCRE2_SIZE startoffset,int * mrc)1847 match_patterns(char *matchptr, PCRE2_SIZE length, unsigned int options,
1848 PCRE2_SIZE startoffset, int *mrc)
1849 {
1850 PCRE2_SIZE slen = length;
1851 int first = -1;
1852 int firstrc = 0;
1853 patstr *p = patterns;
1854 const char *msg = "this text:\n\n";
1855
1856 if (slen > 200)
1857 {
1858 slen = 200;
1859 msg = "text that starts:\n\n";
1860 }
1861
1862 for (int i = 1; p != NULL; p = p->next, i++)
1863 {
1864 int rc = pcre2_match(p->compiled, (PCRE2_SPTR)matchptr, length,
1865 startoffset, options, match_data, match_context);
1866 if (rc == PCRE2_ERROR_NOMATCH) continue;
1867
1868 /* Handle a successful match. When all_matches is false, we are done.
1869 Otherwise we must save the earliest match. */
1870
1871 if (rc >= 0)
1872 {
1873 if (!all_matches)
1874 {
1875 *mrc = rc;
1876 return TRUE;
1877 }
1878
1879 if (first < 0 || offsets[0] < offsets_pair[first][0] ||
1880 (offsets[0] == offsets_pair[first][0] &&
1881 offsets[1] > offsets_pair[first][1]))
1882 {
1883 first = match_data_toggle;
1884 firstrc = rc;
1885 match_data_toggle ^= 1;
1886 match_data = match_data_pair[match_data_toggle];
1887 offsets = offsets_pair[match_data_toggle];
1888 }
1889 continue;
1890 }
1891
1892 /* Deal with PCRE2 error. */
1893
1894 fprintf(stderr, "pcre2grep: pcre2_match() gave error %d while matching ", rc);
1895 if (patterns->next != NULL) fprintf(stderr, "pattern number %d to ", i);
1896 fprintf(stderr, "%s", msg);
1897 FWRITE_IGNORE(matchptr, 1, slen, stderr); /* In case binary zero included */
1898 fprintf(stderr, "\n\n");
1899 if (rc <= PCRE2_ERROR_UTF8_ERR1 &&
1900 rc >= PCRE2_ERROR_UTF8_ERR21)
1901 {
1902 unsigned char mbuffer[256];
1903 PCRE2_SIZE startchar = pcre2_get_startchar(match_data);
1904 (void)pcre2_get_error_message(rc, mbuffer, sizeof(mbuffer));
1905 fprintf(stderr, "%s at offset %" SIZ_FORM "\n\n", mbuffer, startchar);
1906 }
1907 if (rc == PCRE2_ERROR_MATCHLIMIT || rc == PCRE2_ERROR_DEPTHLIMIT ||
1908 rc == PCRE2_ERROR_HEAPLIMIT || rc == PCRE2_ERROR_JIT_STACKLIMIT)
1909 resource_error = TRUE;
1910 if (error_count++ > 20)
1911 {
1912 fprintf(stderr, "pcre2grep: Too many errors - abandoned.\n");
1913 pcre2grep_exit(2);
1914 }
1915 return invert; /* No more matching; don't show the line again */
1916 }
1917
1918 /* We get here when all patterns have been tried. If all_matches is false,
1919 this means that none of them matched. If all_matches is true, matched_first
1920 will be non-NULL if there was at least one match, and it will point to the
1921 appropriate match_data block. */
1922
1923 if (!all_matches || first < 0) return FALSE;
1924
1925 match_data_toggle = first;
1926 match_data = match_data_pair[first];
1927 offsets = offsets_pair[first];
1928 *mrc = firstrc;
1929 return TRUE;
1930 }
1931
1932
1933
1934 /*************************************************
1935 * Decode dollar escape sequence *
1936 *************************************************/
1937
1938 /* Called from various places to decode $ escapes in output strings. The escape
1939 sequences are as follows:
1940
1941 $<digits> or ${<digits>} returns a capture number. However, if callout is TRUE,
1942 zero is never returned; '0' is substituted.
1943
1944 $a returns bell.
1945 $b returns backspace.
1946 $e returns escape.
1947 $f returns form feed.
1948 $n returns newline.
1949 $r returns carriage return.
1950 $t returns tab.
1951 $v returns vertical tab.
1952 $o<digits> returns the character represented by the given octal
1953 number; up to three digits are processed.
1954 $o{<digits>} does the same, up to 7 digits, but gives an error for mode-invalid
1955 code points.
1956 $x<digits> returns the character represented by the given hexadecimal
1957 number; up to two digits are processed.
$x{<digits>} does the same, up to 6 digits, but gives an error for mode-invalid
1959 code points.
1960 Any other character is substituted by itself. E.g: $$ is replaced by a single
1961 dollar.
1962
1963 Arguments:
1964 begin the start of the whole string
1965 string points to the $
1966 callout TRUE if in a callout (inhibits error messages)
1967 value where to return a value
1968 last where to return pointer to the last used character
1969
1970 Returns: DDE_ERROR after a syntax error
1971 DDE_CAPTURE if *value is a capture number
1972 DDE_CHAR if *value is a character code
1973 */
1974
1975 static int
decode_dollar_escape(PCRE2_SPTR begin,PCRE2_SPTR string,BOOL callout,uint32_t * value,PCRE2_SPTR * last)1976 decode_dollar_escape(PCRE2_SPTR begin, PCRE2_SPTR string, BOOL callout,
1977 uint32_t *value, PCRE2_SPTR *last)
1978 {
1979 uint32_t c = 0;
1980 int base = 10;
1981 int dcount;
1982 int rc = DDE_CHAR;
1983 BOOL brace = FALSE;
1984
1985 switch (*(++string))
1986 {
1987 case 0: /* Syntax error: a character must be present after $. */
1988 if (!callout)
1989 fprintf(stderr, "pcre2grep: Error in output text at offset %d: %s\n",
1990 (int)(string - begin), "no character after $");
1991 *last = string;
1992 return DDE_ERROR;
1993
1994 case '{':
1995 brace = TRUE;
1996 string++;
1997 if (!isdigit((unsigned char)(*string))) /* Syntax error: a decimal number required. */
1998 {
1999 if (!callout)
2000 fprintf(stderr, "pcre2grep: Error in output text at offset %d: %s\n",
2001 (int)(string - begin), "decimal number expected");
2002 rc = DDE_ERROR;
2003 break;
2004 }
2005
2006 /* Fall through */
2007
2008 /* The maximum capture number is 65535, so any number greater than that will
2009 always be an unknown capture number. We just stop incrementing, in order to
2010 avoid overflow. */
2011
2012 case '0': case '1': case '2': case '3': case '4':
2013 case '5': case '6': case '7': case '8': case '9':
2014 do
2015 {
2016 if (c <= 65535) c = c * 10 + (*string - '0');
2017 string++;
2018 }
2019 while (*string >= '0' && *string <= '9');
2020 string--; /* Point to last digit */
2021
2022 /* In a callout, capture number 0 is not available. No error can be given,
2023 so just return the character '0'. */
2024
2025 if (callout && c == 0)
2026 {
2027 *value = '0';
2028 }
2029 else
2030 {
2031 *value = c;
2032 rc = DDE_CAPTURE;
2033 }
2034 break;
2035
2036 /* Limit octal numbers to 3 digits without braces, or up to 7 with braces,
2037 for valid Unicode code points. */
2038
2039 case 'o':
2040 base = 8;
2041 string++;
2042 if (*string == '{')
2043 {
2044 brace = TRUE;
2045 string++;
2046 dcount = 7;
2047 }
2048 else dcount = 3;
2049 for (; dcount > 0; dcount--)
2050 {
2051 if (*string < '0' || *string > '7') break;
2052 c = c * 8 + (*string++ - '0');
2053 }
2054 *value = c;
2055 string--; /* Point to last digit */
2056 break;
2057
2058 /* Limit hex numbers to 2 digits without braces, or up to 6 with braces,
2059 for valid Unicode code points. */
2060
2061 case 'x':
2062 base = 16;
2063 string++;
2064 if (*string == '{')
2065 {
2066 brace = TRUE;
2067 string++;
2068 dcount = 6;
2069 }
2070 else dcount = 2;
2071 for (; dcount > 0; dcount--)
2072 {
2073 if (!isxdigit(*string)) break;
2074 if (*string >= '0' && *string <= '9')
2075 c = c *16 + *string++ - '0';
2076 else
2077 c = c * 16 + (*string++ | 0x20) - 'a' + 10;
2078 }
2079 *value = c;
2080 string--; /* Point to last digit */
2081 break;
2082
2083 case 'a': *value = '\a'; break;
2084 case 'b': *value = '\b'; break;
2085 #ifndef EBCDIC
2086 case 'e': *value = '\033'; break;
2087 #else
2088 case 'e': *value = '\047'; break;
2089 #endif
2090 case 'f': *value = '\f'; break;
2091 case 'n': *value = STDOUT_NL_CODE; break;
2092 case 'r': *value = '\r'; break;
2093 case 't': *value = '\t'; break;
2094 case 'v': *value = '\v'; break;
2095
2096 default: *value = *string; break;
2097 }
2098
2099 if (brace)
2100 {
2101 c = string[1];
2102 if (c != '}')
2103 {
2104 rc = DDE_ERROR;
2105 if (!callout)
2106 {
2107 if ((base == 8 && c >= '0' && c <= '7') ||
2108 (base == 16 && isxdigit(c)))
2109 {
2110 fprintf(stderr, "pcre2grep: Error in output text at offset %d: "
2111 "too many %s digits\n", (int)(string - begin),
2112 (base == 8)? "octal" : "hex");
2113 }
2114 else
2115 {
2116 fprintf(stderr, "pcre2grep: Error in output text at offset %d: %s\n",
2117 (int)(string - begin), "missing closing brace");
2118 }
2119 }
2120 }
2121 else string++;
2122 }
2123
2124 /* Check maximum code point values, but take note of STDOUT_NL_CODE. */
2125
2126 if (rc == DDE_CHAR && *value != STDOUT_NL_CODE)
2127 {
2128 uint32_t max = utf? 0x0010ffffu : 0xffu;
2129 if (*value > max)
2130 {
2131 if (!callout)
2132 fprintf(stderr, "pcre2grep: Error in output text at offset %d: "
2133 "code point greater than 0x%x is invalid\n", (int)(string - begin), max);
2134 rc = DDE_ERROR;
2135 }
2136 }
2137
2138 *last = string;
2139 return rc;
2140 }
2141
2142
2143
2144 /*************************************************
2145 * Check output text for errors *
2146 *************************************************/
2147
2148 /* Called early, to get errors before doing anything for -O text; also called
2149 from callouts to check before outputting.
2150
2151 Arguments:
2152 string an --output text string
2153 callout TRUE if in a callout (stops printing errors)
2154
2155 Returns: TRUE if OK, FALSE on error
2156 */
2157
2158 static BOOL
syntax_check_output_text(PCRE2_SPTR string,BOOL callout)2159 syntax_check_output_text(PCRE2_SPTR string, BOOL callout)
2160 {
2161 uint32_t value;
2162 PCRE2_SPTR begin = string;
2163
2164 for (; *string != 0; string++)
2165 {
2166 if (*string == '$' &&
2167 decode_dollar_escape(begin, string, callout, &value, &string) == DDE_ERROR)
2168 return FALSE;
2169 }
2170
2171 return TRUE;
2172 }
2173
2174
2175 /*************************************************
2176 * Display output text *
2177 *************************************************/
2178
2179 /* Display the output text, which is assumed to have already been syntax
2180 checked. Output may contain escape sequences started by the dollar sign.
2181
2182 Arguments:
2183 string: the output text
2184 callout: TRUE for the builtin callout, FALSE for --output
2185 subject the start of the subject
2186 ovector: capture offsets
2187 capture_top: number of captures
2188
2189 Returns: TRUE if something was output, other than newline
2190 FALSE if nothing was output, or newline was last output
2191 */
2192
2193 static BOOL
display_output_text(PCRE2_SPTR string,BOOL callout,PCRE2_SPTR subject,PCRE2_SIZE * ovector,PCRE2_SIZE capture_top)2194 display_output_text(PCRE2_SPTR string, BOOL callout, PCRE2_SPTR subject,
2195 PCRE2_SIZE *ovector, PCRE2_SIZE capture_top)
2196 {
2197 uint32_t value;
2198 BOOL printed = FALSE;
2199 PCRE2_SPTR begin = string;
2200
2201 for (; *string != 0; string++)
2202 {
2203 if (*string == '$')
2204 {
2205 switch(decode_dollar_escape(begin, string, callout, &value, &string))
2206 {
2207 case DDE_CHAR:
2208 if (value == STDOUT_NL_CODE)
2209 {
2210 fprintf(stdout, STDOUT_NL);
2211 printed = FALSE;
2212 continue;
2213 }
2214 break; /* Will print value */
2215
2216 case DDE_CAPTURE:
2217 if (value < capture_top)
2218 {
2219 PCRE2_SIZE capturesize;
2220 value *= 2;
2221 capturesize = ovector[value + 1] - ovector[value];
2222 if (capturesize > 0)
2223 {
2224 print_match(subject + ovector[value], capturesize);
2225 printed = TRUE;
2226 }
2227 }
2228 continue;
2229
2230 /* LCOV_EXCL_START */
2231 default: /* Should not occur */
2232 break;
2233 /* LCOV_EXCL_STOP */
2234 }
2235 }
2236
2237 else value = *string; /* Not a $ escape */
2238
2239 if (!utf || value <= 127) fprintf(stdout, "%c", value); else
2240 {
2241 int n = ord2utf8(value);
2242 for (int i = 0; i < n; i++) fputc(utf8_buffer[i], stdout);
2243 }
2244
2245 printed = TRUE;
2246 }
2247
2248 return printed;
2249 }
2250
2251
2252 #ifdef SUPPORT_PCRE2GREP_CALLOUT
2253
2254 /*************************************************
2255 * Parse and execute callout scripts *
2256 *************************************************/
2257
2258 /* If SUPPORT_PCRE2GREP_CALLOUT_FORK is defined, this function parses a callout
2259 string block and executes the program specified by the string. The string is a
2260 list of substrings separated by pipe characters. The first substring represents
2261 the executable name, and the following substrings specify the arguments:
2262
2263 program_name|param1|param2|...
2264
2265 Any substring (including the program name) can contain escape sequences
2266 started by the dollar character. The escape sequences are substituted as
2267 follows:
2268
2269 $<digits> or ${<digits>} is replaced by the captured substring of the given
2270 decimal number, which must be greater than zero. If the number is greater
2271 than the number of capturing substrings, or if the capture is unset, the
2272 replacement is empty.
2273
2274 Any other character is substituted by itself. E.g: $$ is replaced by a single
2275 dollar or $| replaced by a pipe character.
2276
2277 Alternatively, if string starts with pipe, the remainder is taken as an output
2278 string, same as --output. This is the only form that is supported if
2279 SUPPORT_PCRE2GREP_FORK is not defined. In this case, --om-separator is used to
2280 separate each callout, defaulting to newline.
2281
2282 Example:
2283
2284 echo -e "abcde\n12345" | pcre2grep \
2285 '(.)(..(.))(?C"/bin/echo|Arg1: [$1] [$2] [$3]|Arg2: $|${1}$| ($4)")()' -
2286
2287 Output:
2288
2289 Arg1: [a] [bcd] [d] Arg2: |a| ()
2290 abcde
2291 Arg1: [1] [234] [4] Arg2: |1| ()
2292 12345
2293
2294 Arguments:
2295 blockptr the callout block
2296
2297 Returns: currently it always returns with 0
2298 */
2299
2300 static int
pcre2grep_callout(pcre2_callout_block * calloutptr,void * unused)2301 pcre2grep_callout(pcre2_callout_block *calloutptr, void *unused)
2302 {
2303 PCRE2_SIZE length = calloutptr->callout_string_length;
2304 PCRE2_SPTR string = calloutptr->callout_string;
2305 PCRE2_SPTR subject = calloutptr->subject;
2306 PCRE2_SIZE *ovector = calloutptr->offset_vector;
2307 PCRE2_SIZE capture_top = calloutptr->capture_top;
2308
2309 #ifdef SUPPORT_PCRE2GREP_CALLOUT_FORK
2310 PCRE2_SIZE argsvectorlen = 2;
2311 PCRE2_SIZE argslen = 1;
2312 char *args;
2313 char *argsptr;
2314 char **argsvector;
2315 char **argsvectorptr;
2316 #ifndef WIN32
2317 pid_t pid;
2318 #endif
2319 int result = 0;
2320 #endif /* SUPPORT_PCRE2GREP_CALLOUT_FORK */
2321
2322 (void)unused; /* Avoid compiler warning */
2323
2324 /* Only callouts with strings are supported. */
2325
2326 if (string == NULL || length == 0) return 0;
2327
2328 /* If there's no command, output the remainder directly. */
2329
2330 if (*string == '|')
2331 {
2332 string++;
2333 if (!syntax_check_output_text(string, TRUE)) return 0;
2334 (void)display_output_text(string, TRUE, subject, ovector, capture_top);
2335 return 0;
2336 }
2337
2338 #ifndef SUPPORT_PCRE2GREP_CALLOUT_FORK
2339 return 0;
2340 #else
2341
2342 /* Checking syntax and compute the number of string fragments. Callout strings
2343 are silently ignored in the event of a syntax error. */
2344
2345 while (length > 0)
2346 {
2347 if (*string == '|')
2348 {
2349 argsvectorlen++;
2350 if (argsvectorlen > 10000) return 0; /* Too many args */
2351 }
2352
2353 else if (*string == '$')
2354 {
2355 uint32_t value;
2356 PCRE2_SPTR begin = string;
2357
2358 switch (decode_dollar_escape(begin, string, TRUE, &value, &string))
2359 {
2360 case DDE_CAPTURE:
2361 if (value < capture_top)
2362 {
2363 value *= 2;
2364 argslen += ovector[value + 1] - ovector[value];
2365 }
2366 argslen--; /* Negate the effect of argslen++ below. */
2367 break;
2368
2369 case DDE_CHAR:
2370 if (value == STDOUT_NL_CODE) argslen += STDOUT_NL_LEN - 1;
2371 else if (utf && value > 127) argslen += ord2utf8(value) - 1;
2372 break;
2373
2374 /* LCOV_EXCL_START */
2375 default: /* Should not occur */
2376 case DDE_ERROR:
2377 return 0;
2378 /* LCOV_EXCL_STOP */
2379 }
2380
2381 length -= (string - begin);
2382 }
2383
2384 string++;
2385 length--;
2386 argslen++;
2387 }
2388
2389 /* Get memory for the argument vector and its strings. */
2390
2391 args = (char*)malloc(argslen);
2392 if (args == NULL) return 0;
2393
2394 argsvector = (char**)malloc(argsvectorlen * sizeof(char*));
2395 if (argsvector == NULL)
2396 {
2397 /* LCOV_EXCL_START */
2398 free(args);
2399 return 0;
2400 /* LCOV_EXCL_STOP */
2401 }
2402
2403 /* Now reprocess the string and set up the arguments. */
2404
2405 argsptr = args;
2406 argsvectorptr = argsvector;
2407 *argsvectorptr++ = argsptr;
2408
2409 length = calloutptr->callout_string_length;
2410 string = calloutptr->callout_string;
2411
2412 while (length > 0)
2413 {
2414 if (*string == '|')
2415 {
2416 *argsptr++ = '\0';
2417 *argsvectorptr++ = argsptr;
2418 }
2419
2420 else if (*string == '$')
2421 {
2422 uint32_t value;
2423 PCRE2_SPTR begin = string;
2424
2425 switch (decode_dollar_escape(begin, string, TRUE, &value, &string))
2426 {
2427 case DDE_CAPTURE:
2428 if (value < capture_top)
2429 {
2430 PCRE2_SIZE capturesize;
2431 value *= 2;
2432 capturesize = ovector[value + 1] - ovector[value];
2433 memcpy(argsptr, subject + ovector[value], capturesize);
2434 argsptr += capturesize;
2435 }
2436 break;
2437
2438 case DDE_CHAR:
2439 if (value == STDOUT_NL_CODE)
2440 {
2441 memcpy(argsptr, STDOUT_NL, STDOUT_NL_LEN);
2442 argsptr += STDOUT_NL_LEN;
2443 }
2444 else if (utf && value > 127)
2445 {
2446 int n = ord2utf8(value);
2447 memcpy(argsptr, utf8_buffer, n);
2448 argsptr += n;
2449 }
2450 else
2451 {
2452 *argsptr++ = value;
2453 }
2454 break;
2455
2456 /* LCOV_EXCL_START */
2457 default: /* Even though this should not occur, the string having */
2458 case DDE_ERROR: /* been checked above, we need to include the free() */
2459 free(args); /* calls so that source checkers do not complain. */
2460 free(argsvector);
2461 return 0;
2462 /* LCOV_EXCL_STOP */
2463 }
2464
2465 length -= (string - begin);
2466 }
2467
2468 else *argsptr++ = *string;
2469
2470 /* Advance along the string */
2471
2472 string++;
2473 length--;
2474 }
2475
2476 *argsptr++ = '\0';
2477 *argsvectorptr = NULL;
2478
2479 /* Running an external command is system-dependent. Handle Windows and VMS as
2480 necessary, otherwise assume fork(). */
2481
2482 #ifdef WIN32
2483 result = _spawnvp(_P_WAIT, argsvector[0], (const char * const *)argsvector);
2484
2485 #elif defined __VMS
2486 {
2487 char cmdbuf[500];
2488 short i = 0;
2489 int flags = CLI$M_NOCLISYM|CLI$M_NOLOGNAM|CLI$M_NOKEYPAD, status, retstat;
2490 $DESCRIPTOR(cmd, cmdbuf);
2491
2492 cmdbuf[0] = 0;
2493 while (argsvector[i])
2494 {
2495 strcat(cmdbuf, argsvector[i]);
2496 strcat(cmdbuf, " ");
2497 i++;
2498 }
2499 cmd.dsc$w_length = strlen(cmdbuf) - 1;
2500 status = lib$spawn(&cmd, 0,0, &flags, 0,0, &retstat);
2501 if (!(status & 1)) result = 0;
2502 else result = retstat & 1 ? 0 : 1;
2503 }
2504
2505 #else /* Neither Windows nor VMS */
2506 pid = fork();
2507 if (pid == 0)
2508 {
2509 (void)execv(argsvector[0], argsvector);
2510 /* Control gets here if there is an error, e.g. a non-existent program */
2511 exit(1);
2512 }
2513 else if (pid > 0)
2514 {
2515 (void)fflush(stdout);
2516 (void)waitpid(pid, &result, 0);
2517 (void)fflush(stdout);
2518 }
2519 #endif /* End Windows/VMS/other handling */
2520
2521 free(args);
2522 free(argsvector);
2523
2524 /* Currently negative return values are not supported, only zero (match
2525 continues) or non-zero (match fails). */
2526
2527 return result != 0;
2528 #endif /* SUPPORT_PCRE2GREP_CALLOUT_FORK */
2529 }
2530 #endif /* SUPPORT_PCRE2GREP_CALLOUT */
2531
2532
2533
2534 /*************************************************
2535 * Read a portion of the file into buffer *
2536 *************************************************/
2537
2538 static PCRE2_SIZE
fill_buffer(void * handle,int frtype,char * buffer,PCRE2_SIZE length,BOOL input_line_buffered)2539 fill_buffer(void *handle, int frtype, char *buffer, PCRE2_SIZE length,
2540 BOOL input_line_buffered)
2541 {
2542 (void)frtype; /* Avoid warning when not used */
2543
2544 #ifdef SUPPORT_LIBZ
2545 if (frtype == FR_LIBZ)
2546 return gzread((gzFile)handle, buffer, length);
2547 else
2548 #endif
2549
2550 #ifdef SUPPORT_LIBBZ2
2551 if (frtype == FR_LIBBZ2)
2552 return (PCRE2_SIZE)BZ2_bzread((BZFILE *)handle, buffer, length);
2553 else
2554 #endif
2555
2556 return (input_line_buffered ?
2557 read_one_line(buffer, length, (FILE *)handle) :
2558 fread(buffer, 1, length, (FILE *)handle));
2559 }
2560
2561
2562
2563 /*************************************************
2564 * Grep an individual file *
2565 *************************************************/
2566
2567 /* This is called from grep_or_recurse() below. It uses a buffer that is three
2568 times the value of bufthird. The matching point is never allowed to stray into
2569 the top third of the buffer, thus keeping more of the file available for
2570 context printing or for multiline scanning. For large files, the pointer will
2571 be in the middle third most of the time, so the bottom third is available for
2572 "before" context printing.
2573
2574 Arguments:
2575 handle the fopened FILE stream for a normal file
2576 the gzFile pointer when reading is via libz
2577 the BZFILE pointer when reading is via libbz2
2578 frtype FR_PLAIN, FR_LIBZ, or FR_LIBBZ2
2579 filename the file name or NULL (for errors)
2580 printname the file name if it is to be printed for each match
2581 or NULL if the file name is not to be printed
2582 it cannot be NULL if filenames[_nomatch]_only is set
2583
2584 Returns: 0 if there was at least one match
2585 1 otherwise (no matches)
2586 2 if an overlong line is encountered
2587 3 if there is a read error on a .bz2 file
2588 */
2589
2590 static int
pcre2grep(void * handle,int frtype,const char * filename,const char * printname)2591 pcre2grep(void *handle, int frtype, const char *filename, const char *printname)
2592 {
2593 int rc = 1;
2594 int filepos = 0;
2595 unsigned long int linenumber = 1;
2596 unsigned long int lastmatchnumber = 0;
2597 unsigned long int count = 0;
2598 long int count_matched_lines = 0;
2599 char *lastmatchrestart = main_buffer;
2600 char *ptr = main_buffer;
2601 char *endptr;
2602 PCRE2_SIZE bufflength;
2603 BOOL binary = FALSE;
2604 BOOL endhyphenpending = FALSE;
2605 BOOL lines_printed = FALSE;
2606 BOOL input_line_buffered = line_buffered;
2607 FILE *in = NULL; /* Ensure initialized */
2608 long stream_start = -1; /* Only non-negative if relevant */
2609
2610 /* Do the first read into the start of the buffer and set up the pointer to end
2611 of what we have. In the case of libz, a non-zipped .gz file will be read as a
2612 plain file. However, if a .bz2 file isn't actually bzipped, the first read will
2613 fail. */
2614
2615 if (frtype != FR_LIBZ && frtype != FR_LIBBZ2)
2616 {
2617 in = (FILE *)handle;
2618 if (feof(in)) return 1;
2619 if (is_file_tty(in)) input_line_buffered = TRUE;
2620 else
2621 {
2622 if (count_limit >= 0 && filename == stdin_name)
2623 stream_start = ftell(in);
2624 }
2625 }
2626 else input_line_buffered = FALSE;
2627
2628 bufflength = fill_buffer(handle, frtype, main_buffer, bufsize,
2629 input_line_buffered);
2630
2631 #ifdef SUPPORT_LIBBZ2
2632 if (frtype == FR_LIBBZ2 && (int)bufflength < 0) return 3; /* Gotcha: bufflength is PCRE2_SIZE */
2633 #endif
2634
2635 endptr = main_buffer + bufflength;
2636
2637 /* Unless binary-files=text, see if we have a binary file. This uses the same
2638 rule as GNU grep, namely, a search for a binary zero byte near the start of the
2639 file. However, when the newline convention is binary zero, we can't do this. */
2640
2641 if (binary_files != BIN_TEXT)
2642 {
2643 if (endlinetype != PCRE2_NEWLINE_NUL)
2644 binary = memchr(main_buffer, 0, (bufflength > 1024)? 1024 : bufflength)
2645 != NULL;
2646 if (binary && binary_files == BIN_NOMATCH) return 1;
2647 }
2648
2649 /* Loop while the current pointer is not at the end of the file. For large
2650 files, endptr will be at the end of the buffer when we are in the middle of the
2651 file, but ptr will never get there, because as soon as it gets over 2/3 of the
2652 way, the buffer is shifted left and re-filled. */
2653
2654 while (ptr < endptr)
2655 {
2656 int endlinelength;
2657 int mrc = 0;
2658 unsigned int options = 0;
2659 BOOL match;
2660 BOOL line_matched = FALSE;
2661 char *t = ptr;
2662 PCRE2_SIZE length, linelength;
2663 PCRE2_SIZE startoffset = 0;
2664
2665 /* If the -m option set a limit for the number of matched or non-matched
2666 lines, check it here. A limit of zero means that no matching is ever done.
2667 For stdin from a file, set the file position. */
2668
2669 if (count_limit >= 0 && count_matched_lines >= count_limit)
2670 {
2671 if (stream_start >= 0)
2672 (void)fseek(handle, stream_start + (long int)filepos, SEEK_SET);
2673 rc = (count_limit == 0)? 1 : 0;
2674 break;
2675 }
2676
2677 /* At this point, ptr is at the start of a line. We need to find the length
2678 of the subject string to pass to pcre2_match(). In multiline mode, it is the
2679 length remainder of the data in the buffer. Otherwise, it is the length of
2680 the next line, excluding the terminating newline. After matching, we always
2681 advance by the length of the next line. In multiline mode the PCRE2_FIRSTLINE
2682 option is used for compiling, so that any match is constrained to be in the
2683 first line. */
2684
2685 t = end_of_line(t, endptr, &endlinelength);
2686 linelength = t - ptr - endlinelength;
2687 length = multiline? (PCRE2_SIZE)(endptr - ptr) : linelength;
2688
2689 /* Check to see if the line we are looking at extends right to the very end
2690 of the buffer without a line terminator. This means the line is too long to
2691 handle at the current buffer size. Until the buffer reaches its maximum size,
2692 try doubling it and reading more data. */
2693
2694 if (endlinelength == 0 && t == main_buffer + bufsize)
2695 {
2696 if (bufthird < max_bufthird)
2697 {
2698 char *new_buffer;
2699 PCRE2_SIZE new_bufthird = 2*bufthird;
2700
2701 if (new_bufthird > max_bufthird) new_bufthird = max_bufthird;
2702 new_buffer = (char *)malloc(3*new_bufthird);
2703
2704 if (new_buffer == NULL)
2705 {
2706 /* LCOV_EXCL_START */
2707 fprintf(stderr,
2708 "pcre2grep: line %lu%s%s is too long for the internal buffer\n"
2709 "pcre2grep: not enough memory to increase the buffer size to %"
2710 SIZ_FORM "\n",
2711 linenumber,
2712 (filename == NULL)? "" : " of file ",
2713 (filename == NULL)? "" : filename,
2714 new_bufthird);
2715 return 2;
2716 /* LCOV_EXCL_STOP */
2717 }
2718
2719 /* Copy the data and adjust pointers to the new buffer location. */
2720
2721 memcpy(new_buffer, main_buffer, bufsize);
2722 bufthird = new_bufthird;
2723 bufsize = 3*bufthird;
2724 ptr = new_buffer + (ptr - main_buffer);
2725 lastmatchrestart = new_buffer + (lastmatchrestart - main_buffer);
2726 free(main_buffer);
2727 main_buffer = new_buffer;
2728
2729 /* Read more data into the buffer and then try to find the line ending
2730 again. */
2731
2732 bufflength += fill_buffer(handle, frtype, main_buffer + bufflength,
2733 bufsize - bufflength, input_line_buffered);
2734 endptr = main_buffer + bufflength;
2735 continue;
2736 }
2737 else
2738 {
2739 fprintf(stderr,
2740 "pcre2grep: line %lu%s%s is too long for the internal buffer\n"
2741 "pcre2grep: the maximum buffer size is %" SIZ_FORM "\n"
2742 "pcre2grep: use the --max-buffer-size option to change it\n",
2743 linenumber,
2744 (filename == NULL)? "" : " of file ",
2745 (filename == NULL)? "" : filename,
2746 bufthird);
2747 return 2;
2748 }
2749 }
2750
2751 /* We come back here after a match when only_matching_count is non-zero, in
2752 order to find any further matches in the same line. This applies to
2753 --only-matching, --file-offsets, and --line-offsets. */
2754
2755 ONLY_MATCHING_RESTART:
2756
2757 /* Run through all the patterns until one matches or there is an error other
2758 than NOMATCH. This code is in a subroutine so that it can be re-used for
2759 finding subsequent matches when colouring matched lines. After finding one
2760 match, set PCRE2_NOTEMPTY to disable any further matches of null strings in
2761 this line. */
2762
2763 match = match_patterns(ptr, length, options, startoffset, &mrc);
2764 options = PCRE2_NOTEMPTY;
2765
2766 /* If it's a match or a not-match (as required), do what's wanted. NOTE: Use
2767 only FWRITE_IGNORE() - which is just a packaged fwrite() that ignores its
2768 return code - to output data lines, so that binary zeroes are treated as just
2769 another data character. */
2770
2771 if (match != invert)
2772 {
2773 BOOL hyphenprinted = FALSE;
2774
2775 /* We've failed if we want a file that doesn't have any matches. */
2776
2777 if (filenames == FN_NOMATCH_ONLY) return 1;
2778
2779 /* Remember that this line matched (for counting matched lines) */
2780
2781 line_matched = TRUE;
2782
2783 /* If all we want is a yes/no answer, we can return immediately. */
2784
2785 if (quiet) return 0;
2786
2787 /* Just count if just counting is wanted. */
2788
2789 else if (count_only || show_total_count) count++;
2790
2791 /* When handling a binary file and binary-files==binary, the "binary"
2792 variable will be set true (it's false in all other cases). In this
2793 situation we just want to output the file name. No need to scan further. */
2794
2795 else if (binary)
2796 {
2797 fprintf(stdout, "Binary file %s matches" STDOUT_NL, filename);
2798 return 0;
2799 }
2800
2801 /* Likewise, if all we want is a file name, there is no need to scan any
2802 more lines in the file. */
2803
2804 else if (filenames == FN_MATCH_ONLY)
2805 {
2806 fprintf(stdout, "%s", printname);
2807 if (printname_nl == NULL) fprintf(stdout, "%c", 0);
2808 else fprintf(stdout, "%s", printname_nl);
2809 return 0;
2810 }
2811
2812 /* The --only-matching option prints just the substring that matched,
2813 and/or one or more captured portions of it, as long as these strings are
2814 not empty. The --file-offsets and --line-offsets options output offsets for
2815 the matching substring (all three set only_matching_count non-zero). None
2816 of these mutually exclusive options prints any context. Afterwards, adjust
2817 the start and then jump back to look for further matches in the same line.
2818 If we are in invert mode, however, nothing is printed and we do not restart
2819 - this could still be useful because the return code is set. */
2820
2821 else if (only_matching_count != 0)
2822 {
2823 if (!invert)
2824 {
2825 PCRE2_SIZE oldstartoffset;
2826
2827 if (printname != NULL) fprintf(stdout, "%s%c", printname,
2828 printname_colon);
2829 if (number) fprintf(stdout, "%lu:", linenumber);
2830
2831 /* Handle --line-offsets */
2832
2833 if (line_offsets)
2834 fprintf(stdout, "%d,%d" STDOUT_NL, (int)(ptr + offsets[0] - ptr),
2835 (int)(offsets[1] - offsets[0]));
2836
2837 /* Handle --file-offsets */
2838
2839 else if (file_offsets)
2840 fprintf(stdout, "%d,%d" STDOUT_NL,
2841 (int)(filepos + ptr + offsets[0] - ptr),
2842 (int)(offsets[1] - offsets[0]));
2843
2844 /* Handle --output (which has already been syntax checked) */
2845
2846 else if (output_text != NULL)
2847 {
2848 (void)display_output_text((PCRE2_SPTR)output_text, FALSE,
2849 (PCRE2_SPTR)ptr, offsets, mrc);
2850 fprintf(stdout, STDOUT_NL);
2851 }
2852
2853 /* Handle --only-matching, which may occur many times */
2854
2855 else
2856 {
2857 BOOL printed = FALSE;
2858 omstr *om;
2859
2860 for (om = only_matching; om != NULL; om = om->next)
2861 {
2862 int n = om->groupnum;
2863 if (n == 0 || n < mrc)
2864 {
2865 int plen = offsets[2*n + 1] - offsets[2*n];
2866 if (plen > 0)
2867 {
2868 if (printed && om_separator != NULL)
2869 fprintf(stdout, "%s", om_separator);
2870 print_match(ptr + offsets[n*2], plen);
2871 printed = TRUE;
2872 }
2873 }
2874 }
2875 if (printed || printname != NULL || number)
2876 fprintf(stdout, STDOUT_NL);
2877 }
2878
2879 /* Prepare to repeat to find the next match in the line. */
2880
2881 //match = FALSE;
2882 if (line_buffered) fflush(stdout);
2883 rc = 0; /* Had some success */
2884
2885 /* If the pattern contained a lookbehind that included \K, it is
2886 possible that the end of the match might be at or before the actual
2887 starting offset we have just used. In this case, start one character
2888 further on. */
2889
2890 startoffset = offsets[1]; /* Restart after the match */
2891 oldstartoffset = pcre2_get_startchar(match_data);
2892 if (startoffset <= oldstartoffset)
2893 {
2894 if (startoffset >= length) goto END_ONE_MATCH; /* Were at end */
2895 startoffset = oldstartoffset + 1;
2896 if (utf) while ((ptr[startoffset] & 0xc0) == 0x80) startoffset++;
2897 }
2898
2899 /* If the current match ended past the end of the line (only possible
2900 in multiline mode), we must move on to the line in which it did end
2901 before searching for more matches. */
2902
2903 while (startoffset > linelength)
2904 {
2905 ptr += linelength + endlinelength;
2906 filepos += (int)(linelength + endlinelength);
2907 linenumber++;
2908 startoffset -= (int)(linelength + endlinelength);
2909 t = end_of_line(ptr, endptr, &endlinelength);
2910 linelength = t - ptr - endlinelength;
2911 length = (PCRE2_SIZE)(endptr - ptr);
2912 }
2913
2914 goto ONLY_MATCHING_RESTART;
2915 }
2916 }
2917
2918 /* This is the default case when none of the above options is set. We print
2919 the matching lines(s), possibly preceded and/or followed by other lines of
2920 context. */
2921
2922 else
2923 {
2924 lines_printed = TRUE;
2925
2926 /* See if there is a requirement to print some "after" lines from a
2927 previous match. We never print any overlaps. */
2928
2929 if (after_context > 0 && lastmatchnumber > 0)
2930 {
2931 int ellength;
2932 int linecount = 0;
2933 char *p = lastmatchrestart;
2934
2935 while (p < ptr && linecount < after_context)
2936 {
2937 p = end_of_line(p, ptr, &ellength);
2938 linecount++;
2939 }
2940
2941 /* It is important to advance lastmatchrestart during this printing so
2942 that it interacts correctly with any "before" printing below. Print
2943 each line's data using fwrite() in case there are binary zeroes. */
2944
2945 while (lastmatchrestart < p)
2946 {
2947 char *pp = lastmatchrestart;
2948 if (printname != NULL) fprintf(stdout, "%s%c", printname,
2949 printname_hyphen);
2950 if (number) fprintf(stdout, "%lu-", lastmatchnumber++);
2951 pp = end_of_line(pp, endptr, &ellength);
2952 FWRITE_IGNORE(lastmatchrestart, 1, pp - lastmatchrestart, stdout);
2953 lastmatchrestart = pp;
2954 }
2955 if (lastmatchrestart != ptr) hyphenpending = TRUE;
2956 }
2957
2958 /* If there were non-contiguous lines printed above, insert hyphens. */
2959
2960 if (hyphenpending)
2961 {
2962 if (group_separator != NULL)
2963 fprintf(stdout, "%s%s", group_separator, STDOUT_NL);
2964 hyphenpending = FALSE;
2965 hyphenprinted = TRUE;
2966 }
2967
2968 /* See if there is a requirement to print some "before" lines for this
2969 match. Again, don't print overlaps. */
2970
2971 if (before_context > 0)
2972 {
2973 int linecount = 0;
2974 char *p = ptr;
2975
2976 while (p > main_buffer &&
2977 (lastmatchnumber == 0 || p > lastmatchrestart) &&
2978 linecount < before_context)
2979 {
2980 linecount++;
2981 p = previous_line(p, main_buffer);
2982 }
2983
2984 if (lastmatchnumber > 0 && p > lastmatchrestart && !hyphenprinted &&
2985 group_separator != NULL)
2986 fprintf(stdout, "%s%s", group_separator, STDOUT_NL);
2987
2988 while (p < ptr)
2989 {
2990 int ellength;
2991 char *pp = p;
2992 if (printname != NULL) fprintf(stdout, "%s%c", printname,
2993 printname_hyphen);
2994 if (number) fprintf(stdout, "%lu-", linenumber - linecount--);
2995 pp = end_of_line(pp, endptr, &ellength);
2996 FWRITE_IGNORE(p, 1, pp - p, stdout);
2997 p = pp;
2998 }
2999 }
3000
3001 /* Now print the matching line(s); ensure we set hyphenpending at the end
3002 of the file if any context lines are being output. */
3003
3004 if (after_context > 0 || before_context > 0)
3005 endhyphenpending = TRUE;
3006
3007 if (printname != NULL) fprintf(stdout, "%s%c", printname,
3008 printname_colon);
3009 if (number) fprintf(stdout, "%lu:", linenumber);
3010
3011 /* In multiline mode, or if colouring, we have to split the line(s) up
3012 and search for further matches, but not of course if the line is a
3013 non-match. In multiline mode this is necessary in case there is another
3014 match that spans the end of the current line. When colouring we want to
3015 colour all matches. */
3016
3017 if ((multiline || do_colour) && !invert)
3018 {
3019 int plength;
3020 PCRE2_SIZE endprevious;
3021
3022 /* The use of \K may make the end offset earlier than the start. In
3023 this situation, swap them round. */
3024
3025 if (offsets[0] > offsets[1])
3026 {
3027 PCRE2_SIZE temp = offsets[0];
3028 offsets[0] = offsets[1];
3029 offsets[1] = temp;
3030 }
3031
3032 FWRITE_IGNORE(ptr, 1, offsets[0], stdout);
3033 print_match(ptr + offsets[0], offsets[1] - offsets[0]);
3034
3035 for (;;)
3036 {
3037 PCRE2_SIZE oldstartoffset = pcre2_get_startchar(match_data);
3038
3039 endprevious = offsets[1];
3040 startoffset = endprevious; /* Advance after previous match. */
3041
3042 /* If the pattern contained a lookbehind that included \K, it is
3043 possible that the end of the match might be at or before the actual
3044 starting offset we have just used. In this case, start one character
3045 further on. */
3046
3047 if (startoffset <= oldstartoffset)
3048 {
3049 startoffset = oldstartoffset + 1;
3050 if (utf) while ((ptr[startoffset] & 0xc0) == 0x80) startoffset++;
3051 }
3052
3053 /* If the current match ended past the end of the line (only possible
3054 in multiline mode), we must move on to the line in which it did end
3055 before searching for more matches. Because the PCRE2_FIRSTLINE option
3056 is set, the start of the match will always be before the first
3057 newline sequence. */
3058
3059 while (startoffset > linelength + endlinelength)
3060 {
3061 ptr += linelength + endlinelength;
3062 filepos += (int)(linelength + endlinelength);
3063 linenumber++;
3064 startoffset -= (int)(linelength + endlinelength);
3065 endprevious -= (int)(linelength + endlinelength);
3066 t = end_of_line(ptr, endptr, &endlinelength);
3067 linelength = t - ptr - endlinelength;
3068 length = (PCRE2_SIZE)(endptr - ptr);
3069 }
3070
3071 /* If startoffset is at the exact end of the line it means this
3072 complete line was the final part of the match, so there is nothing
3073 more to do. */
3074
3075 if (startoffset == linelength + endlinelength) break;
3076
3077 /* Otherwise, run a match from within the final line, and if found,
3078 loop for any that may follow. */
3079
3080 if (!match_patterns(ptr, length, options, startoffset, &mrc)) break;
3081
3082 /* The use of \K may make the end offset earlier than the start. In
3083 this situation, swap them round. */
3084
3085 if (offsets[0] > offsets[1])
3086 {
3087 PCRE2_SIZE temp = offsets[0];
3088 offsets[0] = offsets[1];
3089 offsets[1] = temp;
3090 }
3091
3092 FWRITE_IGNORE(ptr + endprevious, 1, offsets[0] - endprevious, stdout);
3093 print_match(ptr + offsets[0], offsets[1] - offsets[0]);
3094 }
3095
3096 /* In multiline mode, we may have already printed the complete line
3097 and its line-ending characters (if they matched the pattern), so there
3098 may be no more to print. */
3099
3100 plength = (int)((linelength + endlinelength) - endprevious);
3101 if (plength > 0) FWRITE_IGNORE(ptr + endprevious, 1, plength, stdout);
3102 }
3103
3104 /* Not colouring or multiline; no need to search for further matches. */
3105
3106 else FWRITE_IGNORE(ptr, 1, linelength + endlinelength, stdout);
3107 }
3108
3109 /* End of doing what has to be done for a match. If --line-buffered was
3110 given, flush the output. */
3111
3112 if (line_buffered) fflush(stdout);
3113 rc = 0; /* Had some success */
3114
3115 /* Remember where the last match happened for after_context. We remember
3116 where we are about to restart, and that line's number. */
3117
3118 lastmatchrestart = ptr + linelength + endlinelength;
3119 lastmatchnumber = linenumber + 1;
3120
3121 /* If a line was printed and we are now at the end of the file and the last
3122 line had no newline, output one. */
3123
3124 if (lines_printed && lastmatchrestart >= endptr && endlinelength == 0)
3125 write_final_newline();
3126 }
3127
3128 /* For a match in multiline inverted mode (which of course did not cause
3129 anything to be printed), we have to move on to the end of the match before
3130 proceeding. */
3131
3132 if (multiline && invert && match)
3133 {
3134 int ellength;
3135 char *endmatch = ptr + offsets[1];
3136 t = ptr;
3137 while (t < endmatch)
3138 {
3139 t = end_of_line(t, endptr, &ellength);
3140 if (t <= endmatch) linenumber++; else break;
3141 }
3142 endmatch = end_of_line(endmatch, endptr, &ellength);
3143 linelength = endmatch - ptr - ellength;
3144 }
3145
3146 /* Advance to after the newline and increment the line number. The file
3147 offset to the current line is maintained in filepos. */
3148
3149 END_ONE_MATCH:
3150 ptr += linelength + endlinelength;
3151 filepos += (int)(linelength + endlinelength);
3152 linenumber++;
3153
3154 /* If there was at least one match (or a non-match, as required) in the line,
3155 increment the count for the -m option. */
3156
3157 if (line_matched) count_matched_lines++;
3158
3159 /* If input is line buffered, and the buffer is not yet full, read another
3160 line and add it into the buffer. */
3161
3162 if (input_line_buffered && bufflength < (PCRE2_SIZE)bufsize)
3163 {
3164 PCRE2_SIZE add = read_one_line(ptr, bufsize - (ptr - main_buffer), in);
3165 bufflength += add;
3166 endptr += add;
3167 }
3168
3169 /* If we haven't yet reached the end of the file (the buffer is full), and
3170 the current point is in the top 1/3 of the buffer, slide the buffer down by
3171 1/3 and refill it. Before we do this, if some unprinted "after" lines are
3172 about to be lost, print them. */
3173
3174 if (bufflength >= (PCRE2_SIZE)bufsize && ptr > main_buffer + 2*bufthird)
3175 {
3176 if (after_context > 0 &&
3177 lastmatchnumber > 0 &&
3178 lastmatchrestart < main_buffer + bufthird)
3179 {
3180 do_after_lines(lastmatchnumber, lastmatchrestart, endptr, printname);
3181 lastmatchnumber = 0; /* Indicates no after lines pending */
3182 }
3183
3184 /* Now do the shuffle */
3185
3186 (void)memmove(main_buffer, main_buffer + bufthird, 2*bufthird);
3187 ptr -= bufthird;
3188
3189 bufflength = 2*bufthird + fill_buffer(handle, frtype,
3190 main_buffer + 2*bufthird, bufthird, input_line_buffered);
3191 endptr = main_buffer + bufflength;
3192
3193 /* Adjust any last match point */
3194
3195 if (lastmatchnumber > 0) lastmatchrestart -= bufthird;
3196 }
3197 } /* Loop through the whole file */
3198
3199 /* End of file; print final "after" lines if wanted; do_after_lines sets
3200 hyphenpending if it prints something. */
3201
3202 if (only_matching_count == 0 && !(count_only|show_total_count))
3203 {
3204 do_after_lines(lastmatchnumber, lastmatchrestart, endptr, printname);
3205 hyphenpending |= endhyphenpending;
3206 }
3207
3208 /* Print the file name if we are looking for those without matches and there
3209 were none. If we found a match, we won't have got this far. */
3210
3211 if (filenames == FN_NOMATCH_ONLY)
3212 {
3213 fprintf(stdout, "%s", printname);
3214 if (printname_nl == NULL) fprintf(stdout, "%c", 0);
3215 else fprintf(stdout, "%s", printname_nl);
3216 return 0;
3217 }
3218
3219 /* Print the match count if wanted */
3220
3221 if (count_only && !quiet)
3222 {
3223 if (count > 0 || !omit_zero_count)
3224 {
3225 if (printname != NULL && filenames != FN_NONE)
3226 fprintf(stdout, "%s%c", printname, printname_colon);
3227 fprintf(stdout, "%lu" STDOUT_NL, count);
3228 counts_printed++;
3229 }
3230 }
3231
3232 total_count += count; /* Can be set without count_only */
3233 return rc;
3234 }
3235
3236
3237
3238 /*************************************************
3239 * Grep a file or recurse into a directory *
3240 *************************************************/
3241
3242 /* Given a path name, if it's a directory, scan all the files if we are
3243 recursing; if it's a file, grep it.
3244
3245 Arguments:
3246 pathname the path to investigate
3247 dir_recurse TRUE if recursing is wanted (-r or -drecurse)
3248 only_one_at_top TRUE if the path is the only one at toplevel
3249
3250 Returns: -1 the file/directory was skipped
3251 0 if there was at least one match
3252 1 if there were no matches
3253 2 there was some kind of error
3254
3255 However, file opening failures are suppressed if "silent" is set.
3256 */
3257
3258 static int
grep_or_recurse(char * pathname,BOOL dir_recurse,BOOL only_one_at_top)3259 grep_or_recurse(char *pathname, BOOL dir_recurse, BOOL only_one_at_top)
3260 {
3261 int rc = 1;
3262 int frtype;
3263 void *handle;
3264 char *lastcomp;
3265 FILE *in = NULL; /* Ensure initialized */
3266
3267 #ifdef SUPPORT_LIBZ
3268 gzFile ingz = NULL;
3269 #endif
3270
3271 #ifdef SUPPORT_LIBBZ2
3272 BZFILE *inbz2 = NULL;
3273 #endif
3274
3275 #if defined SUPPORT_LIBZ || defined SUPPORT_LIBBZ2
3276 int pathlen;
3277 #endif
3278
3279 #if defined NATIVE_ZOS
3280 int zos_type;
3281 FILE *zos_test_file;
3282 #endif
3283
3284 /* If the file name is "-" we scan stdin */
3285
3286 if (strcmp(pathname, "-") == 0)
3287 {
3288 if (count_limit >= 0) setbuf(stdin, NULL);
3289 return pcre2grep(stdin, FR_PLAIN, stdin_name,
3290 (filenames > FN_DEFAULT || (filenames == FN_DEFAULT && !only_one_at_top))?
3291 stdin_name : NULL);
3292 }
3293
3294 /* Inclusion and exclusion: --include-dir and --exclude-dir apply only to
3295 directories, whereas --include and --exclude apply to everything else. The test
3296 is against the final component of the path. */
3297
3298 lastcomp = strrchr(pathname, FILESEP);
3299 lastcomp = (lastcomp == NULL)? pathname : lastcomp + 1;
3300
3301 /* If the file is a directory, skip if not recursing or if explicitly excluded.
3302 Otherwise, scan the directory and recurse for each path within it. The scanning
3303 code is localized so it can be made system-specific. */
3304
3305
3306 /* For z/OS, determine the file type. */
3307
3308 #if defined NATIVE_ZOS
3309 zos_test_file = fopen(pathname,"rb");
3310
3311 if (zos_test_file == NULL)
3312 {
3313 if (!silent) fprintf(stderr, "pcre2grep: failed to test next file %s\n",
3314 pathname, strerror(errno));
3315 return -1;
3316 }
3317 zos_type = identifyzosfiletype (zos_test_file);
3318 fclose (zos_test_file);
3319
3320 /* Handle a PDS in separate code */
3321
3322 if (zos_type == __ZOS_PDS || zos_type == __ZOS_PDSE)
3323 {
3324 return travelonpdsdir (pathname, only_one_at_top);
3325 }
3326
3327 /* Deal with regular files in the normal way below. These types are:
3328 zos_type == __ZOS_PDS_MEMBER
3329 zos_type == __ZOS_PS
3330 zos_type == __ZOS_VSAM_KSDS
3331 zos_type == __ZOS_VSAM_ESDS
3332 zos_type == __ZOS_VSAM_RRDS
3333 */
3334
3335 /* Handle a z/OS directory using common code. */
3336
3337 else if (zos_type == __ZOS_HFS)
3338 {
3339 #endif /* NATIVE_ZOS */
3340
3341
3342 /* Handle directories: common code for all OS */
3343
3344 if (isdirectory(pathname))
3345 {
3346 if (dee_action == dee_SKIP ||
3347 !test_incexc(lastcomp, include_dir_patterns, exclude_dir_patterns))
3348 return -1;
3349
3350 if (dee_action == dee_RECURSE)
3351 {
3352 char childpath[FNBUFSIZ];
3353 char *nextfile;
3354 directory_type *dir = opendirectory(pathname);
3355
3356 if (dir == NULL)
3357 {
3358 /* LCOV_EXCL_START - this is a "never" event */
3359 if (!silent)
3360 fprintf(stderr, "pcre2grep: Failed to open directory %s: %s\n", pathname,
3361 strerror(errno));
3362 return 2;
3363 /* LCOV_EXCL_STOP */
3364 }
3365
3366 while ((nextfile = readdirectory(dir)) != NULL)
3367 {
3368 int frc;
3369 int fnlength = strlen(pathname) + strlen(nextfile) + 2;
3370 if (fnlength > FNBUFSIZ)
3371 {
3372 /* LCOV_EXCL_START - this is a "never" event */
3373 fprintf(stderr, "pcre2grep: recursive filename is too long\n");
3374 rc = 2;
3375 break;
3376 /* LCOV_EXCL_STOP */
3377 }
3378 sprintf(childpath, "%s%c%s", pathname, FILESEP, nextfile);
3379
3380 /* If the realpath() function is available, we can try to prevent endless
3381 recursion caused by a symlink pointing to a parent directory (GitHub
3382 issue #2 (old Bugzilla #2794). Original patch from Thomas Tempelmann.
3383 Modified to avoid using strlcat() because that isn't a standard C
3384 function, and also modified not to copy back the fully resolved path,
3385 because that affects the output from pcre2grep. */
3386
3387 #ifdef HAVE_REALPATH
3388 {
3389 char resolvedpath[PATH_MAX];
3390 BOOL isSame;
3391 size_t rlen;
3392 if (realpath(childpath, resolvedpath) == NULL)
3393 /* LCOV_EXCL_START - this is a "never" event */
3394 continue; /* This path is invalid - we can skip processing this */
3395 /* LCOV_EXCL_STOP */
3396 isSame = strcmp(pathname, resolvedpath) == 0;
3397 if (isSame) continue; /* We have a recursion */
3398 rlen = strlen(resolvedpath);
3399 if (rlen++ < sizeof(resolvedpath) - 3)
3400 {
3401 BOOL contained;
3402 strcat(resolvedpath, "/");
3403 contained = strncmp(pathname, resolvedpath, rlen) == 0;
3404 if (contained) continue; /* We have a recursion */
3405 }
3406 }
3407 #endif /* HAVE_REALPATH */
3408
3409 frc = grep_or_recurse(childpath, dir_recurse, FALSE);
3410 if (frc > 1) rc = frc;
3411 else if (frc == 0 && rc == 1) rc = 0;
3412 }
3413
3414 closedirectory(dir);
3415 return rc;
3416 }
3417 }
3418
3419 #ifdef WIN32
3420 if (iswild(pathname))
3421 {
3422 char buffer[1024];
3423 char *nextfile;
3424 char *name;
3425 directory_type *dir = opendirectory(pathname);
3426
3427 if (dir == NULL)
3428 return 0;
3429
3430 for (nextfile = name = pathname; *nextfile != 0; nextfile++)
3431 if (*nextfile == '/' || *nextfile == '\\')
3432 name = nextfile + 1;
3433 *name = 0;
3434
3435 while ((nextfile = readdirectory(dir)) != NULL)
3436 {
3437 int frc;
3438 sprintf(buffer, "%.512s%.128s", pathname, nextfile);
3439 frc = grep_or_recurse(buffer, dir_recurse, FALSE);
3440 if (frc > 1) rc = frc;
3441 else if (frc == 0 && rc == 1) rc = 0;
3442 }
3443
3444 closedirectory(dir);
3445 return rc;
3446 }
3447 #endif
3448
3449 #if defined NATIVE_ZOS
3450 }
3451 #endif
3452
3453 /* If the file is not a directory, check for a regular file, and if it is not,
3454 skip it if that's been requested. Otherwise, check for an explicit inclusion or
3455 exclusion. */
3456
3457 else if (
3458 #if defined NATIVE_ZOS
3459 (zos_type == __ZOS_NOFILE && DEE_action == DEE_SKIP) ||
3460 #else /* all other OS */
3461 (!isregfile(pathname) && DEE_action == DEE_SKIP) ||
3462 #endif
3463 !test_incexc(lastcomp, include_patterns, exclude_patterns))
3464 return -1; /* File skipped */
3465
3466 /* Control reaches here if we have a regular file, or if we have a directory
3467 and recursion or skipping was not requested, or if we have anything else and
3468 skipping was not requested. The scan proceeds. If this is the first and only
3469 argument at top level, we don't show the file name, unless we are only showing
3470 the file name, or the filename was forced (-H). */
3471
3472 #if defined SUPPORT_LIBZ || defined SUPPORT_LIBBZ2
3473 pathlen = (int)(strlen(pathname));
3474 #endif
3475
3476 /* Open using zlib if it is supported and the file name ends with .gz. */
3477
3478 #ifdef SUPPORT_LIBZ
3479 if (pathlen > 3 && strcmp(pathname + pathlen - 3, ".gz") == 0)
3480 {
3481 ingz = gzopen(pathname, "rb");
3482 if (ingz == NULL)
3483 {
3484 /* LCOV_EXCL_START */
3485 if (!silent)
3486 fprintf(stderr, "pcre2grep: Failed to open %s: %s\n", pathname,
3487 strerror(errno));
3488 return 2;
3489 /* LCOV_EXCL_STOP */
3490 }
3491 handle = (void *)ingz;
3492 frtype = FR_LIBZ;
3493 }
3494 else
3495 #endif
3496
3497 /* Otherwise open with bz2lib if it is supported and the name ends with .bz2. */
3498
3499 #ifdef SUPPORT_LIBBZ2
3500 if (pathlen > 4 && strcmp(pathname + pathlen - 4, ".bz2") == 0)
3501 {
3502 inbz2 = BZ2_bzopen(pathname, "rb");
3503 handle = (void *)inbz2;
3504 frtype = FR_LIBBZ2;
3505 }
3506 else
3507 #endif
3508
3509 /* Otherwise use plain fopen(). The label is so that we can come back here if
3510 an attempt to read a .bz2 file indicates that it really is a plain file. */
3511
3512 #ifdef SUPPORT_LIBBZ2
3513 PLAIN_FILE:
3514 #endif
3515 {
3516 in = fopen(pathname, "rb");
3517 handle = (void *)in;
3518 frtype = FR_PLAIN;
3519 }
3520
3521 /* All the opening methods return errno when they fail. */
3522
3523 if (handle == NULL)
3524 {
3525 if (!silent)
3526 fprintf(stderr, "pcre2grep: Failed to open %s: %s\n", pathname,
3527 strerror(errno));
3528 return 2;
3529 }
3530
3531 /* Now grep the file */
3532
3533 rc = pcre2grep(handle, frtype, pathname, (filenames > FN_DEFAULT ||
3534 (filenames == FN_DEFAULT && !only_one_at_top))? pathname : NULL);
3535
3536 /* Close in an appropriate manner. */
3537
3538 #ifdef SUPPORT_LIBZ
3539 if (frtype == FR_LIBZ)
3540 gzclose(ingz);
3541 else
3542 #endif
3543
3544 /* If it is a .bz2 file and the result is 3, it means that the first attempt to
3545 read failed. If the error indicates that the file isn't in fact bzipped, try
3546 again as a normal file. */
3547
3548 #ifdef SUPPORT_LIBBZ2
3549 if (frtype == FR_LIBBZ2)
3550 {
3551 if (rc == 3)
3552 {
3553 int errnum;
3554 const char *err = BZ2_bzerror(inbz2, &errnum);
3555 if (errnum == BZ_DATA_ERROR_MAGIC)
3556 {
3557 BZ2_bzclose(inbz2);
3558 goto PLAIN_FILE;
3559 }
3560 /* LCOV_EXCL_START */
3561 else if (!silent)
3562 fprintf(stderr, "pcre2grep: Failed to read %s using bzlib: %s\n",
3563 pathname, err);
3564 rc = 2; /* The normal "something went wrong" code */
3565 /* LCOV_EXCL_STOP */
3566 }
3567 BZ2_bzclose(inbz2);
3568 }
3569 else
3570 #endif
3571
3572 /* Normal file close */
3573
3574 fclose(in);
3575
3576 /* Pass back the yield from pcre2grep(). */
3577
3578 return rc;
3579 }
3580
3581
3582
3583 /*************************************************
3584 * Handle a no-data option *
3585 *************************************************/
3586
3587 /* This is called when a known option has been identified. */
3588
3589 static int
handle_option(int letter,int options)3590 handle_option(int letter, int options)
3591 {
3592 switch(letter)
3593 {
3594 case N_FOFFSETS: file_offsets = TRUE; break;
3595 case N_HELP: help(); pcre2grep_exit(0); break; /* Stops compiler warning */
3596 case N_LBUFFER: line_buffered = TRUE; break;
3597 case N_LOFFSETS: line_offsets = number = TRUE; break;
3598 case N_NOJIT: use_jit = FALSE; break;
3599 case N_ALLABSK: extra_options |= PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK; break;
3600 case N_NO_GROUP_SEPARATOR: group_separator = NULL; break;
3601 case 'a': binary_files = BIN_TEXT; break;
3602 case 'c': count_only = TRUE; break;
3603 case N_POSIX_DIGIT: posix_digit = TRUE; break;
3604 case 'E': case_restrict = TRUE; break;
3605 case 'F': options |= PCRE2_LITERAL; break;
3606 case 'H': filenames = FN_FORCE; break;
3607 case 'I': binary_files = BIN_NOMATCH; break;
3608 case 'h': filenames = FN_NONE; break;
3609 case 'i': options |= PCRE2_CASELESS; break;
3610 case 'l': omit_zero_count = TRUE; filenames = FN_MATCH_ONLY; break;
3611 case 'L': filenames = FN_NOMATCH_ONLY; break;
3612 case 'M': multiline = TRUE; options |= PCRE2_MULTILINE|PCRE2_FIRSTLINE; break;
3613 case 'n': number = TRUE; break;
3614
3615 case 'o':
3616 only_matching_last = add_number(0, only_matching_last);
3617 if (only_matching == NULL) only_matching = only_matching_last;
3618 break;
3619
3620 case 'P': no_ucp = TRUE; break;
3621 case 'q': quiet = TRUE; break;
3622 case 'r': dee_action = dee_RECURSE; break;
3623 case 's': silent = TRUE; break;
3624 case 't': show_total_count = TRUE; break;
3625 case 'u': options |= PCRE2_UTF | PCRE2_UCP; utf = TRUE; break;
3626 case 'U': options |= PCRE2_UTF | PCRE2_MATCH_INVALID_UTF | PCRE2_UCP;
3627 utf = TRUE; break;
3628 case 'v': invert = TRUE; break;
3629
3630 case 'V':
3631 {
3632 unsigned char buffer[128];
3633 (void)pcre2_config(PCRE2_CONFIG_VERSION, buffer);
3634 fprintf(stdout, "pcre2grep version %s" STDOUT_NL, buffer);
3635 }
3636 pcre2grep_exit(0);
3637 break; /* LCOV_EXCL_LINE - statement kept to avoid compiler warning */
3638
3639 case 'w': extra_options |= PCRE2_EXTRA_MATCH_WORD; break;
3640 case 'x': extra_options |= PCRE2_EXTRA_MATCH_LINE; break;
3641 case 'Z': printname_colon = printname_hyphen = 0; printname_nl = NULL; break;
3642
3643 /* LCOV_EXCL_START - this is a "never event" */
3644 default:
3645 fprintf(stderr, "pcre2grep: Unknown option -%c\n", letter);
3646 pcre2grep_exit(usage(2));
3647 /* LCOV_EXCL_STOP */
3648 }
3649
3650 return options;
3651 }
3652
3653
3654
3655 /*************************************************
3656 * Construct printed ordinal *
3657 *************************************************/
3658
3659 /* This turns a number into "1st", "3rd", etc. */
3660
static char *
ordin(int n)
{
/* The result lives in a static buffer, so it is overwritten by the next call.
Fourteen bytes hold the longest decimal int, a two-letter suffix, and the
terminating zero. */

static char buffer[14];
const char *suffix = "th";
int digits = sprintf(buffer, "%d", n);
int lasttwo = n % 100;

/* 11, 12, and 13 take "th"; otherwise the final digit picks the suffix. */

if (lasttwo < 11 || lasttwo > 13)
  {
  int lastone = lasttwo % 10;
  if (lastone == 1) suffix = "st";
  else if (lastone == 2) suffix = "nd";
  else if (lastone == 3) suffix = "rd";
  }

strcpy(buffer + digits, suffix);
return buffer;
}
3679
3680
3681
3682 /*************************************************
3683 * Compile a single pattern *
3684 *************************************************/
3685
3686 /* Do nothing if the pattern has already been compiled. This is the case for
3687 include/exclude patterns read from a file.
3688
3689 When the -F option has been used, each "pattern" may be a list of strings,
3690 separated by line breaks. They will be matched literally. We split such a
3691 string and compile the first substring, inserting an additional block into the
3692 pattern chain.
3693
3694 Arguments:
3695 p points to the pattern block
3696 options the PCRE options
3697 fromfile TRUE if the pattern was read from a file
3698 fromtext file name or identifying text (e.g. "include")
3699 count 0 if this is the only command line pattern, or
3700 number of the command line pattern, or
3701 linenumber for a pattern from a file
3702
3703 Returns: TRUE on success, FALSE after an error
3704 */
3705
3706 static BOOL
compile_pattern(patstr * p,int options,int fromfile,const char * fromtext,int count)3707 compile_pattern(patstr *p, int options, int fromfile, const char *fromtext,
3708 int count)
3709 {
3710 char *ps;
3711 int errcode;
3712 PCRE2_SIZE patlen, erroffset;
3713 PCRE2_UCHAR errmessbuffer[ERRBUFSIZ];
3714
3715 if (p->compiled != NULL) return TRUE;
3716 ps = p->string;
3717 patlen = p->length;
3718
3719 if ((options & PCRE2_LITERAL) != 0)
3720 {
3721 int ellength;
3722 char *eop = ps + patlen;
3723 char *pe = end_of_line(ps, eop, &ellength);
3724
3725 if (ellength != 0)
3726 {
3727 patlen = pe - ps - ellength;
3728 if (add_pattern(pe, p->length-patlen-ellength, p) == NULL) return FALSE;
3729 }
3730 }
3731
3732 p->compiled = pcre2_compile((PCRE2_SPTR)ps, patlen, options, &errcode,
3733 &erroffset, compile_context);
3734
3735 /* Handle successful compile. Try JIT-compiling if supported and enabled. We
3736 ignore any JIT compiler errors, relying falling back to interpreting if
3737 anything goes wrong with JIT. */
3738
3739 if (p->compiled != NULL)
3740 {
3741 #ifdef SUPPORT_PCRE2GREP_JIT
3742 if (use_jit) (void)pcre2_jit_compile(p->compiled, PCRE2_JIT_COMPLETE);
3743 #endif
3744 return TRUE;
3745 }
3746
3747 /* Handle compile errors */
3748
3749 if (erroffset > patlen) erroffset = patlen;
3750 pcre2_get_error_message(errcode, errmessbuffer, sizeof(errmessbuffer));
3751
3752 if (fromfile)
3753 {
3754 fprintf(stderr, "pcre2grep: Error in regex in line %d of %s "
3755 "at offset %d: %s\n", count, fromtext, (int)erroffset, errmessbuffer);
3756 }
3757 else
3758 {
3759 if (count == 0)
3760 fprintf(stderr, "pcre2grep: Error in %s regex at offset %d: %s\n",
3761 fromtext, (int)erroffset, errmessbuffer);
3762 else
3763 fprintf(stderr, "pcre2grep: Error in %s %s regex at offset %d: %s\n",
3764 ordin(count), fromtext, (int)erroffset, errmessbuffer);
3765 }
3766
3767 return FALSE;
3768 }
3769
3770
3771
3772 /*************************************************
3773 * Read and compile a file of patterns *
3774 *************************************************/
3775
3776 /* This is used for --filelist, --include-from, and --exclude-from.
3777
3778 Arguments:
3779 name the name of the file; "-" is stdin
3780 patptr pointer to the pattern chain anchor
3781 patlastptr pointer to the last pattern pointer
3782
3783 Returns: TRUE if all went well
3784 */
3785
3786 static BOOL
read_pattern_file(char * name,patstr ** patptr,patstr ** patlastptr)3787 read_pattern_file(char *name, patstr **patptr, patstr **patlastptr)
3788 {
3789 int linenumber = 0;
3790 PCRE2_SIZE patlen;
3791 FILE *f;
3792 const char *filename;
3793 char buffer[MAXPATLEN+20];
3794
3795 if (strcmp(name, "-") == 0)
3796 {
3797 f = stdin;
3798 filename = stdin_name;
3799 }
3800 else
3801 {
3802 f = fopen(name, "r");
3803 if (f == NULL)
3804 {
3805 fprintf(stderr, "pcre2grep: Failed to open %s: %s\n", name, strerror(errno));
3806 return FALSE;
3807 }
3808 filename = name;
3809 }
3810
3811 while ((patlen = read_one_line(buffer, sizeof(buffer), f)) > 0)
3812 {
3813 while (patlen > 0 && isspace((unsigned char)(buffer[patlen-1]))) patlen--;
3814 linenumber++;
3815 if (patlen == 0) continue; /* Skip blank lines */
3816
3817 /* Note: this call to add_pattern() puts a pointer to the local variable
3818 "buffer" into the pattern chain. However, that pointer is used only when
3819 compiling the pattern, which happens immediately below, so we flatten it
3820 afterwards, as a precaution against any later code trying to use it. */
3821
3822 *patlastptr = add_pattern(buffer, patlen, *patlastptr);
3823 if (*patlastptr == NULL)
3824 {
3825 /* LCOV_EXCL_START - won't happen in testing */
3826 if (f != stdin) fclose(f);
3827 return FALSE;
3828 /* LCOV_EXCL_STOP */
3829 }
3830 if (*patptr == NULL) *patptr = *patlastptr;
3831
3832 /* This loop is needed because compiling a "pattern" when -F is set may add
3833 on additional literal patterns if the original contains a newline. In the
3834 common case, it never will, because read_one_line() stops at a newline.
3835 However, the -N option can be used to give pcre2grep a different newline
3836 setting. */
3837
3838 for(;;)
3839 {
3840 if (!compile_pattern(*patlastptr, pcre2_options, TRUE, filename,
3841 linenumber))
3842 {
3843 if (f != stdin) fclose(f);
3844 return FALSE;
3845 }
3846 (*patlastptr)->string = NULL; /* Insurance */
3847 if ((*patlastptr)->next == NULL) break;
3848 *patlastptr = (*patlastptr)->next;
3849 }
3850 }
3851
3852 if (f != stdin) fclose(f);
3853 return TRUE;
3854 }
3855
3856
3857
3858 /*************************************************
3859 * Main program *
3860 *************************************************/
3861
3862 /* Returns 0 if something matched, 1 if nothing matched, 2 after an error. */
3863
3864 int
main(int argc,char ** argv)3865 main(int argc, char **argv)
3866 {
3867 int i, j;
3868 int rc = 1;
3869 BOOL only_one_at_top;
3870 patstr *cp;
3871 fnstr *fn;
3872 omstr *om;
3873 const char *locale_from = "--locale";
3874
3875 #ifdef SUPPORT_PCRE2GREP_JIT
3876 pcre2_jit_stack *jit_stack = NULL;
3877 #endif
3878
3879 /* In Windows, stdout is set up as a text stream, which means that \n is
3880 converted to \r\n. This causes output lines that are copied from the input to
3881 change from ....\r\n to ....\r\r\n, which is not right. We therefore ensure
3882 that stdout is a binary stream. Note that this means all other output to stdout
3883 must use STDOUT_NL to terminate lines. */
3884
3885 #ifdef WIN32
3886 _setmode(_fileno(stdout), _O_BINARY);
3887 #endif
3888
3889 /* Process the options */
3890
3891 for (i = 1; i < argc; i++)
3892 {
3893 option_item *op = NULL;
3894 char *option_data = (char *)""; /* default to keep compiler happy */
3895 BOOL longop;
3896 BOOL longopwasequals = FALSE;
3897
3898 if (argv[i][0] != '-') break;
3899
3900 /* If we hit an argument that is just "-", it may be a reference to STDIN,
3901 but only if we have previously had -e or -f to define the patterns. */
3902
3903 if (argv[i][1] == 0)
3904 {
3905 if (pattern_files != NULL || patterns != NULL) break;
3906 else pcre2grep_exit(usage(2));
3907 }
3908
3909 /* Handle a long name option, or -- to terminate the options */
3910
3911 if (argv[i][1] == '-')
3912 {
3913 char *arg = argv[i] + 2;
3914 char *argequals = strchr(arg, '=');
3915
3916 if (*arg == 0) /* -- terminates options */
3917 {
3918 i++;
3919 break; /* out of the options-handling loop */
3920 }
3921
3922 longop = TRUE;
3923
3924 /* Some long options have data that follows after =, for example file=name.
3925 Some options have variations in the long name spelling: specifically, we
3926 allow "regexp" because GNU grep allows it, though I personally go along
3927 with Jeffrey Friedl and Larry Wall in preferring "regex" without the "p".
3928 These options are entered in the table as "regex(p)". Options can be in
3929 both these categories. */
3930
3931 for (op = optionlist; op->one_char != 0; op++)
3932 {
3933 char *opbra = strchr(op->long_name, '(');
3934 char *equals = strchr(op->long_name, '=');
3935
3936 /* Handle options with only one spelling of the name */
3937
3938 if (opbra == NULL) /* Does not contain '(' */
3939 {
3940 if (equals == NULL) /* Not thing=data case */
3941 {
3942 if (strcmp(arg, op->long_name) == 0) break;
3943 }
3944 else /* Special case xxx=data */
3945 {
3946 int oplen = (int)(equals - op->long_name);
3947 int arglen = (argequals == NULL)?
3948 (int)strlen(arg) : (int)(argequals - arg);
3949 if (oplen == arglen && strncmp(arg, op->long_name, oplen) == 0)
3950 {
3951 option_data = arg + arglen;
3952 if (*option_data == '=')
3953 {
3954 option_data++;
3955 longopwasequals = TRUE;
3956 }
3957 break;
3958 }
3959 }
3960 }
3961
3962 /* Handle options with an alternate spelling of the name */
3963
3964 else
3965 {
3966 char buff1[24];
3967 char buff2[24];
3968 int ret;
3969
3970 int baselen = (int)(opbra - op->long_name);
3971 int fulllen = (int)(strchr(op->long_name, ')') - op->long_name + 1);
3972 int arglen = (argequals == NULL || equals == NULL)?
3973 (int)strlen(arg) : (int)(argequals - arg);
3974
3975 if ((ret = snprintf(buff1, sizeof(buff1), "%.*s", baselen, op->long_name),
3976 ret < 0 || ret > (int)sizeof(buff1)) ||
3977 (ret = snprintf(buff2, sizeof(buff2), "%s%.*s", buff1,
3978 fulllen - baselen - 2, opbra + 1),
3979 ret < 0 || ret > (int)sizeof(buff2)))
3980 {
3981 /* LCOV_EXCL_START - this is a "never" event */
3982 fprintf(stderr, "pcre2grep: Buffer overflow when parsing %s option\n",
3983 op->long_name);
3984 pcre2grep_exit(2);
3985 /* LCOV_EXCL_STOP */
3986 }
3987
3988 if (strncmp(arg, buff1, arglen) == 0 ||
3989 strncmp(arg, buff2, arglen) == 0)
3990 {
3991 if (equals != NULL && argequals != NULL)
3992 {
3993 option_data = argequals;
3994 if (*option_data == '=')
3995 {
3996 option_data++;
3997 longopwasequals = TRUE;
3998 }
3999 }
4000 break;
4001 }
4002 }
4003 }
4004
4005 if (op->one_char == 0)
4006 {
4007 fprintf(stderr, "pcre2grep: Unknown option %s\n", argv[i]);
4008 pcre2grep_exit(usage(2));
4009 }
4010 }
4011
4012 /* One-char options; many that have no data may be in a single argument; we
4013 continue till we hit the last one or one that needs data. */
4014
4015 else
4016 {
4017 char *s = argv[i] + 1;
4018 longop = FALSE;
4019
4020 while (*s != 0)
4021 {
4022 for (op = optionlist; op->one_char != 0; op++)
4023 {
4024 if (*s == op->one_char) break;
4025 }
4026 if (op->one_char == 0)
4027 {
4028 fprintf(stderr, "pcre2grep: Unknown option letter '%c' in \"%s\"\n",
4029 *s, argv[i]);
4030 pcre2grep_exit(usage(2));
4031 }
4032
4033 option_data = s+1;
4034
4035 /* Break out if this is the last character in the string; it's handled
4036 below like a single multi-char option. */
4037
4038 if (*option_data == 0) break;
4039
4040 /* Check for a single-character option that has data: OP_OP_NUMBER(S)
4041 are used for ones that either have a numerical number or defaults, i.e.
4042 the data is optional. If a digit follows, there is data; if not, carry on
4043 with other single-character options in the same string. */
4044
4045 if (op->type == OP_OP_NUMBER || op->type == OP_OP_NUMBERS)
4046 {
4047 if (isdigit((unsigned char)(s[1]))) break;
4048 }
4049 else /* Check for an option with data */
4050 {
4051 if (op->type != OP_NODATA) break;
4052 }
4053
4054 /* Handle a single-character option with no data, then loop for the
4055 next character in the string. */
4056
4057 pcre2_options = handle_option(*s++, pcre2_options);
4058 }
4059 }
4060
4061 /* At this point we should have op pointing to a matched option. If the type
4062 is NO_DATA, it means that there is no data, and the option might set
4063 something in the PCRE options. */
4064
4065 if (op->type == OP_NODATA)
4066 {
4067 pcre2_options = handle_option(op->one_char, pcre2_options);
4068 continue;
4069 }
4070
4071 /* If the option type is OP_OP_STRING or OP_OP_NUMBER(S), it's an option that
4072 either has a value or defaults to something. It cannot have data in a
4073 separate item. At the moment, the only such options are "colo(u)r",
4074 and "only-matching". */
4075
4076 if (*option_data == 0 &&
4077 (op->type == OP_OP_STRING || op->type == OP_OP_NUMBER ||
4078 op->type == OP_OP_NUMBERS))
4079 {
4080 switch (op->one_char)
4081 {
4082 case N_COLOUR:
4083 colour_option = "auto";
4084 break;
4085
4086 case 'o':
4087 only_matching_last = add_number(0, only_matching_last);
4088 if (only_matching == NULL) only_matching = only_matching_last;
4089 break;
4090 }
4091 continue;
4092 }
4093
4094 /* Otherwise, find the data string for the option. */
4095
4096 if (*option_data == 0)
4097 {
4098 if (i >= argc - 1 || longopwasequals)
4099 {
4100 fprintf(stderr, "pcre2grep: Data missing after %s\n", argv[i]);
4101 pcre2grep_exit(usage(2));
4102 }
4103 option_data = argv[++i];
4104 }
4105
4106 /* If the option type is OP_OP_NUMBERS, the value is a number that is to be
4107 added to a chain of numbers. */
4108
4109 if (op->type == OP_OP_NUMBERS)
4110 {
4111 unsigned long int n = decode_number(option_data, op, longop);
4112 omdatastr *omd = (omdatastr *)op->dataptr;
4113 *(omd->lastptr) = add_number((int)n, *(omd->lastptr));
4114 if (*(omd->anchor) == NULL) *(omd->anchor) = *(omd->lastptr);
4115 }
4116
4117 /* If the option type is OP_PATLIST, it's the -e option, or one of the
4118 include/exclude options, which can be called multiple times to create lists
4119 of patterns. */
4120
4121 else if (op->type == OP_PATLIST)
4122 {
4123 patdatastr *pd = (patdatastr *)op->dataptr;
4124 *(pd->lastptr) = add_pattern(option_data, (PCRE2_SIZE)strlen(option_data),
4125 *(pd->lastptr));
4126 if (*(pd->lastptr) == NULL) goto EXIT2;
4127 if (*(pd->anchor) == NULL) *(pd->anchor) = *(pd->lastptr);
4128 }
4129
4130 /* If the option type is OP_FILELIST, it's one of the options that names a
4131 file. */
4132
4133 else if (op->type == OP_FILELIST)
4134 {
4135 fndatastr *fd = (fndatastr *)op->dataptr;
4136 fn = (fnstr *)malloc(sizeof(fnstr));
4137 if (fn == NULL)
4138 {
4139 /* LCOV_EXCL_START */
4140 fprintf(stderr, "pcre2grep: malloc failed\n");
4141 goto EXIT2;
4142 /* LCOV_EXCL_STOP */
4143 }
4144 fn->next = NULL;
4145 fn->name = option_data;
4146 if (*(fd->anchor) == NULL)
4147 *(fd->anchor) = fn;
4148 else
4149 (*(fd->lastptr))->next = fn;
4150 *(fd->lastptr) = fn;
4151 }
4152
4153 /* Handle OP_BINFILES */
4154
4155 else if (op->type == OP_BINFILES)
4156 {
4157 if (strcmp(option_data, "binary") == 0)
4158 binary_files = BIN_BINARY;
4159 else if (strcmp(option_data, "without-match") == 0)
4160 binary_files = BIN_NOMATCH;
4161 else if (strcmp(option_data, "text") == 0)
4162 binary_files = BIN_TEXT;
4163 else
4164 {
4165 fprintf(stderr, "pcre2grep: unknown value \"%s\" for binary-files\n",
4166 option_data);
4167 pcre2grep_exit(usage(2));
4168 }
4169 }
4170
4171 /* Otherwise, deal with a single string or numeric data value. */
4172
4173 else if (op->type != OP_NUMBER && op->type != OP_U32NUMBER &&
4174 op->type != OP_OP_NUMBER && op->type != OP_SIZE)
4175 {
4176 *((char **)op->dataptr) = option_data;
4177 }
4178 else
4179 {
4180 unsigned long int n = decode_number(option_data, op, longop);
4181 if (op->type == OP_U32NUMBER) *((uint32_t *)op->dataptr) = n;
4182 else if (op->type == OP_SIZE) *((PCRE2_SIZE *)op->dataptr) = n;
4183 else *((int *)op->dataptr) = n;
4184 }
4185 }
4186
4187 /* Options have been decoded. If -C was used, its value is used as a default
4188 for -A and -B. */
4189
4190 if (both_context > 0)
4191 {
4192 if (after_context == 0) after_context = both_context;
4193 if (before_context == 0) before_context = both_context;
4194 }
4195
4196 /* Only one of --only-matching, --output, --file-offsets, or --line-offsets is
4197 permitted. They display, each in their own way, only the data that has matched.
4198 */
4199
4200 only_matching_count = (only_matching != NULL) + (output_text != NULL) +
4201 file_offsets + line_offsets;
4202
4203 if (only_matching_count > 1)
4204 {
4205 fprintf(stderr, "pcre2grep: Cannot mix --only-matching, --output, "
4206 "--file-offsets and/or --line-offsets\n");
4207 pcre2grep_exit(usage(2));
4208 }
4209
4210 /* Check that there is a big enough ovector for all -o settings. */
4211
4212 for (om = only_matching; om != NULL; om = om->next)
4213 {
4214 int n = om->groupnum;
4215 if (n > (int)capture_max)
4216 {
4217 fprintf(stderr, "pcre2grep: Requested group %d cannot be captured.\n", n);
4218 fprintf(stderr, "pcre2grep: Use --om-capture to increase the size of the capture vector.\n");
4219 goto EXIT2;
4220 }
4221 }
4222
4223 /* Check the text supplied to --output for errors. */
4224
4225 if (output_text != NULL &&
4226 !syntax_check_output_text((PCRE2_SPTR)output_text, FALSE))
4227 goto EXIT2;
4228
4229 /* Set up default compile and match contexts and match data blocks. */
4230
4231 offset_size = capture_max + 1;
4232 compile_context = pcre2_compile_context_create(NULL);
4233 match_context = pcre2_match_context_create(NULL);
4234 match_data_pair[0] = pcre2_match_data_create(offset_size, NULL);
4235 match_data_pair[1] = pcre2_match_data_create(offset_size, NULL);
4236 offsets_pair[0] = pcre2_get_ovector_pointer(match_data_pair[0]);
4237 offsets_pair[1] = pcre2_get_ovector_pointer(match_data_pair[1]);
4238 match_data = match_data_pair[0];
4239 offsets = offsets_pair[0];
4240 match_data_toggle = 0;
4241
4242 /* If string (script) callouts are supported, set up the callout processing
4243 function in the match context. */
4244
4245 #ifdef SUPPORT_PCRE2GREP_CALLOUT
4246 pcre2_set_callout(match_context, pcre2grep_callout, NULL);
4247 #endif
4248
4249 /* Put limits into the match context. */
4250
4251 if (heap_limit != PCRE2_UNSET) pcre2_set_heap_limit(match_context, heap_limit);
4252 if (match_limit > 0) pcre2_set_match_limit(match_context, match_limit);
4253 if (depth_limit > 0) pcre2_set_depth_limit(match_context, depth_limit);
4254
4255 /* If a locale has not been provided as an option, see if the LC_ALL or
4256 LC_CTYPE environment variable is set (checked in that order), and if so, use it. */
4257
4258 if (locale == NULL)
4259 {
4260 locale = getenv("LC_ALL");
4261 locale_from = "LC_ALL";
4262 }
4263
4264 if (locale == NULL)
4265 {
4266 locale = getenv("LC_CTYPE");
4267 locale_from = "LC_CTYPE";
4268 }
4269
4270 /* If a locale is set, use it to generate the tables that PCRE2 needs. Passing
4271 NULL to pcre2_maketables() means that malloc() is used to get the memory. */
4272
4273 if (locale != NULL)
4274 {
4275 if (setlocale(LC_CTYPE, locale) == NULL)
4276 {
4277 fprintf(stderr, "pcre2grep: Failed to set locale %s (obtained from %s)\n",
4278 locale, locale_from);
4279 goto EXIT2;
4280 }
4281 character_tables = pcre2_maketables(NULL);
4282 pcre2_set_character_tables(compile_context, character_tables);
4283 }
4284
4285 /* Sort out colouring */
4286
4287 if (colour_option != NULL && strcmp(colour_option, "never") != 0)
4288 {
4289 if (strcmp(colour_option, "always") == 0)
4290 #ifdef WIN32
4291 do_ansi = !is_stdout_tty(),
4292 #endif
4293 do_colour = TRUE;
4294 else if (strcmp(colour_option, "auto") == 0) do_colour = is_stdout_tty();
4295 else
4296 {
4297 fprintf(stderr, "pcre2grep: Unknown colour setting \"%s\"\n",
4298 colour_option);
4299 goto EXIT2;
4300 }
4301 if (do_colour)
4302 {
4303 char *cs = getenv("PCRE2GREP_COLOUR");
4304 if (cs == NULL) cs = getenv("PCRE2GREP_COLOR");
4305 if (cs == NULL) cs = getenv("PCREGREP_COLOUR");
4306 if (cs == NULL) cs = getenv("PCREGREP_COLOR");
4307 if (cs == NULL) cs = parse_grep_colors(getenv("GREP_COLORS"));
4308 if (cs == NULL) cs = getenv("GREP_COLOR");
4309 if (cs != NULL)
4310 {
4311 if (strspn(cs, ";0123456789") == strlen(cs)) colour_string = cs;
4312 }
4313 #ifdef WIN32
4314 init_colour_output();
4315 #endif
4316 }
4317 }
4318
4319 /* When colouring or otherwise identifying matching substrings, we need to find
4320 all possible matches when there are multiple patterns. */
4321
4322 all_matches = do_colour || only_matching_count != 0;
4323
4324 /* Sort out a newline setting. */
4325
4326 if (newline_arg != NULL)
4327 {
4328 for (endlinetype = 1; endlinetype < (int)(sizeof(newlines)/sizeof(char *));
4329 endlinetype++)
4330 {
4331 if (strcmpic(newline_arg, newlines[endlinetype]) == 0) break;
4332 }
4333 if (endlinetype < (int)(sizeof(newlines)/sizeof(char *)))
4334 pcre2_set_newline(compile_context, endlinetype);
4335 else
4336 {
4337 fprintf(stderr, "pcre2grep: Invalid newline specifier \"%s\"\n",
4338 newline_arg);
4339 goto EXIT2;
4340 }
4341 }
4342
4343 /* Find default newline convention */
4344
4345 else
4346 {
4347 (void)pcre2_config(PCRE2_CONFIG_NEWLINE, &endlinetype);
4348 }
4349
4350 /* Interpret the text values for -d and -D */
4351
4352 if (dee_option != NULL)
4353 {
4354 if (strcmp(dee_option, "read") == 0) dee_action = dee_READ;
4355 else if (strcmp(dee_option, "recurse") == 0) dee_action = dee_RECURSE;
4356 else if (strcmp(dee_option, "skip") == 0) dee_action = dee_SKIP;
4357 else
4358 {
4359 fprintf(stderr, "pcre2grep: Invalid value \"%s\" for -d\n", dee_option);
4360 goto EXIT2;
4361 }
4362 }
4363
4364 if (DEE_option != NULL)
4365 {
4366 if (strcmp(DEE_option, "read") == 0) DEE_action = DEE_READ;
4367 else if (strcmp(DEE_option, "skip") == 0) DEE_action = DEE_SKIP;
4368 else
4369 {
4370 fprintf(stderr, "pcre2grep: Invalid value \"%s\" for -D\n", DEE_option);
4371 goto EXIT2;
4372 }
4373 }
4374
4375 /* If no_ucp is set, remove PCRE2_UCP from the compile options. */
4376
4377 if (no_ucp) pcre2_options &= ~PCRE2_UCP;
4378
4379 /* Adjust the extra options. */
4380
4381 if (case_restrict) extra_options |= PCRE2_EXTRA_CASELESS_RESTRICT;
4382 if (posix_digit)
4383 extra_options |= (PCRE2_EXTRA_ASCII_BSD | PCRE2_EXTRA_ASCII_DIGIT);
4384
4385 /* Set the extra options in the compile context. */
4386
4387 (void)pcre2_set_compile_extra_options(compile_context, extra_options);
4388
4389 /* If use_jit is set, check whether JIT is available. If not, do not try
4390 to use JIT. */
4391
4392 if (use_jit)
4393 {
4394 uint32_t answer;
4395 (void)pcre2_config(PCRE2_CONFIG_JIT, &answer);
4396 if (!answer) use_jit = FALSE;
4397 }
4398
4399 /* Get memory for the main buffer. */
4400
4401 if (bufthird <= 0)
4402 {
4403 fprintf(stderr, "pcre2grep: --buffer-size must be greater than zero\n");
4404 goto EXIT2;
4405 }
4406
4407 bufsize = 3*bufthird;
4408 main_buffer = (char *)malloc(bufsize);
4409
4410 if (main_buffer == NULL)
4411 {
4412 /* LCOV_EXCL_START */
4413 fprintf(stderr, "pcre2grep: malloc failed\n");
4414 goto EXIT2;
4415 /* LCOV_EXCL_STOP */
4416 }
4417
4418 /* If no patterns were provided by -e, and there are no files provided by -f,
4419 the first argument is the one and only pattern, and it must exist. */
4420
4421 if (patterns == NULL && pattern_files == NULL)
4422 {
4423 if (i >= argc) return usage(2);
4424 patterns = patterns_last = add_pattern(argv[i], (PCRE2_SIZE)strlen(argv[i]),
4425 NULL);
4426 i++;
4427 if (patterns == NULL) goto EXIT2;
4428 }
4429
4430 /* Compile the patterns that were provided on the command line, either by
4431 multiple uses of -e or as a single unkeyed pattern. We cannot do this until
4432 after all the command-line options are read so that we know which PCRE options
4433 to use. When -F is used, compile_pattern() may add another block into the
4434 chain, so we must not access the next pointer till after the compile. */
4435
4436 for (j = 1, cp = patterns; cp != NULL; j++, cp = cp->next)
4437 {
4438 if (!compile_pattern(cp, pcre2_options, FALSE, "command-line",
4439 (j == 1 && patterns->next == NULL)? 0 : j))
4440 goto EXIT2;
4441 }
4442
4443 /* Read and compile the regular expressions that are provided in files. */
4444
4445 for (fn = pattern_files; fn != NULL; fn = fn->next)
4446 {
4447 if (!read_pattern_file(fn->name, &patterns, &patterns_last)) goto EXIT2;
4448 }
4449
4450 /* Unless JIT has been explicitly disabled, arrange a stack for it to use. */
4451
4452 #ifdef SUPPORT_PCRE2GREP_JIT
4453 if (use_jit)
4454 {
4455 jit_stack = pcre2_jit_stack_create(32*1024, 1024*1024, NULL);
4456 if (jit_stack != NULL )
4457 pcre2_jit_stack_assign(match_context, NULL, jit_stack);
4458 }
4459 #endif
4460
4461 /* -F, -w, and -x do not apply to include or exclude patterns, so we must
4462 adjust the options. */
4463
4464 pcre2_options &= ~PCRE2_LITERAL;
4465 (void)pcre2_set_compile_extra_options(compile_context, 0);
4466
4467 /* If there are include or exclude patterns read from the command line, compile
4468 them. */
4469
4470 for (j = 0; j < 4; j++)
4471 {
4472 int k;
4473 for (k = 1, cp = *(incexlist[j]); cp != NULL; k++, cp = cp->next)
4474 {
4475 if (!compile_pattern(cp, pcre2_options, FALSE, incexname[j],
4476 (k == 1 && cp->next == NULL)? 0 : k))
4477 goto EXIT2;
4478 }
4479 }
4480
4481 /* Read and compile include/exclude patterns from files. */
4482
4483 for (fn = include_from; fn != NULL; fn = fn->next)
4484 {
4485 if (!read_pattern_file(fn->name, &include_patterns, &include_patterns_last))
4486 goto EXIT2;
4487 }
4488
4489 for (fn = exclude_from; fn != NULL; fn = fn->next)
4490 {
4491 if (!read_pattern_file(fn->name, &exclude_patterns, &exclude_patterns_last))
4492 goto EXIT2;
4493 }
4494
4495 /* If there are no files that contain lists of files to search, and there are
4496 no file arguments, search stdin, and then exit. */
4497
4498 if (file_lists == NULL && i >= argc)
4499 {
4500 /* Seeking on a buffered stdin is not portable, so attempt to
4501 remove the buffer to work around reported issues affecting
4502 several BSDs and AIX. */
4503 if (count_limit >= 0)
4504 setbuf(stdin, NULL);
4505 rc = pcre2grep(stdin, FR_PLAIN, stdin_name,
4506 (filenames > FN_DEFAULT)? stdin_name : NULL);
4507 goto EXIT;
4508 }
4509
4510 /* If any files that contain lists of files to search have been specified,
4511 read them line by line and search the given files. */
4512
4513 for (fn = file_lists; fn != NULL; fn = fn->next)
4514 {
4515 char buffer[FNBUFSIZ];
4516 FILE *fl;
4517 if (strcmp(fn->name, "-") == 0) fl = stdin; else
4518 {
4519 fl = fopen(fn->name, "rb");
4520 if (fl == NULL)
4521 {
4522 fprintf(stderr, "pcre2grep: Failed to open %s: %s\n", fn->name,
4523 strerror(errno));
4524 goto EXIT2;
4525 }
4526 }
4527 while (fgets(buffer, sizeof(buffer), fl) != NULL)
4528 {
4529 int frc;
4530 char *end = buffer + (int)strlen(buffer);
4531 while (end > buffer && isspace((unsigned char)(end[-1]))) end--;
4532 *end = 0;
4533 if (*buffer != 0)
4534 {
4535 frc = grep_or_recurse(buffer, dee_action == dee_RECURSE, FALSE);
4536 if (frc > 1) rc = frc;
4537 else if (frc == 0 && rc == 1) rc = 0;
4538 }
4539 }
4540 if (fl != stdin) fclose(fl);
4541 }
4542
4543 /* After handling file-list, work through remaining arguments. Pass in the fact
4544 that there is only one argument at top level - this suppresses the file name if
4545 the argument is not a directory and filenames are not otherwise forced. */
4546
4547 only_one_at_top = i == argc - 1 && file_lists == NULL;
4548
4549 for (; i < argc; i++)
4550 {
4551 int frc = grep_or_recurse(argv[i], dee_action == dee_RECURSE,
4552 only_one_at_top);
4553 if (frc > 1) rc = frc;
4554 else if (frc == 0 && rc == 1) rc = 0;
4555 }
4556
4557 /* Show the total number of matches if requested, but not if only one file's
4558 count was printed. */
4559
4560 if (show_total_count && counts_printed != 1 && filenames != FN_NOMATCH_ONLY)
4561 {
4562 if (counts_printed != 0 && filenames >= FN_DEFAULT)
4563 fprintf(stdout, "TOTAL:");
4564 fprintf(stdout, "%lu" STDOUT_NL, total_count);
4565 }
4566
4567 EXIT:
4568 #ifdef SUPPORT_PCRE2GREP_JIT
4569 pcre2_jit_free_unused_memory(NULL);
4570 if (jit_stack != NULL) pcre2_jit_stack_free(jit_stack);
4571 #endif
4572
4573 free(main_buffer);
4574 if (character_tables != NULL) pcre2_maketables_free(NULL, character_tables);
4575
4576 pcre2_compile_context_free(compile_context);
4577 pcre2_match_context_free(match_context);
4578 pcre2_match_data_free(match_data_pair[0]);
4579 pcre2_match_data_free(match_data_pair[1]);
4580
4581 free_pattern_chain(patterns);
4582 free_pattern_chain(include_patterns);
4583 free_pattern_chain(include_dir_patterns);
4584 free_pattern_chain(exclude_patterns);
4585 free_pattern_chain(exclude_dir_patterns);
4586
4587 free_file_chain(exclude_from);
4588 free_file_chain(include_from);
4589 free_file_chain(pattern_files);
4590 free_file_chain(file_lists);
4591
4592 while (only_matching != NULL)
4593 {
4594 omstr *this = only_matching;
4595 only_matching = this->next;
4596 free(this);
4597 }
4598
4599 pcre2grep_exit(rc);
4600
4601 EXIT2:
4602 rc = 2;
4603 goto EXIT;
4604 }
4605
4606 /* End of pcre2grep */
4607