xref: /aosp_15_r20/external/toybox/toys/posix/sed.c (revision cf5a6c84e2b8763fc1a7db14496fd4742913b199)
1 /* sed.c - stream editor. Thing that does s/// and other stuff.
2  *
3  * Copyright 2014 Rob Landley <[email protected]>
4  *
5  * See http://pubs.opengroup.org/onlinepubs/9699919799/utilities/sed.html
6  *
7  * xform See https://www.gnu.org/software/tar/manual/html_section/transform.html
8  *
9  * TODO: lines > 2G could wrap signed int length counters. Not just getline()
10  * but N and s///
11  * TODO: make y// handle unicode, unicode delimiters
12  * TODO: handle error return from emit(), error_msg/exit consistently
13  *       What's the right thing to do for -i when write fails? Skip to next?
14  * test '//q' with no previous regex, also repeat previous regex?
15  *
16  * Deviations from POSIX: allow extended regular expressions with -r,
17  * editing in place with -i, separate with -s, NUL-delimited strings with -z,
18  * printf escapes in text, line continuations, semicolons after all commands,
19  * 2-address anywhere an address is allowed, "T" command, multiline
20  * continuations for [abc], \; to end [abc] argument before end of line.
21  * Explicit violations of stuff posix says NOT to do: N at EOF does default
22  * print, l escapes \n
23  * Added --tarxform mode to support tar --xform
24 
25 USE_SED(NEWTOY(sed, "(help)(version)(tarxform)e*f*i:;nErz(null-data)s[+Er]", TOYFLAG_BIN|TOYFLAG_AUTOCONF))
26 
27 config SED
28   bool "sed"
29   default y
30   help
31     usage: sed [-inrszE] [-e SCRIPT]...|SCRIPT [-f SCRIPT_FILE]... [FILE...]
32 
33     Stream editor. Apply editing SCRIPTs to lines of input.
34 
35     -e	Add SCRIPT to list
36     -f	Add contents of SCRIPT_FILE to list
37     -i	Edit each file in place (-iEXT keeps backup file with extension EXT)
38     -n	No default output (use the p command to output matched lines)
39     -r	Use extended regular expression syntax
40     -E	POSIX alias for -r
41     -s	Treat input files separately (implied by -i)
42     -z	Use \0 rather than \n as input line separator
43 
44     A SCRIPT is one or more COMMANDs separated by newlines or semicolons.
45     All -e SCRIPTs and -f SCRIPT_FILE contents are combined in order as if
46     separated by newlines. If no -e or -f then first argument is the SCRIPT.
47 
48     COMMANDs apply to every line unless prefixed with an ADDRESS of the form:
49 
50       [ADDRESS[,ADDRESS]][!]COMMAND
51 
52     ADDRESS is a line number (starting at 1), a /REGULAR EXPRESSION/, or $ for
53     last line (-s or -i makes it last line of each file). One address matches one
54     line, ADDRESS,ADDRESS matches from first to second inclusive. Two regexes can
55     match multiple ranges. ADDRESS,+N ends N lines later. ! inverts the match.
56 
57     REGULAR EXPRESSIONS start and end with the same character (anything but
58     backslash or newline). To use the delimiter in the regex escape it with a
59     backslash, and printf escapes (\abcefnrtv and octal, hex, and unicode) work.
60     An empty regex repeats the previous one. ADDRESS regexes require any
61     first delimiter except / to be \escaped to distinguish it from COMMANDs.
62 
63     Sed reads each line of input, processes it, and writes it out or discards it
64     before reading the next. Sed can remember one additional line in a separate
65     buffer (the h, H, g, G, and x commands), and can read the next line of input
66     early (the n and N commands), but otherwise operates on individual lines.
67 
68     Each COMMAND starts with a single character. Commands with no arguments are:
69 
70       !  Run this command when the ADDRESS _didn't_ match.
71       {  Start new command block, continuing until a corresponding "}".
72          Command blocks nest and can have ADDRESSes applying to the whole block.
73       }  End command block (this COMMAND cannot have an address)
74       d  Delete this line and move on to the next one
75          (ignores remaining COMMANDs)
76       D  Delete one line of input and restart command SCRIPT (same as "d"
77          unless you've glued lines together with "N" or similar)
78       g  Get remembered line (overwriting current line)
79       G  Get remembered line (appending to current line)
80       h  Remember this line (overwriting remembered line)
81       H  Remember this line (appending to remembered line, if any)
82       l  Print line escaping \abfrtvn, octal escape other nonprinting chars,
83          wrap lines to terminal width with \, append $ to end of line.
84       n  Print default output and read next line over current line (quit at EOF)
85       N  Append \n and next line of input to this line. Quit at EOF without
86          default output. Advances line counter for ADDRESS and "=".
87       p  Print this line
88       P  Print this line up to first newline (from "N")
89       q  Quit (print default output, no more commands processed or lines read)
90       x  Exchange this line with remembered line (overwrite in both directions)
91       =  Print the current line number (plus newline)
92       #  Comment, ignores rest of this line of SCRIPT (until newline)
93 
94     Commands that take an argument:
95 
96       : LABEL    Target for jump commands
97       a TEXT     Append text to output before reading next line
98       b LABEL    Branch, jumps to :LABEL (with no LABEL to end of SCRIPT)
99       c TEXT     Delete matching ADDRESS range and output TEXT instead
100       i TEXT     Insert text (output immediately)
101       r FILE     Append contents of FILE to output before reading next line.
102       s/S/R/F    Search for regex S replace match with R using flags F. Delimiter
103                  is anything but \n or \, escape with \ to use in S or R. Printf
104                  escapes work. Unescaped & in R becomes full matched text, \1
105                  through \9 = parenthetical subexpression from S. \ at end of
106                  line appends next line of SCRIPT. The flags in F are:
107                  [0-9]    A number N, substitute only Nth match
108                  g        Global, substitute all matches
109                  i/I      Ignore case when matching
110                  p        Print resulting line when match found and replaced
111                  w [file] Write (append) line to file when match replaced
112       t LABEL    Test, jump if s/// command matched this line since last test
113       T LABEL    Test false, jump to :LABEL only if no s/// found a match
114       w FILE     Write (append) line to file
115       y/old/new/ Change each character in 'old' to corresponding character
116                  in 'new' (with standard backslash escapes, delimiter can be
117                  any repeated character except \ or \n)
118 
119     The TEXT arguments (to a c i) may end with an unescaped "\" to append
120     the next line (leading whitespace is not skipped), and treat ";" as a
121     literal character (use "\;" instead).
122 */
123 
124 #define FOR_sed
125 #include "toys.h"
126 
127 GLOBALS(
128   char *i;
129   struct arg_list *f, *e;
130 
131   // processed pattern list
132   struct double_list *pattern;
133 
134   char *nextline, *remember, *tarxform;
135   void *restart, *lastregex;
136   long nextlen, rememberlen, count;
137   int fdout, noeol;
138   unsigned xx, tarxlen, xflags;
139   char delim, xftype;
140 )
141 
// One parsed sed command, kept on a doubly linked list. Regexes and string
// arguments live in the same allocation as the struct and are found by byte
// offset from its start (offset+(char *)command) instead of by pointer: the
// allocation gets realloc()ed to grow for multiline input, and stored
// pointers would each have to be adjusted afterwards.

struct sedcmd {
  struct sedcmd *next;
  struct sedcmd *prev;

  // Address range: [0] is where the match begins, [1] where it ends
  long lmatch[2];   // line number (-1 means $, below -1 encodes +N)
  int rmatch[2];    // offset of regex struct for regex addresses (/abc/,/def/p)

  int arg1;         // offset of first argument
  int arg2;         // offset of second argument
  int w;            // offset of s///w or w data (saved fd, noeol flag, name)

  unsigned not;     // address was followed by !
  unsigned hit;     // range currently active (parser reuses it for
                    // line continuation state)
  unsigned sflags;  // s///flag bits, see SFLAG macros below
  char c;           // action
};
158 
// Bits in sedcmd.sflags. The low four must stay in "igpx" order because the
// flag parser sets 1<<stridx("igpx", flag), and R S H must stay adjacent in
// that order because sed_line() tests SFLAG_R<<stridx("rsh", type).
// Bits 8 and up hold the "replace only Nth match" count.
#define SFLAG_i     (1<<0)
#define SFLAG_g     (1<<1)
#define SFLAG_p     (1<<2)
#define SFLAG_x     (1<<3)
#define SFLAG_slash (1<<4)
#define SFLAG_R     (1<<5)
#define SFLAG_S     (1<<6)
#define SFLAG_H     (1<<7)
167 
168 // Write out line with potential embedded NUL, handling eol/noeol
emit(char * line,long len,int eol)169 static int emit(char *line, long len, int eol)
170 {
171   int l = len, old = line[len];
172 
173   if (FLAG(tarxform)) {
174     TT.tarxform = xrealloc(TT.tarxform, TT.tarxlen+len+TT.noeol+eol);
175     if (TT.noeol) TT.tarxform[TT.tarxlen++] = TT.delim;
176     memcpy(TT.tarxform+TT.tarxlen, line, len);
177     TT.tarxlen += len;
178     if (eol) TT.tarxform[TT.tarxlen++] = TT.delim;
179   } else {
180     if (TT.noeol && !writeall(TT.fdout, &TT.delim, 1)) return 1;
181     if (eol) line[len++] = TT.delim;
182     if (!len) return 0;
183     l = writeall(TT.fdout, line, len);
184     if (eol) line[len-1] = old;
185   }
186   TT.noeol = !eol;
187   if (l != len) {
188     if (TT.fdout != 1) perror_msg("short write");
189 
190     return 1;
191   }
192 
193   return 0;
194 }
195 
196 // Extend allocation to include new string, with newline between if newlen<0
197 
extend_string(char ** old,char * new,int oldlen,int newlen)198 static char *extend_string(char **old, char *new, int oldlen, int newlen)
199 {
200   int newline = newlen < 0;
201   char *s;
202 
203   if (newline) newlen = -newlen;
204   s = *old = xrealloc(*old, oldlen+newlen+newline+1);
205   if (newline) s[oldlen++] = TT.delim;
206   memcpy(s+oldlen, new, newlen);
207   s[oldlen+newlen] = 0;
208 
209   return s+oldlen+newlen+1;
210 }
211 
212 // An empty regex repeats the previous one
get_regex(void * command,int offset)213 static void *get_regex(void *command, int offset)
214 {
215   if (!offset) {
216     if (!TT.lastregex) error_exit("no previous regex");
217     return TT.lastregex;
218   }
219 
220   return TT.lastregex = offset+(char *)command;
221 }
222 
223 // Apply pattern to line from input file
sed_line(char ** pline,long plen)224 static void sed_line(char **pline, long plen)
225 {
226   struct append {
227     struct append *next, *prev;
228     int file;
229     char *str;
230   } *append = 0;
231   char *line;
232   long len;
233   struct sedcmd *command;
234   int eol = 0, tea = 0;
235 
236   if (FLAG(tarxform)) {
237     if (!pline) return;
238 
239     line = *pline;
240     len = plen;
241     *pline = 0;
242     pline = 0;
243   } else {
244     line = TT.nextline;
245     len = TT.nextlen;
246 
247     // Ignore EOF for all files before last unless -i or -s
248     if (!pline && !FLAG(i) && !FLAG(s)) return;
249 
250     // Grab next line for deferred processing (EOF detection: we get a NULL
251     // pline at EOF to flush last line). Note that only end of _last_ input
252     // file matches $ (unless we're doing -i).
253     TT.nextline = 0;
254     TT.nextlen = 0;
255     if (pline) {
256       TT.nextline = *pline;
257       TT.nextlen = plen;
258       *pline = 0;
259     }
260   }
261 
262   if (!line || !len) return;
263   if (line[len-1] == TT.delim) line[--len] = eol++;
264   if (FLAG(tarxform) && len) {
265     TT.xftype = line[--len];
266     line[len] = 0;
267   }
268   TT.count++;
269 
270   // To prevent N as last command from restarting script, we added 1 to restart
271   // so we'd use it here even when NULL. Alas, compilers that think C has
272   // references instead of pointers assume ptr-1 can never be NULL (demonstrably
273   // untrue) and inappropriately dead code eliminate, so use LP64 math until
274   // we get a -fpointers-are-not-references compiler option.
275   command = (void *)(TT.restart ? ((unsigned long)TT.restart)-1
276     : (unsigned long)TT.pattern);
277   TT.restart = 0;
278 
279   while (command) {
280     char *str, c = command->c;
281 
282     // Have we got a line or regex matching range for this rule?
283     if (*command->lmatch || *command->rmatch) {
284       int miss = 0;
285       long lm;
286 
287       // In a match that might end?
288       if (command->hit) {
289         if (!(lm = command->lmatch[1])) {
290           if (!command->rmatch[1]) command->hit = 0;
291           else {
292             void *rm = get_regex(command, command->rmatch[1]);
293 
294             // regex match end includes matching line, so defer deactivation
295             if (line && !regexec0(rm, line, len, 0, 0, 0)) miss = 1;
296           }
297         } else if (lm > 0 && lm < TT.count) command->hit = 0;
298         else if (lm < -1 && TT.count == command->hit+(-lm-1)) command->hit = 0;
299 
300       // Start a new match?
301       } else {
302         if (!(lm = *command->lmatch)) {
303           void *rm = get_regex(command, *command->rmatch);
304 
305           if (line && !regexec0(rm, line, len, 0, 0, 0))
306             command->hit = TT.count;
307         } else if (lm == TT.count || (lm == -1 && !pline))
308           command->hit = TT.count;
309 
310         if (!command->lmatch[1] && !command->rmatch[1]) miss = 1;
311       }
312 
313       // Didn't match?
314       lm = !(command->not^!!command->hit);
315 
316       // Deferred disable from regex end match
317       if (miss || command->lmatch[1] == TT.count) command->hit = 0;
318 
319       if (lm) {
320         // Handle skipping curly bracket command group
321         if (c == '{') {
322           int curly = 1;
323 
324           while (curly) {
325             command = command->next;
326             if (command->c == '{') curly++;
327             if (command->c == '}') curly--;
328           }
329         }
330         command = command->next;
331         continue;
332       }
333     }
334 
335     // A deleted line can still update line match state for later commands
336     if (!line) {
337       command = command->next;
338       continue;
339     }
340 
341     // Process command
342 
343     if (c=='a' || c=='r') {
344       struct append *a = xzalloc(sizeof(struct append));
345       if (command->arg1) a->str = command->arg1+(char *)command;
346       a->file = c=='r';
347       dlist_add_nomalloc((void *)&append, (void *)a);
348     } else if (c=='b' || c=='t' || c=='T') {
349       int t = tea;
350 
351       if (c != 'b') tea = 0;
352       if (c=='b' || t^(c=='T')) {
353         if (!command->arg1) break;
354         str = command->arg1+(char *)command;
355         for (command = (void *)TT.pattern; command; command = command->next)
356           if (command->c == ':' && !strcmp(command->arg1+(char *)command, str))
357             break;
358         if (!command) error_exit("no :%s", str);
359       }
360     } else if (c=='c') {
361       str = command->arg1+(char *)command;
362       if (!command->hit) emit(str, strlen(str), 1);
363       free(line);
364       line = 0;
365       continue;
366     } else if (c=='d') {
367       free(line);
368       line = 0;
369       continue;
370     } else if (c=='D') {
371       // Delete up to \n or end of buffer
372       str = line;
373       while ((str-line)<len) if (*(str++) == TT.delim) break;
374       len -= str - line;
375       memmove(line, str, len);
376 
377       // if "delete" blanks line, disable further processing
378       // otherwise trim and restart script
379       if (!len) {
380         free(line);
381         line = 0;
382       } else {
383         line[len] = 0;
384         command = (void *)TT.pattern;
385       }
386       continue;
387     } else if (c=='g') {
388       free(line);
389       line = xmemdup(TT.remember, TT.rememberlen+1);
390       len = TT.rememberlen;
391     } else if (c=='G') {
392       line = xrealloc(line, len+TT.rememberlen+2);
393       line[len++] = TT.delim;
394       memcpy(line+len, TT.remember, TT.rememberlen);
395       line[len += TT.rememberlen] = 0;
396     } else if (c=='h') {
397       free(TT.remember);
398       TT.remember = xstrdup(line);
399       TT.rememberlen = len;
400     } else if (c=='H') {
401       TT.remember = xrealloc(TT.remember, TT.rememberlen+len+2);
402       TT.remember[TT.rememberlen++] = TT.delim;
403       memcpy(TT.remember+TT.rememberlen, line, len);
404       TT.remember[TT.rememberlen += len] = 0;
405     } else if (c=='i') {
406       str = command->arg1+(char *)command;
407       emit(str, strlen(str), 1);
408     } else if (c=='l') {
409       int i, x, off;
410 
411       if (!TT.xx) {
412         terminal_size(&TT.xx, 0);
413         if (!TT.xx) TT.xx = 80;
414         if (TT.xx > sizeof(toybuf)-10) TT.xx = sizeof(toybuf)-10;
415         if (TT.xx > 4) TT.xx -= 4;
416       }
417 
418       for (i = off = 0; i<len; i++) {
419         if (off >= TT.xx) {
420           toybuf[off++] = '\\';
421           emit(toybuf, off, 1);
422           off = 0;
423         }
424         x = stridx("\\\a\b\f\r\t\v\n", line[i]);
425         if (x != -1) {
426           toybuf[off++] = '\\';
427           toybuf[off++] = "\\abfrtvn"[x];
428         } else if (line[i] >= ' ') toybuf[off++] = line[i];
429         else off += sprintf(toybuf+off, "\\%03o", line[i]);
430       }
431       toybuf[off++] = '$';
432       emit(toybuf, off, 1);
433     } else if (c=='n') {
434       // The +1 forces restart processing even when next is null
435       TT.restart = (void *)(((unsigned long)command->next)+1);
436 
437       break;
438     } else if (c=='N') {
439       // Can't just grab next line because we could have multiple N and
440       // we need to actually read ahead to get N;$p EOF detection right.
441       if (pline) {
442         // The +1 forces restart processing even when  next is null
443         TT.restart = (void *)(((unsigned long)command->next)+1);
444         extend_string(&line, TT.nextline, len, -TT.nextlen);
445         free(TT.nextline);
446         TT.nextline = line;
447         TT.nextlen += len + 1;
448         line = 0;
449       }
450 
451       // Pending append goes out right after N
452       goto done;
453     } else if (c=='p' || c=='P') {
454       char *l = (c=='P') ? strchr(line, TT.delim) : 0;
455 
456       if (emit(line, l ? l-line : len, eol)) break;
457     } else if (c=='q' || c=='Q') {
458       if (pline) *pline = (void *)1;
459       free(TT.nextline);
460       if (!toys.exitval && command->arg1)
461         toys.exitval = atoi(command->arg1+(char *)command);
462       TT.nextline = 0;
463       TT.nextlen = 0;
464       if (c=='Q') line = 0;
465 
466       break;
467     } else if (c=='s') {
468       char *rline = line, *new = command->arg2 + (char *)command, *l2 = 0;
469       regmatch_t *match = (void *)toybuf;
470       regex_t *reg = get_regex(command, command->arg1);
471       int mflags = 0, count = 0, l2used = 0, zmatch = 1, l2l = len, l2old = 0,
472         bonk = 0, mlen, off, newlen;
473 
474       // Skip suppressed --tarxform types
475       if (TT.xftype && (command->sflags & (SFLAG_R<<stridx("rsh", TT.xftype))));
476 
477       // Loop finding match in remaining line (up to remaining len)
478       else while (!regexec0(reg, rline, len-(rline-line), 10, match, mflags)) {
479         mlen = match[0].rm_eo-match[0].rm_so;
480 
481         // xform matches ending in / aren't allowed to match entire line
482         if ((command->sflags & SFLAG_slash) && mlen==len) {
483           while (len && ++bonk && line[--len]=='/');
484           continue;
485         }
486 
487         mflags = REG_NOTBOL;
488 
489         // Zero length matches don't count immediately after a previous match
490         if (!mlen && !zmatch) {
491           if (rline-line == len) break;
492           if (l2) l2[l2used++] = *rline++;
493           zmatch++;
494           continue;
495         } else zmatch = 0;
496 
497         // If we're replacing only a specific match, skip if this isn't it
498         off = command->sflags>>8;
499         if (off && off != ++count) {
500           if (l2) memcpy(l2+l2used, rline, match[0].rm_eo);
501           l2used += match[0].rm_eo;
502           rline += match[0].rm_eo;
503 
504           continue;
505         }
506         // The fact getline() can allocate unbounded amounts of memory is
507         // a bigger issue, but while we're here check for integer overflow
508         if (match[0].rm_eo > INT_MAX) perror_exit(0);
509 
510         // newlen = strlen(new) but with \1 and & and printf escapes
511         for (off = newlen = 0; new[off]; off++) {
512           int cc = -1;
513 
514           if (new[off] == '&') cc = 0;
515           else if (new[off] == '\\') cc = new[++off] - '0';
516           if (cc < 0 || cc > 9) {
517             newlen++;
518             continue;
519           }
520           newlen += match[cc].rm_eo-match[cc].rm_so;
521         }
522 
523         // Copy changed data to new string
524 
525         // Adjust allocation size of new string, copy data we know we'll keep
526         l2l += newlen-mlen;
527         if ((mlen = l2l|0xfff) > l2old) {
528           l2 = xrealloc(l2, ++mlen);
529           if (l2used && !l2old) memcpy(l2, rline-l2used, l2used);
530           l2old = mlen;
531         }
532         if (match[0].rm_so) {
533           memcpy(l2+l2used, rline, match[0].rm_so);
534           l2used += match[0].rm_so;
535         }
536 
537         // copy in new replacement text
538         for (off = mlen = 0; new[off]; off++) {
539           int cc = 0, ll;
540 
541           if (new[off] == '\\') {
542             cc = new[++off] - '0';
543             if (cc<0 || cc>9) {
544               if (!(l2[l2used+mlen++] = unescape(new[off])))
545                 l2[l2used+mlen-1] = new[off];
546 
547               continue;
548             } else if (cc > reg->re_nsub) error_exit("no s//\\%d/", cc);
549           } else if (new[off] != '&') {
550             l2[l2used+mlen++] = new[off];
551 
552             continue;
553           }
554 
555           if (match[cc].rm_so != -1) {
556             ll = match[cc].rm_eo-match[cc].rm_so;
557             memcpy(l2+l2used+mlen, rline+match[cc].rm_so, ll);
558             mlen += ll;
559           }
560         }
561         l2used += newlen;
562         rline += match[0].rm_eo;
563 
564         if (!(command->sflags & SFLAG_g)) break;
565       }
566       len += bonk;
567 
568       // If we made any changes, finish off l2 and swap it for line
569       if (l2) {
570         // grab trailing unmatched data and null terminator, swap with original
571         mlen = len-(rline-line);
572         memcpy(l2+l2used, rline, mlen+1);
573         len = l2used + mlen;
574         free(line);
575         line = l2;
576       }
577 
578       if (mflags) {
579         if (command->sflags & SFLAG_p) emit(line, len, eol);
580 
581         tea = 1;
582         if (command->w) goto writenow;
583       }
584     } else if (c=='w') {
585       int fd, noeol;
586       char *name;
587 
588 writenow:
589       if (FLAG(tarxform)) error_exit("tilt");
590 
591       // Swap out emit() context
592       fd = TT.fdout;
593       noeol = TT.noeol;
594 
595       // We save filehandle and newline status before filename
596       name = command->w + (char *)command;
597       memcpy(&TT.fdout, name, 4);
598       name += 4;
599       TT.noeol = *(name++);
600 
601       // write, then save/restore context
602       if (emit(line, len, eol))
603         perror_exit("w '%s'", command->arg1+(char *)command);
604       *(--name) = TT.noeol;
605       TT.noeol = noeol;
606       TT.fdout = fd;
607     } else if (c=='x') {
608       long swap = TT.rememberlen;
609 
610       str = TT.remember;
611       TT.remember = line;
612       line = str;
613       TT.rememberlen = len;
614       len = swap;
615     } else if (c=='y') {
616       char *from, *to = (char *)command;
617       int i, j;
618 
619       from = to+command->arg1;
620       to += command->arg2;
621 
622       for (i = 0; i < len; i++) {
623         j = stridx(from, line[i]);
624         if (j != -1) line[i] = to[j];
625       }
626     } else if (c=='=') {
627       sprintf(toybuf, "%ld", TT.count);
628       if (emit(toybuf, strlen(toybuf), 1)) break;
629     }
630 
631     command = command->next;
632   }
633 
634 done:
635   if (line && !FLAG(n)) emit(line, len, eol);
636 
637   // TODO: should "sed -z ax" use \n instead of NUL?
638   if (dlist_terminate(append)) while (append) {
639     struct append *a = append->next;
640 
641     if (append->file) {
642       int fd = open(append->str, O_RDONLY);
643 
644       // Force newline if noeol pending
645       if (fd != -1) {
646         if (TT.noeol) xwrite(TT.fdout, &TT.delim, 1);
647         TT.noeol = 0;
648         xsendfile(fd, TT.fdout);
649         close(fd);
650       }
651     } else if (append->str) emit(append->str, strlen(append->str), 1);
652     else emit(line, 0, 0);
653     free(append);
654     append = a;
655   }
656   free(line);
657 
658   if (TT.tarxlen) {
659     dprintf(TT.fdout, "%08x", --TT.tarxlen);
660     writeall(TT.fdout, TT.tarxform, TT.tarxlen);
661     TT.tarxlen = 0;
662   }
663 }
664 
665 // Callback called on each input file
do_sed_file(int fd,char * name)666 static void do_sed_file(int fd, char *name)
667 {
668   char *tmp, *s;
669 
670   if (FLAG(i)) {
671     if (!fd) return error_msg("-i on stdin");
672     TT.fdout = copy_tempfile(fd, name, &tmp);
673   }
674   if (FLAG(i) || FLAG(s)) {
675     struct sedcmd *command;
676 
677     TT.count = 0;
678     for (command = (void *)TT.pattern; command; command = command->next)
679       command->hit = 0;
680   }
681   do_lines(fd, TT.delim, sed_line);
682   if (FLAG(i)) {
683     if (TT.i && *TT.i) {
684       xrename(name, s = xmprintf("%s%s", name, TT.i));
685       free(s);
686     }
687     replace_tempfile(-1, TT.fdout, &tmp);
688     TT.fdout = 1;
689   }
690   if (FLAG(i) || FLAG(s)) {
691     TT.nextline = 0;
692     TT.nextlen = TT.noeol = 0;
693   }
694 }
695 
// Copy chunk of string between two delimiters, converting printf escapes.
//
// *pstr points at the text to parse. If delim is NULL or *delim is 0, the
// first character of the string (or the character after a leading backslash)
// is taken as the delimiter and, when delim is non-NULL, saved to *delim.
// Otherwise *delim is the delimiter and parsing starts at *pstr directly.
// A delimiter inside a regex [bracket expression] does not end the string.
//
// Returns a newly allocated processed copy of the string (caller frees) and
// advances *pstr to the first unused character after the closing delimiter.
// Returns 0 on error (no delimiter, backslash as delimiter, or input ending
// before the closing delimiter), in which case *pstr is not modified.
static char *unescape_delimited_string(char **pstr, char *delim)
{
  char *to, *from, *out, mode = 0, d;

  // Grab leading delimiter (if necessary), allocate space for new string
  from = *pstr;
  if (!delim || !*delim) {
    if (!(d = *(from++))) return 0;
    if (d == '\\') d = *(from++);
    if (!d || d == '\\') return 0;
    if (delim) *delim = d;
  } else d = *delim;

  // Output never grows: each step copies at most as many bytes as it consumes
  to = out = xmalloc(strlen(*pstr)+1);

  // mode is 0 outside brackets, ']' inside a bracket expression, or one of
  // ". = :" inside a [. .] [= =] [: :] class within a bracket expression.
  while (mode || *from != d) {
    if (!*from) goto fail;

    // delimiter in regex character range doesn't count
    if (*from == '[') {
      if (!mode) {
        mode = ']';
        if (from[1]=='-' || from[1]==']') *(to++) = *(from++);
      // strchr() also matches the terminating NUL, so test for it first
      } else if (mode == ']' && from[1] && strchr(".=:", from[1])) {
        *(to++) = *(from++);
        mode = *from;
      }
    } else if (*from == mode) {
      if (mode == ']') mode = 0;
      else {
        *(to++) = *(from++);
        mode = ']';
      }
    // Length 1 range (X-X with same X) is "undefined" and makes regcomp err,
    // but the perl build does it, so we need to filter it out.
    } else if (mode && *from == '-' && from[-1] == from[1]) {
      from+=2;
      continue;
    } else if (*from == '\\') {
      if (!from[1]) goto fail;

      // Check escaped end delimiter before printf style escapes.
      if (from[1] == d) from++;
      else if (from[1]=='\\') *(to++) = *(from++);
      else {
        char c = unescape(from[1]);

        if (c) {
          *(to++) = c;
          from+=2;
          continue;
        } else if (!mode) *(to++) = *(from++);
      }
    }

    // The branches above may have advanced to the end of the input; copying
    // the NUL and stepping past it would run off the end of the string.
    if (!*from) goto fail;
    *(to++) = *(from++);
  }
  *to = 0;
  *pstr = from+1;

  return out;

fail:
  free(out);

  return 0;
}
760 
761 // Translate pattern strings into command structures. Each command structure
762 // is a single allocation (which requires some math and remalloc at times).
parse_pattern(char ** pline,long len)763 static void parse_pattern(char **pline, long len)
764 {
765   struct sedcmd *command = (void *)TT.pattern;
766   char *line, *reg, c, *errstart;
767   int i;
768 
769   line = errstart = pline ? *pline : "";
770   if (len && line[len-1]=='\n') line[--len] = 0;
771 
772   // Append this line to previous multiline command? (hit indicates type.)
773   // During parsing "hit" stores data about line continuations, but in
774   // sed_line() it means the match range attached to this command
775   // is active, so processing the continuation must zero it again.
776   if (command && command->prev->hit) {
777     // Remove half-finished entry from list so remalloc() doesn't confuse it
778     TT.pattern = TT.pattern->prev;
779     command = dlist_pop(&TT.pattern);
780     c = command->c;
781     reg = (char *)command;
782     reg += command->arg1 + strlen(reg + command->arg1);
783 
784     // Resume parsing for 'a' or 's' command. (Only two that can do this.)
785     // TODO: using 256 to indicate 'a' means our s/// delimiter can't be
786     // a unicode character.
787     if (command->hit < 256) goto resume_s;
788     else goto resume_a;
789   }
790 
791   // Loop through commands in this line.
792 
793   command = 0;
794   for (;;) {
795     if (command) dlist_add_nomalloc(&TT.pattern, (void *)command);
796 
797     // If there's no more data on this line, return.
798     for (;;) {
799       while (isspace(*line) || *line == ';') line++;
800       if (*line == '#') while (*line && *line != '\n') line++;
801       else break;
802     }
803     if (!*line) return;
804 
805     if (FLAG(tarxform) && strstart(&line, "flags=")) {
806       TT.xflags = 7;
807       while (0<=(i = stridx("rRsShH", *line))) {
808         if (i&1) TT.xflags |= 1<<(i>>1);
809         else TT.xflags &= ~(1<<(i>>1));
810         line++;
811       }
812       continue;
813     }
814 
815     // Start by writing data into toybuf.
816 
817     errstart = line;
818     memset(toybuf, 0, sizeof(struct sedcmd));
819     command = (void *)toybuf;
820     reg = toybuf + sizeof(struct sedcmd);
821 
822     // Parse address range (if any)
823     for (i = 0; i < 2; i++) {
824       if (*line == ',') line++;
825       else if (i) break;
826 
827       if (i && *line == '+' && isdigit(line[1])) {
828         line++;
829         command->lmatch[i] = -2-strtol(line, &line, 0);
830       } else if (isdigit(*line)) command->lmatch[i] = strtol(line, &line, 0);
831       else if (*line == '$') {
832         command->lmatch[i] = -1;
833         line++;
834       } else if (*line == '/' || *line == '\\') {
835         char *s = line;
836 
837         if (!(s = unescape_delimited_string(&line, 0))) goto error;
838         if (!*s) command->rmatch[i] = 0;
839         else {
840           xregcomp((void *)reg, s, REG_EXTENDED*FLAG(r));
841           command->rmatch[i] = reg-toybuf;
842           reg += sizeof(regex_t);
843         }
844         free(s);
845       } else break;
846     }
847 
848     while (isspace(*line)) line++;
849     if (!*line) break;
850 
851     if (*line == '!') {
852       command->not = 1;
853       line++;
854     }
855     while (isspace(*line)) line++;
856     if (!*line) break;
857 
858     c = command->c = *(line++);
859     if (strchr("}:", c) && i) break;
860     if (strchr("aiqQr=", c) && i>1) break;
861 
862     // Allocate memory and copy out of toybuf now that we know how big it is
863     command = xmemdup(toybuf, reg-toybuf);
864     reg = (reg-toybuf) + (char *)command;
865 
866     // Parse arguments by command type
867     if (c == '{') TT.nextlen++;
868     else if (c == '}') {
869       if (!TT.nextlen--) break;
870     } else if (c == 's') {
871       char *end, delim = 0;
872       int flags;
873 
874       // s/pattern/replacement/flags
875 
876       // line continuations use arg1 (back at the start of the function),
877       // so let's fill out arg2 first (since the regex part can't be multiple
878       // lines) and swap them back later.
879 
880       // get pattern (just record, we parse it later)
881       command->arg2 = reg - (char *)command;
882       if (!(TT.remember = unescape_delimited_string(&line, &delim)))
883         goto error;
884 
885       reg += sizeof(regex_t);
886       command->arg1 = reg-(char *)command;
887       command->hit = delim;
888 resume_s:
889       // get replacement - don't replace escapes yet because \1 and \& need
890       // processing later, after we replace \\ with \ we can't tell \\1 from \1
891       end = line;
892       while (*end != command->hit) {
893         if (!*end) goto error;
894         if (*end++ == '\\') {
895           if (!*end || *end == '\n') {
896             end[-1] = '\n';
897             break;
898           }
899           end++;
900         }
901       }
902 
903       reg = extend_string((void *)&command, line, reg-(char *)command,end-line);
904       line = end;
905       // line continuation? (note: '\n' can't be a valid delim).
906       if (*line == command->hit) command->hit = 0;
907       else {
908         if (!*line) continue;
909         reg--;
910         line++;
911         goto resume_s;
912       }
913 
914       // swap arg1/arg2 so they're back in order arguments occur.
915       i = command->arg1;
916       command->arg1 = command->arg2;
917       command->arg2 = i;
918       command->sflags = TT.xflags*SFLAG_R;
919 
920       // get flags
921       for (line++; *line; line++) {
922         long l;
923 
924         if (isspace(*line) && *line != '\n') continue;
925         if (0 <= (l = stridx("igpx", *line))) command->sflags |= 1<<l;
926         else if (*line == 'I') command->sflags |= 1<<0;
927         else if (FLAG(tarxform) && 0 <= (l = stridx("RSH", *line)))
928           command->sflags |= SFLAG_R<<l;
929         // Given that the default is rsh all enabled... why do these exist?
930         else if (FLAG(tarxform) && 0 <= (l = stridx("rsh", *line)))
931           command->sflags &= ~(SFLAG_R<<l);
932         else if (!(command->sflags>>8) && 0<(l = strtol(line, &line, 10))) {
933           command->sflags |= l << 8;
934           line--;
935         } else break;
936       }
937       flags = (FLAG(r) || (command->sflags & SFLAG_x)) ? REG_EXTENDED : 0;
938       if (command->sflags & SFLAG_i) flags |= REG_ICASE;
939 
940       // We deferred actually parsing the regex until we had the s///i flag
941       // allocating the space was done by extend_string() above
942       if (!*TT.remember) command->arg1 = 0;
943       else {
944         xregcomp((void *)(command->arg1+(char *)command), TT.remember, flags);
945         if (FLAG(tarxform) && TT.remember[strlen(TT.remember)-1]=='/')
946           command->sflags |= SFLAG_slash;
947       }
948       free(TT.remember);
949       TT.remember = 0;
950       if (*line == 'w') {
951         line++;
952         goto writenow;
953       }
954     } else if (c == 'w') {
955       int fd, delim;
956       char *cc;
957 
958       // Since s/// uses arg1 and arg2, and w needs a persistent filehandle and
959       // eol status, and to retain the filename for error messages, we'd need
960       // to go up to arg5 just for this. Compromise: dynamically allocate the
961       // filehandle and eol status.
962 
963 writenow:
964       while (isspace(*line)) line++;
965       if (!*line) goto error;
966       for (cc = line; *cc; cc++) if (*cc == '\\' && cc[1] == ';') break;
967       delim = *cc;
968       *cc = 0;
969       fd = xcreate(line, O_WRONLY|O_CREAT|O_TRUNC|O_APPEND, 0644);
970       *cc = delim;
971 
972       command->w = reg - (char *)command;
973       command = xrealloc(command, command->w+(cc-line)+6);
974       reg = command->w + (char *)command;
975 
976       memcpy(reg, &fd, 4);
977       reg += 4;
978       *(reg++) = 0;
979       memcpy(reg, line, delim);
980       reg += delim;
981       *(reg++) = 0;
982 
983       line = cc;
984       if (delim) line += 2;
985     } else if (c == 'y') {
986       char *s, delim = 0;
987       int len;
988 
989       if (!(s = unescape_delimited_string(&line, &delim))) goto error;
990       command->arg1 = reg-(char *)command;
991       len = strlen(s);
992       reg = extend_string((void *)&command, s, reg-(char *)command, len);
993       free(s);
994       command->arg2 = reg-(char *)command;
995       if (!(s = unescape_delimited_string(&line, &delim))) goto error;
996       if (len != strlen(s)) goto error;
997       reg = extend_string((void *)&command, s, reg-(char*)command, len);
998       free(s);
999     } else if (strchr("abcirtTqQw:", c)) {
1000       int end;
1001 
1002       // trim leading spaces
1003       while (isspace(*line) && *line != '\n') line++;
1004 
1005       // Resume logic differs from 's' case because we don't add a newline
1006       // unless it's after something, so we add it on return instead.
1007 resume_a:
1008       command->hit = 0;
1009 
1010       // btTqQ: end with space or semicolon, aicrw continue to newline.
1011       if (!(end = strcspn(line, strchr(":btTqQ", c) ? "}; \t\r\n\v\f" : "\n"))){
1012         // Argument's optional for btTqQ
1013         if (strchr("btTqQ", c)) continue;
1014         else if (!command->arg1) break;
1015       }
1016       // Error checking: qQ can only have digits after them
1017       if (c=='q' || c=='Q') {
1018         for (i = 0; i<end && isdigit(line[i]); i++);
1019         if (i != end) {
1020           line += i;
1021           break;
1022         }
1023       }
1024 
1025       // Extend allocation to include new string. We use offsets instead of
1026       // pointers so realloc() moving stuff doesn't break things. Ok to write
1027       // \n over NUL terminator because call to extend_string() adds it back.
1028       if (!command->arg1) command->arg1 = reg - (char*)command;
1029       else if (*(command->arg1+(char *)command)) *(reg++) = '\n';
1030       else if (!pline) {
1031         command->arg1 = 0;
1032         continue;
1033       }
1034       reg = extend_string((void *)&command, line, reg - (char *)command, end);
1035 
1036       // Recopy data to remove escape sequences and handle line continuation.
1037       if (strchr("aci", c)) {
1038         reg -= end+1;
1039         for (i = end; i; i--) {
1040           if ((*reg++ = *line++)=='\\') {
1041 
1042             // escape at end of line: resume if -e escaped literal newline,
1043             // else request callback and resume with next line
1044             if (!--i) {
1045               *--reg = 0;
1046               if (*line) {
1047                 line++;
1048                 goto resume_a;
1049               }
1050               command->hit = 256;
1051               break;
1052             }
1053             if (!(reg[-1] = unescape(*line))) reg[-1] = *line;
1054             line++;
1055           }
1056         }
1057         *reg = 0;
1058       } else line += end;
1059 
1060     // Commands that take no arguments
1061     } else if (!strchr("{dDgGhHlnNpPx=", c)) break;
1062   }
1063 
1064 error:
1065   error_exit("bad pattern '%s'@%ld (%c)", errstart, line-errstart+1L, *line);
1066 }
1067 
// Report whether the pointer "find" lands inside the string "range".
// The terminating NUL counts as inside, so a pointer to the end of the
// string (or to the start of an empty string) still matches.
// Returns 1 when inside, 0 otherwise.
static int instr(char *find, char *range)
{
  // Anything before the first byte is outside (and skips the strlen).
  if (find < range) return 0;

  return find <= range+strlen(range);
}
1073 
sed_main(void)1074 void sed_main(void)
1075 {
1076   char **args = toys.optargs, **aa;
1077 
1078   if (FLAG(tarxform)) toys.optflags |= FLAG_z;
1079   if (!FLAG(z)) TT.delim = '\n';
1080 
1081   // Parse pattern into commands.
1082 
1083   // If no -e or -f, first argument is the pattern.
1084   if (!TT.e && !TT.f) {
1085     if (!*toys.optargs) error_exit("no pattern");
1086     (TT.e = xzalloc(sizeof(struct arg_list)))->arg = *(args++);
1087   }
1088 
1089   // -e and -f care about order, so use argv[] to recreate original order
1090   for (aa = toys.argv+1; *aa; aa++) {
1091     if (TT.e && instr(TT.e->arg, *aa)) {
1092       parse_pattern(&TT.e->arg, strlen(TT.e->arg));
1093       free(llist_pop(&TT.e));
1094     }
1095     if (TT.f && instr(TT.f->arg, *aa)) {
1096       do_lines(xopenro(TT.f->arg), TT.delim, parse_pattern);
1097       free(llist_pop(&TT.f));
1098     }
1099   }
1100   parse_pattern(0, 0);
1101   dlist_terminate(TT.pattern);
1102   if (TT.nextlen) error_exit("no }");
1103 
1104   TT.fdout = 1;
1105   TT.remember = xstrdup("");
1106 
1107   // Inflict pattern upon input files. Long version because !O_CLOEXEC
1108   loopfiles_rw(args, O_RDONLY|WARN_ONLY, 0, do_sed_file);
1109 
1110   // Provide EOF flush at end of cumulative input for non-i mode.
1111   if (!FLAG(i) && !FLAG(s)) {
1112     toys.optflags |= FLAG_s;
1113     sed_line(0, 0);
1114   }
1115 
1116   // TODO: need to close fd when done for TOYBOX_FREE?
1117 }
1118