1 /* sed.c - stream editor. Thing that does s/// and other stuff.
2 *
3 * Copyright 2014 Rob Landley <[email protected]>
4 *
5 * See http://pubs.opengroup.org/onlinepubs/9699919799/utilities/sed.html
6 *
7 * xform See https://www.gnu.org/software/tar/manual/html_section/transform.html
8 *
9 * TODO: lines > 2G could wrap signed int length counters. Not just getline()
10 * but N and s///
11 * TODO: make y// handle unicode, unicode delimiters
12 * TODO: handle error return from emit(), error_msg/exit consistently
13 * What's the right thing to do for -i when write fails? Skip to next?
14 * test '//q' with no previous regex, also repeat previous regex?
15 *
16 * Deviations from POSIX: allow extended regular expressions with -r,
17 * editing in place with -i, separate with -s, NUL-delimited strings with -z,
18 * printf escapes in text, line continuations, semicolons after all commands,
19 * 2-address anywhere an address is allowed, "T" command, multiline
20 * continuations for [abc], \; to end [abc] argument before end of line.
21 * Explicit violations of stuff posix says NOT to do: N at EOF does default
22 * print, l escapes \n
23 * Added --tarxform mode to support tar --xform
24
25 USE_SED(NEWTOY(sed, "(help)(version)(tarxform)e*f*i:;nErz(null-data)s[+Er]", TOYFLAG_BIN|TOYFLAG_AUTOCONF))
26
27 config SED
28 bool "sed"
29 default y
30 help
31 usage: sed [-inrszE] [-e SCRIPT]...|SCRIPT [-f SCRIPT_FILE]... [FILE...]
32
33 Stream editor. Apply editing SCRIPTs to lines of input.
34
35 -e Add SCRIPT to list
36 -f Add contents of SCRIPT_FILE to list
37 -i Edit each file in place (-iEXT keeps backup file with extension EXT)
38 -n No default output (use the p command to output matched lines)
39 -r Use extended regular expression syntax
40 -E POSIX alias for -r
41 -s Treat input files separately (implied by -i)
42 -z Use \0 rather than \n as input line separator
43
44 A SCRIPT is one or more COMMANDs separated by newlines or semicolons.
45 All -e SCRIPTs and -f SCRIPT_FILE contents are combined in order as if
46 separated by newlines. If no -e or -f then first argument is the SCRIPT.
47
48 COMMANDs apply to every line unless prefixed with an ADDRESS of the form:
49
50 [ADDRESS[,ADDRESS]][!]COMMAND
51
52 ADDRESS is a line number (starting at 1), a /REGULAR EXPRESSION/, or $ for
53 last line (-s or -i makes it last line of each file). One address matches one
54 line, ADDRESS,ADDRESS matches from first to second inclusive. Two regexes can
55 match multiple ranges. ADDRESS,+N ends N lines later. ! inverts the match.
56
57 REGULAR EXPRESSIONS start and end with the same character (anything but
58 backslash or newline). To use the delimiter in the regex escape it with a
59 backslash, and printf escapes (\abcefnrtv and octal, hex, and unicode) work.
60 An empty regex repeats the previous one. ADDRESS regexes require any
61 first delimiter except / to be \escaped to distinguish it from COMMANDs.
62
63 Sed reads each line of input, processes it, and writes it out or discards it
64 before reading the next. Sed can remember one additional line in a separate
65 buffer (the h, H, g, G, and x commands), and can read the next line of input
66 early (the n and N commands), but otherwise operates on individual lines.
67
68 Each COMMAND starts with a single character. Commands with no arguments are:
69
70 ! Run this command when the ADDRESS _didn't_ match.
71 { Start new command block, continuing until a corresponding "}".
72 Command blocks nest and can have ADDRESSes applying to the whole block.
73 } End command block (this COMMAND cannot have an address)
74 d Delete this line and move on to the next one
75 (ignores remaining COMMANDs)
76 D Delete one line of input and restart command SCRIPT (same as "d"
77 unless you've glued lines together with "N" or similar)
78 g Get remembered line (overwriting current line)
79 G Get remembered line (appending to current line)
80 h Remember this line (overwriting remembered line)
81 H Remember this line (appending to remembered line, if any)
82 l Print line escaping \abfrtvn, octal escape other nonprinting chars,
83 wrap lines to terminal width with \, append $ to end of line.
84 n Print default output and read next line over current line (quit at EOF)
85 N Append \n and next line of input to this line. Quit at EOF without
86 default output. Advances line counter for ADDRESS and "=".
87 p Print this line
88 P Print this line up to first newline (from "N")
89 q Quit (print default output, no more commands processed or lines read)
90 x Exchange this line with remembered line (overwrite in both directions)
91 = Print the current line number (plus newline)
92 # Comment, ignores rest of this line of SCRIPT (until newline)
93
94 Commands that take an argument:
95
96 : LABEL Target for jump commands
97 a TEXT Append text to output before reading next line
98 b LABEL Branch, jumps to :LABEL (with no LABEL to end of SCRIPT)
99 c TEXT Delete matching ADDRESS range and output TEXT instead
100 i TEXT Insert text (output immediately)
101 r FILE Append contents of FILE to output before reading next line.
102 s/S/R/F Search for regex S replace match with R using flags F. Delimiter
103 is anything but \n or \, escape with \ to use in S or R. Printf
104 escapes work. Unescaped & in R becomes full matched text, \1
105 through \9 = parenthetical subexpression from S. \ at end of
106 line appends next line of SCRIPT. The flags in F are:
107 [0-9] A number N, substitute only Nth match
108 g Global, substitute all matches
109 i/I Ignore case when matching
110 p Print resulting line when match found and replaced
111 w [file] Write (append) line to file when match replaced
112 t LABEL Test, jump if s/// command matched this line since last test
113 T LABEL Test false, jump to :LABEL only if no s/// found a match
114 w FILE Write (append) line to file
115 y/old/new/ Change each character in 'old' to corresponding character
116 in 'new' (with standard backslash escapes, delimiter can be
117 any repeated character except \ or \n)
118
119 The TEXT arguments (to a c i) may end with an unescaped "\" to append
120 the next line (leading whitespace is not skipped), and treat ";" as a
121 literal character (use "\;" instead).
122 */
123
124 #define FOR_sed
125 #include "toys.h"
126
127 GLOBALS(
128 char *i;
129 struct arg_list *f, *e;
130
131 // processed pattern list
132 struct double_list *pattern;
133
134 char *nextline, *remember, *tarxform;
135 void *restart, *lastregex;
136 long nextlen, rememberlen, count;
137 int fdout, noeol;
138 unsigned xx, tarxlen, xflags;
139 char delim, xftype;
140 )
141
// One parsed sed command, kept on a doubly linked list. Arguments live in
// the same allocation, after the struct, and are located by byte offset from
// the start of the struct (offset+(char *)cmd) rather than by pointer: the
// allocation gets realloc()ed to grow for multiline input, and offsets
// survive being moved where pointers would each need fixing up.

struct sedcmd {
  struct sedcmd *next;
  struct sedcmd *prev;

  // Address range: [0] is where the range begins, [1] where it ends
  long lmatch[2];   // line number of match
  int rmatch[2];    // offset of regex struct for prefix matches (/abc/,/def/p)

  int arg1;         // offset of first argument
  int arg2;         // offset of second argument
  int w;            // offset of s///w (or w) filehandle, eol status, filename

  unsigned not;     // address match is inverted by !
  unsigned hit;     // range currently active (continuation state when parsing)
  unsigned sflags;  // s///flag bits, see SFLAG macros below
  char c;           // action
};
158
// Bits in sedcmd.sflags. The low four are in the same order as the "igpx"
// flag characters of s///, and R S H are consecutive so a flag can be picked
// with SFLAG_R<<index. Bits 8 and up of sflags hold the s///N match number.
#define SFLAG_i     (1<<0)
#define SFLAG_g     (1<<1)
#define SFLAG_p     (1<<2)
#define SFLAG_x     (1<<3)
#define SFLAG_slash (1<<4)
#define SFLAG_R     (1<<5)
#define SFLAG_S     (1<<6)
#define SFLAG_H     (1<<7)
167
168 // Write out line with potential embedded NUL, handling eol/noeol
emit(char * line,long len,int eol)169 static int emit(char *line, long len, int eol)
170 {
171 int l = len, old = line[len];
172
173 if (FLAG(tarxform)) {
174 TT.tarxform = xrealloc(TT.tarxform, TT.tarxlen+len+TT.noeol+eol);
175 if (TT.noeol) TT.tarxform[TT.tarxlen++] = TT.delim;
176 memcpy(TT.tarxform+TT.tarxlen, line, len);
177 TT.tarxlen += len;
178 if (eol) TT.tarxform[TT.tarxlen++] = TT.delim;
179 } else {
180 if (TT.noeol && !writeall(TT.fdout, &TT.delim, 1)) return 1;
181 if (eol) line[len++] = TT.delim;
182 if (!len) return 0;
183 l = writeall(TT.fdout, line, len);
184 if (eol) line[len-1] = old;
185 }
186 TT.noeol = !eol;
187 if (l != len) {
188 if (TT.fdout != 1) perror_msg("short write");
189
190 return 1;
191 }
192
193 return 0;
194 }
195
196 // Extend allocation to include new string, with newline between if newlen<0
197
extend_string(char ** old,char * new,int oldlen,int newlen)198 static char *extend_string(char **old, char *new, int oldlen, int newlen)
199 {
200 int newline = newlen < 0;
201 char *s;
202
203 if (newline) newlen = -newlen;
204 s = *old = xrealloc(*old, oldlen+newlen+newline+1);
205 if (newline) s[oldlen++] = TT.delim;
206 memcpy(s+oldlen, new, newlen);
207 s[oldlen+newlen] = 0;
208
209 return s+oldlen+newlen+1;
210 }
211
212 // An empty regex repeats the previous one
get_regex(void * command,int offset)213 static void *get_regex(void *command, int offset)
214 {
215 if (!offset) {
216 if (!TT.lastregex) error_exit("no previous regex");
217 return TT.lastregex;
218 }
219
220 return TT.lastregex = offset+(char *)command;
221 }
222
223 // Apply pattern to line from input file
sed_line(char ** pline,long plen)224 static void sed_line(char **pline, long plen)
225 {
226 struct append {
227 struct append *next, *prev;
228 int file;
229 char *str;
230 } *append = 0;
231 char *line;
232 long len;
233 struct sedcmd *command;
234 int eol = 0, tea = 0;
235
236 if (FLAG(tarxform)) {
237 if (!pline) return;
238
239 line = *pline;
240 len = plen;
241 *pline = 0;
242 pline = 0;
243 } else {
244 line = TT.nextline;
245 len = TT.nextlen;
246
247 // Ignore EOF for all files before last unless -i or -s
248 if (!pline && !FLAG(i) && !FLAG(s)) return;
249
250 // Grab next line for deferred processing (EOF detection: we get a NULL
251 // pline at EOF to flush last line). Note that only end of _last_ input
252 // file matches $ (unless we're doing -i).
253 TT.nextline = 0;
254 TT.nextlen = 0;
255 if (pline) {
256 TT.nextline = *pline;
257 TT.nextlen = plen;
258 *pline = 0;
259 }
260 }
261
262 if (!line || !len) return;
263 if (line[len-1] == TT.delim) line[--len] = eol++;
264 if (FLAG(tarxform) && len) {
265 TT.xftype = line[--len];
266 line[len] = 0;
267 }
268 TT.count++;
269
270 // To prevent N as last command from restarting script, we added 1 to restart
271 // so we'd use it here even when NULL. Alas, compilers that think C has
272 // references instead of pointers assume ptr-1 can never be NULL (demonstrably
273 // untrue) and inappropriately dead code eliminate, so use LP64 math until
274 // we get a -fpointers-are-not-references compiler option.
275 command = (void *)(TT.restart ? ((unsigned long)TT.restart)-1
276 : (unsigned long)TT.pattern);
277 TT.restart = 0;
278
279 while (command) {
280 char *str, c = command->c;
281
282 // Have we got a line or regex matching range for this rule?
283 if (*command->lmatch || *command->rmatch) {
284 int miss = 0;
285 long lm;
286
287 // In a match that might end?
288 if (command->hit) {
289 if (!(lm = command->lmatch[1])) {
290 if (!command->rmatch[1]) command->hit = 0;
291 else {
292 void *rm = get_regex(command, command->rmatch[1]);
293
294 // regex match end includes matching line, so defer deactivation
295 if (line && !regexec0(rm, line, len, 0, 0, 0)) miss = 1;
296 }
297 } else if (lm > 0 && lm < TT.count) command->hit = 0;
298 else if (lm < -1 && TT.count == command->hit+(-lm-1)) command->hit = 0;
299
300 // Start a new match?
301 } else {
302 if (!(lm = *command->lmatch)) {
303 void *rm = get_regex(command, *command->rmatch);
304
305 if (line && !regexec0(rm, line, len, 0, 0, 0))
306 command->hit = TT.count;
307 } else if (lm == TT.count || (lm == -1 && !pline))
308 command->hit = TT.count;
309
310 if (!command->lmatch[1] && !command->rmatch[1]) miss = 1;
311 }
312
313 // Didn't match?
314 lm = !(command->not^!!command->hit);
315
316 // Deferred disable from regex end match
317 if (miss || command->lmatch[1] == TT.count) command->hit = 0;
318
319 if (lm) {
320 // Handle skipping curly bracket command group
321 if (c == '{') {
322 int curly = 1;
323
324 while (curly) {
325 command = command->next;
326 if (command->c == '{') curly++;
327 if (command->c == '}') curly--;
328 }
329 }
330 command = command->next;
331 continue;
332 }
333 }
334
335 // A deleted line can still update line match state for later commands
336 if (!line) {
337 command = command->next;
338 continue;
339 }
340
341 // Process command
342
343 if (c=='a' || c=='r') {
344 struct append *a = xzalloc(sizeof(struct append));
345 if (command->arg1) a->str = command->arg1+(char *)command;
346 a->file = c=='r';
347 dlist_add_nomalloc((void *)&append, (void *)a);
348 } else if (c=='b' || c=='t' || c=='T') {
349 int t = tea;
350
351 if (c != 'b') tea = 0;
352 if (c=='b' || t^(c=='T')) {
353 if (!command->arg1) break;
354 str = command->arg1+(char *)command;
355 for (command = (void *)TT.pattern; command; command = command->next)
356 if (command->c == ':' && !strcmp(command->arg1+(char *)command, str))
357 break;
358 if (!command) error_exit("no :%s", str);
359 }
360 } else if (c=='c') {
361 str = command->arg1+(char *)command;
362 if (!command->hit) emit(str, strlen(str), 1);
363 free(line);
364 line = 0;
365 continue;
366 } else if (c=='d') {
367 free(line);
368 line = 0;
369 continue;
370 } else if (c=='D') {
371 // Delete up to \n or end of buffer
372 str = line;
373 while ((str-line)<len) if (*(str++) == TT.delim) break;
374 len -= str - line;
375 memmove(line, str, len);
376
377 // if "delete" blanks line, disable further processing
378 // otherwise trim and restart script
379 if (!len) {
380 free(line);
381 line = 0;
382 } else {
383 line[len] = 0;
384 command = (void *)TT.pattern;
385 }
386 continue;
387 } else if (c=='g') {
388 free(line);
389 line = xmemdup(TT.remember, TT.rememberlen+1);
390 len = TT.rememberlen;
391 } else if (c=='G') {
392 line = xrealloc(line, len+TT.rememberlen+2);
393 line[len++] = TT.delim;
394 memcpy(line+len, TT.remember, TT.rememberlen);
395 line[len += TT.rememberlen] = 0;
396 } else if (c=='h') {
397 free(TT.remember);
398 TT.remember = xstrdup(line);
399 TT.rememberlen = len;
400 } else if (c=='H') {
401 TT.remember = xrealloc(TT.remember, TT.rememberlen+len+2);
402 TT.remember[TT.rememberlen++] = TT.delim;
403 memcpy(TT.remember+TT.rememberlen, line, len);
404 TT.remember[TT.rememberlen += len] = 0;
405 } else if (c=='i') {
406 str = command->arg1+(char *)command;
407 emit(str, strlen(str), 1);
408 } else if (c=='l') {
409 int i, x, off;
410
411 if (!TT.xx) {
412 terminal_size(&TT.xx, 0);
413 if (!TT.xx) TT.xx = 80;
414 if (TT.xx > sizeof(toybuf)-10) TT.xx = sizeof(toybuf)-10;
415 if (TT.xx > 4) TT.xx -= 4;
416 }
417
418 for (i = off = 0; i<len; i++) {
419 if (off >= TT.xx) {
420 toybuf[off++] = '\\';
421 emit(toybuf, off, 1);
422 off = 0;
423 }
424 x = stridx("\\\a\b\f\r\t\v\n", line[i]);
425 if (x != -1) {
426 toybuf[off++] = '\\';
427 toybuf[off++] = "\\abfrtvn"[x];
428 } else if (line[i] >= ' ') toybuf[off++] = line[i];
429 else off += sprintf(toybuf+off, "\\%03o", line[i]);
430 }
431 toybuf[off++] = '$';
432 emit(toybuf, off, 1);
433 } else if (c=='n') {
434 // The +1 forces restart processing even when next is null
435 TT.restart = (void *)(((unsigned long)command->next)+1);
436
437 break;
438 } else if (c=='N') {
439 // Can't just grab next line because we could have multiple N and
440 // we need to actually read ahead to get N;$p EOF detection right.
441 if (pline) {
442 // The +1 forces restart processing even when next is null
443 TT.restart = (void *)(((unsigned long)command->next)+1);
444 extend_string(&line, TT.nextline, len, -TT.nextlen);
445 free(TT.nextline);
446 TT.nextline = line;
447 TT.nextlen += len + 1;
448 line = 0;
449 }
450
451 // Pending append goes out right after N
452 goto done;
453 } else if (c=='p' || c=='P') {
454 char *l = (c=='P') ? strchr(line, TT.delim) : 0;
455
456 if (emit(line, l ? l-line : len, eol)) break;
457 } else if (c=='q' || c=='Q') {
458 if (pline) *pline = (void *)1;
459 free(TT.nextline);
460 if (!toys.exitval && command->arg1)
461 toys.exitval = atoi(command->arg1+(char *)command);
462 TT.nextline = 0;
463 TT.nextlen = 0;
464 if (c=='Q') line = 0;
465
466 break;
467 } else if (c=='s') {
468 char *rline = line, *new = command->arg2 + (char *)command, *l2 = 0;
469 regmatch_t *match = (void *)toybuf;
470 regex_t *reg = get_regex(command, command->arg1);
471 int mflags = 0, count = 0, l2used = 0, zmatch = 1, l2l = len, l2old = 0,
472 bonk = 0, mlen, off, newlen;
473
474 // Skip suppressed --tarxform types
475 if (TT.xftype && (command->sflags & (SFLAG_R<<stridx("rsh", TT.xftype))));
476
477 // Loop finding match in remaining line (up to remaining len)
478 else while (!regexec0(reg, rline, len-(rline-line), 10, match, mflags)) {
479 mlen = match[0].rm_eo-match[0].rm_so;
480
481 // xform matches ending in / aren't allowed to match entire line
482 if ((command->sflags & SFLAG_slash) && mlen==len) {
483 while (len && ++bonk && line[--len]=='/');
484 continue;
485 }
486
487 mflags = REG_NOTBOL;
488
489 // Zero length matches don't count immediately after a previous match
490 if (!mlen && !zmatch) {
491 if (rline-line == len) break;
492 if (l2) l2[l2used++] = *rline++;
493 zmatch++;
494 continue;
495 } else zmatch = 0;
496
497 // If we're replacing only a specific match, skip if this isn't it
498 off = command->sflags>>8;
499 if (off && off != ++count) {
500 if (l2) memcpy(l2+l2used, rline, match[0].rm_eo);
501 l2used += match[0].rm_eo;
502 rline += match[0].rm_eo;
503
504 continue;
505 }
506 // The fact getline() can allocate unbounded amounts of memory is
507 // a bigger issue, but while we're here check for integer overflow
508 if (match[0].rm_eo > INT_MAX) perror_exit(0);
509
510 // newlen = strlen(new) but with \1 and & and printf escapes
511 for (off = newlen = 0; new[off]; off++) {
512 int cc = -1;
513
514 if (new[off] == '&') cc = 0;
515 else if (new[off] == '\\') cc = new[++off] - '0';
516 if (cc < 0 || cc > 9) {
517 newlen++;
518 continue;
519 }
520 newlen += match[cc].rm_eo-match[cc].rm_so;
521 }
522
523 // Copy changed data to new string
524
525 // Adjust allocation size of new string, copy data we know we'll keep
526 l2l += newlen-mlen;
527 if ((mlen = l2l|0xfff) > l2old) {
528 l2 = xrealloc(l2, ++mlen);
529 if (l2used && !l2old) memcpy(l2, rline-l2used, l2used);
530 l2old = mlen;
531 }
532 if (match[0].rm_so) {
533 memcpy(l2+l2used, rline, match[0].rm_so);
534 l2used += match[0].rm_so;
535 }
536
537 // copy in new replacement text
538 for (off = mlen = 0; new[off]; off++) {
539 int cc = 0, ll;
540
541 if (new[off] == '\\') {
542 cc = new[++off] - '0';
543 if (cc<0 || cc>9) {
544 if (!(l2[l2used+mlen++] = unescape(new[off])))
545 l2[l2used+mlen-1] = new[off];
546
547 continue;
548 } else if (cc > reg->re_nsub) error_exit("no s//\\%d/", cc);
549 } else if (new[off] != '&') {
550 l2[l2used+mlen++] = new[off];
551
552 continue;
553 }
554
555 if (match[cc].rm_so != -1) {
556 ll = match[cc].rm_eo-match[cc].rm_so;
557 memcpy(l2+l2used+mlen, rline+match[cc].rm_so, ll);
558 mlen += ll;
559 }
560 }
561 l2used += newlen;
562 rline += match[0].rm_eo;
563
564 if (!(command->sflags & SFLAG_g)) break;
565 }
566 len += bonk;
567
568 // If we made any changes, finish off l2 and swap it for line
569 if (l2) {
570 // grab trailing unmatched data and null terminator, swap with original
571 mlen = len-(rline-line);
572 memcpy(l2+l2used, rline, mlen+1);
573 len = l2used + mlen;
574 free(line);
575 line = l2;
576 }
577
578 if (mflags) {
579 if (command->sflags & SFLAG_p) emit(line, len, eol);
580
581 tea = 1;
582 if (command->w) goto writenow;
583 }
584 } else if (c=='w') {
585 int fd, noeol;
586 char *name;
587
588 writenow:
589 if (FLAG(tarxform)) error_exit("tilt");
590
591 // Swap out emit() context
592 fd = TT.fdout;
593 noeol = TT.noeol;
594
595 // We save filehandle and newline status before filename
596 name = command->w + (char *)command;
597 memcpy(&TT.fdout, name, 4);
598 name += 4;
599 TT.noeol = *(name++);
600
601 // write, then save/restore context
602 if (emit(line, len, eol))
603 perror_exit("w '%s'", command->arg1+(char *)command);
604 *(--name) = TT.noeol;
605 TT.noeol = noeol;
606 TT.fdout = fd;
607 } else if (c=='x') {
608 long swap = TT.rememberlen;
609
610 str = TT.remember;
611 TT.remember = line;
612 line = str;
613 TT.rememberlen = len;
614 len = swap;
615 } else if (c=='y') {
616 char *from, *to = (char *)command;
617 int i, j;
618
619 from = to+command->arg1;
620 to += command->arg2;
621
622 for (i = 0; i < len; i++) {
623 j = stridx(from, line[i]);
624 if (j != -1) line[i] = to[j];
625 }
626 } else if (c=='=') {
627 sprintf(toybuf, "%ld", TT.count);
628 if (emit(toybuf, strlen(toybuf), 1)) break;
629 }
630
631 command = command->next;
632 }
633
634 done:
635 if (line && !FLAG(n)) emit(line, len, eol);
636
637 // TODO: should "sed -z ax" use \n instead of NUL?
638 if (dlist_terminate(append)) while (append) {
639 struct append *a = append->next;
640
641 if (append->file) {
642 int fd = open(append->str, O_RDONLY);
643
644 // Force newline if noeol pending
645 if (fd != -1) {
646 if (TT.noeol) xwrite(TT.fdout, &TT.delim, 1);
647 TT.noeol = 0;
648 xsendfile(fd, TT.fdout);
649 close(fd);
650 }
651 } else if (append->str) emit(append->str, strlen(append->str), 1);
652 else emit(line, 0, 0);
653 free(append);
654 append = a;
655 }
656 free(line);
657
658 if (TT.tarxlen) {
659 dprintf(TT.fdout, "%08x", --TT.tarxlen);
660 writeall(TT.fdout, TT.tarxform, TT.tarxlen);
661 TT.tarxlen = 0;
662 }
663 }
664
665 // Callback called on each input file
do_sed_file(int fd,char * name)666 static void do_sed_file(int fd, char *name)
667 {
668 char *tmp, *s;
669
670 if (FLAG(i)) {
671 if (!fd) return error_msg("-i on stdin");
672 TT.fdout = copy_tempfile(fd, name, &tmp);
673 }
674 if (FLAG(i) || FLAG(s)) {
675 struct sedcmd *command;
676
677 TT.count = 0;
678 for (command = (void *)TT.pattern; command; command = command->next)
679 command->hit = 0;
680 }
681 do_lines(fd, TT.delim, sed_line);
682 if (FLAG(i)) {
683 if (TT.i && *TT.i) {
684 xrename(name, s = xmprintf("%s%s", name, TT.i));
685 free(s);
686 }
687 replace_tempfile(-1, TT.fdout, &tmp);
688 TT.fdout = 1;
689 }
690 if (FLAG(i) || FLAG(s)) {
691 TT.nextline = 0;
692 TT.nextlen = TT.noeol = 0;
693 }
694 }
695
// Copy chunk of string between two delimiters, converting printf escapes.
//
// If delim is NULL or points to a 0, the first character of *pstr (or the
// character after a leading backslash) is the delimiter, and is saved in
// *delim when delim isn't NULL. Otherwise *delim is the delimiter and the
// chunk starts at *pstr. A delimiter inside a regex [range] does not end
// the chunk.
//
// Returns a malloc()ed processed copy of the chunk with *pstr advanced past
// the closing delimiter, or 0 (with *pstr untouched and nothing left
// allocated) if the string ends before the closing delimiter is found.
static char *unescape_delimited_string(char **pstr, char *delim)
{
  char *to, *from, *buf, mode = 0, d;

  // Grab leading delimiter (if necessary)
  from = *pstr;
  if (!delim || !*delim) {
    if (!(d = *(from++))) return 0;
    if (d == '\\') d = *(from++);
    if (!d || d == '\\') return 0;
    if (delim) *delim = d;
  } else d = *delim;

  // Allocate space for new string: it never comes out longer than the input
  to = buf = xmalloc(strlen(*pstr)+1);

  // mode is 0 outside a bracket expression, ']' inside one, and '.' '=' or
  // ':' inside a [. .] [= =] or [: :] within one.
  while (mode || *from != d) {
    if (!*from) goto fail;

    // delimiter in regex character range doesn't count
    if (*from == '[') {
      if (!mode) {
        mode = ']';
        if (from[1]=='-' || from[1]==']') *(to++) = *(from++);

      // strchr() also matches the terminating NUL, so check for it first
      } else if (mode == ']' && from[1] && strchr(".=:", from[1])) {
        *(to++) = *(from++);
        mode = *from;
      }
    } else if (*from == mode) {
      if (mode == ']') mode = 0;
      else {
        *(to++) = *(from++);
        mode = ']';
      }
    // Length 1 range (X-X with same X) is "undefined" and makes regcomp err,
    // but the perl build does it, so we need to filter it out.
    } else if (mode && *from == '-' && from[-1] == from[1]) {
      from+=2;
      continue;
    } else if (*from == '\\') {
      if (!from[1]) goto fail;

      // Check escaped end delimiter before printf style escapes.
      if (from[1] == d) from++;
      else if (from[1]=='\\') *(to++) = *(from++);
      else {
        char c = unescape(from[1]);

        if (c) {
          *(to++) = c;
          from+=2;
          continue;
        } else if (!mode) *(to++) = *(from++);
      }
    }

    // The branches above may have stepped onto the end of the string (for
    // example a "[:" class closed by ":" with nothing after it): stop here
    // rather than copying the terminator and walking off the end.
    if (!*from) goto fail;
    *(to++) = *(from++);
  }
  *to = 0;
  *pstr = from+1;

  return buf;

fail:
  free(buf);

  return 0;
}
760
761 // Translate pattern strings into command structures. Each command structure
762 // is a single allocation (which requires some math and remalloc at times).
parse_pattern(char ** pline,long len)763 static void parse_pattern(char **pline, long len)
764 {
765 struct sedcmd *command = (void *)TT.pattern;
766 char *line, *reg, c, *errstart;
767 int i;
768
769 line = errstart = pline ? *pline : "";
770 if (len && line[len-1]=='\n') line[--len] = 0;
771
772 // Append this line to previous multiline command? (hit indicates type.)
773 // During parsing "hit" stores data about line continuations, but in
774 // sed_line() it means the match range attached to this command
775 // is active, so processing the continuation must zero it again.
776 if (command && command->prev->hit) {
777 // Remove half-finished entry from list so remalloc() doesn't confuse it
778 TT.pattern = TT.pattern->prev;
779 command = dlist_pop(&TT.pattern);
780 c = command->c;
781 reg = (char *)command;
782 reg += command->arg1 + strlen(reg + command->arg1);
783
784 // Resume parsing for 'a' or 's' command. (Only two that can do this.)
785 // TODO: using 256 to indicate 'a' means our s/// delimiter can't be
786 // a unicode character.
787 if (command->hit < 256) goto resume_s;
788 else goto resume_a;
789 }
790
791 // Loop through commands in this line.
792
793 command = 0;
794 for (;;) {
795 if (command) dlist_add_nomalloc(&TT.pattern, (void *)command);
796
797 // If there's no more data on this line, return.
798 for (;;) {
799 while (isspace(*line) || *line == ';') line++;
800 if (*line == '#') while (*line && *line != '\n') line++;
801 else break;
802 }
803 if (!*line) return;
804
805 if (FLAG(tarxform) && strstart(&line, "flags=")) {
806 TT.xflags = 7;
807 while (0<=(i = stridx("rRsShH", *line))) {
808 if (i&1) TT.xflags |= 1<<(i>>1);
809 else TT.xflags &= ~(1<<(i>>1));
810 line++;
811 }
812 continue;
813 }
814
815 // Start by writing data into toybuf.
816
817 errstart = line;
818 memset(toybuf, 0, sizeof(struct sedcmd));
819 command = (void *)toybuf;
820 reg = toybuf + sizeof(struct sedcmd);
821
822 // Parse address range (if any)
823 for (i = 0; i < 2; i++) {
824 if (*line == ',') line++;
825 else if (i) break;
826
827 if (i && *line == '+' && isdigit(line[1])) {
828 line++;
829 command->lmatch[i] = -2-strtol(line, &line, 0);
830 } else if (isdigit(*line)) command->lmatch[i] = strtol(line, &line, 0);
831 else if (*line == '$') {
832 command->lmatch[i] = -1;
833 line++;
834 } else if (*line == '/' || *line == '\\') {
835 char *s = line;
836
837 if (!(s = unescape_delimited_string(&line, 0))) goto error;
838 if (!*s) command->rmatch[i] = 0;
839 else {
840 xregcomp((void *)reg, s, REG_EXTENDED*FLAG(r));
841 command->rmatch[i] = reg-toybuf;
842 reg += sizeof(regex_t);
843 }
844 free(s);
845 } else break;
846 }
847
848 while (isspace(*line)) line++;
849 if (!*line) break;
850
851 if (*line == '!') {
852 command->not = 1;
853 line++;
854 }
855 while (isspace(*line)) line++;
856 if (!*line) break;
857
858 c = command->c = *(line++);
859 if (strchr("}:", c) && i) break;
860 if (strchr("aiqQr=", c) && i>1) break;
861
862 // Allocate memory and copy out of toybuf now that we know how big it is
863 command = xmemdup(toybuf, reg-toybuf);
864 reg = (reg-toybuf) + (char *)command;
865
866 // Parse arguments by command type
867 if (c == '{') TT.nextlen++;
868 else if (c == '}') {
869 if (!TT.nextlen--) break;
870 } else if (c == 's') {
871 char *end, delim = 0;
872 int flags;
873
874 // s/pattern/replacement/flags
875
876 // line continuations use arg1 (back at the start of the function),
877 // so let's fill out arg2 first (since the regex part can't be multiple
878 // lines) and swap them back later.
879
880 // get pattern (just record, we parse it later)
881 command->arg2 = reg - (char *)command;
882 if (!(TT.remember = unescape_delimited_string(&line, &delim)))
883 goto error;
884
885 reg += sizeof(regex_t);
886 command->arg1 = reg-(char *)command;
887 command->hit = delim;
888 resume_s:
889 // get replacement - don't replace escapes yet because \1 and \& need
890 // processing later, after we replace \\ with \ we can't tell \\1 from \1
891 end = line;
892 while (*end != command->hit) {
893 if (!*end) goto error;
894 if (*end++ == '\\') {
895 if (!*end || *end == '\n') {
896 end[-1] = '\n';
897 break;
898 }
899 end++;
900 }
901 }
902
903 reg = extend_string((void *)&command, line, reg-(char *)command,end-line);
904 line = end;
905 // line continuation? (note: '\n' can't be a valid delim).
906 if (*line == command->hit) command->hit = 0;
907 else {
908 if (!*line) continue;
909 reg--;
910 line++;
911 goto resume_s;
912 }
913
914 // swap arg1/arg2 so they're back in order arguments occur.
915 i = command->arg1;
916 command->arg1 = command->arg2;
917 command->arg2 = i;
918 command->sflags = TT.xflags*SFLAG_R;
919
920 // get flags
921 for (line++; *line; line++) {
922 long l;
923
924 if (isspace(*line) && *line != '\n') continue;
925 if (0 <= (l = stridx("igpx", *line))) command->sflags |= 1<<l;
926 else if (*line == 'I') command->sflags |= 1<<0;
927 else if (FLAG(tarxform) && 0 <= (l = stridx("RSH", *line)))
928 command->sflags |= SFLAG_R<<l;
929 // Given that the default is rsh all enabled... why do these exist?
930 else if (FLAG(tarxform) && 0 <= (l = stridx("rsh", *line)))
931 command->sflags &= ~(SFLAG_R<<l);
932 else if (!(command->sflags>>8) && 0<(l = strtol(line, &line, 10))) {
933 command->sflags |= l << 8;
934 line--;
935 } else break;
936 }
937 flags = (FLAG(r) || (command->sflags & SFLAG_x)) ? REG_EXTENDED : 0;
938 if (command->sflags & SFLAG_i) flags |= REG_ICASE;
939
940 // We deferred actually parsing the regex until we had the s///i flag
941 // allocating the space was done by extend_string() above
942 if (!*TT.remember) command->arg1 = 0;
943 else {
944 xregcomp((void *)(command->arg1+(char *)command), TT.remember, flags);
945 if (FLAG(tarxform) && TT.remember[strlen(TT.remember)-1]=='/')
946 command->sflags |= SFLAG_slash;
947 }
948 free(TT.remember);
949 TT.remember = 0;
950 if (*line == 'w') {
951 line++;
952 goto writenow;
953 }
954 } else if (c == 'w') {
955 int fd, delim;
956 char *cc;
957
958 // Since s/// uses arg1 and arg2, and w needs a persistent filehandle and
959 // eol status, and to retain the filename for error messages, we'd need
960 // to go up to arg5 just for this. Compromise: dynamically allocate the
961 // filehandle and eol status.
962
963 writenow:
964 while (isspace(*line)) line++;
965 if (!*line) goto error;
966 for (cc = line; *cc; cc++) if (*cc == '\\' && cc[1] == ';') break;
967 delim = *cc;
968 *cc = 0;
969 fd = xcreate(line, O_WRONLY|O_CREAT|O_TRUNC|O_APPEND, 0644);
970 *cc = delim;
971
972 command->w = reg - (char *)command;
973 command = xrealloc(command, command->w+(cc-line)+6);
974 reg = command->w + (char *)command;
975
976 memcpy(reg, &fd, 4);
977 reg += 4;
978 *(reg++) = 0;
979 memcpy(reg, line, delim);
980 reg += delim;
981 *(reg++) = 0;
982
983 line = cc;
984 if (delim) line += 2;
985 } else if (c == 'y') {
986 char *s, delim = 0;
987 int len;
988
989 if (!(s = unescape_delimited_string(&line, &delim))) goto error;
990 command->arg1 = reg-(char *)command;
991 len = strlen(s);
992 reg = extend_string((void *)&command, s, reg-(char *)command, len);
993 free(s);
994 command->arg2 = reg-(char *)command;
995 if (!(s = unescape_delimited_string(&line, &delim))) goto error;
996 if (len != strlen(s)) goto error;
997 reg = extend_string((void *)&command, s, reg-(char*)command, len);
998 free(s);
999 } else if (strchr("abcirtTqQw:", c)) {
1000 int end;
1001
1002 // trim leading spaces
1003 while (isspace(*line) && *line != '\n') line++;
1004
1005 // Resume logic differs from 's' case because we don't add a newline
1006 // unless it's after something, so we add it on return instead.
1007 resume_a:
1008 command->hit = 0;
1009
1010 // btTqQ: end with space or semicolon, aicrw continue to newline.
1011 if (!(end = strcspn(line, strchr(":btTqQ", c) ? "}; \t\r\n\v\f" : "\n"))){
1012 // Argument's optional for btTqQ
1013 if (strchr("btTqQ", c)) continue;
1014 else if (!command->arg1) break;
1015 }
1016 // Error checking: qQ can only have digits after them
1017 if (c=='q' || c=='Q') {
1018 for (i = 0; i<end && isdigit(line[i]); i++);
1019 if (i != end) {
1020 line += i;
1021 break;
1022 }
1023 }
1024
1025 // Extend allocation to include new string. We use offsets instead of
1026 // pointers so realloc() moving stuff doesn't break things. Ok to write
1027 // \n over NUL terminator because call to extend_string() adds it back.
1028 if (!command->arg1) command->arg1 = reg - (char*)command;
1029 else if (*(command->arg1+(char *)command)) *(reg++) = '\n';
1030 else if (!pline) {
1031 command->arg1 = 0;
1032 continue;
1033 }
1034 reg = extend_string((void *)&command, line, reg - (char *)command, end);
1035
1036 // Recopy data to remove escape sequences and handle line continuation.
1037 if (strchr("aci", c)) {
1038 reg -= end+1;
1039 for (i = end; i; i--) {
1040 if ((*reg++ = *line++)=='\\') {
1041
1042 // escape at end of line: resume if -e escaped literal newline,
1043 // else request callback and resume with next line
1044 if (!--i) {
1045 *--reg = 0;
1046 if (*line) {
1047 line++;
1048 goto resume_a;
1049 }
1050 command->hit = 256;
1051 break;
1052 }
1053 if (!(reg[-1] = unescape(*line))) reg[-1] = *line;
1054 line++;
1055 }
1056 }
1057 *reg = 0;
1058 } else line += end;
1059
1060 // Commands that take no arguments
1061 } else if (!strchr("{dDgGhHlnNpPx=", c)) break;
1062 }
1063
1064 error:
1065 error_exit("bad pattern '%s'@%ld (%c)", errstart, line-errstart+1L, *line);
1066 }
1067
// Does the pointer "find" point into the string "range"? (Anywhere from its
// first character up to and including its NUL terminator.)
static int instr(char *find, char *range)
{
  if (find < range) return 0;

  return find <= range+strlen(range);
}
1073
sed_main(void)1074 void sed_main(void)
1075 {
1076 char **args = toys.optargs, **aa;
1077
1078 if (FLAG(tarxform)) toys.optflags |= FLAG_z;
1079 if (!FLAG(z)) TT.delim = '\n';
1080
1081 // Parse pattern into commands.
1082
1083 // If no -e or -f, first argument is the pattern.
1084 if (!TT.e && !TT.f) {
1085 if (!*toys.optargs) error_exit("no pattern");
1086 (TT.e = xzalloc(sizeof(struct arg_list)))->arg = *(args++);
1087 }
1088
1089 // -e and -f care about order, so use argv[] to recreate original order
1090 for (aa = toys.argv+1; *aa; aa++) {
1091 if (TT.e && instr(TT.e->arg, *aa)) {
1092 parse_pattern(&TT.e->arg, strlen(TT.e->arg));
1093 free(llist_pop(&TT.e));
1094 }
1095 if (TT.f && instr(TT.f->arg, *aa)) {
1096 do_lines(xopenro(TT.f->arg), TT.delim, parse_pattern);
1097 free(llist_pop(&TT.f));
1098 }
1099 }
1100 parse_pattern(0, 0);
1101 dlist_terminate(TT.pattern);
1102 if (TT.nextlen) error_exit("no }");
1103
1104 TT.fdout = 1;
1105 TT.remember = xstrdup("");
1106
1107 // Inflict pattern upon input files. Long version because !O_CLOEXEC
1108 loopfiles_rw(args, O_RDONLY|WARN_ONLY, 0, do_sed_file);
1109
1110 // Provide EOF flush at end of cumulative input for non-i mode.
1111 if (!FLAG(i) && !FLAG(s)) {
1112 toys.optflags |= FLAG_s;
1113 sed_line(0, 0);
1114 }
1115
1116 // TODO: need to close fd when done for TOYBOX_FREE?
1117 }
1118