1 /* awk.c - An awk implementation.
2 * vi: tabstop=2 softtabstop=2 shiftwidth=2
3 *
4 * Copyright 2024 Ray Gardner <[email protected]>
5 *
6 * See https://pubs.opengroup.org/onlinepubs/9799919799/utilities/awk.html
7 *
8 * Deviations from posix: Don't handle LANG, LC_ALL, etc.
9 * Accept regex for RS
10 * Bitwise functions (from gawk): and, or, xor, lshift, rshift
11 * Attempt to follow tradition (nawk, gawk) where it departs from posix
12 *
13 * TODO: Lazy field splitting; improve performance; more testing/debugging
14
15 USE_AWK(NEWTOY(awk, "F:v*f*bc", TOYFLAG_USR|TOYFLAG_BIN|TOYFLAG_LINEBUF))
16
17 config AWK
18 bool "awk"
19 default n
20 help
21 usage: awk [-F sepstring] [-v assignment]... program [argument...]
22 or:
23 awk [-F sepstring] -f progfile [-f progfile]... [-v assignment]...
24 [argument...]
25 also:
26 -b : count bytes, not characters (experimental)
27 -c : compile only, do not run
28 */
29
30 #define FOR_awk
31 #include "toys.h"
32
33 GLOBALS(
34 struct arg_list *f;
35 struct arg_list *v;
36 char *F;
37
38 struct scanner_state {
39 char *p;
40 char *progstring;
41 struct arg_list *prog_args;
42 char *filename;
43 char *line;
44 size_t line_size;
45 ssize_t line_len;
46 int line_num;
47 int ch;
48 FILE *fp;
49 // state includes latest token seen
50 int tok;
51 int tokbuiltin;
52 int toktype;
53 char *tokstr;
54 size_t maxtok;
55 size_t toklen;
56 double numval;
57 int error; // Set if lexical error.
58 } *scs;
59 char *tokstr;
60 int prevtok;
61
62 struct compiler_globals {
63 int in_print_stmt;
64 int paren_level;
65 int in_function_body;
66 int funcnum;
67 int nparms;
68 int compile_error_count;
69 int first_begin;
70 int last_begin;
71 int first_end;
72 int last_end;
73 int first_recrule;
74 int last_recrule;
75 int break_dest;
76 int continue_dest;
77 int stack_offset_to_fix; // fixup stack if return in for(e in a)
78 int range_pattern_num;
79 int rule_type; // tkbegin, tkend, or 0
80 } cgl;
81
82 // zvalue: the main awk value type
83 // Can be number or string or both, or else map (array) or regex
84 struct zvalue {
85 unsigned flags;
86 double num;
87 union { // anonymous union not in C99; not going to fix it now.
88 struct zstring *vst;
89 struct zmap *map;
90 regex_t *rx;
91 };
92 } nozvalue; // to shut up compiler warning TODO FIXME
93
94 struct runtime_globals {
95 struct zvalue cur_arg;
96 FILE *fp; // current data file
97 int narg; // cmdline arg index
98 int nfiles; // num of cmdline data file args processed
99 int eof; // all cmdline files (incl. stdin) read
100 char *recptr;
101 struct zstring *zspr; // Global to receive sprintf() string value
102 } rgl;
103
104 // Expanding sequential list
105 struct zlist {
106 char *base, *limit, *avail;
107 size_t size;
108 } globals_table, // global symbol table
109 locals_table, // local symbol table
110 func_def_table; // function symbol table
111 // runtime lists
112 struct zlist literals, fields, zcode, stack;
113
114 char *progname;
115
116 int spec_var_limit;
117 int zcode_last;
118 struct zvalue *stackp; // top of stack ptr
119
120 char *pbuf; // Used for number formatting in num_to_zstring()
121 #define RS_MAX 64
122 char rs_last[RS_MAX];
123 regex_t rx_rs_default, rx_rs_last;
124 regex_t rx_default, rx_last, rx_printf_fmt;
125 #define FS_MAX 64
126 char fs_last[FS_MAX];
127 char one_char_fs[4];
128 int nf_internal; // should match NF
129 char range_sw[64]; // FIXME TODO quick and dirty set of range switches
130 int file_cnt, std_file_cnt;
131
132 struct zfile {
133 struct zfile *next;
134 char *fn;
135 FILE *fp;
136 char mode; // w, a, or r
137 char file_or_pipe; // 1 if file, 0 if pipe
138 char is_tty, is_std_file;
139 char eof;
140 int ro, lim, buflen;
141 char *buf;
142 } *zfiles, *cfile, *zstdout;
143 )
144
awk_exit(int status)145 static void awk_exit(int status)
146 {
147 toys.exitval = status;
148 xexit();
149 }
// Marker for a deliberate switch-case fallthrough. Expands to the GNU
// fallthrough attribute when __GNUC__ is defined, and to nothing otherwise.
#ifndef __GNUC__
#define ATTR_FALLTHROUGH_INTENDED
#else
#define ATTR_FALLTHROUGH_INTENDED __attribute__ ((fallthrough))
#endif
155
156 ////////////////////
157 //// declarations
158 ////////////////////
159
#define PBUFSIZE 512 // For num_to_zstring()

// Coarse token classes stored in the scanner's toktype field.
enum toktypes {
  // EOF (use -1 from stdio.h)
  ERROR = 2,
  NEWLINE,
  VAR,
  NUMBER,
  STRING,
  REGEX,
  USERFUNC,
  BUILTIN,
  TOKEN,
  KEYWORD
};
167
// Token numbers. The operator, keyword and builtin groups follow the order of
// the ops, keywords and builtins lookup strings, because the scanner turns a
// match position in those strings into a token number.
// Must align with lbp_table[]
enum tokens {
  tkunusedtoken, tkeof, tkerr, tknl,
  tkvar, tknumber, tkstring, tkregex, tkfunc, tkbuiltin,

  // Operators:
  //   ; , [ ] ( ) { } $ ++ -- ^ ! * / % + -
  tksemi, tkcomma, tklbracket, tkrbracket, tklparen, tkrparen,
  tklbrace, tkrbrace, tkfield, tkincr, tkdecr, tkpow, tknot,
  tkmul, tkdiv, tkmod, tkplus, tkminus,
  // !!! Fake operator for concatenation (just adjacent string exprs)
  tkcat,
  //   < <= != == > >= ~ !~ && || ? : ^= %= *= /= += -= = >> |
  tklt, tkle, tkne, tkeq, tkgt, tkge, tkmatchop, tknotmatch,
  tkand, tkor, tkternif, tkternelse,
  tkpowasgn, tkmodasgn, tkmulasgn, tkdivasgn, tkaddasgn, tksubasgn,
  tkasgn, tkappend, tkpipe,

  // Keywords:
  //   in BEGIN END if else while for do break continue exit function
  //   return next nextfile delete print printf getline
  tkin, tkbegin, tkend, tkif, tkelse,
  tkwhile, tkfor, tkdo, tkbreak, tkcontinue, tkexit, tkfunction,
  tkreturn, tknext, tknextfile, tkdelete, tkprint, tkprintf, tkgetline,

  // Builtin functions:
  //   atan2 cos sin exp log sqrt int rand srand length
  //   tolower toupper system fflush and or xor lshift rshift
  tkatan2, tkcos, tksin, tkexp, tklog, tksqrt, tkint, tkrand, tksrand,
  tklength, tktolower, tktoupper, tksystem, tkfflush,
  tkband, tkbor, tkbxor, tklshift, tkrshift,

  // Special functions:
  //   close index match split sub gsub sprintf substr
  tkclose, tkindex, tkmatch, tksplit,
  tksub, tkgsub, tksprintf, tksubstr,

  tklasttk
};
203
204 enum opcodes {
205 opunusedop = tklasttk,
206 opvarref, opmapref, opfldref, oppush, opdrop, opdrop_n, opnotnot,
207 oppreincr, oppredecr, oppostincr, oppostdecr, opnegate, opjump, opjumptrue,
208 opjumpfalse, opprepcall, opmap, opmapiternext, opmapdelete, opmatchrec,
209 opquit, opprintrec, oprange1, oprange2, oprange3, oplastop
210 };
211
// Special variables (POSIX), numbered from 1.
// Must align with char *spec_vars[]
enum spec_var_names {
  ARGC = 1, ARGV, CONVFMT, ENVIRON,
  FILENAME, FNR, FS, NF,
  NR, OFMT, OFS, ORS,
  RLENGTH, RS, RSTART, SUBSEP
};
215
// Global symbol table entry; the GLOBAL and LOCAL macros view their tables
// as arrays of these.
struct symtab_slot {
  unsigned flags;   // ZF_* flag bits
  char *name;
};
220
// zstring: flexible, reference-counted string type.
// Capacity must be > size because we insert a NUL byte.
// A refcnt of 0 is treated as "no other holder": zstring_release() frees the
// string when it finds 0, and zstring_update() copies when it is nonzero.
struct zstring {
  int refcnt;
  unsigned size;      // bytes in use, not counting the NUL
  unsigned capacity;  // bytes available in str[]
  char str[];         // C99 flexible array member
};
229
// Flag bits for zvalue and symbol tables
#define ZF_MAYBEMAP (1u << 1)
#define ZF_MAP      (1u << 2)
#define ZF_SCALAR   (1u << 3)
#define ZF_NUM      (1u << 4)
#define ZF_RX       (1u << 5)
#define ZF_STR      (1u << 6)
#define ZF_NUMSTR   (1u << 7)  // "numeric string" per posix
#define ZF_REF      (1u << 9)  // for lvalues
#define ZF_MAPREF   (1u << 10) // for lvalues
#define ZF_FIELDREF (1u << 11) // for lvalues
#define ZF_EMPTY_RX (1u << 12)
#define ZF_ANYMAP   (ZF_MAP | ZF_MAYBEMAP)

// Macro to help facilitate possible future change in zvalue layout.
// Initializer order is flags, num, then the first member of the union.
#define ZVINIT(flags, num, ptr) {(flags), (double)(num), {(ptr)}}

// Flag tests on a zvalue pointer; each yields the masked bit, not 0/1.
#define IS_STR(zvalp) ((zvalp)->flags & ZF_STR)
#define IS_RX(zvalp) ((zvalp)->flags & ZF_RX)
#define IS_NUM(zvalp) ((zvalp)->flags & ZF_NUM)
#define IS_MAP(zvalp) ((zvalp)->flags & ZF_MAP)
#define IS_EMPTY_RX(zvalp) ((zvalp)->flags & ZF_EMPTY_RX)

// Typed views of the zlist tables held in TT.
#define GLOBAL      ((struct symtab_slot *)TT.globals_table.base)
#define LOCAL       ((struct symtab_slot *)TT.locals_table.base)
#define FUNC_DEF    ((struct functab_slot *)TT.func_def_table.base)

#define LITERAL     ((struct zvalue *)TT.literals.base)
#define STACK       ((struct zvalue *)TT.stack.base)
#define FIELD       ((struct zvalue *)TT.fields.base)

#define ZCODE       ((int *)TT.zcode.base)

// Flag bits for functab_slot.flags
#define FUNC_DEFINED    (1u)
#define FUNC_CALLED     (2u)

#define MIN_STACK_LEFT 1024
267
268 struct functab_slot { // function symbol table entry
269 unsigned flags;
270 char *name;
271 struct zlist function_locals;
272 int zcode_addr;
273 };
274
275 // Elements of the hash table (key/value pairs)
276 struct zmap_slot {
277 int hash; // store hash key to speed hash table expansion
278 struct zstring *key;
279 struct zvalue val;
280 };
281 #define ZMSLOTINIT(hash, key, val) {hash, key, val}
282
283 // zmap: Mapping data type for arrays; a hash table. Values in hash are either
284 // 0 (unused), -1 (marked deleted), or one plus the number of the zmap slot
285 // containing a key/value pair. The zlist slot entries are numbered from 0 to
286 // count-1, so need to add one to distinguish from unused. The probe sequence
287 // is borrowed from Python dict, using the "perturb" idea to mix in upper bits
288 // of the original hash value.
289 struct zmap {
290 unsigned mask; // tablesize - 1; tablesize is 2 ** n
291 int *hash; // (mask + 1) elements
292 int limit; // 80% of table size ((mask+1)*8/10)
293 int count; // number of occupied slots in hash
294 int deleted; // number of deleted slots
295 struct zlist slot; // expanding list of zmap_slot elements
296 };
297
// View of a map's slot list; expects a 'struct zmap *m' in the using scope.
#define MAPSLOT    ((struct zmap_slot *)(m->slot).base)

// Error reporting wrappers around zzerr(). A leading '$' in the format tells
// zzerr() that the error is fatal.
#define FFATAL(format, ...) zzerr("$" format, __VA_ARGS__)
#define FATAL(...) zzerr("$%s\n", __VA_ARGS__)
#define XERR(format, ...) zzerr(format, __VA_ARGS__)

#define NO_EXIT_STATUS  (9999987)  // value unlikely to appear in exit stmt
304
305
306
307 ////////////////////
308 //// lib
309 ////////////////////
310
// Release memory through free(); a single place to hook deallocation.
static void xfree(void *p)
{
  free(p);
}
315
// Return the numeric value (0..15) of hex digit c.
// Assumes c is valid hex digit
static int hexval(int c)
{
  if (isdigit(c)) return c - '0';
  // OR-ing in 040 folds 'A'..'F' onto 'a'..'f'.
  return (c | 040) - 'a' + 10;
}
321
322 ////////////////////
323 //// common defs
324 ////////////////////
325
// Lookup tables for the scanner. These (ops, keywords, builtins) must align
// with enum tokens: entry k of ops is token tksemi+k, entry k of keywords is
// tkin+k, and entry k of builtins is tkatan2+k.
//
// Layout is significant. Each string is one leading blank followed by
// fixed-width cells (3 columns for ops, 10 for keywords and builtins), each
// holding a name padded on the right with blanks. A name is looked up by
// searching for " name " with strstr() and dividing the match offset by the
// cell width, so the padding must not be collapsed or re-flowed.
static char *ops = " ;  ,  [  ]  (  )  {  }  $  ++ -- ^  !  *  /  %  +  -  .. "
    "<  <= != == >  >= ~  !~ && || ?  :  ^= %= *= /= += -= =  >> |  ";

static char *keywords = " in        BEGIN     END       if        else      "
    "while     for       do        break     continue  exit      function  "
    "return    next      nextfile  delete    print     printf    getline   ";

static char *builtins = " atan2     cos       sin       exp       log       "
    "sqrt      int       rand      srand     length    "
    "tolower   toupper   system    fflush    "
    "and       or        xor       lshift    rshift    "
    "close     index     match     split     "
    "sub       gsub      sprintf   substr    ";
340
zzerr(char * format,...)341 static void zzerr(char *format, ...)
342 {
343 va_list args;
344 int fatal_sw = 0;
345 fprintf(stderr, "%s: ", TT.progname);
346 if (format[0] == '$') {
347 fprintf(stderr, "FATAL: ");
348 format++;
349 fatal_sw = 1;
350 }
351 fprintf(stderr, "file %s line %d: ", TT.scs->filename, TT.scs->line_num);
352 va_start(args, format);
353 vfprintf(stderr, format, args);
354 va_end(args);
355 if (format[strlen(format)-1] != '\n') fputc('\n', stderr); // TEMP FIXME !!!
356 fflush(stderr);
357 if (fatal_sw) awk_exit(2);
358 // Don't bump error count for warnings
359 else if (!strstr(format, "arning")) TT.cgl.compile_error_count++;
360 }
361
get_token_text(char * op,int tk)362 static void get_token_text(char *op, int tk)
363 {
364 // This MUST ? be changed if ops string or tk... assignments change!
365 memmove(op, ops + 3 * (tk - tksemi) + 1, 2);
366 op[ op[1] == ' ' ? 1 : 2 ] = 0;
367 }
368
369 ////////////////////
370 /// UTF-8
371 ////////////////////
372
373 // Return number of bytes in 'cnt' utf8 codepoints
bytesinutf8(char * str,size_t len,size_t cnt)374 static int bytesinutf8(char *str, size_t len, size_t cnt)
375 {
376 if (FLAG(b)) return cnt;
377 unsigned wch;
378 char *lim = str + len, *s0 = str;
379 while (cnt-- && str < lim) {
380 int r = utf8towc(&wch, str, lim - str);
381 str += r > 0 ? r : 1;
382 }
383 return str - s0;
384 }
385
386 // Return number of utf8 codepoints in str
utf8cnt(char * str,size_t len)387 static int utf8cnt(char *str, size_t len)
388 {
389 unsigned wch;
390 int cnt = 0;
391 char *lim;
392 if (!len || FLAG(b)) return len;
393 for (lim = str + len; str < lim; cnt++) {
394 int r = utf8towc(&wch, str, lim - str);
395 str += r > 0 ? r : 1;
396 }
397 return cnt;
398 }
399
400 ////////////////////
401 //// zlist
402 ////////////////////
403
zlist_initx(struct zlist * p,size_t size,size_t count)404 static struct zlist *zlist_initx(struct zlist *p, size_t size, size_t count)
405 {
406 p->base = p->avail = xzalloc(count * size);
407 p->limit = p->base + size * count;
408 p->size = size;
409 return p;
410 }
411
// Set up p as an empty list of 'size'-byte elements, with as many initial
// elements as fit in SLIST_MAX_INIT_BYTES. Returns p.
static struct zlist *zlist_init(struct zlist *p, size_t size)
{
#define SLIST_MAX_INIT_BYTES 128
  size_t count = SLIST_MAX_INIT_BYTES / size;

  return zlist_initx(p, size, count);
}
417
418 // This is called from zlist_append() and add_stack() in run
zlist_expand(struct zlist * p)419 static void zlist_expand(struct zlist *p)
420 {
421 size_t offset = p->avail - p->base;
422 size_t cap = p->limit - p->base;
423 size_t newcap = maxof(cap + p->size, ((cap / p->size) * 3 / 2) * p->size);
424 if (newcap <= cap) error_exit("mem req error");
425 char *base = xrealloc(p->base, newcap);
426 p->base = base;
427 p->limit = base + newcap;
428 p->avail = base + offset;
429 }
430
zlist_append(struct zlist * p,void * obj)431 static size_t zlist_append(struct zlist *p, void *obj)
432 {
433 // Insert obj (p->size bytes) at end of list, expand as needed.
434 // Return scaled offset to newly inserted obj; i.e. the
435 // "slot number" 0, 1, 2,...
436 void *objtemp = 0;
437 if (p->avail > p->limit - p->size) {
438 objtemp = xmalloc(p->size); // Copy obj in case it is in
439 memmove(objtemp, obj, p->size); // the area realloc might free!
440 obj = objtemp;
441 zlist_expand(p);
442 }
443 memmove(p->avail, obj, p->size);
444 if (objtemp) xfree(objtemp);
445 p->avail += p->size;
446 return (p->avail - p->base - p->size) / p->size; // offset of updated slot
447 }
448
zlist_len(struct zlist * p)449 static int zlist_len(struct zlist *p)
450 {
451 return (p->avail - p->base) / p->size;
452 }
453
454 ////////////////////
455 //// zstring
456 ////////////////////
457
zstring_release(struct zstring ** s)458 static void zstring_release(struct zstring **s)
459 {
460 if (*s && (**s).refcnt-- == 0) xfree(*s); //free_zstring(s);
461 *s = 0;
462 }
463
zstring_incr_refcnt(struct zstring * s)464 static void zstring_incr_refcnt(struct zstring *s)
465 {
466 if (s) s->refcnt++;
467 }
468
469 // !! Use only if 'to' is NULL or its refcnt is 0.
zstring_modify(struct zstring * to,size_t at,char * s,size_t n)470 static struct zstring *zstring_modify(struct zstring *to, size_t at, char *s, size_t n)
471 {
472 size_t cap = at + n + 1;
473 if (!to || to->capacity < cap) {
474 to = xrealloc(to, sizeof(*to) + cap);
475 to->capacity = cap;
476 to->refcnt = 0;
477 }
478 memcpy(to->str + at, s, n);
479 to->size = at + n;
480 to->str[to->size] = '\0';
481 return to;
482 }
483
484 // The 'to' pointer may move by realloc, so return (maybe updated) pointer.
485 // If refcnt is nonzero then there is another pointer to this zstring,
486 // so copy this one and release it. If refcnt is zero we can mutate this.
zstring_update(struct zstring * to,size_t at,char * s,size_t n)487 static struct zstring *zstring_update(struct zstring *to, size_t at, char *s, size_t n)
488 {
489 if (to && to->refcnt) {
490 struct zstring *to_before = to;
491 to = zstring_modify(0, 0, to->str, to->size);
492 zstring_release(&to_before);
493 }
494 return zstring_modify(to, at, s, n);
495 }
496
zstring_copy(struct zstring * to,struct zstring * from)497 static struct zstring *zstring_copy(struct zstring *to, struct zstring *from)
498 {
499 return zstring_update(to, 0, from->str, from->size);
500 }
501
zstring_extend(struct zstring * to,struct zstring * from)502 static struct zstring *zstring_extend(struct zstring *to, struct zstring *from)
503 {
504 return zstring_update(to, to->size, from->str, from->size);
505 }
506
// Allocate a new zstring holding a copy of the 'size' bytes at s.
static struct zstring *new_zstring(char *s, size_t size)
{
  struct zstring *fresh = zstring_modify(0, 0, s, size);

  return fresh;
}
511
512 ////////////////////
513 //// zvalue
514 ////////////////////
515
516 static struct zvalue uninit_zvalue = ZVINIT(0, 0.0, 0);
517
518 // This will be reassigned in init_globals() with an empty string.
519 // It's a special value used for "uninitialized" field vars
520 // referenced past $NF. See push_field().
521 static struct zvalue uninit_string_zvalue = ZVINIT(0, 0.0, 0);
522
new_str_val(char * s)523 static struct zvalue new_str_val(char *s)
524 {
525 // Only if no nul inside string!
526 struct zvalue v = ZVINIT(ZF_STR, 0.0, new_zstring(s, strlen(s)));
527 return v;
528 }
529
zvalue_release_zstring(struct zvalue * v)530 static void zvalue_release_zstring(struct zvalue *v)
531 {
532 if (v && ! (v->flags & (ZF_ANYMAP | ZF_RX))) zstring_release(&v->vst);
533 }
534
535 // push_val() is used for initializing globals (see init_compiler())
536 // but mostly used in runtime
537 // WARNING: push_val may change location of v, so do NOT depend on it after!
538 // Note the incr refcnt used to be after the zlist_append, but that caused a
539 // heap-use-after-free error when the zlist_append relocated the zvalue being
540 // pushed, invalidating the v pointer.
push_val(struct zvalue * v)541 static void push_val(struct zvalue *v)
542 {
543 if (IS_STR(v) && v->vst) v->vst->refcnt++; // inlined zstring_incr_refcnt()
544 *++TT.stackp = *v;
545 }
546
zvalue_copy(struct zvalue * to,struct zvalue * from)547 static void zvalue_copy(struct zvalue *to, struct zvalue *from)
548 {
549 if (IS_RX(from)) *to = *from;
550 else {
551 zvalue_release_zstring(to);
552 *to = *from;
553 zstring_incr_refcnt(to->vst);
554 }
555 }
556
zvalue_dup_zstring(struct zvalue * v)557 static void zvalue_dup_zstring(struct zvalue *v)
558 {
559 struct zstring *z = new_zstring(v->vst->str, v->vst->size);
560 zstring_release(&v->vst);
561 v->vst = z;
562 }
563
564 ////////////////////
565 //// zmap (array) implementation
566 ////////////////////
567
zstring_match(struct zstring * a,struct zstring * b)568 static int zstring_match(struct zstring *a, struct zstring *b)
569 {
570 return a->size == b->size && memcmp(a->str, b->str, a->size) == 0;
571 }
572
zstring_hash(struct zstring * s)573 static int zstring_hash(struct zstring *s)
574 { // djb2 -- small, fast, good enough for this
575 unsigned h = 5381;
576 char *p = s->str, *lim = p + s->size;
577 while (p < lim)
578 h = (h << 5) + h + *p++;
579 return h;
580 }
581
582 enum { PSHIFT = 5 }; // "perturb" shift -- see find_mapslot() below
583
find_mapslot(struct zmap * m,struct zstring * key,int * hash,int * probe)584 static struct zmap_slot *find_mapslot(struct zmap *m, struct zstring *key, int *hash, int *probe)
585 {
586 struct zmap_slot *x = 0;
587 unsigned perturb = *hash = zstring_hash(key);
588 *probe = *hash & m->mask;
589 int n, first_deleted = -1;
590 while ((n = m->hash[*probe])) {
591 if (n > 0) {
592 x = &MAPSLOT[n-1];
593 if (*hash == x->hash && zstring_match(key, x->key)) {
594 return x;
595 }
596 } else if (first_deleted < 0) first_deleted = *probe;
597 // Based on technique in Python dict implementation. Comment there
598 // (https://github.com/python/cpython/blob/3.10/Objects/dictobject.c)
599 // says
600 //
601 // j = ((5*j) + 1) mod 2**i
602 // For any initial j in range(2**i), repeating that 2**i times generates
603 // each int in range(2**i) exactly once (see any text on random-number
604 // generation for proof).
605 //
606 // The addition of 'perturb' greatly improves the probe sequence. See
607 // the Python dict implementation for more details.
608 *probe = (*probe * 5 + 1 + (perturb >>= PSHIFT)) & m->mask;
609 }
610 if (first_deleted >= 0) *probe = first_deleted;
611 return 0;
612 }
613
zmap_find(struct zmap * m,struct zstring * key)614 static struct zvalue *zmap_find(struct zmap *m, struct zstring *key)
615 {
616 int hash, probe;
617 struct zmap_slot *x = find_mapslot(m, key, &hash, &probe);
618 return x ? &x->val : 0;
619 }
620
zmap_init(struct zmap * m)621 static void zmap_init(struct zmap *m)
622 {
623 enum {INIT_SIZE = 8};
624 m->mask = INIT_SIZE - 1;
625 m->hash = xzalloc(INIT_SIZE * sizeof(*m->hash));
626 m->limit = INIT_SIZE * 8 / 10;
627 m->count = 0;
628 m->deleted = 0;
629 zlist_init(&m->slot, sizeof(struct zmap_slot));
630 }
631
zvalue_map_init(struct zvalue * v)632 static void zvalue_map_init(struct zvalue *v)
633 {
634 struct zmap *m = xmalloc(sizeof(*m));
635 zmap_init(m);
636 v->map = m;
637 v->flags |= ZF_MAP;
638 }
639
zmap_delete_map_incl_slotdata(struct zmap * m)640 static void zmap_delete_map_incl_slotdata(struct zmap *m)
641 {
642 for (struct zmap_slot *p = &MAPSLOT[0]; p < &MAPSLOT[zlist_len(&m->slot)]; p++) {
643 if (p->key) zstring_release(&p->key);
644 if (p->val.vst) zstring_release(&p->val.vst);
645 }
646 xfree(m->slot.base);
647 xfree(m->hash);
648 }
649
// Empty the map: discard all of m's contents and storage, then set it up
// again as a new empty map.
static void zmap_delete_map(struct zmap *m)
{
  zmap_delete_map_incl_slotdata(m);
  zmap_init(m);
}
655
zmap_rehash(struct zmap * m)656 static void zmap_rehash(struct zmap *m)
657 {
658 // New table is twice the size of old.
659 int size = m->mask + 1;
660 unsigned mask = 2 * size - 1;
661 int *h = xzalloc(2 * size * sizeof(*m->hash));
662 // Step through the old hash table, set up location in new table.
663 for (int i = 0; i < size; i++) {
664 int n = m->hash[i];
665 if (n > 0) {
666 int hash = MAPSLOT[n-1].hash;
667 unsigned perturb = hash;
668 int p = hash & mask;
669 while (h[p]) {
670 p = (p * 5 + 1 + (perturb >>= PSHIFT)) & mask;
671 }
672 h[p] = n;
673 }
674 }
675 m->mask = mask;
676 xfree(m->hash);
677 m->hash = h;
678 m->limit = 2 * size * 8 / 10;
679 }
680
zmap_find_or_insert_key(struct zmap * m,struct zstring * key)681 static struct zmap_slot *zmap_find_or_insert_key(struct zmap *m, struct zstring *key)
682 {
683 int hash, probe;
684 struct zmap_slot *x = find_mapslot(m, key, &hash, &probe);
685 if (x) return x;
686 // not found; insert it.
687 if (m->count == m->limit) {
688 zmap_rehash(m); // rehash if getting too full.
689 // rerun find_mapslot to get new probe index
690 x = find_mapslot(m, key, &hash, &probe);
691 }
692 // Assign key to new slot entry and bump refcnt.
693 struct zmap_slot zs = ZMSLOTINIT(hash, key, (struct zvalue)ZVINIT(0, 0.0, 0));
694 zstring_incr_refcnt(key);
695 int n = zlist_append(&m->slot, &zs);
696 m->count++;
697 m->hash[probe] = n + 1;
698 return &MAPSLOT[n];
699 }
700
zmap_delete(struct zmap * m,struct zstring * key)701 static void zmap_delete(struct zmap *m, struct zstring *key)
702 {
703 int hash, probe;
704 struct zmap_slot *x = find_mapslot(m, key, &hash, &probe);
705 if (!x) return;
706 zstring_release(&MAPSLOT[m->hash[probe] - 1].key);
707 m->hash[probe] = -1;
708 m->deleted++;
709 }
710
711 ////////////////////
712 //// scan (lexical analyzer)
713 ////////////////////
714
715 // TODO:
716 // IS line_num getting incr correctly? Newline counts as start of line!?
717 // Handle nuls in file better.
718 // Open files "rb" and handle CRs in program.
719 // Roll gch() into get_char() ?
720 // Deal with signed char (at EOF? elsewhere?)
721 //
722 // 2023-01-11: Allow nul bytes inside strings? regexes?
723
progfile_open(void)724 static void progfile_open(void)
725 {
726 TT.scs->filename = TT.scs->prog_args->arg;
727 TT.scs->prog_args = TT.scs->prog_args->next;
728 TT.scs->fp = stdin;
729 if (strcmp(TT.scs->filename, "-")) TT.scs->fp = fopen(TT.scs->filename, "r");
730 if (!TT.scs->fp) error_exit("Can't open %s", TT.scs->filename);
731 TT.scs->line_num = 0;
732 }
733
get_char(void)734 static int get_char(void)
735 {
736 static char *nl = "\n";
737 // On first entry, TT.scs->p points to progstring if any, or null string.
738 for (;;) {
739 int c = *(TT.scs->p)++;
740 if (c) {
741 return c;
742 }
743 if (TT.scs->progstring) { // Fake newline at end of progstring.
744 if (TT.scs->progstring == nl) return EOF;
745 TT.scs->p = TT.scs->progstring = nl;
746 continue;
747 }
748 // Here if getting from progfile(s).
749 if (TT.scs->line == nl) return EOF;
750 if (!TT.scs->fp) {
751 progfile_open();
752 }
753 // Save last char to allow faking final newline.
754 int lastchar = (TT.scs->p)[-2];
755 TT.scs->line_len = getline(&TT.scs->line, &TT.scs->line_size, TT.scs->fp);
756 if (TT.scs->line_len > 0) {
757 TT.scs->line_num++;
758 TT.scs->p = TT.scs->line;
759 continue;
760 }
761 // EOF
762 // FIXME TODO or check for error? feof() vs. ferror()
763 fclose(TT.scs->fp);
764 TT.scs->fp = 0;
765 TT.scs->p = " " + 2;
766 if (!TT.scs->prog_args) {
767 xfree(TT.scs->line);
768 if (lastchar == '\n') return EOF;
769 // Fake final newline
770 TT.scs->line = TT.scs->p = nl;
771 }
772 }
773 }
774
append_this_char(int c)775 static void append_this_char(int c)
776 {
777 if (TT.scs->toklen == TT.scs->maxtok - 1) {
778 TT.scs->maxtok *= 2;
779 TT.scs->tokstr = xrealloc(TT.scs->tokstr, TT.scs->maxtok);
780 }
781 TT.scs->tokstr[TT.scs->toklen++] = c;
782 TT.scs->tokstr[TT.scs->toklen] = 0;
783 }
784
gch(void)785 static void gch(void)
786 {
787 // FIXME probably not right place to skip CRs.
788 do {
789 TT.scs->ch = get_char();
790 } while (TT.scs->ch == '\r');
791 }
792
append_char(void)793 static void append_char(void)
794 {
795 append_this_char(TT.scs->ch);
796 gch();
797 }
798
find_keyword_or_builtin(char * table,int first_tok_in_table)799 static int find_keyword_or_builtin(char *table,
800 int first_tok_in_table)
801 {
802 char s[16] = " ", *p;
803 // keywords and builtin functions are spaced 10 apart for strstr() lookup,
804 // so must be less than that long.
805 if (TT.scs->toklen >= 10) return 0;
806 strcat(s, TT.scs->tokstr);
807 strcat(s, " ");
808 p = strstr(table, s);
809 if (!p) return 0;
810 return first_tok_in_table + (p - table) / 10;
811 }
812
find_token(void)813 static int find_token(void)
814 {
815 char s[6] = " ", *p;
816 // tokens are spaced 3 apart for strstr() lookup, so must be less than
817 // that long.
818 strcat(s, TT.scs->tokstr);
819 strcat(s, " ");
820 p = strstr(ops, s);
821 if (!p) return 0;
822 return tksemi + (p - ops) / 3;
823 }
824
find_keyword(void)825 static int find_keyword(void)
826 {
827 return find_keyword_or_builtin(keywords, tkin);
828 }
829
find_builtin(void)830 static int find_builtin(void)
831 {
832 return find_keyword_or_builtin(builtins, tkatan2);
833 }
834
get_number(void)835 static void get_number(void)
836 {
837 // Assumes TT.scs->ch is digit or dot on entry.
838 // TT.scs->p points to the following character.
839 // OK formats: 1 1. 1.2 1.2E3 1.2E+3 1.2E-3 1.E2 1.E+2 1.E-2 1E2 .1 .1E2
840 // .1E+2 .1E-2
841 // NOT OK: . .E .E1 .E+ .E+1 ; 1E .1E 1.E 1.E+ 1.E- parse as number
842 // followed by variable E.
843 // gawk accepts 12.E+ and 12.E- as 12; nawk & mawk say syntax error.
844 char *leftover;
845 int len;
846 TT.scs->numval = strtod(TT.scs->p - 1, &leftover);
847 len = leftover - TT.scs->p + 1;
848 if (len == 0) {
849 append_char();
850 TT.scs->toktype = ERROR;
851 TT.scs->tok = tkerr;
852 TT.scs->error = 1;
853 FFATAL("Unexpected token '%s'\n", TT.scs->tokstr);
854 return;
855 }
856 while (len--)
857 append_char();
858 }
859
get_string_or_regex(int endchar)860 static void get_string_or_regex(int endchar)
861 {
862 gch();
863 while (TT.scs->ch != endchar) {
864 if (TT.scs->ch == '\n') {
865 // FIXME Handle unterminated string or regex. Is this OK?
866 // FIXME TODO better diagnostic here?
867 XERR("%s\n", "unterminated string or regex");
868 break;
869 } else if (TT.scs->ch == '\\') {
870 // \\ \a \b \f \n \r \t \v \" \/ \ddd
871 char *p, *escapes = "\\abfnrtv\"/";
872 gch();
873 if (TT.scs->ch == '\n') { // backslash newline is continuation
874 gch();
875 continue;
876 } else if ((p = strchr(escapes, TT.scs->ch))) {
877 // posix regex does not use these escapes,
878 // but awk does, so do them.
879 int c = "\\\a\b\f\n\r\t\v\"/"[p-escapes];
880 append_this_char(c);
881 // Need to double up \ inside literal regex
882 if (endchar == '/' && c == '\\') append_this_char('\\');
883 gch();
884 } else if (TT.scs->ch == 'x') {
885 gch();
886 if (isxdigit(TT.scs->ch)) {
887 int c = hexval(TT.scs->ch);
888 gch();
889 if (isxdigit(TT.scs->ch)) {
890 c = c * 16 + hexval(TT.scs->ch);
891 gch();
892 }
893 append_this_char(c);
894 } else append_this_char('x');
895 } else if (TT.scs->ch == 'u') {
896 gch();
897 if (isxdigit(TT.scs->ch)) {
898 int i = 0, j = 0, c = 0;
899 char codep[9] = {0};
900 do {
901 codep[j++] = TT.scs->ch;
902 gch();
903 } while (j < 8 && isxdigit(TT.scs->ch));
904 c = strtol(codep, 0, 16);
905 for (i = wctoutf8(codep, c), j = 0; j < i; j++)
906 append_this_char(codep[j]);
907 } else append_this_char('u');
908 } else if (isdigit(TT.scs->ch)) {
909 if (TT.scs->ch < '8') {
910 int k, c = 0;
911 for (k = 0; k < 3; k++) {
912 if (isdigit(TT.scs->ch) && TT.scs->ch < '8') {
913 c = c * 8 + TT.scs->ch - '0';
914 gch();
915 } else
916 break;
917 }
918 append_this_char(c);
919 } else {
920 append_char();
921 }
922 } else {
923 if (endchar == '/') {
924 // pass \ unmolested if not awk escape,
925 // so that regex routines can see it.
926 if (!strchr(".[]()*+?{}|^$-", TT.scs->ch)) {
927 XERR("warning: '\\%c' -- unknown regex escape\n", TT.scs->ch);
928 }
929 append_this_char('\\');
930 } else {
931 XERR("warning: '\\%c' treated as plain '%c'\n", TT.scs->ch, TT.scs->ch);
932 }
933 }
934 } else if (TT.scs->ch == EOF) {
935 FATAL("EOF in string or regex\n");
936 } else {
937 append_char();
938 }
939 }
940 gch();
941 }
942
ascan_opt_div(int div_op_allowed_here)943 static void ascan_opt_div(int div_op_allowed_here)
944 {
945 int n;
946 for (;;) {
947 TT.scs->tokbuiltin = 0;
948 TT.scs->toklen = 0;
949 TT.scs->tokstr[0] = 0;
950 while (TT.scs->ch == ' ' || TT.scs->ch == '\t')
951 gch();
952 if (TT.scs->ch == '\\') {
953 append_char();
954 if (TT.scs->ch == '\n') {
955 gch();
956 continue;
957 }
958 TT.scs->toktype = ERROR; // \ not last char in line.
959 TT.scs->tok = tkerr;
960 TT.scs->error = 3;
961 FATAL("backslash not last char in line\n");
962 return;
963 }
964 break;
965 }
966 // Note \<NEWLINE> in comment does not continue it.
967 if (TT.scs->ch == '#') {
968 gch();
969 while (TT.scs->ch != '\n')
970 gch();
971 // Need to fall through here to pick up newline.
972 }
973 if (TT.scs->ch == '\n') {
974 TT.scs->toktype = NEWLINE;
975 TT.scs->tok = tknl;
976 append_char();
977 } else if (isalpha(TT.scs->ch) || TT.scs->ch == '_') {
978 append_char();
979 while (isalnum(TT.scs->ch) || TT.scs->ch == '_') {
980 append_char();
981 }
982 if ((n = find_keyword()) != 0) {
983 TT.scs->toktype = KEYWORD;
984 TT.scs->tok = n;
985 } else if ((n = find_builtin()) != 0) {
986 TT.scs->toktype = BUILTIN;
987 TT.scs->tok = tkbuiltin;
988 TT.scs->tokbuiltin = n;
989 } else if (TT.scs->ch == '(') {
990 TT.scs->toktype = USERFUNC;
991 TT.scs->tok = tkfunc;
992 } else {
993 TT.scs->toktype = VAR;
994 TT.scs->tok = tkvar;
995 // skip whitespace to be able to check for , or )
996 while (TT.scs->ch == ' ' || TT.scs->ch == '\t')
997 gch();
998 }
999 return;
1000 } else if (TT.scs->ch == '"') {
1001 TT.scs->toktype = STRING;
1002 TT.scs->tok = tkstring;
1003 get_string_or_regex('"');
1004 } else if (isdigit(TT.scs->ch) || TT.scs->ch == '.') {
1005 TT.scs->toktype = NUMBER;
1006 TT.scs->tok = tknumber;
1007 get_number();
1008 } else if (TT.scs->ch == '/' && ! div_op_allowed_here) {
1009 TT.scs->toktype = REGEX;
1010 TT.scs->tok = tkregex;
1011 get_string_or_regex('/');
1012 } else if (TT.scs->ch == EOF) {
1013 TT.scs->toktype = EOF;
1014 TT.scs->tok = tkeof;
1015 } else if (TT.scs->ch == '\0') {
1016 append_char();
1017 TT.scs->toktype = ERROR;
1018 TT.scs->tok = tkerr;
1019 TT.scs->error = 5;
1020 FATAL("null char\n");
1021 } else {
1022 // All other tokens.
1023 TT.scs->toktype = TT.scs->ch;
1024 append_char();
1025 // Special case for **= and ** tokens
1026 if (TT.scs->toktype == '*' && TT.scs->ch == '*') {
1027 append_char();
1028 if (TT.scs->ch == '=') {
1029 append_char();
1030 TT.scs->tok = tkpowasgn;
1031 } else TT.scs->tok = tkpow;
1032 TT.scs->toktype = TT.scs->tok + 200;
1033 return;
1034 }
1035 // Is it a 2-character token?
1036 if (TT.scs->ch != ' ' && TT.scs->ch != '\n') {
1037 append_this_char(TT.scs->ch);
1038 if (find_token()) {
1039 TT.scs->tok = find_token();
1040 TT.scs->toktype = TT.scs->tok + 200;
1041 gch(); // Eat second char of token.
1042 return;
1043 }
1044 TT.scs->toklen--; // Not 2-character token; back off.
1045 TT.scs->tokstr[TT.scs->toklen] = 0;
1046 }
1047 TT.scs->tok = find_token();
1048 if (TT.scs->tok) return;
1049 TT.scs->toktype = ERROR;
1050 TT.scs->tok = tkerr;
1051 TT.scs->error = 4;
1052 FFATAL("Unexpected token '%s'\n", TT.scs->tokstr);
1053 }
1054 }
1055
scan_opt_div(int div_op_allowed_here)1056 static void scan_opt_div(int div_op_allowed_here)
1057 {
1058 // TODO FIXME need better diags for bad tokens!
1059 // TODO Also set global syntax error flag.
1060 do ascan_opt_div(div_op_allowed_here); while (TT.scs->tok == tkerr);
1061 }
1062
// Prepare the scanner before the first scan(): record tkeof as the
// "previous token" and call gch().
// NOTE(review): gch() is defined elsewhere; presumably it loads the first
// input character into TT.scs->ch -- confirm.
static void init_scanner(void)
{
  TT.prevtok = tkeof;
  gch();
}
1068
// POSIX says '/' does not begin a regex wherever '/' or '/=' can mean divide.
// Pretty sure if / or /= comes after these, it means divide:
// (The list is zero-terminated so that scan() can search it with strchr().)
static char div_preceders[] = {tknumber, tkstring, tkvar, tkgetline, tkrparen, tkrbracket, tkincr, tkdecr, 0};
1072
1073 // For checking end of prev statement for termination and if '/' can come next
1074
scan(void)1075 static void scan(void)
1076 {
1077 TT.prevtok = TT.scs->tok;
1078 if (TT.prevtok && strchr(div_preceders, TT.prevtok)) scan_opt_div(1);
1079 else scan_opt_div(0);
1080 TT.tokstr = TT.scs->tokstr;
1081 }
1082
1083 ////////////////////
1084 //// compile
1085 ////////////////////
1086
1087 // NOTES:
1088 // NL ok after , { && || do else OR after right paren after if/while/for
1089 // TODO:
1090 // see case tkgetline -- test more
1091 // case tkmatchop, tknotmatch -- fix ~ (/re/)
1092
// Forward declarations -- for mutually recursive parsing functions
static int expr(int rbp);             // parse expression; rbp = right binding power
static void lvalue(void);             // parse $field or variable as a reference
static int primary(void);             // parse operand with prefix/postfix operators
static void stmt(void);
static void action(int action_type);

// Token number of the most recently scanned token.
#define CURTOK() (TT.scs->tok)
// Nonzero when the most recently scanned token is toknum.
#define ISTOK(toknum) (TT.scs->tok == (toknum))
1102
// If the current token is tk, consume it (scan past it) and return 1;
// otherwise leave the token stream untouched and return 0.
static int havetok(int tk)
{
  int matched = ISTOK(tk);

  if (matched) scan();
  return matched;
}
1109
1110 //// code and "literal" emitters
// Append one word (op) to the TT.zcode list and record the value returned by
// zlist_append() in TT.zcode_last (used elsewhere as the index of the most
// recently emitted word).
static void gencd(int op)
{
  TT.zcode_last = zlist_append(&TT.zcode, &op);
}
1115
// Emit a two-word instruction: op followed by its operand word n.
// After the call TT.zcode_last refers to the operand word.
static void gen2cd(int op, int n)
{
  gencd(op);
  gencd(n);
}
1121
make_literal_str_val(char * s)1122 static int make_literal_str_val(char *s)
1123 {
1124 // Only if no nul inside string!
1125 struct zvalue v = new_str_val(s);
1126 return zlist_append(&TT.literals, &v);
1127 }
1128
make_literal_regex_val(char * s)1129 static int make_literal_regex_val(char *s)
1130 {
1131 regex_t *rx;
1132 rx = xmalloc(sizeof(*rx));
1133 xregcomp(rx, s, REG_EXTENDED);
1134 struct zvalue v = ZVINIT(ZF_RX, 0, 0);
1135 v.rx = rx;
1136 // Flag empty rx to make it easy to identify for split() special case
1137 if (!*s) v.flags |= ZF_EMPTY_RX;
1138 return zlist_append(&TT.literals, &v);
1139 }
1140
make_literal_num_val(double num)1141 static int make_literal_num_val(double num)
1142 {
1143 struct zvalue v = ZVINIT(ZF_NUM, num, 0);
1144 return zlist_append(&TT.literals, &v);
1145 }
1146
// Append a copy of uninit_zvalue to TT.literals and return zlist_append()'s
// result for the new entry.
static int make_uninit_val(void)
{
  return zlist_append(&TT.literals, &uninit_zvalue);
}
1151 //// END code and "literal" emitters
1152
1153 //// Symbol tables functions
find_func_def_entry(char * s)1154 static int find_func_def_entry(char *s)
1155 {
1156 for (int k = 1; k < zlist_len(&TT.func_def_table); k++)
1157 if (!strcmp(s, FUNC_DEF[k].name)) return k;
1158 return 0;
1159 }
1160
add_func_def_entry(char * s)1161 static int add_func_def_entry(char *s)
1162 {
1163 struct functab_slot ent = {0, 0, {0, 0, 0, 0}, 0};
1164 ent.name = xstrdup(s);
1165 int slotnum = zlist_append(&TT.func_def_table, &ent);
1166 return slotnum;
1167 }
1168
find_global(char * s)1169 static int find_global(char *s)
1170 {
1171 for (int k = 1; k < zlist_len(&TT.globals_table); k++)
1172 if (!strcmp(s, GLOBAL[k].name)) return k;
1173 return 0;
1174 }
1175
add_global(char * s)1176 static int add_global(char *s)
1177 {
1178 struct symtab_slot ent = {0, 0};
1179 ent.name = xstrdup(s);
1180 int slotnum = zlist_append(&TT.globals_table, &ent);
1181 return slotnum;
1182 }
1183
find_local_entry(char * s)1184 static int find_local_entry(char *s)
1185 {
1186 for (int k = 1; k < zlist_len(&TT.locals_table); k++)
1187 if (!strcmp(s, LOCAL[k].name)) return k;
1188 return 0;
1189 }
1190
add_local_entry(char * s)1191 static int add_local_entry(char *s)
1192 {
1193 struct symtab_slot ent = {0, 0};
1194 ent.name = xstrdup(s);
1195 int slotnum = zlist_append(&TT.locals_table, &ent);
1196 return slotnum;
1197 }
1198
find_or_add_var_name(void)1199 static int find_or_add_var_name(void)
1200 {
1201 int slotnum = 0; // + means global; - means local to function
1202 int globals_ent = 0;
1203 int locals_ent = find_local_entry(TT.tokstr); // in local symbol table?
1204 if (locals_ent) {
1205 slotnum = -locals_ent;
1206 } else {
1207 globals_ent = find_global(TT.tokstr);
1208 if (!globals_ent) globals_ent = add_global(TT.tokstr);
1209 slotnum = globals_ent;
1210 if (find_func_def_entry(TT.tokstr))
1211 // POSIX: The same name shall not be used both as a variable name
1212 // with global scope and as the name of a function.
1213 XERR("var '%s' used as function name\n", TT.tokstr);
1214 }
1215 return slotnum;
1216 }
1217
1218 //// END Symbol tables functions
1219
1220 //// Initialization
init_locals_table(void)1221 static void init_locals_table(void)
1222 {
1223 static struct symtab_slot locals_ent;
1224 zlist_init(&TT.locals_table, sizeof(struct symtab_slot));
1225 zlist_append(&TT.locals_table, &locals_ent);
1226 }
1227
// Initialize every compile-time and run-time list used by awk: the global,
// function and local symbol tables, the code list, the literals list, the
// value stack and the fields list. Each list receives a first dummy entry.
static void init_tables(void)
{
  // Zero-filled (static storage) placeholder entries for slot 0.
  static struct symtab_slot global_ent;
  static struct functab_slot func_ent;

  // Append dummy elements in lists to force valid offsets nonzero.
  zlist_init(&TT.globals_table, sizeof(struct symtab_slot));
  zlist_append(&TT.globals_table, &global_ent);
  zlist_init(&TT.func_def_table, sizeof(struct functab_slot));
  zlist_append(&TT.func_def_table, &func_ent);
  init_locals_table();
  zlist_init(&TT.zcode, sizeof(int));
  gencd(tkeof);   // to ensure zcode offsets are non-zero
  zlist_init(&TT.literals, sizeof(struct zvalue));
  // Init stack size at twice MIN_STACK_LEFT.  MIN_STACK_LEFT is at least as
  // many entries as any statement may ever take.  Currently there is no diag
  // if this is exceeded; prog. will probably crash. 1024 should be plenty?
  zlist_initx(&TT.stack, sizeof(struct zvalue), 2 * MIN_STACK_LEFT);
  TT.stackp = (struct zvalue *)TT.stack.base;
  zlist_init(&TT.fields, sizeof(struct zvalue));
  // Dummy first entries for literals, stack and fields.
  zlist_append(&TT.literals, &uninit_zvalue);
  zlist_append(&TT.stack, &uninit_zvalue);
  zlist_append(&TT.fields, &uninit_zvalue);
  // Field 0 starts out as an empty string.
  FIELD[0].vst = new_zstring("", 0);
}
1253
init_compiler(void)1254 static void init_compiler(void)
1255 {
1256 // Special variables (POSIX). Must align with enum spec_var_names
1257 static char *spec_vars[] = { "ARGC", "ARGV", "CONVFMT", "ENVIRON", "FILENAME",
1258 "FNR", "FS", "NF", "NR", "OFMT", "OFS", "ORS", "RLENGTH", "RS", "RSTART",
1259 "SUBSEP", 0};
1260
1261 init_tables();
1262 for (int k = 0; spec_vars[k]; k++) {
1263 TT.spec_var_limit = add_global(spec_vars[k]);
1264 GLOBAL[TT.spec_var_limit++].flags |= (k == 1 || k == 3) ? ZF_MAP : ZF_SCALAR;
1265 push_val(&uninit_zvalue);
1266 }
1267 }
1268 //// END Initialization
1269
1270 //// Parsing and compiling to TT.zcode
// Left binding powers
// Indexed by token number (see getlbp()); a larger value binds tighter and
// 0 means the token never continues an expression as an infix operator.
static int lbp_table[] = {  // Must align with enum Toks
  0, 0, 0, 0,     // tkunusedtoken, tkeof, tkerr, tknl,
  250, 250, 250,  // tkvar, tknumber, tkstring,
  250, 250, 250,  // tkregex, tkfunc, tkbuiltin,
  0, 0, 210, 0, // tksemi, tkcomma, tklbracket, tkrbracket,
  200, 0, 0, 0, // tklparen, tkrparen, tklbrace, tkrbrace,
  190, 180, 180, 170, 160, // tkfield, tkincr, tkdecr, tkpow, tknot,
  150, 150, 150, 140, 140, // tkmul, tkdiv, tkmod, tkplus, tkminus,
  130, // tkcat, // FAKE (?) optor for concatenation (adjacent string exprs)
  110, 110, 110, 110, 110, 110, // tklt, tkle, tkne, tkeq, tkgt, tkge,
  100, 100, // tkmatchop, tknotmatch,
  80, 70, // tkand, tkor,
  60, 0, // tkternif, tkternelse,
  50, 50, 50, 50, // tkpowasgn, tkmodasgn, tkmulasgn, tkdivasgn,
  50, 50, 50, // tkaddasgn, tksubasgn, tkasgn,
  0, 120, // tkappend, tkpipe,
  90 // tkin
};
1290
getlbp(int tok)1291 static int getlbp(int tok)
1292 {
1293 // FIXME: should tkappend be here too? is tkpipe needed?
1294 // In print statement outside parens: make '>' end an expression
1295 if (TT.cgl.in_print_stmt && ! TT.cgl.paren_level && (tok == tkgt || tok == tkpipe))
1296 return 0;
1297 return (0 <= tok && tok <= tkin) ? lbp_table[tok] :
1298 // getline is special, not a normal builtin.
1299 // close, index, match, split, sub, gsub, sprintf, substr
1300 // are really builtin functions though bwk treats them as keywords.
1301 (tkgetline <= tok && tok <= tksubstr) ? 240 : 0; // FIXME 240 is temp?
1302 }
1303
// Get right binding power. Same as left except for right associative optors,
// whose right binding power is one less than the left one.
static int getrbp(int tok)
{
  int bp = getlbp(tok);

  // ternary (?:), assignment, power ops are right associative
  if (bp <= 60 || bp == 170) bp--;
  return bp;
}
1311
// Called when end of input is reached while the parser is still looking for
// a token: gives up via error_exit().
static void unexpected_eof(void)
{
  error_exit("terminated with error(s)");
}
1316
//// syntax error diagnostic and recovery (Turner's method)
// D.A. Turner, Error diagnosis and recovery in one pass compilers,
// Information Processing Letters, Volume 6, Issue 4, 1977, Pages 113-115
// Nonzero from the moment complain() reports an error until expect() has
// resynchronized on the token it wants; further complaints are suppressed
// while it is set.
static int recovering = 0;
1321
complain(int tk)1322 static void complain(int tk)
1323 {
1324 char op[3], tkstr[10];
1325 if (recovering) return;
1326 recovering = 1;
1327 if (!strcmp(TT.tokstr, "\n")) TT.tokstr = "<newline>";
1328 if (tksemi <= tk && tk <= tkpipe) {
1329 get_token_text(op, tk);
1330 XERR("syntax near '%s' -- '%s' expected\n", TT.tokstr, op);
1331 } else if (tk >= tkin && tk <= tksubstr) {
1332 if (tk < tkatan2) memmove(tkstr, keywords + 1 + 10 * (tk - tkin), 10);
1333 else memmove(tkstr, builtins + 1 + 10 * (tk - tkatan2), 10);
1334 *strchr(tkstr, ' ') = 0;
1335 XERR("syntax near '%s' -- '%s' expected\n", TT.tokstr, tkstr);
1336 } else XERR("syntax near '%s'\n", TT.tokstr);
1337 }
1338
expect(int tk)1339 static void expect(int tk)
1340 {
1341 if (recovering) {
1342 while (!ISTOK(tkeof) && !ISTOK(tk))
1343 scan();
1344 if (ISTOK(tkeof)) unexpected_eof();
1345 scan(); // consume expected token
1346 recovering = 0;
1347 } else if (!havetok(tk)) complain(tk);
1348 }
1349
skip_to(char * tklist)1350 static void skip_to(char *tklist)
1351 {
1352 do scan(); while (!ISTOK(tkeof) && !strchr(tklist, CURTOK()));
1353 if (ISTOK(tkeof)) unexpected_eof();
1354 }
1355
1356 //// END syntax error diagnostic and recovery (Turner's method)
1357
optional_nl_or_semi(void)1358 static void optional_nl_or_semi(void)
1359 {
1360 while (havetok(tknl) || havetok(tksemi))
1361 ;
1362 }
1363
optional_nl(void)1364 static void optional_nl(void)
1365 {
1366 while (havetok(tknl))
1367 ;
1368 }
1369
// Require a right parenthesis, then skip any newlines that follow it
// (NL is ok after the right paren of if/while/for).
static void rparen(void)
{
  expect(tkrparen);
  optional_nl();
}
1375
have_comma(void)1376 static int have_comma(void)
1377 {
1378 if (!havetok(tkcomma)) return 0;
1379 optional_nl();
1380 return 1;
1381 }
1382
check_set_map(int slotnum)1383 static void check_set_map(int slotnum)
1384 {
1385 // POSIX: The same name shall not be used within the same scope both as
1386 // a scalar variable and as an array.
1387 if (slotnum < 0 && LOCAL[-slotnum].flags & ZF_SCALAR)
1388 XERR("scalar param '%s' used as array\n", LOCAL[-slotnum].name);
1389 if (slotnum > 0 && GLOBAL[slotnum].flags & ZF_SCALAR)
1390 XERR("scalar var '%s' used as array\n", GLOBAL[slotnum].name);
1391 if (slotnum < 0) LOCAL[-slotnum].flags |= ZF_MAP;
1392 if (slotnum > 0) GLOBAL[slotnum].flags |= ZF_MAP;
1393 }
1394
check_set_scalar(int slotnum)1395 static void check_set_scalar(int slotnum)
1396 {
1397 if (slotnum < 0 && LOCAL[-slotnum].flags & ZF_MAP)
1398 XERR("array param '%s' used as scalar\n", LOCAL[-slotnum].name);
1399 if (slotnum > 0 && GLOBAL[slotnum].flags & ZF_MAP)
1400 XERR("array var '%s' used as scalar\n", GLOBAL[slotnum].name);
1401 if (slotnum < 0) LOCAL[-slotnum].flags |= ZF_SCALAR;
1402 if (slotnum > 0) GLOBAL[slotnum].flags |= ZF_SCALAR;
1403 }
1404
map_name(void)1405 static void map_name(void)
1406 {
1407 int slotnum;
1408 check_set_map(slotnum = find_or_add_var_name());
1409 gen2cd(tkvar, slotnum);
1410 }
1411
check_builtin_arg_counts(int tk,int num_args,char * fname)1412 static void check_builtin_arg_counts(int tk, int num_args, char *fname)
1413 {
1414 static char builtin_1_arg[] = { tkcos, tksin, tkexp, tklog, tksqrt, tkint,
1415 tktolower, tktoupper, tkclose, tksystem, 0};
1416 static char builtin_2_arg[] = { tkatan2, tkmatch, tkindex, tklshift, tkrshift, 0};
1417 static char builtin_al_2_arg[] = { tkband, tkbor, tkbxor, 0};
1418 static char builtin_2_3_arg[] = { tksub, tkgsub, tksplit, tksubstr, 0};
1419 static char builtin_0_1_arg[] = { tksrand, tklength, tkfflush, 0};
1420
1421 if (tk == tkrand && num_args)
1422 XERR("function '%s' expected no args, got %d\n", fname, num_args);
1423 else if (strchr(builtin_1_arg, tk) && num_args != 1)
1424 XERR("function '%s' expected 1 arg, got %d\n", fname, num_args);
1425 else if (strchr(builtin_2_arg, tk) && num_args != 2)
1426 XERR("function '%s' expected 2 args, got %d\n", fname, num_args);
1427 else if (strchr(builtin_al_2_arg, tk) && num_args < 2)
1428 XERR("function '%s' expected at least 2 args, got %d\n", fname, num_args);
1429 else if (strchr(builtin_2_3_arg, tk) && num_args != 2 && num_args != 3)
1430 XERR("function '%s' expected 2 or 3 args, got %d\n", fname, num_args);
1431 else if (strchr(builtin_0_1_arg, tk) && num_args != 0 && num_args != 1)
1432 XERR("function '%s' expected no arg or 1 arg, got %d\n", fname, num_args);
1433 }
1434
// Compile the parenthesized argument list of builtin function tk and emit
// the call as (tk, num_args).
// On entry the current token must be the '(' after the builtin's name;
// builtin_name is used only for argument-count diagnostics.
// sub/gsub, match, split and length have special argument handling; every
// other builtin takes a plain comma-separated expression list.
static void builtin_call(int tk, char *builtin_name)
{
  int num_args = 0;
  expect(tklparen);
  TT.cgl.paren_level++;
  switch (tk) {
    case tksub:
    case tkgsub:
      // First arg: a regex literal is passed as a regex value rather than
      // being compiled as an expression.
      if (ISTOK(tkregex)) {
        gen2cd(tkregex, make_literal_regex_val(TT.tokstr));
        scan();
      } else expr(0);
      expect(tkcomma);
      optional_nl();
      expr(0);
      // Third arg is the target lvalue; when omitted, emit a reference to
      // field number 0 instead.
      if (have_comma()) {
        lvalue();
      } else {
        gen2cd(tknumber, make_literal_num_val(0));
        gen2cd(opfldref, tkeof);
      }
      // Always 3 args at this point, explicit or defaulted.
      num_args = 3;
      break;

    case tkmatch:
      expr(0);
      expect(tkcomma);
      optional_nl();
      // Second arg: regex literal passed as a regex value.
      if (ISTOK(tkregex)) {
        gen2cd(tkregex, make_literal_regex_val(TT.tokstr));
        scan();
      } else expr(0);
      num_args = 2;
      break;

    case tksplit:
      expr(0);
      expect(tkcomma);
      optional_nl();
      // Second arg must be a bare array name: a variable token whose next
      // input character is ',' or ')'.
      if (ISTOK(tkvar) && (TT.scs->ch == ',' || TT.scs->ch == ')')) {
        map_name();
        scan();
      } else {
        XERR("%s\n", "expected array name as split() 2nd arg");
        expr(0);
      }
      // FIXME some recovery needed here!?
      num_args = 2;
      // Optional third arg: separator, a regex literal or any expression.
      if (have_comma()) {
        if (ISTOK(tkregex)) {
          gen2cd(tkregex, make_literal_regex_val(TT.tokstr));
          scan();
        } else expr(0);
        num_args++;
      }
      break;

    case tklength:
      // A lone variable argument is pushed as-is, without marking it as
      // scalar or array.
      if (ISTOK(tkvar) && (TT.scs->ch == ',' || TT.scs->ch == ')')) {
        gen2cd(tkvar, find_or_add_var_name());
        scan();
        num_args++;
      }
      ATTR_FALLTHROUGH_INTENDED;

    default:
      // Generic case: zero or more comma-separated expressions.
      if (ISTOK(tkrparen)) break;
      do {
        expr(0);
        num_args++;
      } while (have_comma());
      break;
  }
  expect(tkrparen);
  TT.cgl.paren_level--;

  check_builtin_arg_counts(tk, num_args, builtin_name);

  gen2cd(tk, num_args);
}
1515
// Compile a call to a builtin or user-defined function.
// On entry the current token is the function name (tkbuiltin or tkfunc);
// any other token is a fatal error.
static void function_call(void)
{
  // Function call: generate TT.zcode to:
  //  push placeholder for return value, push placeholder for return addr,
  //  push args, then push number of args, then:
  //      for builtins: gen opcode (e.g. tkgsub)
  //      for user func: gen (tkfunc, number-of-args)
  int functk = 0, funcnum = 0;
  char builtin_name[16];  // be sure it's long enough for all builtins
  if (ISTOK(tkbuiltin)) {
    functk = TT.scs->tokbuiltin;
    // Keep a copy of the name for diagnostics; TT.tokstr changes on scan().
    strcpy(builtin_name, TT.tokstr);
  } else if (ISTOK(tkfunc)) { // user function
    // A function may be called before it is defined: add a table entry on
    // first sight and mark it as called.
    funcnum = find_func_def_entry(TT.tokstr);
    if (!funcnum) funcnum = add_func_def_entry(TT.tokstr);
    FUNC_DEF[funcnum].flags |= FUNC_CALLED;
    gen2cd(opprepcall, funcnum);
  } else error_exit("bad function %s!", TT.tokstr);
  scan();
  // length() can appear without parens
  int num_args = 0;
  if (functk == tklength && !ISTOK(tklparen)) {
    gen2cd(functk, 0);
    return;
  }
  if (functk) {   // builtin
    builtin_call(functk, builtin_name);
    return;
  }
  // User function: parse the argument list here.
  expect(tklparen);
  TT.cgl.paren_level++;
  if (ISTOK(tkrparen)) {
    scan();
  } else {
    do {
      if (ISTOK(tkvar) && (TT.scs->ch == ',' || TT.scs->ch == ')')) {
        // Function call arg that is a lone variable. Cannot tell in this
        // context if it is a scalar or map. Just add it to symbol table.
        gen2cd(tkvar, find_or_add_var_name());
        scan();
      } else expr(0);
      num_args++;
    } while (have_comma());
    expect(tkrparen);
  }
  TT.cgl.paren_level--;
  gen2cd(tkfunc, num_args);
}
1564
var(void)1565 static void var(void)
1566 {
1567 // var name is in TT.tokstr
1568 // slotnum: + means global; - means local to function
1569 int slotnum = find_or_add_var_name();
1570 scan();
1571 if (havetok(tklbracket)) {
1572 check_set_map(slotnum);
1573 int num_subscripts = 0;
1574 do {
1575 expr(0);
1576 num_subscripts++;
1577 } while (have_comma());
1578 expect(tkrbracket);
1579 if (num_subscripts > 1) gen2cd(tkrbracket, num_subscripts);
1580 gen2cd(opmap, slotnum);
1581 } else {
1582 check_set_scalar(slotnum);
1583 gen2cd(tkvar, slotnum);
1584 }
1585 }
1586
// Dollar $ tkfield can be followed by "any" expression, but
// the way it binds varies.
1589 // The following are valid lvalues:
1590 // $ ( expr )
1591 // $ tkvar $ tknumber $ tkstring $ tkregex
1592 // $ tkfunc(...)
1593 // $ tkbuiltin(...)
1594 // $ length # with no parens after
1595 // $ tkclose(), ... $ tksubstr
1596 // $ tkgetline FIXME TODO TEST THIS
1597 // $ ++ lvalue
1598 // $ -- lvalue
1599 // $ + expression_up_to_exponentiation (also -, ! prefix ops)
1600 // $ $ whatever_can_follow_and_bind_to_dollar
1601 //
1602 // tkvar, tknumber, tkstring, tkregex, tkfunc, tkbuiltin, tkfield, tkminus,
1603 // tkplus, tknot, tkincr, tkdecr, tklparen, tkgetline,
1604 // tkclose, tkindex, tkmatch, tksplit, tksub, tkgsub, tksprintf, tksubstr
1605 //
1606 // ray@radon:~$ awk 'BEGIN { $0 = "7 9 5 8"; k=2; print $k*k }'
1607 // 18
1608 // ray@radon:~$ awk 'BEGIN { $0 = "7 9 5 8"; k=2; print $+k*k }'
1609 // 18
1610 // ray@radon:~$ awk 'BEGIN { $0 = "7 9 5 8"; k=2; print $k^k }'
1611 // 81
1612 // ray@radon:~$ awk 'BEGIN { $0 = "7 9 5 8"; k=2; print $+k^k }'
1613 // 8
1614
field_op(void)1615 static void field_op(void)
1616 {
1617 // CURTOK() must be $ here.
1618 expect(tkfield);
1619 // tkvar, tknumber, tkstring, tkregex, tkfunc, tkbuiltin, tkfield, tkminus,
1620 // tkplus, tknot, tkincr, tkdecr, tklparen, tkgetline, tkclose, tkindex,
1621 // tkmatch, tksplit, tksub, tkgsub, tksprintf, tksubstr
1622 if (ISTOK(tkfield)) field_op();
1623 else if (ISTOK(tkvar)) var();
1624 else primary();
1625 // tkfield op has "dummy" 2nd word so that convert_push_to_reference(void)
1626 // can find either tkfield or tkvar at same place (ZCODE[TT.zcode_last-1]).
1627 gen2cd(tkfield, tkeof);
1628 }
1629
// Each list below is zero-terminated so it can be searched with strchr().

// Tokens that can start expression
static char exprstartsy[] = {tkvar, tknumber, tkstring, tkregex, tkfunc,
  tkbuiltin, tkfield, tkminus, tkplus, tknot, tkincr, tkdecr, tklparen,
  tkgetline, tkclose, tkindex, tkmatch, tksplit, tksub, tkgsub, tksprintf,
  tksubstr, tkband, tkbor, tkbxor, tkrshift, tklshift, 0};

// Tokens that can end statement
static char stmtendsy[] = {tknl, tksemi, tkrbrace, 0};

// Tokens that can follow expressions of a print statement
static char printexprendsy[] = {tkgt, tkappend, tkpipe, tknl, tksemi, tkrbrace, 0};
1641
1642 // !! Ensure this:
1643 // ternary op is right associative, so
1644 // a ? b : c ? d : e evaluates as
1645 // a ? b : (c ? d : e) not as
1646 // (a ? b : c) ? d : e
1647
convert_push_to_reference(void)1648 static void convert_push_to_reference(void)
1649 {
1650 if (ZCODE[TT.zcode_last - 1] == tkvar) ZCODE[TT.zcode_last-1] = opvarref;
1651 else if (ZCODE[TT.zcode_last - 1] == opmap) ZCODE[TT.zcode_last - 1] = opmapref;
1652 else if (ZCODE[TT.zcode_last - 1] == tkfield) ZCODE[TT.zcode_last - 1] = opfldref;
1653 else error_exit("bad lvalue?");
1654 }
1655
lvalue(void)1656 static void lvalue(void)
1657 {
1658 if (ISTOK(tkfield)) {
1659 field_op();
1660 convert_push_to_reference();
1661 } else if (ISTOK(tkvar)) {
1662 var();
1663 convert_push_to_reference();
1664 } else {
1665 XERR("syntax near '%s' (bad lvalue)\n", TT.tokstr);
1666 }
1667 }
1668
// Compile one operand of an expression, including its prefix and postfix
// operators, and report what kind of operand it was (see return values
// below).
static int primary(void)
{
  //  On entry: CURTOK() is first token of expression
  //  On exit: CURTOK() is infix operator (for binary_op() to handle) or next
  //   token after end of expression.
  //  return -1 for field or var (potential lvalue);
  //      2 or more for comma-separated expr list
  //          as in "multiple subscript expression in array"
  //          e.g. (1, 2) in array_name, or a print/printf list;
  //      otherwise return 0
  //
  //  expr can start with:
  //      tkvar, tknumber, tkstring, tkregex, tkfunc, tkbuiltin, tkfield, tkminus,
  //      tkplus, tknot, tkincr, tkdecr, tklparen, tkgetline, tkclose, tkindex,
  //      tkmatch, tksplit, tksub, tkgsub, tksprintf, tksubstr
  //
  //  bwk treats these as keywords, not builtins: close index match split sub gsub
  //      sprintf substr
  //
  //  bwk builtins are: atan2 cos sin exp log sqrt int rand srand length tolower
  //      toupper system fflush
  //  NOTE: fflush() is NOT in POSIX awk
  //
  //  primary() must consume prefix and postfix operators as well as
  //      num, string, regex, var, var with subscripts, and function calls

  int num_exprs = 0;
  int nargs, modifier;
  int tok = CURTOK();
  switch (tok) {
    case tkvar:
    case tkfield:
      if (ISTOK(tkvar)) var();
      else field_op();
      // Postfix ++ / --: operate on a reference to the var/field just
      // compiled. Without a postfix op, report a potential lvalue (-1).
      if (ISTOK(tkincr) || ISTOK(tkdecr)) {
        convert_push_to_reference();
        gencd(CURTOK());
        scan();
      } else return -1;
      break;

    case tknumber:
      gen2cd(tknumber, make_literal_num_val(TT.scs->numval));
      scan();
      break;

    case tkstring:
      gen2cd(tkstring, make_literal_str_val(TT.tokstr));
      scan();
      break;

    case tkregex:
      // When an ERE token appears as an expression in any context other
      // than as the right-hand of the '~' or "!~" operator or as one of
      // the built-in function arguments described below, the value of
      // the resulting expression shall be the equivalent of: $0 ~ /ere/
      // FIXME TODO
      gen2cd(opmatchrec, make_literal_regex_val(TT.tokstr));
      scan();
      break;

    case tkbuiltin: // various builtins
    case tkfunc:    // user-defined function
      function_call();
      break;

    // Unary prefix ! + -
    case tknot:
    case tkminus:
    case tkplus:
      scan();
      expr(getlbp(tknot));   // unary +/- same precedence as !
      if (tok == tknot) gencd(tknot);
      else gencd(opnegate);               // forces to number
      // Unary plus is compiled as a double negation.
      if (tok == tkplus) gencd(opnegate); // forces to number
      break;

      // Unary prefix ++ -- MUST take lvalue
    case tkincr:
    case tkdecr:
      scan();
      lvalue();
      if (tok == tkincr) gencd(oppreincr);
      else gencd(oppredecr);
      break;

    case tklparen:
      scan();
      TT.cgl.paren_level++;
      num_exprs = 0;
      do {
        expr(0);
        num_exprs++;
      } while (have_comma());
      expect(tkrparen);
      TT.cgl.paren_level--;
      // A parenthesized list of 2+ expressions: let the caller decide
      // whether it is a multiple subscript or a print list.
      if (num_exprs > 1) return num_exprs;
      break;

    case tkgetline:
      // getline may be (according to awk book):
      // getline [var [<file]]
      // getline <file
      // cmd | getline [var]
      // var must be lvalue (can be any lvalue?)
      scan();
      nargs = 0;
      modifier = tkeof;   // tkeof here means "no <file redirection"
      if (ISTOK(tkfield) || ISTOK(tkvar)) {
        lvalue();
        nargs++;
      }
      if (havetok(tklt)) {
        expr(getrbp(tkcat));   // bwk "historical practice" precedence
        nargs++;
        modifier = tklt;
      }
      // Emitted as three words: tkgetline, nargs, modifier.
      gen2cd(tkgetline, nargs);
      gencd(modifier);
      break;

    default:
      // Not a token that can start an expression: report it and
      // resynchronize at the end of the statement.
      XERR("syntax near '%s'\n", TT.tokstr[0] == '\n' ? "\\n" : TT.tokstr);
      skip_to(stmtendsy);
      break;
  }
  return 0;
}
1797
binary_op(int optor)1798 static void binary_op(int optor) // Also for ternary ?: optor.
1799 {
1800 int nargs, cdx = 0; // index in TT.zcode list
1801 int rbp = getrbp(optor);
1802 if (optor != tkcat) scan();
1803 // CURTOK() holds first token of right operand.
1804 switch (optor) {
1805 case tkin:
1806 // right side of 'in' must be (only) an array name
1807 map_name();
1808 gencd(tkin);
1809 scan();
1810 // FIXME TODO 20230109 x = y in a && 2 works OK?
1811 // x = y in a + 2 does not; it's parsed as x = (y in a) + 2
1812 // The +2 is not cat'ed with (y in a) as in bwk's OTA.
1813 // Other awks see y in a + 2 as a syntax error. They (may)
1814 // not want anything after y in a except a lower binding operator
1815 // (&& || ?:) or end of expression, i.e. ')' ';' '}'
1816 break;
1817
1818 case tkpipe:
1819 expect(tkgetline);
1820 nargs = 1;
1821 if (ISTOK(tkfield) || ISTOK(tkvar)) {
1822 lvalue();
1823 nargs++;
1824 }
1825 gen2cd(tkgetline, nargs);
1826 gencd(tkpipe);
1827 break;
1828
1829 case tkand:
1830 case tkor:
1831 optional_nl();
1832 gen2cd(optor, -1); // tkand: jump if false, else drop
1833 cdx = TT.zcode_last; // tkor: jump if true, else drop
1834 expr(rbp);
1835 gencd(opnotnot); // replace TT.stack top with truth value
1836 ZCODE[cdx] = TT.zcode_last - cdx;
1837 break;
1838
1839 case tkternif:
1840 gen2cd(optor, -1);
1841 cdx = TT.zcode_last;
1842 expr(0);
1843 expect(tkternelse);
1844 gen2cd(tkternelse, -1);
1845 ZCODE[cdx] = TT.zcode_last - cdx;
1846 cdx = TT.zcode_last;
1847 expr(rbp);
1848 ZCODE[cdx] = TT.zcode_last - cdx;
1849 break;
1850
1851 case tkmatchop:
1852 case tknotmatch:
1853 expr(rbp);
1854 if (ZCODE[TT.zcode_last - 1] == opmatchrec) ZCODE[TT.zcode_last - 1] = tkregex;
1855 gencd(optor);
1856 break;
1857
1858 default:
1859 expr(rbp);
1860 gencd(optor);
1861 }
1862 }
1863
cat_start_concated_expr(int tok)1864 static int cat_start_concated_expr(int tok)
1865 {
1866 // concat'ed expr can start w/ var number string func builtin $ ! ( (or ++ if prev was not lvalue)
1867 static char exprstarttermsy[] = {tkvar, tknumber, tkstring, tkregex, tkfunc, tkbuiltin,
1868 tkfield, tknot, tkincr, tkdecr, tklparen, tkgetline, 0};
1869
1870 // NOTE this depends on builtins (close etc) being >= tkgetline
1871 return !! strchr(exprstarttermsy, tok) || tok >= tkgetline;
1872 }
1873
#define CALLED_BY_PRINT 99987 // Arbitrary, different from any real rbp value

// Compile an expression by precedence climbing: one primary(), then infix
// operators for as long as their left binding power exceeds rbp.
// rbp is the right binding power of the enclosing operator (0 for a full
// expression), or CALLED_BY_PRINT when called directly by print_stmt().
// Returns the number of expressions in a parenthesized list when called by
// print_stmt() and the list is immediately followed by a token that can end
// the print expressions; returns 0 in every other case.
static int expr(int rbp)
{
  // On entry: TT.scs has first symbol of expression, e.g. var, number, string,
  // regex, func, getline, left paren, prefix op ($ ++ -- ! unary + or -) etc.
  static char asgnops[] = {tkpowasgn, tkmodasgn, tkmulasgn, tkdivasgn,
    tkaddasgn, tksubasgn, tkasgn, 0};
  int prim_st = primary();
  // If called directly by print_stmt(), and found a parenthesized expression list
  //    followed by an end of print statement: any of > >> | ; } <newline>
  //    Then: return the count of expressions in list
  //    Else: continue parsing an expression
  if (rbp == CALLED_BY_PRINT) {
    if (prim_st > 0 && strchr(printexprendsy, CURTOK())) return prim_st;
    else rbp = 0;
  }

  // mult_expr_list in parens must be followed by 'in' unless it
  // immediately follows print or printf, where it may still be followed
  // by 'in' ... unless at end of statement
  if (prim_st > 0 && ! ISTOK(tkin))
    XERR("syntax near '%s'; expected 'in'\n", TT.tokstr);
  // Join the list of expressions into one multiple subscript.
  if (prim_st > 0) gen2cd(tkrbracket, prim_st);
  // primary() has eaten subscripts, function args, postfix ops.
  // CURTOK() should be a binary op.
  int optor = CURTOK();
  if (strchr(asgnops, optor)) {

    // TODO FIXME ?  NOT SURE IF THIS WORKS RIGHT!
    // awk does not parse according to POSIX spec in some odd cases.
    // When an assignment (lvalue =) is on the right of certain operators,
    // it is not treated as a bad lvalue (as it is in C).
    // Example: (1 && a=2) # no error; the assignment is performed.
    // This happens for ?: || && ~ !~ < <= != == > >=
    //
    // NOTE(review): these values look like binding powers taken from
    // lbp_table for the operators listed above -- confirm.
    static char odd_assignment_rbp[] = {59, 60, 70, 80, 100, 110, 0};
    // Assignment needs an lvalue on the left (primary() returned -1).
    if (prim_st < 0 && (rbp <= getrbp(optor) || strchr(odd_assignment_rbp, rbp))) {
      convert_push_to_reference();
      scan();
      expr(getrbp(optor));
      gencd(optor);
      return 0;
    }
    XERR("syntax near '%s'\n", TT.tokstr[0] == '\n' ? "\\n" : TT.tokstr);
    skip_to(stmtendsy);
  }
  // A token that can start an operand here means implicit concatenation.
  if (cat_start_concated_expr(optor)) optor = tkcat;
  while (rbp < getlbp(optor)) {
    binary_op(optor);
    // HERE tok s/b an operator or expression terminator ( ; etc.).
    optor = CURTOK();
    if (cat_start_concated_expr(optor)) optor = tkcat;
  }
  return 0;
}
1930
print_stmt(int tk)1931 static void print_stmt(int tk)
1932 {
1933 static char outmodes[] = {tkgt, tkappend, tkpipe, 0};
1934 int num_exprs = 0, outmode;
1935 TT.cgl.in_print_stmt = 1;
1936 expect(tk); // tkprint or tkprintf
1937 if ((tk == tkprintf) || !strchr(printexprendsy, CURTOK())) {
1938 // printf always needs expression
1939 // print non-empty statement needs expression
1940 num_exprs = expr(CALLED_BY_PRINT);
1941 if (num_exprs > 0 && !strchr(printexprendsy, CURTOK())) FATAL("print stmt bug");
1942 if (!num_exprs) {
1943 for (num_exprs++; have_comma(); num_exprs++)
1944 expr(0);
1945 }
1946 }
1947 outmode = CURTOK();
1948 if (strchr(outmodes, outmode)) {
1949 scan();
1950 expr(0); // FIXME s/b only bwk term? check POSIX
1951 num_exprs++;
1952 } else outmode = 0;
1953 gen2cd(tk, num_exprs);
1954 gencd(outmode);
1955 TT.cgl.in_print_stmt = 0;
1956 }
1957
delete_stmt(void)1958 static void delete_stmt(void)
1959 {
1960 expect(tkdelete);
1961 if (ISTOK(tkvar)) {
1962 int slotnum = find_or_add_var_name();
1963 check_set_map(slotnum);
1964 scan();
1965 if (havetok(tklbracket)) {
1966 int num_subscripts = 0;
1967 do {
1968 expr(0);
1969 num_subscripts++;
1970 } while (have_comma());
1971 expect(tkrbracket);
1972 if (num_subscripts > 1) gen2cd(tkrbracket, num_subscripts);
1973 gen2cd(opmapref, slotnum);
1974 gencd(tkdelete);
1975 } else {
1976 // delete entire map (elements only; var is still a map)
1977 gen2cd(opmapref, slotnum);
1978 gencd(opmapdelete);
1979 }
1980 } else expect(tkvar);
1981 }
1982
simple_stmt(void)1983 static void simple_stmt(void)
1984 {
1985 if (strchr(exprstartsy, CURTOK())) {
1986 expr(0);
1987 gencd(opdrop);
1988 return;
1989 }
1990 switch (CURTOK()) {
1991 case tkprint:
1992 case tkprintf:
1993 print_stmt(CURTOK());
1994 break;
1995
1996 case tkdelete:
1997 delete_stmt();
1998 break;
1999
2000 default:
2001 XERR("syntax near '%s'\n", TT.tokstr[0] == '\n' ? "\\n" : TT.tokstr);
2002 skip_to(stmtendsy);
2003 }
2004 }
2005
prev_was_terminated(void)2006 static int prev_was_terminated(void)
2007 {
2008 return !!strchr(stmtendsy, TT.prevtok);
2009 }
2010
is_nl_semi(void)2011 static int is_nl_semi(void)
2012 {
2013 return ISTOK(tknl) || ISTOK(tksemi);
2014 }
2015
// Compile: if (expr) stmt [else stmt]
// tkif is emitted with a -1 placeholder operand whose location is kept in
// cdx; once the guarded statement has been compiled the placeholder is
// overwritten with the distance from cdx to the last code word emitted.
// If an else part follows, tkelse is emitted and patched the same way.
static void if_stmt(void)
{
  expect(tkif);
  expect(tklparen);
  expr(0);
  rparen();
  gen2cd(tkif, -1);
  int cdx = TT.zcode_last;  // location of the placeholder to patch
  stmt();
  // An else can only follow a terminated statement, so consume a pending
  // newline/semicolon terminator here in order to look for it.
  if (!prev_was_terminated() && is_nl_semi()) {
    scan();
    optional_nl();
  }
  if (prev_was_terminated()) {
    optional_nl();
    if (havetok(tkelse)) {
      gen2cd(tkelse, -1);
      // Patch the tkif placeholder, then track the tkelse placeholder.
      ZCODE[cdx] = TT.zcode_last - cdx;
      cdx = TT.zcode_last;
      optional_nl();
      stmt();
    }
  }
  // Patch whichever placeholder (tkif or tkelse) is still outstanding.
  ZCODE[cdx] = TT.zcode_last - cdx;
}
2041
save_break_continue(int * brk,int * cont)2042 static void save_break_continue(int *brk, int *cont)
2043 {
2044 *brk = TT.cgl.break_dest;
2045 *cont = TT.cgl.continue_dest;
2046 }
2047
restore_break_continue(int * brk,int * cont)2048 static void restore_break_continue(int *brk, int *cont)
2049 {
2050 TT.cgl.break_dest = *brk;
2051 TT.cgl.continue_dest = *cont;
2052 }
2053
// Compile: while (expr) stmt
// The enclosing loop's break/continue destinations are saved on entry and
// restored on exit. continue_dest is the location where the condition code
// starts; break_dest is the location of an opjump whose operand is patched,
// after the body is compiled, with the distance to the end of the loop.
static void while_stmt(void)
{
  int brk, cont;
  save_break_continue(&brk, &cont);
  expect(tkwhile);
  expect(tklparen);
  TT.cgl.continue_dest = TT.zcode_last + 1;
  expr(0);
  rparen();
  gen2cd(tkwhile, 2);    // drop, jump if true
  TT.cgl.break_dest = TT.zcode_last + 1;
  gen2cd(opjump, -1);     // jump here to break
  stmt();
  gen2cd(opjump, -1);    // jump to continue
  // Fill in the jump back to the condition (operand is the last word).
  ZCODE[TT.zcode_last] = TT.cgl.continue_dest - TT.zcode_last - 1;
  // Fill in the break jump so that it lands after the loop.
  ZCODE[TT.cgl.break_dest + 1] = TT.zcode_last - TT.cgl.break_dest - 1;
  restore_break_continue(&brk, &cont);
}
2072
// Compile: do stmt while (expr)
// Two opjump slots are emitted ahead of the body: the one at continue_dest
// is patched to reach the condition code, the one at break_dest to reach
// the end of the loop. Normal entry jumps over both. The enclosing loop's
// break/continue destinations are saved on entry and restored on exit.
static void do_stmt(void)
{
  int brk, cont;
  save_break_continue(&brk, &cont);
  expect(tkdo);
  optional_nl();
  gen2cd(opjump, 4);   // jump over jumps, to statement
  TT.cgl.continue_dest = TT.zcode_last + 1;
  gen2cd(opjump, -1);   // here on continue
  TT.cgl.break_dest = TT.zcode_last + 1;
  gen2cd(opjump, -1);   // here on break
  stmt();
  // The body must be terminated by ';' or newline before the 'while'.
  if (!prev_was_terminated()) {
    if (is_nl_semi()) {
      scan();
      optional_nl();
    } else {
      XERR("syntax near '%s' -- ';' or newline expected\n", TT.tokstr);
      // FIXME
    }
  }
  // Patch the continue jump: the condition code starts here.
  ZCODE[TT.cgl.continue_dest + 1] = TT.zcode_last - TT.cgl.continue_dest - 1;
  optional_nl();
  expect(tkwhile);
  expect(tklparen);
  expr(0);
  rparen();
  // The tkwhile operand is computed relative to break_dest, i.e. it refers
  // back to the area just ahead of the loop body.
  gen2cd(tkwhile, TT.cgl.break_dest - TT.zcode_last - 1);
  // Patch the break jump so that it lands after the loop.
  ZCODE[TT.cgl.break_dest + 1] = TT.zcode_last - TT.cgl.break_dest - 1;
  restore_break_continue(&brk, &cont);
}
2104
// Compile the remainder of a "for (init; cond; incr) stmt" loop. Called by
// for_stmt() once the init part (if any) and the ';' after it have been
// consumed. Sets TT.cgl.break_dest and TT.cgl.continue_dest for the body;
// saving and restoring the outer values is left to the caller.
static void for_not_map_iter(void)
{
  // Here after loop initialization, if any; loop condition
  int condition_loc = TT.zcode_last + 1;
  if (havetok(tksemi)) {
    // "endless" loop variant; no condition
    // no NL allowed here in OTA
    gen2cd(opjump, -1);     // jump to statement
  } else {
    optional_nl();                // NOT posix or awk book; in OTA
    expr(0);                 // loop while true
    expect(tksemi);
    gen2cd(tkwhile, -1);    // drop, jump to statement if true
  }
  optional_nl();                    // NOT posix or awk book; in OTA
  TT.cgl.break_dest = TT.zcode_last + 1;
  gen2cd(opjump, -1);       // break jump; operand patched at the end
  TT.cgl.continue_dest = TT.zcode_last + 1;
  if (!ISTOK(tkrparen)) simple_stmt();  // "increment"
  gen2cd(opjump, condition_loc - TT.zcode_last - 3);  // back to condition
  rparen();
  // Patch the placeholder emitted just before break_dest (the opjump or
  // tkwhile operand above) so that it reaches the loop body.
  ZCODE[TT.cgl.break_dest - 1] = TT.zcode_last - TT.cgl.break_dest + 1;
  stmt();
  gen2cd(opjump, TT.cgl.continue_dest - TT.zcode_last - 3);  // to increment
  // Patch the break jump so that it lands after the loop.
  ZCODE[TT.cgl.break_dest + 1] = TT.zcode_last - TT.cgl.break_dest - 1;
}
2131
valid_for_array_iteration(int first,int last)2132 static int valid_for_array_iteration(int first, int last)
2133 {
2134 return ZCODE[first] == tkvar && ZCODE[first + 2] == tkvar
2135 && ZCODE[first + 4] == tkin && ZCODE[first + 5] == opdrop
2136 && first + 5 == last;
2137 }
2138
// Compile a for statement, either the three-part form (handed off to
// for_not_map_iter()) or the "for (var in array)" form. The latter is
// recognized after the fact: the header is first compiled as a simple
// statement and the generated code is then checked and rewritten in place.
// The enclosing loop's break/continue destinations are saved on entry and
// restored on exit.
static void for_stmt(void)
{
  int brk, cont;
  save_break_continue(&brk, &cont);
  expect(tkfor);
  expect(tklparen);
  if (havetok(tksemi)) {
    // No "initialization" part
    for_not_map_iter();
  } else {
    int loop_start_loc = TT.zcode_last + 1;
    simple_stmt();  // initialization part, OR varname in arrayname form
    if (!havetok(tkrparen)) {
      expect(tksemi);
      for_not_map_iter();
    } else {
      // Must be map iteration
      // Check here for varname in varname!
      // FIXME TODO must examine generated TT.zcode for var in array?
      if (!valid_for_array_iteration(loop_start_loc, TT.zcode_last))
        XERR("%s", "bad 'for (var in array)' loop\n");
      else {
        // Rewrite the generated "var in array; drop" sequence: the first
        // tkvar becomes opvarref and the trailing tkin/opdrop pair becomes
        // a tknumber literal -1.
        ZCODE[TT.zcode_last-5] = opvarref;
        ZCODE[TT.zcode_last-1] = tknumber;
        ZCODE[TT.zcode_last] = make_literal_num_val(-1);
        TT.cgl.continue_dest = TT.zcode_last + 1;
        gen2cd(opmapiternext, 2);
        TT.cgl.break_dest = TT.zcode_last + 1;
        gen2cd(opjump, -1);   // fill in with loc after stmt
      }
      optional_nl();
      // fixup TT.stack if return or exit inside for (var in array)
      TT.cgl.stack_offset_to_fix += 3;
      stmt();
      TT.cgl.stack_offset_to_fix -= 3;
      gen2cd(opjump, TT.cgl.continue_dest - TT.zcode_last - 3);
      // Patch the break jump so that it lands on the drops below.
      ZCODE[TT.cgl.break_dest + 1] = TT.zcode_last - TT.cgl.break_dest - 1;
      // Three drops, matching the stack_offset_to_fix adjustment of 3.
      gencd(opdrop);
      gencd(opdrop);
      gencd(opdrop);
    }
  }
  restore_break_continue(&brk, &cont);
}
2183
// Compile one statement, dispatching on the current token. Control-flow
// keywords are handled here or by their own helpers; anything else is
// passed to simple_stmt(). The statement terminator is normally left for
// the caller to consume.
static void stmt(void)
{
  switch (CURTOK()) {
    case tkeof:
      break;    // FIXME ERROR?

    case tkbreak:
      scan();
      // break_dest is zero when not inside a loop.
      if (TT.cgl.break_dest) gen2cd(tkbreak, TT.cgl.break_dest - TT.zcode_last - 3);
      else XERR("%s", "break not in a loop\n");
      break;

    case tkcontinue:
      scan();
      // continue_dest is zero when not inside a loop.
      if (TT.cgl.continue_dest)
        gen2cd(tkcontinue, TT.cgl.continue_dest - TT.zcode_last - 3);
      else XERR("%s", "continue not in a loop\n");
      break;

    case tknext:
      scan();
      gencd(tknext);
      // rule_type is nonzero while compiling a BEGIN or END action.
      if (TT.cgl.rule_type) XERR("%s", "next inside BEGIN or END\n");
      if (TT.cgl.in_function_body) XERR("%s", "next inside function def\n");
      break;

    case tknextfile:
      scan();
      gencd(tknextfile);
      if (TT.cgl.rule_type) XERR("%s", "nextfile inside BEGIN or END\n");
      if (TT.cgl.in_function_body) XERR("%s", "nextfile inside function def\n");
      break;

    case tkexit:
      scan();
      // With no expression, push the NO_EXIT_STATUS literal instead.
      if (strchr(exprstartsy, CURTOK())) {
        expr(0);
      } else gen2cd(tknumber, make_literal_num_val(NO_EXIT_STATUS));
      gencd(tkexit);
      break;

    case tkreturn:
      scan();
      // Inside for (var in array) loops, drop the extra stack slots first.
      if (TT.cgl.stack_offset_to_fix) gen2cd(opdrop_n, TT.cgl.stack_offset_to_fix);
      // With no expression, the return value is the literal 0.
      if (strchr(exprstartsy, CURTOK())) {
        expr(0);
      } else gen2cd(tknumber, make_literal_num_val(0.0));
      gen2cd(tkreturn, TT.cgl.nparms);
      if (!TT.cgl.in_function_body) XERR("%s", "return outside function def\n");
      break;

    case tklbrace:
      action(tklbrace);
      break;

    case tkif:
      if_stmt();
      break;

    case tkwhile:
      while_stmt();
      break;

    case tkdo:
      do_stmt();
      break;

    case tkfor:
      for_stmt();
      break;

    case tksemi:
      // Empty statement.
      scan();
      break;
    default:
      simple_stmt();  // expression print printf delete
  }
}
2262
add_param(int funcnum,char * s)2263 static void add_param(int funcnum, char *s)
2264 {
2265 if (!find_local_entry(s)) add_local_entry(s);
2266 else XERR("function '%s' dup param '%s'\n", FUNC_DEF[funcnum].name, s);
2267 TT.cgl.nparms++;
2268
2269 // POSIX: The same name shall not be used as both a function parameter name
2270 // and as the name of a function or a special awk variable.
2271 // !!! NOTE seems implementations exc. mawk only compare param names with
2272 // builtin funcs; use same name as userfunc is OK!
2273 if (!strcmp(s, FUNC_DEF[funcnum].name))
2274 XERR("function '%s' param '%s' matches func name\n",
2275 FUNC_DEF[funcnum].name, s);
2276 if (find_global(s) && find_global(s) < TT.spec_var_limit)
2277 XERR("function '%s' param '%s' matches special var\n",
2278 FUNC_DEF[funcnum].name, s);
2279 }
2280
// Compile a user function definition:
//   function name(param, ...) { statements }
// Registers (or looks up) the function's entry, records the location of
// its code, collects its parameters as locals, compiles the body, and
// appends an implicit "return <uninit value>" for falling off the end.
static void function_def(void)
{
  expect(tkfunction);
  // The current token is now the function name.
  int funcnum = find_func_def_entry(TT.tokstr);
  if (!funcnum) {
    funcnum = add_func_def_entry(TT.tokstr);
  } else if (FUNC_DEF[funcnum].flags & FUNC_DEFINED) {
    XERR("dup defined function '%s'\n", TT.tokstr);
  }
  FUNC_DEF[funcnum].flags |= FUNC_DEFINED;
  if (find_global(TT.tokstr)) {
    // POSIX: The same name shall not be used both as a variable name with
    // global scope and as the name of a function.
    XERR("function name '%s' previously defined\n", TT.tokstr);
  }

  gen2cd(tkfunction, funcnum);
  // Record where the tkfunction opcode just emitted is located.
  FUNC_DEF[funcnum].zcode_addr = TT.zcode_last - 1;
  TT.cgl.funcnum = funcnum;
  TT.cgl.nparms = 0;
  if (ISTOK(tkfunc)) expect(tkfunc); // func name with no space before (
  else expect(tkvar); // func name with space before (
  expect(tklparen);
  if (ISTOK(tkvar)) {
    add_param(funcnum, TT.tokstr);
    scan();
    // FIXME is this the best way? what if TT.tokstr not a tkvar?
    while (have_comma()) {
      add_param(funcnum, TT.tokstr);
      expect(tkvar);
    }
  }
  rparen();
  if (ISTOK(tklbrace)) {
    TT.cgl.in_function_body = 1;
    action(tkfunc);
    TT.cgl.in_function_body = 0;
    // Need to return uninit value if falling off end of function.
    gen2cd(tknumber, make_uninit_val());
    gen2cd(tkreturn, TT.cgl.nparms);
  } else {
    XERR("syntax near '%s'\n", TT.tokstr);
    // FIXME some recovery needed here!?
  }
  // Do not re-init locals table for dup function.
  // Avoids memory leak detected by LeakSanitizer.
  if (!FUNC_DEF[funcnum].function_locals.base) {
    // Hand the collected locals over to the function entry and start a
    // fresh locals table.
    FUNC_DEF[funcnum].function_locals = TT.locals_table;
    init_locals_table();
  }
}
2332
// Compile a brace-enclosed statement list: '{' statements '}'.
// Statements are separated by newlines or semicolons. A missing separator
// is reported and input is skipped up to the next newline, ';' or '}'.
// action_type is currently unused.
static void action(int action_type)
{
  (void)action_type;
  // action_type is tkbegin, tkend, tkdo (every line), tkif (if pattern),
  // tkfunc (function body), tklbrace (compound statement)
  // Should have lbrace on entry.
  expect(tklbrace);
  for (;;) {
    if (ISTOK(tkeof)) unexpected_eof();
    optional_nl_or_semi();
    if (havetok(tkrbrace)) {
      break;
    }
    stmt();
    // stmt() is normally unterminated here, but may be terminated if we
    // have if with no else (had to consume terminator looking for else)
    // !!! if (ISTOK(tkrbrace) || prev_was_terminated())
    if (prev_was_terminated()) continue;
    if (!is_nl_semi() && !ISTOK(tkrbrace)) {
      XERR("syntax near '%s' -- newline, ';', or '}' expected\n", TT.tokstr);
      // Error recovery: skip ahead to something that can end a statement.
      while (!is_nl_semi() && !ISTOK(tkrbrace) && !ISTOK(tkeof)) scan();
      if (ISTOK(tkeof)) unexpected_eof();
    }
    if (havetok(tkrbrace)) break;
    // Must be semicolon or newline
    scan();
  }
}
2361
rule(void)2362 static void rule(void)
2363 {
2364 // pa_pat
2365 // | pa_pat lbrace stmtlist '}'
2366 // | pa_pat ',' opt_nl pa_pat
2367 // | pa_pat ',' opt_nl pa_pat lbrace stmtlist '}'
2368 // | lbrace stmtlist '}'
2369 // | XBEGIN lbrace stmtlist '}'
2370 // | XEND lbrace stmtlist '}'
2371 // | FUNC funcname '(' varlist rparen lbrace stmtlist '}'
2372
2373 switch (CURTOK()) {
2374 case tkbegin:
2375 scan();
2376 if (TT.cgl.last_begin) ZCODE[TT.cgl.last_begin] = TT.zcode_last - TT.cgl.last_begin;
2377 else TT.cgl.first_begin = TT.zcode_last + 1;
2378
2379 TT.cgl.rule_type = tkbegin;
2380 action(tkbegin);
2381 TT.cgl.rule_type = 0;
2382 gen2cd(opjump, -1);
2383 TT.cgl.last_begin = TT.zcode_last;
2384 break;
2385
2386 case tkend:
2387 scan();
2388 if (TT.cgl.last_end) ZCODE[TT.cgl.last_end] = TT.zcode_last - TT.cgl.last_end;
2389 else TT.cgl.first_end = TT.zcode_last + 1;
2390
2391 TT.cgl.rule_type = tkbegin;
2392 action(tkend);
2393 TT.cgl.rule_type = 0;
2394 gen2cd(opjump, -1);
2395 TT.cgl.last_end = TT.zcode_last;
2396 break;
2397
2398 case tklbrace:
2399 if (TT.cgl.last_recrule)
2400 ZCODE[TT.cgl.last_recrule] = TT.zcode_last - TT.cgl.last_recrule;
2401 else TT.cgl.first_recrule = TT.zcode_last + 1;
2402 action(tkdo);
2403 gen2cd(opjump, -1);
2404 TT.cgl.last_recrule = TT.zcode_last;
2405 break;
2406
2407 case tkfunction:
2408 function_def();
2409 break;
2410 default:
2411 if (TT.cgl.last_recrule)
2412 ZCODE[TT.cgl.last_recrule] = TT.zcode_last - TT.cgl.last_recrule;
2413 else TT.cgl.first_recrule = TT.zcode_last + 1;
2414 gen2cd(opjump, 1);
2415 gencd(tkeof);
2416 int cdx = 0, saveloc = TT.zcode_last;
2417 expr(0);
2418 if (!have_comma()) {
2419 gen2cd(tkif, -1);
2420 cdx = TT.zcode_last;
2421 } else {
2422 gen2cd(oprange2, ++TT.cgl.range_pattern_num);
2423 gencd(-1);
2424 cdx = TT.zcode_last;
2425 ZCODE[saveloc-2] = oprange1;
2426 ZCODE[saveloc-1] = TT.cgl.range_pattern_num;
2427 ZCODE[saveloc] = TT.zcode_last - saveloc;
2428 expr(0);
2429 gen2cd(oprange3, TT.cgl.range_pattern_num);
2430 }
2431 if (ISTOK(tklbrace)) {
2432 action(tkif);
2433 ZCODE[cdx] = TT.zcode_last - cdx;
2434 } else {
2435 gencd(opprintrec); // print $0 ?
2436 ZCODE[cdx] = TT.zcode_last - cdx;
2437 }
2438 gen2cd(opjump, -1);
2439 TT.cgl.last_recrule = TT.zcode_last;
2440 }
2441 }
2442
diag_func_def_ref(void)2443 static void diag_func_def_ref(void)
2444 {
2445 int n = zlist_len(&TT.func_def_table);
2446 for (int k = 1; k < n; k++) {
2447 if ((FUNC_DEF[k].flags & FUNC_CALLED) &&
2448 !(FUNC_DEF[k].flags & FUNC_DEFINED)) {
2449 // Sorry, we can't tell where this was called from, for now at least.
2450 XERR("Undefined function '%s'", FUNC_DEF[k].name);
2451 }
2452 }
2453 }
2454
// Compile the whole awk program: initialize compiler and scanner, compile
// rules until end of input, terminate the rule chains, append the trailing
// exit/quit code, and finally report functions that were called but never
// defined.
static void compile(void)
{
  init_compiler();
  init_scanner();
  scan();
  optional_nl_or_semi();  // Does posix allow NL or ; before first rule?
  while (! ISTOK(tkeof)) {
    rule();
    optional_nl_or_semi();  // NOT POSIX
  }


  // rule() leaves last_* at the operand of the final opjump of each chain;
  // the word just before it (presumably that opjump's opcode) is replaced
  // with opquit to end the chain.
  if (TT.cgl.last_begin) ZCODE[TT.cgl.last_begin-1] = opquit;
  if (TT.cgl.last_end) ZCODE[TT.cgl.last_end-1] = opquit;
  if (TT.cgl.last_recrule) ZCODE[TT.cgl.last_recrule-1] = opquit;

  // Trailing code: push literal 0, then tkexit, then opquit.
  gen2cd(tknumber, make_literal_num_val(0.0));
  gencd(tkexit);
  gencd(opquit);
  // If there are only BEGIN and END or only END actions, generate actions to
  // read all input before END.
  if (TT.cgl.first_end && !TT.cgl.first_recrule) {
    gencd(opquit);
    TT.cgl.first_recrule = TT.zcode_last;
  }
  gencd(opquit);  // One more opcode to keep ip in bounds in run code.
  diag_func_def_ref();
}
2483
2484 ////////////////////
2485 //// runtime
2486 ////////////////////
2487
check_numeric_string(struct zvalue * v)2488 static void check_numeric_string(struct zvalue *v)
2489 {
2490 if (v->vst) {
2491 char *end, *s = v->vst->str;
2492 // Significant speed gain with this test:
2493 // num string must begin space, +, -, ., or digit.
2494 if (strchr("+-.1234567890 ", *s)) {
2495 double num = strtod(s, &end);
2496 if (s == end || end[strspn(end, " ")]) return;
2497 v->num = num;
2498 v->flags |= ZF_NUM | ZF_STR | ZF_NUMSTR;
2499 }
2500 }
2501 }
2502
num_to_zstring(double n,char * fmt)2503 static struct zstring *num_to_zstring(double n, char *fmt)
2504 {
2505 int k;
2506 if (n == (long long)n) k = snprintf(TT.pbuf, PBUFSIZE, "%lld", (long long)n);
2507 else k = snprintf(TT.pbuf, PBUFSIZE, fmt, n);
2508 if (k < 0 || k >= PBUFSIZE) FFATAL("error encoding %f via '%s'", n, fmt);
2509 return new_zstring(TT.pbuf, k);
2510 }
2511
2512 ////////////////////
2513 //// regex routines
2514 ////////////////////
2515
// Process backslash escape sequences in s, in place, and return s.
// Handles the single-letter escapes listed in `escapes`, numeric escapes of
// up to three digits accumulated base 8, and \x followed by one or two hex
// digits. When is_regex is nonzero, "\\" is not in the escape list, and any
// unrecognized escape keeps its backslash so the regex compiler sees it;
// otherwise the backslash of an unrecognized escape is dropped.
// The result is never longer than the input.
static char *escape_str(char *s, int is_regex)
{
  char *p, *escapes = is_regex ? "abfnrtv\"/" : "\\abfnrtv\"/";
  // FIXME TODO should / be in there?
  char *s0 = s, *to = s;
  // `s` reads, `to` writes; each pass starts by copying the current char,
  // so the loop ends once the terminating NUL has been copied.
  while ((*to = *s)) {
    if (*s != '\\') { to++, s++;
    } else if ((p = strchr(escapes, *++s))) {
      // checking char after \ for known escapes
      // strchr() also matches the terminating NUL of `escapes`, in which
      // case the looked-up replacement c is 0.
      int c = (is_regex?"\a\b\f\n\r\t\v\"/":"\\\a\b\f\n\r\t\v\"/")[p-escapes];
      if (c) *to = c, s++;  // else final backslash
      to++;
    } else if ('0' <= *s && *s <= '9') {
      // Numeric escape: at most three digits in total. Note that the
      // digits 8 and 9 are accepted here as well.
      int k, c = *s++ - '0';
      for (k = 0; k < 2 && '0' <= *s && *s <= '9'; k++)
        c = c * 8 + *s++ - '0';
      *to++ = c;
    } else if (*s == 'x') {
      // Hex escape: one or two hex digits. With no hex digit following,
      // nothing is consumed here and the 'x' is copied by the next pass.
      if (isxdigit(s[1])) {
        int c = hexval(*++s);
        if (isxdigit(s[1])) c = c * 16 + hexval(*++s);
        *to++ = c, s++;
      }
    } else {
      // Unrecognized escape.
      if (is_regex) *to++ = '\\';
      *to++ = *s++;
    }
  }
  return s0;
}
2546
force_maybemap_to_scalar(struct zvalue * v)2547 static void force_maybemap_to_scalar(struct zvalue *v)
2548 {
2549 if (!(v->flags & ZF_ANYMAP)) return;
2550 if (v->flags & ZF_MAP || v->map->count)
2551 FATAL("array in scalar context");
2552 v->flags = 0;
2553 v->map = 0; // v->flags = v->map = 0 gets warning
2554 }
2555
force_maybemap_to_map(struct zvalue * v)2556 static void force_maybemap_to_map(struct zvalue *v)
2557 {
2558 if (v->flags & ZF_MAYBEMAP) v->flags = ZF_MAP;
2559 }
2560
2561 // fmt_offs is either CONVFMT or OFMT (offset in stack to zvalue)
to_str_fmt(struct zvalue * v,int fmt_offs)2562 static struct zvalue *to_str_fmt(struct zvalue *v, int fmt_offs)
2563 {
2564 force_maybemap_to_scalar(v);
2565 // TODO: consider handling numstring differently
2566 if (v->flags & ZF_NUMSTR) v->flags = ZF_STR;
2567 if (IS_STR(v)) return v;
2568 else if (!v->flags) { // uninitialized
2569 v->vst = new_zstring("", 0);
2570 } else if (IS_NUM(v)) {
2571 zvalue_release_zstring(v);
2572 if (!IS_STR(&STACK[fmt_offs])) {
2573 zstring_release(&STACK[fmt_offs].vst);
2574 STACK[fmt_offs].vst = num_to_zstring(STACK[fmt_offs].num, "%.6g");
2575 STACK[fmt_offs].flags = ZF_STR;
2576 }
2577 v->vst = num_to_zstring(v->num, STACK[fmt_offs].vst->str);
2578 } else {
2579 FATAL("Wrong or unknown type in to_str_fmt\n");
2580 }
2581 v->flags = ZF_STR;
2582 return v;
2583 }
2584
to_str(struct zvalue * v)2585 static struct zvalue *to_str(struct zvalue *v)
2586 {
2587 return to_str_fmt(v, CONVFMT);
2588 }
2589
// Yields v unchanged when it is already a string, otherwise the result of
// to_str(v). Note that the argument is expanded more than once.
// TODO FIXME Is this needed? (YES -- investigate) Just use to_str()?
#define ENSURE_STR(v) (IS_STR(v) ? (v) : to_str(v))
2592
rx_zvalue_compile(regex_t ** rx,struct zvalue * pat)2593 static void rx_zvalue_compile(regex_t **rx, struct zvalue *pat)
2594 {
2595 if (IS_RX(pat)) *rx = pat->rx;
2596 else {
2597 zvalue_dup_zstring(to_str(pat));
2598 escape_str(pat->vst->str, 1);
2599 xregcomp(*rx, pat->vst->str, REG_EXTENDED);
2600 }
2601 }
2602
rx_zvalue_free(regex_t * rx,struct zvalue * pat)2603 static void rx_zvalue_free(regex_t *rx, struct zvalue *pat)
2604 {
2605 if (!IS_RX(pat) || rx != pat->rx) regfree(rx);
2606 }
2607
2608 // Used by the match/not match ops (~ !~) and implicit $0 match (/regex/)
match(struct zvalue * zvsubject,struct zvalue * zvpat)2609 static int match(struct zvalue *zvsubject, struct zvalue *zvpat)
2610 {
2611 int r;
2612 regex_t rx, *rxp = ℞
2613 rx_zvalue_compile(&rxp, zvpat);
2614 if ((r = regexec(rxp, to_str(zvsubject)->vst->str, 0, 0, 0)) != 0) {
2615 if (r != REG_NOMATCH) {
2616 char errbuf[256];
2617 regerror(r, &rx, errbuf, sizeof(errbuf));
2618 // FIXME TODO better diagnostic here
2619 error_exit("regex match error %d: %s", r, errbuf);
2620 }
2621 rx_zvalue_free(rxp, zvpat);
2622 return 1;
2623 }
2624 rx_zvalue_free(rxp, zvpat);
2625 return 0;
2626 }
2627
// Search s for rx. On a match, store the match's start and end offsets
// (relative to s) in *start and *end and return 0. Returns REG_NOMATCH
// when nothing matches; any other regexec() failure is fatal.
static int rx_find(regex_t *rx, char *s, regoff_t *start, regoff_t *end, int eflags)
{
  regmatch_t hit;
  int rc = regexec(rx, s, 1, &hit, eflags);

  if (rc == REG_NOMATCH) return REG_NOMATCH;
  if (rc != 0) FATAL("regexec error");  // TODO ? use regerr() to meaningful msg
  *start = hit.rm_so;
  *end = hit.rm_eo;
  return 0;
}
2638
// Differs from rx_find() in that FS cannot match null (empty) string.
// See https://www.austingroupbugs.net/view.php?id=1468.
// Returns 0 with *start and *end set to offsets relative to s when a
// non-empty match is found, nonzero otherwise.
static int rx_find_FS(regex_t *rx, char *s, regoff_t *start, regoff_t *end, int eflags)
{
  int r = rx_find(rx, s, start, end, eflags);
  if (r || *start != *end) return r;  // not found, or found non-empty match
  // Found empty match, retry starting past the match
  char *p = s + *end;
  if (!*p) return REG_NOMATCH;  // End of string, no non-empty match found
  // Empty match not at EOS, move ahead and try again
  // Advance one char at a time until a non-empty match turns up, the
  // search fails, or the end of the string is reached.
  while (!r && *start == *end && *++p)
    r = rx_find(rx, p, start, end, eflags);
  if (r || !*p) return REG_NOMATCH;  // no non-empty match found
  *start += p - s;  // offsets from original string
  *end += p - s;
  return 0;
}
2656
2657 ////////////////////
2658 //// fields
2659 ////////////////////
2660
// Largest field number accepted by the field access/assignment routines.
#define FIELDS_MAX  102400 // Was 1024; need more for toybox awk test
// Pseudo field number meaning "NF itself was assigned to"; see
// fixup_fields().
#define THIS_MEANS_SET_NF    999999999
2663
get_int_val(struct zvalue * v)2664 static int get_int_val(struct zvalue *v)
2665 {
2666 if (IS_NUM(v)) return (int)v->num;
2667 if (IS_STR(v) && v->vst) return (int)atof(v->vst->str);
2668 return 0;
2669 }
2670
2671 // A single-char FS is never a regex, so make it a [<char>] regex to
2672 // match only that one char in case FS is a regex metachar.
2673 // If regex FS is needed, must use > 1 char. If a '.' regex
2674 // is needed, use e.g. '.|.' (unlikely case).
fmt_one_char_fs(char * fs)2675 static char *fmt_one_char_fs(char *fs)
2676 {
2677 if (strlen(fs) != 1) return fs;
2678 snprintf(TT.one_char_fs, sizeof(TT.one_char_fs), "[%c]", fs[0]);
2679 return TT.one_char_fs;
2680 }
2681
rx_fs_prep(char * fs)2682 static regex_t *rx_fs_prep(char *fs)
2683 {
2684 if (!strcmp(fs, " ")) return &TT.rx_default;
2685 if (!strcmp(fs, TT.fs_last)) return &TT.rx_last;
2686 if (strlen(fs) >= FS_MAX) FATAL("FS too long");
2687 strcpy(TT.fs_last, fs);
2688 regfree(&TT.rx_last);
2689 xregcomp(&TT.rx_last, fmt_one_char_fs(fs), REG_EXTENDED);
2690 return &TT.rx_last;
2691 }
2692
2693 // Only for use by split() builtin
set_map_element(struct zmap * m,int k,char * val,size_t len)2694 static void set_map_element(struct zmap *m, int k, char *val, size_t len)
2695 {
2696 // Do not need format here b/c k is integer, uses "%lld" format.
2697 struct zstring *key = num_to_zstring(k, "");// "" vs 0 format avoids warning
2698 struct zmap_slot *zs = zmap_find_or_insert_key(m, key);
2699 zstring_release(&key);
2700 zs->val.vst = zstring_update(zs->val.vst, 0, val, len);
2701 zs->val.flags = ZF_STR;
2702 check_numeric_string(&zs->val);
2703 }
2704
set_zvalue_str(struct zvalue * v,char * s,size_t size)2705 static void set_zvalue_str(struct zvalue *v, char *s, size_t size)
2706 {
2707 v->vst = zstring_update(v->vst, 0, s, size);
2708 v->flags = ZF_STR;
2709 }
2710
2711 // All changes to NF go through here!
set_nf(int nf)2712 static void set_nf(int nf)
2713 {
2714 if (nf < 0) FATAL("NF set negative");
2715 STACK[NF].num = TT.nf_internal = nf;
2716 STACK[NF].flags = ZF_NUM;
2717 }
2718
set_field(struct zmap * unused,int fnum,char * s,size_t size)2719 static void set_field(struct zmap *unused, int fnum, char *s, size_t size)
2720 { (void)unused;
2721 if (fnum < 0 || fnum > FIELDS_MAX) FFATAL("bad field num %d\n", fnum);
2722 int nfields = zlist_len(&TT.fields);
2723 // Need nfields to be > fnum b/c e.g. fnum==1 implies 2 TT.fields
2724 while (nfields <= fnum)
2725 nfields = zlist_append(&TT.fields, &uninit_zvalue) + 1;
2726 set_zvalue_str(&FIELD[fnum], s, size);
2727 set_nf(fnum);
2728 check_numeric_string(&FIELD[fnum]);
2729 }
2730
// Split s via fs, using setter; return number of TT.fields.
// This is used to split TT.fields and also for split() builtin.
//   setter  receives each piece as (m, 1-based index, start, length);
//           set_field for record splitting, set_map_element for split()
//   m       map passed through to setter
//   s       string to split
//   zvfs    separator, either a regex value or a value used as a string
static int splitter(void (*setter)(struct zmap *, int, char *, size_t), struct zmap *m, char *s, struct zvalue *zvfs)
{
  regex_t *rx;
  regoff_t offs, end;
  // True when RS is the empty string.
  int multiline_null_rs = !ENSURE_STR(&STACK[RS])->vst->str[0];
  int nf = 0, r = 0, eflag = 0;
  int one_char_fs = 0;
  char *s0 = s, *fs = "";
  if (!IS_RX(zvfs)) {
    to_str(zvfs);
    fs = zvfs->vst->str;
    one_char_fs = utf8cnt(zvfs->vst->str, zvfs->vst->size) == 1;
  }
  // Empty string or empty fs (regex).
  // Need to include !*s b/c empty string, otherwise
  // split("", a, "x") splits to a 1-element (empty element) array
  if (!*s || (IS_STR(zvfs) && !*fs) || IS_EMPTY_RX(zvfs)) {
    // Empty separator: every character becomes its own element.
    while (*s) {
      // NOTE(review): this test only separates ASCII from multibyte lead
      // bytes if plain char is unsigned -- confirm the build guarantees it.
      if (*s < 128) setter(m, ++nf, s++, 1);
      else {        // Handle UTF-8
        char cbuf[8];
        unsigned wc;
        int nc = utf8towc(&wc, s, strlen(s));
        if (nc < 2) FFATAL("bad string for split: \"%s\"\n", s0);
        s += nc;
        nc = wctoutf8(cbuf, wc);
        setter(m, ++nf, cbuf, nc);
      }
    }
    return nf;
  }
  if (IS_RX(zvfs)) rx = zvfs->rx;
  else rx = rx_fs_prep(fs);
  while (*s) {
    // Find the next occurrence of FS.
    // rx_find_FS() returns 0 if found. If nonzero, the field will
    // be the rest of the record (all of it if first time through).
    if ((r = rx_find_FS(rx, s, &offs, &end, eflag))) offs = end = strlen(s);
    if (setter == set_field && multiline_null_rs && one_char_fs) {
      // Contra POSIX, if RS=="" then newline is always also a
      // field separator only if FS is a single char (see gawk manual)
      int k = strcspn(s, "\n");
      if (k < offs) offs = k, end = k + 1;
    }
    // Searches after the first no longer start at beginning of line.
    eflag |= REG_NOTBOL;

    // Field will be s up to (not including) the offset. If offset
    // is zero and FS is found and FS is ' ' (TT.rx_default "[ \t]+"),
    // then the find is the leading or trailing spaces and/or tabs.
    // If so, skip this (empty) field, otherwise set field, length is offs.
    if (offs || r || rx != &TT.rx_default) setter(m, ++nf, s, offs);
    s += end;
  }
  // The string ended right after a separator: for a non-default separator
  // that leaves one more, empty, trailing field.
  if (!r && rx != &TT.rx_default) setter(m, ++nf, "", 0);
  return nf;
}
2789
build_fields(void)2790 static void build_fields(void)
2791 {
2792 char *rec = FIELD[0].vst->str;
2793 // TODO test this -- why did I not want to split empty $0?
2794 // Maybe don't split empty $0 b/c non-default FS gets NF==1 with splitter()?
2795 set_nf(*rec ? splitter(set_field, 0, rec, to_str(&STACK[FS])) : 0);
2796 }
2797
rebuild_field0(void)2798 static void rebuild_field0(void)
2799 {
2800 struct zstring *s = FIELD[0].vst;
2801 int nf = TT.nf_internal;
2802 if (!nf) {
2803 zvalue_copy(&FIELD[0], &uninit_string_zvalue);
2804 return;
2805 }
2806 // uninit value needed for eventual reference to .vst in zstring_release()
2807 struct zvalue tempv = uninit_zvalue;
2808 zvalue_copy(&tempv, to_str(&STACK[OFS]));
2809 for (int i = 1; i <= nf; i++) {
2810 if (i > 1) {
2811 s = s ? zstring_extend(s, tempv.vst) : zstring_copy(s, tempv.vst);
2812 }
2813 if (FIELD[i].flags) to_str(&FIELD[i]);
2814 if (FIELD[i].vst) {
2815 if (i > 1) s = zstring_extend(s, FIELD[i].vst);
2816 else s = zstring_copy(s, FIELD[i].vst);
2817 }
2818 }
2819 FIELD[0].vst = s;
2820 FIELD[0].flags |= ZF_STR;
2821 zvalue_release_zstring(&tempv);
2822 }
2823
2824 // get field ref (lvalue ref) in prep for assignment to field.
2825 // [... assigning to a nonexistent field (for example, $(NF+2)=5) shall
2826 // increase the value of NF; create any intervening TT.fields with the
2827 // uninitialized value; and cause the value of $0 to be recomputed, with the
2828 // TT.fields being separated by the value of OFS.]
2829 // Called by setup_lvalue()
get_field_ref(int fnum)2830 static struct zvalue *get_field_ref(int fnum)
2831 {
2832 if (fnum < 0 || fnum > FIELDS_MAX) error_exit("bad field num %d", fnum);
2833 if (fnum > TT.nf_internal) {
2834 // Ensure TT.fields list is large enough for fnum
2835 // Need len of TT.fields to be > fnum b/c e.g. fnum==1 implies 2 TT.fields
2836 for (int i = TT.nf_internal + 1; i <= fnum; i++) {
2837 if (i == zlist_len(&TT.fields)) zlist_append(&TT.fields, &uninit_zvalue);
2838 zvalue_copy(&FIELD[i], &uninit_string_zvalue);
2839 }
2840 set_nf(fnum);
2841 }
2842 return &FIELD[fnum];
2843 }
2844
2845 // Called by tksplit op
split(struct zstring * s,struct zvalue * a,struct zvalue * fs)2846 static int split(struct zstring *s, struct zvalue *a, struct zvalue *fs)
2847 {
2848 return splitter(set_map_element, a->map, s->str, fs);
2849 }
2850
2851 // Called by getrec_f0_f() and getrec_f0()
copy_to_field0(char * buf,size_t k)2852 static void copy_to_field0(char *buf, size_t k)
2853 {
2854 set_zvalue_str(&FIELD[0], buf, k);
2855 check_numeric_string(&FIELD[0]);
2856 build_fields();
2857 }
2858
2859 // After changing $0, must rebuild TT.fields & reset NF
2860 // Changing other field must rebuild $0
2861 // Called by gsub() and assignment ops.
fixup_fields(int fnum)2862 static void fixup_fields(int fnum)
2863 {
2864 if (fnum == THIS_MEANS_SET_NF) { // NF was assigned to
2865 int new_nf = get_int_val(&STACK[NF]);
2866 // Ensure TT.fields list is large enough for fnum
2867 // Need len of TT.fields to be > fnum b/c e.g. fnum==1 implies 2 TT.fields
2868 for (int i = TT.nf_internal + 1; i <= new_nf; i++) {
2869 if (i == zlist_len(&TT.fields)) zlist_append(&TT.fields, &uninit_zvalue);
2870 zvalue_copy(&FIELD[i], &uninit_string_zvalue);
2871 }
2872 set_nf(TT.nf_internal = STACK[NF].num);
2873 rebuild_field0();
2874 return;
2875 }
2876 // fnum is # of field that was just updated.
2877 // If it's 0, need to rebuild the TT.fields 1... n.
2878 // If it's non-0, need to rebuild field 0.
2879 to_str(&FIELD[fnum]);
2880 if (fnum) check_numeric_string(&FIELD[fnum]);
2881 if (fnum) rebuild_field0();
2882 else build_fields();
2883 }
2884
2885 // Fetching non-existent field gets uninit string value; no change to NF!
2886 // Called by tkfield op // TODO inline it?
push_field(int fnum)2887 static void push_field(int fnum)
2888 {
2889 if (fnum < 0 || fnum > FIELDS_MAX) error_exit("bad field num %d", fnum);
2890 // Contrary to posix, awk evaluates TT.fields beyond $NF as empty strings.
2891 if (fnum > TT.nf_internal) push_val(&uninit_string_zvalue);
2892 else push_val(&FIELD[fnum]);
2893 }
2894
2895 ////////////////////
2896 //// END fields
2897 ////////////////////
2898
// Shorthand for the runtime stack pointer; the helpers below treat it as
// pointing at the topmost value.
#define STKP    TT.stackp   // pointer to top of stack
2900
// Seed the random number generator with the truncated seed and return the
// seed that was passed on the previous call (0 on the first call).
static double seedrand(double seed)
{
  static double last_seed;
  double old = last_seed;

  last_seed = seed;
  srandom(trunc(seed));
  return old;
}
2908
popnumval(void)2909 static int popnumval(void)
2910 {
2911 return STKP-- -> num;
2912 }
2913
drop(void)2914 static void drop(void)
2915 {
2916 if (!(STKP->flags & (ZF_ANYMAP | ZF_RX))) zstring_release(&STKP->vst);
2917 STKP--;
2918 }
2919
// Pop n values off the stack, one drop() at a time.
static void drop_n(int n)
{
  int left;

  for (left = n; left != 0; left--) drop();
}
2924
swap(void)2925 static void swap(void)
2926 {
2927 struct zvalue tmp = STKP[-1];
2928 STKP[-1] = STKP[0];
2929 STKP[0] = tmp;
2930 }
2931
2932 // Set and return logical (0/1) val of top TT.stack value; flag value as NUM.
get_set_logical(void)2933 static int get_set_logical(void)
2934 {
2935 struct zvalue *v = STKP;
2936 force_maybemap_to_scalar(v);
2937 int r = 0;
2938 if (IS_NUM(v)) r = !! v->num;
2939 else if (IS_STR(v)) r = (v->vst && v->vst->str[0]);
2940 zvalue_release_zstring(v);
2941 v->num = r;
2942 v->flags = ZF_NUM;
2943 return r;
2944 }
2945
2946
to_num(struct zvalue * v)2947 static double to_num(struct zvalue *v)
2948 {
2949 force_maybemap_to_scalar(v);
2950 if (v->flags & ZF_NUMSTR) zvalue_release_zstring(v);
2951 else if (!IS_NUM(v)) {
2952 v->num = 0.0;
2953 if (IS_STR(v) && v->vst) v->num = atof(v->vst->str);
2954 zvalue_release_zstring(v);
2955 }
2956 v->flags = ZF_NUM;
2957 return v->num;
2958 }
2959
set_num(struct zvalue * v,double n)2960 static void set_num(struct zvalue *v, double n)
2961 {
2962 zstring_release(&v->vst);
2963 v->num = n;
2964 v->flags = ZF_NUM;
2965 }
2966
incr_zvalue(struct zvalue * v)2967 static void incr_zvalue(struct zvalue *v)
2968 {
2969 v->num = trunc(to_num(v)) + 1;
2970 }
2971
push_int_val(ptrdiff_t n)2972 static void push_int_val(ptrdiff_t n)
2973 {
2974 struct zvalue v = ZVINIT(ZF_NUM, n, 0);
2975 push_val(&v);
2976 }
2977
get_map_val(struct zvalue * v,struct zvalue * key)2978 static struct zvalue *get_map_val(struct zvalue *v, struct zvalue *key)
2979 {
2980 struct zmap_slot *x = zmap_find_or_insert_key(v->map, to_str(key)->vst);
2981 return &x->val;
2982 }
2983
setup_lvalue(int ref_stack_ptr,int parmbase,int * field_num)2984 static struct zvalue *setup_lvalue(int ref_stack_ptr, int parmbase, int *field_num)
2985 {
2986 // ref_stack_ptr is number of slots down in stack the ref is
2987 // for +=, *=, etc
2988 // Stack is: ... scalar_ref value_to_op_by
2989 // or ... subscript_val map_ref value_to_op_by
2990 // or ... fieldref value_to_op_by
2991 // for =, ++, --
2992 // Stack is: ... scalar_ref
2993 // or ... subscript_val map_ref
2994 // or ... fieldnum fieldref
2995 int k;
2996 struct zvalue *ref, *v = 0; // init v to mute "may be uninit" warning
2997 *field_num = -1;
2998 ref = STKP - ref_stack_ptr;
2999 if (ref->flags & ZF_FIELDREF) return get_field_ref(*field_num = ref->num);
3000 k = ref->num >= 0 ? ref->num : parmbase - ref->num;
3001 if (k == NF) *field_num = THIS_MEANS_SET_NF;
3002 v = &STACK[k];
3003 if (ref->flags & ZF_REF) {
3004 force_maybemap_to_scalar(v);
3005 } else if (ref->flags & ZF_MAPREF) {
3006 force_maybemap_to_map(v);
3007 if (!IS_MAP(v)) FATAL("scalar in array context");
3008 v = get_map_val(v, STKP - ref_stack_ptr - 1);
3009 swap();
3010 drop();
3011 } else FATAL("assignment to bad lvalue");
3012 return v; // order FATAL() and return to mute warning
3013 }
3014
new_file(char * fn,FILE * fp,char mode,char file_or_pipe,char is_std_file)3015 static struct zfile *new_file(char *fn, FILE *fp, char mode, char file_or_pipe,
3016 char is_std_file)
3017 {
3018 struct zfile *f = xzalloc(sizeof(struct zfile));
3019 *f = (struct zfile){TT.zfiles, xstrdup(fn), fp, mode, file_or_pipe,
3020 isatty(fileno(fp)), is_std_file, 0, 0, 0, 0, 0};
3021 return TT.zfiles = f;
3022 }
3023
fflush_all(void)3024 static int fflush_all(void)
3025 {
3026 int ret = 0;
3027 for (struct zfile *p = TT.zfiles; p; p = p->next)
3028 if (fflush(p->fp)) ret = -1;
3029 return ret;
3030 }
3031
fflush_file(int nargs)3032 static int fflush_file(int nargs)
3033 {
3034 if (!nargs) return fflush_all();
3035
3036 to_str(STKP); // filename at top of TT.stack
3037 // Null string means flush all
3038 if (!STKP[0].vst->str[0]) return fflush_all();
3039
3040 // is it open in file table?
3041 for (struct zfile *p = TT.zfiles; p; p = p->next)
3042 if (!strcmp(STKP[0].vst->str, p->fn))
3043 if (!fflush(p->fp)) return 0;
3044 return -1; // error, or file not found in table
3045 }
close_file(char * fn)3046 static int close_file(char *fn)
3047 {
3048 // !fn (null ptr) means close all (exc. stdin/stdout/stderr)
3049 int r = 0;
3050 struct zfile *np, **pp = &TT.zfiles;
3051 for (struct zfile *p = TT.zfiles; p; p = np) {
3052 np = p->next; // save in case unlinking file (invalidates p->next)
3053 // Don't close std files -- wrecks print/printf (can be fixed though TODO)
3054 if ((!p->is_std_file) && (!fn || !strcmp(fn, p->fn))) {
3055 xfree(p->buf);
3056 xfree(p->fn);
3057 r = (p->fp) ? (p->file_or_pipe ? fclose : pclose)(p->fp) : -1;
3058 *pp = p->next;
3059 xfree(p);
3060 if (fn) return r;
3061 } else pp = &p->next; // only if not unlinking zfile
3062 }
3063 return -1; // file not in table, or closed all files
3064 }
3065
3066 static struct zfile badfile_obj, *badfile = &badfile_obj;
3067
3068 // FIXME TODO check if file/pipe/mode matches what's in the table already.
3069 // Apparently gawk/mawk/nawk are OK with different mode, but just use the file
3070 // in whatever mode it's already in; i.e. > after >> still appends.
setup_file(char file_or_pipe,char * mode)3071 static struct zfile *setup_file(char file_or_pipe, char *mode)
3072 {
3073 to_str(STKP); // filename at top of TT.stack
3074 char *fn = STKP[0].vst->str;
3075 // is it already open in file table?
3076 for (struct zfile *p = TT.zfiles; p; p = p->next)
3077 if (!strcmp(fn, p->fn)) {
3078 drop();
3079 return p; // open; return it
3080 }
3081 FILE *fp = (file_or_pipe ? fopen : popen)(fn, mode);
3082 if (fp) {
3083 struct zfile *p = new_file(fn, fp, *mode, file_or_pipe, 0);
3084 drop();
3085 return p;
3086 }
3087 if (*mode != 'r') FFATAL("cannot open '%s'\n", fn);
3088 drop();
3089 return badfile;
3090 }
3091
3092 // TODO FIXME should be a function?
3093 #define stkn(n) ((int)(TT.stackp - (n) - (struct zvalue *)TT.stack.base))
3094
getcnt(int k)3095 static int getcnt(int k)
3096 {
3097 if (k >= stkn(0)) FATAL("too few args for printf\n");
3098 return (int)to_num(&STACK[k]);
3099 }
3100
fsprintf(FILE * ignored,const char * fmt,...)3101 static int fsprintf(FILE *ignored, const char *fmt, ...)
3102 {
3103 (void)ignored;
3104 va_list args, args2;
3105 va_start(args, fmt);
3106 va_copy(args2, args);
3107 int len = vsnprintf(0, 0, fmt, args); // size needed
3108 va_end(args);
3109 if (len < 0) FATAL("Bad sprintf format");
3110 // Unfortunately we have to mess with zstring internals here.
3111 if (TT.rgl.zspr->size + len + 1 > TT.rgl.zspr->capacity) {
3112 // This should always work b/c capacity > size
3113 unsigned cap = 2 * TT.rgl.zspr->capacity + len;
3114 TT.rgl.zspr = xrealloc(TT.rgl.zspr, sizeof(*TT.rgl.zspr) + cap);
3115 TT.rgl.zspr->capacity = cap;
3116 }
3117 vsnprintf(TT.rgl.zspr->str + TT.rgl.zspr->size, len+1, fmt, args2);
3118 TT.rgl.zspr->size += len;
3119 TT.rgl.zspr->str[TT.rgl.zspr->size] = 0;
3120 va_end(args2);
3121 return 0;
3122 }
3123
varprint(int (* fpvar)(FILE *,const char *,...),FILE * outfp,int nargs)3124 static void varprint(int(*fpvar)(FILE *, const char *, ...), FILE *outfp, int nargs)
3125 {
3126 int k, nn, nnc, fmtc, holdc, cnt1 = 0, cnt2 = 0;
3127 char *s = 0; // to shut up spurious warning
3128 regoff_t offs = -1, e = -1;
3129 char *pfmt, *fmt = to_str(STKP-nargs+1)->vst->str;
3130 k = stkn(nargs - 2);
3131 while (*fmt) {
3132 double n = 0;
3133 nn = strcspn(fmt, "%");
3134 if (nn) {
3135 holdc = fmt[nn];
3136 fmt[nn] = 0;
3137 fpvar(outfp, "%s", fmt);
3138 fmt[nn] = holdc;
3139 }
3140 fmt += nn;
3141 if (!*(pfmt = fmt)) break;
3142 nnc = strcspn(fmt+1, "aAdiouxXfFeEgGcs%");
3143 fmtc = fmt[nnc+1];
3144 if (!fmtc) FFATAL("bad printf format '%s'", fmt);
3145 holdc = fmt[nnc+2];
3146 fmt[nnc+2] = 0;
3147 if (rx_find(&TT.rx_printf_fmt, fmt, &offs, &e, 0))
3148 FFATAL("bad printf format <%s>\n", fmt);
3149 int nargsneeded = 1;
3150 for (char *p = strchr(fmt, '*'); p; p = strchr(p+1, '*'))
3151 nargsneeded++;
3152 nargsneeded -= fmtc == '%';
3153
3154 switch (nargsneeded) {
3155 case 0:
3156 fpvar(outfp, fmt);
3157 break;
3158 case 3:
3159 cnt1 = getcnt(k++);
3160 ATTR_FALLTHROUGH_INTENDED;
3161 case 2:
3162 cnt2 = getcnt(k++);
3163 ATTR_FALLTHROUGH_INTENDED;
3164 case 1:
3165 if (k > stkn(0)) FATAL("too few args for printf\n");
3166 if (fmtc == 's') {
3167 s = to_str(&STACK[k++])->vst->str;
3168 } else if (fmtc == 'c' && !IS_NUM(&STACK[k])) {
3169 unsigned wch;
3170 struct zvalue *z = &STACK[k++];
3171 if (z->vst && z->vst->str[0])
3172 n = utf8towc(&wch, z->vst->str, z->vst->size) < 1 ? 0xfffd : wch;
3173 } else {
3174 n = to_num(&STACK[k++]);
3175 }
3176 if (strchr("cdiouxX", fmtc)) {
3177 pfmt = strcpy(TT.pbuf, fmt);
3178 if (pfmt[nnc] != 'l') {
3179 strcpy(pfmt+nnc+1, "l_");
3180 pfmt[nnc+2] = fmtc;
3181 }
3182 }
3183 if (fmtc == 'c' && n > 0x10ffff) n = 0xfffd; // musl won't take larger "wchar"
3184 switch (nargsneeded) {
3185 case 1:
3186 if (fmtc == 's') fpvar(outfp, pfmt, s);
3187 else if (fmtc == 'c') fpvar(outfp, pfmt, (wint_t)n);
3188 else if (strchr("di", fmtc)) fpvar(outfp, pfmt, (long)n);
3189 else if (strchr("ouxX", fmtc)) fpvar(outfp, pfmt, (unsigned long)n);
3190 else fpvar(outfp, pfmt, n);
3191 break;
3192 case 2:
3193 if (fmtc == 's') fpvar(outfp, pfmt, cnt2, s);
3194 else if (fmtc == 'c') fpvar(outfp, pfmt, cnt2, (wint_t)n);
3195 else if (strchr("di", fmtc)) fpvar(outfp, pfmt, cnt2, (long)n);
3196 else if (strchr("ouxX", fmtc)) fpvar(outfp, pfmt, cnt2, (unsigned long)n);
3197 else fpvar(outfp, pfmt, cnt2, n);
3198 break;
3199 case 3:
3200 if (fmtc == 's') fpvar(outfp, pfmt, cnt1, cnt2, s);
3201 else if (fmtc == 'c') fpvar(outfp, pfmt, cnt1, cnt2, (wint_t)n);
3202 else if (strchr("di", fmtc)) fpvar(outfp, pfmt, cnt1, cnt2, (long)n);
3203 else if (strchr("ouxX", fmtc)) fpvar(outfp, pfmt, cnt1, cnt2, (unsigned long)n);
3204 else fpvar(outfp, pfmt, cnt1, cnt2, n);
3205 break;
3206 }
3207 break;
3208 default:
3209 FATAL("bad printf format\n");
3210 }
3211 fmt += nnc + 2;
3212 *fmt = holdc;
3213 }
3214 }
3215
// Return 1 if v is a valid variable name: a letter or underscore followed by
// any number of letters, digits, or underscores. Return 0 otherwise,
// including for the empty string.
static int is_ok_varname(char *v)
{
  static const char first_ok[] =
    "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_";
  char *p = v;

  if (!*p || !strchr(first_ok, *p)) return 0;
  while (*++p)
    if (!strchr(first_ok, *p) && !(*p >= '0' && *p <= '9')) return 0;
  return 1;
}
3224
3225 // FIXME TODO return value never used. What if assign to var not in globals?
assign_global(char * var,char * value)3226 static int assign_global(char *var, char *value)
3227 {
3228 if (!is_ok_varname(var)) FFATAL("Invalid variable name '%s'\n", var);
3229 int globals_ent = find_global(var);
3230 if (globals_ent) {
3231 struct zvalue *v = &STACK[globals_ent];
3232 if (IS_MAP(v)) error_exit("-v assignment to array"); // Maybe not needed?
3233
3234 // The compile phase may insert a var in global table with flag of zero. Then
3235 // init_globals() will assign a ZF_MAYBEMAP flag to it. If it is then assigned
3236 // via -v option or by assignment_arg() it will here be assigned a string value.
3237 // So first, remove all map data to prevent memory leak. BUG FIX // 2024-02-13.
3238 if (v->flags & ZF_ANYMAP) {
3239 zmap_delete_map_incl_slotdata(v->map);
3240 xfree(v->map);
3241 v->map = 0;
3242 v->flags &= ~ZF_ANYMAP;
3243 }
3244
3245 zvalue_release_zstring(v);
3246 value = xstrdup(value);
3247 *v = new_str_val(escape_str(value, 0));
3248 xfree(value);
3249 check_numeric_string(v);
3250 return 1;
3251 }
3252 return 0;
3253 }
3254
// If arg has the form name=value with a valid variable name, assign the
// global and return 1; otherwise return 0 so the caller (nextfilearg) can
// treat arg as a filename. arg is split at the first '=' in place while it
// is examined and is restored before returning.
static int assignment_arg(char *arg)
{
  char *eq = strchr(arg, '=');
  int ok;

  if (!eq) return 0;
  *eq = 0;
  ok = is_ok_varname(arg);
  if (ok) assign_global(arg, eq + 1);
  *eq = '=';
  return ok;
}
3274
nextfilearg(void)3275 static char *nextfilearg(void)
3276 {
3277 char *arg;
3278 do {
3279 if (++TT.rgl.narg >= (int)to_num(&STACK[ARGC])) return 0;
3280 struct zvalue *v = &STACK[ARGV];
3281 struct zvalue zkey = ZVINIT(ZF_STR, 0,
3282 num_to_zstring(TT.rgl.narg, to_str(&STACK[CONVFMT])->vst->str));
3283 arg = "";
3284 if (zmap_find(v->map, zkey.vst)) {
3285 zvalue_copy(&TT.rgl.cur_arg, to_str(get_map_val(v, &zkey)));
3286 arg = TT.rgl.cur_arg.vst->str;
3287 }
3288 zvalue_release_zstring(&zkey);
3289 } while (!*arg || assignment_arg(arg));
3290 TT.rgl.nfiles++;
3291 return arg;
3292 }
3293
next_fp(void)3294 static int next_fp(void)
3295 {
3296 char *fn = nextfilearg();
3297 if (TT.cfile->fp && TT.cfile->fp != stdin) fclose(TT.cfile->fp);
3298 if ((!fn && !TT.rgl.nfiles && TT.cfile->fp != stdin) || (fn && !strcmp(fn, "-"))) {
3299 xfree(TT.cfile->buf);
3300 *TT.cfile = (struct zfile){0};
3301 TT.cfile->fp = stdin;
3302 TT.cfile->fn = "-";
3303 zvalue_release_zstring(&STACK[FILENAME]);
3304 STACK[FILENAME].vst = new_zstring("-", 1);
3305 } else if (fn) {
3306 xfree(TT.cfile->buf);
3307 *TT.cfile = (struct zfile){0};
3308 if (!(TT.cfile->fp = fopen(fn, "r"))) FFATAL("can't open %s\n", fn);
3309 TT.cfile->fn = fn;
3310 zvalue_copy(&STACK[FILENAME], &TT.rgl.cur_arg);
3311 } else {
3312 TT.rgl.eof = 1;
3313 return 0;
3314 }
3315 set_num(&STACK[FNR], 0);
3316 TT.cfile->is_tty = isatty(fileno(TT.cfile->fp));
3317 return 1;
3318 }
3319
// Locate the record separator in the len bytes at s.
// If one_byte_rs is nonzero it is the separator byte and memchr() is used;
// otherwise the compiled regex rx is matched with regexec0().
// On a hit, *start and *end receive the separator's start and end offsets
// and 0 is returned. Returns REG_NOMATCH when no separator is found; any
// other regexec0() error is fatal.
static int rx_find_rs(regex_t *rx, char *s, long len,
    regoff_t *start, regoff_t *end, int one_byte_rs)
{
  regmatch_t hit_rx;
  int rc;

  if (one_byte_rs) {
    char *hit = memchr(s, one_byte_rs, len);

    if (!hit) return REG_NOMATCH;
    *start = hit - s;
    *end = *start + 1;
    return 0;
  }

  rc = regexec0(rx, s, len, 1, &hit_rx, 0);
  if (rc == REG_NOMATCH) return rc;
  if (rc) FATAL("regexec error");  // TODO ? use regerr() to meaningful msg
  *start = hit_rx.rm_so;
  *end = hit_rx.rm_eo;
  return 0;
}
3338
3339 // get a record; return length, or -1 at EOF
3340 // Does work for getrec_f() for regular RS or multiline
getr(struct zfile * zfp,int rs_mode)3341 static ssize_t getr(struct zfile *zfp, int rs_mode)
3342 {
3343 // zfp->buf (initially null) points to record buffer
3344 // zfp->buflen -- size of allocated buf
3345 // TT.rgl.recptr -- points to where record is being / has been read into
3346 // zfp->ro -- offset in buf to record data
3347 // zfp->lim -- offset to 1+last byte read in buffer
3348 // rs_mode nonzero iff multiline mode; reused for one-byte RS
3349
3350 regex_t rsrx; // FIXME Need to cache and avoid rx compile on every record?
3351 long ret = -1;
3352 int r = -REG_NOMATCH; // r cannot have this value after rx_findx() below
3353 regoff_t so = 0, eo = 0;
3354 size_t m = 0, n = 0;
3355
3356 xregcomp(&rsrx, rs_mode ? "\n\n+" : fmt_one_char_fs(STACK[RS].vst->str),
3357 REG_EXTENDED);
3358 rs_mode = strlen(STACK[RS].vst->str) == 1 ? STACK[RS].vst->str[0] : 0;
3359 for ( ;; ) {
3360 if (zfp->ro == zfp->lim && zfp->eof) break; // EOF & last record; return -1
3361
3362 // Allocate initial buffer, and expand iff buffer holds one
3363 // possibly (probably) incomplete record.
3364 if (zfp->ro == 0 && zfp->lim == zfp->buflen)
3365 zfp->buf = xrealloc(zfp->buf,
3366 (zfp->buflen = maxof(512, zfp->buflen * 2)) + 1);
3367
3368 if ((m = zfp->buflen - zfp->lim) && !zfp->eof) {
3369 // Read iff space left in buffer
3370 if (zfp->is_tty) m = 1;
3371 n = fread(zfp->buf + zfp->lim, 1, m, zfp->fp);
3372 if (n < m) {
3373 if (ferror(zfp->fp)) FFATAL("i/o error %d on %s!", errno, zfp->fn);
3374 zfp->eof = 1;
3375 if (!n && r == -REG_NOMATCH) break; // catch empty file here
3376 }
3377 zfp->lim += n;
3378 zfp->buf[zfp->lim] = 0;
3379 }
3380 TT.rgl.recptr = zfp->buf + zfp->ro;
3381 r = rx_find_rs(&rsrx, TT.rgl.recptr, zfp->lim - zfp->ro, &so, &eo, rs_mode);
3382 if (!r && so == eo) r = 1; // RS was empty, so fake not found
3383
3384 if (!zfp->eof && (r
3385 || (zfp->lim - (zfp->ro + eo)) < zfp->buflen / 4) && !zfp->is_tty) {
3386 // RS not found, or found near lim. Slide up and try to get more data
3387 // If recptr at start of buf and RS not found then expand buffer
3388 memmove(zfp->buf, TT.rgl.recptr, zfp->lim - zfp->ro);
3389 zfp->lim -= zfp->ro;
3390 zfp->ro = 0;
3391 continue;
3392 }
3393 ret = so; // If RS found, then 'so' is rec length
3394 if (zfp->eof) {
3395 if (r) { // EOF and RS not found; rec is all data left in buf
3396 ret = zfp->lim - zfp->ro;
3397 zfp->ro = zfp->lim; // set ro for -1 return on next call
3398 } else zfp->ro += eo; // RS found; advance ro
3399 } else zfp->ro += eo; // Here only if RS found not near lim
3400
3401 if (!r || !zfp->is_tty) {
3402 // If is_tty then RS found; reset buffer pointers;
3403 // is_tty uses one rec per buffer load
3404 if (zfp->is_tty) zfp->ro = zfp->lim = 0;
3405 break;
3406 } // RS not found AND is_tty; loop to keep reading
3407 }
3408 regfree(&rsrx);
3409 return ret;
3410 }
3411
3412 // get a record; return length, or -1 at EOF
getrec_f(struct zfile * zfp)3413 static ssize_t getrec_f(struct zfile *zfp)
3414 {
3415 int k;
3416 if (ENSURE_STR(&STACK[RS])->vst->str[0]) return getr(zfp, 0);
3417 // RS == "" so multiline read
3418 // Passing 1 to getr() forces multiline mode, which uses regex "\n\n+" to
3419 // split on sequences of 2 or more newlines. But that's not the same as
3420 // multiline mode, which never returns empty records or records with leading
3421 // or trailing newlines, which can occur with RS="\n\n+". So here we loop and
3422 // strip leading/trailing newlines and discard empty lines. See gawk manual,
3423 // "4.9 Multiple-Line Records" for info on this difference.
3424 do {
3425 k = getr(zfp, 1);
3426 if (k < 0) break;
3427 while (k && TT.rgl.recptr[k-1] == '\n') k--;
3428 while (k && TT.rgl.recptr[0] == '\n') k--, TT.rgl.recptr++;
3429 } while (!k);
3430 return k;
3431 }
3432
getrec(void)3433 static ssize_t getrec(void)
3434 {
3435 ssize_t k;
3436 if (TT.rgl.eof) return -1;
3437 if (!TT.cfile->fp) next_fp();
3438 do {
3439 if ((k = getrec_f(TT.cfile)) >= 0) return k;
3440 } while (next_fp());
3441 return -1;
3442 }
3443
getrec_f0_f(struct zfile * zfp)3444 static ssize_t getrec_f0_f(struct zfile *zfp)
3445 {
3446 ssize_t k = getrec_f(zfp);
3447 if (k >= 0) {
3448 copy_to_field0(TT.rgl.recptr, k);
3449 }
3450 return k;
3451 }
3452
getrec_f0(void)3453 static ssize_t getrec_f0(void)
3454 {
3455 ssize_t k = getrec();
3456 if (k >= 0) {
3457 copy_to_field0(TT.rgl.recptr, k);
3458 incr_zvalue(&STACK[NR]);
3459 incr_zvalue(&STACK[FNR]);
3460 }
3461 return k;
3462 }
3463
3464 // source is tkeof (no pipe/file), tklt (file), or tkpipe (pipe)
3465 // fp is file or pipe (is NULL if file/pipe could not be opened)
3466 // FIXME TODO should -1 return be replaced by test at caller?
3467 // v is NULL or an lvalue ref
awk_getline(int source,struct zfile * zfp,struct zvalue * v)3468 static int awk_getline(int source, struct zfile *zfp, struct zvalue *v)
3469 {
3470 ssize_t k;
3471 int is_stream = source != tkeof;
3472 if (is_stream && !zfp->fp) return -1;
3473 if (v) {
3474 if ((k = is_stream ? getrec_f(zfp) : getrec()) < 0) return 0;
3475 zstring_release(&v->vst);
3476 v->vst = new_zstring(TT.rgl.recptr, k);
3477 v->flags = ZF_STR;
3478 check_numeric_string(v); // bug fix 20240514
3479 if (!is_stream) {
3480 incr_zvalue(&STACK[NR]);
3481 incr_zvalue(&STACK[FNR]);
3482 }
3483 } else k = is_stream ? getrec_f0_f(zfp) : getrec_f0();
3484 return k < 0 ? 0 : 1;
3485 }
3486
3487 // Define GAWK_SUB to get the same behavior with sub()/gsub() replacement text
3488 // as with gawk, goawk, and recent bwk awk (nawk) versions. Undefine GAWK_SUB
3489 // to get the simpler POSIX behavior, but I think most users will prefer the
3490 // gawk behavior. See the gawk (GNU Awk) manual,
3491 // sec. 9.1.4.1 // More about '\' and '&' with sub(), gsub(), and gensub()
3492 // for details on the differences.
3493 //
3494 #undef GAWK_SUB
3495 #define GAWK_SUB
3496
3497 // sub(ere, repl[, in]) Substitute the string repl in place of the
3498 // first instance of the extended regular expression ERE in string 'in'
3499 // and return the number of substitutions. An <ampersand> ( '&' )
3500 // appearing in the string repl shall be replaced by the string from in
3501 // that matches the ERE. (partial spec... there's more)
gsub(int opcode,int nargs,int parmbase)3502 static void gsub(int opcode, int nargs, int parmbase)
3503 { (void)nargs;
3504 int field_num = -1;
3505 // compile ensures 3 args
3506 struct zvalue *v = setup_lvalue(0, parmbase, &field_num);
3507 struct zvalue *ere = STKP-2;
3508 struct zvalue *repl = STKP-1;
3509 regex_t rx, *rxp = ℞
3510 rx_zvalue_compile(&rxp, ere);
3511 to_str(repl);
3512 to_str(v);
3513
3514 #define SLEN(zvalp) ((zvalp)->vst->size)
3515 char *p, *rp0 = repl->vst->str, *rp = rp0, *s = v->vst->str;
3516 int namps = 0, nhits = 0, is_sub = (opcode == tksub), eflags = 0;
3517 regoff_t so = -1, eo;
3518 // Count ampersands in repl string; may be overcount due to \& escapes.
3519 for (rp = rp0; *rp; rp++) namps += *rp == '&';
3520 p = s;
3521 regoff_t need = SLEN(v) + 1; // capacity needed for result string
3522 // A pass just to determine needed destination (result) string size.
3523 while(!rx_find(rxp, p, &so, &eo, eflags)) {
3524 need += SLEN(repl) + (eo - so) * (namps - 1);
3525 if (!*p) break;
3526 p += eo ? eo : 1; // ensure progress if empty hit at start
3527 if (is_sub) break;
3528 eflags |= REG_NOTBOL;
3529 }
3530
3531 if (so >= 0) { // at least one hit
3532 struct zstring *z = xzalloc(sizeof(*z) + need);
3533 z->capacity = need;
3534
3535 char *e = z->str; // result destination pointer
3536 p = s;
3537 eflags = 0;
3538 char *ep0 = p, *sp, *ep;
3539 while(!rx_find(rxp, p, &so, &eo, eflags)) {
3540 sp = p + so;
3541 ep = p + eo;
3542 memmove(e, ep0, sp - ep0); // copy unchanged part
3543 e += sp - ep0;
3544 // Skip match if not at start and just after prev match and this is empty
3545 if (p == s || sp - ep0 || eo - so) {
3546 nhits++;
3547 for (rp = rp0; *rp; rp++) { // copy replacement
3548 if (*rp == '&') {
3549 memmove(e, sp, eo - so); //copy match
3550 e += eo - so;
3551 } else if (*rp == '\\') {
3552 if (rp[1] == '&') *e++ = *++rp;
3553 else if (rp[1] != '\\') *e++ = *rp;
3554 else {
3555 #ifdef GAWK_SUB
3556 if (rp[2] == '\\' && rp[3] == '&') {
3557 rp += 2;
3558 *e++ = *rp;
3559 } else if (rp[2] != '&') *e++ = '\\';
3560 #endif
3561 *e++ = *++rp;
3562 }
3563 } else *e++ = *rp;
3564 }
3565 }
3566 ep0 = ep;
3567 if (!*p) break;
3568 p += eo ? eo : 1; // ensure progress if empty hit at start
3569 if (is_sub) break;
3570 eflags |= REG_NOTBOL;
3571 }
3572 // copy remaining subject string
3573 memmove(e, ep0, s + SLEN(v) - ep0);
3574 e += s + SLEN(v) - ep0;
3575 *e = 0;
3576 z->size = e - z->str;
3577 zstring_release(&v->vst);
3578 v->vst = z;
3579 }
3580 rx_zvalue_free(rxp, ere);
3581 if (!IS_RX(STKP-2)) zstring_release(&STKP[-2].vst);
3582 drop_n(3);
3583 push_int_val(nhits);
3584 if (field_num >= 0) fixup_fields(field_num);
3585 }
3586
3587 // Initially set stackp_needmore at MIN_STACK_LEFT before limit.
3588 // When stackp > stackp_needmore, then expand and reset stackp_needmore
add_stack(struct zvalue ** stackp_needmore)3589 static void add_stack(struct zvalue **stackp_needmore)
3590 {
3591 int k = stkn(0); // stack elements in use
3592 zlist_expand(&TT.stack);
3593 STKP = (struct zvalue *)TT.stack.base + k;
3594 *stackp_needmore = (struct zvalue *)TT.stack.limit - MIN_STACK_LEFT;
3595 }
3596
3597 #define CLAMP(x, lo, hi) ((x) < (lo) ? (lo) : (x) > (hi) ? (hi) : (x))
3598
3599 // Main loop of interpreter. Run this once for all BEGIN rules (which
3600 // have had their instructions chained in compile), all END rules (also
3601 // chained in compile), and once for each record of the data file(s).
interpx(int start,int * status)3602 static int interpx(int start, int *status)
3603 {
3604 int *ip = &ZCODE[start];
3605 int opcode, op2, k, r, nargs, nsubscrs, range_num, parmbase = 0;
3606 int field_num;
3607 double nleft, nright, d;
3608 double (*mathfunc[])(double) = {cos, sin, exp, log, sqrt, trunc};
3609 struct zvalue *v, vv,
3610 *stackp_needmore = (struct zvalue*)TT.stack.limit - MIN_STACK_LEFT;
3611 while ((opcode = *ip++)) {
3612
3613 switch (opcode) {
3614 case opquit:
3615 return opquit;
3616
3617 case tknot:
3618 (STKP)->num = ! get_set_logical();
3619 break;
3620
3621 case opnotnot:
3622 get_set_logical();
3623 break;
3624
3625 case opnegate:
3626 STKP->num = -to_num(STKP);
3627 break;
3628
3629 case tkpow: // FALLTHROUGH intentional here
3630 case tkmul: // FALLTHROUGH intentional here
3631 case tkdiv: // FALLTHROUGH intentional here
3632 case tkmod: // FALLTHROUGH intentional here
3633 case tkplus: // FALLTHROUGH intentional here
3634 case tkminus:
3635 nleft = to_num(STKP-1);
3636 nright = to_num(STKP);
3637 switch (opcode) {
3638 case tkpow: nleft = pow(nleft, nright); break;
3639 case tkmul: nleft *= nright; break;
3640 case tkdiv: nleft /= nright; break;
3641 case tkmod: nleft = fmod(nleft, nright); break;
3642 case tkplus: nleft += nright; break;
3643 case tkminus: nleft -= nright; break;
3644 }
3645 drop();
3646 STKP->num = nleft;
3647 break;
3648
3649 // FIXME REDO REDO ?
3650 case tkcat:
3651 to_str(STKP-1);
3652 to_str(STKP);
3653 STKP[-1].vst = zstring_extend(STKP[-1].vst, STKP[0].vst);
3654 drop();
3655 break;
3656
3657 // Comparisons (with the '<', "<=", "!=", "==", '>', and ">="
3658 // operators) shall be made numerically:
3659 // * if both operands are numeric,
3660 // * if one is numeric and the other has a string value that is a
3661 // numeric string,
3662 // * if both have string values that are numeric strings, or
3663 // * if one is numeric and the other has the uninitialized value.
3664 //
3665 // Otherwise, operands shall be converted to strings as required and a
3666 // string comparison shall be made as follows:
3667 // * For the "!=" and "==" operators, the strings shall be compared to
3668 // check if they are identical (not to check if they collate equally).
3669 // * For the other operators, the strings shall be compared using the
3670 // locale-specific collation sequence.
3671 //
3672 // The value of the comparison expression shall be 1 if the relation is
3673 // true, or 0 if the relation is false.
3674 case tklt: // FALLTHROUGH intentional here
3675 case tkle: // FALLTHROUGH intentional here
3676 case tkne: // FALLTHROUGH intentional here
3677 case tkeq: // FALLTHROUGH intentional here
3678 case tkgt: // FALLTHROUGH intentional here
3679 case tkge:
3680 ; int cmp = 31416;
3681
3682 if ( (IS_NUM(&STKP[-1]) &&
3683 (STKP[0].flags & (ZF_NUM | ZF_NUMSTR) || !STKP[0].flags)) ||
3684 (IS_NUM(&STKP[0]) &&
3685 (STKP[-1].flags & (ZF_NUM | ZF_NUMSTR) || !STKP[-1].flags))) {
3686 switch (opcode) {
3687 case tklt: cmp = STKP[-1].num < STKP[0].num; break;
3688 case tkle: cmp = STKP[-1].num <= STKP[0].num; break;
3689 case tkne: cmp = STKP[-1].num != STKP[0].num; break;
3690 case tkeq: cmp = STKP[-1].num == STKP[0].num; break;
3691 case tkgt: cmp = STKP[-1].num > STKP[0].num; break;
3692 case tkge: cmp = STKP[-1].num >= STKP[0].num; break;
3693 }
3694 } else {
3695 cmp = strcmp(to_str(STKP-1)->vst->str, to_str(STKP)->vst->str);
3696 switch (opcode) {
3697 case tklt: cmp = cmp < 0; break;
3698 case tkle: cmp = cmp <= 0; break;
3699 case tkne: cmp = cmp != 0; break;
3700 case tkeq: cmp = cmp == 0; break;
3701 case tkgt: cmp = cmp > 0; break;
3702 case tkge: cmp = cmp >= 0; break;
3703 }
3704 }
3705 drop();
3706 drop();
3707 push_int_val(cmp);
3708 break;
3709
3710 case opmatchrec:
3711 op2 = *ip++;
3712 int mret = match(&FIELD[0], &LITERAL[op2]);
3713 push_int_val(!mret);
3714 break;
3715
3716 case tkmatchop:
3717 case tknotmatch:
3718 mret = match(STKP-1, STKP); // mret == 0 if match
3719 drop();
3720 drop();
3721 push_int_val(!mret == (opcode == tkmatchop));
3722 break;
3723
3724 case tkpowasgn: // FALLTHROUGH intentional here
3725 case tkmodasgn: // FALLTHROUGH intentional here
3726 case tkmulasgn: // FALLTHROUGH intentional here
3727 case tkdivasgn: // FALLTHROUGH intentional here
3728 case tkaddasgn: // FALLTHROUGH intentional here
3729 case tksubasgn:
3730 // Stack is: ... scalar_ref value_to_op_by
3731 // or ... subscript_val map_ref value_to_op_by
3732 // or ... fieldref value_to_op_by
3733 v = setup_lvalue(1, parmbase, &field_num);
3734 to_num(v);
3735 to_num(STKP);
3736 switch (opcode) {
3737 case tkpowasgn:
3738 // TODO
3739 v->num = pow(v->num, STKP->num);
3740 break;
3741 case tkmodasgn:
3742 // TODO
3743 v->num = fmod(v->num, STKP->num);
3744 break;
3745 case tkmulasgn:
3746 v->num *= STKP->num;
3747 break;
3748 case tkdivasgn:
3749 v->num /= STKP->num;
3750 break;
3751 case tkaddasgn:
3752 v->num += STKP->num;
3753 break;
3754 case tksubasgn:
3755 v->num -= STKP->num;
3756 break;
3757 }
3758
3759 drop_n(2);
3760 v->flags = ZF_NUM;
3761 push_val(v);
3762 if (field_num >= 0) fixup_fields(field_num);
3763 break;
3764
3765 case tkasgn:
3766 // Stack is: ... scalar_ref value_to_assign
3767 // or ... subscript_val map_ref value_to_assign
3768 // or ... fieldref value_to_assign
3769 v = setup_lvalue(1, parmbase, &field_num);
3770 force_maybemap_to_scalar(STKP);
3771 zvalue_copy(v, STKP);
3772 swap();
3773 drop();
3774 if (field_num >= 0) fixup_fields(field_num);
3775 break;
3776
3777 case tkincr: // FALLTHROUGH intentional here
3778 case tkdecr: // FALLTHROUGH intentional here
3779 case oppreincr: // FALLTHROUGH intentional here
3780 case oppredecr:
3781 // Stack is: ... scalar_ref
3782 // or ... subscript_val map_ref
3783 // or ... fieldnum fieldref
3784 v = setup_lvalue(0, parmbase, &field_num);
3785 to_num(v);
3786 switch (opcode) {
3787 case tkincr: case tkdecr:
3788 // Must be done in this order because push_val(v) may move v,
3789 // invalidating the pointer.
3790 v->num += (opcode == tkincr) ? 1 : -1;
3791 push_val(v);
3792 // Now reverse the incr/decr on the top TT.stack val.
3793 STKP->num -= (opcode == tkincr) ? 1 : -1;
3794 break;
3795 case oppreincr: case oppredecr:
3796 v->num += (opcode == oppreincr) ? 1 : -1;
3797 push_val(v);
3798 break;
3799 }
3800 swap();
3801 drop();
3802 if (field_num >= 0) fixup_fields(field_num);
3803 break;
3804
3805 case tknumber: // FALLTHROUGH intentional here
3806 case tkstring: // FALLTHROUGH intentional here
3807 case tkregex:
3808 push_val(&LITERAL[*ip++]);
3809 break;
3810
3811 case tkprint:
3812 case tkprintf:
3813 nargs = *ip++;
3814 int outmode = *ip++;
3815 struct zfile *outfp = TT.zstdout;
3816 switch (outmode) {
3817 case tkgt: outfp = setup_file(1, "w"); break; // file
3818 case tkappend: outfp = setup_file(1, "a"); break; // file
3819 case tkpipe: outfp = setup_file(0, "w"); break; // pipe
3820 default: nargs++; break;
3821 }
3822 nargs--;
3823 if (opcode == tkprintf) {
3824 varprint(fprintf, outfp->fp, nargs);
3825 drop_n(nargs);
3826 break;
3827 }
3828 if (!nargs) {
3829 fprintf(outfp->fp, "%s", to_str(&FIELD[0])->vst->str);
3830 } else {
3831 struct zvalue tempv = uninit_zvalue;
3832 zvalue_copy(&tempv, &STACK[OFS]);
3833 to_str(&tempv);
3834 for (int k = 0; k < nargs; k++) {
3835 if (k) fprintf(outfp->fp, "%s", tempv.vst->str);
3836 int sp = stkn(nargs - 1 - k);
3837 ////// FIXME refcnt -- prob. don't need to copy from TT.stack?
3838 v = &STACK[sp];
3839 to_str_fmt(v, OFMT);
3840 struct zstring *zs = v->vst;
3841 fprintf(outfp->fp, "%s", zs ? zs->str : "");
3842 }
3843 zvalue_release_zstring(&tempv);
3844 drop_n(nargs);
3845 }
3846 fputs(ENSURE_STR(&STACK[ORS])->vst->str, outfp->fp);
3847 break;
3848
3849 case opdrop:
3850 drop();
3851 break;
3852
3853 case opdrop_n:
3854 drop_n(*ip++);
3855 break;
3856
3857 // Stack frame layout relative to parmbase:
3858 #define RETURN_VALUE -4
3859 #define RETURN_ADDR -3
3860 #define PREV_PARMBASE -2
3861 #define ARG_CNT -1
3862 #define FUNCTION_NUM 0
3863 // Actual args follow, starting at parmbase + 1
3864 case tkfunction: // function definition
3865 op2 = *ip++; // func table num
3866 struct functab_slot *pfdef = &FUNC_DEF[op2];
3867 struct zlist *loctab = &pfdef->function_locals;
3868 int nparms = zlist_len(loctab)-1;
3869
3870 nargs = popnumval();
3871 int newparmbase = stkn(nargs);
3872 STACK[newparmbase + PREV_PARMBASE].num = parmbase;
3873 parmbase = newparmbase;
3874 for ( ;nargs > nparms; nargs--)
3875 drop();
3876 for ( ;nargs < nparms; nargs++) {
3877 // Push additional "args" that were not passed by the caller, to
3878 // match the formal parameters (parms) defined in the function
3879 // definition. In the local var table we may have the type as scalar
3880 // or map if it is used as such within the function. In that case we
3881 // init the pushed arg from the type of the locals table.
3882 // But if a var appears only as a bare arg in a function call it will
3883 // not be typed in the locals table. In that case we can only say it
3884 // "may be" a map, but we have to assume the possibility and attach a
3885 // map to the var. When/if the var is used as a map or scalar in the
3886 // called function it will be converted to a map or scalar as
3887 // required.
3888 // See force_maybemap_to_scalar().
3889 struct symtab_slot *q = &((struct symtab_slot *)loctab->base)[nargs+1];
3890 vv = (struct zvalue)ZVINIT(q->flags, 0, 0);
3891 if (vv.flags == 0) {
3892 zvalue_map_init(&vv);
3893 vv.flags = ZF_MAYBEMAP;
3894 } else if (IS_MAP(&vv)) {
3895 zvalue_map_init(&vv);
3896 } else {
3897 vv.flags = 0;
3898 }
3899 push_val(&vv);
3900 }
3901 break;
3902
3903 case tkreturn:
3904 nparms = *ip++;
3905 nargs = STACK[parmbase+ARG_CNT].num;
3906 force_maybemap_to_scalar(STKP); // Unneeded?
3907 zvalue_copy(&STACK[parmbase+RETURN_VALUE], STKP);
3908 drop();
3909 // Remove the local args (not supplied by caller) from TT.stack, check to
3910 // release any map data created.
3911 while (stkn(0) > parmbase + nargs) {
3912 if ((STKP)->flags & ZF_ANYMAP) {
3913 zmap_delete_map_incl_slotdata((STKP)->map);
3914 xfree((STKP)->map);
3915 }
3916 drop();
3917 }
3918 while (stkn(0) > parmbase + RETURN_VALUE)
3919 drop();
3920 ip = &ZCODE[(int)STACK[parmbase+RETURN_ADDR].num];
3921 parmbase = STACK[parmbase+PREV_PARMBASE].num;
3922 break;
3923
3924 case opprepcall: // function call prep
3925 if (STKP > stackp_needmore) add_stack(&stackp_needmore);
3926 push_int_val(0); // return value placeholder
3927 push_int_val(0); // return addr
3928 push_int_val(0); // parmbase
3929 push_int_val(0); // arg count
3930 push_int_val(*ip++); // function tbl ref
3931 break;
3932
3933 case tkfunc: // function call
3934 nargs = *ip++;
3935 newparmbase = stkn(nargs);
3936 STACK[newparmbase+RETURN_ADDR].num = ip - &ZCODE[0];
3937 STACK[newparmbase+ARG_CNT].num = nargs;
3938 push_int_val(nargs); // FIXME TODO pass this in a zregister?
3939 ip = &ZCODE[FUNC_DEF[(int)STACK[newparmbase+FUNCTION_NUM].num].zcode_addr];
3940 break;
3941
3942 case tkrbracket: // concat multiple map subscripts
3943 nsubscrs = *ip++;
3944 while (--nsubscrs) {
3945 swap();
3946 to_str(STKP);
3947 push_val(&STACK[SUBSEP]);
3948 to_str(STKP);
3949 STKP[-1].vst = zstring_extend(STKP[-1].vst, STKP->vst);
3950 drop();
3951 swap();
3952 to_str(STKP);
3953 STKP[-1].vst = zstring_extend(STKP[-1].vst, STKP->vst);
3954 drop();
3955 }
3956 break;
3957
3958 case opmapdelete:
3959 case tkdelete:
3960 k = STKP->num;
3961 if (k < 0) k = parmbase - k; // loc of var on TT.stack
3962 v = &STACK[k];
3963 force_maybemap_to_map(v);
3964 if (opcode == opmapdelete) {
3965 zmap_delete_map(v->map);
3966 } else {
3967 drop();
3968 zmap_delete(v->map, to_str(STKP)->vst);
3969 }
3970 drop();
3971 break;
3972
3973 case opmap:
3974 op2 = *ip++;
3975 k = op2 < 0 ? parmbase - op2 : op2;
3976 v = &STACK[k];
3977 force_maybemap_to_map(v);
3978 if (!IS_MAP(v)) FATAL("scalar in array context");
3979 v = get_map_val(v, STKP);
3980 drop(); // drop subscript
3981 push_val(v);
3982 break;
3983
3984 case tkin:
3985 if (!(STKP->flags & ZF_ANYMAP)) FATAL("scalar in array context");
3986 v = zmap_find(STKP->map, to_str(STKP-1)->vst);
3987 drop();
3988 drop();
3989 push_int_val(v ? 1 : 0);
3990 break;
3991
3992 case opmapiternext:
3993 op2 = *ip++;
3994 v = STKP-1;
3995 force_maybemap_to_map(v);
3996 if (!IS_MAP(v)) FATAL("scalar in array context");
3997 struct zmap *m = v->map; // Need for MAPSLOT macro
3998 int zlen = zlist_len(&m->slot);
3999 int kk = STKP->num + 1;
4000 while (kk < zlen && !(MAPSLOT[kk].key)) // skip deleted slots
4001 kk++;
4002 STKP->num = kk; // save index for next iteration
4003 if (kk < zlen) {
4004 struct zvalue *var = setup_lvalue(2, parmbase, &field_num);
4005 var->flags = ZF_STR;
4006 zstring_release(&var->vst);
4007 var->vst = MAPSLOT[kk].key;
4008 zstring_incr_refcnt(var->vst);
4009 ip += op2;
4010 }
4011 break;
4012
4013 case tkvar:
4014 op2 = *ip++;
4015 k = op2 < 0 ? parmbase - op2 : op2;
4016 v = &STACK[k];
4017 push_val(v);
4018 break;
4019
4020 case tkfield:
4021 // tkfield op has "dummy" 2nd word so that convert_push_to_reference(void)
4022 // can find either tkfield or tkvar at same place (ZCODE[TT.zcode_last-1]).
4023 ip++; // skip dummy "operand" instruction field
4024 push_field((int)(to_num(STKP)));
4025
4026 swap();
4027 drop();
4028 break;
4029
4030 case oppush:
4031 push_int_val(*ip++);
4032 break;
4033
4034 case tkand:
4035 op2 = *ip++;
4036 if (get_set_logical()) drop();
4037 else ip += op2;
4038 break;
4039
4040 case tkor:
4041 op2 = *ip++;
4042 if (!get_set_logical()) drop();
4043 else ip += op2;
4044 break;
4045
4046 case tkwhile:
4047 (STKP)->num = ! get_set_logical();
4048 ATTR_FALLTHROUGH_INTENDED;
4049 // FALLTHROUGH to tkternif
4050 case tkif:
4051 // FALLTHROUGH to tkternif
4052 case tkternif:
4053 op2 = *ip++;
4054 int t = get_set_logical(); // FIXME only need to get, not set
4055 drop();
4056 if (!t) ip += op2;
4057 break;
4058
4059 case tkelse: // FALLTHROUGH intentional here
4060 case tkternelse: // FALLTHROUGH intentional here
4061 case tkbreak: // FALLTHROUGH intentional here
4062 case tkcontinue: // FALLTHROUGH intentional here
4063 case opjump:
4064 op2 = *ip++;
4065 ip += op2;
4066 break;
4067
4068 case opvarref:
4069 op2 = *ip++;
4070 vv = (struct zvalue)ZVINIT(ZF_REF, op2, 0);
4071 push_val(&vv);
4072 break;
4073
4074 case opmapref:
4075 op2 = *ip++;
4076 vv = (struct zvalue)ZVINIT(ZF_MAPREF, op2, 0);
4077 push_val(&vv);
4078 break;
4079
4080 case opfldref:
4081 to_num(STKP);
4082 (STKP)->flags |= ZF_FIELDREF;
4083 ip++; // skip dummy "operand" instruction field
4084 break;
4085
4086 case opprintrec:
4087 puts(to_str(&FIELD[0])->vst->str);
4088 break;
4089
4090 case oprange1:
4091 range_num = *ip++;
4092 op2 = *ip++;
4093 if (TT.range_sw[range_num]) ip += op2;
4094 break;
4095
4096 case oprange2:
4097 range_num = *ip++;
4098 op2 = *ip++;
4099 t = get_set_logical(); // FIXME only need to get, not set
4100 drop();
4101 if (t) TT.range_sw[range_num] = 1;
4102 else ip += op2;
4103 break;
4104
4105 case oprange3:
4106 range_num = *ip++;
4107 t = get_set_logical(); // FIXME only need to get, not set
4108 drop();
4109 if (t) TT.range_sw[range_num] = 0;
4110 break;
4111
4112 case tkexit:
4113 r = popnumval();
4114 if (r != NO_EXIT_STATUS) *status = (int)r & 255;
4115 // TODO FIXME do we need NO_EXIT_STATUS at all? Just use 0?
4116 ATTR_FALLTHROUGH_INTENDED;
4117 case tknext:
4118 case tknextfile:
4119 return opcode;
4120
4121 case tkgetline:
4122 nargs = *ip++;
4123 int source = *ip++;
4124 // TT.stack is:
4125 // if tkgetline 0 tkeof: (nothing stacked; plain getline)
4126 // if tkgetline 1 tkeof: (lvalue)
4127 // if tkgetline 1 tklt: (filename_string)
4128 // if tkgetline 2 tklt: (lvalue) (filename_string)
4129 // if tkgetline 1 tkpipe: (pipe_command_string)
4130 // if tkgetline 2 tkpipe: (pipe_command_string) (lvalue)
4131 // effect is to set:
4132 // if tkgetline 0 tkeof: $0 NF NR FNR
4133 // if tkgetline 1 tkeof: var NR FNR
4134 // if tkgetline 1 tklt: $0 NF
4135 // if tkgetline 2 tklt: var
4136 // if tkgetline 1 tkpipe: $0 NF
4137 // if tkgetline 2 tkpipe: var
4138 // Ensure pipe cmd on top
4139 if (nargs == 2 && source == tkpipe) swap();
4140 struct zfile *zfp = 0;
4141 if (source == tklt || source == tkpipe) {
4142 zfp = setup_file(source == tklt, "r");
4143 nargs--;
4144 }
4145 // now cases are:
4146 // nargs source TT.stack
4147 // 0 tkeof: (nothing; plain getline) from current data file
4148 // 1 tkeof: (lvalue) from current data file
4149 // 0 tklt: (nothing) from named file in 'stream'
4150 // 1 tklt: (lvalue) from named file in 'stream'
4151 // 0 tkpipe: (nothing) from piped command in 'stream'
4152 // 1 tkpipe: (lvalue) from piped command in 'stream'
4153 v = nargs ? setup_lvalue(0, parmbase, &field_num) : 0;
4154 if (v) drop();
4155 // source is tkeof (no pipe/file), tklt (file), or tkpipe (pipe)
4156 // stream is name of file or pipe
4157 // v is NULL or an lvalue ref
4158 if (zfp != badfile) push_int_val(awk_getline(source, zfp, v));
4159 else push_int_val(-1);
4160
4161 // fake return value for now
4162 break;
4163
4164 ////// builtin functions ///////
4165
4166 case tksplit:
4167 nargs = *ip++;
4168 if (nargs == 2) push_val(&STACK[FS]);
4169 struct zstring *s = to_str(STKP-2)->vst;
4170 force_maybemap_to_map(STKP-1);
4171 struct zvalue *a = STKP-1;
4172 struct zvalue *fs = STKP;
4173 zmap_delete_map(a->map);
4174 k = split(s, a, fs);
4175 drop_n(3);
4176 push_int_val(k);
4177 break;
4178
4179 case tkmatch:
4180 nargs = *ip++;
4181 if (!IS_RX(STKP)) to_str(STKP);
4182 regex_t rx_pat, *rxp = &rx_pat;
4183 rx_zvalue_compile(&rxp, STKP);
4184 regoff_t rso = 0, reo = 0; // shut up warning (may be uninit)
4185 k = rx_find(rxp, to_str(STKP-1)->vst->str, &rso, &reo, 0);
4186 rx_zvalue_free(rxp, STKP);
4187 // Force these to num before setting.
4188 to_num(&STACK[RSTART]);
4189 to_num(&STACK[RLENGTH]);
4190 if (k) STACK[RSTART].num = 0, STACK[RLENGTH].num = -1;
4191 else {
4192 reo = utf8cnt(STKP[-1].vst->str, reo);
4193 rso = utf8cnt(STKP[-1].vst->str, rso);
4194 STACK[RSTART].num = rso + 1, STACK[RLENGTH].num = reo - rso;
4195 }
4196 drop();
4197 drop();
4198 push_int_val(k ? 0 : rso + 1);
4199 break;
4200
4201 case tksub:
4202 case tkgsub:
4203 gsub(opcode, *ip++, parmbase); // tksub/tkgsub, args
4204 break;
4205
4206 case tksubstr:
4207 nargs = *ip++;
4208 struct zstring *zz = to_str(STKP - nargs + 1)->vst;
4209 int nchars = utf8cnt(zz->str, zz->size); // number of utf8 codepoints
4210 // Offset of start of string (in chars not bytes); convert 1-based to 0-based
4211 ssize_t mm = CLAMP(trunc(to_num(STKP - nargs + 2)) - 1, 0, nchars);
4212 ssize_t nn = nchars - mm; // max possible substring length (chars)
4213 if (nargs == 3) nn = CLAMP(trunc(to_num(STKP)), 0, nn);
4214 mm = bytesinutf8(zz->str, zz->size, mm);
4215 nn = bytesinutf8(zz->str + mm, zz->size - mm, nn);
4216 struct zstring *zzz = new_zstring(zz->str + mm, nn);
4217 zstring_release(&(STKP - nargs + 1)->vst);
4218 (STKP - nargs + 1)->vst = zzz;
4219 drop_n(nargs - 1);
4220 break;
4221
4222 case tkindex:
4223 nargs = *ip++;
4224 char *s1 = to_str(STKP-1)->vst->str;
4225 char *s3 = strstr(s1, to_str(STKP)->vst->str);
4226 ptrdiff_t offs = s3 ? utf8cnt(s1, s3 - s1) + 1 : 0;
4227 drop();
4228 drop();
4229 push_int_val(offs);
4230 break;
4231
4232 case tkband:
4233 case tkbor:
4234 case tkbxor:
4235 case tklshift:
4236 case tkrshift:
4237 ; size_t acc = to_num(STKP);
4238 nargs = *ip++;
4239 for (int i = 1; i < nargs; i++) switch (opcode) {
4240 case tkband: acc &= (size_t)to_num(STKP-i); break;
4241 case tkbor: acc |= (size_t)to_num(STKP-i); break;
4242 case tkbxor: acc ^= (size_t)to_num(STKP-i); break;
4243 case tklshift: acc = (size_t)to_num(STKP-i) << acc; break;
4244 case tkrshift: acc = (size_t)to_num(STKP-i) >> acc; break;
4245 }
4246 drop_n(nargs);
4247 push_int_val(acc);
4248 break;
4249
4250 case tktolower:
4251 case tktoupper:
4252 nargs = *ip++;
4253 struct zstring *z = to_str(STKP)->vst;
4254 unsigned zzlen = z->size + 4; // Allow for expansion
4255 zz = zstring_update(0, zzlen, "", 0);
4256 char *p = z->str, *e = z->str + z->size, *q = zz->str;
4257 // Similar logic to toybox strlower(), but fixed.
4258 while (p < e) {
4259 unsigned wch;
4260 int len = utf8towc(&wch, p, e-p);
4261 if (len < 1) { // nul byte, error, or truncated code
4262 *q++ = *p++;
4263 continue;
4264 }
4265 p += len;
4266 wch = (opcode == tktolower ? towlower : towupper)(wch);
4267 len = wctoutf8(q, wch);
4268 q += len;
4269 // Need realloc here if overflow possible
4270 if ((len = q - zz->str) + 4 < (int)zzlen) continue;
4271 zz = zstring_update(zz, zzlen = len + 16, "", 0);
4272 q = zz->str + len;
4273 }
4274 *q = 0;
4275 zz->size = q - zz->str;
4276 zstring_release(&z);
4277 STKP->vst = zz;
4278 break;
4279
4280 case tklength:
4281 nargs = *ip++;
4282 v = nargs ? STKP : &FIELD[0];
4283 force_maybemap_to_map(v);
4284 if (IS_MAP(v)) k = v->map->count - v->map->deleted;
4285 else {
4286 to_str(v);
4287 k = utf8cnt(v->vst->str, v->vst->size);
4288 }
4289 if (nargs) drop();
4290 push_int_val(k);
4291 break;
4292
4293 case tksystem:
4294 nargs = *ip++;
4295 fflush(stdout);
4296 fflush(stderr);
4297 r = system(to_str(STKP)->vst->str);
4298 #ifdef WEXITSTATUS
4299 // WEXITSTATUS is in sys/wait.h, but I'm not including that.
4300 // It seems to also be in stdlib.h in gcc and musl-gcc.
4301 // No idea how portable this is!
4302 if (WIFEXITED(r)) r = WEXITSTATUS(r);
4303 #endif
4304 drop();
4305 push_int_val(r);
4306 break;
4307
4308 case tkfflush:
4309 nargs = *ip++;
4310 r = fflush_file(nargs);
4311 if (nargs) drop();
4312 push_int_val(r);
4313 break;
4314
4315 case tkclose:
4316 nargs = *ip++;
4317 r = close_file(to_str(STKP)->vst->str);
4318 drop();
4319 push_int_val(r);
4320 break;
4321
4322 case tksprintf:
4323 nargs = *ip++;
4324 zstring_release(&TT.rgl.zspr);
4325 TT.rgl.zspr = new_zstring("", 0);
4326 varprint(fsprintf, 0, nargs);
4327 drop_n(nargs);
4328 vv = (struct zvalue)ZVINIT(ZF_STR, 0, TT.rgl.zspr);
4329 push_val(&vv);
4330 break;
4331
4332 // Math builtins -- move here (per Oliver Webb suggestion)
4333 case tkatan2:
4334 nargs = *ip++;
4335 d = atan2(to_num(STKP-1), to_num(STKP));
4336 drop();
4337 STKP->num = d;
4338 break;
4339 case tkrand:
4340 nargs = *ip++;
4341 push_int_val(0);
4342 // Get all 53 mantissa bits in play:
4343 // (upper 26 bits * 2^27 + upper 27 bits) / 2^53
4344 STKP->num =
4345 ((random() >> 5) * 134217728.0 + (random() >> 4)) / 9007199254740992.0;
4346 break;
4347 case tksrand:
4348 nargs = *ip++;
4349 if (nargs == 1) {
4350 STKP->num = seedrand(to_num(STKP));
4351 } else push_int_val(seedrand(time(0)));
4352 break;
4353 case tkcos: case tksin: case tkexp: case tklog: case tksqrt: case tkint:
4354 nargs = *ip++;
4355 STKP->num = mathfunc[opcode-tkcos](to_num(STKP));
4356 break;
4357
4358 default:
4359 // This should never happen:
4360 error_exit("!!! Unimplemented opcode %d", opcode);
4361 }
4362 }
4363 return opquit;
4364 }
4365
4366 // interp() wraps the main interpreter loop interpx(). The main purpose
4367 // is to allow the TT.stack to be readjusted after an 'exit' from a function.
4368 // Also catches errors, as the normal operation should leave the TT.stack
4369 // depth unchanged after each run through the rules.
interp(int start,int * status)4370 static int interp(int start, int *status)
4371 {
4372 int stkptrbefore = stkn(0);
4373 int r = interpx(start, status);
4374 // If exit from function, TT.stack will be loaded with args etc. Clean it.
4375 if (r == tkexit) {
4376 // TODO FIXME is this safe? Just remove extra entries?
4377 STKP = &STACK[stkptrbefore];
4378 }
4379 if (stkn(0) - stkptrbefore)
4380 error_exit("!!AWK BUG stack pointer offset: %d", stkn(0) - stkptrbefore);
4381 return r;
4382 }
4383
insert_argv_map(struct zvalue * map,int key,char * value)4384 static void insert_argv_map(struct zvalue *map, int key, char *value)
4385 {
4386 struct zvalue zkey = ZVINIT(ZF_STR, 0, num_to_zstring(key, ENSURE_STR(&STACK[CONVFMT])->vst->str));
4387 struct zvalue *v = get_map_val(map, &zkey);
4388 zvalue_release_zstring(&zkey);
4389 zvalue_release_zstring(v);
4390 *v = new_str_val(value);
4391 check_numeric_string(v);
4392 }
4393
init_globals(int optind,int argc,char ** argv,char * sepstring,struct arg_list * assign_args)4394 static void init_globals(int optind, int argc, char **argv, char *sepstring,
4395 struct arg_list *assign_args)
4396 {
4397 // Global variables reside at the bottom of the TT.stack. Start with the awk
4398 // "special variables": ARGC, ARGV, CONVFMT, ENVIRON, FILENAME, FNR, FS, NF,
4399 // NR, OFMT, OFS, ORS, RLENGTH, RS, RSTART, SUBSEP
4400
4401 STACK[CONVFMT] = new_str_val("%.6g");
4402 // Init ENVIRON map.
4403 struct zvalue m = ZVINIT(ZF_MAP, 0, 0);
4404 zvalue_map_init(&m);
4405 STACK[ENVIRON] = m;
4406 for (char **pkey = environ; *pkey; pkey++) {
4407 char *pval = strchr(*pkey, '=');
4408 if (!pval) continue;
4409 struct zvalue zkey = ZVINIT(ZF_STR, 0, new_zstring(*pkey, pval - *pkey));
4410 struct zvalue *v = get_map_val(&m, &zkey);
4411 zstring_release(&zkey.vst);
4412 if (v->vst) FFATAL("env var dup? (%s)", pkey);
4413 *v = new_str_val(++pval); // FIXME refcnt
4414 check_numeric_string(v);
4415 }
4416
4417 // Init ARGV map.
4418 m = (struct zvalue)ZVINIT(ZF_MAP, 0, 0);
4419 zvalue_map_init(&m);
4420 STACK[ARGV] = m;
4421 insert_argv_map(&m, 0, TT.progname);
4422 int nargc = 1;
4423 for (int k = optind; k < argc; k++) {
4424 insert_argv_map(&m, nargc, argv[k]);
4425 nargc++;
4426 }
4427
4428 // Init rest of the awk special variables.
4429 STACK[ARGC] = (struct zvalue)ZVINIT(ZF_NUM, nargc, 0);
4430 STACK[FILENAME] = new_str_val("");
4431 STACK[FNR] = (struct zvalue)ZVINIT(ZF_NUM, 0, 0);
4432 STACK[FS] = new_str_val(sepstring);
4433 STACK[NF] = (struct zvalue)ZVINIT(ZF_NUM, 0, 0);
4434 STACK[NR] = (struct zvalue)ZVINIT(ZF_NUM, 0, 0);
4435 STACK[OFMT] = new_str_val("%.6g");
4436 STACK[OFS] = new_str_val(" ");
4437 STACK[ORS] = new_str_val("\n");
4438 STACK[RLENGTH] = (struct zvalue)ZVINIT(ZF_NUM, 0, 0);
4439 STACK[RS] = new_str_val("\n");
4440 STACK[RSTART] = (struct zvalue)ZVINIT(ZF_NUM, 0, 0);
4441 STACK[SUBSEP] = new_str_val("\034");
4442
4443 // Init program globals.
4444 //
4445 // Push global variables on the TT.stack at offsets matching their index in the
4446 // global var table. In the global var table we may have the type as scalar
4447 // or map if it is used as such in the program. In that case we init the
4448 // pushed arg from the type of the globals table.
4449 // But if a global var appears only as a bare arg in a function call it will
4450 // not be typed in the globals table. In that case we can only say it "may be"
4451 // a map, but we have to assume the possibility and attach a map to the
4452 // var. When/if the var is used as a map or scalar in the called function it
4453 // will be converted to a map or scalar as required.
4454 // See force_maybemap_to_scalar(), and the similar comment in
4455 // 'case tkfunction:' above.
4456 //
4457 int gstx, len = zlist_len(&TT.globals_table);
4458 for (gstx = TT.spec_var_limit; gstx < len; gstx++) {
4459 struct symtab_slot gs = GLOBAL[gstx];
4460 struct zvalue v = ZVINIT(gs.flags, 0, 0);
4461 if (v.flags == 0) {
4462 zvalue_map_init(&v);
4463 v.flags = ZF_MAYBEMAP;
4464 } else if (IS_MAP(&v)) {
4465 zvalue_map_init(&v);
4466 } else {
4467 // Set SCALAR flag 0 to create "uninitialized" scalar.
4468 v.flags = 0;
4469 }
4470 push_val(&v);
4471 }
4472
4473 // Init -v assignment options.
4474 for (struct arg_list *p = assign_args; p; p = p->next) {
4475 char *asgn = p->arg;
4476 char *val = strchr(asgn, '=');
4477 if (!val) error_exit("bad -v assignment format");
4478 *val++ = 0;
4479 assign_global(asgn, val);
4480 }
4481
4482 TT.rgl.cur_arg = new_str_val("<cmdline>");
4483 uninit_string_zvalue = new_str_val("");
4484 zvalue_copy(&FIELD[0], &uninit_string_zvalue);
4485 }
4486
run_files(int * status)4487 static void run_files(int *status)
4488 {
4489 int r = 0;
4490 while (r != tkexit && *status < 0 && getrec_f0() >= 0)
4491 if ((r = interp(TT.cgl.first_recrule, status)) == tknextfile) next_fp();
4492 }
4493
free_literal_regex(void)4494 static void free_literal_regex(void)
4495 {
4496 int len = zlist_len(&TT.literals);
4497 for (int k = 1; k < len; k++)
4498 if (IS_RX(&LITERAL[k])) regfree(LITERAL[k].rx);
4499 }
4500
run(int optind,int argc,char ** argv,char * sepstring,struct arg_list * assign_args)4501 static void run(int optind, int argc, char **argv, char *sepstring,
4502 struct arg_list *assign_args)
4503 {
4504 char *printf_fmt_rx = "%[-+ #0']*([*]|[0-9]*)([.]([*]|[0-9]*))?l?[aAdiouxXfFeEgGcs%]";
4505 init_globals(optind, argc, argv, sepstring, assign_args);
4506 TT.cfile = xzalloc(sizeof(struct zfile));
4507 xregcomp(&TT.rx_default, "[ \t\n]+", REG_EXTENDED);
4508 xregcomp(&TT.rx_last, "[ \t\n]+", REG_EXTENDED);
4509 xregcomp(&TT.rx_printf_fmt, printf_fmt_rx, REG_EXTENDED);
4510 new_file("-", stdin, 'r', 1, 1);
4511 new_file("/dev/stdin", stdin, 'r', 1, 1);
4512 new_file("/dev/stdout", stdout, 'w', 1, 1);
4513 TT.zstdout = TT.zfiles;
4514 new_file("/dev/stderr", stderr, 'w', 1, 1);
4515 seedrand(1);
4516 int status = -1, r = 0;
4517 if (TT.cgl.first_begin) r = interp(TT.cgl.first_begin, &status);
4518 if (r != tkexit)
4519 if (TT.cgl.first_recrule) run_files(&status);
4520 if (TT.cgl.first_end) r = interp(TT.cgl.first_end, &status);
4521 regfree(&TT.rx_printf_fmt);
4522 regfree(&TT.rx_default);
4523 regfree(&TT.rx_last);
4524 free_literal_regex();
4525 close_file(0); // close all files
4526 if (status >= 0) awk_exit(status);
4527 }
4528
4529 ////////////////////
4530 //// main
4531 ////////////////////
4532
progfiles_init(char * progstring,struct arg_list * prog_args)4533 static void progfiles_init(char *progstring, struct arg_list *prog_args)
4534 {
4535 TT.scs->p = progstring ? progstring : " " + 2;
4536 TT.scs->progstring = progstring;
4537 TT.scs->prog_args = prog_args;
4538 TT.scs->filename = "(cmdline)";
4539 TT.scs->maxtok = 256;
4540 TT.scs->tokstr = xzalloc(TT.scs->maxtok);
4541 }
4542
awk(char * sepstring,char * progstring,struct arg_list * prog_args,struct arg_list * assign_args,int optind,int argc,char ** argv,int opt_run_prog)4543 static int awk(char *sepstring, char *progstring, struct arg_list *prog_args,
4544 struct arg_list *assign_args, int optind, int argc, char **argv,
4545 int opt_run_prog)
4546 {
4547 struct scanner_state ss = {0};
4548 TT.scs = &ss;
4549
4550 setlocale(LC_NUMERIC, "");
4551 progfiles_init(progstring, prog_args);
4552 compile();
4553
4554 if (TT.cgl.compile_error_count)
4555 error_exit("%d syntax error(s)", TT.cgl.compile_error_count);
4556 else {
4557 if (opt_run_prog)
4558 run(optind, argc, argv, sepstring, assign_args);
4559 }
4560
4561 return TT.cgl.compile_error_count;
4562 }
4563
awk_main(void)4564 void awk_main(void)
4565 {
4566 char *sepstring = TT.F ? escape_str(TT.F, 0) : " ";
4567 int optind = 0;
4568 char *progstring = NULL;
4569
4570 TT.pbuf = toybuf;
4571 toys.exitval = 2;
4572 if (!TT.f) {
4573 if (*toys.optargs) progstring = toys.optargs[optind++];
4574 else error_exit("No program string\n");
4575 }
4576 TT.progname = toys.which->name;
4577 toys.exitval = awk(sepstring, progstring, TT.f, TT.v,
4578 optind, toys.optc, toys.optargs, !FLAG(c));
4579 }
4580