xref: /aosp_15_r20/external/toybox/toys/pending/awk.c (revision cf5a6c84e2b8763fc1a7db14496fd4742913b199)
1 /* awk.c - An awk implementation.
2  * vi: tabstop=2 softtabstop=2 shiftwidth=2
3  *
4  * Copyright 2024 Ray Gardner <[email protected]>
5  *
6  * See https://pubs.opengroup.org/onlinepubs/9799919799/utilities/awk.html
7  *
8  * Deviations from posix: Don't handle LANG, LC_ALL, etc.
9  *   Accept regex for RS
10  *   Bitwise functions (from gawk): and, or, xor, lshift, rshift
11  *   Attempt to follow tradition (nawk, gawk) where it departs from posix
12  *
13  * TODO: Lazy field splitting; improve performance; more testing/debugging
14 
15 USE_AWK(NEWTOY(awk, "F:v*f*bc", TOYFLAG_USR|TOYFLAG_BIN|TOYFLAG_LINEBUF))
16 
17 config AWK
18   bool "awk"
19   default n
20   help
21     usage:  awk [-F sepstring] [-v assignment]... program [argument...]
22       or:
23             awk [-F sepstring] -f progfile [-f progfile]... [-v assignment]...
24                   [argument...]
25       also:
26       -b : count bytes, not characters (experimental)
27       -c : compile only, do not run
28 */
29 
30 #define FOR_awk
31 #include "toys.h"
32 
// Per-command global state, reached through TT.<member> in the code below.
GLOBALS(
  // Option arguments; presumably these map onto the -f, -v and -F options
  // named in the NEWTOY() option string.
  struct arg_list *f;
  struct arg_list *v;
  char *F;

  // Lexical scanner state.
  struct scanner_state {
      char *p;                      // Next program char; see get_char()
      char *progstring;             // Nonzero when get_char() reads a string
      struct arg_list *prog_args;   // Progfiles not yet opened
      char *filename;               // Current progfile name
      char *line;                   // getline() buffer for progfile text
      size_t line_size;
      ssize_t line_len;
      int line_num;                 // Bumped for each progfile line read
      int ch;                       // Current char, set by gch()
      FILE *fp;                     // Current progfile, 0 if none open
      // state includes latest token seen
      int tok;
      int tokbuiltin;
      int toktype;
      char *tokstr;                 // Token text, kept NUL terminated
      size_t maxtok;                // Allocated size of tokstr
      size_t toklen;                // Bytes currently in tokstr
      double numval;                // Set by get_number()
      int error;  // Set if lexical error.
  } *scs;
  char *tokstr;
  int prevtok;

  struct compiler_globals {
    int in_print_stmt;
    int paren_level;
    int in_function_body;
    int funcnum;
    int nparms;
    int compile_error_count;  // Bumped by zzerr() for non-warning messages
    int first_begin;
    int last_begin;
    int first_end;
    int last_end;
    int first_recrule;
    int last_recrule;
    int break_dest;
    int continue_dest;
    int stack_offset_to_fix;  // fixup stack if return in for(e in a)
    int range_pattern_num;
    int rule_type;  // tkbegin, tkend, or 0
  } cgl;

  // zvalue: the main awk value type
  // Can be number or string or both, or else map (array) or regex
  // The flags member (ZF_* bits) tells which union member is live.
  struct zvalue {
    unsigned flags;
    double num;
    union { // anonymous union not in C99; not going to fix it now.
      struct zstring *vst;
      struct zmap *map;
      regex_t *rx;
    };
  } nozvalue;   // to shut up compiler warning TODO FIXME

  struct runtime_globals {
    struct zvalue cur_arg;
    FILE *fp;           // current data file
    int narg;           // cmdline arg index
    int nfiles;         // num of cmdline data file args processed
    int eof;            // all cmdline files (incl. stdin) read
    char *recptr;
    struct zstring *zspr;      // Global to receive sprintf() string value
  } rgl;

  // Expanding sequential list
  // base..avail is in use, avail..limit is spare; size is bytes per element.
  struct zlist {
    char *base, *limit, *avail;
    size_t size;
  } globals_table,  // global symbol table
    locals_table,     // local symbol table
    func_def_table;  // function symbol table
  // runtime lists
  struct zlist literals, fields, zcode, stack;

  char *progname;   // Prefix for zzerr() messages

  int spec_var_limit;
  int zcode_last;
  struct zvalue *stackp;  // top of stack ptr

  char *pbuf;   // Used for number formatting in num_to_zstring()
#define RS_MAX  64
  char rs_last[RS_MAX];
  regex_t rx_rs_default, rx_rs_last;
  regex_t rx_default, rx_last, rx_printf_fmt;
#define FS_MAX  64
  char fs_last[FS_MAX];
  char one_char_fs[4];
  int nf_internal;  // should match NF
  char range_sw[64];   // FIXME TODO quick and dirty set of range switches
  int file_cnt, std_file_cnt;

  struct zfile {
    struct zfile *next;
    char *fn;
    FILE *fp;
    char mode;  // w, a, or r
    char file_or_pipe;  // 1 if file, 0 if pipe
    char is_tty, is_std_file;
    char eof;
    int ro, lim, buflen;
    char *buf;
  } *zfiles, *cfile, *zstdout;
)
144 
// Store status in toys.exitval and leave via xexit().
static void awk_exit(int status)
{
  toys.exitval = status;
  xexit();
}
150 #ifdef __GNUC__
151 #define ATTR_FALLTHROUGH_INTENDED __attribute__ ((fallthrough))
152 #else
153 #define ATTR_FALLTHROUGH_INTENDED
154 #endif
155 
156 ////////////////////
157 ////   declarations
158 ////////////////////
159 
160 #define PBUFSIZE  512 // For num_to_zstring()
161 
// Broad token classes stored in TT.scs->toktype.
enum toktypes {
    // EOF (use -1 from stdio.h)
    ERROR = 2, NEWLINE, VAR, NUMBER, STRING, REGEX, USERFUNC, BUILTIN, TOKEN,
    KEYWORD
    };
167 
// Must align with lbp_table[]
// The operator, keyword and builtin groups below must also stay in the same
// order as the ops, keywords and builtins lookup strings: find_token() and
// find_keyword_or_builtin() turn a position in those strings into one of
// these values. The commented-out strings are reminders of that order.
enum tokens {
    tkunusedtoken, tkeof, tkerr, tknl,
    tkvar, tknumber, tkstring, tkregex, tkfunc, tkbuiltin,

// static char *ops = " ;  ,  [  ]  (  )  {  }  $  ++ -- ^  !  *  /  %  +  -     "
//        "<  <= != == >  >= ~  !~ && || ?  :  ^= %= *= /= += -= =  >> |  ";
    tksemi, tkcomma, tklbracket, tkrbracket, tklparen, tkrparen, tklbrace,
    tkrbrace, tkfield, tkincr, tkdecr, tkpow, tknot, tkmul, tkdiv, tkmod,
    tkplus, tkminus,
    tkcat, // !!! Fake operator for concatenation (just adjacent string exprs)
    tklt, tkle, tkne, tkeq, tkgt, tkge, tkmatchop, tknotmatch, tkand, tkor,
    tkternif, tkternelse, tkpowasgn, tkmodasgn, tkmulasgn, tkdivasgn,
    tkaddasgn, tksubasgn, tkasgn, tkappend, tkpipe,

// static char *keywords = " in        BEGIN     END       if        else      "
//    "while     for       do        break     continue  exit      function  "
//    "return    next      nextfile  delete    print     printf    getline   ";
    tkin, tkbegin, tkend, tkif, tkelse,
    tkwhile, tkfor, tkdo, tkbreak, tkcontinue, tkexit, tkfunction,
    tkreturn, tknext, tknextfile, tkdelete, tkprint, tkprintf, tkgetline,

// static char *builtins = " atan2     cos       sin       exp       "
//    "log       sqrt      int       rand      srand     length    "
//    "tolower   toupper   system    fflush    "
//    "and       or        xor       lshift    rshift    ";
    tkatan2, tkcos, tksin, tkexp, tklog, tksqrt, tkint, tkrand, tksrand,
    tklength, tktolower, tktoupper, tksystem, tkfflush,
    tkband, tkbor, tkbxor, tklshift, tkrshift,

// static char *specialfuncs = " close     index     match     split     "
//    "sub       gsub      sprintf   substr    ";
    tkclose, tkindex, tkmatch, tksplit,
    tksub, tkgsub, tksprintf, tksubstr, tklasttk
    };
203 
// Opcodes that are not also tokens. Numbering starts at tklasttk, so these
// values never collide with enum tokens.
// NOTE(review): presumably token values double as opcodes in the compiled
// code -- confirm against the code generator.
enum opcodes {
    opunusedop = tklasttk,
    opvarref, opmapref, opfldref, oppush, opdrop, opdrop_n, opnotnot,
    oppreincr, oppredecr, oppostincr, oppostdecr, opnegate, opjump, opjumptrue,
    opjumpfalse, opprepcall, opmap, opmapiternext, opmapdelete, opmatchrec,
    opquit, opprintrec, oprange1, oprange2, oprange3, oplastop
};
211 
// Special variables (POSIX). Must align with char *spec_vars[]
// Numbering starts at 1 (ARGC).
enum spec_var_names { ARGC=1, ARGV, CONVFMT, ENVIRON, FILENAME, FNR, FS, NF,
    NR, OFMT, OFS, ORS, RLENGTH, RS, RSTART, SUBSEP };
215 
// Element type of the global and local symbol tables (see the GLOBAL and
// LOCAL macros).
struct symtab_slot {    // global symbol table entry
  unsigned flags;       // ZF_* bits
  char *name;
};
220 
// zstring: flexible string type.
// Capacity must be > size because we insert a NUL byte.
// refcnt counts references beyond the first: zstring_modify() creates a
// string with refcnt 0 and zstring_release() frees it when refcnt is 0.
struct zstring {
  int refcnt;
  unsigned size;        // Bytes in str, not counting the trailing NUL
  unsigned capacity;    // Bytes allocated for str
  char str[];   // C99 flexible array member
};
229 
230 // Flag bits for zvalue and symbol tables
231 #define ZF_MAYBEMAP (1u << 1)
232 #define ZF_MAP      (1u << 2)
233 #define ZF_SCALAR   (1u << 3)
234 #define ZF_NUM      (1u << 4)
235 #define ZF_RX       (1u << 5)
236 #define ZF_STR      (1u << 6)
237 #define ZF_NUMSTR   (1u << 7)   // "numeric string" per posix
238 #define ZF_REF      (1u << 9)   // for lvalues
239 #define ZF_MAPREF   (1u << 10)  // for lvalues
240 #define ZF_FIELDREF (1u << 11)  // for lvalues
241 #define ZF_EMPTY_RX (1u << 12)
242 #define ZF_ANYMAP   (ZF_MAP | ZF_MAYBEMAP)
243 
244 // Macro to help facilitate possible future change in zvalue layout.
245 #define ZVINIT(flags, num, ptr) {(flags), (double)(num), {(ptr)}}
246 
247 #define IS_STR(zvalp) ((zvalp)->flags & ZF_STR)
248 #define IS_RX(zvalp) ((zvalp)->flags & ZF_RX)
249 #define IS_NUM(zvalp) ((zvalp)->flags & ZF_NUM)
250 #define IS_MAP(zvalp) ((zvalp)->flags & ZF_MAP)
251 #define IS_EMPTY_RX(zvalp) ((zvalp)->flags & ZF_EMPTY_RX)
252 
253 #define GLOBAL      ((struct symtab_slot *)TT.globals_table.base)
254 #define LOCAL       ((struct symtab_slot *)TT.locals_table.base)
255 #define FUNC_DEF    ((struct functab_slot *)TT.func_def_table.base)
256 
257 #define LITERAL     ((struct zvalue *)TT.literals.base)
258 #define STACK       ((struct zvalue *)TT.stack.base)
259 #define FIELD       ((struct zvalue *)TT.fields.base)
260 
261 #define ZCODE       ((int *)TT.zcode.base)
262 
263 #define FUNC_DEFINED    (1u)
264 #define FUNC_CALLED     (2u)
265 
266 #define MIN_STACK_LEFT 1024
267 
// Element type of the function table (see the FUNC_DEF macro).
struct functab_slot {    // function symbol table entry
  unsigned flags;        // presumably FUNC_DEFINED / FUNC_CALLED bits
  char *name;
  struct zlist function_locals;
  int zcode_addr;
};
274 
// Elements of the hash table (key/value pairs)
// zmap_delete() releases the key, which leaves it 0 in a deleted slot.
struct zmap_slot {
  int hash;       // store hash key to speed hash table expansion
  struct zstring *key;
  struct zvalue val;
};
281 #define ZMSLOTINIT(hash, key, val) {hash, key, val}
282 
// zmap: Mapping data type for arrays; a hash table. Values in hash are either
// 0 (unused), -1 (marked deleted), or one plus the number of the zmap slot
// containing a key/value pair. The zlist slot entries are numbered from 0 to
// count-1, so need to add one to distinguish from unused.  The probe sequence
// is borrowed from Python dict, using the "perturb" idea to mix in upper bits
// of the original hash value.
// In the code visible here, count only grows (zmap_find_or_insert_key) and
// deletions are tallied separately in 'deleted' (zmap_delete).
struct zmap {
  unsigned mask;  // tablesize - 1; tablesize is 2 ** n
  int *hash;      // (mask + 1) elements
  int limit;      // 80% of table size ((mask+1)*8/10)
  int count;      // number of occupied slots in hash
  int deleted;    // number of deleted slots
  struct zlist slot;     // expanding list of zmap_slot elements
};
297 
298 #define MAPSLOT    ((struct zmap_slot *)(m->slot).base)
299 #define FFATAL(format, ...) zzerr("$" format, __VA_ARGS__)
300 #define FATAL(...) zzerr("$%s\n", __VA_ARGS__)
301 #define XERR(format, ...) zzerr(format, __VA_ARGS__)
302 
303 #define NO_EXIT_STATUS  (9999987)  // value unlikely to appear in exit stmt
304 
305 
306 
307 ////////////////////
308 //// lib
309 ////////////////////
310 
// Wrapper around free(); gives one place to hook deallocation.
static void xfree(void *p)
{
  free(p);
}
315 
// Return the numeric value (0..15) of hex digit character c.
// The caller must pass a valid hex digit; nothing is validated here.
static int hexval(int c)
{
  if (isdigit(c)) return c - '0';
  // OR-ing in 040 (0x20) maps 'A'..'F' onto 'a'..'f'.
  return (c | 040) - 'a' + 10;
}
321 
322 ////////////////////
323 //// common defs
324 ////////////////////
325 
// These (ops, keywords, builtins) must align with enum tokens
// Each table starts with one space and then holds fixed-width entries:
// 3 bytes per operator, 10 bytes per keyword or builtin. The lookup
// functions search for " text " with strstr() and divide the match offset
// by the entry width to get the token number. The ".." entry in ops fills
// the slot of tkcat, the fake concatenation operator.
static char *ops = " ;  ,  [  ]  (  )  {  }  $  ++ -- ^  !  *  /  %  +  -  .. "
        "<  <= != == >  >= ~  !~ && || ?  :  ^= %= *= /= += -= =  >> |  ";

static char *keywords = " in        BEGIN     END       if        else      "
    "while     for       do        break     continue  exit      function  "
    "return    next      nextfile  delete    print     printf    getline   ";

static char *builtins = " atan2     cos       sin       exp       log       "
    "sqrt      int       rand      srand     length    "
    "tolower   toupper   system    fflush    "
    "and       or        xor       lshift    rshift    "
    "close     index     match     split     "
    "sub       gsub      sprintf   substr    ";
340 
zzerr(char * format,...)341 static void zzerr(char *format, ...)
342 {
343   va_list args;
344   int fatal_sw = 0;
345   fprintf(stderr, "%s: ", TT.progname);
346   if (format[0] == '$') {
347     fprintf(stderr, "FATAL: ");
348     format++;
349     fatal_sw = 1;
350   }
351   fprintf(stderr, "file %s line %d: ", TT.scs->filename, TT.scs->line_num);
352   va_start(args, format);
353   vfprintf(stderr, format, args);
354   va_end(args);
355   if (format[strlen(format)-1] != '\n') fputc('\n', stderr); // TEMP FIXME !!!
356   fflush(stderr);
357   if (fatal_sw) awk_exit(2);
358         // Don't bump error count for warnings
359   else if (!strstr(format, "arning")) TT.cgl.compile_error_count++;
360 }
361 
get_token_text(char * op,int tk)362 static void get_token_text(char *op, int tk)
363 {
364   // This MUST ? be changed if ops string or tk... assignments change!
365   memmove(op, ops + 3 * (tk - tksemi) + 1, 2);
366   op[ op[1] == ' ' ? 1 : 2 ] = 0;
367 }
368 
369 ////////////////////
370 /// UTF-8
371 ////////////////////
372 
373 // Return number of bytes in 'cnt' utf8 codepoints
bytesinutf8(char * str,size_t len,size_t cnt)374 static int bytesinutf8(char *str, size_t len, size_t cnt)
375 {
376   if (FLAG(b)) return cnt;
377   unsigned wch;
378   char *lim = str + len, *s0 = str;
379   while (cnt-- && str < lim) {
380     int r = utf8towc(&wch, str, lim - str);
381     str += r > 0 ? r : 1;
382   }
383   return str - s0;
384 }
385 
386 // Return number of utf8 codepoints in str
utf8cnt(char * str,size_t len)387 static int utf8cnt(char *str, size_t len)
388 {
389   unsigned wch;
390   int cnt = 0;
391   char *lim;
392   if (!len || FLAG(b)) return len;
393   for (lim = str + len; str < lim; cnt++) {
394     int r = utf8towc(&wch, str, lim - str);
395     str += r > 0 ? r : 1;
396   }
397   return cnt;
398 }
399 
400 ////////////////////
401 ////   zlist
402 ////////////////////
403 
zlist_initx(struct zlist * p,size_t size,size_t count)404 static struct zlist *zlist_initx(struct zlist *p, size_t size, size_t count)
405 {
406   p->base = p->avail = xzalloc(count * size);
407   p->limit = p->base + size * count;
408   p->size = size;
409   return p;
410 }
411 
// Set up list p for elements of 'size' bytes, with initial room for as many
// elements as fit in SLIST_MAX_INIT_BYTES (integer division, so none when
// size is larger than that). Returns p.
static struct zlist *zlist_init(struct zlist *p, size_t size)
{
#define SLIST_MAX_INIT_BYTES 128
  return zlist_initx(p, size, SLIST_MAX_INIT_BYTES / size);
}
417 
418 // This is called from zlist_append() and add_stack() in run
zlist_expand(struct zlist * p)419 static void zlist_expand(struct zlist *p)
420 {
421   size_t offset = p->avail - p->base;
422   size_t cap = p->limit - p->base;
423   size_t newcap = maxof(cap + p->size, ((cap / p->size) * 3 / 2) * p->size);
424   if (newcap <= cap) error_exit("mem req error");
425   char *base = xrealloc(p->base, newcap);
426   p->base = base;
427   p->limit = base + newcap;
428   p->avail = base + offset;
429 }
430 
zlist_append(struct zlist * p,void * obj)431 static size_t zlist_append(struct zlist *p, void *obj)
432 {
433   // Insert obj (p->size bytes) at end of list, expand as needed.
434   // Return scaled offset to newly inserted obj; i.e. the
435   // "slot number" 0, 1, 2,...
436   void *objtemp = 0;
437   if (p->avail > p->limit - p->size) {
438     objtemp = xmalloc(p->size);     // Copy obj in case it is in
439     memmove(objtemp, obj, p->size); // the area realloc might free!
440     obj = objtemp;
441     zlist_expand(p);
442   }
443   memmove(p->avail, obj, p->size);
444   if (objtemp) xfree(objtemp);
445   p->avail += p->size;
446   return (p->avail - p->base - p->size) / p->size;  // offset of updated slot
447 }
448 
// Return the number of elements currently stored in list p.
static int zlist_len(struct zlist *p)
{
  return (p->avail - p->base) / p->size;
}
453 
454 ////////////////////
455 ////   zstring
456 ////////////////////
457 
zstring_release(struct zstring ** s)458 static void zstring_release(struct zstring **s)
459 {
460   if (*s && (**s).refcnt-- == 0) xfree(*s); //free_zstring(s);
461   *s = 0;
462 }
463 
// Add one reference to s; a null s is ignored.
static void zstring_incr_refcnt(struct zstring *s)
{
  if (s) s->refcnt++;
}
468 
469 // !! Use only if 'to' is NULL or its refcnt is 0.
zstring_modify(struct zstring * to,size_t at,char * s,size_t n)470 static struct zstring *zstring_modify(struct zstring *to, size_t at, char *s, size_t n)
471 {
472   size_t cap = at + n + 1;
473   if (!to || to->capacity < cap) {
474     to = xrealloc(to, sizeof(*to) + cap);
475     to->capacity = cap;
476     to->refcnt = 0;
477   }
478   memcpy(to->str + at, s, n);
479   to->size = at + n;
480   to->str[to->size] = '\0';
481   return to;
482 }
483 
484 // The 'to' pointer may move by realloc, so return (maybe updated) pointer.
485 // If refcnt is nonzero then there is another pointer to this zstring,
486 // so copy this one and release it. If refcnt is zero we can mutate this.
zstring_update(struct zstring * to,size_t at,char * s,size_t n)487 static struct zstring *zstring_update(struct zstring *to, size_t at, char *s, size_t n)
488 {
489   if (to && to->refcnt) {
490     struct zstring *to_before = to;
491     to = zstring_modify(0, 0, to->str, to->size);
492     zstring_release(&to_before);
493   }
494   return zstring_modify(to, at, s, n);
495 }
496 
// Replace the contents of 'to' with the contents of 'from' and return the
// (possibly relocated) destination; see zstring_update().
static struct zstring *zstring_copy(struct zstring *to, struct zstring *from)
{
  return zstring_update(to, 0, from->str, from->size);
}
501 
// Append the contents of 'from' to 'to' and return the (possibly relocated)
// destination. 'to' is dereferenced here, so it must not be NULL.
static struct zstring *zstring_extend(struct zstring *to, struct zstring *from)
{
  return zstring_update(to, to->size, from->str, from->size);
}
506 
// Allocate a new zstring holding a copy of the 'size' bytes at s, with a
// trailing NUL and refcnt 0.
static struct zstring *new_zstring(char *s, size_t size)
{
  return zstring_modify(0, 0, s, size);
}
511 
512 ////////////////////
513 ////   zvalue
514 ////////////////////
515 
516 static struct zvalue uninit_zvalue = ZVINIT(0, 0.0, 0);
517 
518 // This will be reassigned in init_globals() with an empty string.
519 // It's a special value used for "uninitialized" field vars
520 // referenced past $NF. See push_field().
521 static struct zvalue uninit_string_zvalue = ZVINIT(0, 0.0, 0);
522 
// Build a string zvalue (ZF_STR) holding a fresh zstring copy of the C
// string s. The length comes from strlen(), hence the restriction below.
static struct zvalue new_str_val(char *s)
{
  // Only if no nul inside string!
  struct zvalue v = ZVINIT(ZF_STR, 0.0, new_zstring(s, strlen(s)));
  return v;
}
529 
zvalue_release_zstring(struct zvalue * v)530 static void zvalue_release_zstring(struct zvalue *v)
531 {
532   if (v && ! (v->flags & (ZF_ANYMAP | ZF_RX))) zstring_release(&v->vst);
533 }
534 
535 // push_val() is used for initializing globals (see init_compiler())
536 // but mostly used in runtime
537 // WARNING: push_val may change location of v, so do NOT depend on it after!
538 // Note the incr refcnt used to be after the zlist_append, but that caused a
539 // heap-use-after-free error when the zlist_append relocated the zvalue being
540 // pushed, invalidating the v pointer.
push_val(struct zvalue * v)541 static void push_val(struct zvalue *v)
542 {
543   if (IS_STR(v) && v->vst) v->vst->refcnt++;  // inlined zstring_incr_refcnt()
544   *++TT.stackp = *v;
545 }
546 
zvalue_copy(struct zvalue * to,struct zvalue * from)547 static void zvalue_copy(struct zvalue *to, struct zvalue *from)
548 {
549   if (IS_RX(from)) *to = *from;
550   else {
551     zvalue_release_zstring(to);
552     *to = *from;
553     zstring_incr_refcnt(to->vst);
554   }
555 }
556 
zvalue_dup_zstring(struct zvalue * v)557 static void zvalue_dup_zstring(struct zvalue *v)
558 {
559   struct zstring *z = new_zstring(v->vst->str, v->vst->size);
560   zstring_release(&v->vst);
561   v->vst = z;
562 }
563 
564 ////////////////////
565 ////   zmap (array) implementation
566 ////////////////////
567 
zstring_match(struct zstring * a,struct zstring * b)568 static int zstring_match(struct zstring *a, struct zstring *b)
569 {
570   return a->size == b->size && memcmp(a->str, b->str, a->size) == 0;
571 }
572 
zstring_hash(struct zstring * s)573 static int zstring_hash(struct zstring *s)
574 {   // djb2 -- small, fast, good enough for this
575   unsigned h = 5381;
576   char *p = s->str, *lim = p + s->size;
577   while (p < lim)
578     h = (h << 5) + h + *p++;
579   return h;
580 }
581 
582 enum { PSHIFT = 5 };  // "perturb" shift -- see find_mapslot() below
583 
find_mapslot(struct zmap * m,struct zstring * key,int * hash,int * probe)584 static struct zmap_slot *find_mapslot(struct zmap *m, struct zstring *key, int *hash, int *probe)
585 {
586   struct zmap_slot *x = 0;
587   unsigned perturb = *hash = zstring_hash(key);
588   *probe = *hash & m->mask;
589   int n, first_deleted = -1;
590   while ((n = m->hash[*probe])) {
591     if (n > 0) {
592       x = &MAPSLOT[n-1];
593       if (*hash == x->hash && zstring_match(key, x->key)) {
594         return x;
595       }
596     } else if (first_deleted < 0) first_deleted = *probe;
597     // Based on technique in Python dict implementation. Comment there
598     // (https://github.com/python/cpython/blob/3.10/Objects/dictobject.c)
599     // says
600     //
601     // j = ((5*j) + 1) mod 2**i
602     // For any initial j in range(2**i), repeating that 2**i times generates
603     // each int in range(2**i) exactly once (see any text on random-number
604     // generation for proof).
605     //
606     // The addition of 'perturb' greatly improves the probe sequence. See
607     // the Python dict implementation for more details.
608     *probe = (*probe * 5 + 1 + (perturb >>= PSHIFT)) & m->mask;
609   }
610   if (first_deleted >= 0) *probe = first_deleted;
611   return 0;
612 }
613 
// Return a pointer to the value stored under key in map m, or 0 if the key
// is not present. Does not modify the map.
static struct zvalue *zmap_find(struct zmap *m, struct zstring *key)
{
  int hash, probe;
  struct zmap_slot *x = find_mapslot(m, key, &hash, &probe);
  return x ? &x->val : 0;
}
620 
zmap_init(struct zmap * m)621 static void zmap_init(struct zmap *m)
622 {
623   enum {INIT_SIZE = 8};
624   m->mask = INIT_SIZE - 1;
625   m->hash = xzalloc(INIT_SIZE * sizeof(*m->hash));
626   m->limit = INIT_SIZE * 8 / 10;
627   m->count = 0;
628   m->deleted = 0;
629   zlist_init(&m->slot, sizeof(struct zmap_slot));
630 }
631 
zvalue_map_init(struct zvalue * v)632 static void zvalue_map_init(struct zvalue *v)
633 {
634   struct zmap *m = xmalloc(sizeof(*m));
635   zmap_init(m);
636   v->map = m;
637   v->flags |= ZF_MAP;
638 }
639 
zmap_delete_map_incl_slotdata(struct zmap * m)640 static void zmap_delete_map_incl_slotdata(struct zmap *m)
641 {
642   for (struct zmap_slot *p = &MAPSLOT[0]; p < &MAPSLOT[zlist_len(&m->slot)]; p++) {
643     if (p->key) zstring_release(&p->key);
644     if (p->val.vst) zstring_release(&p->val.vst);
645   }
646   xfree(m->slot.base);
647   xfree(m->hash);
648 }
649 
// Empty map m: free all of its contents, then reinitialize it as an empty
// map so it can be used again.
static void zmap_delete_map(struct zmap *m)
{
  zmap_delete_map_incl_slotdata(m);
  zmap_init(m);
}
655 
zmap_rehash(struct zmap * m)656 static void zmap_rehash(struct zmap *m)
657 {
658   // New table is twice the size of old.
659   int size = m->mask + 1;
660   unsigned mask = 2 * size - 1;
661   int *h = xzalloc(2 * size * sizeof(*m->hash));
662   // Step through the old hash table, set up location in new table.
663   for (int i = 0; i < size; i++) {
664     int n = m->hash[i];
665     if (n > 0) {
666       int hash = MAPSLOT[n-1].hash;
667       unsigned perturb = hash;
668       int p = hash & mask;
669       while (h[p]) {
670         p = (p * 5 + 1 + (perturb >>= PSHIFT)) & mask;
671       }
672       h[p] = n;
673     }
674   }
675   m->mask = mask;
676   xfree(m->hash);
677   m->hash = h;
678   m->limit = 2 * size * 8 / 10;
679 }
680 
zmap_find_or_insert_key(struct zmap * m,struct zstring * key)681 static struct zmap_slot *zmap_find_or_insert_key(struct zmap *m, struct zstring *key)
682 {
683   int hash, probe;
684   struct zmap_slot *x = find_mapslot(m, key, &hash, &probe);
685   if (x) return x;
686   // not found; insert it.
687   if (m->count == m->limit) {
688     zmap_rehash(m);         // rehash if getting too full.
689     // rerun find_mapslot to get new probe index
690     x = find_mapslot(m, key, &hash, &probe);
691   }
692   // Assign key to new slot entry and bump refcnt.
693   struct zmap_slot zs = ZMSLOTINIT(hash, key, (struct zvalue)ZVINIT(0, 0.0, 0));
694   zstring_incr_refcnt(key);
695   int n = zlist_append(&m->slot, &zs);
696   m->count++;
697   m->hash[probe] = n + 1;
698   return &MAPSLOT[n];
699 }
700 
zmap_delete(struct zmap * m,struct zstring * key)701 static void zmap_delete(struct zmap *m, struct zstring *key)
702 {
703   int hash, probe;
704   struct zmap_slot *x = find_mapslot(m, key, &hash, &probe);
705   if (!x) return;
706   zstring_release(&MAPSLOT[m->hash[probe] - 1].key);
707   m->hash[probe] = -1;
708   m->deleted++;
709 }
710 
711 ////////////////////
712 //// scan (lexical analyzer)
713 ////////////////////
714 
715 // TODO:
716 // IS line_num getting incr correctly? Newline counts as start of line!?
717 // Handle nuls in file better.
718 // Open files "rb" and handle CRs in program.
719 // Roll gch() into get_char() ?
720 // Deal with signed char (at EOF? elsewhere?)
721 //
722 // 2023-01-11: Allow nul bytes inside strings? regexes?
723 
progfile_open(void)724 static void progfile_open(void)
725 {
726   TT.scs->filename = TT.scs->prog_args->arg;
727   TT.scs->prog_args = TT.scs->prog_args->next;
728   TT.scs->fp = stdin;
729   if (strcmp(TT.scs->filename, "-")) TT.scs->fp = fopen(TT.scs->filename, "r");
730   if (!TT.scs->fp) error_exit("Can't open %s", TT.scs->filename);
731   TT.scs->line_num = 0;
732 }
733 
get_char(void)734 static int get_char(void)
735 {
736   static char *nl = "\n";
737   // On first entry, TT.scs->p points to progstring if any, or null string.
738   for (;;) {
739     int c = *(TT.scs->p)++;
740     if (c) {
741       return c;
742     }
743     if (TT.scs->progstring) {  // Fake newline at end of progstring.
744       if (TT.scs->progstring == nl) return EOF;
745       TT.scs->p = TT.scs->progstring = nl;
746       continue;
747     }
748     // Here if getting from progfile(s).
749     if (TT.scs->line == nl) return EOF;
750     if (!TT.scs->fp) {
751       progfile_open();
752     }
753     // Save last char to allow faking final newline.
754     int lastchar = (TT.scs->p)[-2];
755     TT.scs->line_len = getline(&TT.scs->line, &TT.scs->line_size, TT.scs->fp);
756     if (TT.scs->line_len > 0) {
757       TT.scs->line_num++;
758       TT.scs->p = TT.scs->line;
759       continue;
760     }
761     // EOF
762     // FIXME TODO or check for error? feof() vs. ferror()
763     fclose(TT.scs->fp);
764     TT.scs->fp = 0;
765     TT.scs->p = "  " + 2;
766     if (!TT.scs->prog_args) {
767       xfree(TT.scs->line);
768       if (lastchar == '\n') return EOF;
769       // Fake final newline
770       TT.scs->line = TT.scs->p = nl;
771     }
772   }
773 }
774 
append_this_char(int c)775 static void append_this_char(int c)
776 {
777   if (TT.scs->toklen == TT.scs->maxtok - 1) {
778     TT.scs->maxtok *= 2;
779     TT.scs->tokstr = xrealloc(TT.scs->tokstr, TT.scs->maxtok);
780   }
781   TT.scs->tokstr[TT.scs->toklen++] = c;
782   TT.scs->tokstr[TT.scs->toklen] = 0;
783 }
784 
// Read the next program character into TT.scs->ch, skipping any carriage
// returns.
static void gch(void)
{
  // FIXME probably not right place to skip CRs.
  do {
    TT.scs->ch = get_char();
  } while (TT.scs->ch == '\r');
}
792 
// Append the current character to the token text, then advance to the next
// character.
static void append_char(void)
{
  append_this_char(TT.scs->ch);
  gch();
}
798 
find_keyword_or_builtin(char * table,int first_tok_in_table)799 static int find_keyword_or_builtin(char *table,
800     int first_tok_in_table)
801 {
802   char s[16] = " ", *p;
803   // keywords and builtin functions are spaced 10 apart for strstr() lookup,
804   // so must be less than that long.
805   if (TT.scs->toklen >= 10) return 0;
806   strcat(s, TT.scs->tokstr);
807   strcat(s, " ");
808   p = strstr(table, s);
809   if (!p) return 0;
810   return first_tok_in_table + (p - table) / 10;
811 }
812 
find_token(void)813 static int find_token(void)
814 {
815   char s[6] = " ", *p;
816   // tokens are spaced 3 apart for strstr() lookup, so must be less than
817   // that long.
818   strcat(s, TT.scs->tokstr);
819   strcat(s, " ");
820   p = strstr(ops, s);
821   if (!p) return 0;
822   return tksemi + (p - ops) / 3;
823 }
824 
// Return the keyword token (tkin ...) matching the current token text, or 0
// if it is not a keyword.
static int find_keyword(void)
{
  return find_keyword_or_builtin(keywords, tkin);
}
829 
// Return the builtin-function token (tkatan2 ...) matching the current
// token text, or 0 if it is not a builtin.
static int find_builtin(void)
{
  return find_keyword_or_builtin(builtins, tkatan2);
}
834 
get_number(void)835 static void get_number(void)
836 {
837   // Assumes TT.scs->ch is digit or dot on entry.
838   // TT.scs->p points to the following character.
839   // OK formats: 1 1. 1.2 1.2E3 1.2E+3 1.2E-3 1.E2 1.E+2 1.E-2 1E2 .1 .1E2
840   // .1E+2 .1E-2
841   // NOT OK: . .E .E1 .E+ .E+1 ; 1E .1E 1.E 1.E+ 1.E- parse as number
842   // followed by variable E.
843   // gawk accepts 12.E+ and 12.E- as 12; nawk & mawk say syntax error.
844   char *leftover;
845   int len;
846   TT.scs->numval = strtod(TT.scs->p - 1, &leftover);
847   len = leftover - TT.scs->p + 1;
848   if (len == 0) {
849     append_char();
850     TT.scs->toktype = ERROR;
851     TT.scs->tok = tkerr;
852     TT.scs->error = 1;
853     FFATAL("Unexpected token '%s'\n", TT.scs->tokstr);
854     return;
855   }
856   while (len--)
857     append_char();
858 }
859 
// Scan the body of a string or regex literal into the token text.
// endchar is the closing delimiter; it is compared against '/' below to
// tell a regex literal from a string. On entry the current character is
// the opening delimiter; on a normal exit the closing delimiter has been
// consumed. awk escape sequences are translated as they are read.
static void get_string_or_regex(int endchar)
{
  gch();    // skip the opening delimiter
  while (TT.scs->ch != endchar) {
    if (TT.scs->ch == '\n') {
      // FIXME Handle unterminated string or regex. Is this OK?
      // FIXME TODO better diagnostic here?
      XERR("%s\n", "unterminated string or regex");
      break;
    } else if (TT.scs->ch == '\\') {
      // \\ \a \b \f \n \r \t \v \" \/ \ddd
      char *p, *escapes = "\\abfnrtv\"/";
      gch();
      if (TT.scs->ch == '\n') {  // backslash newline is continuation
        gch();
        continue;
      } else if ((p = strchr(escapes, TT.scs->ch))) {
        // posix regex does not use these escapes,
        // but awk does, so do them.
        // The replacement string is indexed in parallel with 'escapes'.
        int c = "\\\a\b\f\n\r\t\v\"/"[p-escapes];
        append_this_char(c);
        // Need to double up \ inside literal regex
        if (endchar == '/' && c == '\\') append_this_char('\\');
        gch();
      } else if (TT.scs->ch == 'x') {
        // \x followed by one or two hex digits; a bare \x gives plain 'x'.
        gch();
        if (isxdigit(TT.scs->ch)) {
          int c = hexval(TT.scs->ch);
          gch();
          if (isxdigit(TT.scs->ch)) {
            c = c * 16 + hexval(TT.scs->ch);
            gch();
          }
          append_this_char(c);
        } else append_this_char('x');
      } else if (TT.scs->ch == 'u') {
        // \u followed by 1 to 8 hex digits: append the bytes wctoutf8()
        // produces for that code point. A bare \u gives plain 'u'.
        gch();
        if (isxdigit(TT.scs->ch)) {
          int i = 0, j = 0, c = 0;
          char codep[9] = {0};
          do {
            codep[j++] = TT.scs->ch;
            gch();
          } while (j < 8 && isxdigit(TT.scs->ch));
          c = strtol(codep, 0, 16);
          // codep is reused as the output buffer for the encoded bytes.
          for (i = wctoutf8(codep, c), j = 0; j < i; j++)
            append_this_char(codep[j]);
        } else append_this_char('u');
      } else if (isdigit(TT.scs->ch)) {
        if (TT.scs->ch < '8') {
          // Octal escape: up to three octal digits.
          int k, c = 0;
          for (k = 0; k < 3; k++) {
            if (isdigit(TT.scs->ch) && TT.scs->ch < '8') {
              c = c * 8 + TT.scs->ch - '0';
              gch();
            } else
              break;
          }
          append_this_char(c);
        } else {
          // \8 or \9: keep the digit itself.
          append_char();
        }
      } else {
        // Not an awk escape. The escaped character is not consumed here;
        // the next loop iteration handles it as an ordinary character.
        if (endchar == '/') {
          // pass \ unmolested if not awk escape,
          // so that regex routines can see it.
          if (!strchr(".[]()*+?{}|^$-", TT.scs->ch)) {
            XERR("warning: '\\%c' -- unknown regex escape\n", TT.scs->ch);
          }
          append_this_char('\\');
        } else {
          XERR("warning: '\\%c' treated as plain '%c'\n", TT.scs->ch, TT.scs->ch);
        }
      }
    } else if (TT.scs->ch == EOF) {
      FATAL("EOF in string or regex\n");
    } else {
      append_char();
    }
  }
  // Step past the closing delimiter (or past the newline after the
  // unterminated-literal break above).
  gch();
}
942 
ascan_opt_div(int div_op_allowed_here)943 static void ascan_opt_div(int div_op_allowed_here)
944 {
945   int n;
946   for (;;) {
947     TT.scs->tokbuiltin = 0;
948     TT.scs->toklen = 0;
949     TT.scs->tokstr[0] = 0;
950     while (TT.scs->ch == ' ' || TT.scs->ch == '\t')
951       gch();
952     if (TT.scs->ch == '\\') {
953       append_char();
954       if (TT.scs->ch == '\n') {
955         gch();
956         continue;
957       }
958       TT.scs->toktype = ERROR;   // \ not last char in line.
959       TT.scs->tok = tkerr;
960       TT.scs->error = 3;
961       FATAL("backslash not last char in line\n");
962       return;
963     }
964     break;
965   }
966   // Note \<NEWLINE> in comment does not continue it.
967   if (TT.scs->ch == '#') {
968     gch();
969     while (TT.scs->ch != '\n')
970       gch();
971     // Need to fall through here to pick up newline.
972   }
973   if (TT.scs->ch == '\n') {
974     TT.scs->toktype = NEWLINE;
975     TT.scs->tok = tknl;
976     append_char();
977   } else if (isalpha(TT.scs->ch) || TT.scs->ch == '_') {
978     append_char();
979     while (isalnum(TT.scs->ch) || TT.scs->ch == '_') {
980       append_char();
981     }
982     if ((n = find_keyword()) != 0) {
983       TT.scs->toktype = KEYWORD;
984       TT.scs->tok = n;
985     } else if ((n = find_builtin()) != 0) {
986       TT.scs->toktype = BUILTIN;
987       TT.scs->tok = tkbuiltin;
988       TT.scs->tokbuiltin = n;
989     } else if (TT.scs->ch == '(') {
990       TT.scs->toktype = USERFUNC;
991       TT.scs->tok = tkfunc;
992     } else {
993       TT.scs->toktype = VAR;
994       TT.scs->tok = tkvar;
995       // skip whitespace to be able to check for , or )
996       while (TT.scs->ch == ' ' || TT.scs->ch == '\t')
997         gch();
998     }
999     return;
1000   } else if (TT.scs->ch == '"') {
1001     TT.scs->toktype = STRING;
1002     TT.scs->tok = tkstring;
1003     get_string_or_regex('"');
1004   } else if (isdigit(TT.scs->ch) || TT.scs->ch == '.') {
1005     TT.scs->toktype = NUMBER;
1006     TT.scs->tok = tknumber;
1007     get_number();
1008   } else if (TT.scs->ch == '/' && ! div_op_allowed_here) {
1009     TT.scs->toktype = REGEX;
1010     TT.scs->tok = tkregex;
1011     get_string_or_regex('/');
1012   } else if (TT.scs->ch == EOF) {
1013     TT.scs->toktype = EOF;
1014     TT.scs->tok = tkeof;
1015   } else if (TT.scs->ch == '\0') {
1016     append_char();
1017     TT.scs->toktype = ERROR;
1018     TT.scs->tok = tkerr;
1019     TT.scs->error = 5;
1020     FATAL("null char\n");
1021   } else {
1022     // All other tokens.
1023     TT.scs->toktype = TT.scs->ch;
1024     append_char();
1025     // Special case for **= and ** tokens
1026     if (TT.scs->toktype == '*' && TT.scs->ch == '*') {
1027       append_char();
1028       if (TT.scs->ch == '=') {
1029         append_char();
1030         TT.scs->tok = tkpowasgn;
1031       } else TT.scs->tok = tkpow;
1032       TT.scs->toktype = TT.scs->tok + 200;
1033       return;
1034     }
1035     // Is it a 2-character token?
1036     if (TT.scs->ch != ' ' && TT.scs->ch != '\n') {
1037       append_this_char(TT.scs->ch);
1038       if (find_token()) {
1039         TT.scs->tok = find_token();
1040         TT.scs->toktype = TT.scs->tok + 200;
1041         gch();  // Eat second char of token.
1042         return;
1043       }
1044       TT.scs->toklen--;  // Not 2-character token; back off.
1045       TT.scs->tokstr[TT.scs->toklen] = 0;
1046     }
1047     TT.scs->tok = find_token();
1048     if (TT.scs->tok) return;
1049     TT.scs->toktype = ERROR;
1050     TT.scs->tok = tkerr;
1051     TT.scs->error = 4;
1052     FFATAL("Unexpected token '%s'\n", TT.scs->tokstr);
1053   }
1054 }
1055 
scan_opt_div(int div_op_allowed_here)1056 static void scan_opt_div(int div_op_allowed_here)
1057 {
1058   // TODO FIXME need better diags for bad tokens!
1059   // TODO Also set global syntax error flag.
1060   do ascan_opt_div(div_op_allowed_here); while (TT.scs->tok == tkerr);
1061 }
1062 
init_scanner(void)1063 static void init_scanner(void)
1064 {
1065   TT.prevtok = tkeof;
1066   gch();
1067 }
1068 
1069 // POSIX says '/' does not begin a regex wherever '/' or '/=' can mean divide.
1070 // Pretty sure if / or /= comes after these, it means divide:
1071 static char div_preceders[] = {tknumber, tkstring, tkvar, tkgetline, tkrparen, tkrbracket, tkincr, tkdecr, 0};
1072 
1073 // For checking end of prev statement for termination and if '/' can come next
1074 
scan(void)1075 static void scan(void)
1076 {
1077   TT.prevtok = TT.scs->tok;
1078   if (TT.prevtok && strchr(div_preceders, TT.prevtok)) scan_opt_div(1);
1079   else scan_opt_div(0);
1080   TT.tokstr = TT.scs->tokstr;
1081 }
1082 
1083 ////////////////////
1084 //// compile
1085 ////////////////////
1086 
1087 //  NOTES:
1088 //  NL ok after , { && || do else OR after right paren after if/while/for
1089 //  TODO:
1090 //    see case tkgetline -- test more
1091 //    case tkmatchop, tknotmatch -- fix ~ (/re/)
1092 
1093 // Forward declarations -- for mutually recursive parsing functions
1094 static int expr(int rbp);
1095 static void lvalue(void);
1096 static int primary(void);
1097 static void stmt(void);
1098 static void action(int action_type);
1099 
1100 #define CURTOK() (TT.scs->tok)
1101 #define ISTOK(toknum) (TT.scs->tok == (toknum))
1102 
// If the current token is tk, consume it and return 1; otherwise leave the
// scanner untouched and return 0.
static int havetok(int tk)
{
  if (ISTOK(tk)) {
    scan();
    return 1;
  }
  return 0;
}
1109 
1110 //// code and "literal" emitters
gencd(int op)1111 static void gencd(int op)
1112 {
1113   TT.zcode_last = zlist_append(&TT.zcode, &op);
1114 }
1115 
// Emit a two-word instruction: op followed by its operand n.
static void gen2cd(int op, int n)
{
  gencd(op);
  gencd(n);
}
1121 
make_literal_str_val(char * s)1122 static int make_literal_str_val(char *s)
1123 {
1124   // Only if no nul inside string!
1125   struct zvalue v = new_str_val(s);
1126   return zlist_append(&TT.literals, &v);
1127 }
1128 
make_literal_regex_val(char * s)1129 static int make_literal_regex_val(char *s)
1130 {
1131   regex_t *rx;
1132   rx = xmalloc(sizeof(*rx));
1133   xregcomp(rx, s, REG_EXTENDED);
1134   struct zvalue v = ZVINIT(ZF_RX, 0, 0);
1135   v.rx = rx;
1136   // Flag empty rx to make it easy to identify for split() special case
1137   if (!*s) v.flags |= ZF_EMPTY_RX;
1138   return zlist_append(&TT.literals, &v);
1139 }
1140 
make_literal_num_val(double num)1141 static int make_literal_num_val(double num)
1142 {
1143   struct zvalue v = ZVINIT(ZF_NUM, num, 0);
1144   return zlist_append(&TT.literals, &v);
1145 }
1146 
make_uninit_val(void)1147 static int make_uninit_val(void)
1148 {
1149   return zlist_append(&TT.literals, &uninit_zvalue);
1150 }
1151 //// END code and "literal" emitters
1152 
1153 //// Symbol tables functions
find_func_def_entry(char * s)1154 static int find_func_def_entry(char *s)
1155 {
1156   for (int k = 1; k < zlist_len(&TT.func_def_table); k++)
1157     if (!strcmp(s, FUNC_DEF[k].name)) return k;
1158   return 0;
1159 }
1160 
add_func_def_entry(char * s)1161 static int add_func_def_entry(char *s)
1162 {
1163   struct functab_slot ent = {0, 0, {0, 0, 0, 0}, 0};
1164   ent.name = xstrdup(s);
1165   int slotnum = zlist_append(&TT.func_def_table, &ent);
1166   return slotnum;
1167 }
1168 
find_global(char * s)1169 static int find_global(char *s)
1170 {
1171   for (int k = 1; k < zlist_len(&TT.globals_table); k++)
1172     if (!strcmp(s, GLOBAL[k].name)) return k;
1173   return 0;
1174 }
1175 
add_global(char * s)1176 static int add_global(char *s)
1177 {
1178   struct symtab_slot ent = {0, 0};
1179   ent.name = xstrdup(s);
1180   int slotnum = zlist_append(&TT.globals_table, &ent);
1181   return slotnum;
1182 }
1183 
find_local_entry(char * s)1184 static int find_local_entry(char *s)
1185 {
1186   for (int k = 1; k < zlist_len(&TT.locals_table); k++)
1187     if (!strcmp(s, LOCAL[k].name)) return k;
1188   return 0;
1189 }
1190 
add_local_entry(char * s)1191 static int add_local_entry(char *s)
1192 {
1193   struct symtab_slot ent = {0, 0};
1194   ent.name = xstrdup(s);
1195   int slotnum = zlist_append(&TT.locals_table, &ent);
1196   return slotnum;
1197 }
1198 
find_or_add_var_name(void)1199 static int find_or_add_var_name(void)
1200 {
1201   int slotnum = 0;    // + means global; - means local to function
1202   int globals_ent = 0;
1203   int locals_ent = find_local_entry(TT.tokstr);   // in local symbol table?
1204   if (locals_ent) {
1205     slotnum = -locals_ent;
1206   } else {
1207     globals_ent = find_global(TT.tokstr);
1208     if (!globals_ent) globals_ent = add_global(TT.tokstr);
1209     slotnum = globals_ent;
1210     if (find_func_def_entry(TT.tokstr))
1211       // POSIX: The same name shall not be used both as a variable name
1212       // with global scope and as the name of a function.
1213       XERR("var '%s' used as function name\n", TT.tokstr);
1214   }
1215   return slotnum;
1216 }
1217 
1218 //// END Symbol tables functions
1219 
1220 //// Initialization
init_locals_table(void)1221 static void init_locals_table(void)
1222 {
1223   static struct symtab_slot locals_ent;
1224   zlist_init(&TT.locals_table, sizeof(struct symtab_slot));
1225   zlist_append(&TT.locals_table, &locals_ent);
1226 }
1227 
init_tables(void)1228 static void init_tables(void)
1229 {
1230   static struct symtab_slot global_ent;
1231   static struct functab_slot func_ent;
1232 
1233   // Append dummy elements in lists to force valid offsets nonzero.
1234   zlist_init(&TT.globals_table, sizeof(struct symtab_slot));
1235   zlist_append(&TT.globals_table, &global_ent);
1236   zlist_init(&TT.func_def_table, sizeof(struct functab_slot));
1237   zlist_append(&TT.func_def_table, &func_ent);
1238   init_locals_table();
1239   zlist_init(&TT.zcode, sizeof(int));
1240   gencd(tkeof);   // to ensure zcode offsets are non-zero
1241   zlist_init(&TT.literals, sizeof(struct zvalue));
1242   // Init stack size at twice MIN_STACK_LEFT. MIN_STACK_LEFT is at least as
1243   // many entries as any statement may ever take.  Currently there is no diag
1244   // if this is exceeded; prog. will probably crash. 1024 should be plenty?
1245   zlist_initx(&TT.stack, sizeof(struct zvalue), 2 * MIN_STACK_LEFT);
1246   TT.stackp = (struct zvalue *)TT.stack.base;
1247   zlist_init(&TT.fields, sizeof(struct zvalue));
1248   zlist_append(&TT.literals, &uninit_zvalue);
1249   zlist_append(&TT.stack, &uninit_zvalue);
1250   zlist_append(&TT.fields, &uninit_zvalue);
1251   FIELD[0].vst = new_zstring("", 0);
1252 }
1253 
init_compiler(void)1254 static void init_compiler(void)
1255 {
1256   // Special variables (POSIX). Must align with enum spec_var_names
1257   static char *spec_vars[] = { "ARGC", "ARGV", "CONVFMT", "ENVIRON", "FILENAME",
1258       "FNR", "FS", "NF", "NR", "OFMT", "OFS", "ORS", "RLENGTH", "RS", "RSTART",
1259       "SUBSEP", 0};
1260 
1261   init_tables();
1262   for (int k = 0; spec_vars[k]; k++) {
1263     TT.spec_var_limit = add_global(spec_vars[k]);
1264     GLOBAL[TT.spec_var_limit++].flags |= (k == 1 || k == 3) ? ZF_MAP : ZF_SCALAR;
1265     push_val(&uninit_zvalue);
1266   }
1267 }
1268 //// END Initialization
1269 
1270 //// Parsing and compiling to TT.zcode
// Left binding powers, indexed by token number.
// Entry order must align with enum Toks.
static int lbp_table[] = {
  0, 0, 0, 0,                     // tkunusedtoken, tkeof, tkerr, tknl,
  250, 250, 250,                  // tkvar, tknumber, tkstring,
  250, 250, 250,                  // tkregex, tkfunc, tkbuiltin,
  0, 0, 210, 0,                   // tksemi, tkcomma, tklbracket, tkrbracket,
  200, 0, 0, 0,                   // tklparen, tkrparen, tklbrace, tkrbrace,
  190, 180, 180, 170, 160,        // tkfield, tkincr, tkdecr, tkpow, tknot,
  150, 150, 150, 140, 140,        // tkmul, tkdiv, tkmod, tkplus, tkminus,
  130,                            // tkcat, // FAKE (?) optor for concatenation
                                  //   (adjacent string exprs)
  110, 110, 110, 110, 110, 110,   // tklt, tkle, tkne, tkeq, tkgt, tkge,
  100, 100,                       // tkmatchop, tknotmatch,
  80, 70,                         // tkand, tkor,
  60, 0,                          // tkternif, tkternelse,
  50, 50, 50, 50,                 // tkpowasgn, tkmodasgn, tkmulasgn, tkdivasgn,
  50, 50, 50,                     // tkaddasgn, tksubasgn, tkasgn,
  0, 120,                         // tkappend, tkpipe,
  90                              // tkin
};
1290 
getlbp(int tok)1291 static int getlbp(int tok)
1292 {
1293   // FIXME: should tkappend be here too? is tkpipe needed?
1294   // In print statement outside parens: make '>' end an expression
1295   if (TT.cgl.in_print_stmt && ! TT.cgl.paren_level && (tok == tkgt || tok == tkpipe))
1296     return 0;
1297   return (0 <= tok && tok <= tkin) ? lbp_table[tok] :
1298     // getline is special, not a normal builtin.
1299     // close, index, match, split, sub, gsub, sprintf, substr
1300     // are really builtin functions though bwk treats them as keywords.
1301     (tkgetline <= tok && tok <= tksubstr) ? 240 : 0;     // FIXME 240 is temp?
1302 }
1303 
// Get right binding power. Same as the left binding power, except that it is
// one lower when the left binding power is <= 60 or exactly 170:
// ternary (?:), assignment, power ops are right associative.
static int getrbp(int tok)
{
  int bp = getlbp(tok);

  if (bp <= 60 || bp == 170) bp--;
  return bp;
}
1311 
// Called when end of program is reached during error recovery:
// give up via error_exit().
static void unexpected_eof(void)
{
  error_exit("terminated with error(s)");
}
1316 
1317 //// syntax error diagnostic and recovery (Turner's method)
1318 // D.A. Turner, Error diagnosis and recovery in one pass compilers,
1319 // Information Processing Letters, Volume 6, Issue 4, 1977, Pages 113-115
1320 static int recovering = 0;
1321 
complain(int tk)1322 static void complain(int tk)
1323 {
1324   char op[3], tkstr[10];
1325   if (recovering) return;
1326   recovering = 1;
1327   if (!strcmp(TT.tokstr, "\n")) TT.tokstr = "<newline>";
1328   if (tksemi <= tk && tk <= tkpipe) {
1329     get_token_text(op, tk);
1330     XERR("syntax near '%s' -- '%s' expected\n", TT.tokstr, op);
1331   } else if (tk >= tkin && tk <= tksubstr) {
1332     if (tk < tkatan2) memmove(tkstr, keywords + 1 + 10 * (tk - tkin), 10);
1333     else memmove(tkstr, builtins + 1 + 10 * (tk - tkatan2), 10);
1334     *strchr(tkstr, ' ') = 0;
1335     XERR("syntax near '%s' -- '%s' expected\n", TT.tokstr, tkstr);
1336   } else XERR("syntax near '%s'\n", TT.tokstr);
1337 }
1338 
expect(int tk)1339 static void expect(int tk)
1340 {
1341   if (recovering) {
1342     while (!ISTOK(tkeof) && !ISTOK(tk))
1343       scan();
1344     if (ISTOK(tkeof)) unexpected_eof();
1345     scan(); // consume expected token
1346     recovering = 0;
1347   } else if (!havetok(tk)) complain(tk);
1348 }
1349 
skip_to(char * tklist)1350 static void skip_to(char *tklist)
1351 {
1352   do scan(); while (!ISTOK(tkeof) && !strchr(tklist, CURTOK()));
1353   if (ISTOK(tkeof)) unexpected_eof();
1354 }
1355 
1356 //// END syntax error diagnostic and recovery (Turner's method)
1357 
optional_nl_or_semi(void)1358 static void optional_nl_or_semi(void)
1359 {
1360   while (havetok(tknl) || havetok(tksemi))
1361     ;
1362 }
1363 
optional_nl(void)1364 static void optional_nl(void)
1365 {
1366   while (havetok(tknl))
1367     ;
1368 }
1369 
rparen(void)1370 static void rparen(void)
1371 {
1372   expect(tkrparen);
1373   optional_nl();
1374 }
1375 
have_comma(void)1376 static int have_comma(void)
1377 {
1378   if (!havetok(tkcomma)) return 0;
1379   optional_nl();
1380   return 1;
1381 }
1382 
check_set_map(int slotnum)1383 static void check_set_map(int slotnum)
1384 {
1385   // POSIX: The same name shall not be used within the same scope both as
1386   // a scalar variable and as an array.
1387   if (slotnum < 0 && LOCAL[-slotnum].flags & ZF_SCALAR)
1388     XERR("scalar param '%s' used as array\n", LOCAL[-slotnum].name);
1389   if (slotnum > 0 && GLOBAL[slotnum].flags & ZF_SCALAR)
1390     XERR("scalar var '%s' used as array\n", GLOBAL[slotnum].name);
1391   if (slotnum < 0) LOCAL[-slotnum].flags |= ZF_MAP;
1392   if (slotnum > 0) GLOBAL[slotnum].flags |= ZF_MAP;
1393 }
1394 
check_set_scalar(int slotnum)1395 static void check_set_scalar(int slotnum)
1396 {
1397   if (slotnum < 0 && LOCAL[-slotnum].flags & ZF_MAP)
1398     XERR("array param '%s' used as scalar\n", LOCAL[-slotnum].name);
1399   if (slotnum > 0 && GLOBAL[slotnum].flags & ZF_MAP)
1400     XERR("array var '%s' used as scalar\n", GLOBAL[slotnum].name);
1401   if (slotnum < 0) LOCAL[-slotnum].flags |= ZF_SCALAR;
1402   if (slotnum > 0) GLOBAL[slotnum].flags |= ZF_SCALAR;
1403 }
1404 
map_name(void)1405 static void map_name(void)
1406 {
1407   int slotnum;
1408   check_set_map(slotnum = find_or_add_var_name());
1409   gen2cd(tkvar, slotnum);
1410 }
1411 
check_builtin_arg_counts(int tk,int num_args,char * fname)1412 static void check_builtin_arg_counts(int tk, int num_args, char *fname)
1413 {
1414   static char builtin_1_arg[] = { tkcos, tksin, tkexp, tklog, tksqrt, tkint,
1415                                   tktolower, tktoupper, tkclose, tksystem, 0};
1416   static char builtin_2_arg[] = { tkatan2, tkmatch, tkindex, tklshift, tkrshift, 0};
1417   static char builtin_al_2_arg[] = { tkband, tkbor, tkbxor, 0};
1418   static char builtin_2_3_arg[] = { tksub, tkgsub, tksplit, tksubstr, 0};
1419   static char builtin_0_1_arg[] = { tksrand, tklength, tkfflush, 0};
1420 
1421   if (tk == tkrand && num_args)
1422     XERR("function '%s' expected no args, got %d\n", fname, num_args);
1423   else if (strchr(builtin_1_arg, tk) && num_args != 1)
1424     XERR("function '%s' expected 1 arg, got %d\n", fname, num_args);
1425   else if (strchr(builtin_2_arg, tk) && num_args != 2)
1426     XERR("function '%s' expected 2 args, got %d\n", fname, num_args);
1427   else if (strchr(builtin_al_2_arg, tk) && num_args < 2)
1428     XERR("function '%s' expected at least 2 args, got %d\n", fname, num_args);
1429   else if (strchr(builtin_2_3_arg, tk) && num_args != 2 && num_args != 3)
1430     XERR("function '%s' expected 2 or 3 args, got %d\n", fname, num_args);
1431   else if (strchr(builtin_0_1_arg, tk) && num_args != 0 && num_args != 1)
1432     XERR("function '%s' expected no arg or 1 arg, got %d\n", fname, num_args);
1433 }
1434 
builtin_call(int tk,char * builtin_name)1435 static void builtin_call(int tk, char *builtin_name)
1436 {
1437   int num_args = 0;
1438   expect(tklparen);
1439   TT.cgl.paren_level++;
1440   switch (tk) {
1441     case tksub:
1442     case tkgsub:
1443       if (ISTOK(tkregex)) {
1444         gen2cd(tkregex, make_literal_regex_val(TT.tokstr));
1445         scan();
1446       } else expr(0);
1447       expect(tkcomma);
1448       optional_nl();
1449       expr(0);
1450       if (have_comma()) {
1451         lvalue();
1452       } else {
1453         gen2cd(tknumber, make_literal_num_val(0));
1454         gen2cd(opfldref, tkeof);
1455       }
1456       num_args = 3;
1457       break;
1458 
1459     case tkmatch:
1460       expr(0);
1461       expect(tkcomma);
1462       optional_nl();
1463       if (ISTOK(tkregex)) {
1464         gen2cd(tkregex, make_literal_regex_val(TT.tokstr));
1465         scan();
1466       } else expr(0);
1467       num_args = 2;
1468       break;
1469 
1470     case tksplit:
1471       expr(0);
1472       expect(tkcomma);
1473       optional_nl();
1474       if (ISTOK(tkvar) && (TT.scs->ch == ',' || TT.scs->ch == ')')) {
1475         map_name();
1476         scan();
1477       } else {
1478         XERR("%s\n", "expected array name as split() 2nd arg");
1479         expr(0);
1480       }
1481       // FIXME some recovery needed here!?
1482       num_args = 2;
1483       if (have_comma()) {
1484         if (ISTOK(tkregex)) {
1485           gen2cd(tkregex, make_literal_regex_val(TT.tokstr));
1486           scan();
1487         } else expr(0);
1488         num_args++;
1489       }
1490       break;
1491 
1492     case tklength:
1493       if (ISTOK(tkvar) && (TT.scs->ch == ',' || TT.scs->ch == ')')) {
1494         gen2cd(tkvar, find_or_add_var_name());
1495         scan();
1496         num_args++;
1497       }
1498       ATTR_FALLTHROUGH_INTENDED;
1499 
1500     default:
1501       if (ISTOK(tkrparen)) break;
1502       do {
1503         expr(0);
1504         num_args++;
1505       } while (have_comma());
1506       break;
1507   }
1508   expect(tkrparen);
1509   TT.cgl.paren_level--;
1510 
1511   check_builtin_arg_counts(tk, num_args, builtin_name);
1512 
1513   gen2cd(tk, num_args);
1514 }
1515 
function_call(void)1516 static void function_call(void)
1517 {
1518   // Function call: generate TT.zcode to:
1519   //  push placeholder for return value, push placeholder for return addr,
1520   //  push args, then push number of args, then:
1521   //      for builtins: gen opcode (e.g. tkgsub)
1522   //      for user func: gen (tkfunc, number-of-args)
1523   int functk = 0, funcnum = 0;
1524   char builtin_name[16];  // be sure it's long enough for all builtins
1525   if (ISTOK(tkbuiltin)) {
1526     functk = TT.scs->tokbuiltin;
1527     strcpy(builtin_name, TT.tokstr);
1528   } else if (ISTOK(tkfunc)) { // user function
1529     funcnum = find_func_def_entry(TT.tokstr);
1530     if (!funcnum) funcnum = add_func_def_entry(TT.tokstr);
1531     FUNC_DEF[funcnum].flags |= FUNC_CALLED;
1532     gen2cd(opprepcall, funcnum);
1533   } else error_exit("bad function %s!", TT.tokstr);
1534   scan();
1535   // length() can appear without parens
1536   int num_args = 0;
1537   if (functk == tklength && !ISTOK(tklparen)) {
1538     gen2cd(functk, 0);
1539     return;
1540   }
1541   if (functk) {   // builtin
1542     builtin_call(functk, builtin_name);
1543     return;
1544   }
1545   expect(tklparen);
1546   TT.cgl.paren_level++;
1547   if (ISTOK(tkrparen)) {
1548     scan();
1549   } else {
1550     do {
1551       if (ISTOK(tkvar) && (TT.scs->ch == ',' || TT.scs->ch == ')')) {
1552         // Function call arg that is a lone variable. Cannot tell in this
1553         // context if it is a scalar or map. Just add it to symbol table.
1554         gen2cd(tkvar, find_or_add_var_name());
1555         scan();
1556       } else expr(0);
1557       num_args++;
1558     } while (have_comma());
1559     expect(tkrparen);
1560   }
1561   TT.cgl.paren_level--;
1562   gen2cd(tkfunc, num_args);
1563 }
1564 
var(void)1565 static void var(void)
1566 {
1567   // var name is in TT.tokstr
1568   // slotnum: + means global; - means local to function
1569   int slotnum = find_or_add_var_name();
1570   scan();
1571   if (havetok(tklbracket)) {
1572     check_set_map(slotnum);
1573     int num_subscripts = 0;
1574     do {
1575       expr(0);
1576       num_subscripts++;
1577     } while (have_comma());
1578     expect(tkrbracket);
1579     if (num_subscripts > 1) gen2cd(tkrbracket, num_subscripts);
1580     gen2cd(opmap, slotnum);
1581   } else {
1582     check_set_scalar(slotnum);
1583     gen2cd(tkvar, slotnum);
1584   }
1585 }
1586 
//   Dollar $ tkfield can be followed by "any" expression, but
//   the way it binds varies.
1589 //   The following are valid lvalues:
1590 //   $ ( expr )
1591 //   $ tkvar $ tknumber $ tkstring $ tkregex
1592 //   $ tkfunc(...)
1593 //   $ tkbuiltin(...)
1594 //   $ length   # with no parens after
1595 //   $ tkclose(), ... $ tksubstr
1596 //   $ tkgetline FIXME TODO TEST THIS
1597 //   $ ++ lvalue
1598 //   $ -- lvalue
1599 //   $ + expression_up_to_exponentiation (also -, ! prefix ops)
1600 //   $ $ whatever_can_follow_and_bind_to_dollar
1601 //
1602 //     tkvar, tknumber, tkstring, tkregex, tkfunc, tkbuiltin, tkfield, tkminus,
1603 //     tkplus, tknot, tkincr, tkdecr, tklparen, tkgetline,
1604 //     tkclose, tkindex, tkmatch, tksplit, tksub, tkgsub, tksprintf, tksubstr
1605 //
1606 // ray@radon:~$ awk 'BEGIN { $0 = "7 9 5 8"; k=2; print $k*k }'
1607 // 18
1608 // ray@radon:~$ awk 'BEGIN { $0 = "7 9 5 8"; k=2; print $+k*k }'
1609 // 18
1610 // ray@radon:~$ awk 'BEGIN { $0 = "7 9 5 8"; k=2; print $k^k }'
1611 // 81
1612 // ray@radon:~$ awk 'BEGIN { $0 = "7 9 5 8"; k=2; print $+k^k }'
1613 // 8
1614 
field_op(void)1615 static void field_op(void)
1616 {
1617   // CURTOK() must be $ here.
1618   expect(tkfield);
1619   // tkvar, tknumber, tkstring, tkregex, tkfunc, tkbuiltin, tkfield, tkminus,
1620   // tkplus, tknot, tkincr, tkdecr, tklparen, tkgetline, tkclose, tkindex,
1621   // tkmatch, tksplit, tksub, tkgsub, tksprintf, tksubstr
1622   if (ISTOK(tkfield)) field_op();
1623   else if (ISTOK(tkvar)) var();
1624   else primary();
1625   // tkfield op has "dummy" 2nd word so that convert_push_to_reference(void)
1626   // can find either tkfield or tkvar at same place (ZCODE[TT.zcode_last-1]).
1627   gen2cd(tkfield, tkeof);
1628 }
1629 
1630 // Tokens that can start expression
1631 static char exprstartsy[] = {tkvar, tknumber, tkstring, tkregex, tkfunc,
1632   tkbuiltin, tkfield, tkminus, tkplus, tknot, tkincr, tkdecr, tklparen,
1633   tkgetline, tkclose, tkindex, tkmatch, tksplit, tksub, tkgsub, tksprintf,
1634   tksubstr, tkband, tkbor, tkbxor, tkrshift, tklshift, 0};
1635 
1636 // Tokens that can end statement
1637 static char stmtendsy[] = {tknl, tksemi, tkrbrace, 0};
1638 
1639 // Tokens that can follow expressions of a print statement
1640 static char printexprendsy[] = {tkgt, tkappend, tkpipe, tknl, tksemi, tkrbrace, 0};
1641 
1642 // !! Ensure this:
1643 // ternary op is right associative, so
1644 // a ? b : c ? d : e        evaluates as
1645 // a ? b : (c ? d : e)      not as
1646 // (a ? b : c) ? d : e
1647 
convert_push_to_reference(void)1648 static void convert_push_to_reference(void)
1649 {
1650   if (ZCODE[TT.zcode_last - 1] == tkvar) ZCODE[TT.zcode_last-1] = opvarref;
1651   else if (ZCODE[TT.zcode_last - 1] == opmap) ZCODE[TT.zcode_last - 1] = opmapref;
1652   else if (ZCODE[TT.zcode_last - 1] == tkfield) ZCODE[TT.zcode_last - 1] = opfldref;
1653   else error_exit("bad lvalue?");
1654 }
1655 
lvalue(void)1656 static void lvalue(void)
1657 {
1658   if (ISTOK(tkfield)) {
1659     field_op();
1660     convert_push_to_reference();
1661   } else if (ISTOK(tkvar)) {
1662     var();
1663     convert_push_to_reference();
1664   } else {
1665     XERR("syntax near '%s' (bad lvalue)\n", TT.tokstr);
1666   }
1667 }
1668 
primary(void)1669 static int primary(void)
1670 {
1671   //  On entry: CURTOK() is first token of expression
1672   //  On exit: CURTOK() is infix operator (for binary_op() to handle) or next
1673   //   token after end of expression.
1674   //  return -1 for field or var (potential lvalue);
1675   //      2 or more for comma-separated expr list
1676   //          as in "multiple subscript expression in array"
1677   //          e.g. (1, 2) in array_name, or a print/printf list;
1678   //      otherwise return 0
1679   //
1680   //  expr can start with:
1681   //      tkvar, tknumber, tkstring, tkregex, tkfunc, tkbuiltin, tkfield, tkminus,
1682   //      tkplus, tknot, tkincr, tkdecr, tklparen, tkgetline, tkclose, tkindex,
1683   //      tkmatch, tksplit, tksub, tkgsub, tksprintf, tksubstr
1684   //
1685   //  bwk treats these as keywords, not builtins: close index match split sub gsub
1686   //      sprintf substr
1687   //
1688   //  bwk builtins are: atan2 cos sin exp log sqrt int rand srand length tolower
1689   //      toupper system fflush
1690   //  NOTE: fflush() is NOT in POSIX awk
1691   //
1692   //  primary() must consume prefix and postfix operators as well as
1693   //      num, string, regex, var, var with subscripts, and function calls
1694 
1695   int num_exprs = 0;
1696   int nargs, modifier;
1697   int tok = CURTOK();
1698   switch (tok) {
1699     case tkvar:
1700     case tkfield:
1701       if (ISTOK(tkvar)) var();
1702       else field_op();
1703       if (ISTOK(tkincr) || ISTOK(tkdecr)) {
1704         convert_push_to_reference();
1705         gencd(CURTOK());
1706         scan();
1707       } else return -1;
1708       break;
1709 
1710     case tknumber:
1711       gen2cd(tknumber, make_literal_num_val(TT.scs->numval));
1712       scan();
1713       break;
1714 
1715     case tkstring:
1716       gen2cd(tkstring, make_literal_str_val(TT.tokstr));
1717       scan();
1718       break;
1719 
1720     case tkregex:
1721       // When an ERE token appears as an expression in any context other
1722       // than as the right-hand of the '~' or "!~" operator or as one of
1723       // the built-in function arguments described below, the value of
1724       // the resulting expression shall be the equivalent of: $0 ~ /ere/
1725       // FIXME TODO
1726       gen2cd(opmatchrec, make_literal_regex_val(TT.tokstr));
1727       scan();
1728       break;
1729 
1730     case tkbuiltin: // various builtins
1731     case tkfunc:    // user-defined function
1732       function_call();
1733       break;
1734 
1735     // Unary prefix ! + -
1736     case tknot:
1737     case tkminus:
1738     case tkplus:
1739       scan();
1740       expr(getlbp(tknot));   // unary +/- same precedence as !
1741       if (tok == tknot) gencd(tknot);
1742       else gencd(opnegate);               // forces to number
1743       if (tok == tkplus) gencd(opnegate); // forces to number
1744       break;
1745 
1746       // Unary prefix ++ -- MUST take lvalue
1747     case tkincr:
1748     case tkdecr:
1749       scan();
1750       lvalue();
1751       if (tok == tkincr) gencd(oppreincr);
1752       else gencd(oppredecr);
1753       break;
1754 
1755     case tklparen:
1756       scan();
1757       TT.cgl.paren_level++;
1758       num_exprs = 0;
1759       do {
1760         expr(0);
1761         num_exprs++;
1762       } while (have_comma());
1763       expect(tkrparen);
1764       TT.cgl.paren_level--;
1765       if (num_exprs > 1) return num_exprs;
1766       break;
1767 
1768     case tkgetline:
1769       // getline may be (according to awk book):
1770       // getline [var [<file]]
1771       // getline <file
1772       // cmd | getline [var]
1773       // var must be lvalue (can be any lvalue?)
1774       scan();
1775       nargs = 0;
1776       modifier = tkeof;
1777       if (ISTOK(tkfield) || ISTOK(tkvar)) {
1778         lvalue();
1779         nargs++;
1780       }
1781       if (havetok(tklt)) {
1782         expr(getrbp(tkcat));   // bwk "historical practice" precedence
1783         nargs++;
1784         modifier = tklt;
1785       }
1786       gen2cd(tkgetline, nargs);
1787       gencd(modifier);
1788       break;
1789 
1790     default:
1791       XERR("syntax near '%s'\n", TT.tokstr[0] == '\n' ? "\\n" : TT.tokstr);
1792       skip_to(stmtendsy);
1793       break;
1794   }
1795   return 0;
1796 }
1797 
// Parse the right operand of binary operator optor and emit code for the
// operation. Called from expr() after the left operand has been parsed.
// The right operand is parsed with the operator's right binding power
// (getrbp(optor)), except where a case below says otherwise.
static void binary_op(int optor)  // Also for ternary ?: optor.
{
  int nargs, cdx = 0;  // index in TT.zcode list
  int rbp = getrbp(optor);
  // expr() substitutes tkcat when the current token starts a new operand, so
  // for concatenation there is no operator token to skip.
  if (optor != tkcat) scan();
  // CURTOK() holds first token of right operand.
  switch (optor) {
    case tkin:
      // right side of 'in' must be (only) an array name
      map_name();
      gencd(tkin);
      scan();
      // FIXME TODO 20230109 x = y in a && 2 works OK?
      // x = y in a + 2 does not; it's parsed as x = (y in a) + 2
      // The +2 is not cat'ed with (y in a) as in bwk's OTA.
      // Other awks see y in a + 2 as a syntax error. They (may)
      // not want anything after y in a except a lower binding operator
      // (&& || ?:) or end of expression, i.e. ')' ';' '}'
      break;

  case tkpipe:
      // cmd | getline [var]: the command is one argument, the optional
      // lvalue a second one.
      expect(tkgetline);
      nargs = 1;
      if (ISTOK(tkfield) || ISTOK(tkvar)) {
        lvalue();
        nargs++;
      }
      gen2cd(tkgetline, nargs);
      gencd(tkpipe);
      break;

  case tkand:
  case tkor:
      // Short circuit: emit the operator with a placeholder offset (-1),
      // remember its slot in cdx, and patch it once the right operand's
      // code has been generated.
      optional_nl();
      gen2cd(optor, -1);  // tkand: jump if false, else drop
      cdx = TT.zcode_last;   // tkor:  jump if true, else drop
      expr(rbp);
      gencd(opnotnot);    // replace TT.stack top with truth value
      ZCODE[cdx] = TT.zcode_last - cdx;
      break;

  case tkternif:
      // cond ? a : b -- the tkternif offset is patched after tkternelse has
      // been emitted; the tkternelse offset is patched after b.
      gen2cd(optor, -1);
      cdx = TT.zcode_last;
      expr(0);
      expect(tkternelse);
      gen2cd(tkternelse, -1);
      ZCODE[cdx] = TT.zcode_last - cdx;
      cdx = TT.zcode_last;
      expr(rbp);
      ZCODE[cdx] = TT.zcode_last - cdx;
      break;

  case tkmatchop:
  case tknotmatch:
      expr(rbp);
      // If the word before the last emitted one is opmatchrec, rewrite it as
      // tkregex (presumably so the regex becomes the operand of ~ / !~ rather
      // than an implicit match -- confirm against the opmatchrec runtime code).
      if (ZCODE[TT.zcode_last - 1] == opmatchrec) ZCODE[TT.zcode_last - 1] = tkregex;
      gencd(optor);
      break;

  default:
      // Ordinary binary operator: right operand, then the operator itself.
      expr(rbp);
      gencd(optor);
  }
}
1863 
cat_start_concated_expr(int tok)1864 static int cat_start_concated_expr(int tok)
1865 {
1866   // concat'ed expr can start w/ var number string func builtin $ ! ( (or ++ if prev was not lvalue)
1867   static char exprstarttermsy[] = {tkvar, tknumber, tkstring, tkregex, tkfunc, tkbuiltin,
1868     tkfield, tknot, tkincr, tkdecr, tklparen, tkgetline, 0};
1869 
1870   // NOTE this depends on builtins (close etc) being >= tkgetline
1871   return !! strchr(exprstarttermsy, tok) || tok >= tkgetline;
1872 }
1873 
1874 #define CALLED_BY_PRINT 99987 // Arbitrary, different from any real rbp value
1875 
// Parse an expression and emit its code, consuming binary operators for as
// long as their left binding power (getlbp()) exceeds rbp.
// Returns 0, except when called with rbp == CALLED_BY_PRINT and a
// parenthesized expression list is immediately followed by an end-of-print
// token: then it returns the number of expressions in that list.
static int expr(int rbp)
{
  // On entry: TT.scs has first symbol of expression, e.g. var, number, string,
  // regex, func, getline, left paren, prefix op ($ ++ -- ! unary + or -) etc.
  static char asgnops[] = {tkpowasgn, tkmodasgn, tkmulasgn, tkdivasgn,
    tkaddasgn, tksubasgn, tkasgn, 0};
  // prim_st > 0 is the count of a parenthesized multi-expression list; the
  // assignment test below requires prim_st < 0 (presumably primary()'s marker
  // for an lvalue -- that part of primary() is not visible here).
  int prim_st = primary();
  // If called directly by print_stmt(), and found a parenthesized expression list
  //    followed by an end of print statement: any of > >> | ; } <newline>
  //    Then: return the count of expressions in list
  //    Else: continue parsing an expression
  if (rbp == CALLED_BY_PRINT) {
    if (prim_st > 0 && strchr(printexprendsy, CURTOK())) return prim_st;
    else rbp = 0;
  }

  // mult_expr_list in parens must be followed by 'in' unless it
  // immediately follows print or printf, where it may still be followed
  // by 'in' ... unless at end of statement
  if (prim_st > 0 && ! ISTOK(tkin))
    XERR("syntax near '%s'; expected 'in'\n", TT.tokstr);
  // Emit tkrbracket with the expression count, the same opcode delete_stmt()
  // emits for a multi-part subscript.
  if (prim_st > 0) gen2cd(tkrbracket, prim_st);
  // primary() has eaten subscripts, function args, postfix ops.
  // CURTOK() should be a binary op.
  int optor = CURTOK();
  if (strchr(asgnops, optor)) {

    // TODO FIXME ?  NOT SURE IF THIS WORKS RIGHT!
    // awk does not parse according to POSIX spec in some odd cases.
    // When an assignment (lvalue =) is on the right of certain operators,
    // it is not treated as a bad lvalue (as it is in C).
    // Example: (1 && a=2) # no error; the assignment is performed.
    // This happens for ?: || && ~ !~ < <= ~= == > >=
    //
    static char odd_assignment_rbp[] = {59, 60, 70, 80, 100, 110, 0};
    if (prim_st < 0 && (rbp <= getrbp(optor) || strchr(odd_assignment_rbp, rbp))) {
      // Assignment: turn the operand's push into a reference, parse the right
      // side at the operator's own right binding power, then emit the operator.
      convert_push_to_reference();
      scan();
      expr(getrbp(optor));
      gencd(optor);
      return 0;
    }
    XERR("syntax near '%s'\n", TT.tokstr[0] == '\n' ? "\\n" : TT.tokstr);
    skip_to(stmtendsy);
  }
  // A token that can start an operand here means implicit concatenation.
  if (cat_start_concated_expr(optor)) optor = tkcat;
  while (rbp < getlbp(optor)) {
    binary_op(optor);
    // HERE tok s/b an operator or expression terminator ( ; etc.).
    optor = CURTOK();
    if (cat_start_concated_expr(optor)) optor = tkcat;
  }
  return 0;
}
1930 
print_stmt(int tk)1931 static void print_stmt(int tk)
1932 {
1933   static char outmodes[] = {tkgt, tkappend, tkpipe, 0};
1934   int num_exprs = 0, outmode;
1935   TT.cgl.in_print_stmt = 1;
1936   expect(tk); // tkprint or tkprintf
1937   if ((tk == tkprintf) || !strchr(printexprendsy, CURTOK())) {
1938     // printf always needs expression
1939     // print non-empty statement needs expression
1940     num_exprs = expr(CALLED_BY_PRINT);
1941     if (num_exprs > 0 && !strchr(printexprendsy, CURTOK())) FATAL("print stmt bug");
1942     if (!num_exprs) {
1943       for (num_exprs++; have_comma(); num_exprs++)
1944         expr(0);
1945     }
1946   }
1947   outmode = CURTOK();
1948   if (strchr(outmodes, outmode)) {
1949     scan();
1950     expr(0); // FIXME s/b only bwk term? check POSIX
1951     num_exprs++;
1952   } else outmode = 0;
1953   gen2cd(tk, num_exprs);
1954   gencd(outmode);
1955   TT.cgl.in_print_stmt = 0;
1956 }
1957 
delete_stmt(void)1958 static void delete_stmt(void)
1959 {
1960   expect(tkdelete);
1961   if (ISTOK(tkvar)) {
1962     int slotnum = find_or_add_var_name();
1963     check_set_map(slotnum);
1964     scan();
1965     if (havetok(tklbracket)) {
1966       int num_subscripts = 0;
1967       do {
1968         expr(0);
1969         num_subscripts++;
1970       } while (have_comma());
1971       expect(tkrbracket);
1972       if (num_subscripts > 1) gen2cd(tkrbracket, num_subscripts);
1973       gen2cd(opmapref, slotnum);
1974       gencd(tkdelete);
1975     } else {
1976       // delete entire map (elements only; var is still a map)
1977       gen2cd(opmapref, slotnum);
1978       gencd(opmapdelete);
1979     }
1980   } else expect(tkvar);
1981 }
1982 
simple_stmt(void)1983 static void simple_stmt(void)
1984 {
1985   if (strchr(exprstartsy, CURTOK())) {
1986     expr(0);
1987     gencd(opdrop);
1988     return;
1989   }
1990   switch (CURTOK()) {
1991     case tkprint:
1992     case tkprintf:
1993       print_stmt(CURTOK());
1994       break;
1995 
1996     case tkdelete:
1997       delete_stmt();
1998       break;
1999 
2000     default:
2001       XERR("syntax near '%s'\n", TT.tokstr[0] == '\n' ? "\\n" : TT.tokstr);
2002       skip_to(stmtendsy);
2003   }
2004 }
2005 
prev_was_terminated(void)2006 static int prev_was_terminated(void)
2007 {
2008   return !!strchr(stmtendsy, TT.prevtok);
2009 }
2010 
is_nl_semi(void)2011 static int is_nl_semi(void)
2012 {
2013   return ISTOK(tknl) || ISTOK(tksemi);
2014 }
2015 
// Compile "if (cond) stmt [else stmt]".
// tkif is emitted with a placeholder offset that is patched to the location
// after the 'then' part; when an else is present, tkelse is emitted with its
// own placeholder, patched after the 'else' part.
static void if_stmt(void)
{
  expect(tkif);
  expect(tklparen);
  expr(0);
  rparen();
  gen2cd(tkif, -1);
  int cdx = TT.zcode_last;  // slot holding the offset still to be patched
  stmt();
  // The 'then' statement was left unterminated: consume its newline or ';'
  // so that a following 'else' can be seen.
  if (!prev_was_terminated() && is_nl_semi()) {
    scan();
    optional_nl();
  }
  // 'else' is only looked for after a terminated statement.
  if (prev_was_terminated()) {
    optional_nl();
    if (havetok(tkelse)) {
      gen2cd(tkelse, -1);
      // Patch the tkif offset now that tkelse has been emitted, then track
      // the tkelse slot instead.
      ZCODE[cdx] = TT.zcode_last - cdx;
      cdx = TT.zcode_last;
      optional_nl();
      stmt();
    }
  }
  ZCODE[cdx] = TT.zcode_last - cdx;
}
2041 
save_break_continue(int * brk,int * cont)2042 static void save_break_continue(int *brk, int *cont)
2043 {
2044   *brk = TT.cgl.break_dest;
2045   *cont = TT.cgl.continue_dest;
2046 }
2047 
restore_break_continue(int * brk,int * cont)2048 static void restore_break_continue(int *brk, int *cont)
2049 {
2050   TT.cgl.break_dest = *brk;
2051   TT.cgl.continue_dest = *cont;
2052 }
2053 
// Compile "while (cond) stmt".
// Emitted in order: the condition; tkwhile with operand 2; an opjump that is
// patched to land after the loop (this is the break destination); the body;
// an opjump back to the condition (the continue destination).
static void while_stmt(void)
{
  int brk, cont;
  // Remember the enclosing loop's destinations; restored before returning.
  save_break_continue(&brk, &cont);
  expect(tkwhile);
  expect(tklparen);
  // continue re-enters at the first word of the condition code.
  TT.cgl.continue_dest = TT.zcode_last + 1;
  expr(0);
  rparen();
  gen2cd(tkwhile, 2);    // drop, jump if true
  TT.cgl.break_dest = TT.zcode_last + 1;
  gen2cd(opjump, -1);     // jump here to break
  stmt();
  gen2cd(opjump, -1);     // jump to continue
  // Patch the backward jump just emitted, then the break jump's operand.
  ZCODE[TT.zcode_last] = TT.cgl.continue_dest - TT.zcode_last - 1;
  ZCODE[TT.cgl.break_dest + 1] = TT.zcode_last - TT.cgl.break_dest - 1;
  restore_break_continue(&brk, &cont);
}
2072 
// Compile "do stmt while (cond)".
// Two opjump trampolines are emitted ahead of the body: one that continue
// jumps to (patched to the condition code) and one that break jumps to
// (patched to the end of the loop). A leading opjump skips both on entry.
static void do_stmt(void)
{
  int brk, cont;
  // Remember the enclosing loop's destinations; restored before returning.
  save_break_continue(&brk, &cont);
  expect(tkdo);
  optional_nl();
  gen2cd(opjump, 4);   // jump over jumps, to statement
  TT.cgl.continue_dest = TT.zcode_last + 1;
  gen2cd(opjump, -1);   // here on continue
  TT.cgl.break_dest = TT.zcode_last + 1;
  gen2cd(opjump, -1);   // here on break
  stmt();
  // The body must be followed by ';' or newline unless already terminated.
  if (!prev_was_terminated()) {
    if (is_nl_semi()) {
      scan();
      optional_nl();
    } else {
      XERR("syntax near '%s' -- ';' or newline expected\n", TT.tokstr);
      // FIXME
    }
  }
  // The condition code starts next: patch the continue trampoline to reach it.
  ZCODE[TT.cgl.continue_dest + 1] = TT.zcode_last - TT.cgl.continue_dest - 1;
  optional_nl();
  expect(tkwhile);
  expect(tklparen);
  expr(0);
  rparen();
  // Backward jump taken while the condition is true; the offset is computed
  // relative to break_dest (the trampoline just ahead of the body).
  gen2cd(tkwhile, TT.cgl.break_dest - TT.zcode_last - 1);
  // Patch the break trampoline to land after the loop.
  ZCODE[TT.cgl.break_dest + 1] = TT.zcode_last - TT.cgl.break_dest - 1;
  restore_break_continue(&brk, &cont);
}
2104 
// Compile the rest of a C-style for loop: "cond; incr) stmt".
// The caller (for_stmt()) has already handled the initialization part and
// the ';' after it.
// Emitted in order: condition (or an unconditional jump when it is empty),
// break jump, increment, jump back to the condition, body, jump back to the
// increment.
static void for_not_map_iter(void)
{
  // Here after loop initialization, if any; loop condition
  int condition_loc = TT.zcode_last + 1;
  if (havetok(tksemi)) {
    // "endless" loop variant; no condition
    // no NL allowed here in OTA
    gen2cd(opjump, -1);     // jump to statement
  } else {
    optional_nl();                // NOT posix or awk book; in OTA
    expr(0);                 // loop while true
    expect(tksemi);
    gen2cd(tkwhile, -1);    // drop, jump to statement if true
  }
  optional_nl();                    // NOT posix or awk book; in OTA
  TT.cgl.break_dest = TT.zcode_last + 1;
  gen2cd(opjump, -1);   // break lands here; operand patched after the body
  TT.cgl.continue_dest = TT.zcode_last + 1;  // continue runs the increment
  if (!ISTOK(tkrparen)) simple_stmt();  // "increment"
  gen2cd(opjump, condition_loc - TT.zcode_last - 3);  // back to the condition
  rparen();
  // Patch the placeholder emitted just before break_dest (the opjump or
  // tkwhile above) so it reaches the body, which starts next.
  ZCODE[TT.cgl.break_dest - 1] = TT.zcode_last - TT.cgl.break_dest + 1;
  stmt();
  gen2cd(opjump, TT.cgl.continue_dest - TT.zcode_last - 3);  // to increment
  // Patch the break jump to land after the loop.
  ZCODE[TT.cgl.break_dest + 1] = TT.zcode_last - TT.cgl.break_dest - 1;
}
2131 
valid_for_array_iteration(int first,int last)2132 static int valid_for_array_iteration(int first, int last)
2133 {
2134   return ZCODE[first] == tkvar && ZCODE[first + 2] == tkvar
2135       && ZCODE[first + 4] == tkin && ZCODE[first + 5] == opdrop
2136       && first + 5 == last;
2137 }
2138 
// Compile a for statement: either the C-style "for (init; cond; incr) stmt"
// (finished by for_not_map_iter()) or the map iteration
// "for (var in array) stmt".
static void for_stmt(void)
{
  int brk, cont;
  // Remember the enclosing loop's destinations; restored before returning.
  save_break_continue(&brk, &cont);
  expect(tkfor);
  expect(tklparen);
  if (havetok(tksemi)) {
    // No "initialization" part
    for_not_map_iter();
  } else {
    int loop_start_loc = TT.zcode_last + 1;
    simple_stmt();  // initialization part, OR varname in arrayname form
    if (!havetok(tkrparen)) {
      expect(tksemi);
      for_not_map_iter();
    } else {
      // Must be map iteration
      // Check here for varname in varname!
      // FIXME TODO must examine generated TT.zcode for var in array?
      if (!valid_for_array_iteration(loop_start_loc, TT.zcode_last))
        XERR("%s", "bad 'for (var in array)' loop\n");
      else {
        // Rewrite the code simple_stmt() just emitted
        // (tkvar n tkvar n tkin opdrop): the first tkvar becomes opvarref
        // and the trailing "tkin opdrop" becomes "tknumber <literal -1>".
        ZCODE[TT.zcode_last-5] = opvarref;
        ZCODE[TT.zcode_last-1] = tknumber;
        ZCODE[TT.zcode_last] = make_literal_num_val(-1);
        TT.cgl.continue_dest = TT.zcode_last + 1;
        gen2cd(opmapiternext, 2);
        TT.cgl.break_dest = TT.zcode_last + 1;
        gen2cd(opjump, -1);   // fill in with loc after stmt
      }
      optional_nl();
      // fixup TT.stack if return or exit inside for (var in array)
      TT.cgl.stack_offset_to_fix += 3;
      stmt();
      TT.cgl.stack_offset_to_fix -= 3;
      gen2cd(opjump, TT.cgl.continue_dest - TT.zcode_last - 3);
      ZCODE[TT.cgl.break_dest + 1] = TT.zcode_last - TT.cgl.break_dest - 1;
      // Three drops on loop exit, matching the stack_offset_to_fix of 3 above.
      gencd(opdrop);
      gencd(opdrop);
      gencd(opdrop);
    }
  }
  restore_break_continue(&brk, &cont);
}
2183 
stmt(void)2184 static void stmt(void)
2185 {
2186   switch (CURTOK()) {
2187     case tkeof:
2188       break;     // FIXME ERROR?
2189 
2190     case tkbreak:
2191       scan();
2192       if (TT.cgl.break_dest) gen2cd(tkbreak, TT.cgl.break_dest - TT.zcode_last - 3);
2193       else XERR("%s", "break not in a loop\n");
2194       break;
2195 
2196     case tkcontinue:
2197       scan();
2198       if (TT.cgl.continue_dest)
2199         gen2cd(tkcontinue, TT.cgl.continue_dest - TT.zcode_last - 3);
2200       else XERR("%s", "continue not in a loop\n");
2201       break;
2202 
2203     case tknext:
2204       scan();
2205       gencd(tknext);
2206       if (TT.cgl.rule_type) XERR("%s", "next inside BEGIN or END\n");
2207       if (TT.cgl.in_function_body) XERR("%s", "next inside function def\n");
2208       break;
2209 
2210     case tknextfile:
2211       scan();
2212       gencd(tknextfile);
2213       if (TT.cgl.rule_type) XERR("%s", "nextfile inside BEGIN or END\n");
2214       if (TT.cgl.in_function_body) XERR("%s", "nextfile inside function def\n");
2215       break;
2216 
2217     case tkexit:
2218       scan();
2219       if (strchr(exprstartsy, CURTOK())) {
2220         expr(0);
2221       } else gen2cd(tknumber, make_literal_num_val(NO_EXIT_STATUS));
2222       gencd(tkexit);
2223       break;
2224 
2225     case tkreturn:
2226       scan();
2227       if (TT.cgl.stack_offset_to_fix) gen2cd(opdrop_n, TT.cgl.stack_offset_to_fix);
2228       if (strchr(exprstartsy, CURTOK())) {
2229         expr(0);
2230       } else gen2cd(tknumber, make_literal_num_val(0.0));
2231       gen2cd(tkreturn, TT.cgl.nparms);
2232       if (!TT.cgl.in_function_body) XERR("%s", "return outside function def\n");
2233       break;
2234 
2235     case tklbrace:
2236       action(tklbrace);
2237       break;
2238 
2239     case tkif:
2240       if_stmt();
2241       break;
2242 
2243     case tkwhile:
2244       while_stmt();
2245       break;
2246 
2247     case tkdo:
2248       do_stmt();
2249       break;
2250 
2251     case tkfor:
2252       for_stmt();
2253       break;
2254 
2255     case tksemi:
2256       scan();
2257       break;
2258     default:
2259       simple_stmt();      // expression print printf delete
2260   }
2261 }
2262 
add_param(int funcnum,char * s)2263 static void add_param(int funcnum, char *s)
2264 {
2265   if (!find_local_entry(s)) add_local_entry(s);
2266   else XERR("function '%s' dup param '%s'\n", FUNC_DEF[funcnum].name, s);
2267   TT.cgl.nparms++;
2268 
2269   // POSIX: The same name shall not be used as both a function parameter name
2270   // and as the name of a function or a special awk variable.
2271   // !!! NOTE seems implementations exc. mawk only compare param names with
2272   // builtin funcs; use same name as userfunc is OK!
2273   if (!strcmp(s, FUNC_DEF[funcnum].name))
2274     XERR("function '%s' param '%s' matches func name\n",
2275         FUNC_DEF[funcnum].name, s);
2276   if (find_global(s) && find_global(s) < TT.spec_var_limit)
2277     XERR("function '%s' param '%s' matches special var\n",
2278         FUNC_DEF[funcnum].name, s);
2279 }
2280 
function_def(void)2281 static void function_def(void)
2282 {
2283   expect(tkfunction);
2284   int funcnum = find_func_def_entry(TT.tokstr);
2285   if (!funcnum) {
2286     funcnum = add_func_def_entry(TT.tokstr);
2287   } else if (FUNC_DEF[funcnum].flags & FUNC_DEFINED) {
2288     XERR("dup defined function '%s'\n", TT.tokstr);
2289   }
2290   FUNC_DEF[funcnum].flags |= FUNC_DEFINED;
2291   if (find_global(TT.tokstr)) {
2292     // POSIX: The same name shall not be used both as a variable name with
2293     // global scope and as the name of a function.
2294     XERR("function name '%s' previously defined\n", TT.tokstr);
2295   }
2296 
2297   gen2cd(tkfunction, funcnum);
2298   FUNC_DEF[funcnum].zcode_addr = TT.zcode_last - 1;
2299   TT.cgl.funcnum = funcnum;
2300   TT.cgl.nparms = 0;
2301   if (ISTOK(tkfunc)) expect(tkfunc); // func name with no space before (
2302   else expect(tkvar);  // func name with space before (
2303   expect(tklparen);
2304   if (ISTOK(tkvar)) {
2305     add_param(funcnum, TT.tokstr);
2306     scan();
2307     // FIXME is the the best way? what if TT.tokstr not a tkvar?
2308     while (have_comma()) {
2309       add_param(funcnum, TT.tokstr);
2310       expect(tkvar);
2311     }
2312   }
2313   rparen();
2314   if (ISTOK(tklbrace)) {
2315     TT.cgl.in_function_body = 1;
2316     action(tkfunc);
2317     TT.cgl.in_function_body = 0;
2318     // Need to return uninit value if falling off end of function.
2319     gen2cd(tknumber, make_uninit_val());
2320     gen2cd(tkreturn, TT.cgl.nparms);
2321   } else {
2322     XERR("syntax near '%s'\n", TT.tokstr);
2323     // FIXME some recovery needed here!?
2324   }
2325   // Do not re-init locals table for dup function.
2326   // Avoids memory leak detected by LeakSanitizer.
2327   if (!FUNC_DEF[funcnum].function_locals.base) {
2328     FUNC_DEF[funcnum].function_locals = TT.locals_table;
2329     init_locals_table();
2330   }
2331 }
2332 
action(int action_type)2333 static void action(int action_type)
2334 {
2335 (void)action_type;
2336   // action_type is tkbegin, tkend, tkdo (every line), tkif (if pattern),
2337   //                  tkfunc (function body), tklbrace (compound statement)
2338   // Should have lbrace on entry.
2339   expect(tklbrace);
2340   for (;;) {
2341     if (ISTOK(tkeof)) unexpected_eof();
2342     optional_nl_or_semi();
2343     if (havetok(tkrbrace)) {
2344       break;
2345     }
2346     stmt();
2347     // stmt() is normally unterminated here, but may be terminated if we
2348     // have if with no else (had to consume terminator looking for else)
2349     //   !!!   if (ISTOK(tkrbrace) || prev_was_terminated())
2350     if (prev_was_terminated()) continue;
2351     if (!is_nl_semi() && !ISTOK(tkrbrace)) {
2352       XERR("syntax near '%s' -- newline, ';', or '}' expected\n", TT.tokstr);
2353       while (!is_nl_semi() && !ISTOK(tkrbrace) && !ISTOK(tkeof)) scan();
2354       if (ISTOK(tkeof)) unexpected_eof();
2355     }
2356     if (havetok(tkrbrace)) break;
2357     // Must be semicolon or newline
2358     scan();
2359   }
2360 }
2361 
rule(void)2362 static void rule(void)
2363 {
2364   //       pa_pat
2365   //     | pa_pat lbrace stmtlist '}'
2366   //     | pa_pat ',' opt_nl pa_pat
2367   //     | pa_pat ',' opt_nl pa_pat lbrace stmtlist '}'
2368   //     | lbrace stmtlist '}'
2369   //     | XBEGIN lbrace stmtlist '}'
2370   //     | XEND lbrace stmtlist '}'
2371   //     | FUNC funcname '(' varlist rparen  lbrace stmtlist '}'
2372 
2373   switch (CURTOK()) {
2374     case tkbegin:
2375       scan();
2376       if (TT.cgl.last_begin) ZCODE[TT.cgl.last_begin] = TT.zcode_last - TT.cgl.last_begin;
2377       else TT.cgl.first_begin = TT.zcode_last + 1;
2378 
2379       TT.cgl.rule_type = tkbegin;
2380       action(tkbegin);
2381       TT.cgl.rule_type = 0;
2382       gen2cd(opjump, -1);
2383       TT.cgl.last_begin = TT.zcode_last;
2384       break;
2385 
2386     case tkend:
2387       scan();
2388       if (TT.cgl.last_end) ZCODE[TT.cgl.last_end] = TT.zcode_last - TT.cgl.last_end;
2389       else TT.cgl.first_end = TT.zcode_last + 1;
2390 
2391       TT.cgl.rule_type = tkbegin;
2392       action(tkend);
2393       TT.cgl.rule_type = 0;
2394       gen2cd(opjump, -1);
2395       TT.cgl.last_end = TT.zcode_last;
2396       break;
2397 
2398     case tklbrace:
2399       if (TT.cgl.last_recrule)
2400         ZCODE[TT.cgl.last_recrule] = TT.zcode_last - TT.cgl.last_recrule;
2401       else TT.cgl.first_recrule = TT.zcode_last + 1;
2402       action(tkdo);
2403       gen2cd(opjump, -1);
2404       TT.cgl.last_recrule = TT.zcode_last;
2405       break;
2406 
2407     case tkfunction:
2408       function_def();
2409       break;
2410     default:
2411       if (TT.cgl.last_recrule)
2412         ZCODE[TT.cgl.last_recrule] = TT.zcode_last - TT.cgl.last_recrule;
2413       else TT.cgl.first_recrule = TT.zcode_last + 1;
2414       gen2cd(opjump, 1);
2415       gencd(tkeof);
2416       int cdx = 0, saveloc = TT.zcode_last;
2417       expr(0);
2418       if (!have_comma()) {
2419         gen2cd(tkif, -1);
2420         cdx = TT.zcode_last;
2421       } else {
2422         gen2cd(oprange2, ++TT.cgl.range_pattern_num);
2423         gencd(-1);
2424         cdx = TT.zcode_last;
2425         ZCODE[saveloc-2] = oprange1;
2426         ZCODE[saveloc-1] = TT.cgl.range_pattern_num;
2427         ZCODE[saveloc] = TT.zcode_last - saveloc;
2428         expr(0);
2429         gen2cd(oprange3, TT.cgl.range_pattern_num);
2430       }
2431       if (ISTOK(tklbrace)) {
2432         action(tkif);
2433         ZCODE[cdx] = TT.zcode_last - cdx;
2434       } else {
2435         gencd(opprintrec);   // print $0 ?
2436         ZCODE[cdx] = TT.zcode_last - cdx;
2437       }
2438       gen2cd(opjump, -1);
2439       TT.cgl.last_recrule = TT.zcode_last;
2440   }
2441 }
2442 
diag_func_def_ref(void)2443 static void diag_func_def_ref(void)
2444 {
2445   int n = zlist_len(&TT.func_def_table);
2446   for (int k = 1; k < n; k++) {
2447     if ((FUNC_DEF[k].flags & FUNC_CALLED) &&
2448             !(FUNC_DEF[k].flags & FUNC_DEFINED)) {
2449       // Sorry, we can't tell where this was called from, for now at least.
2450       XERR("Undefined function '%s'", FUNC_DEF[k].name);
2451     }
2452   }
2453 }
2454 
compile(void)2455 static void compile(void)
2456 {
2457   init_compiler();
2458   init_scanner();
2459   scan();
2460   optional_nl_or_semi();        // Does posix allow NL or ; before first rule?
2461   while (! ISTOK(tkeof)) {
2462     rule();
2463     optional_nl_or_semi();        // NOT POSIX
2464   }
2465 
2466 
2467   if (TT.cgl.last_begin) ZCODE[TT.cgl.last_begin-1] = opquit;
2468   if (TT.cgl.last_end) ZCODE[TT.cgl.last_end-1] = opquit;
2469   if (TT.cgl.last_recrule) ZCODE[TT.cgl.last_recrule-1] = opquit;
2470 
2471   gen2cd(tknumber, make_literal_num_val(0.0));
2472   gencd(tkexit);
2473   gencd(opquit);
2474   // If there are only BEGIN and END or only END actions, generate actions to
2475   // read all input before END.
2476   if (TT.cgl.first_end && !TT.cgl.first_recrule) {
2477     gencd(opquit);
2478     TT.cgl.first_recrule = TT.zcode_last;
2479   }
2480   gencd(opquit);  // One more opcode to keep ip in bounds in run code.
2481   diag_func_def_ref();
2482 }
2483 
2484 ////////////////////
2485 //// runtime
2486 ////////////////////
2487 
check_numeric_string(struct zvalue * v)2488 static void check_numeric_string(struct zvalue *v)
2489 {
2490   if (v->vst) {
2491     char *end, *s = v->vst->str;
2492     // Significant speed gain with this test:
2493     // num string must begin space, +, -, ., or digit.
2494     if (strchr("+-.1234567890 ", *s)) {
2495       double num = strtod(s, &end);
2496       if (s == end || end[strspn(end, " ")]) return;
2497       v->num = num;
2498       v->flags |= ZF_NUM | ZF_STR | ZF_NUMSTR;
2499     }
2500   }
2501 }
2502 
num_to_zstring(double n,char * fmt)2503 static struct zstring *num_to_zstring(double n, char *fmt)
2504 {
2505   int k;
2506   if (n == (long long)n) k = snprintf(TT.pbuf, PBUFSIZE, "%lld", (long long)n);
2507   else k = snprintf(TT.pbuf, PBUFSIZE, fmt, n);
2508   if (k < 0 || k >= PBUFSIZE) FFATAL("error encoding %f via '%s'", n, fmt);
2509   return new_zstring(TT.pbuf, k);
2510 }
2511 
2512 ////////////////////
2513 //// regex routines
2514 ////////////////////
2515 
// Decode backslash escape sequences in s, in place, and return s.
// Recognized: \a \b \f \n \r \t \v \" \/ (plus \\ when is_regex is 0), up to
// three digits read as an octal value, and \x followed by one or two hex
// digits. For any other escaped character the backslash is kept when
// is_regex is non-zero and dropped otherwise; since "\\" is not in the regex
// escape list, a regex keeps both backslashes of an escaped backslash.
static char *escape_str(char *s, int is_regex)
{
  char *p, *escapes = is_regex ? "abfnrtv\"/" : "\\abfnrtv\"/";
  // FIXME TODO should / be in there?
  char *s0 = s, *to = s;
  // 'to' is the write position, 's' the read position; each pass first
  // copies the current source char, which the branches below may overwrite.
  while ((*to = *s)) {
    if (*s != '\\') { to++, s++;
    } else if ((p = strchr(escapes, *++s))) {
      // checking char after \ for known escapes
      // strchr() also matches the terminating NUL of escapes, which is how a
      // backslash at the very end of the string is detected (c == 0 below).
      int c = (is_regex?"\a\b\f\n\r\t\v\"/":"\\\a\b\f\n\r\t\v\"/")[p-escapes];
      if (c) *to = c, s++;  // else final backslash
      to++;
    } else if ('0' <= *s && *s <= '9') {
      // NOTE(review): the digits 8 and 9 are accepted here although the
      // value is accumulated in base 8.
      int k, c = *s++ - '0';
      for (k = 0; k < 2 && '0' <= *s && *s <= '9'; k++)
        c = c * 8 + *s++ - '0';
      *to++ = c;
    } else if (*s == 'x') {
      // With no hex digit after \x nothing is consumed here; the next pass
      // copies the 'x' over the backslash, so the backslash is dropped.
      if (isxdigit(s[1])) {
        int c = hexval(*++s);
        if (isxdigit(s[1])) c = c * 16 + hexval(*++s);
        *to++ = c, s++;
      }
    } else {
      if (is_regex) *to++ = '\\';
      *to++ = *s++;
    }
  }
  return s0;
}
2546 
force_maybemap_to_scalar(struct zvalue * v)2547 static void force_maybemap_to_scalar(struct zvalue *v)
2548 {
2549   if (!(v->flags & ZF_ANYMAP)) return;
2550   if (v->flags & ZF_MAP || v->map->count)
2551     FATAL("array in scalar context");
2552   v->flags = 0;
2553   v->map = 0; // v->flags = v->map = 0 gets warning
2554 }
2555 
force_maybemap_to_map(struct zvalue * v)2556 static void force_maybemap_to_map(struct zvalue *v)
2557 {
2558   if (v->flags & ZF_MAYBEMAP) v->flags = ZF_MAP;
2559 }
2560 
2561 // fmt_offs is either CONVFMT or OFMT (offset in stack to zvalue)
to_str_fmt(struct zvalue * v,int fmt_offs)2562 static struct zvalue *to_str_fmt(struct zvalue *v, int fmt_offs)
2563 {
2564   force_maybemap_to_scalar(v);
2565   // TODO: consider handling numstring differently
2566   if (v->flags & ZF_NUMSTR) v->flags = ZF_STR;
2567   if (IS_STR(v)) return v;
2568   else if (!v->flags) { // uninitialized
2569     v->vst = new_zstring("", 0);
2570   } else if (IS_NUM(v)) {
2571     zvalue_release_zstring(v);
2572     if (!IS_STR(&STACK[fmt_offs])) {
2573       zstring_release(&STACK[fmt_offs].vst);
2574       STACK[fmt_offs].vst = num_to_zstring(STACK[fmt_offs].num, "%.6g");
2575       STACK[fmt_offs].flags = ZF_STR;
2576     }
2577     v->vst = num_to_zstring(v->num, STACK[fmt_offs].vst->str);
2578   } else {
2579     FATAL("Wrong or unknown type in to_str_fmt\n");
2580   }
2581   v->flags = ZF_STR;
2582   return v;
2583 }
2584 
to_str(struct zvalue * v)2585 static struct zvalue *to_str(struct zvalue *v)
2586 {
2587   return to_str_fmt(v, CONVFMT);
2588 }
2589 
2590 // TODO FIXME Is this needed? (YES -- investigate) Just use to_str()?
2591 #define ENSURE_STR(v) (IS_STR(v) ? (v) : to_str(v))
2592 
rx_zvalue_compile(regex_t ** rx,struct zvalue * pat)2593 static void rx_zvalue_compile(regex_t **rx, struct zvalue *pat)
2594 {
2595   if (IS_RX(pat)) *rx = pat->rx;
2596   else {
2597     zvalue_dup_zstring(to_str(pat));
2598     escape_str(pat->vst->str, 1);
2599     xregcomp(*rx, pat->vst->str, REG_EXTENDED);
2600   }
2601 }
2602 
rx_zvalue_free(regex_t * rx,struct zvalue * pat)2603 static void rx_zvalue_free(regex_t *rx, struct zvalue *pat)
2604 {
2605   if (!IS_RX(pat) || rx != pat->rx) regfree(rx);
2606 }
2607 
2608 // Used by the match/not match ops (~ !~) and implicit $0 match (/regex/)
match(struct zvalue * zvsubject,struct zvalue * zvpat)2609 static int match(struct zvalue *zvsubject, struct zvalue *zvpat)
2610 {
2611   int r;
2612   regex_t rx, *rxp = &rx;
2613   rx_zvalue_compile(&rxp, zvpat);
2614   if ((r = regexec(rxp, to_str(zvsubject)->vst->str, 0, 0, 0)) != 0) {
2615     if (r != REG_NOMATCH) {
2616       char errbuf[256];
2617       regerror(r, &rx, errbuf, sizeof(errbuf));
2618       // FIXME TODO better diagnostic here
2619       error_exit("regex match error %d: %s", r, errbuf);
2620     }
2621     rx_zvalue_free(rxp, zvpat);
2622     return 1;
2623   }
2624   rx_zvalue_free(rxp, zvpat);
2625   return 0;
2626 }
2627 
// Search s for the first match of rx, passing eflags to regexec().
// On a match, stores the match's start and end offsets in *start and *end
// and returns 0. Returns REG_NOMATCH (leaving *start / *end untouched) when
// nothing matches. Any other regexec() failure is fatal.
static int rx_find(regex_t *rx, char *s, regoff_t *start, regoff_t *end, int eflags)
{
  regmatch_t found;
  int status = regexec(rx, s, 1, &found, eflags);

  if (status == REG_NOMATCH) return status;
  if (status) FATAL("regexec error");  // TODO ? use regerror() for meaningful msg
  *start = found.rm_so;
  *end = found.rm_eo;
  return 0;
}
2638 
// Differs from rx_find() in that FS cannot match null (empty) string.
// See https://www.austingroupbugs.net/view.php?id=1468.
// Returns 0 with *start / *end set to the offsets (relative to s) of the
// first non-empty match, or REG_NOMATCH if there is none.
static int rx_find_FS(regex_t *rx, char *s, regoff_t *start, regoff_t *end, int eflags)
{
  int r = rx_find(rx, s, start, end, eflags);
  if (r || *start != *end) return r;  // not found, or found non-empty match
  // Found empty match, retry starting past the match
  char *p = s + *end;
  if (!*p) return REG_NOMATCH;  // End of string, no non-empty match found
  // Empty match not at EOS, move ahead and try again
  // Each retry starts one char further on; the loop stops on a non-empty
  // match, on no match at all, or at the end of the string.
  // NOTE(review): retries pass the same eflags, so an anchor such as ^ could
  // presumably match at the retry position -- confirm whether REG_NOTBOL is
  // wanted here.
  while (!r && *start == *end && *++p)
    r = rx_find(rx, p, start, end, eflags);
  if (r || !*p) return REG_NOMATCH;  // no non-empty match found
  *start += p - s;  // offsets from original string
  *end += p - s;
  return 0;
}
2656 
2657 ////////////////////
2658 ////   fields
2659 ////////////////////
2660 
2661 #define FIELDS_MAX  102400 // Was 1024; need more for toybox awk test
2662 #define THIS_MEANS_SET_NF 999999999
2663 
get_int_val(struct zvalue * v)2664 static int get_int_val(struct zvalue *v)
2665 {
2666   if (IS_NUM(v)) return (int)v->num;
2667   if (IS_STR(v) && v->vst) return (int)atof(v->vst->str);
2668   return 0;
2669 }
2670 
2671 // A single-char FS is never a regex, so make it a [<char>] regex to
2672 // match only that one char in case FS is a regex metachar.
2673 // If regex FS is needed, must use > 1 char. If a '.' regex
2674 // is needed, use e.g. '.|.' (unlikely case).
fmt_one_char_fs(char * fs)2675 static char *fmt_one_char_fs(char *fs)
2676 {
2677   if (strlen(fs) != 1) return fs;
2678   snprintf(TT.one_char_fs, sizeof(TT.one_char_fs), "[%c]", fs[0]);
2679   return TT.one_char_fs;
2680 }
2681 
rx_fs_prep(char * fs)2682 static regex_t *rx_fs_prep(char *fs)
2683 {
2684   if (!strcmp(fs, " ")) return &TT.rx_default;
2685   if (!strcmp(fs, TT.fs_last)) return &TT.rx_last;
2686   if (strlen(fs) >= FS_MAX) FATAL("FS too long");
2687   strcpy(TT.fs_last, fs);
2688   regfree(&TT.rx_last);
2689   xregcomp(&TT.rx_last, fmt_one_char_fs(fs), REG_EXTENDED);
2690   return &TT.rx_last;
2691 }
2692 
2693 // Only for use by split() builtin
set_map_element(struct zmap * m,int k,char * val,size_t len)2694 static void set_map_element(struct zmap *m, int k, char *val, size_t len)
2695 {
2696   // Do not need format here b/c k is integer, uses "%lld" format.
2697   struct zstring *key = num_to_zstring(k, "");// "" vs 0 format avoids warning
2698   struct zmap_slot *zs = zmap_find_or_insert_key(m, key);
2699   zstring_release(&key);
2700   zs->val.vst = zstring_update(zs->val.vst, 0, val, len);
2701   zs->val.flags = ZF_STR;
2702   check_numeric_string(&zs->val);
2703 }
2704 
set_zvalue_str(struct zvalue * v,char * s,size_t size)2705 static void set_zvalue_str(struct zvalue *v, char *s, size_t size)
2706 {
2707   v->vst = zstring_update(v->vst, 0, s, size);
2708   v->flags = ZF_STR;
2709 }
2710 
2711 // All changes to NF go through here!
set_nf(int nf)2712 static void set_nf(int nf)
2713 {
2714   if (nf < 0) FATAL("NF set negative");
2715   STACK[NF].num = TT.nf_internal = nf;
2716   STACK[NF].flags = ZF_NUM;
2717 }
2718 
set_field(struct zmap * unused,int fnum,char * s,size_t size)2719 static void set_field(struct zmap *unused, int fnum, char *s, size_t size)
2720 { (void)unused;
2721   if (fnum < 0 || fnum > FIELDS_MAX) FFATAL("bad field num %d\n", fnum);
2722   int nfields = zlist_len(&TT.fields);
2723   // Need nfields to be > fnum b/c e.g. fnum==1 implies 2 TT.fields
2724   while (nfields <= fnum)
2725     nfields = zlist_append(&TT.fields, &uninit_zvalue) + 1;
2726   set_zvalue_str(&FIELD[fnum], s, size);
2727   set_nf(fnum);
2728   check_numeric_string(&FIELD[fnum]);
2729 }
2730 
2731 // Split s via fs, using setter; return number of TT.fields.
2732 // This is used to split TT.fields and also for split() builtin.
splitter(void (* setter)(struct zmap *,int,char *,size_t),struct zmap * m,char * s,struct zvalue * zvfs)2733 static int splitter(void (*setter)(struct zmap *, int, char *, size_t), struct zmap *m, char *s, struct zvalue *zvfs)
2734 {
2735   regex_t *rx;
2736   regoff_t offs, end;
2737   int multiline_null_rs = !ENSURE_STR(&STACK[RS])->vst->str[0];
2738   int nf = 0, r = 0, eflag = 0;
2739   int one_char_fs = 0;
2740   char *s0 = s, *fs = "";
2741   if (!IS_RX(zvfs)) {
2742     to_str(zvfs);
2743     fs = zvfs->vst->str;
2744     one_char_fs = utf8cnt(zvfs->vst->str, zvfs->vst->size) == 1;
2745   }
2746   // Empty string or empty fs (regex).
2747   // Need to include !*s b/c empty string, otherwise
2748   // split("", a, "x") splits to a 1-element (empty element) array
2749   if (!*s || (IS_STR(zvfs) && !*fs) || IS_EMPTY_RX(zvfs)) {
2750     while (*s) {
2751       if (*s < 128) setter(m, ++nf, s++, 1);
2752       else {        // Handle UTF-8
2753         char cbuf[8];
2754         unsigned wc;
2755         int nc = utf8towc(&wc, s, strlen(s));
2756         if (nc < 2) FFATAL("bad string for split: \"%s\"\n", s0);
2757         s += nc;
2758         nc = wctoutf8(cbuf, wc);
2759         setter(m, ++nf, cbuf, nc);
2760       }
2761     }
2762     return nf;
2763   }
2764   if (IS_RX(zvfs)) rx = zvfs->rx;
2765   else rx = rx_fs_prep(fs);
2766   while (*s) {
2767     // Find the next occurrence of FS.
2768     // rx_find_FS() returns 0 if found. If nonzero, the field will
2769     // be the rest of the record (all of it if first time through).
2770     if ((r = rx_find_FS(rx, s, &offs, &end, eflag))) offs = end = strlen(s);
2771     if (setter == set_field && multiline_null_rs && one_char_fs) {
2772       // Contra POSIX, if RS=="" then newline is always also a
2773       // field separator only if FS is a single char (see gawk manual)
2774       int k = strcspn(s, "\n");
2775       if (k < offs) offs = k, end = k + 1;
2776     }
2777     eflag |= REG_NOTBOL;
2778 
2779     // Field will be s up to (not including) the offset. If offset
2780     // is zero and FS is found and FS is ' ' (TT.rx_default "[ \t]+"),
2781     // then the find is the leading or trailing spaces and/or tabs.
2782     // If so, skip this (empty) field, otherwise set field, length is offs.
2783     if (offs || r || rx != &TT.rx_default) setter(m, ++nf, s, offs);
2784     s += end;
2785   }
2786   if (!r && rx != &TT.rx_default) setter(m, ++nf, "", 0);
2787   return nf;
2788 }
2789 
build_fields(void)2790 static void build_fields(void)
2791 {
2792   char *rec = FIELD[0].vst->str;
2793   // TODO test this -- why did I not want to split empty $0?
2794   // Maybe don't split empty $0 b/c non-default FS gets NF==1 with splitter()?
2795   set_nf(*rec ? splitter(set_field, 0, rec, to_str(&STACK[FS])) : 0);
2796 }
2797 
rebuild_field0(void)2798 static void rebuild_field0(void)
2799 {
2800   struct zstring *s = FIELD[0].vst;
2801   int nf = TT.nf_internal;
2802   if (!nf) {
2803     zvalue_copy(&FIELD[0], &uninit_string_zvalue);
2804     return;
2805   }
2806   // uninit value needed for eventual reference to .vst in zstring_release()
2807   struct zvalue tempv = uninit_zvalue;
2808   zvalue_copy(&tempv, to_str(&STACK[OFS]));
2809   for (int i = 1; i <= nf; i++) {
2810     if (i > 1) {
2811       s = s ? zstring_extend(s, tempv.vst) : zstring_copy(s, tempv.vst);
2812     }
2813     if (FIELD[i].flags) to_str(&FIELD[i]);
2814     if (FIELD[i].vst) {
2815       if (i > 1) s = zstring_extend(s, FIELD[i].vst);
2816       else s = zstring_copy(s, FIELD[i].vst);
2817     }
2818   }
2819   FIELD[0].vst = s;
2820   FIELD[0].flags |= ZF_STR;
2821   zvalue_release_zstring(&tempv);
2822 }
2823 
2824 // get field ref (lvalue ref) in prep for assignment to field.
2825 // [... assigning to a nonexistent field (for example, $(NF+2)=5) shall
2826 // increase the value of NF; create any intervening TT.fields with the
2827 // uninitialized value; and cause the value of $0 to be recomputed, with the
2828 // TT.fields being separated by the value of OFS.]
2829 // Called by setup_lvalue()
get_field_ref(int fnum)2830 static struct zvalue *get_field_ref(int fnum)
2831 {
2832   if (fnum < 0 || fnum > FIELDS_MAX) error_exit("bad field num %d", fnum);
2833   if (fnum > TT.nf_internal) {
2834     // Ensure TT.fields list is large enough for fnum
2835     // Need len of TT.fields to be > fnum b/c e.g. fnum==1 implies 2 TT.fields
2836     for (int i = TT.nf_internal + 1; i <= fnum; i++) {
2837       if (i == zlist_len(&TT.fields)) zlist_append(&TT.fields, &uninit_zvalue);
2838       zvalue_copy(&FIELD[i], &uninit_string_zvalue);
2839     }
2840     set_nf(fnum);
2841   }
2842   return &FIELD[fnum];
2843 }
2844 
2845 // Called by tksplit op
split(struct zstring * s,struct zvalue * a,struct zvalue * fs)2846 static int split(struct zstring *s, struct zvalue *a, struct zvalue *fs)
2847 {
2848   return splitter(set_map_element, a->map, s->str, fs);
2849 }
2850 
2851 // Called by getrec_f0_f() and getrec_f0()
copy_to_field0(char * buf,size_t k)2852 static void copy_to_field0(char *buf, size_t k)
2853 {
2854   set_zvalue_str(&FIELD[0], buf, k);
2855   check_numeric_string(&FIELD[0]);
2856   build_fields();
2857 }
2858 
2859 // After changing $0, must rebuild TT.fields & reset NF
2860 // Changing other field must rebuild $0
2861 // Called by gsub() and assignment ops.
fixup_fields(int fnum)2862 static void fixup_fields(int fnum)
2863 {
2864   if (fnum == THIS_MEANS_SET_NF) {  // NF was assigned to
2865     int new_nf = get_int_val(&STACK[NF]);
2866     // Ensure TT.fields list is large enough for fnum
2867     // Need len of TT.fields to be > fnum b/c e.g. fnum==1 implies 2 TT.fields
2868     for (int i = TT.nf_internal + 1; i <= new_nf; i++) {
2869       if (i == zlist_len(&TT.fields)) zlist_append(&TT.fields, &uninit_zvalue);
2870       zvalue_copy(&FIELD[i], &uninit_string_zvalue);
2871     }
2872     set_nf(TT.nf_internal = STACK[NF].num);
2873     rebuild_field0();
2874     return;
2875   }
2876   // fnum is # of field that was just updated.
2877   // If it's 0, need to rebuild the TT.fields 1... n.
2878   // If it's non-0, need to rebuild field 0.
2879   to_str(&FIELD[fnum]);
2880   if (fnum) check_numeric_string(&FIELD[fnum]);
2881   if (fnum) rebuild_field0();
2882   else build_fields();
2883 }
2884 
2885 // Fetching non-existent field gets uninit string value; no change to NF!
2886 // Called by tkfield op       // TODO inline it?
push_field(int fnum)2887 static void push_field(int fnum)
2888 {
2889   if (fnum < 0 || fnum > FIELDS_MAX) error_exit("bad field num %d", fnum);
2890   // Contrary to posix, awk evaluates TT.fields beyond $NF as empty strings.
2891   if (fnum > TT.nf_internal) push_val(&uninit_string_zvalue);
2892   else push_val(&FIELD[fnum]);
2893 }
2894 
2895 ////////////////////
2896 ////   END fields
2897 ////////////////////
2898 
2899 #define STKP    TT.stackp   // pointer to top of stack
2900 
seedrand(double seed)2901 static double seedrand(double seed)
2902 {
2903   static double prev_seed;
2904   double r = prev_seed;
2905   srandom(trunc(prev_seed = seed));
2906   return r;
2907 }
2908 
popnumval(void)2909 static int popnumval(void)
2910 {
2911   return STKP-- -> num;
2912 }
2913 
drop(void)2914 static void drop(void)
2915 {
2916   if (!(STKP->flags & (ZF_ANYMAP | ZF_RX))) zstring_release(&STKP->vst);
2917   STKP--;
2918 }
2919 
// Pop n slots off the stack via drop().
static void drop_n(int n)
{
  for (; n; n--) drop();
}
2924 
swap(void)2925 static void swap(void)
2926 {
2927   struct zvalue tmp = STKP[-1];
2928   STKP[-1] = STKP[0];
2929   STKP[0] = tmp;
2930 }
2931 
2932 // Set and return logical (0/1) val of top TT.stack value; flag value as NUM.
get_set_logical(void)2933 static int get_set_logical(void)
2934 {
2935   struct zvalue *v = STKP;
2936   force_maybemap_to_scalar(v);
2937   int r = 0;
2938   if (IS_NUM(v)) r = !! v->num;
2939   else if (IS_STR(v)) r = (v->vst && v->vst->str[0]);
2940   zvalue_release_zstring(v);
2941   v->num = r;
2942   v->flags = ZF_NUM;
2943   return r;
2944 }
2945 
2946 
to_num(struct zvalue * v)2947 static double to_num(struct zvalue *v)
2948 {
2949   force_maybemap_to_scalar(v);
2950   if (v->flags & ZF_NUMSTR) zvalue_release_zstring(v);
2951   else if (!IS_NUM(v)) {
2952     v->num = 0.0;
2953     if (IS_STR(v) && v->vst) v->num = atof(v->vst->str);
2954     zvalue_release_zstring(v);
2955   }
2956   v->flags = ZF_NUM;
2957   return v->num;
2958 }
2959 
set_num(struct zvalue * v,double n)2960 static void set_num(struct zvalue *v, double n)
2961 {
2962   zstring_release(&v->vst);
2963   v->num = n;
2964   v->flags = ZF_NUM;
2965 }
2966 
incr_zvalue(struct zvalue * v)2967 static void incr_zvalue(struct zvalue *v)
2968 {
2969   v->num = trunc(to_num(v)) + 1;
2970 }
2971 
push_int_val(ptrdiff_t n)2972 static void push_int_val(ptrdiff_t n)
2973 {
2974   struct zvalue v = ZVINIT(ZF_NUM, n, 0);
2975   push_val(&v);
2976 }
2977 
get_map_val(struct zvalue * v,struct zvalue * key)2978 static struct zvalue *get_map_val(struct zvalue *v, struct zvalue *key)
2979 {
2980   struct zmap_slot *x = zmap_find_or_insert_key(v->map, to_str(key)->vst);
2981   return &x->val;
2982 }
2983 
setup_lvalue(int ref_stack_ptr,int parmbase,int * field_num)2984 static struct zvalue *setup_lvalue(int ref_stack_ptr, int parmbase, int *field_num)
2985 {
2986   // ref_stack_ptr is number of slots down in stack the ref is
2987   // for +=, *=, etc
2988   // Stack is: ... scalar_ref value_to_op_by
2989   // or ... subscript_val map_ref value_to_op_by
2990   // or ... fieldref value_to_op_by
2991   // for =, ++, --
2992   // Stack is: ... scalar_ref
2993   // or ... subscript_val map_ref
2994   // or ... fieldnum fieldref
2995   int k;
2996   struct zvalue *ref, *v = 0; // init v to mute "may be uninit" warning
2997   *field_num = -1;
2998   ref = STKP - ref_stack_ptr;
2999   if (ref->flags & ZF_FIELDREF) return get_field_ref(*field_num = ref->num);
3000   k = ref->num >= 0 ? ref->num : parmbase - ref->num;
3001   if (k == NF) *field_num = THIS_MEANS_SET_NF;
3002   v = &STACK[k];
3003   if (ref->flags & ZF_REF) {
3004     force_maybemap_to_scalar(v);
3005   } else if (ref->flags & ZF_MAPREF) {
3006     force_maybemap_to_map(v);
3007     if (!IS_MAP(v)) FATAL("scalar in array context");
3008     v = get_map_val(v, STKP - ref_stack_ptr - 1);
3009     swap();
3010     drop();
3011   } else FATAL("assignment to bad lvalue");
3012   return v; // order FATAL() and return to mute warning
3013 }
3014 
new_file(char * fn,FILE * fp,char mode,char file_or_pipe,char is_std_file)3015 static struct zfile *new_file(char *fn, FILE *fp, char mode, char file_or_pipe,
3016                               char is_std_file)
3017 {
3018   struct zfile *f = xzalloc(sizeof(struct zfile));
3019   *f = (struct zfile){TT.zfiles, xstrdup(fn), fp, mode, file_or_pipe,
3020                 isatty(fileno(fp)), is_std_file, 0, 0, 0, 0, 0};
3021   return TT.zfiles = f;
3022 }
3023 
fflush_all(void)3024 static int fflush_all(void)
3025 {
3026   int ret = 0;
3027   for (struct zfile *p = TT.zfiles; p; p = p->next)
3028     if (fflush(p->fp)) ret = -1;
3029   return ret;
3030 }
3031 
fflush_file(int nargs)3032 static int fflush_file(int nargs)
3033 {
3034   if (!nargs) return fflush_all();
3035 
3036   to_str(STKP);   // filename at top of TT.stack
3037   // Null string means flush all
3038   if (!STKP[0].vst->str[0]) return fflush_all();
3039 
3040   // is it open in file table?
3041   for (struct zfile *p = TT.zfiles; p; p = p->next)
3042     if (!strcmp(STKP[0].vst->str, p->fn))
3043       if (!fflush(p->fp)) return 0;
3044   return -1;    // error, or file not found in table
3045 }
close_file(char * fn)3046 static int close_file(char *fn)
3047 {
3048   // !fn (null ptr) means close all (exc. stdin/stdout/stderr)
3049   int r = 0;
3050   struct zfile *np, **pp = &TT.zfiles;
3051   for (struct zfile *p = TT.zfiles; p; p = np) {
3052     np = p->next;   // save in case unlinking file (invalidates p->next)
3053     // Don't close std files -- wrecks print/printf (can be fixed though TODO)
3054     if ((!p->is_std_file) && (!fn || !strcmp(fn, p->fn))) {
3055       xfree(p->buf);
3056       xfree(p->fn);
3057       r = (p->fp) ? (p->file_or_pipe ? fclose : pclose)(p->fp) : -1;
3058       *pp = p->next;
3059       xfree(p);
3060       if (fn) return r;
3061     } else pp = &p->next; // only if not unlinking zfile
3062   }
3063   return -1;  // file not in table, or closed all files
3064 }
3065 
3066 static struct zfile badfile_obj, *badfile = &badfile_obj;
3067 
3068 // FIXME TODO check if file/pipe/mode matches what's in the table already.
3069 // Apparently gawk/mawk/nawk are OK with different mode, but just use the file
3070 // in whatever mode it's already in; i.e. > after >> still appends.
setup_file(char file_or_pipe,char * mode)3071 static struct zfile *setup_file(char file_or_pipe, char *mode)
3072 {
3073   to_str(STKP);   // filename at top of TT.stack
3074   char *fn = STKP[0].vst->str;
3075   // is it already open in file table?
3076   for (struct zfile *p = TT.zfiles; p; p = p->next)
3077     if (!strcmp(fn, p->fn)) {
3078       drop();
3079       return p;   // open; return it
3080     }
3081   FILE *fp = (file_or_pipe ? fopen : popen)(fn, mode);
3082   if (fp) {
3083     struct zfile *p = new_file(fn, fp, *mode, file_or_pipe, 0);
3084     drop();
3085     return p;
3086   }
3087   if (*mode != 'r') FFATAL("cannot open '%s'\n", fn);
3088   drop();
3089   return badfile;
3090 }
3091 
3092 // TODO FIXME should be a function?
3093 #define stkn(n) ((int)(TT.stackp - (n) - (struct zvalue *)TT.stack.base))
3094 
getcnt(int k)3095 static int getcnt(int k)
3096 {
3097   if (k >= stkn(0)) FATAL("too few args for printf\n");
3098   return (int)to_num(&STACK[k]);
3099 }
3100 
fsprintf(FILE * ignored,const char * fmt,...)3101 static int fsprintf(FILE *ignored, const char *fmt, ...)
3102 {
3103   (void)ignored;
3104   va_list args, args2;
3105   va_start(args, fmt);
3106   va_copy(args2, args);
3107   int len = vsnprintf(0, 0, fmt, args); // size needed
3108   va_end(args);
3109   if (len < 0) FATAL("Bad sprintf format");
3110   // Unfortunately we have to mess with zstring internals here.
3111   if (TT.rgl.zspr->size + len + 1 > TT.rgl.zspr->capacity) {
3112       // This should always work b/c capacity > size
3113       unsigned cap = 2 * TT.rgl.zspr->capacity + len;
3114       TT.rgl.zspr = xrealloc(TT.rgl.zspr, sizeof(*TT.rgl.zspr) + cap);
3115       TT.rgl.zspr->capacity = cap;
3116     }
3117   vsnprintf(TT.rgl.zspr->str + TT.rgl.zspr->size, len+1, fmt, args2);
3118   TT.rgl.zspr->size += len;
3119   TT.rgl.zspr->str[TT.rgl.zspr->size] = 0;
3120   va_end(args2);
3121   return 0;
3122 }
3123 
varprint(int (* fpvar)(FILE *,const char *,...),FILE * outfp,int nargs)3124 static void varprint(int(*fpvar)(FILE *, const char *, ...), FILE *outfp, int nargs)
3125 {
3126   int k, nn, nnc, fmtc, holdc, cnt1 = 0, cnt2 = 0;
3127   char *s = 0;  // to shut up spurious warning
3128   regoff_t offs = -1, e = -1;
3129   char *pfmt, *fmt = to_str(STKP-nargs+1)->vst->str;
3130   k = stkn(nargs - 2);
3131   while (*fmt) {
3132     double n = 0;
3133     nn = strcspn(fmt, "%");
3134     if (nn) {
3135       holdc = fmt[nn];
3136       fmt[nn] = 0;
3137       fpvar(outfp, "%s", fmt);
3138       fmt[nn] = holdc;
3139     }
3140     fmt += nn;
3141     if (!*(pfmt = fmt)) break;
3142     nnc = strcspn(fmt+1, "aAdiouxXfFeEgGcs%");
3143     fmtc = fmt[nnc+1];
3144     if (!fmtc) FFATAL("bad printf format '%s'", fmt);
3145     holdc = fmt[nnc+2];
3146     fmt[nnc+2] = 0;
3147     if (rx_find(&TT.rx_printf_fmt, fmt, &offs, &e, 0))
3148       FFATAL("bad printf format <%s>\n", fmt);
3149     int nargsneeded = 1;
3150     for (char *p = strchr(fmt, '*'); p; p = strchr(p+1, '*'))
3151       nargsneeded++;
3152     nargsneeded -= fmtc == '%';
3153 
3154     switch (nargsneeded) {
3155       case 0:
3156         fpvar(outfp, fmt);
3157         break;
3158       case 3:
3159         cnt1 = getcnt(k++);
3160         ATTR_FALLTHROUGH_INTENDED;
3161       case 2:
3162         cnt2 = getcnt(k++);
3163         ATTR_FALLTHROUGH_INTENDED;
3164       case 1:
3165         if (k > stkn(0)) FATAL("too few args for printf\n");
3166         if (fmtc == 's') {
3167           s = to_str(&STACK[k++])->vst->str;
3168         } else if (fmtc == 'c' && !IS_NUM(&STACK[k])) {
3169           unsigned wch;
3170           struct zvalue *z = &STACK[k++];
3171           if (z->vst && z->vst->str[0])
3172             n = utf8towc(&wch, z->vst->str, z->vst->size) < 1 ? 0xfffd : wch;
3173         } else {
3174           n = to_num(&STACK[k++]);
3175         }
3176         if (strchr("cdiouxX", fmtc)) {
3177           pfmt = strcpy(TT.pbuf, fmt);
3178           if (pfmt[nnc] != 'l') {
3179             strcpy(pfmt+nnc+1, "l_");
3180             pfmt[nnc+2] = fmtc;
3181           }
3182         }
3183         if (fmtc == 'c' && n > 0x10ffff) n = 0xfffd;  // musl won't take larger "wchar"
3184         switch (nargsneeded) {
3185           case 1:
3186             if (fmtc == 's') fpvar(outfp, pfmt, s);
3187             else if (fmtc == 'c') fpvar(outfp, pfmt, (wint_t)n);
3188             else if (strchr("di", fmtc)) fpvar(outfp, pfmt, (long)n);
3189             else if (strchr("ouxX", fmtc)) fpvar(outfp, pfmt, (unsigned long)n);
3190             else fpvar(outfp, pfmt, n);
3191             break;
3192           case 2:
3193             if (fmtc == 's') fpvar(outfp, pfmt, cnt2, s);
3194             else if (fmtc == 'c') fpvar(outfp, pfmt, cnt2, (wint_t)n);
3195             else if (strchr("di", fmtc)) fpvar(outfp, pfmt, cnt2, (long)n);
3196             else if (strchr("ouxX", fmtc)) fpvar(outfp, pfmt, cnt2, (unsigned long)n);
3197             else fpvar(outfp, pfmt, cnt2, n);
3198             break;
3199           case 3:
3200             if (fmtc == 's') fpvar(outfp, pfmt, cnt1, cnt2, s);
3201             else if (fmtc == 'c') fpvar(outfp, pfmt, cnt1, cnt2, (wint_t)n);
3202             else if (strchr("di", fmtc)) fpvar(outfp, pfmt, cnt1, cnt2, (long)n);
3203             else if (strchr("ouxX", fmtc)) fpvar(outfp, pfmt, cnt1, cnt2, (unsigned long)n);
3204             else fpvar(outfp, pfmt, cnt1, cnt2, n);
3205             break;
3206         }
3207         break;
3208       default:
3209         FATAL("bad printf format\n");
3210     }
3211     fmt += nnc + 2;
3212     *fmt = holdc;
3213   }
3214 }
3215 
// Return 1 if v is a valid variable name: non-empty, first character a
// letter or underscore, remaining characters letters, digits or underscore.
// Otherwise return 0.
static int is_ok_varname(char *v)
{
  // Digits come first so that ok + 10 is the set allowed as first character.
  char *ok = "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_";

  if (!*v || !strchr(ok + 10, *v)) return 0;
  for (char *p = v + 1; *p; p++)
    if (!strchr(ok, *p)) return 0;
  return 1;
}
3224 
3225 // FIXME TODO return value never used. What if assign to var not in globals?
assign_global(char * var,char * value)3226 static int assign_global(char *var, char *value)
3227 {
3228   if (!is_ok_varname(var)) FFATAL("Invalid variable name '%s'\n", var);
3229   int globals_ent = find_global(var);
3230   if (globals_ent) {
3231     struct zvalue *v = &STACK[globals_ent];
3232     if (IS_MAP(v)) error_exit("-v assignment to array");  // Maybe not needed?
3233 
3234 // The compile phase may insert a var in global table with flag of zero.  Then
3235 // init_globals() will assign a ZF_MAYBEMAP flag to it. If it is then assigned
3236 // via -v option or by assignment_arg() it will here be assigned a string value.
3237 // So first, remove all map data to prevent memory leak. BUG FIX // 2024-02-13.
3238     if (v->flags & ZF_ANYMAP) {
3239       zmap_delete_map_incl_slotdata(v->map);
3240       xfree(v->map);
3241       v->map = 0;
3242       v->flags &= ~ZF_ANYMAP;
3243     }
3244 
3245     zvalue_release_zstring(v);
3246     value = xstrdup(value);
3247     *v = new_str_val(escape_str(value, 0));
3248     xfree(value);
3249     check_numeric_string(v);
3250     return 1;
3251   }
3252   return 0;
3253 }
3254 
// If arg has the form name=value with a valid variable name, assign the
// global and return 1; otherwise return 0, in which case nextfilearg()
// treats arg as a filename.
// arg is split at the '=' temporarily and restored before returning.
static int assignment_arg(char *arg)
{
  char *eq = strchr(arg, '=');
  int ok;

  if (!eq) return 0;
  *eq = 0;
  ok = is_ok_varname(arg);
  if (ok) assign_global(arg, eq + 1);
  *eq = '=';
  return ok;
}
3274 
nextfilearg(void)3275 static char *nextfilearg(void)
3276 {
3277   char *arg;
3278   do {
3279     if (++TT.rgl.narg >= (int)to_num(&STACK[ARGC])) return 0;
3280     struct zvalue *v = &STACK[ARGV];
3281     struct zvalue zkey = ZVINIT(ZF_STR, 0,
3282         num_to_zstring(TT.rgl.narg, to_str(&STACK[CONVFMT])->vst->str));
3283     arg = "";
3284     if (zmap_find(v->map, zkey.vst)) {
3285       zvalue_copy(&TT.rgl.cur_arg, to_str(get_map_val(v, &zkey)));
3286       arg = TT.rgl.cur_arg.vst->str;
3287     }
3288     zvalue_release_zstring(&zkey);
3289   } while (!*arg || assignment_arg(arg));
3290   TT.rgl.nfiles++;
3291   return arg;
3292 }
3293 
next_fp(void)3294 static int next_fp(void)
3295 {
3296   char *fn = nextfilearg();
3297   if (TT.cfile->fp && TT.cfile->fp != stdin) fclose(TT.cfile->fp);
3298   if ((!fn && !TT.rgl.nfiles && TT.cfile->fp != stdin) || (fn && !strcmp(fn, "-"))) {
3299     xfree(TT.cfile->buf);
3300     *TT.cfile = (struct zfile){0};
3301     TT.cfile->fp = stdin;
3302     TT.cfile->fn = "-";
3303     zvalue_release_zstring(&STACK[FILENAME]);
3304     STACK[FILENAME].vst = new_zstring("-", 1);
3305   } else if (fn) {
3306     xfree(TT.cfile->buf);
3307     *TT.cfile = (struct zfile){0};
3308     if (!(TT.cfile->fp = fopen(fn, "r"))) FFATAL("can't open %s\n", fn);
3309     TT.cfile->fn = fn;
3310     zvalue_copy(&STACK[FILENAME], &TT.rgl.cur_arg);
3311   } else {
3312     TT.rgl.eof = 1;
3313     return 0;
3314   }
3315   set_num(&STACK[FNR], 0);
3316   TT.cfile->is_tty = isatty(fileno(TT.cfile->fp));
3317   return 1;
3318 }
3319 
// Locate the record separator within the len bytes at s.
// If one_byte_rs is nonzero it is the separator byte and a plain memchr()
// is used; otherwise rx is matched via regexec0().
// On success returns 0 with *start / *end set to the offsets of the
// separator's first byte and the byte after it. Returns REG_NOMATCH if not
// found; any other regex failure is fatal.
static int rx_find_rs(regex_t *rx, char *s, long len,
                      regoff_t *start, regoff_t *end, int one_byte_rs)
{
  if (one_byte_rs) {
    char *hit = memchr(s, one_byte_rs, len);

    if (!hit) return REG_NOMATCH;
    *start = hit - s;
    *end = *start + 1;
    return 0;
  }

  regmatch_t match;
  int rc = regexec0(rx, s, len, 1, &match, 0);

  if (rc == REG_NOMATCH) return rc;
  if (rc) FATAL("regexec error");  // TODO ? use regerr() to meaningful msg
  *start = match.rm_so;
  *end = match.rm_eo;
  return 0;
}
3338 
3339 // get a record; return length, or -1 at EOF
3340 // Does work for getrec_f() for regular RS or multiline
getr(struct zfile * zfp,int rs_mode)3341 static ssize_t getr(struct zfile *zfp, int rs_mode)
3342 {
3343   // zfp->buf (initially null) points to record buffer
3344   // zfp->buflen -- size of allocated buf
3345   // TT.rgl.recptr -- points to where record is being / has been read into
3346   // zfp->ro -- offset in buf to record data
3347   // zfp->lim -- offset to 1+last byte read in buffer
3348   // rs_mode nonzero iff multiline mode; reused for one-byte RS
3349 
3350   regex_t rsrx; // FIXME Need to cache and avoid rx compile on every record?
3351   long ret = -1;
3352   int r = -REG_NOMATCH;   // r cannot have this value after rx_findx() below
3353   regoff_t so = 0, eo = 0;
3354   size_t m = 0, n = 0;
3355 
3356   xregcomp(&rsrx, rs_mode ? "\n\n+" : fmt_one_char_fs(STACK[RS].vst->str),
3357       REG_EXTENDED);
3358   rs_mode = strlen(STACK[RS].vst->str) == 1 ? STACK[RS].vst->str[0] : 0;
3359   for ( ;; ) {
3360     if (zfp->ro == zfp->lim && zfp->eof) break; // EOF & last record; return -1
3361 
3362     // Allocate initial buffer, and expand iff buffer holds one
3363     //   possibly (probably) incomplete record.
3364     if (zfp->ro == 0 && zfp->lim == zfp->buflen)
3365       zfp->buf = xrealloc(zfp->buf,
3366           (zfp->buflen = maxof(512, zfp->buflen * 2)) + 1);
3367 
3368     if ((m = zfp->buflen - zfp->lim) && !zfp->eof) {
3369       // Read iff space left in buffer
3370       if (zfp->is_tty) m = 1;
3371       n = fread(zfp->buf + zfp->lim, 1, m, zfp->fp);
3372       if (n < m) {
3373         if (ferror(zfp->fp)) FFATAL("i/o error %d on %s!", errno, zfp->fn);
3374         zfp->eof = 1;
3375         if (!n && r == -REG_NOMATCH) break; // catch empty file here
3376       }
3377       zfp->lim += n;
3378       zfp->buf[zfp->lim] = 0;
3379     }
3380     TT.rgl.recptr = zfp->buf + zfp->ro;
3381     r = rx_find_rs(&rsrx, TT.rgl.recptr, zfp->lim - zfp->ro, &so, &eo, rs_mode);
3382     if (!r && so == eo) r = 1;  // RS was empty, so fake not found
3383 
3384     if (!zfp->eof && (r
3385           || (zfp->lim - (zfp->ro + eo)) < zfp->buflen / 4) && !zfp->is_tty) {
3386       // RS not found, or found near lim. Slide up and try to get more data
3387       // If recptr at start of buf and RS not found then expand buffer
3388       memmove(zfp->buf, TT.rgl.recptr, zfp->lim - zfp->ro);
3389       zfp->lim -= zfp->ro;
3390       zfp->ro = 0;
3391       continue;
3392     }
3393     ret = so;   // If RS found, then 'so' is rec length
3394     if (zfp->eof) {
3395       if (r) {  // EOF and RS not found; rec is all data left in buf
3396         ret = zfp->lim - zfp->ro;
3397         zfp->ro = zfp->lim; // set ro for -1 return on next call
3398       } else zfp->ro += eo; // RS found; advance ro
3399     } else zfp->ro += eo; // Here only if RS found not near lim
3400 
3401     if (!r || !zfp->is_tty) {
3402       // If is_tty then RS found; reset buffer pointers;
3403       // is_tty uses one rec per buffer load
3404       if (zfp->is_tty) zfp->ro = zfp->lim = 0;
3405       break;
3406     } // RS not found AND is_tty; loop to keep reading
3407   }
3408   regfree(&rsrx);
3409   return ret;
3410 }
3411 
3412 // get a record; return length, or -1 at EOF
getrec_f(struct zfile * zfp)3413 static ssize_t getrec_f(struct zfile *zfp)
3414 {
3415   int k;
3416   if (ENSURE_STR(&STACK[RS])->vst->str[0]) return getr(zfp, 0);
3417   // RS == "" so multiline read
3418   // Passing 1 to getr() forces multiline mode, which uses regex "\n\n+" to
3419   // split on sequences of 2 or more newlines. But that's not the same as
3420   // multiline mode, which never returns empty records or records with leading
3421   // or trailing newlines, which can occur with RS="\n\n+". So here we loop and
3422   // strip leading/trailing newlines and discard empty lines. See gawk manual,
3423   // "4.9 Multiple-Line Records" for info on this difference.
3424   do {
3425     k = getr(zfp, 1);
3426     if (k < 0) break;
3427     while (k && TT.rgl.recptr[k-1] == '\n') k--;
3428     while (k && TT.rgl.recptr[0] == '\n') k--, TT.rgl.recptr++;
3429   } while (!k);
3430   return k;
3431 }
3432 
getrec(void)3433 static ssize_t getrec(void)
3434 {
3435   ssize_t k;
3436   if (TT.rgl.eof) return -1;
3437   if (!TT.cfile->fp) next_fp();
3438   do {
3439     if ((k = getrec_f(TT.cfile)) >= 0) return k;
3440   } while (next_fp());
3441   return -1;
3442 }
3443 
getrec_f0_f(struct zfile * zfp)3444 static ssize_t getrec_f0_f(struct zfile *zfp)
3445 {
3446   ssize_t k = getrec_f(zfp);
3447   if (k >= 0) {
3448     copy_to_field0(TT.rgl.recptr, k);
3449   }
3450   return k;
3451 }
3452 
getrec_f0(void)3453 static ssize_t getrec_f0(void)
3454 {
3455   ssize_t k = getrec();
3456   if (k >= 0) {
3457     copy_to_field0(TT.rgl.recptr, k);
3458     incr_zvalue(&STACK[NR]);
3459     incr_zvalue(&STACK[FNR]);
3460   }
3461   return k;
3462 }
3463 
3464 // source is tkeof (no pipe/file), tklt (file), or tkpipe (pipe)
3465 // fp is file or pipe (is NULL if file/pipe could not be opened)
3466 // FIXME TODO should -1 return be replaced by test at caller?
3467 // v is NULL or an lvalue ref
awk_getline(int source,struct zfile * zfp,struct zvalue * v)3468 static int awk_getline(int source, struct zfile *zfp, struct zvalue *v)
3469 {
3470   ssize_t k;
3471   int is_stream = source != tkeof;
3472   if (is_stream && !zfp->fp) return -1;
3473   if (v) {
3474     if ((k = is_stream ? getrec_f(zfp) : getrec()) < 0) return 0;
3475     zstring_release(&v->vst);
3476     v->vst = new_zstring(TT.rgl.recptr, k);
3477     v->flags = ZF_STR;
3478     check_numeric_string(v);    // bug fix 20240514
3479     if (!is_stream) {
3480       incr_zvalue(&STACK[NR]);
3481       incr_zvalue(&STACK[FNR]);
3482     }
3483   } else k = is_stream ? getrec_f0_f(zfp) : getrec_f0();
3484   return k < 0 ? 0 : 1;
3485 }
3486 
3487 // Define GAWK_SUB to get the same behavior with sub()/gsub() replacement text
3488 // as with gawk, goawk, and recent bwk awk (nawk) versions. Undefine GAWK_SUB
3489 // to get the simpler POSIX behavior, but I think most users will prefer the
3490 // gawk behavior. See the gawk (GNU Awk) manual,
3491 // sec. 9.1.4.1 // More about '\' and '&' with sub(), gsub(), and gensub()
3492 // for details on the differences.
3493 //
3494 #undef GAWK_SUB
3495 #define GAWK_SUB
3496 
3497 // sub(ere, repl[, in]) Substitute the string repl in place of the
3498 // first instance of the extended regular expression ERE in string 'in'
3499 // and return the number of substitutions.  An <ampersand> ( '&' )
3500 // appearing in the string repl shall be replaced by the string from in
3501 // that matches the ERE. (partial spec... there's more)
gsub(int opcode,int nargs,int parmbase)3502 static void gsub(int opcode, int nargs, int parmbase)
3503 { (void)nargs;
3504   int field_num = -1;
3505   // compile ensures 3 args
3506   struct zvalue *v = setup_lvalue(0, parmbase, &field_num);
3507   struct zvalue *ere = STKP-2;
3508   struct zvalue *repl = STKP-1;
3509   regex_t rx, *rxp = &rx;
3510   rx_zvalue_compile(&rxp, ere);
3511   to_str(repl);
3512   to_str(v);
3513 
3514 #define SLEN(zvalp) ((zvalp)->vst->size)
3515   char *p, *rp0 = repl->vst->str, *rp = rp0, *s = v->vst->str;
3516   int namps = 0, nhits = 0, is_sub = (opcode == tksub), eflags = 0;
3517   regoff_t so = -1, eo;
3518   // Count ampersands in repl string; may be overcount due to \& escapes.
3519   for (rp = rp0; *rp; rp++) namps += *rp == '&';
3520   p = s;
3521   regoff_t need = SLEN(v) + 1;  // capacity needed for result string
3522   // A pass just to determine needed destination (result) string size.
3523   while(!rx_find(rxp, p, &so, &eo, eflags)) {
3524     need += SLEN(repl) + (eo - so) * (namps - 1);
3525     if (!*p) break;
3526     p += eo ? eo : 1; // ensure progress if empty hit at start
3527     if (is_sub) break;
3528     eflags |= REG_NOTBOL;
3529   }
3530 
3531   if (so >= 0) {  // at least one hit
3532     struct zstring *z = xzalloc(sizeof(*z) + need);
3533     z->capacity = need;
3534 
3535     char *e = z->str; // result destination pointer
3536     p = s;
3537     eflags = 0;
3538     char *ep0 = p, *sp, *ep;
3539     while(!rx_find(rxp, p, &so, &eo, eflags)) {
3540       sp = p + so;
3541       ep = p + eo;
3542       memmove(e, ep0, sp - ep0);  // copy unchanged part
3543       e += sp - ep0;
3544       // Skip match if not at start and just after prev match and this is empty
3545       if (p == s || sp - ep0 || eo - so) {
3546         nhits++;
3547         for (rp = rp0; *rp; rp++) { // copy replacement
3548           if (*rp == '&') {
3549             memmove(e, sp, eo - so);  //copy match
3550             e += eo - so;
3551           } else if (*rp == '\\') {
3552             if (rp[1] == '&') *e++ = *++rp;
3553             else if (rp[1] != '\\') *e++ = *rp;
3554             else {
3555 #ifdef GAWK_SUB
3556               if (rp[2] == '\\' && rp[3] == '&') {
3557                 rp += 2;
3558                 *e++ = *rp;
3559               } else if (rp[2] != '&') *e++ = '\\';
3560 #endif
3561               *e++ = *++rp;
3562             }
3563           } else *e++ = *rp;
3564         }
3565       }
3566       ep0 = ep;
3567       if (!*p) break;
3568       p += eo ? eo : 1; // ensure progress if empty hit at start
3569       if (is_sub) break;
3570       eflags |= REG_NOTBOL;
3571     }
3572     // copy remaining subject string
3573     memmove(e, ep0, s + SLEN(v) - ep0);
3574     e += s + SLEN(v) - ep0;
3575     *e = 0;
3576     z->size = e - z->str;
3577     zstring_release(&v->vst);
3578     v->vst = z;
3579   }
3580   rx_zvalue_free(rxp, ere);
3581   if (!IS_RX(STKP-2)) zstring_release(&STKP[-2].vst);
3582   drop_n(3);
3583   push_int_val(nhits);
3584   if (field_num >= 0) fixup_fields(field_num);
3585 }
3586 
3587 // Initially set stackp_needmore at MIN_STACK_LEFT before limit.
3588 // When stackp > stackp_needmore, then expand and reset stackp_needmore
add_stack(struct zvalue ** stackp_needmore)3589 static void add_stack(struct zvalue **stackp_needmore)
3590 {
3591   int k = stkn(0);  // stack elements in use
3592   zlist_expand(&TT.stack);
3593   STKP = (struct zvalue *)TT.stack.base + k;
3594   *stackp_needmore = (struct zvalue *)TT.stack.limit - MIN_STACK_LEFT;
3595 }
3596 
// Limit val to the range lo..hi: gives lo when val < lo, hi when val > hi,
// and val otherwise. Arguments may be evaluated more than once.
#define CLAMP(val, lo, hi) \
  (((val) < (lo)) ? (lo) : (((val) > (hi)) ? (hi) : (val)))
3598 
3599 // Main loop of interpreter. Run this once for all BEGIN rules (which
3600 // have had their instructions chained in compile), all END rules (also
3601 // chained in compile), and once for each record of the data file(s).
interpx(int start,int * status)3602 static int interpx(int start, int *status)
3603 {
3604   int *ip = &ZCODE[start];
3605   int opcode, op2, k, r, nargs, nsubscrs, range_num, parmbase = 0;
3606   int field_num;
3607   double nleft, nright, d;
3608   double (*mathfunc[])(double) = {cos, sin, exp, log, sqrt, trunc};
3609   struct zvalue *v, vv,
3610         *stackp_needmore = (struct zvalue*)TT.stack.limit - MIN_STACK_LEFT;
3611   while ((opcode = *ip++)) {
3612 
3613     switch (opcode) {
3614       case opquit:
3615         return opquit;
3616 
3617       case tknot:
3618         (STKP)->num = ! get_set_logical();
3619         break;
3620 
3621       case opnotnot:
3622         get_set_logical();
3623         break;
3624 
3625       case opnegate:
3626         STKP->num = -to_num(STKP);
3627         break;
3628 
3629       case tkpow:         // FALLTHROUGH intentional here
3630       case tkmul:         // FALLTHROUGH intentional here
3631       case tkdiv:         // FALLTHROUGH intentional here
3632       case tkmod:         // FALLTHROUGH intentional here
3633       case tkplus:        // FALLTHROUGH intentional here
3634       case tkminus:
3635         nleft = to_num(STKP-1);
3636         nright = to_num(STKP);
3637         switch (opcode) {
3638           case tkpow: nleft = pow(nleft, nright); break;
3639           case tkmul: nleft *= nright; break;
3640           case tkdiv: nleft /= nright; break;
3641           case tkmod: nleft = fmod(nleft, nright); break;
3642           case tkplus: nleft += nright; break;
3643           case tkminus: nleft -= nright; break;
3644         }
3645         drop();
3646         STKP->num = nleft;
3647         break;
3648 
3649       // FIXME REDO REDO ?
3650       case tkcat:
3651         to_str(STKP-1);
3652         to_str(STKP);
3653         STKP[-1].vst = zstring_extend(STKP[-1].vst, STKP[0].vst);
3654         drop();
3655         break;
3656 
3657         // Comparisons (with the '<', "<=", "!=", "==", '>', and ">="
3658         // operators) shall be made numerically:
3659         // * if both operands are numeric,
3660         // * if one is numeric and the other has a string value that is a
3661         //   numeric string,
3662         // * if both have string values that are numeric strings, or
3663         // * if one is numeric and the other has the uninitialized value.
3664         //
3665         // Otherwise, operands shall be converted to strings as required and a
3666         // string comparison shall be made as follows:
3667         // * For the "!=" and "==" operators, the strings shall be compared to
3668         //   check if they are identical (not to check if they collate equally).
3669         // * For the other operators, the strings shall be compared using the
3670         //   locale-specific collation sequence.
3671         //
3672         // The value of the comparison expression shall be 1 if the relation is
3673         // true, or 0 if the relation is false.
3674       case tklt:          // FALLTHROUGH intentional here
3675       case tkle:          // FALLTHROUGH intentional here
3676       case tkne:          // FALLTHROUGH intentional here
3677       case tkeq:          // FALLTHROUGH intentional here
3678       case tkgt:          // FALLTHROUGH intentional here
3679       case tkge:
3680         ; int cmp = 31416;
3681 
3682         if (  (IS_NUM(&STKP[-1]) &&
3683               (STKP[0].flags & (ZF_NUM | ZF_NUMSTR) || !STKP[0].flags)) ||
3684               (IS_NUM(&STKP[0]) &&
3685               (STKP[-1].flags & (ZF_NUM | ZF_NUMSTR) || !STKP[-1].flags))) {
3686           switch (opcode) {
3687             case tklt: cmp = STKP[-1].num < STKP[0].num; break;
3688             case tkle: cmp = STKP[-1].num <= STKP[0].num; break;
3689             case tkne: cmp = STKP[-1].num != STKP[0].num; break;
3690             case tkeq: cmp = STKP[-1].num == STKP[0].num; break;
3691             case tkgt: cmp = STKP[-1].num > STKP[0].num; break;
3692             case tkge: cmp = STKP[-1].num >= STKP[0].num; break;
3693           }
3694         } else {
3695           cmp = strcmp(to_str(STKP-1)->vst->str, to_str(STKP)->vst->str);
3696           switch (opcode) {
3697             case tklt: cmp = cmp < 0; break;
3698             case tkle: cmp = cmp <= 0; break;
3699             case tkne: cmp = cmp != 0; break;
3700             case tkeq: cmp = cmp == 0; break;
3701             case tkgt: cmp = cmp > 0; break;
3702             case tkge: cmp = cmp >= 0; break;
3703           }
3704         }
3705         drop();
3706         drop();
3707         push_int_val(cmp);
3708         break;
3709 
3710       case opmatchrec:
3711         op2 = *ip++;
3712         int mret = match(&FIELD[0], &LITERAL[op2]);
3713         push_int_val(!mret);
3714         break;
3715 
3716       case tkmatchop:
3717       case tknotmatch:
3718         mret = match(STKP-1, STKP); // mret == 0 if match
3719         drop();
3720         drop();
3721         push_int_val(!mret == (opcode == tkmatchop));
3722         break;
3723 
3724       case tkpowasgn:     // FALLTHROUGH intentional here
3725       case tkmodasgn:     // FALLTHROUGH intentional here
3726       case tkmulasgn:     // FALLTHROUGH intentional here
3727       case tkdivasgn:     // FALLTHROUGH intentional here
3728       case tkaddasgn:     // FALLTHROUGH intentional here
3729       case tksubasgn:
3730         // Stack is: ... scalar_ref value_to_op_by
3731         // or ... subscript_val map_ref value_to_op_by
3732         // or ... fieldref value_to_op_by
3733         v = setup_lvalue(1, parmbase, &field_num);
3734         to_num(v);
3735         to_num(STKP);
3736         switch (opcode) {
3737           case tkpowasgn:
3738             // TODO
3739             v->num = pow(v->num, STKP->num);
3740             break;
3741           case tkmodasgn:
3742             // TODO
3743             v->num = fmod(v->num, STKP->num);
3744             break;
3745           case tkmulasgn:
3746             v->num *= STKP->num;
3747             break;
3748           case tkdivasgn:
3749             v->num /= STKP->num;
3750             break;
3751           case tkaddasgn:
3752             v->num += STKP->num;
3753             break;
3754           case tksubasgn:
3755             v->num -= STKP->num;
3756             break;
3757         }
3758 
3759         drop_n(2);
3760         v->flags = ZF_NUM;
3761         push_val(v);
3762         if (field_num >= 0) fixup_fields(field_num);
3763         break;
3764 
3765       case tkasgn:
3766         // Stack is: ... scalar_ref value_to_assign
3767         // or ... subscript_val map_ref value_to_assign
3768         // or ... fieldref value_to_assign
3769         v = setup_lvalue(1, parmbase, &field_num);
3770         force_maybemap_to_scalar(STKP);
3771         zvalue_copy(v, STKP);
3772         swap();
3773         drop();
3774         if (field_num >= 0) fixup_fields(field_num);
3775         break;
3776 
3777       case tkincr:        // FALLTHROUGH intentional here
3778       case tkdecr:        // FALLTHROUGH intentional here
3779       case oppreincr:     // FALLTHROUGH intentional here
3780       case oppredecr:
3781         // Stack is: ... scalar_ref
3782         // or ... subscript_val map_ref
3783         // or ... fieldnum fieldref
3784         v = setup_lvalue(0, parmbase, &field_num);
3785         to_num(v);
3786         switch (opcode) {
3787           case tkincr: case tkdecr:
3788             // Must be done in this order because push_val(v) may move v,
3789             // invalidating the pointer.
3790             v->num += (opcode == tkincr) ? 1 : -1;
3791             push_val(v);
3792             // Now reverse the incr/decr on the top TT.stack val.
3793             STKP->num -= (opcode == tkincr) ? 1 : -1;
3794             break;
3795           case oppreincr: case oppredecr:
3796             v->num += (opcode == oppreincr) ? 1 : -1;
3797             push_val(v);
3798             break;
3799         }
3800         swap();
3801         drop();
3802         if (field_num >= 0) fixup_fields(field_num);
3803         break;
3804 
3805       case tknumber:      // FALLTHROUGH intentional here
3806       case tkstring:      // FALLTHROUGH intentional here
3807       case tkregex:
3808         push_val(&LITERAL[*ip++]);
3809         break;
3810 
3811       case tkprint:
3812       case tkprintf:
3813         nargs = *ip++;
3814         int outmode = *ip++;
3815         struct zfile *outfp = TT.zstdout;
3816         switch (outmode) {
3817           case tkgt: outfp = setup_file(1, "w"); break;     // file
3818           case tkappend: outfp = setup_file(1, "a"); break; // file
3819           case tkpipe: outfp = setup_file(0, "w"); break;   // pipe
3820           default: nargs++; break;
3821         }
3822         nargs--;
3823         if (opcode == tkprintf) {
3824           varprint(fprintf, outfp->fp, nargs);
3825           drop_n(nargs);
3826           break;
3827         }
3828         if (!nargs) {
3829           fprintf(outfp->fp, "%s", to_str(&FIELD[0])->vst->str);
3830         } else {
3831           struct zvalue tempv = uninit_zvalue;
3832           zvalue_copy(&tempv, &STACK[OFS]);
3833           to_str(&tempv);
3834           for (int k = 0; k < nargs; k++) {
3835             if (k) fprintf(outfp->fp, "%s", tempv.vst->str);
3836             int sp = stkn(nargs - 1 - k);
3837             ////// FIXME refcnt -- prob. don't need to copy from TT.stack?
3838             v = &STACK[sp];
3839             to_str_fmt(v, OFMT);
3840             struct zstring *zs = v->vst;
3841             fprintf(outfp->fp, "%s", zs ? zs->str : "");
3842           }
3843           zvalue_release_zstring(&tempv);
3844           drop_n(nargs);
3845         }
3846         fputs(ENSURE_STR(&STACK[ORS])->vst->str, outfp->fp);
3847         break;
3848 
3849       case opdrop:
3850         drop();
3851         break;
3852 
3853       case opdrop_n:
3854         drop_n(*ip++);
3855         break;
3856 
3857         // Stack frame layout relative to parmbase:
3858 #define RETURN_VALUE    -4
3859 #define RETURN_ADDR     -3
3860 #define PREV_PARMBASE   -2
3861 #define ARG_CNT         -1
3862 #define FUNCTION_NUM    0
3863         // Actual args follow, starting at parmbase + 1
3864       case tkfunction:    // function definition
3865         op2 = *ip++;    // func table num
3866         struct functab_slot *pfdef = &FUNC_DEF[op2];
3867         struct zlist *loctab = &pfdef->function_locals;
3868         int nparms = zlist_len(loctab)-1;
3869 
3870         nargs = popnumval();
3871         int newparmbase = stkn(nargs);
3872         STACK[newparmbase + PREV_PARMBASE].num = parmbase;
3873         parmbase = newparmbase;
3874         for ( ;nargs > nparms; nargs--)
3875           drop();
3876         for ( ;nargs < nparms; nargs++) {
3877           // Push additional "args" that were not passed by the caller, to
3878           // match the formal parameters (parms) defined in the function
3879           // definition. In the local var table we may have the type as scalar
3880           // or map if it is used as such within the function. In that case we
3881           // init the pushed arg from the type of the locals table.
3882           // But if a var appears only as a bare arg in a function call it will
3883           // not be typed in the locals table. In that case we can only say it
3884           // "may be" a map, but we have to assume the possibility and attach a
3885           // map to the var. When/if the var is used as a map or scalar in the
3886           // called function it will be converted to a map or scalar as
3887           // required.
3888           // See force_maybemap_to_scalar().
3889           struct symtab_slot *q = &((struct symtab_slot *)loctab->base)[nargs+1];
3890           vv = (struct zvalue)ZVINIT(q->flags, 0, 0);
3891           if (vv.flags == 0) {
3892             zvalue_map_init(&vv);
3893             vv.flags = ZF_MAYBEMAP;
3894           } else if (IS_MAP(&vv)) {
3895             zvalue_map_init(&vv);
3896           } else {
3897             vv.flags = 0;
3898           }
3899           push_val(&vv);
3900         }
3901         break;
3902 
3903       case tkreturn:
3904         nparms = *ip++;
3905         nargs = STACK[parmbase+ARG_CNT].num;
3906         force_maybemap_to_scalar(STKP); // Unneeded?
3907         zvalue_copy(&STACK[parmbase+RETURN_VALUE], STKP);
3908         drop();
3909         // Remove the local args (not supplied by caller) from TT.stack, check to
3910         // release any map data created.
3911         while (stkn(0) > parmbase + nargs) {
3912           if ((STKP)->flags & ZF_ANYMAP) {
3913             zmap_delete_map_incl_slotdata((STKP)->map);
3914             xfree((STKP)->map);
3915           }
3916           drop();
3917         }
3918         while (stkn(0) > parmbase + RETURN_VALUE)
3919           drop();
3920         ip = &ZCODE[(int)STACK[parmbase+RETURN_ADDR].num];
3921         parmbase = STACK[parmbase+PREV_PARMBASE].num;
3922         break;
3923 
3924       case opprepcall:    // function call prep
3925         if (STKP > stackp_needmore) add_stack(&stackp_needmore);
3926         push_int_val(0);      // return value placeholder
3927         push_int_val(0);      // return addr
3928         push_int_val(0);      // parmbase
3929         push_int_val(0);      // arg count
3930         push_int_val(*ip++);  // function tbl ref
3931         break;
3932 
3933       case tkfunc:        // function call
3934         nargs = *ip++;
3935         newparmbase = stkn(nargs);
3936         STACK[newparmbase+RETURN_ADDR].num = ip - &ZCODE[0];
3937         STACK[newparmbase+ARG_CNT].num = nargs;
3938         push_int_val(nargs);      // FIXME TODO pass this in a zregister?
3939         ip = &ZCODE[FUNC_DEF[(int)STACK[newparmbase+FUNCTION_NUM].num].zcode_addr];
3940         break;
3941 
3942       case tkrbracket:    // concat multiple map subscripts
3943         nsubscrs = *ip++;
3944         while (--nsubscrs) {
3945           swap();
3946           to_str(STKP);
3947           push_val(&STACK[SUBSEP]);
3948           to_str(STKP);
3949           STKP[-1].vst = zstring_extend(STKP[-1].vst, STKP->vst);
3950           drop();
3951           swap();
3952           to_str(STKP);
3953           STKP[-1].vst = zstring_extend(STKP[-1].vst, STKP->vst);
3954           drop();
3955         }
3956         break;
3957 
3958       case opmapdelete:
3959       case tkdelete:
3960         k = STKP->num;
3961         if (k < 0) k = parmbase - k;    // loc of var on TT.stack
3962         v = &STACK[k];
3963         force_maybemap_to_map(v);
3964         if (opcode == opmapdelete) {
3965           zmap_delete_map(v->map);
3966         } else {
3967           drop();
3968           zmap_delete(v->map, to_str(STKP)->vst);
3969         }
3970         drop();
3971         break;
3972 
3973       case opmap:
3974         op2 = *ip++;
3975         k = op2 < 0 ? parmbase - op2 : op2;
3976         v = &STACK[k];
3977         force_maybemap_to_map(v);
3978         if (!IS_MAP(v)) FATAL("scalar in array context");
3979         v = get_map_val(v, STKP);
3980         drop();     // drop subscript
3981         push_val(v);
3982         break;
3983 
3984       case tkin:
3985         if (!(STKP->flags & ZF_ANYMAP)) FATAL("scalar in array context");
3986         v = zmap_find(STKP->map, to_str(STKP-1)->vst);
3987         drop();
3988         drop();
3989         push_int_val(v ? 1 : 0);
3990         break;
3991 
3992       case opmapiternext:
3993         op2 = *ip++;
3994         v = STKP-1;
3995         force_maybemap_to_map(v);
3996         if (!IS_MAP(v)) FATAL("scalar in array context");
3997         struct zmap *m = v->map;   // Need for MAPSLOT macro
3998         int zlen = zlist_len(&m->slot);
3999         int kk = STKP->num + 1;
4000         while (kk < zlen && !(MAPSLOT[kk].key)) // skip deleted slots
4001           kk++;
4002         STKP->num = kk; // save index for next iteration
4003         if (kk < zlen) {
4004           struct zvalue *var = setup_lvalue(2, parmbase, &field_num);
4005           var->flags = ZF_STR;
4006           zstring_release(&var->vst);
4007           var->vst = MAPSLOT[kk].key;
4008           zstring_incr_refcnt(var->vst);
4009           ip += op2;
4010         }
4011         break;
4012 
4013       case tkvar:
4014         op2 = *ip++;
4015         k = op2 < 0 ? parmbase - op2 : op2;
4016         v = &STACK[k];
4017         push_val(v);
4018         break;
4019 
4020       case tkfield:
4021         // tkfield op has "dummy" 2nd word so that convert_push_to_reference(void)
4022         // can find either tkfield or tkvar at same place (ZCODE[TT.zcode_last-1]).
4023         ip++; // skip dummy "operand" instruction field
4024         push_field((int)(to_num(STKP)));
4025 
4026         swap();
4027         drop();
4028         break;
4029 
4030       case oppush:
4031         push_int_val(*ip++);
4032         break;
4033 
4034       case tkand:
4035         op2 = *ip++;
4036         if (get_set_logical()) drop();
4037         else ip += op2;
4038         break;
4039 
4040       case tkor:
4041         op2 = *ip++;
4042         if (!get_set_logical()) drop();
4043         else ip += op2;
4044         break;
4045 
4046       case tkwhile:
4047         (STKP)->num = ! get_set_logical();
4048         ATTR_FALLTHROUGH_INTENDED;
4049         // FALLTHROUGH to tkternif
4050       case tkif:
4051         // FALLTHROUGH to tkternif
4052       case tkternif:
4053         op2 = *ip++;
4054         int t = get_set_logical();  // FIXME only need to get, not set
4055         drop();
4056         if (!t) ip += op2;
4057         break;
4058 
4059       case tkelse:        // FALLTHROUGH intentional here
4060       case tkternelse:    // FALLTHROUGH intentional here
4061       case tkbreak:       // FALLTHROUGH intentional here
4062       case tkcontinue:    // FALLTHROUGH intentional here
4063       case opjump:
4064         op2 = *ip++;
4065         ip += op2;
4066         break;
4067 
4068       case opvarref:
4069         op2 = *ip++;
4070         vv = (struct zvalue)ZVINIT(ZF_REF, op2, 0);
4071         push_val(&vv);
4072         break;
4073 
4074       case opmapref:
4075         op2 = *ip++;
4076         vv = (struct zvalue)ZVINIT(ZF_MAPREF, op2, 0);
4077         push_val(&vv);
4078         break;
4079 
4080       case opfldref:
4081         to_num(STKP);
4082         (STKP)->flags |= ZF_FIELDREF;
4083         ip++; // skip dummy "operand" instruction field
4084         break;
4085 
4086       case opprintrec:
4087         puts(to_str(&FIELD[0])->vst->str);
4088         break;
4089 
4090       case oprange1:
4091         range_num = *ip++;
4092         op2 = *ip++;
4093         if (TT.range_sw[range_num]) ip += op2;
4094         break;
4095 
4096       case oprange2:
4097         range_num = *ip++;
4098         op2 = *ip++;
4099         t = get_set_logical();  // FIXME only need to get, not set
4100         drop();
4101         if (t) TT.range_sw[range_num] = 1;
4102         else ip += op2;
4103         break;
4104 
4105       case oprange3:
4106         range_num = *ip++;
4107         t = get_set_logical();  // FIXME only need to get, not set
4108         drop();
4109         if (t) TT.range_sw[range_num] = 0;
4110         break;
4111 
4112       case tkexit:
4113         r = popnumval();
4114         if (r != NO_EXIT_STATUS) *status = (int)r & 255;
4115         // TODO FIXME do we need NO_EXIT_STATUS at all? Just use 0?
4116         ATTR_FALLTHROUGH_INTENDED;
4117       case tknext:
4118       case tknextfile:
4119         return opcode;
4120 
4121       case tkgetline:
4122         nargs = *ip++;
4123         int source = *ip++;
4124         // TT.stack is:
4125         // if tkgetline 0 tkeof:   (nothing stacked; plain getline)
4126         // if tkgetline 1 tkeof:   (lvalue)
4127         // if tkgetline 1 tklt:    (filename_string)
4128         // if tkgetline 2 tklt:    (lvalue) (filename_string)
4129         // if tkgetline 1 tkpipe:  (pipe_command_string)
4130         // if tkgetline 2 tkpipe:  (pipe_command_string) (lvalue)
4131         // effect is to set:
4132         // if tkgetline 0 tkeof:   $0 NF NR FNR
4133         // if tkgetline 1 tkeof:   var NR FNR
4134         // if tkgetline 1 tklt:    $0 NF
4135         // if tkgetline 2 tklt:    var
4136         // if tkgetline 1 tkpipe:  $0 NF
4137         // if tkgetline 2 tkpipe:  var
4138         // Ensure pipe cmd on top
4139         if (nargs == 2 && source == tkpipe) swap();
4140         struct zfile *zfp = 0;
4141         if (source == tklt || source == tkpipe) {
4142           zfp = setup_file(source == tklt, "r");
4143           nargs--;
4144         }
4145         // now cases are:
4146         // nargs source  TT.stack
4147         //  0 tkeof:   (nothing; plain getline) from current data file
4148         //  1 tkeof:   (lvalue)  from current data file
4149         //  0 tklt:    (nothing) from named file in 'stream'
4150         //  1 tklt:    (lvalue)  from  named file in 'stream'
4151         //  0 tkpipe:  (nothing) from piped command in 'stream'
4152         //  1 tkpipe:  (lvalue)  from piped command in 'stream'
4153         v = nargs ? setup_lvalue(0, parmbase, &field_num) : 0;
4154         if (v) drop();
4155         // source is tkeof (no pipe/file), tklt (file), or tkpipe (pipe)
4156         // stream is name of file or pipe
4157         // v is NULL or an lvalue ref
4158         if (zfp != badfile) push_int_val(awk_getline(source, zfp, v));
4159         else push_int_val(-1);
4160 
4161         // fake return value for now
4162         break;
4163 
4164         ////// builtin functions ///////
4165 
4166       case tksplit:
4167         nargs = *ip++;
4168         if (nargs == 2) push_val(&STACK[FS]);
4169         struct zstring *s = to_str(STKP-2)->vst;
4170         force_maybemap_to_map(STKP-1);
4171         struct zvalue *a = STKP-1;
4172         struct zvalue *fs = STKP;
4173         zmap_delete_map(a->map);
4174         k = split(s, a, fs);
4175         drop_n(3);
4176         push_int_val(k);
4177         break;
4178 
4179       case tkmatch:
4180         nargs = *ip++;
4181         if (!IS_RX(STKP)) to_str(STKP);
4182         regex_t rx_pat, *rxp = &rx_pat;
4183         rx_zvalue_compile(&rxp, STKP);
4184         regoff_t rso = 0, reo = 0;  // shut up warning (may be uninit)
4185         k = rx_find(rxp, to_str(STKP-1)->vst->str, &rso, &reo, 0);
4186         rx_zvalue_free(rxp, STKP);
4187         // Force these to num before setting.
4188         to_num(&STACK[RSTART]);
4189         to_num(&STACK[RLENGTH]);
4190         if (k) STACK[RSTART].num = 0, STACK[RLENGTH].num = -1;
4191         else {
4192           reo = utf8cnt(STKP[-1].vst->str, reo);
4193           rso = utf8cnt(STKP[-1].vst->str, rso);
4194           STACK[RSTART].num = rso + 1, STACK[RLENGTH].num = reo - rso;
4195         }
4196         drop();
4197         drop();
4198         push_int_val(k ? 0 : rso + 1);
4199         break;
4200 
4201       case tksub:
4202       case tkgsub:
4203         gsub(opcode, *ip++, parmbase);  // tksub/tkgsub, args
4204         break;
4205 
4206       case tksubstr:
4207         nargs = *ip++;
4208         struct zstring *zz = to_str(STKP - nargs + 1)->vst;
4209         int nchars = utf8cnt(zz->str, zz->size);  // number of utf8 codepoints
4210         // Offset of start of string (in chars not bytes); convert 1-based to 0-based
4211         ssize_t mm = CLAMP(trunc(to_num(STKP - nargs + 2)) - 1, 0, nchars);
4212         ssize_t nn = nchars - mm;   // max possible substring length (chars)
4213         if (nargs == 3) nn = CLAMP(trunc(to_num(STKP)), 0, nn);
4214         mm = bytesinutf8(zz->str, zz->size, mm);
4215         nn = bytesinutf8(zz->str + mm, zz->size - mm, nn);
4216         struct zstring *zzz = new_zstring(zz->str + mm, nn);
4217         zstring_release(&(STKP - nargs + 1)->vst);
4218         (STKP - nargs + 1)->vst = zzz;
4219         drop_n(nargs - 1);
4220         break;
4221 
4222       case tkindex:
4223         nargs = *ip++;
4224         char *s1 = to_str(STKP-1)->vst->str;
4225         char *s3 = strstr(s1, to_str(STKP)->vst->str);
4226         ptrdiff_t offs = s3 ? utf8cnt(s1, s3 - s1) + 1 : 0;
4227         drop();
4228         drop();
4229         push_int_val(offs);
4230         break;
4231 
4232       case tkband:
4233       case tkbor:
4234       case tkbxor:
4235       case tklshift:
4236       case tkrshift:
4237         ; size_t acc = to_num(STKP);
4238         nargs = *ip++;
4239         for (int i = 1; i < nargs; i++) switch (opcode) {
4240           case tkband: acc &= (size_t)to_num(STKP-i); break;
4241           case tkbor:  acc |= (size_t)to_num(STKP-i); break;
4242           case tkbxor: acc ^= (size_t)to_num(STKP-i); break;
4243           case tklshift: acc = (size_t)to_num(STKP-i) << acc; break;
4244           case tkrshift: acc = (size_t)to_num(STKP-i) >> acc; break;
4245         }
4246         drop_n(nargs);
4247         push_int_val(acc);
4248         break;
4249 
4250       case tktolower:
4251       case tktoupper:
4252         nargs = *ip++;
4253         struct zstring *z = to_str(STKP)->vst;
4254         unsigned zzlen = z->size + 4; // Allow for expansion
4255         zz = zstring_update(0, zzlen, "", 0);
4256         char *p = z->str, *e = z->str + z->size, *q = zz->str;
4257         // Similar logic to toybox strlower(), but fixed.
4258         while (p < e) {
4259           unsigned wch;
4260           int len = utf8towc(&wch, p, e-p);
4261           if (len < 1) {  // nul byte, error, or truncated code
4262             *q++ = *p++;
4263             continue;
4264           }
4265           p += len;
4266           wch = (opcode == tktolower ? towlower : towupper)(wch);
4267           len = wctoutf8(q, wch);
4268           q += len;
4269           // Need realloc here if overflow possible
4270           if ((len = q - zz->str) + 4 < (int)zzlen) continue;
4271           zz = zstring_update(zz, zzlen = len + 16, "", 0);
4272           q = zz->str + len;
4273         }
4274         *q = 0;
4275         zz->size = q - zz->str;
4276         zstring_release(&z);
4277         STKP->vst = zz;
4278         break;
4279 
4280       case tklength:
4281         nargs = *ip++;
4282         v = nargs ? STKP : &FIELD[0];
4283         force_maybemap_to_map(v);
4284         if (IS_MAP(v)) k = v->map->count - v->map->deleted;
4285         else {
4286           to_str(v);
4287           k = utf8cnt(v->vst->str, v->vst->size);
4288         }
4289         if (nargs) drop();
4290         push_int_val(k);
4291         break;
4292 
4293       case tksystem:
4294         nargs = *ip++;
4295         fflush(stdout);
4296         fflush(stderr);
4297         r = system(to_str(STKP)->vst->str);
4298 #ifdef WEXITSTATUS
4299         // WEXITSTATUS is in sys/wait.h, but I'm not including that.
4300         // It seems to also be in stdlib.h in gcc and musl-gcc.
4301         // No idea how portable this is!
4302         if (WIFEXITED(r)) r = WEXITSTATUS(r);
4303 #endif
4304         drop();
4305         push_int_val(r);
4306         break;
4307 
4308       case tkfflush:
4309         nargs = *ip++;
4310         r = fflush_file(nargs);
4311         if (nargs) drop();
4312         push_int_val(r);
4313         break;
4314 
4315       case tkclose:
4316         nargs = *ip++;
4317         r = close_file(to_str(STKP)->vst->str);
4318         drop();
4319         push_int_val(r);
4320         break;
4321 
4322       case tksprintf:
4323         nargs = *ip++;
4324         zstring_release(&TT.rgl.zspr);
4325         TT.rgl.zspr = new_zstring("", 0);
4326         varprint(fsprintf, 0, nargs);
4327         drop_n(nargs);
4328         vv = (struct zvalue)ZVINIT(ZF_STR, 0, TT.rgl.zspr);
4329         push_val(&vv);
4330         break;
4331 
4332       // Math builtins -- move here (per Oliver Webb suggestion)
4333       case tkatan2:
4334         nargs = *ip++;
4335         d = atan2(to_num(STKP-1), to_num(STKP));
4336         drop();
4337         STKP->num = d;
4338         break;
4339       case tkrand:
4340         nargs = *ip++;
4341         push_int_val(0);
4342         // Get all 53 mantissa bits in play:
4343         // (upper 26 bits * 2^27 + upper 27 bits) / 2^53
4344         STKP->num =
4345           ((random() >> 5) * 134217728.0 + (random() >> 4)) / 9007199254740992.0;
4346         break;
4347       case tksrand:
4348         nargs = *ip++;
4349         if (nargs == 1) {
4350           STKP->num = seedrand(to_num(STKP));
4351         } else push_int_val(seedrand(time(0)));
4352         break;
4353       case tkcos: case tksin: case tkexp: case tklog: case tksqrt: case tkint:
4354         nargs = *ip++;
4355         STKP->num = mathfunc[opcode-tkcos](to_num(STKP));
4356         break;
4357 
4358       default:
4359         // This should never happen:
4360         error_exit("!!! Unimplemented opcode %d", opcode);
4361     }
4362   }
4363   return opquit;
4364 }
4365 
4366 // interp() wraps the main interpreter loop interpx(). The main purpose
4367 // is to allow the TT.stack to be readjusted after an 'exit' from a function.
4368 // Also catches errors, as the normal operation should leave the TT.stack
4369 // depth unchanged after each run through the rules.
interp(int start,int * status)4370 static int interp(int start, int *status)
4371 {
4372   int stkptrbefore = stkn(0);
4373   int r = interpx(start, status);
4374   // If exit from function, TT.stack will be loaded with args etc. Clean it.
4375   if (r == tkexit) {
4376     // TODO FIXME is this safe? Just remove extra entries?
4377     STKP = &STACK[stkptrbefore];
4378   }
4379   if (stkn(0) - stkptrbefore)
4380     error_exit("!!AWK BUG stack pointer offset: %d", stkn(0) - stkptrbefore);
4381   return r;
4382 }
4383 
insert_argv_map(struct zvalue * map,int key,char * value)4384 static void insert_argv_map(struct zvalue *map, int key, char *value)
4385 {
4386   struct zvalue zkey = ZVINIT(ZF_STR, 0, num_to_zstring(key, ENSURE_STR(&STACK[CONVFMT])->vst->str));
4387   struct zvalue *v = get_map_val(map, &zkey);
4388   zvalue_release_zstring(&zkey);
4389   zvalue_release_zstring(v);
4390   *v = new_str_val(value);
4391   check_numeric_string(v);
4392 }
4393 
init_globals(int optind,int argc,char ** argv,char * sepstring,struct arg_list * assign_args)4394 static void init_globals(int optind, int argc, char **argv, char *sepstring,
4395     struct arg_list *assign_args)
4396 {
4397   // Global variables reside at the bottom of the TT.stack. Start with the awk
4398   // "special variables":  ARGC, ARGV, CONVFMT, ENVIRON, FILENAME, FNR, FS, NF,
4399   // NR, OFMT, OFS, ORS, RLENGTH, RS, RSTART, SUBSEP
4400 
4401   STACK[CONVFMT] = new_str_val("%.6g");
4402   // Init ENVIRON map.
4403   struct zvalue m = ZVINIT(ZF_MAP, 0, 0);
4404   zvalue_map_init(&m);
4405   STACK[ENVIRON] = m;
4406   for (char **pkey = environ; *pkey; pkey++) {
4407     char *pval = strchr(*pkey, '=');
4408     if (!pval) continue;
4409     struct zvalue zkey = ZVINIT(ZF_STR, 0, new_zstring(*pkey, pval - *pkey));
4410     struct zvalue *v = get_map_val(&m, &zkey);
4411     zstring_release(&zkey.vst);
4412     if (v->vst) FFATAL("env var dup? (%s)", pkey);
4413     *v = new_str_val(++pval);    // FIXME refcnt
4414     check_numeric_string(v);
4415   }
4416 
4417   // Init ARGV map.
4418   m = (struct zvalue)ZVINIT(ZF_MAP, 0, 0);
4419   zvalue_map_init(&m);
4420   STACK[ARGV] = m;
4421   insert_argv_map(&m, 0, TT.progname);
4422   int nargc = 1;
4423   for (int k = optind; k < argc; k++) {
4424     insert_argv_map(&m, nargc, argv[k]);
4425     nargc++;
4426   }
4427 
4428   // Init rest of the awk special variables.
4429   STACK[ARGC] = (struct zvalue)ZVINIT(ZF_NUM, nargc, 0);
4430   STACK[FILENAME] = new_str_val("");
4431   STACK[FNR] = (struct zvalue)ZVINIT(ZF_NUM, 0, 0);
4432   STACK[FS] = new_str_val(sepstring);
4433   STACK[NF] = (struct zvalue)ZVINIT(ZF_NUM, 0, 0);
4434   STACK[NR] = (struct zvalue)ZVINIT(ZF_NUM, 0, 0);
4435   STACK[OFMT] = new_str_val("%.6g");
4436   STACK[OFS] = new_str_val(" ");
4437   STACK[ORS] = new_str_val("\n");
4438   STACK[RLENGTH] = (struct zvalue)ZVINIT(ZF_NUM, 0, 0);
4439   STACK[RS] = new_str_val("\n");
4440   STACK[RSTART] = (struct zvalue)ZVINIT(ZF_NUM, 0, 0);
4441   STACK[SUBSEP] = new_str_val("\034");
4442 
4443   // Init program globals.
4444   //
4445   // Push global variables on the TT.stack at offsets matching their index in the
4446   // global var table.  In the global var table we may have the type as scalar
4447   // or map if it is used as such in the program. In that case we init the
4448   // pushed arg from the type of the globals table.
4449   // But if a global var appears only as a bare arg in a function call it will
4450   // not be typed in the globals table. In that case we can only say it "may be"
4451   // a map, but we have to assume the possibility and attach a map to the
4452   // var. When/if the var is used as a map or scalar in the called function it
4453   // will be converted to a map or scalar as required.
4454   // See force_maybemap_to_scalar(), and the similar comment in
4455   // 'case tkfunction:' above.
4456   //
4457   int gstx, len = zlist_len(&TT.globals_table);
4458   for (gstx = TT.spec_var_limit; gstx < len; gstx++) {
4459     struct symtab_slot gs = GLOBAL[gstx];
4460     struct zvalue v = ZVINIT(gs.flags, 0, 0);
4461     if (v.flags == 0) {
4462       zvalue_map_init(&v);
4463       v.flags = ZF_MAYBEMAP;
4464     } else if (IS_MAP(&v)) {
4465       zvalue_map_init(&v);
4466     } else {
4467       // Set SCALAR flag 0 to create "uninitialized" scalar.
4468       v.flags = 0;
4469     }
4470     push_val(&v);
4471   }
4472 
4473   // Init -v assignment options.
4474   for (struct arg_list *p = assign_args; p; p = p->next) {
4475     char *asgn = p->arg;
4476     char *val = strchr(asgn, '=');
4477     if (!val) error_exit("bad -v assignment format");
4478     *val++ = 0;
4479     assign_global(asgn, val);
4480   }
4481 
4482   TT.rgl.cur_arg = new_str_val("<cmdline>");
4483   uninit_string_zvalue = new_str_val("");
4484   zvalue_copy(&FIELD[0], &uninit_string_zvalue);
4485 }
4486 
run_files(int * status)4487 static void run_files(int *status)
4488 {
4489   int r = 0;
4490   while (r != tkexit && *status < 0 && getrec_f0() >= 0)
4491     if ((r = interp(TT.cgl.first_recrule, status)) == tknextfile) next_fp();
4492 }
4493 
free_literal_regex(void)4494 static void free_literal_regex(void)
4495 {
4496   int len = zlist_len(&TT.literals);
4497   for (int k = 1; k < len; k++)
4498     if (IS_RX(&LITERAL[k])) regfree(LITERAL[k].rx);
4499 }
4500 
run(int optind,int argc,char ** argv,char * sepstring,struct arg_list * assign_args)4501 static void run(int optind, int argc, char **argv, char *sepstring,
4502     struct arg_list *assign_args)
4503 {
4504   char *printf_fmt_rx = "%[-+ #0']*([*]|[0-9]*)([.]([*]|[0-9]*))?l?[aAdiouxXfFeEgGcs%]";
4505   init_globals(optind, argc, argv, sepstring, assign_args);
4506   TT.cfile = xzalloc(sizeof(struct zfile));
4507   xregcomp(&TT.rx_default, "[ \t\n]+", REG_EXTENDED);
4508   xregcomp(&TT.rx_last, "[ \t\n]+", REG_EXTENDED);
4509   xregcomp(&TT.rx_printf_fmt, printf_fmt_rx, REG_EXTENDED);
4510   new_file("-", stdin, 'r', 1, 1);
4511   new_file("/dev/stdin", stdin, 'r', 1, 1);
4512   new_file("/dev/stdout", stdout, 'w', 1, 1);
4513   TT.zstdout = TT.zfiles;
4514   new_file("/dev/stderr", stderr, 'w', 1, 1);
4515   seedrand(1);
4516   int status = -1, r = 0;
4517   if (TT.cgl.first_begin) r = interp(TT.cgl.first_begin, &status);
4518   if (r != tkexit)
4519     if (TT.cgl.first_recrule) run_files(&status);
4520   if (TT.cgl.first_end) r = interp(TT.cgl.first_end, &status);
4521   regfree(&TT.rx_printf_fmt);
4522   regfree(&TT.rx_default);
4523   regfree(&TT.rx_last);
4524   free_literal_regex();
4525   close_file(0);    // close all files
4526   if (status >= 0) awk_exit(status);
4527 }
4528 
4529 ////////////////////
4530 //// main
4531 ////////////////////
4532 
progfiles_init(char * progstring,struct arg_list * prog_args)4533 static void progfiles_init(char *progstring, struct arg_list *prog_args)
4534 {
4535   TT.scs->p = progstring ? progstring : "  " + 2;
4536   TT.scs->progstring = progstring;
4537   TT.scs->prog_args = prog_args;
4538   TT.scs->filename = "(cmdline)";
4539   TT.scs->maxtok = 256;
4540   TT.scs->tokstr = xzalloc(TT.scs->maxtok);
4541 }
4542 
awk(char * sepstring,char * progstring,struct arg_list * prog_args,struct arg_list * assign_args,int optind,int argc,char ** argv,int opt_run_prog)4543 static int awk(char *sepstring, char *progstring, struct arg_list *prog_args,
4544     struct arg_list *assign_args, int optind, int argc, char **argv,
4545     int opt_run_prog)
4546 {
4547   struct scanner_state ss = {0};
4548   TT.scs = &ss;
4549 
4550   setlocale(LC_NUMERIC, "");
4551   progfiles_init(progstring, prog_args);
4552   compile();
4553 
4554   if (TT.cgl.compile_error_count)
4555     error_exit("%d syntax error(s)", TT.cgl.compile_error_count);
4556   else {
4557     if (opt_run_prog)
4558       run(optind, argc, argv, sepstring, assign_args);
4559   }
4560 
4561   return TT.cgl.compile_error_count;
4562 }
4563 
awk_main(void)4564 void awk_main(void)
4565 {
4566   char *sepstring = TT.F ? escape_str(TT.F, 0) : " ";
4567   int optind = 0;
4568   char *progstring = NULL;
4569 
4570   TT.pbuf = toybuf;
4571   toys.exitval = 2;
4572   if (!TT.f) {
4573     if (*toys.optargs) progstring = toys.optargs[optind++];
4574     else error_exit("No program string\n");
4575   }
4576   TT.progname = toys.which->name;
4577   toys.exitval = awk(sepstring, progstring, TT.f, TT.v,
4578       optind, toys.optc, toys.optargs, !FLAG(c));
4579 }
4580