ota: Merge one true awk 20240422 (a3b68e649d2d)

Apr 22, 2024:
	Fixed regex engine gototab reallocation issue that was
	introduced during the Nov 24 rewrite. Thanks to Arnold Robbins.
	Fixed a scan bug in split in the case the separator is a single
	character. Thanks to Oguz Ismail for spotting the issue.

Mar 10, 2024:
	Fixed use-after-free bug in fnematch due to adjbuf invalidating
	the pointers to buf. Thanks to GitHub user caffe3 for spotting
	the issue and providing a fix, and to Miguel Pineiro Jr.
	for the alternative fix.
	MAX_UTF_BYTES in fnematch has been replaced with awk_mb_cur_max.
	Thanks to Miguel Pineiro Jr.

Sponsored by:		Netflix

(cherry picked from commit 1023317ac491090f8d84a62999ffc303cf88528c)
This commit is contained in:
Warner Losh 2024-05-04 15:50:33 -06:00
parent f65f02ccf2
commit 02cae85fdb
14 changed files with 44 additions and 395 deletions

View file

@ -47,30 +47,6 @@
* test/T.lilly: Remove gawk warnings from output, improves
portability.
2019-10-17 Arnold D. Robbins <arnold@skeeve.com>
Pull in systime() and strftime() from the NetBSD awk.
* awk.1: Document the functions.
* run.c (bltin): Implement the functions.
* awk.h: Add defines for systime and strftime.
* lex.c: Add support for systime and strftime.
2019-10-07 Arnold D. Robbins <arnold@skeeve.com>
Integrate features from different *BSD versions of awk.
Gensub support from NetBSD. Bitwise functions from OpenBSD.
* awk.h: Add defines for and, or, xor, compl, lshift and rshift.
* awkgram.y: Add support for gensub.
* maketab.c: Ditto.
* lex.c: Add support for gensub and bitwise functions.
* parse.c (node5, op5): New functions.
* proto.h (node5, op5): New declarations.
* run.c (bltin): Implement the bitwise functions.
(gensub): New function.
* awk.1: Document additional functions.
2019-10-07 Arnold D. Robbins <arnold@skeeve.com>
* b.c (fnematch): Change type of pbuf from unsigned char to char.

View file

@ -25,6 +25,20 @@ THIS SOFTWARE.
This file lists all bug fixes, changes, etc., made since the
second edition of the AWK book was published in September 2023.
Apr 22, 2024:
Fixed regex engine gototab reallocation issue that was
introduced during the Nov 24 rewrite. Thanks to Arnold Robbins.
Fixed a scan bug in split in the case the separator is a single
character. Thanks to Oguz Ismail for spotting the issue.
Mar 10, 2024:
Fixed use-after-free bug in fnematch due to adjbuf invalidating
the pointers to buf. Thanks to GitHub user caffe3 for spotting
the issue and providing a fix, and to Miguel Pineiro Jr.
for the alternative fix.
MAX_UTF_BYTES in fnematch has been replaced with awk_mb_cur_max.
Thanks to Miguel Pineiro Jr.
Jan 22, 2024:
Restore the ability to compile with g++. Thanks to
Arnold Robbins.

View file

@ -305,25 +305,6 @@ and
.B gsub
return the number of replacements.
.TP
\fBgensub(\fIpat\fB, \fIrepl\fB, \fIhow\fR [\fB, \fItarget\fR]\fB)\fR
replaces instances of
.I pat
in
.I target
with
.IR repl .
If
.I how
is \fB"g"\fR or \fB"G"\fR, do so globally. Otherwise,
.I how
is a number indicating which occurrence to replace. If no
.IR target ,
use
.BR $0 .
Return the resulting string;
.I target
is not modified.
.TP
.BI sprintf( fmt , " expr" , " ...\fB)
the string resulting from formatting
.I expr ...
@ -332,28 +313,6 @@ according to the
format
.IR fmt .
.TP
.B systime()
returns the current date and time as a standard
``seconds since the epoch'' value.
.TP
.BI strftime( fmt ", " timestamp\^ )
formats
.I timestamp
(a value in seconds since the epoch)
according to
.IR fmt ,
which is a format string as supported by
.IR strftime (3).
Both
.I timestamp
and
.I fmt
may be omitted; if no
.IR timestamp ,
the current time of day is used, and if no
.IR fmt ,
a default format of \fB"%a %b %e %H:%M:%S %Z %Y"\fR is used.
.TP
.BI system( cmd )
executes
.I cmd
@ -413,17 +372,6 @@ In all cases,
returns 1 for a successful input,
0 for end of file, and \-1 for an error.
.PP
The functions
.BR compl ,
.BR and ,
.BR or ,
.BR xor ,
.BR lshift ,
and
.B rshift
perform the corresponding bitwise operations on their
operands, which are first truncated to integer.
.PP
Patterns are arbitrary Boolean combinations
(with
.BR "! || &&" )

View file

@ -154,14 +154,6 @@ extern Cell *symtabloc; /* SYMTAB */
#define FTOUPPER 12
#define FTOLOWER 13
#define FFLUSH 14
#define FAND 15
#define FFOR 16
#define FXOR 17
#define FCOMPL 18
#define FLSHIFT 19
#define FRSHIFT 20
#define FSYSTIME 21
#define FSTRFTIME 22
/* Node: parse tree is made of nodes, with Cell's at bottom */

View file

@ -53,7 +53,7 @@ Node *arglist = 0; /* list of args for current function */
%token <i> FINAL DOT ALL CCL NCCL CHAR OR STAR QUEST PLUS EMPTYRE ZERO
%token <i> AND BOR APPEND EQ GE GT LE LT NE IN
%token <i> ARG BLTIN BREAK CLOSE CONTINUE DELETE DO EXIT FOR FUNC
%token <i> GENSUB SUB GSUB IF INDEX LSUBSTR MATCHFCN NEXT NEXTFILE
%token <i> SUB GSUB IF INDEX LSUBSTR MATCHFCN NEXT NEXTFILE
%token <i> ADD MINUS MULT DIVIDE MOD
%token <i> ASSIGN ASGNOP ADDEQ SUBEQ MULTEQ DIVEQ MODEQ POWEQ
%token <i> PRINT PRINTF SPRINTF
@ -377,24 +377,6 @@ term:
| INCR var { $$ = op1(PREINCR, $2); }
| var DECR { $$ = op1(POSTDECR, $1); }
| var INCR { $$ = op1(POSTINCR, $1); }
| GENSUB '(' reg_expr comma pattern comma pattern ')'
{ $$ = op5(GENSUB, NIL, (Node*)makedfa($3, 1), $5, $7, rectonode()); }
| GENSUB '(' pattern comma pattern comma pattern ')'
{ if (constnode($3)) {
$$ = op5(GENSUB, NIL, (Node *)makedfa(strnode($3), 1), $5, $7, rectonode());
free($3);
} else
$$ = op5(GENSUB, (Node *)1, $3, $5, $7, rectonode());
}
| GENSUB '(' reg_expr comma pattern comma pattern comma pattern ')'
{ $$ = op5(GENSUB, NIL, (Node*)makedfa($3, 1), $5, $7, $9); }
| GENSUB '(' pattern comma pattern comma pattern comma pattern ')'
{ if (constnode($3)) {
$$ = op5(GENSUB, NIL, (Node *)makedfa(strnode($3),1), $5,$7,$9);
free($3);
} else
$$ = op5(GENSUB, (Node *)1, $3, $5, $7, $9);
}
| GETLINE var LT term { $$ = op3(GETLINE, $2, itonp($3), $4); }
| GETLINE LT term { $$ = op3(GETLINE, NIL, itonp($2), $3); }
| GETLINE var { $$ = op3(GETLINE, $2, NIL, NIL); }

View file

@ -651,8 +651,8 @@ static int set_gototab(fa *f, int state, int ch, int val) /* hide gototab implem
if (tab->inuse + 1 >= tab->allocated)
resize_gototab(f, state);
f->gototab[state].entries[f->gototab[state].inuse-1].ch = ch;
f->gototab[state].entries[f->gototab[state].inuse-1].state = val;
f->gototab[state].entries[f->gototab[state].inuse].ch = ch;
f->gototab[state].entries[f->gototab[state].inuse].state = val;
f->gototab[state].inuse++;
return val;
} else {
@ -677,9 +677,9 @@ static int set_gototab(fa *f, int state, int ch, int val) /* hide gototab implem
gtt *tab = & f->gototab[state];
if (tab->inuse + 1 >= tab->allocated)
resize_gototab(f, state);
++tab->inuse;
f->gototab[state].entries[tab->inuse].ch = ch;
f->gototab[state].entries[tab->inuse].state = val;
++tab->inuse;
qsort(f->gototab[state].entries,
f->gototab[state].inuse, sizeof(gtte), entry_cmp);
@ -830,8 +830,6 @@ int nematch(fa *f, const char *p0) /* non-empty match, for sub */
}
#define MAX_UTF_BYTES 4 // UTF-8 is up to 4 bytes long
/*
* NAME
* fnematch
@ -868,16 +866,28 @@ bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
do {
/*
* Call u8_rune with at least MAX_UTF_BYTES ahead in
* Call u8_rune with at least awk_mb_cur_max ahead in
* the buffer until EOF interferes.
*/
if (k - j < MAX_UTF_BYTES) {
if (k + MAX_UTF_BYTES > buf + bufsize) {
if (k - j < awk_mb_cur_max) {
if (k + awk_mb_cur_max > buf + bufsize) {
char *obuf = buf;
adjbuf((char **) &buf, &bufsize,
bufsize + MAX_UTF_BYTES,
bufsize + awk_mb_cur_max,
quantum, 0, "fnematch");
/* buf resized, maybe moved. update pointers */
*pbufsize = bufsize;
if (obuf != buf) {
i = buf + (i - obuf);
j = buf + (j - obuf);
k = buf + (k - obuf);
*pbuf = buf;
if (patlen)
patbeg = buf + (patbeg - obuf);
}
}
for (n = MAX_UTF_BYTES ; n > 0; n--) {
for (n = awk_mb_cur_max ; n > 0; n--) {
*k++ = (c = getc(f)) != EOF ? c : 0;
if (c == EOF) {
if (ferror(f))
@ -914,10 +924,6 @@ bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
s = 2;
} while (1);
/* adjbuf() may have relocated a resized buffer. Inform the world. */
*pbuf = buf;
*pbufsize = bufsize;
if (patlen) {
/*
* Under no circumstances is the last character fed to

View file

@ -27,6 +27,6 @@ do
then
rm -f $OUT
else
echo '++++ $i failed!'
echo "+++ $i failed!"
fi
done

View file

@ -0,0 +1,3 @@
normal status 42
death by signal status 257
death by signal with core dump status 262

View file

@ -47,11 +47,9 @@ const Keyword keywords[] = { /* keep sorted: binary searched */
{ "BEGIN", XBEGIN, XBEGIN },
{ "END", XEND, XEND },
{ "NF", VARNF, VARNF },
{ "and", FAND, BLTIN },
{ "atan2", FATAN, BLTIN },
{ "break", BREAK, BREAK },
{ "close", CLOSE, CLOSE },
{ "compl", FCOMPL, BLTIN },
{ "continue", CONTINUE, CONTINUE },
{ "cos", FCOS, BLTIN },
{ "delete", DELETE, DELETE },
@ -63,7 +61,6 @@ const Keyword keywords[] = { /* keep sorted: binary searched */
{ "for", FOR, FOR },
{ "func", FUNC, FUNC },
{ "function", FUNC, FUNC },
{ "gensub", GENSUB, GENSUB },
{ "getline", GETLINE, GETLINE },
{ "gsub", GSUB, GSUB },
{ "if", IF, IF },
@ -72,30 +69,24 @@ const Keyword keywords[] = { /* keep sorted: binary searched */
{ "int", FINT, BLTIN },
{ "length", FLENGTH, BLTIN },
{ "log", FLOG, BLTIN },
{ "lshift", FLSHIFT, BLTIN },
{ "match", MATCHFCN, MATCHFCN },
{ "next", NEXT, NEXT },
{ "nextfile", NEXTFILE, NEXTFILE },
{ "or", FFOR, BLTIN },
{ "print", PRINT, PRINT },
{ "printf", PRINTF, PRINTF },
{ "rand", FRAND, BLTIN },
{ "return", RETURN, RETURN },
{ "rshift", FRSHIFT, BLTIN },
{ "sin", FSIN, BLTIN },
{ "split", SPLIT, SPLIT },
{ "sprintf", SPRINTF, SPRINTF },
{ "sqrt", FSQRT, BLTIN },
{ "srand", FSRAND, BLTIN },
{ "strftime", FSTRFTIME, BLTIN },
{ "sub", SUB, SUB },
{ "substr", SUBSTR, SUBSTR },
{ "system", FSYSTEM, BLTIN },
{ "systime", FSYSTIME, BLTIN },
{ "tolower", FTOLOWER, BLTIN },
{ "toupper", FTOUPPER, BLTIN },
{ "while", WHILE, WHILE },
{ "xor", FXOR, BLTIN },
};
#define RET(x) { if(dbg)printf("lex %s\n", tokname(x)); return(x); }

View file

@ -22,7 +22,7 @@ ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
THIS SOFTWARE.
****************************************************************/
const char *version = "version 20240122";
const char *version = "version 20240422";
#define DEBUG
#include <stdio.h>

View file

@ -104,7 +104,6 @@ struct xx
{ ARG, "arg", "arg" },
{ VARNF, "getnf", "NF" },
{ GETLINE, "awkgetline", "getline" },
{ GENSUB, "gensub", "gensub" },
{ 0, "", "" },
};

View file

@ -93,20 +93,6 @@ Node *node4(int a, Node *b, Node *c, Node *d, Node *e)
return(x);
}
/* node5: allocate a node with five argument slots, set its nobj to a,
 * and fill narg[0..4] with b, c, d, e, f in that order. */
Node *node5(int a, Node *b, Node *c, Node *d, Node *e, Node *f)
{
	Node *kids[5] = { b, c, d, e, f };
	Node *np = nodealloc(5);
	int i;

	np->nobj = a;
	for (i = 0; i < 5; i++)
		np->narg[i] = kids[i];
	return np;
}
Node *stat1(int a, Node *b)
{
Node *x;
@ -179,15 +165,6 @@ Node *op4(int a, Node *b, Node *c, Node *d, Node *e)
return(x);
}
/* op5: build a five-argument node via node5 and set its ntype to NEXPR. */
Node *op5(int a, Node *b, Node *c, Node *d, Node *e, Node *f)
{
	Node *np = node5(a, b, c, d, e, f);

	np->ntype = NEXPR;
	return np;
}
Node *celltonode(Cell *a, int b)
{
Node *x;

View file

@ -73,14 +73,12 @@ extern Node *node1(int, Node *);
extern Node *node2(int, Node *, Node *);
extern Node *node3(int, Node *, Node *, Node *);
extern Node *node4(int, Node *, Node *, Node *, Node *);
extern Node *node5(int, Node *, Node *, Node *, Node *, Node *);
extern Node *stat3(int, Node *, Node *, Node *);
extern Node *op2(int, Node *, Node *);
extern Node *op1(int, Node *);
extern Node *stat1(int, Node *);
extern Node *op3(int, Node *, Node *, Node *);
extern Node *op4(int, Node *, Node *, Node *, Node *);
extern Node *op5(int, Node *, Node *, Node *, Node *, Node *);
extern Node *stat2(int, Node *, Node *);
extern Node *stat4(int, Node *, Node *, Node *, Node *);
extern Node *celltonode(Cell *, int);
@ -199,7 +197,6 @@ extern const char *filename(FILE *);
extern Cell *closefile(Node **, int);
extern void closeall(void);
extern Cell *dosub(Node **, int);
extern Cell *gensub(Node **, int);
extern FILE *popen(const char *, const char *);
extern int pclose(FILE *);

View file

@ -1827,7 +1827,7 @@ Cell *split(Node **a, int nnn) /* split(a[0], a[1], a[2]); a[3] is type */
for (;;) {
n++;
t = s;
while (*s != sep && *s != '\n' && *s != '\0')
while (*s != sep && *s != '\0')
s++;
temp = *s;
setptr(s, '\0');
@ -2062,14 +2062,12 @@ Cell *bltin(Node **a, int n) /* builtin functions. a[0] is type, a[1] is arg lis
{
Cell *x, *y;
Awkfloat u;
int t, sz;
int t;
Awkfloat tmp;
char *buf, *fmt;
char *buf;
Node *nextarg;
FILE *fp;
int status = 0;
time_t tv;
struct tm *tm;
int estatus = 0;
t = ptoi(a[0]);
@ -2111,64 +2109,6 @@ Cell *bltin(Node **a, int n) /* builtin functions. a[0] is type, a[1] is arg lis
nextarg = nextarg->nnext;
}
break;
case FCOMPL:
u = ~((int)getfval(x));
break;
case FAND:
if (nextarg == 0) {
WARNING("and requires two arguments; returning 0");
u = 0;
break;
}
y = execute(a[1]->nnext);
u = ((int)getfval(x)) & ((int)getfval(y));
tempfree(y);
nextarg = nextarg->nnext;
break;
case FFOR:
if (nextarg == 0) {
WARNING("or requires two arguments; returning 0");
u = 0;
break;
}
y = execute(a[1]->nnext);
u = ((int)getfval(x)) | ((int)getfval(y));
tempfree(y);
nextarg = nextarg->nnext;
break;
case FXOR:
if (nextarg == 0) {
WARNING("xor requires two arguments; returning 0");
u = 0;
break;
}
y = execute(a[1]->nnext);
u = ((int)getfval(x)) ^ ((int)getfval(y));
tempfree(y);
nextarg = nextarg->nnext;
break;
case FLSHIFT:
if (nextarg == 0) {
WARNING("lshift requires two arguments; returning 0");
u = 0;
break;
}
y = execute(a[1]->nnext);
u = ((int)getfval(x)) << ((int)getfval(y));
tempfree(y);
nextarg = nextarg->nnext;
break;
case FRSHIFT:
if (nextarg == 0) {
WARNING("rshift requires two arguments; returning 0");
u = 0;
break;
}
y = execute(a[1]->nnext);
u = ((int)getfval(x)) >> ((int)getfval(y));
tempfree(y);
nextarg = nextarg->nnext;
break;
case FSYSTEM:
fflush(stdout); /* in case something is buffered already */
estatus = status = system(getsval(x));
@ -2223,41 +2163,6 @@ Cell *bltin(Node **a, int n) /* builtin functions. a[0] is type, a[1] is arg lis
else
u = fflush(fp);
break;
case FSYSTIME:
u = time((time_t *) 0);
break;
case FSTRFTIME:
/* strftime([format [,timestamp]]) */
if (nextarg) {
y = execute(nextarg);
nextarg = nextarg->nnext;
tv = (time_t) getfval(y);
tempfree(y);
} else
tv = time((time_t *) 0);
tm = localtime(&tv);
if (tm == NULL)
FATAL("bad time %ld", (long)tv);
if (isrec(x)) {
/* format argument not provided, use default */
fmt = tostring("%a %b %d %H:%M:%S %Z %Y");
} else
fmt = tostring(getsval(x));
sz = 32;
buf = NULL;
do {
if ((buf = realloc(buf, (sz *= 2))) == NULL)
FATAL("out of memory in strftime");
} while (strftime(buf, sz, fmt, tm) == 0 && fmt[0] != '\0');
y = gettemp();
setsval(y, buf);
free(fmt);
free(buf);
return y;
default: /* can't happen */
FATAL("illegal function type %d", t);
break;
@ -2501,7 +2406,7 @@ void backsub(char **pb_ptr, const char **sptr_ptr);
Cell *dosub(Node **a, int subop) /* sub and gsub */
{
fa *pfa;
int tempstat;
int tempstat = 0;
char *repl;
Cell *x;
@ -2637,147 +2542,6 @@ next_search:
return x;
}
/*
 * gensub: build a substituted copy of a string; the source cell itself
 * is not written to.
 *
 * Arguments arrive in the node array a:
 *   a[0]  0 when a[1] is an already-compiled fa; otherwise a[1] is a node
 *         that is executed to get the regular expression string
 *   a[1]  the regular expression (see a[0])
 *   a[2]  replacement string
 *   a[3]  "how": a string starting with 'g' or 'G' replaces every match,
 *         anything else is read as the 1-based number of the match to replace
 *   a[4]  source string
 * nnn is not used in this function.
 *
 * Returns a copy of the source cell marked CTEMP. Its string value is
 * replaced with the substituted text only when the pattern matches at
 * least once; otherwise the copy is returned as made by copycell.
 * A replacement containing a backslash followed by a digit is FATAL.
 */
Cell *gensub(Node **a, int nnn) /* global selective substitute */
/* XXX incomplete - doesn't support backreferences \0 ... \9 */
{
Cell *x, *y, *res, *h;
char *rptr;
const char *sptr;
char *buf, *pb;
const char *t, *q;
fa *pfa;
int mflag, tempstat, num, whichm;
int bufsz = recsize;
/* buf is the output buffer; pb is the write cursor into it */
if ((buf = malloc(bufsz)) == NULL)
FATAL("out of memory in gensub");
mflag = 0; /* if mflag == 0, can replace empty string */
/* num counts matches as they are processed and is compared against whichm */
num = 0;
x = execute(a[4]); /* source string */
t = getsval(x);
res = copycell(x); /* target string - initially copy of source */
res->csub = CTEMP; /* result values are temporary */
if (a[0] == 0) /* 0 => a[1] is already-compiled regexpr */
pfa = (fa *) a[1]; /* regular expression */
else {
/* dynamic regex: evaluate it, compile it, and release the temporary */
y = execute(a[1]);
pfa = makedfa(getsval(y), 1);
tempfree(y);
}
/* y is reused from here on to hold the replacement cell */
y = execute(a[2]); /* replacement string */
h = execute(a[3]); /* which matches should be replaced */
sptr = getsval(h);
/* whichm == -1 means "replace all"; otherwise it is a 0-based match index */
if (sptr[0] == 'g' || sptr[0] == 'G')
whichm = -1;
else {
/*
 * The specified number is index of replacement, starting
 * from 1. GNU awk treats index lower than 0 same as
 * 1, we do same for compatibility.
 */
whichm = (int) getfval(h) - 1;
if (whichm < 0)
whichm = 0;
}
tempfree(h);
if (pmatch(pfa, t)) {
char *sl;
/* save initstat so it can be restored after the loop */
/* NOTE(review): the meaning of initstat == 2 is not visible here — confirm against b.c */
tempstat = pfa->initstat;
pfa->initstat = 2;
pb = buf;
rptr = getsval(y);
/*
 * XXX if there are any backreferences in subst string,
 * complain now.
 */
for (sl = rptr; (sl = strchr(sl, '\\')) && sl[1]; sl++) {
if (strchr("0123456789", sl[1])) {
FATAL("gensub doesn't support backreferences (subst \"%s\")", rptr);
}
}
/* each iteration handles one match at patbeg/patlen, with t at the unread input */
do {
/* a specific match was requested and this is not it: copy it through unchanged */
if (whichm >= 0 && whichm != num) {
num++;
adjbuf(&buf, &bufsz, (pb - buf) + (patbeg - t) + patlen, recsize, &pb, "gensub");
/* copy the part of string up to and including
 * match to output buffer */
while (t < patbeg + patlen)
*pb++ = *t++;
continue;
}
if (patlen == 0 && *patbeg != 0) { /* matched empty string */
if (mflag == 0) { /* can replace empty */
num++;
/* sptr now walks the replacement text */
sptr = rptr;
while (*sptr != 0) {
adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "gensub");
if (*sptr == '\\') {
backsub(&pb, &sptr);
} else if (*sptr == '&') {
/* '&' in the replacement copies the matched text */
sptr++;
adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize, &pb, "gensub");
for (q = patbeg; q < patbeg+patlen; )
*pb++ = *q++;
} else
*pb++ = *sptr++;
}
}
if (*t == 0) /* at end */
goto done;
/* copy one source character so the next search starts past this position */
adjbuf(&buf, &bufsz, 2+pb-buf, recsize, &pb, "gensub");
*pb++ = *t++;
if (pb > buf + bufsz) /* BUG: not sure of this test */
FATAL("gensub result0 %.30s too big; can't happen", buf);
mflag = 0;
}
else { /* matched nonempty string */
num++;
/* first copy the unmatched text that precedes the match */
sptr = t;
adjbuf(&buf, &bufsz, 1+(patbeg-sptr)+pb-buf, recsize, &pb, "gensub");
while (sptr < patbeg)
*pb++ = *sptr++;
/* then emit the replacement in place of the match */
sptr = rptr;
while (*sptr != 0) {
adjbuf(&buf, &bufsz, 5+pb-buf, recsize, &pb, "gensub");
if (*sptr == '\\') {
backsub(&pb, &sptr);
} else if (*sptr == '&') {
sptr++;
adjbuf(&buf, &bufsz, 1+patlen+pb-buf, recsize, &pb, "gensub");
for (q = patbeg; q < patbeg+patlen; )
*pb++ = *q++;
} else
*pb++ = *sptr++;
}
/* resume scanning just past the matched text */
t = patbeg + patlen;
if (patlen == 0 || *t == 0 || *(t-1) == 0)
goto done;
if (pb > buf + bufsz)
FATAL("gensub result1 %.30s too big; can't happen", buf);
/* mflag == 1 blocks replacing an empty match at the next step */
mflag = 1;
}
} while (pmatch(pfa,t));
/* no more matches: append the rest of the source, including its terminator */
sptr = t;
adjbuf(&buf, &bufsz, 1+strlen(sptr)+pb-buf, 0, &pb, "gensub");
while ((*pb++ = *sptr++) != 0)
;
done: if (pb > buf + bufsz)
FATAL("gensub result2 %.30s too big; can't happen", buf);
*pb = '\0';
setsval(res, buf);
pfa->initstat = tempstat;
}
/* release the source and replacement cells and the scratch buffer */
tempfree(x);
tempfree(y);
free(buf);
return(res);
}
void backsub(char **pb_ptr, const char **sptr_ptr) /* handle \\& variations */
{ /* sptr[0] == '\\' */
char *pb = *pb_ptr;