Merge branch 'master' of https://github.com/onetrueawk/awk

ctSkennerton · Mar 10, 2021 · 04e75e8 · 04e75e8
2 parents d1f287a + c0f4e97
commit 04e75e8
Show file tree

Hide file tree

Showing 12 changed files with 245 additions and 50 deletions.
diff --git a/FIXES b/FIXES
@@ -25,6 +25,37 @@ THIS SOFTWARE.
 This file lists all bug fixes, changes, etc., made since the AWK book
 was sent to the printers in August, 1987.
 
+February 15, 2021:
+ Small fix so that awk will compile again with g++. Thanks to
+ Arnold Robbins.
+
+January 06, 2021:
+ Fix a decision bug with trailing stuff in lib.c:is_valid_number
+ after recent changes. Thanks to Ozan Yigit.
+
+December 18, 2020:
+ Fix problems converting inf and NaN values in lib.c:is_valid_number.
+ Enhance number to string conversion to do the right thing for
+ NaN and inf values. Things are now pretty much the same as in
+ gawk. (Found a gawk bug while we were at it.) Added a torture
+ test for these values. Thanks to Arnold Robbins. Allows closing
+ of PR #101.
+
+December 15, 2020:
+ Merge PR #99, which gets the right header for strcasecmp.
+ Thanks to GitHub user michaelforney.
+
+December 8, 2020:
+ Merge PR #98: Disallow hex data. Allow only +nan, -nan,
+ +inf, -inf (case independent) to give NaN and infinity values.
+ Improve things so that string to double conversion is only
+ done once, yielding something of a speedup. This obviates
+ PR #95. Thanks to Arnold Robbins.
+
+December 3, 2020:
+ Fix to argument parsing to avoid printing spurious newlines.
+ Thanks to Todd Miller. Merges PR #97.
+
 October 13, 2020:
  Add casts before all the calls to malloc/calloc/realloc in order
  to get it to compile with g++. Thanks to Arnold Robbins.

diff --git a/bioawk.1 b/bioawk.1
@@ -729,3 +729,56 @@ The scope rules for variables in functions are a botch;
 the syntax is worse.
 .PP
 Only eight-bit character sets are handled correctly.
+.SH UNUSUAL FLOATING-POINT VALUES
+.I Awk
+was designed before IEEE 754 arithmetic defined Not-A-Number (NaN)
+and Infinity values, which are supported by all modern floating-point
+hardware.
+.PP
+Because
+.I awk
+uses
+.IR strtod (3)
+and
+.IR atof (3)
+to convert string values to double-precision floating-point values,
+modern C libraries also convert strings starting with
+.B inf
+and
+.B nan
+into infinity and NaN values respectively. This led to strange results,
+with something like this:
+.PP
+.EX
+.nf
+echo nancy | awk '{ print $1 + 0 }'
+.fi
+.EE
+.PP
+printing
+.B nan
+instead of zero.
+.PP
+.I Awk
+now follows GNU AWK, and prefilters string values before attempting
+to convert them to numbers, as follows:
+.TP
+.I "Hexadecimal values"
+Hexadecimal values (allowed since C99) convert to zero, as they did
+prior to C99.
+.TP
+.I "NaN values"
+The two strings
+.B +nan
+and
+.B \-nan
+(case independent) convert to NaN. No others do.
+(NaNs can have signs.)
+.TP
+.I "Infinity values"
+The two strings
+.B +inf
+and
+.B \-inf
+(case independent) convert to positive and negative infinity, respectively.
+No others do.
diff --git a/bugs-fixed/inf-nan-torture.awk b/bugs-fixed/inf-nan-torture.awk
@@ -0,0 +1,4 @@
+{
+ for (i = 1; i <= NF; i++)
+ print i, $i, $i + 0
+}
diff --git a/bugs-fixed/inf-nan-torture.in b/bugs-fixed/inf-nan-torture.in
@@ -0,0 +1 @@
+-inf -inform inform -nan -nancy nancy -123 0 123 +123 nancy +nancy +nan inform +inform +inf
diff --git a/bugs-fixed/inf-nan-torture.ok b/bugs-fixed/inf-nan-torture.ok
@@ -0,0 +1,16 @@
+1 -inf -inf
+2 -inform 0
+3 inform 0
+4 -nan -nan
+5 -nancy 0
+6 nancy 0
+7 -123 -123
+8 0 0
+9 123 123
+10 +123 123
+11 nancy 0
+12 +nancy 0
+13 +nan +nan
+14 inform 0
+15 +inform 0
+16 +inf +inf
diff --git a/lex.c b/lex.c
@@ -208,7 +208,12 @@ int yylex(void)
  return word(buf);
  if (isdigit(c)) {
  char *cp = tostring(buf);
- yylval.cp = setsymtab(buf, cp, atof(buf), CON|NUM, symtab);
+ double result;
+
+ if (is_number(cp, & result))
+ yylval.cp = setsymtab(buf, cp, result, CON|NUM, symtab);
+ else
+ yylval.cp = setsymtab(buf, cp, 0.0, STR, symtab);
  free(cp);
  /* should this also have STR set? */
  RET(NUMBER);

diff --git a/lib.c b/lib.c
@@ -25,11 +25,13 @@ THIS SOFTWARE.
 #define DEBUG
 #include <stdio.h>
 #include <string.h>
+#include <strings.h>
 #include <ctype.h>
 #include <errno.h>
 #include <stdlib.h>
 #include <stdarg.h>
 #include <limits.h>
+#include <math.h>
 #include "awk.h"
 
 char EMPTY[] = { '\0' };
@@ -183,12 +185,14 @@ int getrec(char **pbuf, int *pbufsize, bool isrecord) /* get next input record *
  innew = false;
  if (c != 0 || buf[0] != '\0') { /* normal record */
  if (isrecord) {
+ double result;
+
  if (freeable(fldtab[0]))
  xfree(fldtab[0]->sval);
  fldtab[0]->sval = buf; /* buf == record */
  fldtab[0]->tval = REC | STR | DONTFREE;
- if (is_number(fldtab[0]->sval)) {
- fldtab[0]->fval = atof(fldtab[0]->sval);
+ if (is_number(fldtab[0]->sval, & result)) {
+ fldtab[0]->fval = result;
  fldtab[0]->tval |= NUM;
  }
  }
@@ -295,15 +299,16 @@ void setclvar(char *s) /* set var=value from s */
 {
  char *p;
  Cell *q;
+ double result;
 
  for (p=s; *p != '='; p++)
  ;
  *p++ = 0;
  p = qstring(p, '\0');
  q = setsymtab(s, p, 0.0, STR, symtab);
  setsval(q, p);
- if (is_number(q->sval)) {
- q->fval = atof(q->sval);
+ if (is_number(q->sval, & result)) {
+ q->fval = result;
  q->tval |= NUM;
  }
  DPRINTF("command line set %s to |%s|\n", s, p);
@@ -404,9 +409,11 @@ void fldbld(void) /* create fields from current record */
  lastfld = i;
  donefld = true;
  for (j = 1; j <= lastfld; j++) {
+ double result;
+
  p = fldtab[j];
- if(is_number(p->sval)) {
- p->fval = atof(p->sval);
+ if(is_number(p->sval, & result)) {
+ p->fval = result;
  p->tval |= NUM;
  }
  }
@@ -671,12 +678,11 @@ void error()
  fprintf(stderr, " source line number %d", curnode->lineno);
  else if (lineno)
  fprintf(stderr, " source line number %d", lineno);
+ if (compile_time == COMPILING && cursource() != NULL)
+ fprintf(stderr, " source file %s", cursource());
+ fprintf(stderr, "\n");
+ eprint();
  }
-
- if (compile_time == COMPILING && cursource() != NULL)
- fprintf(stderr, " source file %s", cursource());
- fprintf(stderr, "\n");
- eprint();
 }
 
 void eprint(void) /* try to print context around error */
@@ -759,24 +765,76 @@ int isclvar(const char *s) /* is s of form var=something ? */
 /* strtod is supposed to be a proper test of what's a valid number */
 /* appears to be broken in gcc on linux: thinks 0x123 is a valid FP number */
 /* wrong: violates 4.10.1.4 of ansi C standard */
+
 /* well, not quite. As of C99, hex floating point is allowed. so this is
- * a bit of a mess.
+ * a bit of a mess. We work around the mess by checking for a hexadecimal
+ * value and disallowing it. Similarly, we now follow gawk and allow only
+ * +nan, -nan, +inf, and -inf for NaN and infinity values.
  */
 
-#include <math.h>
-int is_number(const char *s)
+/*
+ * This routine now has a more complicated interface. The main point
+ * is to avoid converting a string to double twice; it also reports,
+ * if requested (via no_trailing), whether the number was only a
+ * leading part of the string or made up the whole string. That
+ * information is used in getfval().
+ */
+
+bool is_valid_number(const char *s, bool trailing_stuff_ok,
+ bool *no_trailing, double *result)
 {
  double r;
  char *ep;
+ bool retval = false;
+ bool is_nan = false;
+ bool is_inf = false;
+
+ if (no_trailing)
+ *no_trailing = false;
+
+ while (isspace(*s))
+ s++;
+
+ // no hex floating point, sorry
+ if (s[0] == '0' && tolower(s[1]) == 'x')
+ return false;
+
+ // allow only +nan, -nan, +inf, -inf; reject anything else that does not start like a number
+ if (s[0] == '+' || s[0] == '-') {
+ is_nan = (strncasecmp(s+1, "nan", 3) == 0);
+ is_inf = (strncasecmp(s+1, "inf", 3) == 0);
+ if ((is_nan || is_inf)
+ && (isspace(s[4]) || s[4] == '\0'))
+ goto convert;
+ else if (! isdigit(s[1]) && s[1] != '.')
+ return false;
+ }
+ else if (! isdigit(s[0]) && s[0] != '.')
+ return false;
+
+convert:
  errno = 0;
  r = strtod(s, &ep);
- if (ep == s || r == HUGE_VAL || errno == ERANGE)
- return 0;
- /* allow \r as well. windows files aren't going to go away. */
- while (*ep == ' ' || *ep == '\t' || *ep == '\n' || *ep == '\r')
+ if (ep == s || errno == ERANGE)
+ return false;
+
+ if (isnan(r) && s[0] == '-' && signbit(r) == 0)
+ r = -r;
+
+ if (result != NULL)
+ *result = r;
+
+ /*
+ * check for trailing stuff
+ */
+ while (isspace(*ep))
  ep++;
- if (*ep == '\0')
- return 1;
- else
- return 0;
+
+ if (no_trailing != NULL)
+ *no_trailing = (*ep == '\0');
+
+ // return true if found the end, or trailing stuff is allowed
+ retval = *ep == '\0' || trailing_stuff_ok;
+
+ return retval;
 }
diff --git a/main.c b/main.c
@@ -22,7 +22,7 @@ ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
 THIS SOFTWARE.
 ****************************************************************/
 
-const char *version = "version 20201013";
+const char *version = "version 20210215";
 
 #define DEBUG
 #include <stdio.h>

diff --git a/proto.h b/proto.h
@@ -146,7 +146,9 @@ extern void eprint(void);
 extern void bclass(int);
 extern double errcheck(double, const char *);
 extern int isclvar(const char *);
-extern int is_number(const char *);
+extern bool is_valid_number(const char *s, bool trailing_stuff_ok,
+ bool *no_trailing, double *result);
+#define is_number(s, val) is_valid_number(s, false, NULL, val)
 
 extern int adjbuf(char **pb, int *sz, int min, int q, char **pbp, const char *what);
 extern void run(Node *);