From f6ce73bd76ee9b07bb13a6df9a5663a38ccf4013 Mon Sep 17 00:00:00 2001
From: Tavian Barnes
Date: Tue, 27 Oct 2009 20:35:17 -0400
Subject: Tokenize numeric values.

---
 dimension/tokenize.c         | 61 ++++++++++++++++++++++++++++++++++++++++++--
 dimension/tokenize.h         |  5 ++++
 tests/dimension/Makefile.am  |  2 +-
 tests/dimension/numeric.pov  |  2 ++
 tests/dimension/tokenizer.sh | 11 +++++++-
 5 files changed, 77 insertions(+), 4 deletions(-)
 create mode 100644 tests/dimension/numeric.pov

diff --git a/dimension/tokenize.c b/dimension/tokenize.c
index 7019ad7..c34cfc6 100644
--- a/dimension/tokenize.c
+++ b/dimension/tokenize.c
@@ -40,7 +40,9 @@ dmnsn_tokenize(FILE *file)
     return NULL;
   }
 
-  char *map = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0), *next = map;
+  char *map = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0),
+       *next = map,
+       *endi, *endf;
   if (map == MAP_FAILED) {
     fprintf(stderr, "Couldn't mmap() input stream.\n");
     return NULL;
@@ -50,19 +52,28 @@ dmnsn_tokenize(FILE *file)
   dmnsn_token token;
   dmnsn_array *tokens = dmnsn_new_array(sizeof(dmnsn_token));
 
+  unsigned int line = 0, col = 0;
+  unsigned int i;
+
   while (next - map < size) {
     /* Saves us some code repetition in the vast majority of cases */
     token.value = NULL;
 
     switch (*next) {
     case ' ':
-    case '\n':
     case '\r':
     case '\t':
     case '\f':
     case '\v':
       /* Skip whitespace */
       ++next;
+      ++col;
+      continue;
+
+    case '\n':
+      ++next;
+      ++line;
+      col = 0;
       continue;
 
     /* Macro to make basic symbol tokens easier */
@@ -86,6 +97,46 @@
     dmnsn_simple_token('/', DMNSN_SLASH);
     dmnsn_simple_token(',', DMNSN_COMMA);
 
+    /* Numeric values */
+    case '.': /* Number begins with a decimal point, as in `.2' */
+    case '0':
+    case '1':
+    case '2':
+    case '3':
+    case '4':
+    case '5':
+    case '6':
+    case '7':
+    case '8':
+    case '9':
+      strtoul(next, &endi, 0);
+      strtod(next, &endf);
+      if (endf > endi
+          /* These next conditions catch invalid octal integers being parsed as
+             floats, eg 08 */
+          && (*endi == '.' || *endi == 'e' || *endi == 'E' || *endi == 'p'
+              || *endi == 'P'))
+      {
+        token.type = DMNSN_FLOAT;
+        token.value = malloc(endf - next + 1);
+        strncpy(token.value, next, endf - next);
+        token.value[endf - next] = '\0';
+        next = endf;
+      } else if (endi > next) {
+        token.type = DMNSN_INT;
+        token.value = malloc(endi - next + 1);
+        strncpy(token.value, next, endi - next);
+        token.value[endi - next] = '\0';
+        next = endi;
+      } else {
+        fprintf(stderr, "Invalid numeric value on line %u, column %u.\n",
+                line, col);
+        dmnsn_delete_tokens(tokens);
+        munmap(map, size);
+        return NULL;
+      }
+      break;
+
     default:
       /* Unrecognised character */
       fprintf(stderr, "Unrecognized character 0x%X in input.\n",
@@ -97,6 +148,7 @@
 
     dmnsn_array_push(tokens, &token);
     ++next;
+    ++col;
   }
 
   munmap(map, size);
@@ -157,6 +209,7 @@ dmnsn_token_name(dmnsn_token_type token_type)
   case type: \
     return str;
 
+    /* Punctuation */
     dmnsn_token_map(DMNSN_LBRACE, "{");
     dmnsn_token_map(DMNSN_RBRACE, "}")
     dmnsn_token_map(DMNSN_LPAREN, "\\(");
@@ -171,6 +224,10 @@ dmnsn_token_name(dmnsn_token_type token_type)
     dmnsn_token_map(DMNSN_SLASH, "/");
     dmnsn_token_map(DMNSN_COMMA, ",");
 
+    /* Numeric values */
+    dmnsn_token_map(DMNSN_INT, "int");
+    dmnsn_token_map(DMNSN_FLOAT, "float");
+
   default:
     printf("Warning: unrecognised token %d.\n", (int)token_type);
     return "unrecognized-token";
diff --git a/dimension/tokenize.h b/dimension/tokenize.h
index e64b7eb..91d59f6 100644
--- a/dimension/tokenize.h
+++ b/dimension/tokenize.h
@@ -20,6 +20,7 @@
 #include "../libdimension/dimension.h"
 
 typedef enum {
+  /* Punctuation */
   DMNSN_LBRACE,   /* { */
   DMNSN_RBRACE,   /* } */
   DMNSN_LPAREN,   /* ( */
@@ -33,6 +34,10 @@ typedef enum {
   DMNSN_STAR,     /* * */
   DMNSN_SLASH,    /* / */
   DMNSN_COMMA,    /* , */
+
+  /* Numeric values */
+  DMNSN_INT,
+  DMNSN_FLOAT,
 } dmnsn_token_type;
 
 typedef struct dmnsn_token dmnsn_token;
diff --git a/tests/dimension/Makefile.am b/tests/dimension/Makefile.am
index 7bacab5..e9b7f7a 100644
--- a/tests/dimension/Makefile.am
+++ b/tests/dimension/Makefile.am
@@ -25,4 +25,4 @@ TESTS_ENVIRONMENT = top_builddir=$(top_builddir)
 tokenizer.sh:
 	cp $(srcdir)/tokenizer.sh .
 
-EXTRA_DIST = tokenizer.sh punctuation.pov
+EXTRA_DIST = tokenizer.sh punctuation.pov numeric.pov
diff --git a/tests/dimension/numeric.pov b/tests/dimension/numeric.pov
new file mode 100644
index 0000000..df850d6
--- /dev/null
+++ b/tests/dimension/numeric.pov
@@ -0,0 +1,2 @@
+1 123456789 01234567 0x123456789 -0x01
+.1 0.1 1.0 0.123456789 -0.123456789
diff --git a/tests/dimension/tokenizer.sh b/tests/dimension/tokenizer.sh
index d965741..edd3bb4 100755
--- a/tests/dimension/tokenizer.sh
+++ b/tests/dimension/tokenizer.sh
@@ -24,6 +24,15 @@ punctuation_exp='({ \( [ < + - * / , > ] \) })'
 
 if [ "$punctuation" != "$punctuation_exp" ]; then
   echo "punctuation.pov tokenized as \"$punctuation\"" >&2
-  echo " -- expected \"$punctuation_exp\"" >&2
+  echo " -- expected \"$punctuation_exp\"" >&2
+  exit 1;
+fi
+
+numeric=$(${top_builddir}/dimension/dimension --tokenize ${srcdir}/numeric.pov)
+numeric_exp='((int "1") (int "123456789") (int "01234567") (int "0x123456789") - (int "0x01") (float ".1") (float "0.1") (float "1.0") (float "0.123456789") - (float "0.123456789"))'
+
+if [ "$numeric" != "$numeric_exp" ]; then
+  echo "numeric.pov tokenized as \"$numeric\"" >&2
+  echo " -- expected \"$numeric_exp\"" >&2
   exit 1;
 fi
-- 
cgit v1.2.3
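
Editor's note: the integer/float disambiguation in the patch leans entirely on
where strtoul() and strtod() stop parsing.  The standalone sketch below is not
part of the patch; classify() and its sample inputs are illustrative only, and
it simply reproduces the same decision rule so it can be tried in isolation.

#include <stdio.h>
#include <stdlib.h>

/* Same rule as the tokenizer: parse the longest integer (base 0, so octal and
   hex literals work) and the longest float starting at s, and call the value a
   float only when the float parse got further *and* the integer parse stopped
   at a '.' or an exponent/hex-power marker.  This is what keeps "08" from
   becoming the float 8.0: strtoul() stops at the invalid octal digit '8',
   which is none of those characters, so only the leading "0" is consumed. */
static void
classify(const char *s)
{
  char *endi, *endf;
  strtoul(s, &endi, 0);
  strtod(s, &endf);

  if (endf > endi
      && (*endi == '.' || *endi == 'e' || *endi == 'E'
          || *endi == 'p' || *endi == 'P')) {
    printf("%-12s -> float, %ld chars consumed\n", s, (long)(endf - s));
  } else if (endi > s) {
    printf("%-12s -> int,   %ld chars consumed\n", s, (long)(endi - s));
  } else {
    printf("%-12s -> invalid numeric value\n", s);
  }
}

int
main(void)
{
  classify("123456789");   /* int */
  classify("01234567");    /* int (octal) */
  classify("0x01");        /* int (hex) */
  classify(".1");          /* float */
  classify("1.0");         /* float */
  classify("08");          /* int "0" only -- the octal check kicks in */
  return 0;
}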