From 6021ae28535b3c6dcc55d4b2bf5b76236c86e309 Mon Sep 17 00:00:00 2001
From: liquidaty <info@liquidaty.com>
Date: Wed, 17 Apr 2024 11:13:50 -0700
Subject: [PATCH] Compare: add --tolerance option to ignore differences between
 numeric strings within a numeric tolerance (#168)

* compare: add --tolerance option
---
 README.md                                     | 48 ++++++++++++--
 app/compare.c                                 | 66 +++++++++++++------
 app/compare_internal.h                        |  6 ++
 app/test/Makefile                             | 15 +++++
 app/test/expected/test-compare-tolerance.out1 |  5 ++
 app/test/expected/test-compare-tolerance.out2 |  2 +
 app/test/expected/test-compare-tolerance.out3 |  3 +
 app/test/expected/test-compare-tolerance.out4 |  5 ++
 app/utils/string.c                            | 10 +++
 data/compare/tolerance1.csv                   |  2 +
 data/compare/tolerance2.csv                   |  2 +
 examples/lib/README.md                        |  8 ++-
 include/zsv/utils/string.h                    | 12 ++++
 13 files changed, 159 insertions(+), 25 deletions(-)
 create mode 100644 app/test/expected/test-compare-tolerance.out1
 create mode 100644 app/test/expected/test-compare-tolerance.out2
 create mode 100644 app/test/expected/test-compare-tolerance.out3
 create mode 100644 app/test/expected/test-compare-tolerance.out4
 create mode 100644 data/compare/tolerance1.csv
 create mode 100644 data/compare/tolerance2.csv

diff --git a/README.md b/README.md
index d9884e7f..7c42fe5d 100644
--- a/README.md
+++ b/README.md
@@ -225,17 +225,20 @@ for speed and ease of development for extending and/or customizing to your needs
 
 * `echo`: read CSV from stdin and write it back out to stdout. This is mostly
   useful for demonstrating how to use the API and also how to create a plug-in,
-  and has some limited utility beyond that e.g. for adding/removing the UTF8
-  BOM, or cleaning up bad UTF8
+  and has several uses beyond that including adding/removing BOM,
+  cleaning up bad UTF8,
+  whitespace or blank column trimming,
+  limiting output to a contiguous data block, skipping leading garbage, and even
+  providing substitution values without modifying the underlying source
 * `select`: re-shape CSV by skipping leading garbage, combining header rows into
   a single header, selecting or excluding specified columns, removing duplicate
-  columns, sampling, searching and more
-* `sql`: run ad-hoc SQL query on a CSV file
+  columns, sampling, converting from fixed-width input, searching and more
+* `sql`: treat one or more CSV files like database tables and query with SQL
 * `desc`: provide a quick description of your table data
 * `pretty`: format for console (fixed-width) display, or convert to markdown
   format
 * `2json`: convert CSV to JSON. Optionally, output in [database schema](docs/db.schema.json)
-* `2tsv`: convert CSV to TSV
+* `2tsv`: convert to TSV (tab-delimited) format
 * `compare`: compare two or more tables of data and output the differences
 * `paste` (alpha): horizontally paste two tables together (given inputs X and Y,
    output 1...N rows where each row all columns of X in row N, followed by all columns of Y in row N)
@@ -264,6 +267,41 @@ zsv sql my_population_data.csv "select * from data where population > 100000"
 
 ### Using the API
 
+Simple API usage examples include:
+
+Pull parsing:
+```
+zsv_parser parser = zsv_new(...);
+while(zsv_next_row(parser) == zsv_status_row) { /* for each row */
+    // do something
+  size_t cell_count = zsv_cell_count(parser);
+  for(size_t i = 0; i < cell_count; i++) {
+    struct zsv_cell c = zsv_get_cell(parser, i);
+    fprintf(stderr, "Cell: %.*s\n", c.len, c.str);
+    ...
+  }
+```
+
+Push parsing:
+```
+static void my_row_handler(void *ctx) {
+  zsv_parser p = ctx;
+  size_t cell_count = zsv_cell_count(p);
+  for(size_t i = 0, j = zsv_cell_count(p); i < j; i++) {
+    ...
+  }
+}
+
+int main() {
+  zsv_parser p = zsv_new(NULL);
+  zsv_set_row_handler(p, my_row_handler);
+  zsv_set_context(p, p);
+
+  enum zsv_status stat;
+  while((stat = zsv_parse_more(p)) == zsv_status_ok) ;
+
+```
+
 Full application code examples can be found at [examples/lib/README.md](examples/lib/README.md).
 
 An example of using the API, compiled to wasm and called via Javascript,
diff --git a/app/compare.c b/app/compare.c
index 9526ab16..de8330f0 100644
--- a/app/compare.c
+++ b/app/compare.c
@@ -9,6 +9,7 @@
 #include <stdio.h>
 #include <string.h>
 #include <stdlib.h>
+#include <math.h>
 #include <jsonwriter.h>
 
 #include <sqlite3.h>
@@ -227,25 +228,23 @@ static void zsv_compare_print_row(struct zsv_compare_data *data,
 
 #define ZSV_COMPARE_MISSING "Missing"
 
-//  if(last_ix + 1 < data->input_count) {
-    // if we don't have data from every input, then output "Missing" for missing inputs
-    char got_missing = 0;
-    for(unsigned i = 0; i < data->input_count; i++) {
-      struct zsv_compare_input *input = data->inputs_to_sort[i];
-      if(i > last_ix) {
-        got_missing = 1;
-        unsigned input_ix = input->index;
-        values[input_ix].str = (unsigned char *)ZSV_COMPARE_MISSING;
-        values[input_ix].len = strlen(ZSV_COMPARE_MISSING);
-      }
-    }
-    if(got_missing) {
-      const unsigned char *key_names = data->print_key_col_names ? zsv_compare_combined_key_names(data) : (const unsigned char *)"<key>";
-      zsv_compare_output_tuple(data, key_input, key_names, values, 1);
-      // reset values
-      memset(values, 0, data->input_count * sizeof(*values));
+  // if we don't have data from every input, then output "Missing" for missing inputs
+  char got_missing = 0;
+  for(unsigned i = 0; i < data->input_count; i++) {
+    struct zsv_compare_input *input = data->inputs_to_sort[i];
+    if(i > last_ix) {
+      got_missing = 1;
+      unsigned input_ix = input->index;
+      values[input_ix].str = (unsigned char *)ZSV_COMPARE_MISSING;
+      values[input_ix].len = strlen(ZSV_COMPARE_MISSING);
     }
-//  }
+  }
+  if(got_missing) {
+    const unsigned char *key_names = data->print_key_col_names ? zsv_compare_combined_key_names(data) : (const unsigned char *)"<key>";
+    zsv_compare_output_tuple(data, key_input, key_names, values, 1);
+    // reset values
+    memset(values, 0, data->input_count * sizeof(*values));
+  }
 
   // for each output column
   zsv_compare_unique_colname *output_col = data->output_colnames_first;
@@ -272,8 +271,23 @@ static void zsv_compare_print_row(struct zsv_compare_data *data,
         if(!output_col)
           output_col = input->output_colnames[input_col_ix];
         values[input_ix] = data->get_cell(input, input_col_ix);
-        if(i > 0 && !different && data->cmp(data->cmp_ctx, values[first_input_ix], values[input_ix], data, input_col_ix))
+        if(i > 0 && !different && data->cmp(data->cmp_ctx, values[first_input_ix], values[input_ix], data, input_col_ix)) {
           different = 1;
+          if(data->tolerance.value
+             && values[first_input_ix].len < ZSV_COMPARE_MAX_NUMBER_BUFF_LEN
+             && values[input_ix].len < ZSV_COMPARE_MAX_NUMBER_BUFF_LEN) {
+            // check if both are numbers with a difference less than the given tolerance            
+            double d1, d2;
+            memcpy(data->tolerance.str1, values[first_input_ix].str, values[first_input_ix].len);
+            data->tolerance.str1[values[first_input_ix].len] = '\0';
+            memcpy(data->tolerance.str2, values[input_ix].str, values[input_ix].len);
+            data->tolerance.str2[values[input_ix].len] = '\0';
+            if(!zsv_strtod_exact(data->tolerance.str1, &d1)
+               && !zsv_strtod_exact(data->tolerance.str2, &d2)
+               && fabs(d1 - d2) < data->tolerance.value)
+              different = 0;
+          }
+        }
       }
     }
 
@@ -608,6 +622,10 @@ static int compare_usage() {
     "  --sort             : sort on keys before comparing",
     "  --sort-in-memory   : for sorting,  use in-memory instead of temporary db",
     "                       (see https://www.sqlite.org/inmemorydb.html)",
+    "  --tolerance <value>: ignore differences where both values are numeric",
+    "                       strings with values differing by less than the given",
+    "                       amount e.g. --tolerance 0.01 will ignore differences",
+    "                       of numeric strings such as 123.45 vs 123.44",
     "  --json             : output as JSON",
     "  --json-compact     : output as compact JSON",
     "  --json-object      : output as an array of objects",
@@ -695,6 +713,16 @@ int ZSV_MAIN_FUNC(ZSV_COMMAND)(int argc, const char *argv[], struct zsv_opts *op
             data->added_colcount++;
         }
       }
+    } else if(!strcmp(arg, "--tolerance")) {
+      const char *next_arg = zsv_next_arg(++arg_i, argc, argv, &err);
+      if(next_arg) {
+        if(zsv_strtod_exact(next_arg, &data->tolerance.value))
+          fprintf(stderr, "Invalid numeric value: %s\n", next_arg), err = 1;
+        else if(data->tolerance.value < 0)
+          fprintf(stderr, "Tolerance must be greater than zero (got %s)\n", next_arg), err = 1;
+        else
+          data->tolerance.value = nextafterf(data->tolerance.value, INFINITY);
+      }
     } else if(!strcmp(arg, "--sort")) {
       data->sort = 1;
     } else if(!strcmp(arg, "--json")) {
diff --git a/app/compare_internal.h b/app/compare_internal.h
index 52bb49c3..4c3acc70 100644
--- a/app/compare_internal.h
+++ b/app/compare_internal.h
@@ -106,6 +106,12 @@ struct zsv_compare_data {
 
   sqlite3 *sort_db; // used when --sort option was specified
 
+  struct {
+    double value; // set from --tolerance; 0 means no numeric tolerance check is done
+#define ZSV_COMPARE_MAX_NUMBER_BUFF_LEN 128 // cell values this long or longer skip the tolerance check
+    char   str1[ZSV_COMPARE_MAX_NUMBER_BUFF_LEN]; // scratch: NUL-terminated copy of the first input's cell value
+    char   str2[ZSV_COMPARE_MAX_NUMBER_BUFF_LEN]; // scratch: NUL-terminated copy of the cell value compared against it
+  } tolerance;
   struct {
     char type; // 'j' for json
     union {
diff --git a/app/test/Makefile b/app/test/Makefile
index e25e0a42..e5f62d67 100644
--- a/app/test/Makefile
+++ b/app/test/Makefile
@@ -495,6 +495,20 @@ test-desc: test-%: ${BUILD_DIR}/bin/zsv_%${EXE}
 	@(${PREFIX} $< < ${TEST_DATA_DIR}/test/$*-trim.csv ${REDIRECT2} ${TMP_DIR}/$@.trim && \
 	${CMP} ${TMP_DIR}/$@.trim expected/$@.trim && ${TEST_PASS} || ${TEST_FAIL})
 
+test-compare-tolerance: ${BUILD_DIR}/bin/zsv_compare${EXE}
+	@(${PREFIX} $< ../../data/compare/tolerance1.csv ../../data/compare/tolerance2.csv ${REDIRECT1} ${TMP_DIR}/$@.out1 && \
+	${CMP} ${TMP_DIR}/$@.out1 expected/$@.out1 && ${TEST_PASS} || ${TEST_FAIL})
+
+	@(${PREFIX} $< --tolerance 0.001 ../../data/compare/tolerance1.csv ../../data/compare/tolerance2.csv ${REDIRECT1} ${TMP_DIR}/$@.out2 && \
+	${CMP} ${TMP_DIR}/$@.out2 expected/$@.out2 && ${TEST_PASS} || ${TEST_FAIL})
+
+	@(${PREFIX} $< --tolerance 0.0001 ../../data/compare/tolerance1.csv ../../data/compare/tolerance2.csv ${REDIRECT1} ${TMP_DIR}/$@.out3 && \
+	${CMP} ${TMP_DIR}/$@.out3 expected/$@.out3 && ${TEST_PASS} || ${TEST_FAIL})
+
+	@(${PREFIX} $< --tolerance 0.00001 ../../data/compare/tolerance1.csv ../../data/compare/tolerance2.csv ${REDIRECT1} ${TMP_DIR}/$@.out4 && \
+	${CMP} ${TMP_DIR}/$@.out4 expected/$@.out4 && ${TEST_PASS} || ${TEST_FAIL})
+
+
 test-compare: test-%: ${BUILD_DIR}/bin/zsv_%${EXE}
 	@${TEST_INIT}
 	@(${PREFIX} $< compare/t1.csv compare/t2.csv compare/t3.csv ${REDIRECT1} ${TMP_DIR}/$@.out && \
@@ -529,3 +543,4 @@ test-compare: test-%: ${BUILD_DIR}/bin/zsv_%${EXE}
 
 	@(${PREFIX} $< ../../data/compare/t1.csv ../../data/compare/t2.csv --add AccentCity --sort -k country -k city ${REDIRECT1} ${TMP_DIR}/$@.out10 && \
 	${CMP} ${TMP_DIR}/$@.out10 expected/$@.out10 && ${TEST_PASS} || ${TEST_FAIL})
+
diff --git a/app/test/expected/test-compare-tolerance.out1 b/app/test/expected/test-compare-tolerance.out1
new file mode 100644
index 00000000..c2edf924
--- /dev/null
+++ b/app/test/expected/test-compare-tolerance.out1
@@ -0,0 +1,5 @@
+Row #,Column,../../data/compare/tolerance1.csv,../../data/compare/tolerance2.csv
+1,A,1,1.01
+1,B,1,1.001
+1,C,1,1.0001
+1,D,1,1.00009
diff --git a/app/test/expected/test-compare-tolerance.out2 b/app/test/expected/test-compare-tolerance.out2
new file mode 100644
index 00000000..68e7993e
--- /dev/null
+++ b/app/test/expected/test-compare-tolerance.out2
@@ -0,0 +1,2 @@
+Row #,Column,../../data/compare/tolerance1.csv,../../data/compare/tolerance2.csv
+1,A,1,1.01
diff --git a/app/test/expected/test-compare-tolerance.out3 b/app/test/expected/test-compare-tolerance.out3
new file mode 100644
index 00000000..bcf8af67
--- /dev/null
+++ b/app/test/expected/test-compare-tolerance.out3
@@ -0,0 +1,3 @@
+Row #,Column,../../data/compare/tolerance1.csv,../../data/compare/tolerance2.csv
+1,A,1,1.01
+1,B,1,1.001
diff --git a/app/test/expected/test-compare-tolerance.out4 b/app/test/expected/test-compare-tolerance.out4
new file mode 100644
index 00000000..c2edf924
--- /dev/null
+++ b/app/test/expected/test-compare-tolerance.out4
@@ -0,0 +1,5 @@
+Row #,Column,../../data/compare/tolerance1.csv,../../data/compare/tolerance2.csv
+1,A,1,1.01
+1,B,1,1.001
+1,C,1,1.0001
+1,D,1,1.00009
diff --git a/app/utils/string.c b/app/utils/string.c
index aa96ffcc..ec37852a 100644
--- a/app/utils/string.c
+++ b/app/utils/string.c
@@ -330,6 +330,16 @@ size_t zsv_strunescape_backslash(unsigned char *s, size_t len) {
   return j;
 }
 
+// zsv_strtod_exact(): convert the whole of s to a double; returns 0 and sets *d on success, else non-zero
+int zsv_strtod_exact(const char *s, double *d) {
+  if(!*s) return 1; // empty string: nothing to convert; *d is left untouched
+  char *end;
+  *d = strtod(s, &end); // note: *d is assigned before the check below, so it may change even on failure
+  if(*end) return 1; // conversion stopped before the end of the string: reject
+  return 0;
+}
+
+
 #ifndef ZSV_STRING_LIB_ONLY
 struct zsv_cell zsv_get_cell_trimmed(zsv_parser parser, size_t ix) {
   struct zsv_cell c = zsv_get_cell(parser, ix);
diff --git a/data/compare/tolerance1.csv b/data/compare/tolerance1.csv
new file mode 100644
index 00000000..77c53863
--- /dev/null
+++ b/data/compare/tolerance1.csv
@@ -0,0 +1,2 @@
+A,B,C,D
+1,1,1,1
diff --git a/data/compare/tolerance2.csv b/data/compare/tolerance2.csv
new file mode 100644
index 00000000..7eb08ea0
--- /dev/null
+++ b/data/compare/tolerance2.csv
@@ -0,0 +1,2 @@
+A,B,C,D
+1.01,1.001,1.0001,1.00009
diff --git a/examples/lib/README.md b/examples/lib/README.md
index 19349cab..939f7ed6 100644
--- a/examples/lib/README.md
+++ b/examples/lib/README.md
@@ -70,7 +70,13 @@ returns `zsv_status_row` until no more rows are left to parse
 ```
 zsv_parser parser = zsv_new(...);
 while(zsv_next_row(parser) == zsv_status_row) { /* for each row */
-    // do something
+  // do something
+  size_t cell_count = zsv_cell_count(parser);
+  for(size_t i = 0; i < cell_count; i++) {
+    struct zsv_cell c = zsv_get_cell(parser, i);
+    fprintf(stderr, "Cell: %.*s\n", c.len, c.str);
+    ...
+  }
 }
 ```
 
diff --git a/include/zsv/utils/string.h b/include/zsv/utils/string.h
index 30ed5905..21131717 100644
--- a/include/zsv/utils/string.h
+++ b/include/zsv/utils/string.h
@@ -118,8 +118,20 @@ size_t zsv_strnext_is_sign(const unsigned char *s, size_t len);
  */
 size_t zsv_strnext_is_currency(const unsigned char *s, size_t len);
 
+
+/*
+ * Convert a string to a double. The entire string must be consumed by the
+ * conversion; otherwise an error is returned
+ * @param s     NUL-terminated string to convert
+ * @param d     pointer that receives the converted value on success
+ * @returns     0 on success, non-zero on error
+ */
+int zsv_strtod_exact(const char *s, double *d);
+
 /*
  * `zsv_get_cell_trimmed` is equivalent to `zsv_get_cell`, except that it
+ * (when called with the same `parser` and cell index `ix` arguments that
+ * `zsv_get_cell` takes)
  * returns a value with leading and trailing whitespace removed
  */
 #include <zsv.h>