From 6021ae28535b3c6dcc55d4b2bf5b76236c86e309 Mon Sep 17 00:00:00 2001 From: liquidaty Date: Wed, 17 Apr 2024 11:13:50 -0700 Subject: [PATCH] Compare: add --tolerance option to ignore differences between numeric strings within a numeric tolerance (#168) * compare: add --tolerance option --- README.md | 48 ++++++++++++-- app/compare.c | 66 +++++++++++++------ app/compare_internal.h | 6 ++ app/test/Makefile | 15 +++++ app/test/expected/test-compare-tolerance.out1 | 5 ++ app/test/expected/test-compare-tolerance.out2 | 2 + app/test/expected/test-compare-tolerance.out3 | 3 + app/test/expected/test-compare-tolerance.out4 | 5 ++ app/utils/string.c | 10 +++ data/compare/tolerance1.csv | 2 + data/compare/tolerance2.csv | 2 + examples/lib/README.md | 8 ++- include/zsv/utils/string.h | 12 ++++ 13 files changed, 159 insertions(+), 25 deletions(-) create mode 100644 app/test/expected/test-compare-tolerance.out1 create mode 100644 app/test/expected/test-compare-tolerance.out2 create mode 100644 app/test/expected/test-compare-tolerance.out3 create mode 100644 app/test/expected/test-compare-tolerance.out4 create mode 100644 data/compare/tolerance1.csv create mode 100644 data/compare/tolerance2.csv diff --git a/README.md b/README.md index d9884e7f..7c42fe5d 100644 --- a/README.md +++ b/README.md @@ -225,17 +225,20 @@ for speed and ease of development for extending and/or customizing to your needs * `echo`: read CSV from stdin and write it back out to stdout. This is mostly useful for demonstrating how to use the API and also how to create a plug-in, - and has some limited utility beyond that e.g. 
for adding/removing the UTF8 - BOM, or cleaning up bad UTF8 + and has several uses beyond that including adding/removing BOM, + cleaning up bad UTF8, + whitespace or blank column trimming, + limiting output to a contiguous data block, skipping leading garbage, and even + providing substitution values without modifying the underlying source * `select`: re-shape CSV by skipping leading garbage, combining header rows into a single header, selecting or excluding specified columns, removing duplicate - columns, sampling, searching and more -* `sql`: run ad-hoc SQL query on a CSV file + columns, sampling, converting from fixed-width input, searching and more +* `sql`: treat one or more CSV files like database tables and query with SQL * `desc`: provide a quick description of your table data * `pretty`: format for console (fixed-width) display, or convert to markdown format * `2json`: convert CSV to JSON. Optionally, output in [database schema](docs/db.schema.json) -* `2tsv`: convert CSV to TSV +* `2tsv`: convert to TSV (tab-delimited) format * `compare`: compare two or more tables of data and output the differences * `paste` (alpha): horizontally paste two tables together (given inputs X and Y, output 1...N rows where each row all columns of X in row N, followed by all columns of Y in row N) @@ -264,6 +267,41 @@ zsv sql my_population_data.csv "select * from data where population > 100000" ### Using the API +Simple API usage examples include: + +Pull parsing: +``` +zsv_parser parser = zsv_new(...); +while(zsv_next_row(parser) == zsv_status_row) { /* for each row */ + // do something + size_t cell_count = zsv_cell_count(parser); + for(size_t i = 0; i < cell_count; i++) { + struct zsv_cell c = zsv_get_cell(parser, i); + fprintf(stderr, "Cell: %.*s\n", c.len, c.str); + ... + } +``` + +Push parsing: +``` +static void my_row_handler(void *ctx) { + zsv_parser p = ctx; + size_t cell_count = zsv_cell_count(p); + for(size_t i = 0, j = zsv_cell_count(p); i < j; i++) { + ...
+ } +} + +int main() { + zsv_parser p = zsv_new(NULL); + zsv_set_row_handler(p, my_row_handler); + zsv_set_context(p, p); + + enum zsv_status stat; + while((stat = zsv_parse_more(p)) == zsv_status_ok) ; + +``` + Full application code examples can be found at [examples/lib/README.md](examples/lib/README.md). An example of using the API, compiled to wasm and called via Javascript, diff --git a/app/compare.c b/app/compare.c index 9526ab16..de8330f0 100644 --- a/app/compare.c +++ b/app/compare.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -227,25 +228,23 @@ static void zsv_compare_print_row(struct zsv_compare_data *data, #define ZSV_COMPARE_MISSING "Missing" -// if(last_ix + 1 < data->input_count) { - // if we don't have data from every input, then output "Missing" for missing inputs - char got_missing = 0; - for(unsigned i = 0; i < data->input_count; i++) { - struct zsv_compare_input *input = data->inputs_to_sort[i]; - if(i > last_ix) { - got_missing = 1; - unsigned input_ix = input->index; - values[input_ix].str = (unsigned char *)ZSV_COMPARE_MISSING; - values[input_ix].len = strlen(ZSV_COMPARE_MISSING); - } - } - if(got_missing) { - const unsigned char *key_names = data->print_key_col_names ? zsv_compare_combined_key_names(data) : (const unsigned char *)""; - zsv_compare_output_tuple(data, key_input, key_names, values, 1); - // reset values - memset(values, 0, data->input_count * sizeof(*values)); + // if we don't have data from every input, then output "Missing" for missing inputs + char got_missing = 0; + for(unsigned i = 0; i < data->input_count; i++) { + struct zsv_compare_input *input = data->inputs_to_sort[i]; + if(i > last_ix) { + got_missing = 1; + unsigned input_ix = input->index; + values[input_ix].str = (unsigned char *)ZSV_COMPARE_MISSING; + values[input_ix].len = strlen(ZSV_COMPARE_MISSING); } -// } + } + if(got_missing) { + const unsigned char *key_names = data->print_key_col_names ?
zsv_compare_combined_key_names(data) : (const unsigned char *)""; + zsv_compare_output_tuple(data, key_input, key_names, values, 1); + // reset values + memset(values, 0, data->input_count * sizeof(*values)); + } // for each output column zsv_compare_unique_colname *output_col = data->output_colnames_first; @@ -272,8 +271,23 @@ static void zsv_compare_print_row(struct zsv_compare_data *data, if(!output_col) output_col = input->output_colnames[input_col_ix]; values[input_ix] = data->get_cell(input, input_col_ix); - if(i > 0 && !different && data->cmp(data->cmp_ctx, values[first_input_ix], values[input_ix], data, input_col_ix)) + if(i > 0 && !different && data->cmp(data->cmp_ctx, values[first_input_ix], values[input_ix], data, input_col_ix)) { different = 1; + if(data->tolerance.value + && values[first_input_ix].len < ZSV_COMPARE_MAX_NUMBER_BUFF_LEN + && values[input_ix].len < ZSV_COMPARE_MAX_NUMBER_BUFF_LEN) { + // check if both are numbers with a difference less than the given tolerance + double d1, d2; + memcpy(data->tolerance.str1, values[first_input_ix].str, values[first_input_ix].len); + data->tolerance.str1[values[first_input_ix].len] = '\0'; + memcpy(data->tolerance.str2, values[input_ix].str, values[input_ix].len); + data->tolerance.str2[values[input_ix].len] = '\0'; + if(!zsv_strtod_exact(data->tolerance.str1, &d1) + && !zsv_strtod_exact(data->tolerance.str2, &d2) + && fabs(d1 - d2) < data->tolerance.value) + different = 0; + } + } } } @@ -608,6 +622,10 @@ static int compare_usage() { " --sort : sort on keys before comparing", " --sort-in-memory : for sorting, use in-memory instead of temporary db", " (see https://www.sqlite.org/inmemorydb.html)", + " --tolerance : ignore differences where both values are numeric", + " strings with values differing by less than the given", + " amount e.g. 
--tolerance 0.01 will ignore differences", + " of numeric strings such as 123.45 vs 123.44", " --json : output as JSON", " --json-compact : output as compact JSON", " --json-object : output as an array of objects", @@ -695,6 +713,16 @@ int ZSV_MAIN_FUNC(ZSV_COMMAND)(int argc, const char *argv[], struct zsv_opts *op data->added_colcount++; } } + } else if(!strcmp(arg, "--tolerance")) { + const char *next_arg = zsv_next_arg(++arg_i, argc, argv, &err); + if(next_arg) { + if(zsv_strtod_exact(next_arg, &data->tolerance.value)) + fprintf(stderr, "Invalid numeric value: %s\n", next_arg), err = 1; + else if(data->tolerance.value < 0) + fprintf(stderr, "Tolerance must be greater than zero (got %s)\n", next_arg), err = 1; + else + data->tolerance.value = nextafterf(data->tolerance.value, INFINITY); + } } else if(!strcmp(arg, "--sort")) { data->sort = 1; } else if(!strcmp(arg, "--json")) { diff --git a/app/compare_internal.h b/app/compare_internal.h index 52bb49c3..4c3acc70 100644 --- a/app/compare_internal.h +++ b/app/compare_internal.h @@ -106,6 +106,12 @@ struct zsv_compare_data { sqlite3 *sort_db; // used when --sort option was specified + struct { + double value; +#define ZSV_COMPARE_MAX_NUMBER_BUFF_LEN 128 + char str1[ZSV_COMPARE_MAX_NUMBER_BUFF_LEN]; + char str2[ZSV_COMPARE_MAX_NUMBER_BUFF_LEN]; + } tolerance; struct { char type; // 'j' for json union { diff --git a/app/test/Makefile b/app/test/Makefile index e25e0a42..e5f62d67 100644 --- a/app/test/Makefile +++ b/app/test/Makefile @@ -495,6 +495,20 @@ test-desc: test-%: ${BUILD_DIR}/bin/zsv_%${EXE} @(${PREFIX} $< < ${TEST_DATA_DIR}/test/$*-trim.csv ${REDIRECT2} ${TMP_DIR}/$@.trim && \ ${CMP} ${TMP_DIR}/$@.trim expected/$@.trim && ${TEST_PASS} || ${TEST_FAIL}) +test-compare-tolerance: ${BUILD_DIR}/bin/zsv_compare${EXE} + @(${PREFIX} $< ../../data/compare/tolerance1.csv ../../data/compare/tolerance2.csv ${REDIRECT1} ${TMP_DIR}/$@.out1 && \ + ${CMP} ${TMP_DIR}/$@.out1 expected/$@.out1 && ${TEST_PASS} || ${TEST_FAIL}) + + 
@(${PREFIX} $< --tolerance 0.001 ../../data/compare/tolerance1.csv ../../data/compare/tolerance2.csv ${REDIRECT1} ${TMP_DIR}/$@.out2 && \ + ${CMP} ${TMP_DIR}/$@.out2 expected/$@.out2 && ${TEST_PASS} || ${TEST_FAIL}) + + @(${PREFIX} $< --tolerance 0.0001 ../../data/compare/tolerance1.csv ../../data/compare/tolerance2.csv ${REDIRECT1} ${TMP_DIR}/$@.out3 && \ + ${CMP} ${TMP_DIR}/$@.out3 expected/$@.out3 && ${TEST_PASS} || ${TEST_FAIL}) + + @(${PREFIX} $< --tolerance 0.00001 ../../data/compare/tolerance1.csv ../../data/compare/tolerance2.csv ${REDIRECT1} ${TMP_DIR}/$@.out4 && \ + ${CMP} ${TMP_DIR}/$@.out4 expected/$@.out4 && ${TEST_PASS} || ${TEST_FAIL}) + + test-compare: test-%: ${BUILD_DIR}/bin/zsv_%${EXE} @${TEST_INIT} @(${PREFIX} $< compare/t1.csv compare/t2.csv compare/t3.csv ${REDIRECT1} ${TMP_DIR}/$@.out && \ @@ -529,3 +543,4 @@ test-compare: test-%: ${BUILD_DIR}/bin/zsv_%${EXE} @(${PREFIX} $< ../../data/compare/t1.csv ../../data/compare/t2.csv --add AccentCity --sort -k country -k city ${REDIRECT1} ${TMP_DIR}/$@.out10 && \ ${CMP} ${TMP_DIR}/$@.out10 expected/$@.out10 && ${TEST_PASS} || ${TEST_FAIL}) + diff --git a/app/test/expected/test-compare-tolerance.out1 b/app/test/expected/test-compare-tolerance.out1 new file mode 100644 index 00000000..c2edf924 --- /dev/null +++ b/app/test/expected/test-compare-tolerance.out1 @@ -0,0 +1,5 @@ +Row #,Column,../../data/compare/tolerance1.csv,../../data/compare/tolerance2.csv +1,A,1,1.01 +1,B,1,1.001 +1,C,1,1.0001 +1,D,1,1.00009 diff --git a/app/test/expected/test-compare-tolerance.out2 b/app/test/expected/test-compare-tolerance.out2 new file mode 100644 index 00000000..68e7993e --- /dev/null +++ b/app/test/expected/test-compare-tolerance.out2 @@ -0,0 +1,2 @@ +Row #,Column,../../data/compare/tolerance1.csv,../../data/compare/tolerance2.csv +1,A,1,1.01 diff --git a/app/test/expected/test-compare-tolerance.out3 b/app/test/expected/test-compare-tolerance.out3 new file mode 100644 index 00000000..bcf8af67 --- /dev/null +++ 
b/app/test/expected/test-compare-tolerance.out3 @@ -0,0 +1,3 @@ +Row #,Column,../../data/compare/tolerance1.csv,../../data/compare/tolerance2.csv +1,A,1,1.01 +1,B,1,1.001 diff --git a/app/test/expected/test-compare-tolerance.out4 b/app/test/expected/test-compare-tolerance.out4 new file mode 100644 index 00000000..c2edf924 --- /dev/null +++ b/app/test/expected/test-compare-tolerance.out4 @@ -0,0 +1,5 @@ +Row #,Column,../../data/compare/tolerance1.csv,../../data/compare/tolerance2.csv +1,A,1,1.01 +1,B,1,1.001 +1,C,1,1.0001 +1,D,1,1.00009 diff --git a/app/utils/string.c b/app/utils/string.c index aa96ffcc..ec37852a 100644 --- a/app/utils/string.c +++ b/app/utils/string.c @@ -330,6 +330,16 @@ size_t zsv_strunescape_backslash(unsigned char *s, size_t len) { return j; } +// zsv_strtod_exact(const char *s): return error; if 0, set value of *d +int zsv_strtod_exact(const char *s, double *d) { + if(!*s) return 1; + char *end; + *d = strtod(s, &end); + if(*end) return 1; + return 0; +} + + #ifndef ZSV_STRING_LIB_ONLY struct zsv_cell zsv_get_cell_trimmed(zsv_parser parser, size_t ix) { struct zsv_cell c = zsv_get_cell(parser, ix); diff --git a/data/compare/tolerance1.csv b/data/compare/tolerance1.csv new file mode 100644 index 00000000..77c53863 --- /dev/null +++ b/data/compare/tolerance1.csv @@ -0,0 +1,2 @@ +A,B,C,D +1,1,1,1 diff --git a/data/compare/tolerance2.csv b/data/compare/tolerance2.csv new file mode 100644 index 00000000..7eb08ea0 --- /dev/null +++ b/data/compare/tolerance2.csv @@ -0,0 +1,2 @@ +A,B,C,D +1.01,1.001,1.0001,1.00009 diff --git a/examples/lib/README.md b/examples/lib/README.md index 19349cab..939f7ed6 100644 --- a/examples/lib/README.md +++ b/examples/lib/README.md @@ -70,7 +70,13 @@ returns `zsv_status_row` until no more rows are left to parse ``` zsv_parser parser = zsv_new(...); while(zsv_next_row(parser) == zsv_status_row) { /* for each row */ - // do something + // do something + size_t cell_count = zsv_cell_count(parser); + for(size_t i = 0; i < 
cell_count; i++) { + struct zsv_cell c = zsv_get_cell(parser, i); + fprintf(stderr, "Cell: %.*s\n", c.len, c.str); + ... + } } ``` diff --git a/include/zsv/utils/string.h b/include/zsv/utils/string.h index 30ed5905..21131717 100644 --- a/include/zsv/utils/string.h +++ b/include/zsv/utils/string.h @@ -118,8 +118,20 @@ size_t zsv_strnext_is_sign(const unsigned char *s, size_t len); */ size_t zsv_strnext_is_currency(const unsigned char *s, size_t len); + +/* + * Convert a string to a double. must convert entire string, else returns error + * @param s string to convert + * @param d pointer to converted value, on success + * + * @returns 0 on success, non-zero on error + */ +int zsv_strtod_exact(const char *s, double *d); + /* * `zsv_get_cell_trimmed` is equivalent to `zsv_get_cell`, except that it + * @param s string to convert + * @param len length of input string * returns a value with leading and trailing whitespace removed */ #include