From 79d1953f4c7a172e97576c60a6b89214611f2512 Mon Sep 17 00:00:00 2001
From: Jennifer Chang
Date: Sat, 29 Jul 2023 08:31:30 -0700
Subject: [PATCH 1/2] Switch to augur curate format-dates

---
 ingest/bin/transform-date-fields              | 154 ------------------
 ingest/workflow/snakemake_rules/transform.smk |   2 +-
 2 files changed, 1 insertion(+), 155 deletions(-)
 delete mode 100755 ingest/bin/transform-date-fields

diff --git a/ingest/bin/transform-date-fields b/ingest/bin/transform-date-fields
deleted file mode 100755
index 4ff2a698..00000000
--- a/ingest/bin/transform-date-fields
+++ /dev/null
@@ -1,154 +0,0 @@
-#!/usr/bin/env python3
-"""
-Standardizes format of date fields of the NDJSON record from stdin to
-ISO 8601 date (YYYY-MM-DD) and outputs modified records to stdout.
-"""
-import argparse
-import json
-from datetime import datetime
-from sys import stderr, stdin, stdout
-
-
-def format_date(date_string: str, expected_formats: list) -> str:
-    """
-    Originally from nextstrain/ncov-ingest
-
-    Format *date_string* to ISO 8601 date (YYYY-MM-DD).
-    If *date_string* does not match *expected_formats*, return *date_string*.
-    If *date_string* is missing the year, return masked date 'XXXX-XX-XX'.
-    If *date_string* is an incomplete date (i.e. missing month or day), then
-    missing values are masked with 'XX'.
-
-    >>> expected_formats = ['%Y-%m-%d', '%Y-%m-%dT%H:%M:%SZ', '%m-%d']
-
-    >>> format_date("01-01", expected_formats)
-    'XXXX-XX-XX'
-
-    >>> format_date("2020", expected_formats)
-    '2020-XX-XX'
-
-    >>> format_date("2020-01", expected_formats)
-    '2020-01-XX'
-
-    >>> format_date("2020-1-15", expected_formats)
-    '2020-01-15'
-
-    >>> format_date("2020-1-1", expected_formats)
-    '2020-01-01'
-
-    >>> format_date("2020-01-15", expected_formats)
-    '2020-01-15'
-
-    >>> format_date("2020-01-15T00:00:00Z", expected_formats)
-    '2020-01-15'
-    """
-    # Potential directives that datetime accepts that can return the correct year, month, day fields
-    # see https://docs.python.org/3.9/library/datetime.html#strftime-and-strptime-format-codes
-    #
-    # Allows us to check if year/month/day are included in the date format so we
-    # know when to mask incomplete dates with 'XX'
-    all_field_directives = {'%c', '%x',
-        ('%G', '%V', '%A'), ('%G', '%V', '%a'), ('%G', '%V', '%w'), ('%G', '%V', '%u')
-    }
-    month_and_day_directives = {'%j',
-        ('%U', '%A'), ('%U', '%a'), ('%U', '%w'), ('%U', '%u'),
-        ('%W', '%A'), ('%W', '%a'), ('%W', '%w'), ('%W', '%u')
-    }
-    year_directives = {'%y', '%Y'}
-    month_directives = {'%b', '%B', '%m'}
-    day_directives = {'%d'}
-
-    def directive_is_included(potential_directives: set, date_format: str) -> bool:
-        """
-        Checks if any of the directives in *potential_directives* is included
-        in *date_format* string.
-
-        If an element within *potential_directives* is a tuple, then all directives
-        within the tuple must be included in *date_format*.
-        """
-        return any(
-            (
-                (isinstance(directive, str) and directive in date_format) or
-                (isinstance(directive, tuple) and all(sub_directive in date_format for sub_directive in directive))
-            )
-            for directive in potential_directives
-        )
-
-    for date_format in expected_formats:
-        try:
-            parsed_date = datetime.strptime(date_string, date_format)
-        except ValueError:
-            continue
-
-        # Default to date masked as 'XXXX-XX-XX' so we don't return incorrect dates
-        year_string = 'XXXX'
-        month_string = day_string = 'XX'
-
-        parsed_year_string = str(parsed_date.year)
-        parsed_month_string = str(parsed_date.month).zfill(2)
-        parsed_day_string = str(parsed_date.day).zfill(2)
-
-        # If a directive for ALL fields is included in date format,
-        # then use all of the parsed field strings
-        if (directive_is_included(all_field_directives, date_format)):
-            year_string = parsed_year_string
-            month_string = parsed_month_string
-            day_string = parsed_day_string
-
-        # If not all fields directives are included, then check year
-        # directive was included in date format
-        elif(directive_is_included(year_directives, date_format)):
-            year_string = parsed_year_string
-
-            # Only check for month and day directives if year is included
-            # Check if directive for BOTH month and year is included in date format
-            if (directive_is_included(month_and_day_directives, date_format)):
-                month_string = parsed_month_string
-                day_string = parsed_day_string
-
-            # If not directives for BOTH month and day are included, then check
-            # month directive was included in date format
-            elif(directive_is_included(month_directives, date_format)):
-                month_string = parsed_month_string
-
-                # Only check for day directives if month is included
-                if(directive_is_included(day_directives, date_format)):
-                    day_string = parsed_day_string
-
-        return f"{year_string}-{month_string}-{day_string}"
-
-    if date_string:
-        print(
-            f"WARNING: Unable to transform date string {date_string!r} because it does not match",
-            f"any of the expected formats {expected_formats}.",
-            file=stderr
-        )
-
-    return date_string
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(
-        description=__doc__,
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter
-    )
-    parser.add_argument("--date-fields", nargs="+",
-        help="List of date field names in the NDJSON record that need to be standardized.")
-    parser.add_argument("--expected-date-formats", nargs="+",
-        help="Expected date formats that are currently in the provided date fields." +
-            "If a date string matches multiple formats, it will be parsed as the first format in the list.")
-
-    args = parser.parse_args()
-
-    expected_formats = args.expected_date_formats
-
-    for record in stdin:
-        record = json.loads(record)
-
-        for field in args.date_fields:
-            date_string = record.get(field)
-            if date_string:
-                record[field] = format_date(date_string, expected_formats)
-
-        json.dump(record, stdout, allow_nan=False, indent=None, separators=',:')
-        print()
diff --git a/ingest/workflow/snakemake_rules/transform.smk b/ingest/workflow/snakemake_rules/transform.smk
index 20a5f104..94da43ba 100644
--- a/ingest/workflow/snakemake_rules/transform.smk
+++ b/ingest/workflow/snakemake_rules/transform.smk
@@ -71,7 +71,7 @@ rule transform:
             | ./vendored/transform-strain-names \
                 --strain-regex {params.strain_regex} \
                 --backup-fields {params.strain_backup_fields} \
-            | ./bin/transform-date-fields \
+            | augur curate format-dates \
                 --date-fields {params.date_fields} \
                 --expected-date-formats {params.expected_date_formats} \
             | ./vendored/transform-genbank-location \

From 0aa951928c8be3ea61c2eefd2c92036cb89ff124 Mon Sep 17 00:00:00 2001
From: Jennifer Chang
Date: Tue, 1 Aug 2023 17:40:51 -0700
Subject: [PATCH 2/2] Switch to augur curate titlecase

---
 ingest/bin/transform-string-fields            | 84 ------------------
 ingest/workflow/snakemake_rules/transform.smk |  2 +-
 2 files changed, 1 insertion(+), 85 deletions(-)
 delete mode 100755 ingest/bin/transform-string-fields

diff --git a/ingest/bin/transform-string-fields b/ingest/bin/transform-string-fields
deleted file mode 100755
index e0749e68..00000000
--- a/ingest/bin/transform-string-fields
+++ /dev/null
@@ -1,84 +0,0 @@
-#!/usr/bin/env python3
-"""
-Standardizes string fields of the NDJSON record from stdin and outputs the
-modified record to stdout.
-"""
-import argparse
-import json
-import re
-from sys import stdin, stdout
-from typing import Optional, Set, Union
-
-
-def titlecase(text: Union[str, None], articles: Set[str] = {}, abbreviations: Set[str] = {}) -> Optional[str]:
-    """
-    Originally from nextstrain/ncov-ingest
-
-    Returns a title cased location name from the given location name
-    *tokens*. Ensures that no tokens contained in the *whitelist_tokens* are
-    converted to title case.
-
-    >>> articles = {'a', 'and', 'of', 'the', 'le'}
-    >>> abbreviations = {'USA', 'DC'}
-
-    >>> titlecase("the night OF THE LIVING DEAD", articles)
-    'The Night of the Living Dead'
-
-    >>> titlecase("BRAINE-LE-COMTE, FRANCE", articles)
-    'Braine-le-Comte, France'
-
-    >>> titlecase("auvergne-RHÔNE-alpes", articles)
-    'Auvergne-Rhône-Alpes'
-
-    >>> titlecase("washington DC, usa", articles, abbreviations)
-    'Washington DC, USA'
-    """
-    if not isinstance(text, str):
-        return
-
-    words = enumerate(re.split(r'\b', text))
-
-    def changecase(index, word):
-        casefold = word.casefold()
-        upper = word.upper()
-
-        if upper in abbreviations:
-            return upper
-        elif casefold in articles and index != 1:
-            return word.lower()
-        else:
-            return word.title()
-
-    return ''.join(changecase(i, w) for i, w in words)
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(
-        description=__doc__,
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter
-    )
-    parser.add_argument("--titlecase-fields", nargs="*",
-        help="List of fields to convert to titlecase.")
-    parser.add_argument("--articles", nargs="*",
-        help="List of articles that should not be cast to titlecase.")
-    parser.add_argument("--abbreviations", nargs="*",
-        help="List of abbreviations that should not be cast to titlecase, keeps uppercase.")
-
-
-    args = parser.parse_args()
-
-    if args.articles:
-        articles = set(args.articles)
-
-    if args.abbreviations:
-        abbreviations = set(args.abbreviations)
-
-    for record in stdin:
-        record = json.loads(record)
-
-        if args.titlecase_fields:
-            for field in args.titlecase_fields:
-                record[field] = titlecase(record.get(field, ""), articles, abbreviations)
-
-        json.dump(record, stdout, allow_nan=False, indent=None, separators=',:')
-        print()
diff --git a/ingest/workflow/snakemake_rules/transform.smk b/ingest/workflow/snakemake_rules/transform.smk
index 94da43ba..a39fb735 100644
--- a/ingest/workflow/snakemake_rules/transform.smk
+++ b/ingest/workflow/snakemake_rules/transform.smk
@@ -75,7 +75,7 @@ rule transform:
                 --date-fields {params.date_fields} \
                 --expected-date-formats {params.expected_date_formats} \
             | ./vendored/transform-genbank-location \
-            | ./bin/transform-string-fields \
+            | augur curate titlecase \
                --titlecase-fields {params.titlecase_fields} \
                 --articles {params.articles} \
                 --abbreviations {params.abbreviations} \
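
Note: for anyone who wants to exercise the two new curate steps outside of Snakemake, the
equivalent shell pipeline might look roughly like the sketch below. Only the subcommand names
and option flags are taken from the rule above; the input/output file names and every field,
format, article, and abbreviation value are illustrative placeholders, not values from this
repo's config.

    # Illustrative sketch only: file names and all option values below are placeholders.
    cat data/metadata.ndjson \
        | augur curate format-dates \
            --date-fields date \
            --expected-date-formats '%Y' '%Y-%m' '%Y-%m-%d' '%Y-%m-%dT%H:%M:%SZ' \
        | augur curate titlecase \
            --titlecase-fields region country division \
            --articles and of the \
            --abbreviations USA DC \
        > data/metadata_curated.ndjson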