From 3ef534e38af05b8d927fc96a3ecbc150552b06e2 Mon Sep 17 00:00:00 2001
From: James Hadfield <hadfield.james@gmail.com>
Date: Tue, 16 May 2023 16:05:59 +1200
Subject: [PATCH 1/2] Update arguments to augur clades

This updates the workflow to use the new clades interface from augur
v22 (see nextstrain/augur#728). In the process we can remove two
rules from the workflow. The minimum augur version is bumped to 22.0.1,
as that includes a couple of important bug-fixes.

If this workflow is run with a version of augur prior to v22, the
emerging_lineages rule will error due to unknown arguments.

The script add_branch_labels.py is no longer used and thus removed here
(as recommended in code review: https://github.com/nextstrain/ncov/pull/1000#discussion_r1195787805)
Note that it contained unused functionality to export spike mutations;
if we reinstate this in the future we should update the output format
to produce a node-data JSON with a custom branch label to avoid modifying
the auspice JSON after export.
---
 docs/src/reference/change_log.md           |  4 ++
 scripts/add_branch_labels.py               | 63 ----------------------
 workflow/envs/nextstrain.yaml              |  2 +-
 workflow/snakemake_rules/main_workflow.smk | 44 ++-------------
 4 files changed, 10 insertions(+), 103 deletions(-)
 delete mode 100644 scripts/add_branch_labels.py

diff --git a/docs/src/reference/change_log.md b/docs/src/reference/change_log.md
index c7d60f193..bf16dcd6e 100644
--- a/docs/src/reference/change_log.md
+++ b/docs/src/reference/change_log.md
@@ -5,6 +5,10 @@ We also use this change log to document new features that maintain backward comp
 
 ## New features since last version update
 
+## v13 (16 May 2023)
+
+- 16 May 2023: Update workflow to support [Augur v22](https://github.com/nextstrain/augur/releases/tag/22.0.0), which updates the `augur clades` interface and the structure of its output files to allow specifying the clade label & coloring keys. Because we use custom scripts to parse these files, this workflow also needed updating. This change simplifies the nCoV pipeline ([PR 1000](https://github.com/nextstrain/ncov/pull/1000)).
+
 - 11 April 2023: Elevate XBB.1.16 as new clade 23B. See [PR 1059](https://github.com/nextstrain/ncov/pull/1059) for the rationale behind this clade update.
 
 - 6 April 2023: Update conda environment dependencies: augur 19.2.0 -> 21.1.0, nextalign/nextclade 2.9.1 -> 2.13.1, iqtree 2.2.0_beta -> 2.2.0.3. [PR 1056](https://github.com/nextstrain/ncov/pull/1056)
diff --git a/scripts/add_branch_labels.py b/scripts/add_branch_labels.py
deleted file mode 100644
index 613d28df3..000000000
--- a/scripts/add_branch_labels.py
+++ /dev/null
@@ -1,63 +0,0 @@
-import argparse
-import json
-from Bio import Phylo
-from collections import defaultdict
-
-def extract_spike_mutations(node_data):
-    data = {}
-    for name, node in node_data["nodes"].items():
-        smuts = node.get("aa_muts", {}).get("S", [])
-        if smuts:
-            data[name] = ", ".join(smuts)
-    return data
-
-def extract_clade_labels(node_data):
-    data = {}
-    for name, node in node_data["nodes"].items():
-        if "clade_annotation" in node:
-            data[name] = node["clade_annotation"]
-    return data
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(
-        description="Remove extraneous colorings",
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter
-    )
-
-    parser.add_argument('--input', type=str, metavar="JSON", required=True, help="input Auspice JSON")
-    parser.add_argument('--mutations', type=str, required=False, help="mutations node data file")
-    parser.add_argument('--emerging-clades', type=str, required=True, help="emerging clades node data file")
-    parser.add_argument('--output', type=str, metavar="JSON", required=True, help="output Auspice JSON")
-    args = parser.parse_args()
-
-    with open(args.input, "r") as f:
-        auspice_json = json.load(f)
-
-    if args.mutations:
-        with open(args.mutations, "r") as f:
-            spike_mutations = extract_spike_mutations(json.load(f))
-    else:
-        spike_mutations = {}
-
-    with open(args.emerging_clades, "r") as f:
-        clade_labels = extract_clade_labels(json.load(f))
-
-    def attach_labels(n): # closure
-      if n["name"] in spike_mutations or n["name"] in clade_labels:
-          if "branch_attrs" not in n:
-              n["branch_attrs"]={}
-          if "labels" not in n["branch_attrs"]:
-              n["branch_attrs"]["labels"]={}
-          if n["name"] in spike_mutations:
-              n["branch_attrs"]["labels"]["spike_mutations"] = spike_mutations[n["name"]]
-          if n["name"] in clade_labels:
-              n["branch_attrs"]["labels"]["emerging_lineage"] = clade_labels[n["name"]]
-
-      if "children" in n:
-          for c in n["children"]:
-              attach_labels(c)
-
-    attach_labels(auspice_json["tree"])
-
-    with open(args.output, 'w') as f:
-        json.dump(auspice_json, f, indent=2)
diff --git a/workflow/envs/nextstrain.yaml b/workflow/envs/nextstrain.yaml
index 00cfbb3af..c2b1ce23a 100644
--- a/workflow/envs/nextstrain.yaml
+++ b/workflow/envs/nextstrain.yaml
@@ -4,7 +4,7 @@ channels:
   - bioconda
   - defaults
 dependencies:
-  - augur=21.1.0
+  - augur=22.0.1
   - epiweeks=2.1.2
   - iqtree=2.2.0.3
   - nextalign=2.13.1
diff --git a/workflow/snakemake_rules/main_workflow.smk b/workflow/snakemake_rules/main_workflow.smk
index 7dfa61b35..27ba1dcb2 100644
--- a/workflow/snakemake_rules/main_workflow.smk
+++ b/workflow/snakemake_rules/main_workflow.smk
@@ -1075,7 +1075,7 @@ rule emerging_lineages:
         emerging_lineages = config["files"]["emerging_lineages"],
         clades = config["files"]["clades"]
     output:
-        clade_data = "results/{build_name}/temp_emerging_lineages.json"
+        clade_data = "results/{build_name}/emerging_lineages.json"
     log:
         "logs/emerging_lineages_{build_name}.txt"
     benchmark:
@@ -1089,27 +1089,11 @@ rule emerging_lineages:
         augur clades --tree {input.tree} \
             --mutations {input.nuc_muts} {input.aa_muts} \
             --clades {input.emerging_lineages} \
+            --membership-name emerging_lineage \
+            --label-name emerging_lineage \
             --output-node-data {output.clade_data} 2>&1 | tee {log}
         """
 
-rule rename_emerging_lineages:
-    input:
-        node_data = rules.emerging_lineages.output.clade_data
-    output:
-        clade_data = "results/{build_name}/emerging_lineages.json"
-    benchmark:
-        "benchmarks/rename_emerging_lineages_{build_name}.txt"
-    run:
-        import json
-        with open(input.node_data, 'r', encoding='utf-8') as fh:
-            d = json.load(fh)
-            new_data = {}
-            for k,v in d['nodes'].items():
-                if "clade_membership" in v:
-                    new_data[k] = {"emerging_lineage": v["clade_membership"]}
-        with open(output.clade_data, "w") as fh:
-            json.dump({"nodes": new_data}, fh, indent=2)
-
 rule colors:
     message: "Constructing colors file"
     input:
@@ -1366,7 +1350,7 @@ def _get_node_data_by_wildcards(wildcards):
         rules.refine.output.node_data,
         rules.ancestral.output.node_data,
         rules.translate.output.node_data,
-        rules.rename_emerging_lineages.output.clade_data,
+        rules.emerging_lineages.output.clade_data,
         rules.clades.output.clade_data,
         rules.recency.output.node_data,
         rules.traits.output.node_data,
@@ -1462,28 +1446,10 @@ rule export:
             --output {output.auspice_json} 2>&1 | tee {log}
         """
 
-rule add_branch_labels:
-    message: "Adding custom branch labels to the Auspice JSON"
-    input:
-        auspice_json = rules.export.output.auspice_json,
-        emerging_clades = rules.emerging_lineages.output.clade_data
-    output:
-        auspice_json = "results/{build_name}/ncov_with_branch_labels.json"
-    log:
-        "logs/add_branch_labels{build_name}.txt"
-    conda: config["conda_environment"]
-    shell:
-        """
-        python3 ./scripts/add_branch_labels.py \
-            --input {input.auspice_json} \
-            --emerging-clades {input.emerging_clades} \
-            --output {output.auspice_json}
-        """
-
 rule include_hcov19_prefix:
     message: "Rename strains to include hCoV-19/ prefix"
     input:
-        auspice_json = rules.add_branch_labels.output.auspice_json,
+        auspice_json = rules.export.output.auspice_json,
         tip_frequencies = rules.tip_frequencies.output.tip_frequencies_json
     output:
         auspice_json = "results/{build_name}/ncov_with_hcov19_prefix.json",

From 7b5312e6e2dc8db1c4826fb3ca22b1e9cca85643 Mon Sep 17 00:00:00 2001
From: James Hadfield <hadfield.james@gmail.com>
Date: Tue, 16 May 2023 17:13:54 +1200
Subject: [PATCH 2/2] Restore branch label ordering

This is a minor aesthetic change which restores the order of the
branch labels in Auspice so that clade appears before emerging_lineage.
The change in ordering was introduced by the previous commit as
we moved from a custom script to using the `augur clades` interface
from Augur v22.0.0.

The ordering is not specified and cannot be configured, but in practice
it is the order of the keys in the dictionary (which is stable in
CPython 3.6, guaranteed in Python 3.7+, and stable in most (all?) JS
engines).
---
 workflow/snakemake_rules/main_workflow.smk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflow/snakemake_rules/main_workflow.smk b/workflow/snakemake_rules/main_workflow.smk
index 27ba1dcb2..70177bd61 100644
--- a/workflow/snakemake_rules/main_workflow.smk
+++ b/workflow/snakemake_rules/main_workflow.smk
@@ -1350,8 +1350,8 @@ def _get_node_data_by_wildcards(wildcards):
         rules.refine.output.node_data,
         rules.ancestral.output.node_data,
         rules.translate.output.node_data,
-        rules.emerging_lineages.output.clade_data,
         rules.clades.output.clade_data,
+        rules.emerging_lineages.output.clade_data,
         rules.recency.output.node_data,
         rules.traits.output.node_data,
         rules.logistic_growth.output.node_data,