Merge pull request #22 from PathoGenOmics-Lab/feature-minorrev
Minor workflow revision
ahmig authored Oct 16, 2023
2 parents e381577 + a8a552b commit e00656b
Showing 15 changed files with 266 additions and 202 deletions.
4 changes: 4 additions & 0 deletions .test/targets.yaml
@@ -36,3 +36,7 @@ NSP:
"../data/report_files/nsp_annotation.csv"
REPORT_QMD:
"../template.qmd"
FEATURES_JSON:
"../config/sarscov2_features.json"
GENETIC_CODE_JSON:
"../config/standard_genetic_code.json"
20 changes: 0 additions & 20 deletions README.md
@@ -25,26 +25,6 @@ snakemake --use-conda -c4
The pipeline also allows for its [execution in an HPC environment](config/README.md#run-modes).
Please check [config/README.md](config/README.md) for in-detail setup instructions.

## Methodology

### Pairwise distances between samples

To better describe the relationship between samples, pairwise distances are calculated taking into account the allele frequencies observed in the sequencing data. Our approach to computing the distance between two sets of allele frequencies is based on the FST formula and defines the distance between two samples as:

```math
d(M,N)=\sum\limits_{i=1}^I \frac {\sum\limits_{j=1}^J (M_{ij} -N_{ij})^2 } {4 - \sum\limits_{j=1}^J (M_{ij} +N_{ij})^2 }
```

where:

$M$ and $N$: the two samples being compared.

$i = 1 \dots I$: index over polymorphic sites.

$j = 1 \dots J$: index over alleles.

$M_{ij}$: frequency of allele $j$ at position $i$ in sample $M$ (and analogously $N_{ij}$ for sample $N$).
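
As an illustration, a minimal Python sketch of this calculation (hypothetical function and variable names, not part of the workflow code) could be:

```python
import numpy as np

def allele_frequency_distance(M: np.ndarray, N: np.ndarray) -> float:
    """Distance between two samples given per-site allele frequencies.

    M and N have shape (I, J): I polymorphic sites, J alleles, and each row
    holds the allele frequencies observed at that site.
    """
    num = ((M - N) ** 2).sum(axis=1)      # per site: sum_j (M_ij - N_ij)^2
    den = 4 - ((M + N) ** 2).sum(axis=1)  # per site: 4 - sum_j (M_ij + N_ij)^2
    # Note: sites where both samples are fixed for the same allele give 0/0
    # and would need to be masked in a real implementation.
    return float((num / den).sum())       # sum over polymorphic sites

M = np.array([[0.9, 0.1], [0.5, 0.5]])
N = np.array([[0.2, 0.8], [0.5, 0.5]])
print(allele_frequency_distance(M, N))  # ~0.49
```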

## Contributors

[![Contributors figure](https://contrib.rocks/image?repo=PathoGenOmics-Lab/VIPERA)](https://github.com/PathoGenOmics-Lab/VIPERA/graphs/contributors)
81 changes: 81 additions & 0 deletions build_targets.py
@@ -0,0 +1,81 @@
#!/usr/bin/env python

import sys
import argparse
from pathlib import Path
from typing import List

import yaml


def find_file_with_extension(directory: Path, prefix: str, extensions: List[str]) -> str:
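    """Return the single file under `directory` whose name starts with `prefix`
    and ends with one of `extensions`, as a POSIX path; exit with an error if
    zero or several candidates are found."""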
candidate_files = []
    for path in directory.rglob("*"):
if any(path.name.endswith(ext) for ext in extensions) and path.name.startswith(prefix):
candidate_files.append(path)
if len(candidate_files) == 1:
return candidate_files[0].as_posix()
else:
sys.exit(f"ERROR: {len(candidate_files)} candidates found in '{directory}' for prefix '{prefix}' with extensions {extensions}: {candidate_files}")


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"-i", "--sample-directory",
help="Directory containing sample sequencing data",
required=True,
type=Path
)
parser.add_argument(
"-s", "--sample-names",
nargs="+",
help="Sample names to look for in the sample directory",
required=True
)
parser.add_argument(
"-b", "--bam-extensions",
nargs="+",
help="File extensions for BAM files",
required=False,
default=[".trim.sort.bam"]
)
parser.add_argument(
"-f", "--fasta-extensions",
nargs="+",
help="File extensions for FASTA files",
required=False,
default=[".fa", ".fasta"]
)
parser.add_argument(
"-m", "--metadata-csv",
help="Metadata CSV file",
required=True,
type=Path
)
parser.add_argument(
"-o", "--output-yaml",
help="Output YAML file",
required=True
)
args = parser.parse_args()

# Build targets
targets = {"SAMPLES": {}}
for sample_name in args.sample_names:
targets["SAMPLES"][sample_name] = {}
targets["SAMPLES"][sample_name]["bam"] = find_file_with_extension(args.sample_directory, sample_name, args.bam_extensions)
targets["SAMPLES"][sample_name]["fasta"] = find_file_with_extension(args.sample_directory, sample_name, args.fasta_extensions)

    # Add metadata path, default output directory and empty optional fields
if args.metadata_csv.is_file():
targets["METADATA"] = args.metadata_csv.as_posix()
else:
sys.exit(f"ERROR: metadata file '{args.metadata_csv}' does not exist")
targets["OUTPUT_DIRECTORY"] = "output"
targets["CONTEXT_FASTA"] = None
targets["MAPPING_REFERENCES_FASTA"] = None

# Write output
with open(args.output_yaml, "w") as fw:
yaml.dump(targets, fw, indent=2)
8 changes: 7 additions & 1 deletion config/README.md
@@ -9,7 +9,13 @@ Modify the [target configuration file](config/targets.yaml)
to point the `SAMPLES` and `METADATA` parameters to your data. The `OUTPUT_DIRECTORY`
parameter should point to your desired results directory.

It should look like this:
The script [`build_targets.py`](build_targets.py) simplifies the creation of the
configuration file. Running it requires PyYAML. It takes a list of sample names, a
directory containing the BAM and FASTA files, and the path to the metadata table as
required inputs. It then searches the directory for files matching the sample names
and the expected extensions and adds them to the configuration file.
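
For example, assuming the sequencing files sit under `data/samples/` and the metadata
table is `data/metadata.csv` (placeholder paths and sample names), a possible
invocation would be:

```bash
python build_targets.py \
    -i data/samples \
    -s sample1 sample2 sample3 \
    -m data/metadata.csv \
    -o config/targets.yaml
```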

The file should look like this:

```yaml
SAMPLES:
4 changes: 4 additions & 0 deletions config/config.yaml
@@ -4,6 +4,10 @@ ALIGNMENT_REFERENCE:
"NC_045512.2"
PROBLEMATIC_VCF_URL:
"https://raw.githubusercontent.com/W-L/ProblematicSites_SARS-CoV2/da322c32004f7b16bdaa6a8ee7fd24d56e79d9dc/problematic_sites_sarsCov2.vcf"
FEATURES_JSON:
"config/sarscov2_features.json"
GENETIC_CODE_JSON:
"config/standard_genetic_code.json"
TREE_MODEL:
"GTR+F+I+G4"
UFBOOT_REPS:
14 changes: 14 additions & 0 deletions config/sarscov2_features.json
@@ -0,0 +1,14 @@
{
"ORF1ab polyprotein": "orf1ab",
"ORF1a polyprotein": "orf1ab",
"surface glycoprotein": "S",
"ORF3a protein": "ORF3a",
"envelope protein": "E",
"membrane glycoprotein": "M",
"ORF6 protein": "ORF6",
"ORF7a protein": "ORF7",
"ORF7b": "ORF7",
"ORF8 protein": "ORF8",
"nucleocapsid phosphoprotein": "N",
"ORF10 protein": "ORF10"
}
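
Presumably this file maps GenBank CDS product names to the short gene labels used in the report. A rough sketch of how it could be applied to the fetched `reference.gb` (Biopython and placeholder paths assumed; not the actual workflow code):

```python
import json

from Bio import SeqIO  # Biopython, as suggested by the workflow's biopython conda env

with open("config/sarscov2_features.json") as fh:
    feature_names = json.load(fh)

record = SeqIO.read("output/reference.gb", "genbank")
for feature in record.features:
    if feature.type == "CDS":
        product = feature.qualifiers.get("product", [""])[0]
        label = feature_names.get(product, product)  # fall back to the raw product name
        print(label, feature.location)
```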
66 changes: 66 additions & 0 deletions config/standard_genetic_code.json
@@ -0,0 +1,66 @@
{
"AAA": "K",
"AAC": "N",
"AAG": "K",
"AAT": "N",
"ACA": "T",
"ACC": "T",
"ACG": "T",
"ACT": "T",
"AGA": "R",
"AGC": "S",
"AGG": "R",
"AGT": "S",
"ATA": "I",
"ATC": "I",
"ATG": "M",
"ATT": "I",
"CAA": "Q",
"CAC": "H",
"CAG": "Q",
"CAT": "H",
"CCA": "P",
"CCC": "P",
"CCG": "P",
"CCT": "P",
"CGA": "R",
"CGC": "R",
"CGG": "R",
"CGT": "R",
"CTA": "L",
"CTC": "L",
"CTG": "L",
"CTT": "L",
"GAA": "E",
"GAC": "D",
"GAG": "E",
"GAT": "D",
"GCA": "A",
"GCC": "A",
"GCG": "A",
"GCT": "A",
"GGA": "G",
"GGC": "G",
"GGG": "G",
"GGT": "G",
"GTA": "V",
"GTC": "V",
"GTG": "V",
"GTT": "V",
"TAA": "*",
"TAC": "Y",
"TAG": "*",
"TAT": "Y",
"TCA": "S",
"TCC": "S",
"TCG": "S",
"TCT": "S",
"TGA": "*",
"TGC": "C",
"TGG": "W",
"TGT": "C",
"TTA": "L",
"TTC": "F",
"TTG": "L",
"TTT": "F"
}
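
For context, a minimal sketch (hypothetical, not taken from the workflow scripts) of how a codon table like this can be loaded and used to translate a coding sequence:

```python
import json

with open("config/standard_genetic_code.json") as fh:
    genetic_code = json.load(fh)

def translate(cds: str, code: dict) -> str:
    """Translate a coding sequence codon by codon; unknown codons become 'X'."""
    cds = cds.upper()
    return "".join(code.get(cds[i:i + 3], "X") for i in range(0, len(cds) - 2, 3))

print(translate("ATGGCTTAA", genetic_code))  # -> "MA*"
```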
2 changes: 1 addition & 1 deletion workflow/Snakefile
@@ -14,7 +14,7 @@ include: "rules/common.smk"

# Workflow version
repo_version = get_version_str('.')
__version__ = "0.1" + f"(git: {repo_version})"
__version__ = "0.1" + f" (git: {repo_version})"

# Targets
## Output
6 changes: 3 additions & 3 deletions workflow/rules/evolution.smk
@@ -1,11 +1,11 @@
rule N_S_sites:
threads: 1
conda: "../envs/biopython.yaml"
params:
genetic_code = "standard"
input:
fasta = OUTDIR/f"{OUTPUT_NAME}.ancestor.fasta",
gb = OUTDIR/"reference.gb"
gb = OUTDIR/"reference.gb",
features = Path(config["FEATURES_JSON"]).resolve(),
genetic_code = Path(config["GENETIC_CODE_JSON"]).resolve()
output:
csv = temp(OUTDIR/f"{OUTPUT_NAME}.ancestor.N_S.sites.csv")
log:
4 changes: 2 additions & 2 deletions workflow/rules/fetch.smk
@@ -11,15 +11,15 @@ rule fetch_alignment_reference:
"esearch -db nucleotide -query {params.ref} | efetch -format fasta > {output.fasta} 2> {log}"


rule fetch_alignment_gb:
rule fetch_reference_gb:
threads: 1
conda: "../envs/fetch.yaml"
params:
ref = config["ALIGNMENT_REFERENCE"]
output:
fasta = OUTDIR/"reference.gb"
log:
LOGDIR / "fetch_alignment_reference" / "log.txt"
LOGDIR / "fetch_reference_gb" / "log.txt"
shell:
"esearch -db nucleotide -query {params.ref} | efetch -format gb > {output.fasta} 2> {log}"

3 changes: 2 additions & 1 deletion workflow/rules/report.smk
@@ -18,7 +18,8 @@ rule window:
step = config["WINDOW"]["STEP"]
input:
vcf = OUTDIR/f"{OUTPUT_NAME}.masked.filtered.tsv",
gb = OUTDIR/"reference.gb"
gb = OUTDIR/"reference.gb",
features = Path(config["FEATURES_JSON"]).resolve()
output:
window_df = temp(OUTDIR/f"{OUTPUT_NAME}.window.csv"),
log:
3 changes: 2 additions & 1 deletion workflow/rules/vaf.smk
@@ -58,7 +58,8 @@ rule annotation:
shadow: "shallow"
input:
gb = OUTDIR/"reference.gb",
ref = OUTDIR/"reference.fasta"
ref = OUTDIR/"reference.fasta",
features = Path(config["FEATURES_JSON"]).resolve()
output:
df = temp(OUTDIR/"annotation.csv")
log: