From 216b0eabf00cd67977528f73bbf707f6e89219f7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Miguel=20=C3=81lvarez=20Herrera?=
 <m.alvarez.herrera@csic.es>
Date: Wed, 11 Oct 2023 11:10:55 +0200
Subject: [PATCH 01/11] Read feature key from file

---
 config/config.yaml                        |  2 +
 config/sarscov2_features.json             | 14 +++++
 workflow/rules/evolution.smk              |  3 +-
 workflow/rules/report.smk                 |  3 +-
 workflow/rules/vaf.smk                    |  3 +-
 workflow/scripts/N_S_sites_from_fasta.py  | 76 +++++++----------------
 workflow/scripts/report/get_annotation.py | 33 ++++------
 workflow/scripts/window.py                | 44 ++++---------
 8 files changed, 70 insertions(+), 108 deletions(-)
 create mode 100644 config/sarscov2_features.json

diff --git a/config/config.yaml b/config/config.yaml
index 0265b52..30cba33 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -4,6 +4,8 @@ ALIGNMENT_REFERENCE:
   "NC_045512.2"
 PROBLEMATIC_VCF_URL:
   "https://raw.githubusercontent.com/W-L/ProblematicSites_SARS-CoV2/da322c32004f7b16bdaa6a8ee7fd24d56e79d9dc/problematic_sites_sarsCov2.vcf"
+FEATURES_JSON:
+  "config/sarscov2_features.json"
 TREE_MODEL:
   "GTR+F+I+G4"
 UFBOOT_REPS:
diff --git a/config/sarscov2_features.json b/config/sarscov2_features.json
new file mode 100644
index 0000000..2031f32
--- /dev/null
+++ b/config/sarscov2_features.json
@@ -0,0 +1,14 @@
+{
+    "ORF1ab polyprotein": "orf1ab",
+    "ORF1a polyprotein": "orf1ab",
+    "surface glycoprotein": "S",
+    "ORF3a protein": "ORF3a",
+    "envelope protein": "E",
+    "membrane glycoprotein": "M",
+    "ORF6 protein": "ORF6",
+    "ORF7a protein": "ORF7",
+    "ORF7b": "ORF7",
+    "ORF8 protein": "ORF8",
+    "nucleocapsid phosphoprotein": "N",
+    "ORF10 protein": "ORF10"
+}
diff --git a/workflow/rules/evolution.smk b/workflow/rules/evolution.smk
index e0464ef..a4b0d49 100644
--- a/workflow/rules/evolution.smk
+++ b/workflow/rules/evolution.smk
@@ -5,7 +5,8 @@ rule N_S_sites:
         genetic_code = "standard"
     input:
         fasta = OUTDIR/f"{OUTPUT_NAME}.ancestor.fasta",
-        gb = OUTDIR/"reference.gb"
+        gb = OUTDIR/"reference.gb",
+        features = config["FEATURES_JSON"]
     output:
         csv = temp(OUTDIR/f"{OUTPUT_NAME}.ancestor.N_S.sites.csv")
     log:
diff --git a/workflow/rules/report.smk b/workflow/rules/report.smk
index 59363d4..f138c31 100644
--- a/workflow/rules/report.smk
+++ b/workflow/rules/report.smk
@@ -18,7 +18,8 @@ rule window:
         step = 1
     input:
         vcf = OUTDIR/f"{OUTPUT_NAME}.masked.filtered.tsv",
-        gb = OUTDIR/"reference.gb"
+        gb = OUTDIR/"reference.gb",
+        features = config["FEATURES_JSON"]
     output:
         window_df = temp(OUTDIR/f"{OUTPUT_NAME}.window.csv"),
     log:
diff --git a/workflow/rules/vaf.smk b/workflow/rules/vaf.smk
index fdbe516..821ae87 100644
--- a/workflow/rules/vaf.smk
+++ b/workflow/rules/vaf.smk
@@ -58,7 +58,8 @@ rule annotation:
     shadow: "shallow"
     input:
         gb = OUTDIR/"reference.gb",
-        ref = OUTDIR/"reference.fasta"
+        ref = OUTDIR/"reference.fasta",
+        features = config["FEATURES_JSON"]
     output:
         df = temp(OUTDIR/"annotation.csv")
     log:
diff --git a/workflow/scripts/N_S_sites_from_fasta.py b/workflow/scripts/N_S_sites_from_fasta.py
index b053ca0..8327bb7 100644
--- a/workflow/scripts/N_S_sites_from_fasta.py
+++ b/workflow/scripts/N_S_sites_from_fasta.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python3
 
 import logging
+import json
 import copy
 import pandas as pd
 from gb2seq.alignment import Gb2Alignment
@@ -8,38 +9,14 @@
 from dark.fasta import FastaReads
 
 
-SCov2_features = [        # List with id names of SARS-CoV-2 features
-     "ORF1ab polyprotein",
-    "surface glycoprotein",
-    "ORF3a protein",
-    "envelope protein",
-    "membrane glycoprotein",
-    "ORF6 protein",
-    "ORF7a protein",
-    "ORF7b",
-    "ORF8 protein",
-    "nucleocapsid phosphoprotein",
-    "ORF10 protein"
-]
-
-
-# Create alignment object
-features = Features(snakemake.input.gb)
-seq = list(FastaReads(snakemake.input.fasta))[0]
-aln = Gb2Alignment(seq,features)
-
-
-def split_into_codons(seq:str) -> list:
-    """Split the complete CDS feature in to a list of codons."""
-
-    codons = [ seq[i:i + 3] for i in range(0, len(seq), 3) if 'N' not in seq[i:i + 3] ]
-    
+def split_into_codons(seq: str) -> list:
+    """Split the complete CDS feature in to a list of codons"""
+    codons = [seq[i:i + 3] for i in range(0, len(seq), 3) if 'N' not in seq[i:i + 3]]
     return codons
 
-def geneticCode(name:str) -> dict:
-
-    """ Dictionary that maps codons to amino acids """ 
 
+def geneticCode(name: str) -> dict:
+    """ Dictionary that maps codons to amino acids""" 
     if name == 'standard':
         gc = {  'AAA':'K', 'AAC':'N', 'AAG':'K', 'AAT':'N', 'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T', 'AGA':'R', 'AGC':'S', 'AGG':'R', \
                 'AGT':'S','ATA':'I','ATC':'I','ATG':'M','ATT':'I','CAA':'Q','CAC':'H','CAG':'Q','CAT':'H','CCA':'P','CCC':'P','CCG':'P', \
@@ -47,11 +24,10 @@ def geneticCode(name:str) -> dict:
                 'GAT':'D','GCA':'A','GCC':'A','GCG':'A','GCT':'A','GGA':'G','GGC':'G','GGG':'G','GGT':'G','GTA':'V','GTC':'V','GTG':'V', \
                 'GTT':'V','TAA':'*','TAC':'Y','TAG':'*','TAT':'Y','TCA':'S','TCC':'S','TCG':'S','TCT':'S','TGA':'*','TGC':'C','TGG':'W', \
                 'TGT':'C','TTA':'L','TTC':'F','TTG':'L','TTT':'F'  }
-        
     return gc
 
-def potential_changes_dict(nt_to_aa:dict) -> dict:
 
+def potential_changes_dict(nt_to_aa: dict) -> dict:
     """ Generate a dictionary, with S and N pre-calculated for all 
     possible codons (key:codon, value: (S,N).
     ARGS:
@@ -63,7 +39,6 @@ def potential_changes_dict(nt_to_aa:dict) -> dict:
         Sources of formulae:
         http://www.megasoftware.net/mega4/WebHelp/part_iv___evolutionary_analysis/computing_evolutionary_distances/distance_models/synonymouse_and_nonsynonymous_substitution_models/hc_nei_gojobori_method.htm
     """
-
     potential_changes = {   'S': {  'AAA':0.0,'AAC':0.0,'AAG':0.0,'AAT':0.0, 'ACA':0.0, 'ACC':0.0, 'ACG':0.0, 'ACT':0.0, 'AGA':0.0, 'AGC':0.0, \
                                     'AGG':0.0, 'AGT':0.0, 'ATA':0.0, 'ATC':0.0, 'ATG':0.0, 'ATT':0.0, 'CAA':0.0, 'CAC':0.0, 'CAG':0.0, 'CAT':0.0, \
                                     'CCA':0.0,'CCC':0.0,'CCG':0.0,'CCT':0.0,'CGA':0.0,'CGC':0.0,'CGG':0.0,'CGT':0.0,'CTA':0.0,'CTC':0.0,'CTG':0.0, \
@@ -76,26 +51,18 @@ def potential_changes_dict(nt_to_aa:dict) -> dict:
                                     'CCT':0.0,'CGA':0.0,'CGC':0.0,'CGG':0.0,'CGT':0.0,'CTA':0.0,'CTC':0.0,'CTG':0.0,'CTT':0.0,'GAA':0.0,'GAC':0.0,'GAG':0.0, \
                                     'GAT':0.0,'GCA':0.0,'GCC':0.0,'GCG':0.0,'GCT':0.0,'GGA':0.0,'GGC':0.0,'GGG':0.0,'GGT':0.0,'GTA':0.0,'GTC':0.0,'GTG':0.0, \
                                     'GTT':0.0,'TAA':0.0,'TAC':0.0,'TAG':0.0,'TAT':0.0,'TCA':0.0,'TCC':0.0,'TCG':0.0,'TCT':0.0,'TGA':0.0,'TGC':0.0,'TGG':0.0, \
-                                    'TGT':0.0,'TTA':0.0,'TTC':0.0,'TTG':0.0,'TTT':0.0}}   
-
-
+                                    'TGT':0.0,'TTA':0.0,'TTC':0.0,'TTG':0.0,'TTT':0.0}}
     # Mutate (substitutions) all possible codons in the given genetic code, and count proportions of mutations that are synonymous and non-synonmyous
     for codon in nt_to_aa.keys():
-
         # for each bp position in the codon...
         for codon_p in range(0,2+1):
-
             nts = ['A','G','T','C']  
             nts.remove(codon[codon_p]) # we do not consider self substitutions, e.g. A->A
-
             # ...and for each nucleotide that the bp can change 
-            
             for nt in nts:
-
                 codon_mutated = list(copy.deepcopy(codon))
                 codon_mutated[codon_p] = nt  # mutate the basepair
                 codon_mutated = ''.join(codon_mutated)
-                
                 # ...count how many of them are synonymous.
                 if nt_to_aa[codon]==nt_to_aa[codon_mutated]:
                     potential_changes['S'][codon]+=1.0/3.0 
@@ -104,21 +71,16 @@ def potential_changes_dict(nt_to_aa:dict) -> dict:
     return potential_changes
 
 
-def get_feature_codons(alignment:Gb2Alignment, annotation:list) -> dict:
-
-    dct = { key:alignment.ntSequences(key)[1].sequence for key in annotation }
-
-    return { key:split_into_codons(item) for key,item in dct.items() }
-
+def get_feature_codons(alignment: Gb2Alignment, annotation: list) -> dict:
+    dct = {key:alignment.ntSequences(key)[1].sequence for key in annotation}
+    return {key:split_into_codons(item) for key,item in dct.items()}
 
-def get_df(codons:dict) -> pd.DataFrame:
 
+def get_df(codons: dict) -> pd.DataFrame:
     keys = []
     N_sites = []
     S_sites = []
-
     values =  potential_changes_dict(geneticCode(snakemake.params.genetic_code))
-
     for key,item in codons.items():
 
         keys.append(key)
@@ -131,12 +93,22 @@ def get_df(codons:dict) -> pd.DataFrame:
     
     return pd.DataFrame({ 'gene':keys, 'N':N_sites, 'S':S_sites })
 
+
 def main():
 
     logging.basicConfig(filename=snakemake.log[0], format=snakemake.config["LOG_PY_FMT"], level=logging.INFO)
     
+    logging.info("Reading features")
+    with open(snakemake.input.features) as f:
+        feature_list = list(json.load(f).keys())
+    
+    logging.info("Create alignment object")
+    features = Features(snakemake.input.gb)
+    seq = list(FastaReads(snakemake.input.fasta))[0]
+    aln = Gb2Alignment(seq, features)
+
     logging.info("Splitting ancestral sequence into codons")
-    codons_dict =  get_feature_codons(aln,SCov2_features)
+    codons_dict =  get_feature_codons(aln, feature_list)
 
     logging.info("Calculating synonymous and non synonymous sites")
     df = get_df(codons_dict)
@@ -146,4 +118,4 @@ def main():
 
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
diff --git a/workflow/scripts/report/get_annotation.py b/workflow/scripts/report/get_annotation.py
index 8d7140c..c6eb974 100644
--- a/workflow/scripts/report/get_annotation.py
+++ b/workflow/scripts/report/get_annotation.py
@@ -1,47 +1,36 @@
 #!/usr/bin/env python3
 
+import json
 import logging
 import pandas as pd
 from gb2seq.features import Features
 from Bio import SeqIO
 
-replace_terry = {
-    "ORF1ab polyprotein": "orf1ab",
-    "ORF1a polyprotein": "orf1ab",
-    "surface glycoprotein": "S",
-    "ORF3a protein": "ORF3a",
-    "envelope protein": "E",
-    "membrane glycoprotein": "M",
-    "ORF6 protein": "ORF6",
-    "ORF7a protein": "ORF7",
-    "ORF7b": "ORF7",
-    "ORF8 protein": "ORF8",
-    "nucleocapsid phosphoprotein": "N",
-    "ORF10 protein": "ORF10"
-}
 
 def main():
     logging.basicConfig(filename=snakemake.log[0], format=snakemake.config["LOG_PY_FMT"], level=logging.INFO)
+    
+    logging.info("Reading features")
     ft = Features(snakemake.input.gb)
+    with open(snakemake.input.features) as f:
+        feature_key = list(json.load(f))
 
+    logging.info("Reading reference")
     reference = str(next(SeqIO.parse(snakemake.input.ref, "fasta")).seq)
-
     positions = [x for x in range(len(reference))]
 
+    logging.info("Building annotation")
     genes = []
     for pos in positions:
-
         if len(ft.getFeatureNames(pos)) == 0:
             genes.append("Intergenic")
         else:
-            genes.append(replace_terry[list(ft.getFeatureNames(pos))[0]])
+            genes.append(feature_key[list(ft.getFeatureNames(pos))[0]])
 
-    df = pd.DataFrame({"POS":positions, "GEN": genes})
-
-    df.to_csv(snakemake.output.df,index= False)
+    logging.info("Writing table")
+    df = pd.DataFrame({"POS": positions, "GEN": genes})
+    df.to_csv(snakemake.output.df, index= False)
 
 
 if __name__ == "__main__":
     main()
-
-    
\ No newline at end of file
diff --git a/workflow/scripts/window.py b/workflow/scripts/window.py
index 6e980d0..916d93c 100644
--- a/workflow/scripts/window.py
+++ b/workflow/scripts/window.py
@@ -1,66 +1,48 @@
 #!/usr/bin/env python3
 
-
 import logging
+import json
 import pandas as pd
 from gb2seq.features import Features
 
 
-replace_terry = {
-    "ORF1ab polyprotein": "orf1ab",
-    "ORF1a polyprotein": "orf1ab",
-    "surface glycoprotein": "S",
-    "ORF3a protein": "ORF3a",
-    "envelope protein": "E",
-    "membrane glycoprotein": "M",
-    "ORF6 protein": "ORF6",
-    "ORF7a protein": "ORF7",
-    "ORF7b": "ORF7",
-    "ORF8 protein": "ORF8",
-    "nucleocapsid phosphoprotein": "N",
-    "ORF10 protein": "ORF10"
-}
-
-
-def get_polimorphic_sites(df:pd.DataFrame) -> set:
-
+def get_polimorphic_sites(df: pd.DataFrame) -> set:
     return set(df.POS)
 
 
-def window_calculation(sites:set, window:int, step:int, genome_size:int, coord:str) -> pd.DataFrame:
-
+def window_calculation(sites: set, window: int, step: int, genome_size: int, coord: str) -> pd.DataFrame:
     ft = Features(coord) # Create Features object to obtain annotations
-    
     positions = []
     pct = []
     genes = []
     lim_sup = genome_size + 1
-
     for position in range(1, lim_sup, step):
-        
         if len(ft.getFeatureNames(position)) == 0:
             genes.append("Intergenic")
         else:
             genes.append(list(ft.getFeatureNames(position))[0])
-    
         # Add percent (excluding initial and final positions)
         if position - window not in range(1, lim_sup):
             pct.append(0)
         else:
             # Calculate nº of polimorphisms in the window
-            num_snp = len([ x for x in sites if x in range(position - window, position + 1) ]) # Calculate nº of polimorphisms in the window
+            num_snp = len([x for x in sites if x in range(position - window, position + 1)])
             pct.append(num_snp / window)
-        
         positions.append(position)
-
-    return pd.DataFrame({ "position": positions, "fractions": pct, "gen": genes })
+    return pd.DataFrame({"position": positions, "fractions": pct, "gen": genes})
 
 
 def main():
+
     logging.basicConfig(filename=snakemake.log[0], format=snakemake.config["LOG_PY_FMT"], level=logging.INFO)
 
+    logging.info("Reading input VCF")
     df = pd.read_table(snakemake.input.vcf)
 
+    logging.info("Reading features")
+    with open(snakemake.input.features) as f:
+        features_key = json.load(f)
+
     logging.info("Getting polimorphic sites")
     sites = get_polimorphic_sites(df)
 
@@ -69,8 +51,8 @@ def main():
     frame = window_calculation(sites, snakemake.params.window, snakemake.params.step, 29903, snakemake.input.gb)
     
     logging.info("Saving results")
-    frame.replace(replace_terry, inplace = True)
-    frame.to_csv(snakemake.output.window_df,index= False)
+    frame.replace(features_key, inplace = True)
+    frame.to_csv(snakemake.output.window_df, index= False)
 
 
 if __name__ == "__main__":

From 23dbdcda00eb3de4a1d406a15ff7c93bb82dc936 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Miguel=20=C3=81lvarez=20Herrera?=
 <m.alvarez.herrera@csic.es>
Date: Wed, 11 Oct 2023 11:25:43 +0200
Subject: [PATCH 02/11] Read genetic code from file

---
 config/config.yaml                       |  2 +
 config/standard_genetic_code.json        | 66 +++++++++++++++++
 workflow/rules/evolution.smk             |  5 +-
 workflow/scripts/N_S_sites_from_fasta.py | 92 +++++++++---------------
 4 files changed, 105 insertions(+), 60 deletions(-)
 create mode 100644 config/standard_genetic_code.json

diff --git a/config/config.yaml b/config/config.yaml
index 30cba33..3860db2 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -6,6 +6,8 @@ PROBLEMATIC_VCF_URL:
   "https://raw.githubusercontent.com/W-L/ProblematicSites_SARS-CoV2/da322c32004f7b16bdaa6a8ee7fd24d56e79d9dc/problematic_sites_sarsCov2.vcf"
 FEATURES_JSON:
   "config/sarscov2_features.json"
+GENETIC_CODE_JSON:
+  "config/standard_genetic_code.json"
 TREE_MODEL:
   "GTR+F+I+G4"
 UFBOOT_REPS:
diff --git a/config/standard_genetic_code.json b/config/standard_genetic_code.json
new file mode 100644
index 0000000..7863317
--- /dev/null
+++ b/config/standard_genetic_code.json
@@ -0,0 +1,66 @@
+{
+    "AAA": "K",
+    "AAC": "N",
+    "AAG": "K",
+    "AAT": "N",
+    "ACA": "T",
+    "ACC": "T",
+    "ACG": "T",
+    "ACT": "T",
+    "AGA": "R",
+    "AGC": "S",
+    "AGG": "R",
+    "AGT": "S",
+    "ATA": "I",
+    "ATC": "I",
+    "ATG": "M",
+    "ATT": "I",
+    "CAA": "Q",
+    "CAC": "H",
+    "CAG": "Q",
+    "CAT": "H",
+    "CCA": "P",
+    "CCC": "P",
+    "CCG": "P",
+    "CCT": "P",
+    "CGA": "R",
+    "CGC": "R",
+    "CGG": "R",
+    "CGT": "R",
+    "CTA": "L",
+    "CTC": "L",
+    "CTG": "L",
+    "CTT": "L",
+    "GAA": "E",
+    "GAC": "D",
+    "GAG": "E",
+    "GAT": "D",
+    "GCA": "A",
+    "GCC": "A",
+    "GCG": "A",
+    "GCT": "A",
+    "GGA": "G",
+    "GGC": "G",
+    "GGG": "G",
+    "GGT": "G",
+    "GTA": "V",
+    "GTC": "V",
+    "GTG": "V",
+    "GTT": "V",
+    "TAA": "*",
+    "TAC": "Y",
+    "TAG": "*",
+    "TAT": "Y",
+    "TCA": "S",
+    "TCC": "S",
+    "TCG": "S",
+    "TCT": "S",
+    "TGA": "*",
+    "TGC": "C",
+    "TGG": "W",
+    "TGT": "C",
+    "TTA": "L",
+    "TTC": "F",
+    "TTG": "L",
+    "TTT": "F"
+}
diff --git a/workflow/rules/evolution.smk b/workflow/rules/evolution.smk
index a4b0d49..e8ae219 100644
--- a/workflow/rules/evolution.smk
+++ b/workflow/rules/evolution.smk
@@ -1,12 +1,11 @@
 rule N_S_sites:
     threads: 1
     conda: "../envs/biopython.yaml"
-    params:
-        genetic_code = "standard"
     input:
         fasta = OUTDIR/f"{OUTPUT_NAME}.ancestor.fasta",
         gb = OUTDIR/"reference.gb",
-        features = config["FEATURES_JSON"]
+        features = config["FEATURES_JSON"],
+        genetic_code = config["GENETIC_CODE_JSON"]
     output:
         csv = temp(OUTDIR/f"{OUTPUT_NAME}.ancestor.N_S.sites.csv")
     log:
diff --git a/workflow/scripts/N_S_sites_from_fasta.py b/workflow/scripts/N_S_sites_from_fasta.py
index 8327bb7..91e430d 100644
--- a/workflow/scripts/N_S_sites_from_fasta.py
+++ b/workflow/scripts/N_S_sites_from_fasta.py
@@ -11,63 +11,40 @@
 
 def split_into_codons(seq: str) -> list:
     """Split the complete CDS feature in to a list of codons"""
-    codons = [seq[i:i + 3] for i in range(0, len(seq), 3) if 'N' not in seq[i:i + 3]]
+    codons = [seq[i:i + 3] for i in range(0, len(seq), 3) if "N" not in seq[i:i + 3]]
     return codons
 
 
-def geneticCode(name: str) -> dict:
-    """ Dictionary that maps codons to amino acids""" 
-    if name == 'standard':
-        gc = {  'AAA':'K', 'AAC':'N', 'AAG':'K', 'AAT':'N', 'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T', 'AGA':'R', 'AGC':'S', 'AGG':'R', \
-                'AGT':'S','ATA':'I','ATC':'I','ATG':'M','ATT':'I','CAA':'Q','CAC':'H','CAG':'Q','CAT':'H','CCA':'P','CCC':'P','CCG':'P', \
-                'CCT':'P','CGA':'R','CGC':'R','CGG':'R','CGT':'R','CTA':'L','CTC':'L','CTG':'L','CTT':'L','GAA':'E','GAC':'D','GAG':'E', \
-                'GAT':'D','GCA':'A','GCC':'A','GCG':'A','GCT':'A','GGA':'G','GGC':'G','GGG':'G','GGT':'G','GTA':'V','GTC':'V','GTG':'V', \
-                'GTT':'V','TAA':'*','TAC':'Y','TAG':'*','TAT':'Y','TCA':'S','TCC':'S','TCG':'S','TCT':'S','TGA':'*','TGC':'C','TGG':'W', \
-                'TGT':'C','TTA':'L','TTC':'F','TTG':'L','TTT':'F'  }
-    return gc
-
-
-def potential_changes_dict(nt_to_aa: dict) -> dict:
-    """ Generate a dictionary, with S and N pre-calculated for all 
-    possible codons (key:codon, value: (S,N).
-    ARGS:
-        nt_to_aa, a dict mapping codons (keys), e.g. 'TTA', to 
-            amino-acid letter (values), e.g. 'L'
-            
-            e.g. geneticCode("standard")
-    Notes:
-        Sources of formulae:
-        http://www.megasoftware.net/mega4/WebHelp/part_iv___evolutionary_analysis/computing_evolutionary_distances/distance_models/synonymouse_and_nonsynonymous_substitution_models/hc_nei_gojobori_method.htm
-    """
-    potential_changes = {   'S': {  'AAA':0.0,'AAC':0.0,'AAG':0.0,'AAT':0.0, 'ACA':0.0, 'ACC':0.0, 'ACG':0.0, 'ACT':0.0, 'AGA':0.0, 'AGC':0.0, \
-                                    'AGG':0.0, 'AGT':0.0, 'ATA':0.0, 'ATC':0.0, 'ATG':0.0, 'ATT':0.0, 'CAA':0.0, 'CAC':0.0, 'CAG':0.0, 'CAT':0.0, \
-                                    'CCA':0.0,'CCC':0.0,'CCG':0.0,'CCT':0.0,'CGA':0.0,'CGC':0.0,'CGG':0.0,'CGT':0.0,'CTA':0.0,'CTC':0.0,'CTG':0.0, \
-                                    'CTT':0.0,'GAA':0.0,'GAC':0.0,'GAG':0.0,'GAT':0.0,'GCA':0.0,'GCC':0.0,'GCG':0.0,'GCT':0.0,'GGA':0.0,'GGC':0.0, \
-                                    'GGG':0.0,'GGT':0.0,'GTA':0.0,'GTC':0.0,'GTG':0.0,'GTT':0.0,'TAA':0.0,'TAC':0.0,'TAG':0.0,'TAT':0.0,'TCA':0.0, \
-                                    'TCC':0.0,'TCG':0.0,'TCT':0.0,'TGA':0.0,'TGC':0.0,'TGG':0.0,'TGT':0.0,'TTA':0.0,'TTC':0.0,'TTG':0.0,'TTT':0.0},
-
-                            'N': {  'AAA':0.0, 'AAC':0.0, 'AAG':0.0, 'AAT':0.0, 'ACA':0.0, 'ACC':0.0, 'ACG':0.0, 'ACT':0.0, 'AGA':0.0, 'AGC':0.0, 'AGG':0.0, \
-                                    'AGT':0.0,'ATA':0.0,'ATC':0.0,'ATG':0.0,'ATT':0.0,'CAA':0.0,'CAC':0.0,'CAG':0.0,'CAT':0.0,'CCA':0.0,'CCC':0.0,'CCG':0.0, \
-                                    'CCT':0.0,'CGA':0.0,'CGC':0.0,'CGG':0.0,'CGT':0.0,'CTA':0.0,'CTC':0.0,'CTG':0.0,'CTT':0.0,'GAA':0.0,'GAC':0.0,'GAG':0.0, \
-                                    'GAT':0.0,'GCA':0.0,'GCC':0.0,'GCG':0.0,'GCT':0.0,'GGA':0.0,'GGC':0.0,'GGG':0.0,'GGT':0.0,'GTA':0.0,'GTC':0.0,'GTG':0.0, \
-                                    'GTT':0.0,'TAA':0.0,'TAC':0.0,'TAG':0.0,'TAT':0.0,'TCA':0.0,'TCC':0.0,'TCG':0.0,'TCT':0.0,'TGA':0.0,'TGC':0.0,'TGG':0.0, \
-                                    'TGT':0.0,'TTA':0.0,'TTC':0.0,'TTG':0.0,'TTT':0.0}}
+def potential_changes_dict(genetic_code: dict) -> dict:
+    """Generate a dictionary with S and N pre-calculated for all possible codons (key:codon, value: (S,N)"""
+    potential_changes = {   "S": {  "AAA":0.0,"AAC":0.0,"AAG":0.0,"AAT":0.0,"ACA":0.0,"ACC":0.0,"ACG":0.0,"ACT":0.0,"AGA":0.0,"AGC":0.0, \
+                                    "AGG":0.0,"AGT":0.0,"ATA":0.0,"ATC":0.0,"ATG":0.0,"ATT":0.0,"CAA":0.0,"CAC":0.0,"CAG":0.0,"CAT":0.0, \
+                                    "CCA":0.0,"CCC":0.0,"CCG":0.0,"CCT":0.0,"CGA":0.0,"CGC":0.0,"CGG":0.0,"CGT":0.0,"CTA":0.0,"CTC":0.0,"CTG":0.0, \
+                                    "CTT":0.0,"GAA":0.0,"GAC":0.0,"GAG":0.0,"GAT":0.0,"GCA":0.0,"GCC":0.0,"GCG":0.0,"GCT":0.0,"GGA":0.0,"GGC":0.0, \
+                                    "GGG":0.0,"GGT":0.0,"GTA":0.0,"GTC":0.0,"GTG":0.0,"GTT":0.0,"TAA":0.0,"TAC":0.0,"TAG":0.0,"TAT":0.0,"TCA":0.0, \
+                                    "TCC":0.0,"TCG":0.0,"TCT":0.0,"TGA":0.0,"TGC":0.0,"TGG":0.0,"TGT":0.0,"TTA":0.0,"TTC":0.0,"TTG":0.0,"TTT":0.0},
+                            "N": {  "AAA":0.0,"AAC":0.0,"AAG":0.0,"AAT":0.0,"ACA":0.0,"ACC":0.0,"ACG":0.0,"ACT":0.0,"AGA":0.0,"AGC":0.0,"AGG":0.0, \
+                                    "AGT":0.0,"ATA":0.0,"ATC":0.0,"ATG":0.0,"ATT":0.0,"CAA":0.0,"CAC":0.0,"CAG":0.0,"CAT":0.0,"CCA":0.0,"CCC":0.0,"CCG":0.0, \
+                                    "CCT":0.0,"CGA":0.0,"CGC":0.0,"CGG":0.0,"CGT":0.0,"CTA":0.0,"CTC":0.0,"CTG":0.0,"CTT":0.0,"GAA":0.0,"GAC":0.0,"GAG":0.0, \
+                                    "GAT":0.0,"GCA":0.0,"GCC":0.0,"GCG":0.0,"GCT":0.0,"GGA":0.0,"GGC":0.0,"GGG":0.0,"GGT":0.0,"GTA":0.0,"GTC":0.0,"GTG":0.0, \
+                                    "GTT":0.0,"TAA":0.0,"TAC":0.0,"TAG":0.0,"TAT":0.0,"TCA":0.0,"TCC":0.0,"TCG":0.0,"TCT":0.0,"TGA":0.0,"TGC":0.0,"TGG":0.0, \
+                                    "TGT":0.0,"TTA":0.0,"TTC":0.0,"TTG":0.0,"TTT":0.0}}
     # Mutate (substitutions) all possible codons in the given genetic code, and count proportions of mutations that are synonymous and non-synonmyous
-    for codon in nt_to_aa.keys():
-        # for each bp position in the codon...
-        for codon_p in range(0,2+1):
-            nts = ['A','G','T','C']  
-            nts.remove(codon[codon_p]) # we do not consider self substitutions, e.g. A->A
+    for codon in genetic_code.keys():
+        for codon_p in range(0, 3):
+            nts = ["A", "G", "T", "C"]
+            # Do not consider self substitutions, e.g. A->A
+            nts.remove(codon[codon_p]) 
             # ...and for each nucleotide that the bp can change 
             for nt in nts:
                 codon_mutated = list(copy.deepcopy(codon))
                 codon_mutated[codon_p] = nt  # mutate the basepair
-                codon_mutated = ''.join(codon_mutated)
+                codon_mutated = "".join(codon_mutated)
                 # ...count how many of them are synonymous.
-                if nt_to_aa[codon]==nt_to_aa[codon_mutated]:
-                    potential_changes['S'][codon]+=1.0/3.0 
+                if genetic_code[codon] == genetic_code[codon_mutated]:
+                    potential_changes["S"][codon]+=1.0/3.0 
                 else:
-                    potential_changes['N'][codon]+=1.0/3.0
+                    potential_changes["N"][codon]+=1.0/3.0
     return potential_changes
 
 
@@ -76,22 +53,19 @@ def get_feature_codons(alignment: Gb2Alignment, annotation: list) -> dict:
     return {key:split_into_codons(item) for key,item in dct.items()}
 
 
-def get_df(codons: dict) -> pd.DataFrame:
+def get_df(codons: dict, genetic_code: dict) -> pd.DataFrame:
     keys = []
     N_sites = []
     S_sites = []
-    values =  potential_changes_dict(geneticCode(snakemake.params.genetic_code))
+    values = potential_changes_dict(genetic_code)
     for key,item in codons.items():
-
         keys.append(key)
-
-        N = sum([values['N'][x] for x in item if x in values['N'].keys()])
-        S = sum([values['S'][x] for x in item if x in values['S'].keys()])
-
+        N = sum([values["N"][x] for x in item if x in values["N"].keys()])
+        S = sum([values["S"][x] for x in item if x in values["S"].keys()])
         N_sites.append(N)
         S_sites.append(S) 
     
-    return pd.DataFrame({ 'gene':keys, 'N':N_sites, 'S':S_sites })
+    return pd.DataFrame({ "gene":keys, "N":N_sites, "S":S_sites })
 
 
 def main():
@@ -101,6 +75,10 @@ def main():
     logging.info("Reading features")
     with open(snakemake.input.features) as f:
         feature_list = list(json.load(f).keys())
+
+    logging.info("Reading genetic code")
+    with open(snakemake.input.genetic_code) as f:
+        genetic_code = json.load(f)
     
     logging.info("Create alignment object")
     features = Features(snakemake.input.gb)
@@ -108,10 +86,10 @@ def main():
     aln = Gb2Alignment(seq, features)
 
     logging.info("Splitting ancestral sequence into codons")
-    codons_dict =  get_feature_codons(aln, feature_list)
+    codons_dict = get_feature_codons(aln, feature_list)
 
     logging.info("Calculating synonymous and non synonymous sites")
-    df = get_df(codons_dict)
+    df = get_df(codons_dict, genetic_code)
 
     logging.info("Saving results")
     df.to_csv(snakemake.output.csv,index= False)

From 20fdd2ecedb79c7daa16bd0868a7698dc3ef45a6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Miguel=20=C3=81lvarez=20Herrera?=
 <m.alvarez.herrera@csic.es>
Date: Wed, 11 Oct 2023 11:41:17 +0200
Subject: [PATCH 03/11] Clean substitution key initialization

---
 workflow/scripts/N_S_sites_from_fasta.py | 49 ++++++++++--------------
 1 file changed, 21 insertions(+), 28 deletions(-)

diff --git a/workflow/scripts/N_S_sites_from_fasta.py b/workflow/scripts/N_S_sites_from_fasta.py
index 91e430d..e98ac94 100644
--- a/workflow/scripts/N_S_sites_from_fasta.py
+++ b/workflow/scripts/N_S_sites_from_fasta.py
@@ -2,7 +2,7 @@
 
 import logging
 import json
-import copy
+import itertools as it
 import pandas as pd
 from gb2seq.alignment import Gb2Alignment
 from gb2seq.features import Features
@@ -15,36 +15,30 @@ def split_into_codons(seq: str) -> list:
     return codons
 
 
-def potential_changes_dict(genetic_code: dict) -> dict:
-    """Generate a dictionary with S and N pre-calculated for all possible codons (key:codon, value: (S,N)"""
-    potential_changes = {   "S": {  "AAA":0.0,"AAC":0.0,"AAG":0.0,"AAT":0.0,"ACA":0.0,"ACC":0.0,"ACG":0.0,"ACT":0.0,"AGA":0.0,"AGC":0.0, \
-                                    "AGG":0.0,"AGT":0.0,"ATA":0.0,"ATC":0.0,"ATG":0.0,"ATT":0.0,"CAA":0.0,"CAC":0.0,"CAG":0.0,"CAT":0.0, \
-                                    "CCA":0.0,"CCC":0.0,"CCG":0.0,"CCT":0.0,"CGA":0.0,"CGC":0.0,"CGG":0.0,"CGT":0.0,"CTA":0.0,"CTC":0.0,"CTG":0.0, \
-                                    "CTT":0.0,"GAA":0.0,"GAC":0.0,"GAG":0.0,"GAT":0.0,"GCA":0.0,"GCC":0.0,"GCG":0.0,"GCT":0.0,"GGA":0.0,"GGC":0.0, \
-                                    "GGG":0.0,"GGT":0.0,"GTA":0.0,"GTC":0.0,"GTG":0.0,"GTT":0.0,"TAA":0.0,"TAC":0.0,"TAG":0.0,"TAT":0.0,"TCA":0.0, \
-                                    "TCC":0.0,"TCG":0.0,"TCT":0.0,"TGA":0.0,"TGC":0.0,"TGG":0.0,"TGT":0.0,"TTA":0.0,"TTC":0.0,"TTG":0.0,"TTT":0.0},
-                            "N": {  "AAA":0.0,"AAC":0.0,"AAG":0.0,"AAT":0.0,"ACA":0.0,"ACC":0.0,"ACG":0.0,"ACT":0.0,"AGA":0.0,"AGC":0.0,"AGG":0.0, \
-                                    "AGT":0.0,"ATA":0.0,"ATC":0.0,"ATG":0.0,"ATT":0.0,"CAA":0.0,"CAC":0.0,"CAG":0.0,"CAT":0.0,"CCA":0.0,"CCC":0.0,"CCG":0.0, \
-                                    "CCT":0.0,"CGA":0.0,"CGC":0.0,"CGG":0.0,"CGT":0.0,"CTA":0.0,"CTC":0.0,"CTG":0.0,"CTT":0.0,"GAA":0.0,"GAC":0.0,"GAG":0.0, \
-                                    "GAT":0.0,"GCA":0.0,"GCC":0.0,"GCG":0.0,"GCT":0.0,"GGA":0.0,"GGC":0.0,"GGG":0.0,"GGT":0.0,"GTA":0.0,"GTC":0.0,"GTG":0.0, \
-                                    "GTT":0.0,"TAA":0.0,"TAC":0.0,"TAG":0.0,"TAT":0.0,"TCA":0.0,"TCC":0.0,"TCG":0.0,"TCT":0.0,"TGA":0.0,"TGC":0.0,"TGG":0.0, \
-                                    "TGT":0.0,"TTA":0.0,"TTC":0.0,"TTG":0.0,"TTT":0.0}}
-    # Mutate (substitutions) all possible codons in the given genetic code, and count proportions of mutations that are synonymous and non-synonmyous
+def calculate_potential_changes(genetic_code: dict) -> dict:
+    """Generate a dictionary with S and N pre-calculated for all possible codons"""
+    # Initialize structure
+    nts = set(["A", "G", "T", "C"])
+    potential_changes = {"S": {}, "N": {}}
+    for codon in it.product(nts, repeat=3):
+        potential_changes["S"]["".join(codon)] = 0.
+        potential_changes["N"]["".join(codon)] = 0.
+    # Mutate (substitutions) all possible codons in the given genetic code
+    # and count proportions of mutations that are synonymous and non-synonmyous
     for codon in genetic_code.keys():
         for codon_p in range(0, 3):
             nts = ["A", "G", "T", "C"]
             # Do not consider self substitutions, e.g. A->A
             nts.remove(codon[codon_p]) 
-            # ...and for each nucleotide that the bp can change 
             for nt in nts:
-                codon_mutated = list(copy.deepcopy(codon))
-                codon_mutated[codon_p] = nt  # mutate the basepair
-                codon_mutated = "".join(codon_mutated)
-                # ...count how many of them are synonymous.
-                if genetic_code[codon] == genetic_code[codon_mutated]:
-                    potential_changes["S"][codon]+=1.0/3.0 
+                codon_mutated = list(codon)
+                # Mutate the basepair
+                codon_mutated[codon_p] = nt
+                # Count how many of them are synonymous
+                if genetic_code[codon] == genetic_code["".join(codon_mutated)]:
+                    potential_changes["S"][codon] += 1/3.
                 else:
-                    potential_changes["N"][codon]+=1.0/3.0
+                    potential_changes["N"][codon] += 1/3.
     return potential_changes
 
 
@@ -57,15 +51,14 @@ def get_df(codons: dict, genetic_code: dict) -> pd.DataFrame:
     keys = []
     N_sites = []
     S_sites = []
-    values = potential_changes_dict(genetic_code)
-    for key,item in codons.items():
+    values = calculate_potential_changes(genetic_code)
+    for key, item in codons.items():
         keys.append(key)
         N = sum([values["N"][x] for x in item if x in values["N"].keys()])
         S = sum([values["S"][x] for x in item if x in values["S"].keys()])
         N_sites.append(N)
         S_sites.append(S) 
-    
-    return pd.DataFrame({ "gene":keys, "N":N_sites, "S":S_sites })
+    return pd.DataFrame({"gene": keys, "N": N_sites, "S": S_sites})
 
 
 def main():

From 409c1c8a250442ac40e18f5ecf09fefa2a8e4d06 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Miguel=20=C3=81lvarez=20Herrera?=
 <m.alvarez.herrera@csic.es>
Date: Wed, 11 Oct 2023 11:55:59 +0200
Subject: [PATCH 04/11] Remove hard-coded genome length

---
 workflow/scripts/window.py | 17 +++++------------
 1 file changed, 5 insertions(+), 12 deletions(-)

diff --git a/workflow/scripts/window.py b/workflow/scripts/window.py
index 916d93c..5ba6a8c 100644
--- a/workflow/scripts/window.py
+++ b/workflow/scripts/window.py
@@ -6,16 +6,12 @@
 from gb2seq.features import Features
 
 
-def get_polimorphic_sites(df: pd.DataFrame) -> set:
-    return set(df.POS)
-
-
-def window_calculation(sites: set, window: int, step: int, genome_size: int, coord: str) -> pd.DataFrame:
+def window_calculation(sites: set, window: int, step: int, coord: str) -> pd.DataFrame:
     ft = Features(coord) # Create Features object to obtain annotations
     positions = []
     pct = []
     genes = []
-    lim_sup = genome_size + 1
+    lim_sup = len(ft.reference.sequence) + 1
     for position in range(1, lim_sup, step):
         if len(ft.getFeatureNames(position)) == 0:
             genes.append("Intergenic")
@@ -25,7 +21,7 @@ def window_calculation(sites: set, window: int, step: int, genome_size: int, coo
         if position - window not in range(1, lim_sup):
             pct.append(0)
         else:
-            # Calculate nº of polimorphisms in the window
+            # Calculate no. of polimorphisms in the window
             num_snp = len([x for x in sites if x in range(position - window, position + 1)])
             pct.append(num_snp / window)
         positions.append(position)
@@ -38,17 +34,14 @@ def main():
 
     logging.info("Reading input VCF")
     df = pd.read_table(snakemake.input.vcf)
+    sites = set(df.POS)
 
     logging.info("Reading features")
     with open(snakemake.input.features) as f:
         features_key = json.load(f)
 
-    logging.info("Getting polimorphic sites")
-    sites = get_polimorphic_sites(df)
-
     logging.info("Sliding window calculation of proportion of polimorphic sites")
-
-    frame = window_calculation(sites, snakemake.params.window, snakemake.params.step, 29903, snakemake.input.gb)
+    frame = window_calculation(sites, snakemake.params.window, snakemake.params.step, snakemake.input.gb)
     
     logging.info("Saving results")
     frame.replace(features_key, inplace = True)

From d84c0b02426521b1ce49351dd6ccf18573d76a7d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Miguel=20=C3=81lvarez=20Herrera?=
 <m.alvarez.herrera@csic.es>
Date: Wed, 11 Oct 2023 11:58:34 +0200
Subject: [PATCH 05/11] Fix genbank rule naming

---
 workflow/rules/fetch.smk | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/workflow/rules/fetch.smk b/workflow/rules/fetch.smk
index 6a1d46d..d3f0759 100644
--- a/workflow/rules/fetch.smk
+++ b/workflow/rules/fetch.smk
@@ -11,7 +11,7 @@ rule fetch_alignment_reference:
         "esearch -db nucleotide -query {params.ref} | efetch -format fasta > {output.fasta} 2> {log}"
 
 
-rule fetch_alignment_gb:
+rule fetch_reference_gb:
     threads: 1
     conda: "../envs/fetch.yaml"
     params:
@@ -19,7 +19,7 @@ rule fetch_alignment_gb:
     output:
         fasta = temp(OUTDIR/"reference.gb")
     log:
-        LOGDIR / "fetch_alignment_reference" / "log.txt"
+        LOGDIR / "fetch_reference_gb" / "log.txt"
     shell:
         "esearch -db nucleotide -query {params.ref} | efetch -format gb > {output.fasta} 2> {log}"
 

From 5eb43b00bb51bd1084e497e41e88dfe6a79690e1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Miguel=20=C3=81lvarez=20Herrera?=
 <m.alvarez.herrera@csic.es>
Date: Wed, 11 Oct 2023 12:33:07 +0200
Subject: [PATCH 06/11] Remove methodology

---
 README.md | 20 --------------------
 1 file changed, 20 deletions(-)

diff --git a/README.md b/README.md
index 2d3ba3d..01d7a94 100644
--- a/README.md
+++ b/README.md
@@ -25,26 +25,6 @@ snakemake --use-conda -c4
 The pipeline also allows for its [execution in an HPC environment](config/README.md#run-modes).
 Please check [config/README.md](config/README.md) for in-detail setup instructions.
 
-## Methodology
-
-### Pairwise distances between samples
-
-In order to describe in a better way the relationship between samples, distances beween them are calculated tacking into account allele frequencies in sequencing data. Our aproach to compute the distance between sets of allele frequencies is based on FST formula and define the distance between two samples as:
-
-```math
-d(M,N)=\sum\limits_{i=1}^I \frac {\sum\limits_{j=1}^J (M_{ij} -N_{ij})^2 } {4 - \sum\limits_{j=1}^J (M_{ij} +N_{ij})^2 }
-```
-
-where:
-
-$M$ y $N$: Two sequences.
-
-$i = 1... I :$ Index over polymorphic sites.
-
-$j = 1... J :$ Index over alleles.
-
-$M_{ij}$ : Frequency of allel $j$ in position $i$ for sequence $M$.
-
 ## Contributors
 
 [![Contributors figure](https://contrib.rocks/image?repo=PathoGenOmics-Lab/VIPERA)](https://github.com/PathoGenOmics-Lab/VIPERA/graphs/contributors)

From 8fb31faf6243ae1fe166289f8dd8ed1ba0dbc235 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Miguel=20=C3=81lvarez=20Herrera?=
 <m.alvarez.herrera@csic.es>
Date: Wed, 11 Oct 2023 12:41:05 +0200
Subject: [PATCH 07/11] Add script to build targets

---
 build_targets.py | 81 ++++++++++++++++++++++++++++++++++++++++++++++++
 config/README.md |  8 ++++-
 2 files changed, 88 insertions(+), 1 deletion(-)
 create mode 100644 build_targets.py

diff --git a/build_targets.py b/build_targets.py
new file mode 100644
index 0000000..9c705fd
--- /dev/null
+++ b/build_targets.py
@@ -0,0 +1,81 @@
+#!/usr/bin/env python
+
+import sys
+import argparse
+from pathlib import Path
+from typing import List
+
+import yaml
+
+
+def find_file_with_extension(directory: Path, prefix: str, extensions: List[str]) -> str:
+    candidate_files = []
+    for path in directory.rglob(f"*"):
+        if any(path.name.endswith(ext) for ext in extensions) and path.name.startswith(prefix):
+            candidate_files.append(path)
+    if len(candidate_files) == 1:
+        return candidate_files[0].as_posix()
+    else:
+        sys.exit(f"ERROR: {len(candidate_files)} candidates found in '{directory}' for prefix '{prefix}' with extensions {extensions}: {candidate_files}")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "-i", "--sample-directory",
+        help="Directory containing sample sequencing data",
+        required=True,
+        type=Path
+    )
+    parser.add_argument(
+        "-s", "--sample-names",
+        nargs="+",
+        help="Sample names to look for in the sample directory",
+        required=True
+    )
+    parser.add_argument(
+        "-b", "--bam-extensions",
+        nargs="+",
+        help="File extensions for BAM files",
+        required=False,
+        default=[".trim.sort.bam"]
+    )
+    parser.add_argument(
+        "-f", "--fasta-extensions",
+        nargs="+",
+        help="File extensions for FASTA files",
+        required=False,
+        default=[".fa", ".fasta"]
+    )
+    parser.add_argument(
+        "-m", "--metadata-csv",
+        help="Metadata CSV file",
+        required=True,
+        type=Path
+    )
+    parser.add_argument(
+        "-o", "--output-yaml",
+        help="Output YAML file",
+        required=True
+    )
+    args = parser.parse_args()
+    
+    # Build targets
+    targets = {"SAMPLES": {}}
+    for sample_name in args.sample_names:
+        targets["SAMPLES"][sample_name] = {}
+        targets["SAMPLES"][sample_name]["bam"] = find_file_with_extension(args.sample_directory, sample_name, args.bam_extensions)
+        targets["SAMPLES"][sample_name]["fasta"] = find_file_with_extension(args.sample_directory, sample_name, args.fasta_extensions)
+    
+    # Write empty fields
+    if args.metadata_csv.is_file():
+        targets["METADATA"] = args.metadata_csv.as_posix()
+    else:
+        sys.exit(f"ERROR: metadata file '{args.metadata_csv}' does not exist")
+    targets["OUTPUT_DIRECTORY"] = "output"
+    targets["CONTEXT_FASTA"] = None
+    targets["MAPPING_REFERENCES_FASTA"] = None
+
+    # Write output
+    with open(args.output_yaml, "w") as fw:
+        yaml.dump(targets, fw, indent=2)
diff --git a/config/README.md b/config/README.md
index 5ba6c8e..4aa82ae 100644
--- a/config/README.md
+++ b/config/README.md
@@ -9,7 +9,13 @@ Modify the [target configuration file](config/targets.yaml)
 to point the `SAMPLES` and `METADATA` parameters to your data. The `OUTPUT_DIRECTORY`
 parameter should point to your desired results directory.
 
-It should look like this:
+The script [`build_targets.py`](build_targets.py) simplifies the process of creating
+the configuration file. To run this script, you need to have PyYAML installed. It
+takes a list of sample names, a directory with BAM and FASTA files, and the path to
+the metadata table as required inputs. Then, it searches the directory for files
+that have the appropriate extensions and sample names and adds them to the configuration file.
+
+The file should look like this:
 
 ```yaml
 SAMPLES:

From 5863d1ba8dee12e4fe5db9d09a1540078d7f78f1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Miguel=20=C3=81lvarez=20Herrera?=
 <m.alvarez.herrera@csic.es>
Date: Wed, 11 Oct 2023 12:44:59 +0200
Subject: [PATCH 08/11] Fix workflow version typo

---
 workflow/Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflow/Snakefile b/workflow/Snakefile
index 0b28e6a..a550e68 100644
--- a/workflow/Snakefile
+++ b/workflow/Snakefile
@@ -14,7 +14,7 @@ include: "rules/common.smk"
 
 # Workflow version
 repo_version = get_version_str('.')
-__version__ = "0.1" + f"(git: {repo_version})"
+__version__ = "0.1" + f" (git: {repo_version})"
 
 # Targets
 ## Output

From e7bd49126361b8e331c91674a1447a1c265df104 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Miguel=20=C3=81lvarez=20Herrera?=
 <m.alvarez.herrera@csic.es>
Date: Wed, 11 Oct 2023 15:32:32 +0200
Subject: [PATCH 09/11] Fix JSON paths for tests

---
 .test/targets.yaml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.test/targets.yaml b/.test/targets.yaml
index ad3d17f..9562879 100644
--- a/.test/targets.yaml
+++ b/.test/targets.yaml
@@ -36,3 +36,7 @@ NSP:
   "../data/report_files/nsp_annotation.csv"
 REPORT_QMD:
   "../template.qmd"
+FEATURES_JSON:
+  "../config/sarscov2_features.json"
+GENETIC_CODE_JSON:
+  "../config/standard_genetic_code.json"

From ee70672d4444d2b5c06a32944df3d349e674d419 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Miguel=20=C3=81lvarez=20Herrera?=
 <m.alvarez.herrera@csic.es>
Date: Wed, 11 Oct 2023 16:12:30 +0200
Subject: [PATCH 10/11] Fix unnecessary list conversion

---
 workflow/scripts/report/get_annotation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflow/scripts/report/get_annotation.py b/workflow/scripts/report/get_annotation.py
index c6eb974..552d70d 100644
--- a/workflow/scripts/report/get_annotation.py
+++ b/workflow/scripts/report/get_annotation.py
@@ -13,7 +13,7 @@ def main():
     logging.info("Reading features")
     ft = Features(snakemake.input.gb)
     with open(snakemake.input.features) as f:
-        feature_key = list(json.load(f))
+        feature_key = json.load(f)
 
     logging.info("Reading reference")
     reference = str(next(SeqIO.parse(snakemake.input.ref, "fasta")).seq)

From d69dc295f4b3f4bf5f048a817571d9b74453eeba Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Miguel=20=C3=81lvarez=20Herrera?=
 <m.alvarez.herrera@csic.es>
Date: Wed, 11 Oct 2023 18:45:19 +0200
Subject: [PATCH 11/11] Resolve JSON paths

---
 workflow/rules/evolution.smk | 4 ++--
 workflow/rules/report.smk    | 2 +-
 workflow/rules/vaf.smk       | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/workflow/rules/evolution.smk b/workflow/rules/evolution.smk
index e8ae219..e89d358 100644
--- a/workflow/rules/evolution.smk
+++ b/workflow/rules/evolution.smk
@@ -4,8 +4,8 @@ rule N_S_sites:
     input:
         fasta = OUTDIR/f"{OUTPUT_NAME}.ancestor.fasta",
         gb = OUTDIR/"reference.gb",
-        features = config["FEATURES_JSON"],
-        genetic_code = config["GENETIC_CODE_JSON"]
+        features = Path(config["FEATURES_JSON"]).resolve(),
+        genetic_code = Path(config["GENETIC_CODE_JSON"]).resolve()
     output:
         csv = temp(OUTDIR/f"{OUTPUT_NAME}.ancestor.N_S.sites.csv")
     log:
diff --git a/workflow/rules/report.smk b/workflow/rules/report.smk
index cd9ca87..25d168a 100644
--- a/workflow/rules/report.smk
+++ b/workflow/rules/report.smk
@@ -19,7 +19,7 @@ rule window:
     input:
         vcf = OUTDIR/f"{OUTPUT_NAME}.masked.filtered.tsv",
         gb = OUTDIR/"reference.gb",
-        features = config["FEATURES_JSON"]
+        features = Path(config["FEATURES_JSON"]).resolve()
     output:
         window_df = temp(OUTDIR/f"{OUTPUT_NAME}.window.csv"),
     log:
diff --git a/workflow/rules/vaf.smk b/workflow/rules/vaf.smk
index 821ae87..166ea2b 100644
--- a/workflow/rules/vaf.smk
+++ b/workflow/rules/vaf.smk
@@ -59,7 +59,7 @@ rule annotation:
     input:
         gb = OUTDIR/"reference.gb",
         ref = OUTDIR/"reference.fasta",
-        features = config["FEATURES_JSON"]
+        features = Path(config["FEATURES_JSON"]).resolve()
     output:
         df = temp(OUTDIR/"annotation.csv")
     log: