
Commit

Merge branch 'subset-vamb' of https://github.com/metagenome-atlas/atlas into subset-vamb

SilasK committed Jul 28, 2023
2 parents f0880de + b9c1779, commit 6b6c575
Showing 5 changed files with 121 additions and 98 deletions.
2 changes: 1 addition & 1 deletion workflow/rules/binning.smk
@@ -313,7 +313,7 @@ rule get_bins:
log:
"{sample}/logs/binning/get_bins_{binner}.log",
script:
"get_fasta_of_bins.py"
"../scripts/get_fasta_of_bins.py"


localrules:
47 changes: 18 additions & 29 deletions workflow/rules/cobinning.smk
@@ -13,6 +13,7 @@ rule filter_contigs:
temp("Intermediate/cobinning/filtered_contigs/{sample}.fasta"),
params:
min_length=config["cobining_min_contig_length"],
+ prefix=lambda wc: wc.sample + config["cobinning_separator"]
log:
"logs/cobinning/filter_contigs/{sample}.log",
conda:
@@ -22,16 +23,19 @@ rule filter_contigs:
mem=config["simplejob_mem"],
java_mem=int(config["simplejob_mem"] * JAVA_MEM_FRACTION),
shell:
- " reformat.sh in={input} "
- " fastaminlen={params.min_length} "
+ " rename.sh in={input} "
+ " prefix={params.prefix} addprefix=t addunderscore=f"
+ " minscaf={params.min_length} "
" out={output} "
" overwrite=true "
" -Xmx{resources.java_mem}G 2> {log} "
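Note: the per-sample header prefixing that rename.sh performs here (prefix is the sample name plus the configured cobinning_separator; addprefix=t prepends it to the existing contig name, addunderscore=f keeps rename.sh from inserting its own underscore) replaces the Python loop removed from combine_contigs below. A minimal sketch of the intended effect, assuming "_" as a hypothetical value of config["cobinning_separator"]:

def prefix_headers(fasta_lines, sample, separator="_"):
    # prepend "{sample}{separator}" to each fasta header; sequence lines pass through
    for line in fasta_lines:
        if line.startswith(">"):
            line = f">{sample}{separator}" + line[1:]
        yield line

# e.g. ">k141_23" becomes ">sample1_k141_23" for sample "sample1"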


def get_samples_of_bingroup(wildcards):

- return sampleTable.query(f'BinGroup=="{wildcards.bingroup}"').index.tolist()
+ samples_of_group = sampleTable.query(f'BinGroup=="{wildcards.bingroup}"').index.tolist()
+
+ return samples_of_group


def get_filtered_contigs_of_bingroup(wildcards):
@@ -45,11 +49,7 @@ def get_filtered_contigs_of_bingroup(wildcards):
"Adapt the sample.tsv to set BinGroup of size [5- 1000]"
)

- return {
-     "fasta": ancient(
-         expand(rules.filter_contigs.output[0], sample=samples_of_group)
-     ),
- }
+ return expand(rules.filter_contigs.output[0], sample=samples_of_group)
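For reference, both helpers above select samples via the BinGroup column of the sample table; a hypothetical sample.tsv fragment (other columns omitted):

sample    BinGroup
sample1   groupA
sample2   groupA
sample3   groupB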


def get_bams_of_bingroup(wildcards):
@@ -65,27 +65,16 @@ def get_bams_of_bingroup(wildcards):

rule combine_contigs:
input:
- unpack(get_filtered_contigs_of_bingroup),
+ fasta=get_filtered_contigs_of_bingroup,
output:
"Intermediate/cobinning/{bingroup}/combined_contigs.fasta.gz",
log:
"logs/cobinning/{bingroup}/combine_contigs.log",
- params:
-     seperator=config["cobinning_separator"],
-     samples=SAMPLES,
- threads: 1
- run:
-     import gzip as gz
-
-     with gz.open(output[0], "wt") as fout:
-         for sample, input_fasta in zip(params.samples, input.fasta):
-             with open(input_fasta) as fin:
-                 for line in fin:
-                     # if line is a header add sample name
-                     if line[0] == ">":
-                         line = f">{sample}{params.seperator}" + line[1:]
-                     # write each line to the combined file
-                     fout.write(line)
+ conda:
+     "../envs/required_packages.yaml"
+ threads: config["simplejob_threads"],
+ shell:
+     " (cat {input} | pigz -p {threads} > {output} ) 2> {log}"
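Note: since every contig header was already sample-prefixed in filter_contigs, combining a bingroup is now plain concatenation compressed in a stream with pigz. A rough Python equivalent of the shell pipeline, with hypothetical file names:

import gzip
import shutil

def combine_fastas(sample_fastas, combined_path):
    # stream-concatenate per-sample fastas into a single gzip-compressed file
    with gzip.open(combined_path, "wb") as fout:
        for fasta in sample_fastas:
            with open(fasta, "rb") as fin:
                shutil.copyfileobj(fin, fout)

combine_fastas(["sample1.fasta", "sample2.fasta"], "combined_contigs.fasta.gz")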


rule minimap_index:
@@ -97,7 +86,7 @@ rule minimap_index:
index_size="12G",
resources:
mem=config["mem"], # limited num of fatnodes (>200g)
- threads: 3
+ threads: config["simplejob_threads"],
log:
"logs/cobinning/{bingroup}/minimap_index.log",
benchmark:
@@ -115,7 +104,7 @@ rule samtools_dict:
dict="Intermediate/cobinning/{bingroup}/combined_contigs.dict",
resources:
mem=config["simplejob_mem"],
- ttime=config["runtime"]["simplejob"],
+ time=config["runtime"]["simplejob"],
threads: 1
log:
"logs/cobinning/{bingroup}/samtools_dict.log",
@@ -242,7 +231,7 @@ rule parse_vamb_output:
input:
expand(rules.run_vamb.output, bingroup=sampleTable.BinGroup.unique()),
output:
renamed_clusters="Intermediate/cobinning/vamb_clusters.tsv.gz",
renamed_clusters="Binning/vamb/vamb_clusters.tsv.gz",
cluster_atributions=expand(vamb_cluster_attribution_path, sample=SAMPLES),
log:
"logs/cobinning/vamb_parse_output.log",
@@ -259,7 +248,7 @@ rule parse_vamb_output:

rule vamb:
input:
"Intermediate/cobinning/vamb_clusters.tsv.gz",
"Binning/vamb/vamb_clusters.tsv.gz",
expand("{sample}/binning/vamb/cluster_attribution.tsv", sample=SAMPLES),


60 changes: 0 additions & 60 deletions workflow/rules/get_fasta_of_bins.py

This file was deleted.

97 changes: 97 additions & 0 deletions workflow/scripts/get_fasta_of_bins.py
@@ -0,0 +1,97 @@
#! /usr/bin/env python


import sys, os
import logging, traceback

# configure logging to the snakemake log file when run from within snakemake
if "snakemake" in globals():
    logging.basicConfig(
        filename=snakemake.log[0],
        level=logging.INFO,
        format="%(asctime)s %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )


def handle_exception(exc_type, exc_value, exc_traceback):
    if issubclass(exc_type, KeyboardInterrupt):
        sys.__excepthook__(exc_type, exc_value, exc_traceback)
        return

    logging.error(
        "".join(
            [
                "Uncaught exception: ",
                *traceback.format_exception(exc_type, exc_value, exc_traceback),
            ]
        )
    )


# Install exception handler
sys.excepthook = handle_exception


# start of script
import argparse
import shutil
import warnings

import pandas as pd
from Bio import SeqIO


def get_fasta_of_bins(cluster_attribution, contigs, out_folder):
    """
    Creates individual fasta files for each bin using the contigs fasta and the cluster attribution.
    input:
    - cluster_attribution: tab-separated file of "contig_fasta_header bin"
    - contigs: fasta file of contigs
    - out_folder: output folder for the bin fastas, {out_folder}/{binid}.fasta
    """
    # create outdir
    if os.path.exists(out_folder):
        shutil.rmtree(out_folder)
    os.makedirs(out_folder)

    CA = pd.read_csv(cluster_attribution, header=None, sep="\t", dtype=str)

    assert CA.shape[1] == 2, "File should have only two columns " + cluster_attribution

    CA.columns = ["Contig", "Bin"]

    # assert that Contig is unique
    assert CA.Contig.is_unique, (
        f"First column of file {cluster_attribution} should be contigs, hence unique. "
        f"I got:\n{CA.head()}"
    )

    contigs = SeqIO.to_dict(SeqIO.parse(contigs, "fasta"))

    for binid in CA.Bin.unique():
        # names of the contigs attributed to this bin
        bin_contig_names = CA.query("Bin == @binid").Contig.tolist()
        out_file = os.path.join(out_folder, f"{binid}.fasta")

        if type(bin_contig_names) == str:
            warnings.warn(f"single contig bin Bin : {binid} {bin_contig_names}")
            bin_contig_names = [bin_contig_names]

        bin_contigs = [contigs[c] for c in bin_contig_names]
        SeqIO.write(bin_contigs, out_file, "fasta")


if __name__ == "__main__":
    if "snakemake" not in globals():
        p = argparse.ArgumentParser()
        p.add_argument("--cluster-attribution")
        p.add_argument("--contigs")
        p.add_argument("--out-folder")
        args = vars(p.parse_args())
        get_fasta_of_bins(**args)
    else:
        get_fasta_of_bins(
            snakemake.input.cluster_attribution,
            snakemake.input.contigs,
            snakemake.output[0],
        )
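Usage note: outside Snakemake the script falls back to the argparse interface above, whose dashed flags map onto the function's keyword arguments. A hypothetical standalone invocation (file names are placeholders):

python workflow/scripts/get_fasta_of_bins.py \
    --cluster-attribution cluster_attribution.tsv \
    --contigs contigs.fasta \
    --out-folder bins

where cluster_attribution.tsv is the expected two-column, tab-separated mapping of contig to bin, e.g. (tabs rendered as spaces):

contig_1    bin_A
contig_2    bin_A
contig_3    bin_B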
13 changes: 5 additions & 8 deletions workflow/scripts/parse_vamb.py
@@ -48,7 +48,6 @@ def handle_exception(exc_type, exc_value, exc_traceback):
all_clusters = []

for vamb_folder in list(snakemake.input):

vamb_folder = Path(vamb_folder)
# extract the bingroup name from the path
vamb_folder_name = vamb_folder.parts[-1]
@@ -88,7 +87,7 @@ def handle_exception(exc_type, exc_value, exc_traceback):
clusters.columns = ["Sample", "Contig"]

# get number of BinID given by vamb, prefix with bingroup
clusters["BinID"] = (
clusters["BinId"] = (
bingroup
+ clusters_contigs.OriginalName.str.rsplit(separator, n=1, expand=True)[1]
)
@@ -105,23 +104,21 @@ def handle_exception(exc_type, exc_value, exc_traceback):
del clusters_contigs


logging.info(f"Concatenate all clusters")
logging.info(f"Concatenate clusters of all bingroups")
clusters = pd.concat(all_clusters, axis=0)


logging.info(f"Write reformated table to {output_culsters}")
clusters.to_csv(output_culsters, sep="\t", index=False)


n_bins = (
clusters.query("Large_enough").groupby(["BinGroup", "Sample"])["BinId"].nunique()
)
logging.info(
f"Number of bins per sample and bingroup passing the size filter:\n{n_bins}"
)

logging.info(f"Write reformated table to {output_culsters}")
clusters.to_csv(output_culsters, sep="\t", index=False)

clusters["SampleBin"] = clusters.Sample + "_vamb_" + clusters.BinID
clusters["SampleBin"] = clusters.Sample + "_vamb_" + clusters.BinId

logging.info(f"Write cluster_attribution for samples")
for sample, cl in clusters.groupby("Sample"):
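For orientation, parse_vamb.py reshapes the raw vamb clusters into one row per contig with columns Sample, Contig, BinId (the bingroup name concatenated with vamb's cluster number) and SampleBin (Sample + "_vamb_" + BinId); a hypothetical row for bingroup "A" and separator "_" would read: sample1, sample1_contig_5, A42, sample1_vamb_A42. The final groupby("Sample") then writes one cluster_attribution.tsv per sample.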
