
Commit

Merge branch 'subset-vamb' of https://github.com/metagenome-atlas/atlas into subset-vamb

SilasK committed Jul 28, 2023
2 parents f0880de + b9c1779, commit 6b6c575
Showing 5 changed files with 121 additions and 98 deletions.
2 changes: 1 addition & 1 deletion workflow/rules/binning.smk
@@ -313,7 +313,7 @@ rule get_bins:
log:
"{sample}/logs/binning/get_bins_{binner}.log",
script:
"get_fasta_of_bins.py"
"../scripts/get_fasta_of_bins.py"


localrules:
47 changes: 18 additions & 29 deletions workflow/rules/cobinning.smk
@@ -13,6 +13,7 @@ rule filter_contigs:
temp("Intermediate/cobinning/filtered_contigs/{sample}.fasta"),
params:
min_length=config["cobining_min_contig_length"],
+ prefix=lambda wc: wc.sample + config["cobinning_separator"]
log:
"logs/cobinning/filter_contigs/{sample}.log",
conda:
@@ -22,16 +23,19 @@ rule filter_contigs:
mem=config["simplejob_mem"],
java_mem=int(config["simplejob_mem"] * JAVA_MEM_FRACTION),
shell:
- " reformat.sh in={input} "
- " fastaminlen={params.min_length} "
+ " rename.sh in={input} "
+ " prefix={params.prefix} addprefix=t addunderscore=f"
+ " minscaf={params.min_length} "
" out={output} "
" overwrite=true "
" -Xmx{resources.java_mem}G 2> {log} "
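Note: the per-sample header prefixing that rename.sh performs here (prefix is the sample name plus the configured cobinning_separator; addprefix=t prepends it to the existing contig name, addunderscore=f keeps rename.sh from inserting its own underscore) replaces the Python loop removed from combine_contigs below. A minimal sketch of the intended effect, assuming "_" as a hypothetical value of config["cobinning_separator"]:

def prefix_headers(fasta_lines, sample, separator="_"):
    # prepend "{sample}{separator}" to each fasta header; sequence lines pass through
    for line in fasta_lines:
        if line.startswith(">"):
            line = f">{sample}{separator}" + line[1:]
        yield line

# e.g. ">k141_23" becomes ">sample1_k141_23" for sample "sample1"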


def get_samples_of_bingroup(wildcards):

- return sampleTable.query(f'BinGroup=="{wildcards.bingroup}"').index.tolist()
+ samples_of_group = sampleTable.query(f'BinGroup=="{wildcards.bingroup}"').index.tolist()
+
+ return samples_of_group


def get_filtered_contigs_of_bingroup(wildcards):
@@ -45,11 +49,7 @@ def get_filtered_contigs_of_bingroup(wildcards):
"Adapt the sample.tsv to set BinGroup of size [5- 1000]"
)

- return {
-     "fasta": ancient(
-         expand(rules.filter_contigs.output[0], sample=samples_of_group)
-     ),
- }
+ return expand(rules.filter_contigs.output[0], sample=samples_of_group)
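For reference, both helpers above select samples via the BinGroup column of the sample table; a hypothetical sample.tsv fragment (other columns omitted):

sample    BinGroup
sample1   groupA
sample2   groupA
sample3   groupB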


def get_bams_of_bingroup(wildcards):
@@ -65,27 +65,16 @@ def get_bams_of_bingroup(wildcards):

rule combine_contigs:
input:
- unpack(get_filtered_contigs_of_bingroup),
+ fasta=get_filtered_contigs_of_bingroup,
output:
"Intermediate/cobinning/{bingroup}/combined_contigs.fasta.gz",
log:
"logs/cobinning/{bingroup}/combine_contigs.log",
- params:
-     seperator=config["cobinning_separator"],
-     samples=SAMPLES,
- threads: 1
- run:
-     import gzip as gz
-
-     with gz.open(output[0], "wt") as fout:
-         for sample, input_fasta in zip(params.samples, input.fasta):
-             with open(input_fasta) as fin:
-                 for line in fin:
-                     # if line is a header add sample name
-                     if line[0] == ">":
-                         line = f">{sample}{params.seperator}" + line[1:]
-                     # write each line to the combined file
-                     fout.write(line)
+ conda:
+     "../envs/required_packages.yaml"
+ threads: config["simplejob_threads"],
+ shell:
+     " (cat {input} | pigz -p {threads} > {output} ) 2> {log}"
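Note: since every contig header was already sample-prefixed in filter_contigs, combining a bingroup is now plain concatenation compressed in a stream with pigz. A rough Python equivalent of the shell pipeline, with hypothetical file names:

import gzip
import shutil

def combine_fastas(sample_fastas, combined_path):
    # stream-concatenate per-sample fastas into a single gzip-compressed file
    with gzip.open(combined_path, "wb") as fout:
        for fasta in sample_fastas:
            with open(fasta, "rb") as fin:
                shutil.copyfileobj(fin, fout)

combine_fastas(["sample1.fasta", "sample2.fasta"], "combined_contigs.fasta.gz")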


rule minimap_index:
@@ -97,7 +86,7 @@ rule minimap_index:
index_size="12G",
resources:
mem=config["mem"], # limited num of fatnodes (>200g)
- threads: 3
+ threads: config["simplejob_threads"],
log:
"logs/cobinning/{bingroup}/minimap_index.log",
benchmark:
@@ -115,7 +104,7 @@ rule samtools_dict:
dict="Intermediate/cobinning/{bingroup}/combined_contigs.dict",
resources:
mem=config["simplejob_mem"],
- ttime=config["runtime"]["simplejob"],
+ time=config["runtime"]["simplejob"],
threads: 1
log:
"logs/cobinning/{bingroup}/samtools_dict.log",
@@ -242,7 +231,7 @@ rule parse_vamb_output:
input:
expand(rules.run_vamb.output, bingroup=sampleTable.BinGroup.unique()),
output:
renamed_clusters="Intermediate/cobinning/vamb_clusters.tsv.gz",
renamed_clusters="Binning/vamb/vamb_clusters.tsv.gz",
cluster_atributions=expand(vamb_cluster_attribution_path, sample=SAMPLES),
log:
"logs/cobinning/vamb_parse_output.log",
@@ -259,7 +248,7 @@ rule parse_vamb_output:

rule vamb:
input:
"Intermediate/cobinning/vamb_clusters.tsv.gz",
"Binning/vamb/vamb_clusters.tsv.gz",
expand("{sample}/binning/vamb/cluster_attribution.tsv", sample=SAMPLES),


60 changes: 0 additions & 60 deletions workflow/rules/get_fasta_of_bins.py

This file was deleted.

97 changes: 97 additions & 0 deletions workflow/scripts/get_fasta_of_bins.py
@@ -0,0 +1,97 @@
#! /usr/bin/env python


import sys, os
import logging, traceback

# configure logging to the snakemake log file when run from within snakemake
if "snakemake" in globals():
    logging.basicConfig(
        filename=snakemake.log[0],
        level=logging.INFO,
        format="%(asctime)s %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )


def handle_exception(exc_type, exc_value, exc_traceback):
    if issubclass(exc_type, KeyboardInterrupt):
        sys.__excepthook__(exc_type, exc_value, exc_traceback)
        return

    logging.error(
        "".join(
            [
                "Uncaught exception: ",
                *traceback.format_exception(exc_type, exc_value, exc_traceback),
            ]
        )
    )


# Install exception handler
sys.excepthook = handle_exception


# start of script
import argparse
import shutil
import warnings

import pandas as pd
from Bio import SeqIO


def get_fasta_of_bins(cluster_attribution, contigs, out_folder):
    """
    Creates individual fasta files for each bin using the contigs fasta and the cluster attribution.
    input:
    - cluster_attribution: tab-separated file of "contig_fasta_header bin"
    - contigs: fasta file of contigs
    - out_folder: output folder for the bin fastas, {out_folder}/{binid}.fasta
    """
    # create outdir
    if os.path.exists(out_folder):
        shutil.rmtree(out_folder)
    os.makedirs(out_folder)

    CA = pd.read_csv(cluster_attribution, header=None, sep="\t", dtype=str)

    assert CA.shape[1] == 2, "File should have only two columns " + cluster_attribution

    CA.columns = ["Contig", "Bin"]

    # assert that Contig is unique
    assert CA.Contig.is_unique, (
        f"First column of file {cluster_attribution} should be contigs, hence unique. "
        f"I got:\n{CA.head()}"
    )

    contigs = SeqIO.to_dict(SeqIO.parse(contigs, "fasta"))

    for binid in CA.Bin.unique():
        # names of the contigs attributed to this bin
        bin_contig_names = CA.query("Bin == @binid").Contig.tolist()
        out_file = os.path.join(out_folder, f"{binid}.fasta")

        if type(bin_contig_names) == str:
            warnings.warn(f"single contig bin Bin : {binid} {bin_contig_names}")
            bin_contig_names = [bin_contig_names]

        bin_contigs = [contigs[c] for c in bin_contig_names]
        SeqIO.write(bin_contigs, out_file, "fasta")


if __name__ == "__main__":
    if "snakemake" not in globals():
        p = argparse.ArgumentParser()
        p.add_argument("--cluster-attribution")
        p.add_argument("--contigs")
        p.add_argument("--out-folder")
        args = vars(p.parse_args())
        get_fasta_of_bins(**args)
    else:
        get_fasta_of_bins(
            snakemake.input.cluster_attribution,
            snakemake.input.contigs,
            snakemake.output[0],
        )
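Usage note: outside Snakemake the script falls back to the argparse interface above, whose dashed flags map onto the function's keyword arguments. A hypothetical standalone invocation (file names are placeholders):

python workflow/scripts/get_fasta_of_bins.py \
    --cluster-attribution cluster_attribution.tsv \
    --contigs contigs.fasta \
    --out-folder bins

where cluster_attribution.tsv is the expected two-column, tab-separated mapping of contig to bin, e.g. (tabs rendered as spaces):

contig_1    bin_A
contig_2    bin_A
contig_3    bin_B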
13 changes: 5 additions & 8 deletions workflow/scripts/parse_vamb.py
@@ -48,7 +48,6 @@ def handle_exception(exc_type, exc_value, exc_traceback):
all_clusters = []

for vamb_folder in list(snakemake.input):

vamb_folder = Path(vamb_folder)
# extract the bingroup name from the path
vamb_folder_name = vamb_folder.parts[-1]
@@ -88,7 +87,7 @@ def handle_exception(exc_type, exc_value, exc_traceback):
clusters.columns = ["Sample", "Contig"]

# get number of BinID given by vamb, prefix with bingroup
clusters["BinID"] = (
clusters["BinId"] = (
bingroup
+ clusters_contigs.OriginalName.str.rsplit(separator, n=1, expand=True)[1]
)
@@ -105,23 +104,21 @@ def handle_exception(exc_type, exc_value, exc_traceback):
del clusters_contigs


logging.info(f"Concatenate all clusters")
logging.info(f"Concatenate clusters of all bingroups")
clusters = pd.concat(all_clusters, axis=0)


logging.info(f"Write reformated table to {output_culsters}")
clusters.to_csv(output_culsters, sep="\t", index=False)


n_bins = (
clusters.query("Large_enough").groupby(["BinGroup", "Sample"])["BinId"].nunique()
)
logging.info(
f"Number of bins per sample and bingroup passing the size filter:\n{n_bins}"
)

logging.info(f"Write reformated table to {output_culsters}")
clusters.to_csv(output_culsters, sep="\t", index=False)

clusters["SampleBin"] = clusters.Sample + "_vamb_" + clusters.BinID
clusters["SampleBin"] = clusters.Sample + "_vamb_" + clusters.BinId

logging.info(f"Write cluster_attribution for samples")
for sample, cl in clusters.groupby("Sample"):
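For orientation, parse_vamb.py reshapes the raw vamb clusters into one row per contig with columns Sample, Contig, BinId (the bingroup name concatenated with vamb's cluster number) and SampleBin (Sample + "_vamb_" + BinId); a hypothetical row for bingroup "A" and separator "_" would read: sample1, sample1_contig_5, A42, sample1_vamb_A42. The final groupby("Sample") then writes one cluster_attribution.tsv per sample.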
