semibin in subgroups
SilasK committed Jul 26, 2023
1 parent 8f0b766 commit 7bf90ae
Showing 2 changed files with 46 additions and 31 deletions.
6 changes: 4 additions & 2 deletions workflow/rules/cobinning.smk
@@ -29,12 +29,14 @@ rule filter_contigs:
" -Xmx{resources.java_mem}G 2> {log} "


+ def get_samples_of_bingroup(wildcards):
+ return sampleTable.query(f'BinGroup=="{wildcards.bingroup}"').index.tolist()

def get_filtered_contigs_of_bingroup(wildcards):


- samples_of_group = sampleTable.query(f'BinGroup=="{wildcards.bingroup}"').index.tolist()
+ samples_of_group = get_samples_of_bingroup(wildcards)


if len(samples_of_group) <= 5:
raise ValueError(f"Bin group {wildcards.bingroup} has {len(samples_of_group)} less than 5 samples."
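For context: the new helper simply returns the sample IDs whose BinGroup column matches the requested group. A minimal, self-contained sketch of that lookup, assuming sampleTable is a pandas DataFrame indexed by sample name (the sample IDs and group names below are made up):

import pandas as pd

# Hypothetical sample sheet: one row per sample, with a BinGroup column as in cobinning.smk
sampleTable = pd.DataFrame(
    {"BinGroup": ["G1", "G1", "G2"]},
    index=["sample1", "sample2", "sample3"],
)

def get_samples_of_bingroup_example(bingroup):
    # same query as above, but taking the group name directly instead of a wildcards object
    return sampleTable.query(f'BinGroup=="{bingroup}"').index.tolist()

print(get_samples_of_bingroup_example("G1"))  # ['sample1', 'sample2']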
71 changes: 42 additions & 29 deletions workflow/rules/semibin.smk
@@ -2,77 +2,90 @@
rule semibin_generate_data_multi:
input:
fasta=rules.combine_contigs.output,
- bams=expand(rules.sort_bam.output, sample=SAMPLES),
+ bams=lambda wc: expand(rules.sort_bam.output, sample=get_samples_of_bingroup(wc)),
output:
- expand(
- "Cobinning/SemiBin/samples/{sample}/{files}",
- sample=SAMPLES,
- files=["data.csv", "data_split.csv"],
- ),
+ directory("Intermediate/cobinning/{bingroup}/semibin/data_multi")
+ # expand(
+ # "Cobinning/SemiBin/samples/{sample}/{files}",
+ # sample=SAMPLES,
+ # files=["data.csv", "data_split.csv"],
+ # ),
conda:
"../envs/semibin.yaml"
threads: config["threads"]
resources:
mem=config["mem"],
time=config["runtime"]["default"],
log:
"logs/semibin/generate_data_multi.log",
"logs/semibin/{bingroup}/generate_data_multi.log",
benchmark:
"logs/benchmarks/semibin/generate_data_multi.tsv"
"logs/benchmarks/semibin/{bingroup}/generate_data_multi.tsv"
params:
output_dir="Cobinning/SemiBin",
# output_dir="Cobinning/SemiBin",
separator=config["cobinning_separator"],
shell:
"SemiBin generate_sequence_features_multi"
" --input-fasta {input.fasta} "
" --input-bam {input.bams} "
" --output {params.output_dir} "
" --output {output} "
" --threads {threads} "
" --separator {params.separator} "
" 2> {log}"


rule semibin_train:
input:
"{sample}/{sample}_contigs.fasta",
fasta=rules.filter_contigs.output,
bams=expand(rules.sort_bam.output, sample=SAMPLES),
data="Cobinning/SemiBin/samples/{sample}/data.csv",
data_split="Cobinning/SemiBin/samples/{sample}/data_split.csv",
flag = "{sample}/{sample}_contigs.fasta",
fasta_sample = rules.filter_contigs.output,
bams= rules.semibin_generate_data_multi.input.bams,
data_folder= rules.semibin_generate_data_multi.output[0],
output:
"Cobinning/SemiBin/{sample}/model.h5",
"Intermediate/cobinning/{bingroup}/semibin/models/{sample}/model.h5",
conda:
"../envs/semibin.yaml"
threads: config["threads"]
resources:
mem=config["mem"],
time=config["runtime"]["default"],
log:
"logs/semibin/train/{sample}.log",
"logs/semibin/{bingroup}/train/{sample}.log",
benchmark:
"logs/benchmarks/semibin/train/{sample}.tsv"
"logs/benchmarks/semibin/{bingroup}/train/{sample}.tsv"
params:
output_dir=lambda wc, output: os.path.dirname(output[0]),
+ data = lambda wc, input: Path(input.data_folder)/"samples"/wc.sample/"data.csv",
+ data_split = lambda wc, input: Path(input.data_folder)/"samples"/wc.sample/"data_split.csv",
extra=config["semibin_train_extra"],
shell:
"SemiBin train_self "
" --output {params.output_dir} "
" --threads {threads} "
" --data {input.data} "
" --data-split {input.data_split} "
" --data {params.data} "
" --data-split {params.data_split} "
" {params.extra} "
" 2> {log}"


+ def semibin_input(wildcards):
+ bingroup_of_sample = sampleTable.loc[wildcards.sample, "BinGroup"]
+ samples_of_bingroup = sampleTable.query(f'BinGroup=="{bingroup_of_sample}"').index.tolist()
+ return dict(
+ flag="{sample}/{sample}_contigs.fasta",
+ fasta=rules.filter_contigs.output,
+ bams=expand(rules.sort_bam.output, sample=samples_of_bingroup),
+ data_folder=rules.semibin_generate_data_multi.output[0].format(bingroup=bingroup_of_sample),
+ model=rules.semibin_train.output[0].format(bingroup=bingroup_of_sample, sample=wildcards.sample),
+ )
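rule run_semibin below consumes this function through Snakemake's unpack(), which turns the returned dict into named inputs, so the rule can still refer to {input.fasta}, {input.model}, and so on. A minimal, generic sketch of that mechanism (rule name, function name, and file paths are invented):

def my_inputs(wildcards):
    return dict(
        reads=f"fastq/{wildcards.sample}.fastq.gz",
        ref="reference/genome.fasta",
    )

rule align:
    input:
        unpack(my_inputs),  # dict keys become named inputs: {input.reads}, {input.ref}
    output:
        "aligned/{sample}.txt",
    shell:
        "echo aligning {input.reads} to {input.ref} > {output}"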

rule run_semibin:
input:
"{sample}/{sample}_contigs.fasta",
fasta=rules.filter_contigs.output,
bams=expand(rules.sort_bam.output, sample=SAMPLES),
data="Cobinning/SemiBin/samples/{sample}/data.csv",
model=rules.semibin_train.output[0],
unpack(semibin_input),
output:
directory("Cobinning/SemiBin/{sample}/output_recluster_bins/"),
# contains no info to bingroup
directory("Intermediate/cobinning/semibin_output/{sample}/output_recluster_bins/"),
conda:
"../envs/semibin.yaml"
threads: config["threads"]
@@ -84,15 +84,15 @@ rule run_semibin:
benchmark:
"logs/benchmarks/semibin/bin/{sample}.tsv"
params:
output_dir="Cobinning/SemiBin/{sample}/",
output_dir= lambda wc, output: os.path.dirname(output[0])
data = lambda wc, input: Path(input.data_folder)/"samples"/wc.sample/"data.csv",
min_bin_kbs=int(config["cobining_min_bin_size"] / 1000),
extra=config["semibin_options"],
shell:
"SemiBin bin "
" --input-fasta {input.fasta} "
" --output {params.output_dir} "
" --threads {threads} "
" --data {input.data} "
" --data {params.data} "
" --model {input.model} "
" --minfasta-kbs {params.min_bin_kbs}"
" {params.extra} "
@@ -120,5 +134,4 @@ rule parse_semibin_output:

rule semibin:
input:
expand("Cobinning/SemiBin/{sample}/output_recluster_bins/", sample=SAMPLES),
expand("{sample}/binning/SemiBin/cluster_attribution.tsv", sample=SAMPLES),
