Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use Pyrodigal #623

Open
wants to merge 7 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion workflow/envs/prodigal.yaml → workflow/envs/pyrodigal.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,6 @@ channels:
- bioconda
- defaults
dependencies:
- prodigal =2.6.*
- python =3.10
- pyrodigal =2.1
- biopython =1.81
65 changes: 33 additions & 32 deletions workflow/rules/genomes.smk
Original file line number Diff line number Diff line change
Expand Up @@ -104,43 +104,44 @@ rule get_contig2genomes:
ruleorder: get_contig2genomes > rename_genomes


# rule predict_genes_genomes:
# input:
# dir= genomes_dir
# output:
# directory("genomes/annotations/genes")
# conda:
# "%s/prodigal.yaml" % CONDAENV
# log:
# "logs/genomes/prodigal.log"
# shadow:
# "shallow"
# threads:
# config.get("threads", 1)
# script:
# "predict_genes_of_genomes.py"


rule predict_genes_genomes:
input:
os.path.join(genome_dir, "{genome}.fasta"),
dir= genomes_dir
output:
fna="genomes/annotations/genes/{genome}.fna",
faa="genomes/annotations/genes/{genome}.faa",
gff=temp("genomes/annotations/genes/{genome}.gff"),
#directory("genomes/annotations/genes")
expand(
"genomes/annotations/genes/{genome}.{extension}",
genome=get_all_genomes(wildcards),
extension=["fna", "faa"])
conda:
"%s/prodigal.yaml" % CONDAENV
"../envs/pyrodigal.yaml"
log:
"logs/genomes/prodigal/{genome}.txt",
threads: 1
resources:
mem=config["simplejob_mem"],
time=config["runtime"]["simplejob"],
shell:
"""
prodigal -i {input} -o {output.gff} -d {output.fna} \
-a {output.faa} -p meta -f gff 2> {log}
"""
"logs/genomes/prodigal.log"
threads:
config.get("threads", 1)
script:
"predict_genes_of_genomes.py"


# rule predict_genes_genomes:
# input:
# os.path.join(genome_dir, "{genome}.fasta"),
# output:
# fna="genomes/annotations/genes/{genome}.fna",
# faa="genomes/annotations/genes/{genome}.faa",
# conda:
# "../envs/pyrodigal.yaml",
# log:
# "logs/genomes/prodigal/{genome}.txt",
# threads: 1
# resources:
# mem=config["simplejob_mem"],
# time=config["runtime"]["simplejob"],
# shell:
# """
# prodigal -i {input} -o {output.gff} -d {output.fna} \
# -a {output.faa} -p meta -f gff 2> {log}
# """


def get_all_genes(wildcards, extension=".faa"):
Expand Down
80 changes: 40 additions & 40 deletions workflow/rules/predict_genes_of_genomes.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,67 +31,67 @@ def handle_exception(exc_type, exc_value, exc_traceback):
# Install exception handler
sys.excepthook = handle_exception

#### Begining of scripts

# python 3.5 without f strings

import os, shutil, sys
import uuid
import itertools
from glob import glob
from snakemake.shell import shell
from snakemake.io import glob_wildcards
### New script

from Bio import SeqIO
import pyrodigal
from pathlib import Path


from multiprocessing import Pool
import itertools


def predict_genes(genome, fasta, out_dir, log):
fna = "{}/{}.fna".format(out_dir, genome)
faa = "{}/{}.faa".format(out_dir, genome)
gff = "{}/{}.gff".format(out_dir, genome)

shell('printf "{genome}:\n" > {log}'.format(genome=genome, log=log))
shell(
"prodigal -i {fasta} -o {gff} -d {fna} -a {faa} -p sinlge -c -m -f gff 2>> {log} ".format(
fasta=fasta, log=log, gff=gff, fna=fna, faa=faa
)
)
shell('printf "\n" >> {log}'.format(log=log))
def predict_genes(fasta_file, output_folder, translation_table=11, meta=True, name="infer") -> None:
"""Produces faa and fna file from fasta file using pyrodigal """

if not name=="infer":
# get name
if fasta_file.endswith(".gz"): raise NotImplementedError("I have not implmented gziped fasta")
name = Path(fasta_file).stem

# define output files
output_folder = Path(output_folder)
faa_file= output_folder/ (name+".faa")
fna_file= output_folder/ (name+".fna")


def predict_genes_genomes(input_dir, out_dir, log, threads):
genomes_fastas = glob(os.path.join(input_dir, "*.fasta"))
orf_finder = pyrodigal.OrfFinder(meta=meta)

os.makedirs(out_dir, exist_ok=True)
with open(faa_file,"w") as faa_handle, open(fna_file,"w") as fna_handle:

temp_log_dir = os.path.join(os.path.dirname(log), "tmp_" + uuid.uuid4().hex)
os.makedirs(temp_log_dir, exist_ok=False)
for contig in SeqIO.parse(fasta_file, "fasta"):

genome_names = []
log_names = []
for fasta in genomes_fastas:
genome_name = os.path.splitext(os.path.split(fasta)[-1])[0]
genome_names.append(genome_name)
log_names.append(os.path.join(temp_log_dir, genome_name + ".prodigal.tmp"))
genes = orf_finder.find_genes(bytes(contig.seq))


genes.write_genes(fna_handle, sequence_id=contig.id)
genes.write_translations(faa_handle, sequence_id=contig.id,translation_table=translation_table)


def predict_genes_genomes(input_dir, out_dir, threads):

input_dir = Path(input_dir)
out_dir = Path(out_dir)


genomes_fastas= input_dir.glob("*[.fasta|.fn]*")

out_dir.mkdir(exist_ok=True)

pool = Pool(threads)
pool.starmap(
predict_genes,
zip(genome_names, genomes_fastas, itertools.repeat(out_dir), log_names),
zip(genomes_fastas, itertools.repeat(out_dir),itertools.repeat(11)),
)

# cat in python
with open(log, "ab") as f_out:
for logfile in log_names:
with open(logfile, "rb") as f_in:
shutil.copyfileobj(f_in, f_out)

shell("rm -r {temp_log_dir}".format(temp_log_dir=temp_log_dir))


if __name__ == "__main__":
predict_genes_genomes(
snakemake.input.dir,
snakemake.output[0],
snakemake.log[0],
int(snakemake.threads),
)
Loading