Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 41 additions & 21 deletions vHULK.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@
import re
import sys
import os
import tqdm
import multiprocessing
import functools

# Set logging level for TensorFlow
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
Expand Down Expand Up @@ -211,7 +214,7 @@ def run_prokka(fasta_in, output_dir, threads):
threads, out_prefix, genome_dir, fasta_in
)
)
return_code = subprocess.run(command_line, shell=True)
return_code = subprocess.run(command_line, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

return_code.check_returncode()

Expand Down Expand Up @@ -481,25 +484,35 @@ def main():

print("**Prokka has started, this may take a while. Be patient.")

prokka_jobs = []
for bin_fasta in list_bins:
len_bin = 0
bin_name = get_bin_name(bin_fasta)
for record in SeqIO.parse(bin_fasta, "fasta"):
len_bin += len(record.seq)
if len_bin < 5000:
print(
"**vHULK has found a genome or bin, which is too short to "
"code proteins (< 5000 bp). As CDSs are an import feature for "
"vHULK, we will be skipping this: " + bin_fasta.name
)
# print(
# "**vHULK has found a genome or bin, which is too short to "
# "code proteins (< 5000 bp). As CDSs are an import feature for "
# "vHULK, we will be skipping this: " + bin_fasta.name
# )
prokka_skipped[bin_name] = bin_fasta
continue
prokka_jobs.append(bin_fasta)
#run_prokka(bin_fasta, prokka_dir, threads)

# count_prokka += 1
# if count_prokka % 10 == 0:
# print("**Done with {} genomes...".format(count_prokka))

print('{} short bins were skipped. (bin_len < 5000 bp)'.format(len(prokka_skipped)))

run_prokka(bin_fasta, prokka_dir, threads)
with multiprocessing.Pool(threads) as p:
run_wrapper = functools.partial(run_prokka, output_dir=prokka_dir, threads=1)
for _ in tqdm.tqdm(p.imap_unordered(run_wrapper, prokka_jobs), total=len(prokka_jobs)):
pass

count_prokka += 1
if count_prokka % 10 == 0:
print("**Done with {} genomes...".format(count_prokka))
count_prokka = len(prokka_jobs)

print("\n**PROKKA finished with no errors")
print(
Expand Down Expand Up @@ -547,16 +560,23 @@ def main():
if not hmmscan_dir.is_dir():
hmmscan_dir.mkdir(parents=True, exist_ok=True)

count_hmms = 0
for faa in valid_faas.values():
run_hmmscan(faa, hmmscan_dir, vog_profiles, threads)
count_hmms += 1
print(
"**Done with {} / {} HMMs\r".format(count_hmms, len(valid_faas)),
end="",
)
else:
print("\n**Done with HMMscan!")
# count_hmms = 0
# for faa in valid_faas.values():
# run_hmmscan(faa, hmmscan_dir, vog_profiles, threads)
# count_hmms += 1
# print(
# "**Done with {} / {} HMMs\r".format(count_hmms, len(valid_faas)),
# end="",
# )
# else:
# print("\n**Done with HMMscan!")

count_hmms = len(valid_faas)
with multiprocessing.Pool(threads) as p:
hmmscan_jobs = list(valid_faas.values())
run_wrapper = functools.partial(run_hmmscan, output_dir=hmmscan_dir, vogs_hmms=vog_profiles, threads=1)
        for _ in tqdm.tqdm(p.imap_unordered(run_wrapper, hmmscan_jobs), total=len(hmmscan_jobs)):
pass

print_now()

Expand Down Expand Up @@ -678,4 +698,4 @@ def main():


if __name__ == "__main__":
main()
main()