From a642ba2db2c163dc3a76b42807e6c1bd231e50b3 Mon Sep 17 00:00:00 2001 From: marcelauliano Date: Thu, 16 Nov 2023 15:41:12 +0000 Subject: [PATCH] parsing of contigs_stats --- .../workflows/github-actions-build-code.yml | 4 +- src/mitohifi.py | 1123 +++++++++-------- src/modify_circularity_in_header.py | 17 + 3 files changed, 585 insertions(+), 559 deletions(-) create mode 100644 src/modify_circularity_in_header.py diff --git a/.github/workflows/github-actions-build-code.yml b/.github/workflows/github-actions-build-code.yml index 55cc7f2..85f21fc 100644 --- a/.github/workflows/github-actions-build-code.yml +++ b/.github/workflows/github-actions-build-code.yml @@ -1,9 +1,9 @@ name: Build and Publish MitoHiFi code -on: workflow_dispatch +on: #workflow_dispatch push: branches: - - 'stats' + - 'master' diff --git a/src/mitohifi.py b/src/mitohifi.py index 0a9301f..01265b2 100644 --- a/src/mitohifi.py +++ b/src/mitohifi.py @@ -1,557 +1,566 @@ -""" -This is the main MitoHiFi script. -""" - -import concurrent.futures -from concurrent.futures import wait -import logging -import shutil -import subprocess -import time -import warnings -import pandas as pd -import argparse -import parse_blast -from Bio import SeqIO -import sys -import os -import cleanUpCWD -from compareGenesLists import compare_genes_dicts -from createCoveragePlot import map_potential_contigs, map_final_mito, get_contigs_to_map, get_contigs_headers, split_mapping_by_contig, create_coverage_plot -import fetch -import fetch_mitos -import filterfasta -import findFrameShifts -import fixContigHeaders -import functools -import rotation -import getMitoLength -import getReprContig -from getGenesList import get_genes_list -from gfa2fa import gfa2fa -from parallel_annotation import process_contig, process_contig_02 -from parallel_annotation_mitos import process_contig_mitos, process_contig_02_mitos -import shlex -from circularizationCheck import circularizationCheck, get_circo_mito -import alignContigs -import plot_coverage -import plot_annotation - -def main(): - - __version__ = '3.0.0' - start_time = time.time() - - parser = argparse.ArgumentParser(prog='MitoHiFi') - parser._action_groups.pop() - required = parser.add_argument_group('required arguments') - optional = parser.add_argument_group('optional arguments') - mutually_exclusive_group = required.add_mutually_exclusive_group(required=True) - mutually_exclusive_group.add_argument("-r", help= "-r: Pacbio Hifi Reads from your species", metavar='.fasta') - mutually_exclusive_group.add_argument("-c", help= "-c: Assembled fasta contigs/scaffolds to be searched to find mitogenome", metavar='.fasta') - required.add_argument("-f", help= "-f: Close-related Mitogenome is fasta format", required = "True", metavar='.fasta') - required.add_argument("-g", help= "-k: Close-related species Mitogenome in genebank format", required = "True", metavar='.gbk') - required.add_argument("-t", help= "-t: Number of threads for (i) hifiasm and (ii) the blast search", required = "True", type=int, metavar='') - optional.add_argument("-d", help="-d: debug mode to output additional info on log", action="store_true") - optional.add_argument("-a", help="-a: Choose between animal (default) or plant", default="animal", choices=["animal", "plant", "fungi"]) - optional.add_argument("-p", help="-p: Percentage of query in the blast match with close-related mito", type=int, default=50, metavar='') - optional.add_argument("-m", help="-m: Number of bits for HiFiasm bloom filter [it maps to -f in HiFiasm] (default = 0)", type=int, default=0, metavar='') - optional.add_argument("--max-read-len", help="Maximum lenght of read relative to related mito (default = 1.0x related mito length)", type=float, default=1.0) - optional.add_argument("--mitos", help="Use MITOS2 for annotation (opposed to default MitoFinder", action="store_true") - optional.add_argument('--circular-size', help='Size to consider when checking for circularization', type=int, default=220) - optional.add_argument('--circular-offset', help='Offset from start and finish to consider when looking for circularization', type=int, default=40) - optional.add_argument('-winSize', help='Size of windows to calculate coverage over the final_mitogenom', type=int, default=300) - optional.add_argument('-covMap', help='Minimum mapping quality to filter reads when building final coverage plot', type=int, default=0) - optional.add_argument('-v', '--version', action='version', version='%(prog)s ' + __version__) - optional.add_argument("-o", help="""-o: Organism genetic code following NCBI table (for mitogenome annotation): - 1. The Standard Code 2. The Vertebrate MitochondrialCode 3. The Yeast Mitochondrial Code - 4. The Mold,Protozoan, and Coelenterate Mitochondrial Code and the Mycoplasma/Spiroplasma Code 5. The Invertebrate Mitochondrial Code - 6. The Ciliate, Dasycladacean and Hexamita Nuclear Code 9. The Echinoderm and Flatworm Mitochondrial Code 10. The Euplotid Nuclear Code - 11. The Bacterial, Archaeal and Plant Plastid Code 12. The Alternative Yeast Nuclear Code 13. The Ascidian Mitochondrial Code - 14. The Alternative Flatworm Mitochondrial Code 16. Chlorophycean Mitochondrial Code 21. Trematode Mitochondrial Code - 22. Scenedesmus obliquus Mitochondrial Code 23. Thraustochytrium Mitochondrial Code 24. Pterobranchia Mitochondrial Code - 25. Candidate Division SR1 and Gracilibacteria Code - """, type=str, default='1', metavar='') - args = parser.parse_args() - - # Set log message format - FORMAT='%(asctime)s [%(levelname)s] %(message)s' - - if args.d: # If in debug mode - logging.basicConfig(level=logging.DEBUG, stream=sys.stdout, - format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S') - else: - logging.basicConfig(level=logging.INFO, stream=sys.stdout, - format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S') - - # Welcome message - logging.info("Welcome to MitoHifi v2. Starting pipeline...") - logging.debug("Running MitoHiFi on debug mode.") - - # Measure the length of the related mitogenome - rel_mito_len = getMitoLength.get_mito_length(args.f) - rel_mito_num_genes = getMitoLength.get_mito_genes(args.g) - logging.info("Length of related mitogenome is: {} bp".format(rel_mito_len)) - logging.info("Number of genes on related mitogenome: {}".format(rel_mito_num_genes)) - - # If input are reads, map them to the related mitogenome and assemble the mapped ones - if args.r: - logging.info("Running MitoHifi pipeline in reads mode...") - logging.info("1. First we map your Pacbio HiFi reads to the close-related mitogenome") - minimap_cmd = ["minimap2", "-t", str(args.t), "--secondary=no", "-ax", "map-hifi", args.f] + shlex.split(args.r) - samtools_cmd = ["samtools", "view", "-@", str(args.t), "-b", "-F4", "-F", "0x800", "-o", "reads.HiFiMapped.bam"] - logging.info(" ".join(minimap_cmd) + " | " + " ".join(samtools_cmd)) - minimap = subprocess.Popen(minimap_cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL) - #mapped_reads_f = open("reads.HiFiMapped.bam", "w") - subprocess.run(samtools_cmd, stderr=subprocess.STDOUT, stdin=minimap.stdout) - minimap.wait() - minimap.stdout.close() - - try: - f = open("reads.HiFiMapped.bam") - except FileNotFoundError: - sys.exit("""No reads.HiFiMapped.bam file. - An error may have occurred when mapping reads to the close-related mitogenome""") - finally: - f.close() - - logging.info("2. Now we filter out any mapped reads that are larger than the reference mitogenome to avoid NUMTS") - bam2fasta_cmd = ["samtools", "fasta", "reads.HiFiMapped.bam"] - logging.info("2.1 First we convert the mapped reads from BAM to FASTA format:") - logging.info(" ".join(bam2fasta_cmd) + " > gbk.HiFiMapped.bam.fasta") - mapped_fasta_f = open("gbk.HiFiMapped.bam.fasta", "w") - subprocess.run(bam2fasta_cmd, stdout=mapped_fasta_f, stderr=subprocess.DEVNULL) - before_filter = fetch.get_num_seqs("gbk.HiFiMapped.bam.fasta") - logging.info(f"Total number of mapped reads: {before_filter}") - - max_read_len = round(args.max_read_len * rel_mito_len) - logging.info(f"2.2 Then we filter reads that are larger than {max_read_len} bp") - filterfasta.filterFasta(minLength=max_read_len, neg=True, inStream="gbk.HiFiMapped.bam.fasta", - outPath="gbk.HiFiMapped.bam.filtered.fasta", log=False) - - try: - f = open("gbk.HiFiMapped.bam.filtered.fasta") - except FileNotFoundError: - sys.exit("""No gbk.HiFiMapped.bam.filtered.fasta file. - An error may have occurred when filtering reads larger than the reference mitogenome""") - finally: - f.close() - - after_filter = fetch.get_num_seqs("gbk.HiFiMapped.bam.filtered.fasta") - logging.info(f"Number of filtered reads: {after_filter}") - - logging.info("3. Now let's run hifiasm to assemble the mapped and filtered reads!") - - hifiasm_cmd = ["hifiasm", "--primary", "-t", str(args.t), "-f", str(args.m), - "-o", "gbk.HiFiMapped.bam.filtered.assembled", - "gbk.HiFiMapped.bam.filtered.fasta"] - - logging.info(" ".join(hifiasm_cmd)) - with open("hifiasm.log", "w") as hifiasm_log_f: - subprocess.run(hifiasm_cmd, stderr=subprocess.STDOUT, stdout=hifiasm_log_f) - - f1 = None - f2 = None - - try: - f1 = open("gbk.HiFiMapped.bam.filtered.assembled.p_ctg.gfa") - f2 = open("gbk.HiFiMapped.bam.filtered.assembled.a_ctg.gfa") - except FileNotFoundError: - sys.exit("""No gbk.HiFiMapped.bam.filtered.assembled.[a/p]_ctg.gfa file(s). - An error may have occurred when assembling reads with HiFiasm.""") - finally: - if f1: - f1.close() - if f2: - f2.close() - - # convert Hifiasm assemblies (primary and alternate) from GFA to FASTA format - gfa2fa("gbk.HiFiMapped.bam.filtered.assembled.p_ctg.gfa", "gbk.HiFiMapped.bam.filtered.assembled.p_ctg.fa") - gfa2fa("gbk.HiFiMapped.bam.filtered.assembled.a_ctg.gfa", "gbk.HiFiMapped.bam.filtered.assembled.a_ctg.fa") - - with open("hifiasm.contigs.fasta", "w") as hifiasm_f: - subprocess.run(["cat", "gbk.HiFiMapped.bam.filtered.assembled.p_ctg.fa", "gbk.HiFiMapped.bam.filtered.assembled.a_ctg.fa"], stdout=hifiasm_f) - - contigs = "hifiasm.contigs.fasta" - - else: - logging.info("Running MitoHifi pipeline in contigs mode...") - - logging.info("1. Fixing potentially conflicting FASTA headers") - original_contigs = args.c - fixContigHeaders.fix_headers(original_contigs, "fixed_header_contigs.fasta") - - os.remove(original_contigs) # remove original contig file - shutil.move("fixed_header_contigs.fasta", original_contigs) # replace original contigs file by the version that has the headers fixed - - contigs = original_contigs - - # Set number for the current step (for improving understanding of the log) - # On reads mode, it should be 4; on contigs mode, 2 - if args.r: - step = 4 - else: - step = 2 - - logging.info(f"{step}. Let's run the blast of the contigs versus the close-related mitogenome") - - makeblastdb_cmd = ["makeblastdb", "-in", args.f, "-dbtype", "nucl"] - logging.info(f"{step}.1. Creating BLAST database:") - logging.info(" ".join(makeblastdb_cmd)) - subprocess.run(makeblastdb_cmd, stderr=subprocess.DEVNULL, stdout=subprocess.DEVNULL) - logging.info("Makeblastdb done.") - - blast_cmd = ["blastn", "-query", contigs, "-db", args.f, "-num_threads", str(args.t), - "-out", "contigs.blastn", "-outfmt", "6 std qlen slen"] - logging.info(f"{step}.2. Running blast of contigs against close-related mitogenome:") - logging.info(" ".join(blast_cmd)) - subprocess.run(blast_cmd, stderr=subprocess.STDOUT) - logging.info("Blast done.") - - try: - f = open("contigs.blastn") - except FileNotFoundError: - sys.exit("""No contigs.blastn file. - An error may have occurred when BLASTing contigs against close-related mitogenome""") - finally: - f.close() - - step += 1 - logging.info(f"{step}. Filtering BLAST output to select target sequences") - - #the next script parses a series of conditions to exclude blast with NUMTs. - if args.a == "plant": - ## if species is a plant, set minimum query percentage equal to 0% of related mito - ## and maximum query lenght 10 times the lenght of the related - parse_blast.parse_blast(query_perc=args.p, min_query_perc=0, max_query_len=10) - else: - ## if species is an animal, set minimum query percentage equal to 80% of related mito - ## and maximum query length 5 times the length of the related (default values from - ## parse_blast function - parse_blast.parse_blast(query_perc=args.p, max_query_len=5) - - # select contigs to be circularized - # first look for contigs in parsed_blast.txt - contigs_ids = parse_blast.get_contigs_ids("parsed_blast.txt") - - # if we don't find contigs in parse_blast.txt - # look for contigs in parsed_blast_all.txt - if len(contigs_ids) == 0: - contigs_ids = parse_blast.get_contigs_ids("parsed_blast_all.txt") - - # if we can't find any contigs even in parsed_blast_all.txt, then we exit the pipeline - if len(contigs_ids) == 0: - sys.exit("""Attention! -'parsed_blast.txt' and 'parsed_blast_all.txt' files are empty. -The pipeline has stopped !! You need to run further scripts to check if you have mito reads pulled to a large NUMT!""") - - logging.info("Filtering BLAST finished. A list of the filtered contigs was saved on ./contigs_filtering/contigs_ids.txt file") - - # records all contigs kept for the downstream steps in a file called 'contigs_ids.txt' - with open("contigs_ids.txt", "w") as f: - for contig_id in contigs_ids: - f.write(contig_id + "\n") - - # removes file that contains history of circularization of it already exists - try: - os.remove('all_contigs.circularisationCheck.txt') - except OSError: - pass - - step += 1 - logging.info(f"{step}. Now we are going to circularize, annotate and rotate each filtered contig. Those are potential mitogenome(s).") - - # Set maximum contig size accepted by mitofinder when annotating the contigs - max_contig_size = 5*rel_mito_len - - threads_per_contig = 1 - if args.t // len(contigs_ids) > 1: - threads_per_contig = args.t // len(contigs_ids) - - logging.debug(f"Threads per contig={threads_per_contig}") - logging.debug(f"Thresholds for circularization: circular size={args.circular_size} | circular offset={args.circular_offset}") - logging.debug(f"Thresholds for annotation (MitoFinder): maximum contig size={max_contig_size}") - - if args.mitos: - logging.info("Annotation will be done using MITOS2") - if args.a == "fungi": - refseq_db = "refseq89f" - else: - refseq_db = "refseq89m" - partial_process_contig = functools.partial(process_contig_mitos, threads_per_contig, - args.circular_size, args.circular_offset, - contigs, max_contig_size, args.g, args.o, refseq_db) - else: - logging.info("Annotation will be done using MitoFinder (default)") - partial_process_contig = functools.partial(process_contig, threads_per_contig, - args.circular_size, args.circular_offset, - contigs, max_contig_size, args.g, args.o) - - with concurrent.futures.ProcessPoolExecutor() as executor: - executor.map(partial_process_contig, contigs_ids) - - if args.mitos: - tRNA_ref = fetch_mitos.get_ref_tRNA() - else: - tRNA_ref = fetch.get_ref_tRNA() - logging.debug(f"tRNA to be used as reference for rotation: {tRNA_ref}") - - #logging.info("Finished successfuly!") # debug - #sys.exit(0) # debug - if args.mitos: - partial_process_contig_02 = functools.partial(process_contig_02_mitos, tRNA_ref, - threads_per_contig, args.circular_size, - args.circular_offset, contigs, max_contig_size, - args.g, args.o) - else: - partial_process_contig_02 = functools.partial(process_contig_02, tRNA_ref, - threads_per_contig, args.circular_size, - args.circular_offset, contigs, max_contig_size, - args.g, args.o) - - - with concurrent.futures.ProcessPoolExecutor() as executor: - executor.map(partial_process_contig_02, contigs_ids) - - # concatenates circularization histories from all contigs into - # a single concatenate history file - circularization_history_files = [] - for curr_file in os.listdir('.'): - if curr_file.endswith('.circularisationCheck.txt'): - circularization_history_files.append(curr_file) - logging.debug(f"circularization_history_files: {circularization_history_files}") # for debugging - with open("all_contigs.circularisationCheck.txt", "a") as outfile: - for circ_hist_file in circularization_history_files: - contig_id = circ_hist_file.split('.')[0] - with open(circ_hist_file, "r") as infile: - for line in infile: - outfile.write("\t".join([contig_id, line.strip()+"\n"])) - - step += 1 - #align final mitogenome rotated contigs - logging.info(f"{step}. Now the rotated contigs will be aligned") - # list all final rotated contigs - contigs_files = [] - for curr_file in os.listdir('.'): - if curr_file.endswith('mitogenome.rotated.fa'): - contigs_files.append(curr_file) - if not contigs_files: - sys.exit("""No rotated contigs found. - An error has possibly occurred during annotation and/or rotation of your contigs""") - # first concatenate all rotated contigs into a single multifasta file - logging.info("List of contigs that will be aligned: " + str(contigs_files) + "\n") - concat_fasta = alignContigs.concatenate_contigs(contigs_files) - # then run MAFFT alignment between the rotated contigs using the multifasta as input and clustal as output format - alignContigs.mafft_align(multifasta_file=concat_fasta, threads=args.t, clustal_format=True) - logging.info("Alignment done and saved at ./final_mitogenome_choice/all_mitogenomes.rotated.aligned.aln\n") - - try: - f = open("all_mitogenomes.rotated.aligned.aln") - except FileNotFoundError: - sys.exit("""No all_mitogenomes.rotated.aligned.aln file. - An error may have occurred when aligning the potential contigs""") - finally: - f.close() - - step += 1 - logging.info(f"{step}. Now we will choose the most representative contig" + "\n") - repr_contig_id, repr_contig_cluster = getReprContig.get_repr_contig("all_mitogenomes.rotated.fa", rel_mito_len, rel_mito_num_genes, args.t, args.d) - logging.info(f"Representative contig is {repr_contig_id} that belongs to {repr_contig_cluster}. This contig will be our final mitogenome. See all contigs and clusters in cdhit.out.clstr") - - # create copies for final mitogenome (repr contigs) at both - # FASTA and GENBANK formats - repr_contig_fasta = repr_contig_id + ".mitogenome.rotated.fa" - final_fasta = "final_mitogenome.fasta" - - if args.mitos: - contig_annotation_dir = f"{repr_contig_id}.annotation" - for file in os.listdir(contig_annotation_dir): - if file.endswith(".rotated.gff"): - repr_contig_annotation = os.path.join(contig_annotation_dir, file) - final_annotation = "final_mitogenome.gff" - else: - repr_contig_annotation = repr_contig_id + ".mitogenome.rotated.gb" - final_annotation = "final_mitogenome.gb" - - shutil.copy(repr_contig_fasta, final_fasta) - shutil.copy(repr_contig_annotation, final_annotation) - - #repr_contig_get_gb = ["mitofinder", "--new-genes", "--max-contig-size", - # str(max_contig_size), "-j", "final_mitogenome.annotation", - # "-a", repr_contig_fasta, "-r", args.g, "-o", args.o, "-p", str(args.p), - # "--circular-size", "8000"] - #subprocess.run(repr_contig_get_gb, stderr=subprocess.DEVNULL, stdout=subprocess.DEVNULL) - - #final_fasta = os.path.join("final_mitogenome.annotation", "final_mitogenome.annotation_MitoFinder_mitfi_Final_Results", "final_mitogenome.annotation_mtDNA_contig.fasta") - #final_gbk = os.path.join("final_mitogenome.annotation", "final_mitogenome.annotation_MitoFinder_mitfi_Final_Results", "final_mitogenome.annotation_mtDNA_contig.gb") - - # Generate contigs stats - step += 1 - logging.info(f"""{step}. Calculating final stats for final mitogenome and other potential contigs. - Stats will be saved on contigs_stats.tsv file.""") - ## Print first three lines (comment and header) - is_circ = getReprContig.get_circularization_info(repr_contig_id) - if args.mitos: - frameshifts = findFrameShifts.find_frameshifts_mitos(final_annotation) - contig_len, num_genes = findFrameShifts.get_mitos_stats(final_annotation, final_fasta) - else: - frameshifts = findFrameShifts.find_frameshifts(final_annotation) - contig_len, num_genes = findFrameShifts.get_gb_stats(final_annotation) - - if not frameshifts: - all_frameshifts = "No frameshift found" - elif len(frameshifts)==1: - all_frameshifts = "".join(frameshifts) - elif len(frameshifts)>1: - all_frameshifts = ";".join(frameshifts) - with open("contigs_stats.tsv", "w") as f: - f.write(f"# Related mitogenome is {rel_mito_len} bp long and has {rel_mito_num_genes} genes\n") - f.write("\t".join(["contig_id", "frameshifts_found", "annotation_file", - "length(bp)", "number_of_genes", "was_circular\n"])) - f.write("\t".join(["final_mitogenome", all_frameshifts, final_annotation, - str(contig_len), str(num_genes), str(is_circ)+"\n"])) - - # compare list of genes from related mito and final mitogenome - related_mito_annotation = args.g - related_mito_genes = get_genes_list(related_mito_annotation, "genbank") - #print(f"related_mito_genes: {related_mito_genes}") #debug - if final_annotation.endswith(".gff"): - final_mito_genes = get_genes_list(final_annotation, "gff") - elif final_annotation.endswith(".gb"): - final_mito_genes = get_genes_list(final_annotation, "genbank") - #print(f"final_mito_genes: {final_mito_genes}") #debug - - shared, final_mito_specific, related_mito_specific = compare_genes_dicts(final_mito_genes, related_mito_genes, True) - - with open("shared_genes.tsv", "w") as f: - f.write("\t".join(["contig_id", "shared_genes", "unique_to_contig", "unique_to_relatedMito\n"])) - f.write("\t".join(["final_mitogenome", str(shared), str(final_mito_specific), str(related_mito_specific) + "\n"])) - - ## Iterate over each contig and print its info (ID, framshifts and genbank file used - ## to search for the frameshifts) - - # get list with all individual contig stats files - contigs_stats_files = [] - for curr_file in os.listdir('.'): - if curr_file.endswith('.individual.stats'): - # skips addition of representative contig, which is the oi - # same as the final_mitogenome - if curr_file.split('.individual.stats')[0] != repr_contig_id: - contigs_stats_files.append(curr_file) - - with open("contigs_stats.tsv", "a") as outfile: - for contig_stats in contigs_stats_files: - with open(contig_stats, "r") as infile: - shutil.copyfileobj(infile, outfile) - - # add comparison of genes for all potential contigs - with open("contigs_stats.tsv", "r") as f: - next(f) # skips first line of `contigs_stats.tsv` - next(f) # skips second line of `contigs_stats.tsv` - next(f) # skips third line of `contigs_stats.tsv` - for line in f: - contig_id = line.split()[0] - contig_annotation = "" - if args.mitos: - for curr_file in os.listdir(f"{contig_id}.annotation"): - if curr_file.endswith('.rotated.gff'): - contig_annotation = os.path.join(f"{contig_id}.annotation", curr_file) - #contig_annotation = os.path.join(f"{contig_id}.annotation", "result.rotated.gff") - if contig_annotation: - contig_genes = get_genes_list(contig_annotation, "gff") - print(f"{contig_id} list of genes: {contig_genes}") - else: - contig_annotation = f"{contig_id}.mitogenome.rotated.gb" - contig_genes = get_genes_list(contig_annotation, "genbank") - print(f"{contig_id} list of genes: {contig_genes}") - - shared, contig_specific, related_mito_specific = compare_genes_dicts(contig_genes, related_mito_genes, True) - - with open("shared_genes.tsv", "a") as f2: - f2.write("\t".join([contig_id, str(shared), str(contig_specific), str(related_mito_specific) + "\n"])) - - # creating annotation plots - step += 1 - logging.info(f"{step}. Building annotation plots for all contigs") - annotation_plots = [] - - # add final_mitogenome annotation plot as first plot - if args.mitos: - plot_annotation.plot_annotation_mitos("final_mitogenome.gff", "final_mitogenome.annotation.png") - else: - plot_annotation.plot_annotation("final_mitogenome.gb", "final_mitogenome.annotation.png") - annotation_plots.append("final_mitogenome.annotation.png") - - # add other plots following contigs order in `contigs_stats.tsv` - with open("contigs_stats.tsv", "r") as f: - next(f) # skips first line of `contigs_stats.tsv` - next(f) # skips second line of `contigs_stats.tsv` - next(f) # skips third line of `contigs_stats.tsv` - for line in f: - contig_id = line.split()[0] - contig_annotation = "" - if args.mitos: - for curr_file in os.listdir(f"{contig_id}.annotation"): - if curr_file.endswith('.rotated.gff'): - contig_annotation = os.path.join(f"{contig_id}.annotation", curr_file) - #contig_annotation = os.path.join(f"{contig_id}.annotation", "result.rotated.gff") - if contig_annotation: - contig_annotation_plot = contig_annotation.replace(".gff", ".annotation.png") - else: - contig_annotation = f"{contig_id}.mitogenome.rotated.gb" - contig_annotation_plot = f"{contig_id}.mitogenome.rotated.annotation.png" - if os.path.isfile(contig_annotation): - # skip repr_conti_id since it will have the annotation plot created - # after the for loop, when passing `final_mitogenome.gb` as argument - if contig_id != repr_contig_id: - if args.mitos: - plot_annotation.plot_annotation_mitos(contig_annotation, contig_annotation_plot) - else: - plot_annotation.plot_annotation(contig_annotation, contig_annotation_plot) - annotation_plots.append(contig_annotation_plot) - else: - logging.warning(f"No {contig_annotation} for {contig_id}. Skipping annotation plot for this contig.") - - ## concatenating all annotation plots into single final plot - plot_annotation.merge_images(annotation_plots, "contigs_annotations.png") - - # creating coverage plot (v02) - if args.r: - step += 1 - logging.info(f"{step}. Building coverage distribution for each potential contig") - - #reads = shlex.split(args.r) - contigs_to_map = get_contigs_to_map() - logging.info(f"contigs_to_map: {contigs_to_map}") - logging.info(f"{step}.1 Mapping HiFi (filtered) reads against potential contigs:") - contigs_mapping = map_potential_contigs(["gbk.HiFiMapped.bam.fasta"], contigs_to_map, args.t, args.covMap) - logging.info(f"HiFi reads mapping done. Output file: {contigs_mapping}") - contigs_headers = get_contigs_headers("all_potential_contigs.fa") - mapped_contigs = split_mapping_by_contig(contigs_mapping, contigs_headers, threads=args.t) - logging.info(f"Splitting mapping file done. Individual mapped contigs: {mapped_contigs}") - - # creating coverage plot - logging.info(f"{step}.2 Creating coverage plot...") - coverage_plot_filename = create_coverage_plot(mapped_contigs, args.winSize, repr_contig=repr_contig_id) - logging.info("Coverage plots created.") - #plot_coverage.move_intermediate_files(["HiFi-vs-final_mitogenome.bam", "HiFi-vs-final_mitogenome.sorted.bam", - # "HiFi-vs-final_mitogenome.sorted.bam.bai", genome_filename, genome_windows_filename, windows_depth_filename]) - - step += 1 - logging.info(f"{step}. Building coverage distribution for final mitogenome") - logging.info(f"{step}.1 Mapping HiFi (filtered) reads against final_mitogenome.fasta:") - mapping_filename = map_final_mito(["gbk.HiFiMapped.bam.fasta"], args.t, args.covMap) - logging.info(f"HiFi reads mapping done. Output file: {mapping_filename}") - logging.info(f"{step}.2 Creating coverage plot...") - coverage_plot_filename = create_coverage_plot([repr_contig_id], args.winSize, repr_contig=repr_contig_id, is_final_mito=True) - logging.info(f"Coverage plot for final_mitogenome created...") - - # cleaning up working directory - cleanUpCWD.clean_up_work_dir(contigs_ids) - - logging.info("Pipeline finished!" ) - runtime= time.time() - start_time - logging.info(f"Run time: {runtime:.2f} seconds") - -if __name__ == '__main__': - main() +""" +This is the main MitoHiFi script. +""" + +import concurrent.futures +from concurrent.futures import wait +import logging +import shutil +import subprocess +import time +import warnings +import pandas as pd +import argparse +import parse_blast +from Bio import SeqIO +import sys +import os +import cleanUpCWD +from compareGenesLists import compare_genes_dicts +from createCoveragePlot import map_potential_contigs, map_final_mito, get_contigs_to_map, get_contigs_headers, split_mapping_by_contig, create_coverage_plot +import fetch +import fetch_mitos +import filterfasta +import findFrameShifts +import fixContigHeaders +import functools +import rotation +import getMitoLength +import getReprContig +from getGenesList import get_genes_list +from gfa2fa import gfa2fa +from modify_circularity_in_header import modify_circularity_in_header +from parallel_annotation import process_contig, process_contig_02 +from parallel_annotation_mitos import process_contig_mitos, process_contig_02_mitos +import shlex +from circularizationCheck import circularizationCheck, get_circo_mito +import alignContigs +import plot_coverage +import plot_annotation + +def main(): + + __version__ = '3.0.0' + start_time = time.time() + + parser = argparse.ArgumentParser(prog='MitoHiFi') + parser._action_groups.pop() + required = parser.add_argument_group('required arguments') + optional = parser.add_argument_group('optional arguments') + mutually_exclusive_group = required.add_mutually_exclusive_group(required=True) + mutually_exclusive_group.add_argument("-r", help= "-r: Pacbio Hifi Reads from your species", metavar='.fasta') + mutually_exclusive_group.add_argument("-c", help= "-c: Assembled fasta contigs/scaffolds to be searched to find mitogenome", metavar='.fasta') + required.add_argument("-f", help= "-f: Close-related Mitogenome is fasta format", required = "True", metavar='.fasta') + required.add_argument("-g", help= "-k: Close-related species Mitogenome in genebank format", required = "True", metavar='.gbk') + required.add_argument("-t", help= "-t: Number of threads for (i) hifiasm and (ii) the blast search", required = "True", type=int, metavar='') + optional.add_argument("-d", help="-d: debug mode to output additional info on log", action="store_true") + optional.add_argument("-a", help="-a: Choose between animal (default) or plant", default="animal", choices=["animal", "plant", "fungi"]) + optional.add_argument("-p", help="-p: Percentage of query in the blast match with close-related mito", type=int, default=50, metavar='') + optional.add_argument("-m", help="-m: Number of bits for HiFiasm bloom filter [it maps to -f in HiFiasm] (default = 0)", type=int, default=0, metavar='') + optional.add_argument("--max-read-len", help="Maximum lenght of read relative to related mito (default = 1.0x related mito length)", type=float, default=1.0) + optional.add_argument("--mitos", help="Use MITOS2 for annotation (opposed to default MitoFinder", action="store_true") + optional.add_argument('--circular-size', help='Size to consider when checking for circularization', type=int, default=220) + optional.add_argument('--circular-offset', help='Offset from start and finish to consider when looking for circularization', type=int, default=40) + optional.add_argument('-winSize', help='Size of windows to calculate coverage over the final_mitogenom', type=int, default=300) + optional.add_argument('-covMap', help='Minimum mapping quality to filter reads when building final coverage plot', type=int, default=0) + optional.add_argument('-v', '--version', action='version', version='%(prog)s ' + __version__) + optional.add_argument("-o", help="""-o: Organism genetic code following NCBI table (for mitogenome annotation): + 1. The Standard Code 2. The Vertebrate MitochondrialCode 3. The Yeast Mitochondrial Code + 4. The Mold,Protozoan, and Coelenterate Mitochondrial Code and the Mycoplasma/Spiroplasma Code 5. The Invertebrate Mitochondrial Code + 6. The Ciliate, Dasycladacean and Hexamita Nuclear Code 9. The Echinoderm and Flatworm Mitochondrial Code 10. The Euplotid Nuclear Code + 11. The Bacterial, Archaeal and Plant Plastid Code 12. The Alternative Yeast Nuclear Code 13. The Ascidian Mitochondrial Code + 14. The Alternative Flatworm Mitochondrial Code 16. Chlorophycean Mitochondrial Code 21. Trematode Mitochondrial Code + 22. Scenedesmus obliquus Mitochondrial Code 23. Thraustochytrium Mitochondrial Code 24. Pterobranchia Mitochondrial Code + 25. Candidate Division SR1 and Gracilibacteria Code + """, type=str, default='1', metavar='') + args = parser.parse_args() + + # Set log message format + FORMAT='%(asctime)s [%(levelname)s] %(message)s' + + if args.d: # If in debug mode + logging.basicConfig(level=logging.DEBUG, stream=sys.stdout, + format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S') + else: + logging.basicConfig(level=logging.INFO, stream=sys.stdout, + format=FORMAT, datefmt='%Y-%m-%d %H:%M:%S') + + # Welcome message + logging.info("Welcome to MitoHifi v2. Starting pipeline...") + logging.debug("Running MitoHiFi on debug mode.") + + # Measure the length of the related mitogenome + rel_mito_len = getMitoLength.get_mito_length(args.f) + rel_mito_num_genes = getMitoLength.get_mito_genes(args.g) + logging.info("Length of related mitogenome is: {} bp".format(rel_mito_len)) + logging.info("Number of genes on related mitogenome: {}".format(rel_mito_num_genes)) + + # If input are reads, map them to the related mitogenome and assemble the mapped ones + if args.r: + logging.info("Running MitoHifi pipeline in reads mode...") + logging.info("1. First we map your Pacbio HiFi reads to the close-related mitogenome") + minimap_cmd = ["minimap2", "-t", str(args.t), "--secondary=no", "-ax", "map-hifi", args.f] + shlex.split(args.r) + samtools_cmd = ["samtools", "view", "-@", str(args.t), "-b", "-F4", "-F", "0x800", "-o", "reads.HiFiMapped.bam"] + logging.info(" ".join(minimap_cmd) + " | " + " ".join(samtools_cmd)) + minimap = subprocess.Popen(minimap_cmd, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL) + #mapped_reads_f = open("reads.HiFiMapped.bam", "w") + subprocess.run(samtools_cmd, stderr=subprocess.STDOUT, stdin=minimap.stdout) + minimap.wait() + minimap.stdout.close() + + try: + f = open("reads.HiFiMapped.bam") + except FileNotFoundError: + sys.exit("""No reads.HiFiMapped.bam file. + An error may have occurred when mapping reads to the close-related mitogenome""") + finally: + f.close() + + logging.info("2. Now we filter out any mapped reads that are larger than the reference mitogenome to avoid NUMTS") + bam2fasta_cmd = ["samtools", "fasta", "reads.HiFiMapped.bam"] + logging.info("2.1 First we convert the mapped reads from BAM to FASTA format:") + logging.info(" ".join(bam2fasta_cmd) + " > gbk.HiFiMapped.bam.fasta") + mapped_fasta_f = open("gbk.HiFiMapped.bam.fasta", "w") + subprocess.run(bam2fasta_cmd, stdout=mapped_fasta_f, stderr=subprocess.DEVNULL) + before_filter = fetch.get_num_seqs("gbk.HiFiMapped.bam.fasta") + logging.info(f"Total number of mapped reads: {before_filter}") + + max_read_len = round(args.max_read_len * rel_mito_len) + logging.info(f"2.2 Then we filter reads that are larger than {max_read_len} bp") + filterfasta.filterFasta(minLength=max_read_len, neg=True, inStream="gbk.HiFiMapped.bam.fasta", + outPath="gbk.HiFiMapped.bam.filtered.fasta", log=False) + + try: + f = open("gbk.HiFiMapped.bam.filtered.fasta") + except FileNotFoundError: + sys.exit("""No gbk.HiFiMapped.bam.filtered.fasta file. + An error may have occurred when filtering reads larger than the reference mitogenome""") + finally: + f.close() + + after_filter = fetch.get_num_seqs("gbk.HiFiMapped.bam.filtered.fasta") + logging.info(f"Number of filtered reads: {after_filter}") + + logging.info("3. Now let's run hifiasm to assemble the mapped and filtered reads!") + + hifiasm_cmd = ["hifiasm", "--primary", "-t", str(args.t), "-f", str(args.m), + "-o", "gbk.HiFiMapped.bam.filtered.assembled", + "gbk.HiFiMapped.bam.filtered.fasta"] + + logging.info(" ".join(hifiasm_cmd)) + with open("hifiasm.log", "w") as hifiasm_log_f: + subprocess.run(hifiasm_cmd, stderr=subprocess.STDOUT, stdout=hifiasm_log_f) + + f1 = None + f2 = None + + try: + f1 = open("gbk.HiFiMapped.bam.filtered.assembled.p_ctg.gfa") + f2 = open("gbk.HiFiMapped.bam.filtered.assembled.a_ctg.gfa") + except FileNotFoundError: + sys.exit("""No gbk.HiFiMapped.bam.filtered.assembled.[a/p]_ctg.gfa file(s). + An error may have occurred when assembling reads with HiFiasm.""") + finally: + if f1: + f1.close() + if f2: + f2.close() + + # convert Hifiasm assemblies (primary and alternate) from GFA to FASTA format + gfa2fa("gbk.HiFiMapped.bam.filtered.assembled.p_ctg.gfa", "gbk.HiFiMapped.bam.filtered.assembled.p_ctg.fa") + gfa2fa("gbk.HiFiMapped.bam.filtered.assembled.a_ctg.gfa", "gbk.HiFiMapped.bam.filtered.assembled.a_ctg.fa") + + with open("hifiasm.contigs.fasta", "w") as hifiasm_f: + subprocess.run(["cat", "gbk.HiFiMapped.bam.filtered.assembled.p_ctg.fa", "gbk.HiFiMapped.bam.filtered.assembled.a_ctg.fa"], stdout=hifiasm_f) + + contigs = "hifiasm.contigs.fasta" + + else: + logging.info("Running MitoHifi pipeline in contigs mode...") + + logging.info("1. Fixing potentially conflicting FASTA headers") + original_contigs = args.c + fixContigHeaders.fix_headers(original_contigs, "fixed_header_contigs.fasta") + + os.remove(original_contigs) # remove original contig file + shutil.move("fixed_header_contigs.fasta", original_contigs) # replace original contigs file by the version that has the headers fixed + + # if contigs contain canu description of circularization, include that into the identifier + modify_circularity_in_header(original_contigs, "include_circ_to_contigs_IDs.contigs.fasta") + + os.remove(original_contigs) + shutil.move("include_circ_to_contigs_IDs.contigs.fasta", original_contigs) + + contigs = original_contigs + + # Set number for the current step (for improving understanding of the log) + # On reads mode, it should be 4; on contigs mode, 2 + if args.r: + step = 4 + else: + step = 2 + + logging.info(f"{step}. Let's run the blast of the contigs versus the close-related mitogenome") + + makeblastdb_cmd = ["makeblastdb", "-in", args.f, "-dbtype", "nucl"] + logging.info(f"{step}.1. Creating BLAST database:") + logging.info(" ".join(makeblastdb_cmd)) + subprocess.run(makeblastdb_cmd, stderr=subprocess.DEVNULL, stdout=subprocess.DEVNULL) + logging.info("Makeblastdb done.") + + blast_cmd = ["blastn", "-query", contigs, "-db", args.f, "-num_threads", str(args.t), + "-out", "contigs.blastn", "-outfmt", "6 std qlen slen"] + logging.info(f"{step}.2. Running blast of contigs against close-related mitogenome:") + logging.info(" ".join(blast_cmd)) + subprocess.run(blast_cmd, stderr=subprocess.STDOUT) + logging.info("Blast done.") + + try: + f = open("contigs.blastn") + except FileNotFoundError: + sys.exit("""No contigs.blastn file. + An error may have occurred when BLASTing contigs against close-related mitogenome""") + finally: + f.close() + + step += 1 + logging.info(f"{step}. Filtering BLAST output to select target sequences") + + #the next script parses a series of conditions to exclude blast with NUMTs. + if args.a == "plant": + ## if species is a plant, set minimum query percentage equal to 0% of related mito + ## and maximum query lenght 10 times the lenght of the related + parse_blast.parse_blast(query_perc=args.p, min_query_perc=0, max_query_len=10) + else: + ## if species is an animal, set minimum query percentage equal to 80% of related mito + ## and maximum query length 5 times the length of the related (default values from + ## parse_blast function + parse_blast.parse_blast(query_perc=args.p, max_query_len=5) + + # select contigs to be circularized + # first look for contigs in parsed_blast.txt + contigs_ids = parse_blast.get_contigs_ids("parsed_blast.txt") + + # if we don't find contigs in parse_blast.txt + # look for contigs in parsed_blast_all.txt + if len(contigs_ids) == 0: + contigs_ids = parse_blast.get_contigs_ids("parsed_blast_all.txt") + + # if we can't find any contigs even in parsed_blast_all.txt, then we exit the pipeline + if len(contigs_ids) == 0: + sys.exit("""Attention! +'parsed_blast.txt' and 'parsed_blast_all.txt' files are empty. +The pipeline has stopped !! You need to run further scripts to check if you have mito reads pulled to a large NUMT!""") + + logging.info("Filtering BLAST finished. A list of the filtered contigs was saved on ./contigs_filtering/contigs_ids.txt file") + + # records all contigs kept for the downstream steps in a file called 'contigs_ids.txt' + with open("contigs_ids.txt", "w") as f: + for contig_id in contigs_ids: + f.write(contig_id + "\n") + + # removes file that contains history of circularization of it already exists + try: + os.remove('all_contigs.circularisationCheck.txt') + except OSError: + pass + + step += 1 + logging.info(f"{step}. Now we are going to circularize, annotate and rotate each filtered contig. Those are potential mitogenome(s).") + + # Set maximum contig size accepted by mitofinder when annotating the contigs + max_contig_size = 5*rel_mito_len + + threads_per_contig = 1 + if args.t // len(contigs_ids) > 1: + threads_per_contig = args.t // len(contigs_ids) + + logging.debug(f"Threads per contig={threads_per_contig}") + logging.debug(f"Thresholds for circularization: circular size={args.circular_size} | circular offset={args.circular_offset}") + logging.debug(f"Thresholds for annotation (MitoFinder): maximum contig size={max_contig_size}") + + if args.mitos: + logging.info("Annotation will be done using MITOS2") + if args.a == "fungi": + refseq_db = "refseq89f" + else: + refseq_db = "refseq89m" + partial_process_contig = functools.partial(process_contig_mitos, threads_per_contig, + args.circular_size, args.circular_offset, + contigs, max_contig_size, args.g, args.o, refseq_db) + else: + logging.info("Annotation will be done using MitoFinder (default)") + partial_process_contig = functools.partial(process_contig, threads_per_contig, + args.circular_size, args.circular_offset, + contigs, max_contig_size, args.g, args.o) + + with concurrent.futures.ProcessPoolExecutor() as executor: + executor.map(partial_process_contig, contigs_ids) + + if args.mitos: + tRNA_ref = fetch_mitos.get_ref_tRNA() + else: + tRNA_ref = fetch.get_ref_tRNA() + logging.debug(f"tRNA to be used as reference for rotation: {tRNA_ref}") + + #logging.info("Finished successfuly!") # debug + #sys.exit(0) # debug + if args.mitos: + partial_process_contig_02 = functools.partial(process_contig_02_mitos, tRNA_ref, + threads_per_contig, args.circular_size, + args.circular_offset, contigs, max_contig_size, + args.g, args.o) + else: + partial_process_contig_02 = functools.partial(process_contig_02, tRNA_ref, + threads_per_contig, args.circular_size, + args.circular_offset, contigs, max_contig_size, + args.g, args.o) + + + with concurrent.futures.ProcessPoolExecutor() as executor: + executor.map(partial_process_contig_02, contigs_ids) + + # concatenates circularization histories from all contigs into + # a single concatenate history file + circularization_history_files = [] + for curr_file in os.listdir('.'): + if curr_file.endswith('.circularisationCheck.txt'): + circularization_history_files.append(curr_file) + logging.debug(f"circularization_history_files: {circularization_history_files}") # for debugging + with open("all_contigs.circularisationCheck.txt", "a") as outfile: + for circ_hist_file in circularization_history_files: + contig_id = circ_hist_file.split('.')[0] + with open(circ_hist_file, "r") as infile: + for line in infile: + outfile.write("\t".join([contig_id, line.strip()+"\n"])) + + step += 1 + #align final mitogenome rotated contigs + logging.info(f"{step}. Now the rotated contigs will be aligned") + # list all final rotated contigs + contigs_files = [] + for curr_file in os.listdir('.'): + if curr_file.endswith('mitogenome.rotated.fa'): + contigs_files.append(curr_file) + if not contigs_files: + sys.exit("""No rotated contigs found. + An error has possibly occurred during annotation and/or rotation of your contigs""") + # first concatenate all rotated contigs into a single multifasta file + logging.info("List of contigs that will be aligned: " + str(contigs_files) + "\n") + concat_fasta = alignContigs.concatenate_contigs(contigs_files) + # then run MAFFT alignment between the rotated contigs using the multifasta as input and clustal as output format + alignContigs.mafft_align(multifasta_file=concat_fasta, threads=args.t, clustal_format=True) + logging.info("Alignment done and saved at ./final_mitogenome_choice/all_mitogenomes.rotated.aligned.aln\n") + + try: + f = open("all_mitogenomes.rotated.aligned.aln") + except FileNotFoundError: + sys.exit("""No all_mitogenomes.rotated.aligned.aln file. + An error may have occurred when aligning the potential contigs""") + finally: + f.close() + + step += 1 + logging.info(f"{step}. Now we will choose the most representative contig" + "\n") + repr_contig_id, repr_contig_cluster = getReprContig.get_repr_contig("all_mitogenomes.rotated.fa", rel_mito_len, rel_mito_num_genes, args.t, args.d) + logging.info(f"Representative contig is {repr_contig_id} that belongs to {repr_contig_cluster}. This contig will be our final mitogenome. See all contigs and clusters in cdhit.out.clstr") + + # create copies for final mitogenome (repr contigs) at both + # FASTA and GENBANK formats + repr_contig_fasta = repr_contig_id + ".mitogenome.rotated.fa" + final_fasta = "final_mitogenome.fasta" + + if args.mitos: + contig_annotation_dir = f"{repr_contig_id}.annotation" + for file in os.listdir(contig_annotation_dir): + if file.endswith(".rotated.gff"): + repr_contig_annotation = os.path.join(contig_annotation_dir, file) + final_annotation = "final_mitogenome.gff" + else: + repr_contig_annotation = repr_contig_id + ".mitogenome.rotated.gb" + final_annotation = "final_mitogenome.gb" + + shutil.copy(repr_contig_fasta, final_fasta) + shutil.copy(repr_contig_annotation, final_annotation) + + #repr_contig_get_gb = ["mitofinder", "--new-genes", "--max-contig-size", + # str(max_contig_size), "-j", "final_mitogenome.annotation", + # "-a", repr_contig_fasta, "-r", args.g, "-o", args.o, "-p", str(args.p), + # "--circular-size", "8000"] + #subprocess.run(repr_contig_get_gb, stderr=subprocess.DEVNULL, stdout=subprocess.DEVNULL) + + #final_fasta = os.path.join("final_mitogenome.annotation", "final_mitogenome.annotation_MitoFinder_mitfi_Final_Results", "final_mitogenome.annotation_mtDNA_contig.fasta") + #final_gbk = os.path.join("final_mitogenome.annotation", "final_mitogenome.annotation_MitoFinder_mitfi_Final_Results", "final_mitogenome.annotation_mtDNA_contig.gb") + + # Generate contigs stats + step += 1 + logging.info(f"""{step}. Calculating final stats for final mitogenome and other potential contigs. + Stats will be saved on contigs_stats.tsv file.""") + ## Print first three lines (comment and header) + is_circ = getReprContig.get_circularization_info(repr_contig_id) + if args.mitos: + frameshifts = findFrameShifts.find_frameshifts_mitos(final_annotation) + contig_len, num_genes = findFrameShifts.get_mitos_stats(final_annotation, final_fasta) + else: + frameshifts = findFrameShifts.find_frameshifts(final_annotation) + contig_len, num_genes = findFrameShifts.get_gb_stats(final_annotation) + + if not frameshifts: + all_frameshifts = "No frameshift found" + elif len(frameshifts)==1: + all_frameshifts = "".join(frameshifts) + elif len(frameshifts)>1: + all_frameshifts = ";".join(frameshifts) + with open("contigs_stats.tsv", "w") as f: + f.write(f"# Related mitogenome is {rel_mito_len} bp long and has {rel_mito_num_genes} genes\n") + f.write("\t".join(["contig_id", "frameshifts_found", "annotation_file", + "length(bp)", "number_of_genes", "was_circular\n"])) + f.write("\t".join(["final_mitogenome", all_frameshifts, final_annotation, + str(contig_len), str(num_genes), str(is_circ)+"\n"])) + f.write("\t".join([repr_contig_id, all_frameshifts, final_annotation, + str(contig_len), str(num_genes), str(is_circ)+"\n"])) + + # compare list of genes from related mito and final mitogenome + related_mito_annotation = args.g + related_mito_genes = get_genes_list(related_mito_annotation, "genbank") + #print(f"related_mito_genes: {related_mito_genes}") #debug + if final_annotation.endswith(".gff"): + final_mito_genes = get_genes_list(final_annotation, "gff") + elif final_annotation.endswith(".gb"): + final_mito_genes = get_genes_list(final_annotation, "genbank") + #print(f"final_mito_genes: {final_mito_genes}") #debug + + shared, final_mito_specific, related_mito_specific = compare_genes_dicts(final_mito_genes, related_mito_genes, True) + + with open("shared_genes.tsv", "w") as f: + f.write("\t".join(["contig_id", "shared_genes", "unique_to_contig", "unique_to_relatedMito\n"])) + f.write("\t".join(["final_mitogenome", str(shared), str(final_mito_specific), str(related_mito_specific) + "\n"])) + + ## Iterate over each contig and print its info (ID, framshifts and genbank file used + ## to search for the frameshifts) + + # get list with all individual contig stats files + contigs_stats_files = [] + for curr_file in os.listdir('.'): + if curr_file.endswith('.individual.stats'): + # skips addition of representative contig, which is the + # same as the final_mitogenome + if curr_file.split('.individual.stats')[0] != repr_contig_id: + contigs_stats_files.append(curr_file) + + with open("contigs_stats.tsv", "a") as outfile: + for contig_stats in contigs_stats_files: + with open(contig_stats, "r") as infile: + shutil.copyfileobj(infile, outfile) + + # add comparison of genes for all potential contigs + with open("contigs_stats.tsv", "r") as f: + next(f) # skips first line of `contigs_stats.tsv` + next(f) # skips second line of `contigs_stats.tsv` + next(f) # skips third line of `contigs_stats.tsv` + for line in f: + contig_id = line.split()[0] + contig_annotation = "" + if args.mitos: + for curr_file in os.listdir(f"{contig_id}.annotation"): + if curr_file.endswith('.rotated.gff'): + contig_annotation = os.path.join(f"{contig_id}.annotation", curr_file) + #contig_annotation = os.path.join(f"{contig_id}.annotation", "result.rotated.gff") + if contig_annotation: + contig_genes = get_genes_list(contig_annotation, "gff") + print(f"{contig_id} list of genes: {contig_genes}") + else: + contig_annotation = f"{contig_id}.mitogenome.rotated.gb" + contig_genes = get_genes_list(contig_annotation, "genbank") + print(f"{contig_id} list of genes: {contig_genes}") + + shared, contig_specific, related_mito_specific = compare_genes_dicts(contig_genes, related_mito_genes, True) + + with open("shared_genes.tsv", "a") as f2: + f2.write("\t".join([contig_id, str(shared), str(contig_specific), str(related_mito_specific) + "\n"])) + + # creating annotation plots + step += 1 + logging.info(f"{step}. Building annotation plots for all contigs") + annotation_plots = [] + + # add final_mitogenome annotation plot as first plot + if args.mitos: + plot_annotation.plot_annotation_mitos("final_mitogenome.gff", "final_mitogenome.annotation.png") + else: + plot_annotation.plot_annotation("final_mitogenome.gb", "final_mitogenome.annotation.png") + annotation_plots.append("final_mitogenome.annotation.png") + + # add other plots following contigs order in `contigs_stats.tsv` + with open("contigs_stats.tsv", "r") as f: + next(f) # skips first line of `contigs_stats.tsv` + next(f) # skips second line of `contigs_stats.tsv` + next(f) # skips third line of `contigs_stats.tsv` + for line in f: + contig_id = line.split()[0] + contig_annotation = "" + if args.mitos: + for curr_file in os.listdir(f"{contig_id}.annotation"): + if curr_file.endswith('.rotated.gff'): + contig_annotation = os.path.join(f"{contig_id}.annotation", curr_file) + #contig_annotation = os.path.join(f"{contig_id}.annotation", "result.rotated.gff") + if contig_annotation: + contig_annotation_plot = contig_annotation.replace(".gff", ".annotation.png") + else: + contig_annotation = f"{contig_id}.mitogenome.rotated.gb" + contig_annotation_plot = f"{contig_id}.mitogenome.rotated.annotation.png" + if os.path.isfile(contig_annotation): + # skip repr_conti_id since it will have the annotation plot created + # after the for loop, when passing `final_mitogenome.gb` as argument + if contig_id != repr_contig_id: + if args.mitos: + plot_annotation.plot_annotation_mitos(contig_annotation, contig_annotation_plot) + else: + plot_annotation.plot_annotation(contig_annotation, contig_annotation_plot) + annotation_plots.append(contig_annotation_plot) + else: + logging.warning(f"No {contig_annotation} for {contig_id}. Skipping annotation plot for this contig.") + + ## concatenating all annotation plots into single final plot + plot_annotation.merge_images(annotation_plots, "contigs_annotations.png") + + # creating coverage plot (v02) + if args.r: + step += 1 + logging.info(f"{step}. Building coverage distribution for each potential contig") + + #reads = shlex.split(args.r) + contigs_to_map = get_contigs_to_map() + logging.info(f"contigs_to_map: {contigs_to_map}") + logging.info(f"{step}.1 Mapping HiFi (filtered) reads against potential contigs:") + contigs_mapping = map_potential_contigs(["gbk.HiFiMapped.bam.fasta"], contigs_to_map, args.t, args.covMap) + logging.info(f"HiFi reads mapping done. Output file: {contigs_mapping}") + contigs_headers = get_contigs_headers("all_potential_contigs.fa") + mapped_contigs = split_mapping_by_contig(contigs_mapping, contigs_headers, threads=args.t) + logging.info(f"Splitting mapping file done. Individual mapped contigs: {mapped_contigs}") + + # creating coverage plot + logging.info(f"{step}.2 Creating coverage plot...") + coverage_plot_filename = create_coverage_plot(mapped_contigs, args.winSize, repr_contig=repr_contig_id) + logging.info("Coverage plots created.") + #plot_coverage.move_intermediate_files(["HiFi-vs-final_mitogenome.bam", "HiFi-vs-final_mitogenome.sorted.bam", + # "HiFi-vs-final_mitogenome.sorted.bam.bai", genome_filename, genome_windows_filename, windows_depth_filename]) + + step += 1 + logging.info(f"{step}. Building coverage distribution for final mitogenome") + logging.info(f"{step}.1 Mapping HiFi (filtered) reads against final_mitogenome.fasta:") + mapping_filename = map_final_mito(["gbk.HiFiMapped.bam.fasta"], args.t, args.covMap) + logging.info(f"HiFi reads mapping done. Output file: {mapping_filename}") + logging.info(f"{step}.2 Creating coverage plot...") + coverage_plot_filename = create_coverage_plot([repr_contig_id], args.winSize, repr_contig=repr_contig_id, is_final_mito=True) + logging.info(f"Coverage plot for final_mitogenome created...") + + # cleaning up working directory + cleanUpCWD.clean_up_work_dir(contigs_ids) + + logging.info("Pipeline finished!" ) + runtime= time.time() - start_time + logging.info(f"Run time: {runtime:.2f} seconds") + +if __name__ == '__main__': + main() diff --git a/src/modify_circularity_in_header.py b/src/modify_circularity_in_header.py new file mode 100644 index 0000000..7a17ef0 --- /dev/null +++ b/src/modify_circularity_in_header.py @@ -0,0 +1,17 @@ +def modify_circularity_in_header(input_file, output_file): + with open(input_file, 'r') as input_fasta, open(output_file, 'w') as output_fasta: + for line in input_fasta: + if line.startswith('>'): # Check if the line is a header + header = line.strip()[1:] # Remove '>' and any leading/trailing whitespace + seq_id = header.split()[0] # + is_circular = "suggestCircular=yes" in header # Check if the suggestion is "yes" + is_linear = "suggestCircular=no" in header # Check if the suggestion is "no" + if is_circular: + new_header = f">{seq_id}c" + elif is_linear: + new_header = f">{seq_id}l" + else: + new_header = f">{header}" + output_fasta.write(new_header + "\n") + else: + output_fasta.write(line + "\n")