From 072bce051c1461537f2d97ff293fcf578236227b Mon Sep 17 00:00:00 2001 From: Kieran Reynolds Date: Wed, 25 Oct 2023 11:26:31 +0100 Subject: [PATCH] Add threads into more of the application --- hlsmallrna/genome_align.py | 528 ++++++++++++++++++++++++++++--------- 1 file changed, 399 insertions(+), 129 deletions(-) diff --git a/hlsmallrna/genome_align.py b/hlsmallrna/genome_align.py index ea45714..e9b3418 100644 --- a/hlsmallrna/genome_align.py +++ b/hlsmallrna/genome_align.py @@ -11,194 +11,464 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from collections import defaultdict import os import os.path import csv from subprocess import run -import pysam +import numpy as np -from Bio import SeqIO +from Bio import SeqIO, SeqRecord, Seq +from matplotlib import pyplot as plt from .config import get_config_key, mkdir_if_not_exists, do_log -def check_ids_unique(path_to_file, format): +def into_seqrecord(seqs): ''' - Make sure that the sequences in a FASTA or FASTQ file are unique + Convert a set of Biopython sequences into SeqRecord objects ''' - seen_ids = set() - for seq in SeqIO.parse(path_to_file, format): - if seq.id in seen_ids: - return False - else: - seen_ids.add(seq.id) + for i, seq in enumerate(seqs): + record = SeqRecord.SeqRecord(seq=Seq.Seq(seq), id=f'SmallRNA{i:07}', description='') + record.letter_annotations["phred_quality"] = [40] * len(record) - return True + yield record -def revcomp_input_file(smallRNA, quiet=0): +def make_fastqs_unique(fastq1, fastq2, output): ''' - Create a file containing the reverse complement of a file of small RNA + Take 2 FastQ files, combine and remove duplicates from the second ''' + unique_sequences = [] + cds_sequences = defaultdict(lambda: 0) - do_log(quiet, '====> Reverse complimenting RNA...') - PATH_TO_REVCOMP = os.path.join(get_config_key('general', 'output_directory'), 'revcomp_rna.fastq') + for sequence in SeqIO.parse(fastq1, 'fastq'): + unique_sequences.append(str(sequence.seq)) - def do_revcomp(seq): - revcomp = seq.reverse_complement() - revcomp.id = seq.id + try: + for sequence in SeqIO.parse(fastq2, 'fastq'): + cds_sequences[str(sequence.seq)] += 1 + except FileNotFoundError: + pass - return revcomp + for key in cds_sequences.keys(): + if key not in unique_sequences: + for i in range(cds_sequences[key]): + unique_sequences.append(key) - if not check_ids_unique(smallRNA, 'fastq'): - print('Error: duplicate ID found in your small RNA file, make IDs unique and try again') - exit(1) + SeqIO.write(into_seqrecord(unique_sequences), output, 'fastq') - seqs = SeqIO.parse(smallRNA, 'fastq') - revcomps = map(do_revcomp, seqs) - SeqIO.write(revcomps, PATH_TO_REVCOMP, 'fastq') +def make_fastq_overlap_only(fastq1, fastq2, output): + ''' + Create a fastq with only sequences that appear in both fastq files + ''' + fastq1_seqs = [] + + for sequence in SeqIO.parse(fastq1, 'fastq'): + fastq1_seqs.append(str(sequence.seq)) - return PATH_TO_REVCOMP + try: + fastq2_seqs = defaultdict(lambda: 0) + for sequence in SeqIO.parse(fastq2, 'fastq'): + fastq2_seqs[str(sequence.seq)] += 1 -def find_targets(smallRNA, possible_target_list, threads=4, min_seq_length=2, mismatches_allowed=0, quiet=0): + for seq in fastq2_seqs.keys(): + if seq not in fastq1_seqs: + for i in range(fastq2_seqs[seq]): + fastq1_seqs.append(seq) + + result_seqs = fastq1_seqs + except FileNotFoundError as e: + result_seqs = fastq1_seqs + + SeqIO.write(into_seqrecord(result_seqs), output, 'fastq') + +def remove_symbols_from_header(fasta): ''' - Align the small RNA against the lists of possible targets with bowtie2 and - analyse the output + Remove @s from the FASTA header before doing the alignment ''' + for seq in SeqIO.parse(fasta, 'fasta'): + seq.id = seq.id.replace('@', '_') - CWD = os.getcwd() - INDEX_DIR = os.path.join(get_config_key('general', 'output_directory'), 'bowtie_indexes') - SAM_DIR = os.path.join(CWD, get_config_key('general', 'output_directory'), 'target_alignments') + yield seq - sam_files = [] +def align_to_genome(genome, small_rnas, cds, quiet=0, threads=4, small_rna_filetype='fastq', mismatches=None): + ''' + Align the small RNAs to the genome and filter out any that are unsuccessful + ''' - mkdir_if_not_exists(SAM_DIR) - mkdir_if_not_exists(INDEX_DIR) - os.chdir(INDEX_DIR) + INTERMEDIATE_SAM = os.path.join(get_config_key('general', 'output_directory'), 'mapped_sequences.sam') + INTERMEDIATE_BAM = os.path.join(get_config_key('general', 'output_directory'), 'mapped_sequences.bam') + UNMAPPED_BAM = os.path.join(get_config_key('general', 'output_directory'), 'unmapped_sequences.bam') + RESULT_FASTQ = os.path.join(get_config_key('general', 'output_directory'), 'genome_mapped_sequences.fastq') + RESULT_UNMAPPED_FASTQ = os.path.join(get_config_key('general', 'output_directory'), 'genome_unmapped_sequences.fastq') + INDEX_DIRECTORY = os.path.join(get_config_key('general', 'output_directory'), 'bbmap_index') - for target in possible_target_list: - if not check_ids_unique(os.path.join(CWD, target), 'fasta'): - print(f'Error: duplicate ID found in your target file: {target}. Make IDs unique and try again') - exit(1) + CDS_INTERMEDIATE_SAM = os.path.join(get_config_key('general', 'output_directory'), 'cds_sequences.sam') + CDS_INTERMEDIATE_BAM = os.path.join(get_config_key('general', 'output_directory'), 'cds_sequences.bam') + CDS_UNMAPPED_BAM = os.path.join(get_config_key('general', 'output_directory'), 'cds_unmapped_sequences.bam') + CDS_UNMAPPED_FASTQ = os.path.join(get_config_key('general', 'output_directory'), 'cds_unmapped_sequences.fastq') + CDS_RESULT_FASTQ = os.path.join(get_config_key('general', 'output_directory'), 'cds_sequences.fastq') - do_log(quiet, f'====> Builing index for {target}') - INDEX_NAME = os.path.basename(target) + '_index' + FINAL_FASTQ = os.path.join(get_config_key('general', 'output_directory'), 'mapped_sequences.fastq') + FINAL_UNMAPPED_FASTQ = os.path.join(get_config_key('general', 'output_directory'), 'unmapped_sequences.fastq') - bowtie2_build_command = [ - get_config_key('cli-tools', 'bowtie2', 'path_to_bowtie2_build'), - '--threads', threads, - os.path.join(CWD, target), - INDEX_NAME - ] + mkdir_if_not_exists(INDEX_DIRECTORY) + + if small_rna_filetype == 'fasta': + NEW_SMALL_RNAS = os.path.join(get_config_key('general', 'output_directory'), 'corrected_headers.fasta') + SeqIO.write(remove_symbols_from_header(small_rnas), NEW_SMALL_RNAS, 'fasta') + + small_rnas = NEW_SMALL_RNAS - bowtie2_build_command = bowtie2_build_command + get_config_key('cli-tools', 'bowtie2', 'bowtie2_build_params') + bbmap_build_index = [ + get_config_key('cli-tools', 'bowtie2', 'path_to_bowtie2_build'), + '--threads', threads, + genome, + os.path.join(INDEX_DIRECTORY, 'genome_index') + ] - run(bowtie2_build_command, capture_output=(quiet != 0)) + bbmap_build_index = bbmap_build_index + get_config_key('cli-tools', 'bowtie2', 'bowtie2_build_params') - do_log(quiet, f'====> Aligning small RNA against {target}') + cds_build_index = [ + get_config_key('cli-tools', 'bowtie2', 'path_to_bowtie2_build'), + '--threads', threads, + cds, + os.path.join(INDEX_DIRECTORY, 'cds_index') + ] - sam_files.append(os.path.join(SAM_DIR, INDEX_NAME + '.sam')) + cds_build_index = cds_build_index + get_config_key('cli-tools', 'bowtie2', 'bowtie2_build_params') - if mismatches_allowed > 0: - bowtie2_align_command = [ + if mismatches is not None: + bbmap_align_reads = [ + get_config_key('cli-tools', 'bowtie2', 'path_to_bowtie2'), + '--threads', threads, + '-L', '18', + '--no-1mm-upfront', + '--score-min', 'L,' + str(-mismatches) + ',0', + '--end-to-end', + '--mp', '1,1', + '--ignore-quals', + '--rdg', '9,1', + '--rfg', '9,1', + '-x', os.path.join(INDEX_DIRECTORY, 'genome_index'), + '-U', small_rnas, + '-S', INTERMEDIATE_SAM + ] + elif mismatches == 0: + bbmap_align_reads = [ get_config_key('cli-tools', 'bowtie2', 'path_to_bowtie2'), '--threads', threads, - '-L', str(min_seq_length), + '-L', '18', '--no-1mm-upfront', - '--score-min', 'L,-' + str(mismatches_allowed) + ',0', + '--score-min', 'L,0,0', '--end-to-end', - '--norc', - '--mp', '1,1', - '--ignore-quals', - '--rdg', '9,1', - '--rfg', '9,1', - '-x', INDEX_NAME, - '-U', os.path.join(CWD, smallRNA), - '-S', sam_files[-1], - '-a', - '-p', get_config_key('general', 'threads') + '-M', '0', + '-x', os.path.join(INDEX_DIRECTORY, 'genome_index'), + '-U', small_rnas, + '-S', INTERMEDIATE_SAM ] - else: - bowtie2_align_command = [ + else: + bbmap_align_reads = [ + get_config_key('cli-tools', 'bowtie2', 'path_to_bowtie2'), + '--threads', threads, + '-L', '18', + '-x', os.path.join(INDEX_DIRECTORY, 'genome_index'), + '-U', small_rnas, + '-S', INTERMEDIATE_SAM + ] + + if small_rna_filetype == 'fasta': + bbmap_align_reads.append('-f') + + bbmap_align_reads = bbmap_align_reads + get_config_key('cli-tools', 'bowtie2', 'bowtie2_params') + + if mismatches is not None: + cds_align_reads = [ + get_config_key('cli-tools', 'bowtie2', 'path_to_bowtie2'), + '--threads', threads, + '-L', '18', + '--no-1mm-upfront', + '--score-min', 'L,' + str(-mismatches) + ',0', + '--end-to-end', + '--mp', '1,1', + '--ignore-quals', + '--rdg', '9,1', + '--rfg', '9,1', + '-x', os.path.join(INDEX_DIRECTORY, 'cds_index'), + '-U', small_rnas, + '-S', CDS_INTERMEDIATE_SAM + ] + elif mismatches == 0: + cds_align_reads = [ get_config_key('cli-tools', 'bowtie2', 'path_to_bowtie2'), '--threads', threads, - '-L', str(min_seq_length), + '-L', '18', '--no-1mm-upfront', '--score-min', 'L,0,0', '--end-to-end', - '--norc', '-M', '0', - '-x', INDEX_NAME, - '-U', os.path.join(CWD, smallRNA), - '-S', sam_files[-1], - '-a' + '-x', os.path.join(INDEX_DIRECTORY, 'cds_index'), + '-U', small_rnas, + '-S', CDS_INTERMEDIATE_SAM ] + else: + cds_align_reads = [ + get_config_key('cli-tools', 'bowtie2', 'path_to_bowtie2'), + '--threads', threads, + '-L', '18', + '-x', os.path.join(INDEX_DIRECTORY, 'cds_index'), + '-U', small_rnas, + '-S', CDS_INTERMEDIATE_SAM + ] + + if small_rna_filetype == 'fasta': + cds_align_reads.append('-f') + + cds_align_reads = cds_align_reads + get_config_key('cli-tools', 'bowtie2', 'bowtie2_params') + + samtools_view = [ + get_config_key('cli-tools', 'samtools', 'path_to_samtools'), + 'view', + '-h', + '-b', + '-F', '4', + '-o', INTERMEDIATE_BAM, + INTERMEDIATE_SAM + ] + + samtools_view = samtools_view + get_config_key('cli-tools', 'samtools', 'samtools_view_params') + + bedtools_bamtofastq_command = [ + get_config_key('cli-tools', 'samtools', 'path_to_samtools'), + 'fastq', + INTERMEDIATE_BAM, + '-o', RESULT_FASTQ, + '-0', RESULT_FASTQ + ] + + cds_samtools_view = [ + get_config_key('cli-tools', 'samtools', 'path_to_samtools'), + 'view', + '-h', + '-b', + '-F', '4', + '-o', CDS_INTERMEDIATE_BAM, + CDS_INTERMEDIATE_SAM + ] + + cds_samtools_view = cds_samtools_view + get_config_key('cli-tools', 'samtools', 'samtools_view_params') + + cds_bedtools_bamtofastq_command = [ + get_config_key('cli-tools', 'samtools', 'path_to_samtools'), + 'fastq', + CDS_INTERMEDIATE_BAM, + '-o', CDS_RESULT_FASTQ, + '-0', CDS_RESULT_FASTQ + ] + + unmapped_samtools_view = [ + get_config_key('cli-tools', 'samtools', 'path_to_samtools'), + 'view', + '-h', + '-b', + '-f', '4', + '-o', UNMAPPED_BAM, + INTERMEDIATE_SAM + ] + + unmapped_bedtools_bamtofastq_command = [ + get_config_key('cli-tools', 'samtools', 'path_to_samtools'), + 'fastq', + UNMAPPED_BAM, + '-o', RESULT_UNMAPPED_FASTQ, + '-0', RESULT_UNMAPPED_FASTQ + ] + + cds_unmapped_samtools_view = [ + get_config_key('cli-tools', 'samtools', 'path_to_samtools'), + 'view', + '-h', + '-b', + '-f', '4', + '-o', CDS_UNMAPPED_BAM, + CDS_INTERMEDIATE_SAM + ] + + cds_unmapped_bedtools_bamtofastq_command = [ + get_config_key('cli-tools', 'samtools', 'path_to_samtools'), + 'fastq', + CDS_UNMAPPED_BAM, + '-o', CDS_UNMAPPED_FASTQ, + '-0', CDS_UNMAPPED_FASTQ + ] + + do_log(quiet, '====> Building BBMap Index') + run(bbmap_build_index, capture_output=(quiet != 0)) + if cds is not None: + run(cds_build_index, capture_output=(quiet != 0)) + + do_log(quiet, '====> Aligning reads to the genome') + run(bbmap_align_reads, capture_output=(quiet != 0)) + if cds is not None: + run(cds_align_reads, capture_output=(quiet != 0)) + + do_log(quiet, '====> Converting to FASTQ') + run(samtools_view, capture_output=(quiet != 0)) + run(bedtools_bamtofastq_command, capture_output=(quiet != 0)) + run(unmapped_samtools_view, capture_output=(quiet != 0)) + run(unmapped_bedtools_bamtofastq_command, capture_output=(quiet != 0)) + if cds is not None: + run(cds_samtools_view, capture_output=(quiet != 0)) + run(cds_bedtools_bamtofastq_command, capture_output=(quiet != 0)) + run(cds_unmapped_samtools_view, capture_output=(quiet != 0)) + run(cds_unmapped_bedtools_bamtofastq_command, capture_output=(quiet != 0)) + + make_fastqs_unique(RESULT_FASTQ, CDS_RESULT_FASTQ, FINAL_FASTQ) + make_fastq_overlap_only(RESULT_UNMAPPED_FASTQ, CDS_UNMAPPED_FASTQ, FINAL_UNMAPPED_FASTQ) + + create_stats_table(small_rnas, get_config_key('general', 'output_directory'), small_rna_filetype=small_rna_filetype) + + return FINAL_FASTQ + +def create_stats_table(smallrna, output_dir, small_rna_filetype='fastq'): + ''' + Count sequences to create the statistics file + ''' + + input_reads = 0 + for read in SeqIO.parse(smallrna, small_rna_filetype): + input_reads += 1 + + overall_mapped = 0 + for read in SeqIO.parse(os.path.join(output_dir, 'mapped_sequences.fastq'), 'fastq'): + overall_mapped += 1 + + overall_unmapped = input_reads - overall_mapped + + genome_mapped = 0 + for read in SeqIO.parse(os.path.join(output_dir, 'genome_mapped_sequences.fastq'), 'fastq'): + genome_mapped += 1 + + cds_mapped = 0 + try: + for read in SeqIO.parse(os.path.join(output_dir, 'cds_sequences.fastq'), 'fastq'): + cds_mapped += 1 + except: + pass + + cds_unmapped = input_reads - cds_mapped + genome_unmapped = input_reads - genome_mapped + + with open(os.path.join(output_dir, 'report.tsv'), 'w') as f: + writer = csv.writer(f, delimiter='\t') + + writer.writerow(['Total Reads', input_reads]) + writer.writerow(['Total Mapped', overall_mapped]) + writer.writerow(['Percentage Mapped', overall_mapped / input_reads * 100]) + writer.writerow(['Total Unmapped', overall_unmapped]) + writer.writerow(['Percentage Unmapped', overall_unmapped / input_reads * 100]) + writer.writerow(['Mapped to Genome', genome_mapped]) + writer.writerow(['Percentage Mapped to Genome', genome_mapped / input_reads * 100]) + writer.writerow(['Unmapped to Genome', genome_unmapped]) + writer.writerow(['Percentage Unmapped to Genome', genome_unmapped / input_reads * 100]) + writer.writerow(['Mapped to CDS', cds_mapped]) + writer.writerow(['Percentage Mapped to CDS', cds_mapped / input_reads * 100]) + writer.writerow(['Unmapped to CDS', cds_unmapped]) + writer.writerow(['Percentage Unmapped to CDS', cds_unmapped / input_reads * 100]) + +def bin_rna_size(rna_file, min_length, max_length, quiet=0): + ''' + Bin the RNAs in the RNA file into new files by length + ''' + + BINS_DIRECTORY = os.path.join(get_config_key('general', 'output_directory'), 'binned_rna') + mkdir_if_not_exists(BINS_DIRECTORY) + + do_log(quiet, '====> Sorting RNA into arrays by length') + rnas = sorted(SeqIO.parse(rna_file, 'fastq'), key=lambda x: len(x)) + + table_path = os.path.join(get_config_key('general', 'output_directory'), 'rna_length_report.csv') - bowtie2_align_command = bowtie2_align_command + get_config_key('cli-tools', 'bowtie2', 'bowtie2_params') + last_length = 0 + current_rnas = [] + start_base_count = {'A': 0, 'C': 0, 'G': 0, 'T': 0} + with open(table_path, 'w') as table_file: + writer = csv.writer(table_file) + writer.writerow(['RNA Length', 'A', 'C', 'G', 'U', 'All Frequency']) - run(bowtie2_align_command, capture_output=(quiet != 0)) + for rna_seq in rnas: + if len(rna_seq) != last_length: + if len(current_rnas) > 0 and last_length >= min_length and last_length <= max_length: + filename = os.path.join(BINS_DIRECTORY, 'length' + str(last_length) + '.fastq') + with open(filename, 'a') as f: + SeqIO.write(current_rnas, f, 'fastq') - os.chdir(CWD) + writer.writerow([ + last_length, + start_base_count['A'], start_base_count['C'], + start_base_count['G'], start_base_count['T'], + len(current_rnas) + ]) - return sam_files + current_rnas = [] + start_base_count = {'A': 0, 'C': 0, 'G': 0, 'T': 0} -def build_summary_files(sam_files, quiet=0): + last_length = len(rna_seq) + + current_rnas.append(rna_seq) + try: + start_base_count[rna_seq[0]] += 1 + except KeyError: + do_log(quiet, f'Warning : An RNA started with ambiguous nt {rna_seq[0]}') + + return table_path + +def graph_length(path_to_table): ''' - Takes the alignment SAM and extracts the sequences into a summary file and - a set of FASTA files + Create the graph showing the length and starting base of all the RNA ''' - do_log(quiet, '====> Removing unaligend sequences and building summary') - - with open(os.path.join(get_config_key('general', 'output_directory'), 'rna_target_list.tsv'), 'w') as tablefile: - writer = csv.writer(tablefile, delimiter='\t') - writer.writerow(['Query Sequence', 'Target File', 'Target Sequence', 'Start Coordinate', 'End Coordinate', 'Strand']) + with open(path_to_table) as f: + reader = csv.reader(f) + next(reader) - for filename in sam_files: - samfile = pysam.AlignmentFile(filename, 'r') - fileheader = samfile.header + labels = [] + bases = {'A': [], 'C': [], 'G': [], 'T': []} + + for line in reader: + labels.append(str(line[0])) - for read in samfile: - if not read.is_unmapped: - query_name = read.query_name - r_name = fileheader.get_reference_name(read.reference_id) - start = read.reference_start - end = read.reference_end + bases['A'].append(int(line[1])) + bases['C'].append(int(line[2])) + bases['G'].append(int(line[3])) + bases['T'].append(int(line[4])) - if read.is_reverse: - strand = 'antisense' - else: - strand = 'sense' + total_rna = sum(bases['A']) + sum(bases['C']) + sum(bases['G']) + sum(bases['T']) + for key in bases.keys(): + bases[key] = np.array(bases[key]) / total_rna * 100 - writer.writerow([query_name, os.path.abspath(filename), r_name, start, end, strand]) + with open(os.path.join(get_config_key('general', 'output_directory'), 'baseplot_data.csv'), 'w') as f: + writer = csv.writer(f) - try: - remove_duplicate_command = [ - get_config_key('cli-tools', 'samtools', 'path_to_samtools'), - 'view', - '-o', filename + '.nodup.sam', - filename - ] - - remove_duplicate_command = remove_duplicate_command + get_config_key('cli-tools', 'samtools', 'samtools_view_params') - - result = run(remove_duplicate_command) - result.check_returncode() - filename = filename + '.nodup.sam' - - bam_to_fastq_command = [ - get_config_key('cli-tools', 'samtools', 'path_to_samtools'), - 'fastq', - filename - ] - - bam_to_fastq_command = bam_to_fastq_command + get_config_key('cli-tools', 'samtools', 'samtools_fastq_params') - - result = run(bam_to_fastq_command, capture_output=True) - result.check_returncode() - with open(filename + '.fastq', 'wb') as f: - f.write(result.stdout) - - except: - print('SAMTOOLS convertion failed, skipping creating filtered and FASTQ files...') \ No newline at end of file + writer.writerow(['RNA Length'] + labels) + writer.writerow(['A'] + list(bases['A'])) + writer.writerow(['T'] + list(bases['T'])) + writer.writerow(['C'] + list(bases['C'])) + writer.writerow(['G'] + list(bases['G'])) + + fig, ax = plt.subplots(figsize=(11.69, 8.27)) + + ax.bar(labels, bases['A'], label='A', align='edge', width=0.8) + ax.bar(labels, bases['C'], bottom=bases['A'], label='C', align='edge', width=0.8) + ax.bar(labels, bases['G'], bottom=bases['A'] + bases['C'], label='G', align='edge', width=0.8) + ax.bar(labels, bases['T'], bottom=bases['A'] + bases['C'] + bases['G'], label='U', align='edge', width=0.8) + + ax.set_xticklabels(labels, fontsize=7) + ax.set_xlabel('Length of small RNA') + ax.set_ylabel('Percentage of small RNA') + + plt.legend(title='First Nucliotide') + + plt.savefig(os.path.join(get_config_key('general', 'output_directory'), 'baseplot.png')) \ No newline at end of file