From d2d97ca5549ccc8e069f1bc9e987524dba654529 Mon Sep 17 00:00:00 2001 From: hiruna72 Date: Tue, 5 Mar 2024 19:43:08 +1100 Subject: [PATCH] signal to read changes --- src/calculate_offsets.py | 11 ++- src/metric.py | 32 +++++++ src/plot.py | 175 +++++++++++++++++++++++++-------------- src/plot_utils.py | 1 + 4 files changed, 153 insertions(+), 66 deletions(-) diff --git a/src/calculate_offsets.py b/src/calculate_offsets.py index 686a092..9635697 100644 --- a/src/calculate_offsets.py +++ b/src/calculate_offsets.py @@ -207,6 +207,7 @@ def run(args): model_ = obj[header_line_count:] for line in model_: values_ = line.split('\t') + values_[0] = values_[0].replace('U', 'T') model[values_[0]] = float(values_[1]) test_array = [] for base_offset in range(0, kmer_length): @@ -217,11 +218,17 @@ def run(args): max_offset, max_dist = calculate_distance(kmer_length, test_array) forward_shift = -1 * max_offset reverse_shift = -1 * (kmer_length - max_offset - 1) - print("kmer length: {}\nbest base shift (offset) for forward mapped reads: {}\nbest base shift (offset) for reverse mapped reads: {}\ndifference between highest and lowest medians of the distributions: {}".format(kmer_length, forward_shift, reverse_shift, round(max_dist, 4))) + if args.rna: + print("RNA\nkmer length: {}\nbest base shift (offset) for forward mapped reads: {}\nbest base shift (offset) for reverse mapped reads: {}\ndifference between highest and lowest medians of the distributions: {}".format(kmer_length, reverse_shift, forward_shift, round(max_dist, 4))) + else: + print("DNA\nkmer length: {}\nbest base shift (offset) for forward mapped reads: {}\nbest base shift (offset) for reverse mapped reads: {}\ndifference between highest and lowest medians of the distributions: {}".format(kmer_length, forward_shift, reverse_shift, round(max_dist, 4))) if args.output != "": output_pdf = PdfPages(args.output) print("output file: {}".format(args.output)) - plt_title = "{}\nkmer length: {}\ndifference between highest and lowest medians of the distributions: {}\nbest base shift (offset) for forward mapped reads (shown below): {}\nbest base shift (offset) for reverse mapped reads (derived): {}\n".format(args.tag_name, kmer_length, str(round(max_dist, 4)), forward_shift, reverse_shift) + if args.rna: + plt_title = "RNA\n{}\nkmer length: {}\ndifference between highest and lowest medians of the distributions: {}\nbest base shift (offset) for forward mapped reads (derived): {}\nbest base shift (offset) for reverse mapped reads (shown below): {}\n".format(args.tag_name, kmer_length, str(round(max_dist, 4)), reverse_shift, forward_shift) + else: + plt_title = "DNA\n{}\nkmer length: {}\ndifference between highest and lowest medians of the distributions: {}\nbest base shift (offset) for forward mapped reads (shown below): {}\nbest base shift (offset) for reverse mapped reads (derived): {}\n".format(args.tag_name, kmer_length, str(round(max_dist, 4)), forward_shift, reverse_shift) plot_distributions(kmer_length, test_array, output_pdf, plt_title) output_pdf.close() else: diff --git a/src/metric.py b/src/metric.py index 575d879..473f4b5 100644 --- a/src/metric.py +++ b/src/metric.py @@ -634,6 +634,17 @@ def run(args): if i < len(ref_pos_dic) - 1: print("\t", end="") print() + if args.output_current_1: + for i in range(0, len(ref_pos_dic)): + sublist = ref_pos_dic[i] + if not any(np.isnan(x) for x in sublist) and len(sublist) > 0: + rounded_sublist = [round(x, 2) for x in sublist] + print(",".join(map(str, rounded_sublist)), end="") + else: + continue + if i < len(ref_pos_dic) - 1: + print(",", end="") + print() if args.output_current_column: for i in range(0, len(ref_pos_dic)): sublist = ref_pos_dic[i] @@ -846,6 +857,26 @@ def run(args): if i < len(ref_pos_dic) - 1: print("\t", end="") print() + if args.output_current_1: + for i in range(0, len(ref_pos_dic)): + sublist = ref_pos_dic[i] + if not any(np.isnan(x) for x in sublist) and len(sublist) > 0: + rounded_sublist = [round(x, 2) for x in sublist] + print(",".join(map(str, rounded_sublist)), end="") + else: + continue + if i < len(ref_pos_dic) - 1: + print(",", end="") + print() + if args.output_current_column: + for i in range(0, len(ref_pos_dic)): + sublist = ref_pos_dic[i] + if not any(np.isnan(x) for x in sublist) and len(sublist) > 0: + rounded_sublist = [round(x, 2) for x in sublist] + if i in column_raw_samples: + column_raw_samples[i] += rounded_sublist + else: + column_raw_samples[i] = rounded_sublist else: raise Exception("Error: You should not have ended up here. Please check your arguments") @@ -897,6 +928,7 @@ def argparser(): parser.add_argument('--extend_0', required=False, action='store_true', help="print matches, deletions, insertions arrays") parser.add_argument('--extend_1', required=False, action='store_true', help="print ss string for the given region") parser.add_argument('--output_current', required=False, action='store_true', help="print signal values") + parser.add_argument('--output_current_1', required=False, action='store_true', help="print signal values without any nans/tabs") parser.add_argument('--output_current_column', required=False, action='store_true', help="print signal values per column") return parser diff --git a/src/plot.py b/src/plot.py index ec54eb2..2fb8a1e 100644 --- a/src/plot.py +++ b/src/plot.py @@ -82,7 +82,9 @@ def plot_function(p, read_id, signal_tuple, sig_algn_data, fasta_sequence, base_ base_box_details = {'left': [], 'right': [], 'fill_color': []} flag_base_index_bound = 0 + k = 0 for i in moves: + # print("{}: {}".format(k, i)) previous_location = location_plot previous_x_coordinate = x_coordinate if 'D' in i: @@ -169,6 +171,8 @@ def plot_function(p, read_id, signal_tuple, sig_algn_data, fasta_sequence, base_ break if x_coordinate - initial_x_coordinate > draw_data["sig_plot_limit"]: break + k += 1 + line_segment_source = ColumnDataSource(dict(x=line_segment_x, x1=line_segment_x, y=[y_min]*len(line_segment_x), y1=[y_max]*len(line_segment_x))) glyph = Segment(x0="x", y0="y", x1="x1", y1="y1", line_color="saddlebrown", line_width=1) @@ -566,6 +570,19 @@ def run(args): for read in Fastq(args.file): name = read.name.split()[0] sequence_reads[name] = read + if args.region != "": + args_region = re.sub(',', '', args.region) + # print(args_region) + # pattern = re.compile("^[a-z]+[0-9]+\:[0-9]+\-[0-9]+") + pattern = re.compile("^[0-9]+\-[0-9]+") + if not pattern.match(args_region): + raise Exception("Error: region provided is not in correct format") + args_ref_start = int(args.region.split("-")[0]) + args_ref_end = int(args.region.split("-")[1]) + + else: + args_ref_start = None + args_ref_end = None for paf_record in parse_paf(handle): if paf_record.query_name != paf_record.target_name: @@ -580,73 +597,101 @@ def run(args): if 'ss' not in paf_record.tags: raise Exception("Error: ss string is missing for the read_id {} in {}".format(read_id, args.alignment)) + if 'ss' not in paf_record.tags: + raise Exception("Error: ss string is missing for the read_id {} in {}".format(read_id, args.alignment)) + moves_string = paf_record.tags['ss'][2] + data_is_rna = 0 + start_index = paf_record.query_start + end_index = paf_record.query_end + ref_seq_len = paf_record.target_end - paf_record.target_start + reference_start = paf_record.target_start if paf_record.target_start > paf_record.target_end: # if RNA start_kmer>end_kmer in paf data_is_rna = 1 if not args.rna: print("Info: data is detected as RNA") raise Exception("Error: data is not specified as RNA. Please provide the argument --rna ") - - fasta_seq = "" - if use_fasta: - fasta_seq = sequence_reads[read_id][:].seq - fasta_seq = fasta_seq.upper() - else: - fasta_seq = sequence_reads[read_id].seq - fasta_seq = fasta_seq.upper() - if len(fasta_seq) < paf_record.target_length: - raise Exception("Error: Sequence lengths mismatch. If {} is a multi-line fastq file convert it to a 4-line fastq using seqtk.".format(args.file)) - if not bool(re.match('^[ACGTUMRWSYKVHDBN]+$', fasta_seq)): - raise Exception("Error: base characters other than A,C,G,T/U,M,R,W,S,Y,K,V,H,D,B,N were detected. Please check your sequence files") - - ref_start = -1 - ref_end = -1 - if data_is_rna == 1: - fasta_seq = fasta_seq[paf_record.target_end:] - ref_start = paf_record.target_end + 1 - else: - fasta_seq = fasta_seq[paf_record.target_start:] - ref_start = paf_record.target_start + 1 + ref_seq_len = paf_record.target_start - paf_record.target_end + reference_start = paf_record.target_end + kmer_correction base_limit = args.base_limit - seq_len = len(fasta_seq) - if seq_len < base_limit: - base_limit = seq_len - ref_end = base_limit + if ref_seq_len < base_limit: + base_limit = ref_seq_len + paf_record_reference_end = reference_start + ref_seq_len #1based closed if args.region != "": - pattern = re.compile("^[0-9]+\-[0-9]+") - if not pattern.match(args.region): - raise Exception("Error: region provided is not in correct format") - ref_start = int(args.region.split("-")[0]) - ref_end = int(args.region.split("-")[1]) - - if ref_start < 1: - raise Exception("Error: region start coordinate ({}) must be positive".format(ref_start)) - - if ref_end < 1: - raise Exception("Error: region end coordinate ({}) must be positive".format(ref_end)) - - if data_is_rna == 1 and paf_record.target_start < ref_end: - ref_end = paf_record.target_start - elif data_is_rna == 0 and paf_record.target_end < ref_end: - ref_end = paf_record.target_end - - # print("ref_start: " + str(ref_start)) - # print("ref_end: " + str(ref_end)) - - if (ref_end - ref_start + 1) < base_limit: - base_limit = ref_end - ref_start + 1 + if not args.loose_bound: + if data_is_rna == 1: + if args_ref_start < reference_start + 1 - kmer_correction: + continue + else: + if args_ref_start < reference_start + 1: + continue + if args_ref_end > paf_record_reference_end: + continue + ref_name = paf_record.target_name + ref_start = reference_start + 1 + ref_end = ref_start + base_limit - 1 #ref_end is 1based closed + if args.region != "": + if args_ref_start > ref_start: + ref_start = args_ref_start + if (ref_start + base_limit - 1) < paf_record_reference_end: + ref_end = ref_start + base_limit - 1 + else: + ref_end = paf_record_reference_end + if args_ref_end < ref_end: + ref_end = args_ref_end + if ref_end < ref_start: + print("Warning: a corner case has hit because the kmer_length used is larger than 1. This alignment will be skipped") + continue + base_limit = ref_end - ref_start + 1 - print("plot region: {}-{}\tread_id: {}".format(ref_start, ref_end, read_id)) + record_is_reverse = 0 + if paf_record.strand == "-": + record_is_reverse = 1 + if record_is_reverse and data_is_rna == 1: + raise Exception("Error: A transcript is always sequenced from 3` to 5`. Squigualiser only supports reads mapped to the transcriptome.") + if data_is_rna == 1: + print("plot (RNA 5'->3') region: {}:{}-{}\tread_id: {}".format(ref_name, ref_end, ref_start, read_id)) + if use_fasta: + fasta_seq = sequence_reads[ref_name][:].seq[ref_start-1:ref_end] # this slicing is 0-based [) interval + else: + fasta_seq = sequence_reads[read_id].seq + if len(fasta_seq) < paf_record.target_length: + raise Exception("Error: Sequence lengths mismatch. If {} is a multi-line fastq file convert it to a 4-line fastq using seqtk.".format(args.file)) + fasta_seq = fasta_seq[ref_start-1:ref_end] + fasta_seq = fasta_seq.upper() + else: + if record_is_reverse: + print("plot (-) region: {}:{}-{}\tread_id: {}".format(ref_name, ref_start, ref_end, read_id)) + if use_fasta: + fasta_seq = sequence_reads[ref_name][:].seq[ref_start-1:ref_end] + else: + fasta_seq = sequence_reads[read_id].seq + if len(fasta_seq) < paf_record.target_length: + raise Exception("Error: Sequence lengths mismatch. If {} is a multi-line fastq file convert it to a 4-line fastq using seqtk.".format(args.file)) + fasta_seq = fasta_seq[ref_start-1:ref_end] + fasta_seq = fasta_seq.upper() + nn = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'} + fasta_seq = "".join(nn[n] for n in fasta_seq) + else: + print("plot (+) region: {}:{}-{}\tread_id: {}".format(ref_name, ref_start, ref_end, read_id)) + if use_fasta: + fasta_seq = sequence_reads[ref_name][:].seq[ref_start-1:ref_end] + else: + fasta_seq = sequence_reads[read_id].seq + if len(fasta_seq) < paf_record.target_length: + raise Exception("Error: Sequence lengths mismatch. If {} is a multi-line fastq file convert it to a 4-line fastq using seqtk.".format(args.file)) + fasta_seq = fasta_seq[ref_start-1:ref_end] + fasta_seq = fasta_seq.upper() + if not bool(re.match('^[ACGTUMRWSYKVHDBN]+$', fasta_seq)): + raise Exception("Error: base characters other than A,C,G,T/U,M,R,W,S,Y,K,V,H,D,B,N were detected. Please check your sequence files") x = [] x_real = [] y = [] read = s5.get_read(read_id, pA=args.no_pa, aux=["read_number", "start_mux"]) if read is not None: - start_index = paf_record.query_start - end_index = read['len_raw_signal'] x = list(range(1, end_index - start_index + 1)) x_real = list(range(start_index + 1, end_index + 1)) # 1based y = read['signal'][start_index:end_index] @@ -668,32 +713,34 @@ def run(args): y = plot_utils.scale_signal(y, args.sig_scale, scale_params) - strand_dir = "(DNA 5'->3')" - if data_is_rna == 1: - fasta_seq = fasta_seq[:ref_end] - fasta_seq = fasta_seq[::-1] - strand_dir = "(RNA 3'->5')" - else: - fasta_seq = fasta_seq[ref_start-1:] - - moves_string = paf_record.tags['ss'][2] moves_string = re.sub('D', 'D,', moves_string) moves_string = re.sub('I', 'I,', moves_string).rstrip(',') moves = re.split(r',+', moves_string) + if data_is_rna == 0: + strand_dir = "(DNA +)" + if record_is_reverse: + strand_dir = "(DNA -)" + x_real.reverse() + y = np.flip(y) + moves.reverse() + if data_is_rna == 1: + strand_dir = "(RNA 3'->5')" + fasta_seq = fasta_seq[::-1] + signal_tuple = (x, x_real, y) - region_tuple = (ref_start, ref_end, 0, seq_len) + region_tuple = (ref_start, ref_end, reference_start, reference_start + ref_seq_len) sig_algn_dic = {} sig_algn_dic['start_kmer'] = 0 sig_algn_dic['ref_start'] = ref_start sig_algn_dic['ref_end'] = ref_end - sig_algn_dic['use_paf'] = use_paf + sig_algn_dic['pa'] = args.no_pa sig_algn_dic['plot_sig_ref_flag'] = plot_sig_ref_flag sig_algn_dic['data_is_rna'] = data_is_rna sig_algn_dic['ss'] = moves - signal_tuple, region_tuple, sig_algn_dic, fasta_seq = plot_utils.adjust_before_plotting(seq_len, signal_tuple, region_tuple, sig_algn_dic, fasta_seq, draw_data) + signal_tuple, region_tuple, sig_algn_dic, fasta_seq = plot_utils.adjust_before_plotting(ref_seq_len, signal_tuple, region_tuple, sig_algn_dic, fasta_seq, draw_data) if args.fixed_width: sig_algn_dic['tag_name'] = args.tag_name + indt + "base_shift: " + str(draw_data["base_shift"]) + indt + "scale:" + scaling_str + indt + "fixed_width: " + str(args.base_width) + indt + strand_dir + indt + "region: " @@ -704,7 +751,7 @@ def run(args): draw_data['y_max'] = np.nanmax(y) p = plot_utils.create_figure(args, plot_mode=0) if args.bed: - p = bed_annotation.plot_bed_annotation(p=p, ref_id=read_id, bed_dic=bed_dic, sig_algn_data=sig_algn_dic, draw_data=draw_data, base_limit=base_limit) + p = bed_annotation.plot_bed_annotation(p=p, ref_id=ref_name, bed_dic=bed_dic, sig_algn_data=sig_algn_dic, draw_data=draw_data, base_limit=base_limit) if args.fixed_width: layout_ = plot_function_fixed_width(p=p, read_id=read_id, signal_tuple=signal_tuple, sig_algn_data=sig_algn_dic, fasta_sequence=fasta_seq, base_limit=base_limit, draw_data=draw_data) @@ -1040,7 +1087,7 @@ def run(args): raise Exception("Error: A transcript is always sequenced from 3` to 5`. Squigualiser only supports reads mapped to the transcriptome.") if data_is_rna == 1: print("plot (RNA 5'->3') region: {}:{}-{}\tread_id: {}".format(ref_name, ref_end, ref_start, read_id)) - fasta_seq = fasta_reads.get_seq(name=ref_name, start=ref_start, end=ref_end).seq + fasta_seq = fasta_reads.get_seq(name=ref_name, start=ref_start, end=ref_end).seq #get_seq is 1-based closed interval fasta_seq = fasta_seq.upper() else: if record_is_reverse: diff --git a/src/plot_utils.py b/src/plot_utils.py index d2a0a9b..727583f 100644 --- a/src/plot_utils.py +++ b/src/plot_utils.py @@ -146,6 +146,7 @@ def scale_signal(y, sig_scale, scale_params): "kmer_model_dna_r9.4.1_450bps_6_mer": [-2, -3], "kmer_model_rna_r9.4.1_70bps_5_mer": [-3, -1], "kmer_model_dna_r10.4.1_e8.2_400bps_9_mer": [-6, -2], + "kmer_model_rna004_130bps_9mer": [-5, -3], "corrected_at_reform": [0, 0]} def list_profiles_base_shift(): # print(profile_dic)