Skip to content

Commit

Permalink
HDR Updates - yw #82
Browse files Browse the repository at this point in the history
Multiple amplicon names are resolved before adding the HDR amplicon -- unnamed amplicons are named Amplicon{i} for each amplicon.
Plot 4g data (nuc pct table, mod pct table for all reads aligned to the first reference) is output and linked to from the plot display
Ambiguous reads don't contribute to plot 4g data (which would otherwise lead to double counting and pct values > 1)
  • Loading branch information
kclem committed Mar 21, 2021
1 parent d7915b2 commit 4598226
Showing 1 changed file with 26 additions and 3 deletions.
29 changes: 26 additions & 3 deletions CRISPResso2/CRISPRessoCORE.py
Original file line number Diff line number Diff line change
Expand Up @@ -1019,9 +1019,18 @@ def rreplace(s, old, new):
if len(amp_seq) == 0:
raise BadParameterException("Amplicon " + str(idx + 1) + " has length 0. Please check the --amplicon_seq parameter.")

this_name = 'Amplicon'+str(idx)
if idx >= len(amplicon_name_arr):
amplicon_name_arr.append(this_name)

this_min_aln_score = args.default_min_aln_score
if idx >= len(amplicon_min_alignment_score_arr):
amplicon_min_alignment_score_arr.append(this_min_aln_score)

if args.expected_hdr_amplicon_seq != "":
amplicon_seq_arr.append(args.expected_hdr_amplicon_seq)
amplicon_name_arr.append('HDR')
amplicon_min_alignment_score_arr.append(args.default_min_aln_score)

amplicon_quant_window_coordinates_arr = ['']*len(amplicon_seq_arr)
if args.quantification_window_coordinates is not None:
Expand Down Expand Up @@ -2486,6 +2495,7 @@ def get_allele_row(reference_name, variant_count, aln_ref_names_str, aln_ref_sco
ref1_all_base_count_vectors[ref_names[0]+"_"+nuc ] = all_base_count_vectors[ref_names[0]+"_"+nuc].copy()

#then go through and align and add other reads
#this is our second time iterating through the variantCache -- this time just for HDR and prime editing
for variant in variantCache:
#skip variant if there were none observed
variant_count = variantCache[variant]['count']
Expand All @@ -2496,17 +2506,22 @@ def get_allele_row(reference_name, variant_count, aln_ref_names_str, aln_ref_sco
if len(aln_ref_names) == 1 and ref_names[0] == aln_ref_names[0]: #if this read was only aligned to ref1, skip it because we already included the indels when we initialized the ref1_all_deletion_count_vectors array
continue

class_name = variantCache[variant]['class_name'] #for classifying read e.g. 'HDR_MODIFIED' for pie chart
#if class is AMBIGUOUS (set above if the args.expand_ambiguous_alignments param is false) don't add the modifications in this allele to the allele summaries
if class_name == "AMBIGUOUS":
continue #for ambiguous reads, don't add indels to reference totals

#align this variant to ref1 sequence
ref_aln_name,s1,s2,score = variantCache[variant]['ref_aln_details'][0]
if args.use_legacy_insertion_quantification:
payload=CRISPRessoCOREResources.find_indels_substitutions_legacy(s1,s2,refs[ref1_name]['include_idxs'])
else:
payload=CRISPRessoCOREResources.find_indels_substitutions(s1,s2,refs[ref1_name]['include_idxs'])

#indels in this alignment against ref1 should be recorded for each ref it was originally assigned to, as well as for ref1
#indels in this alignment against ref1 should be recorded for each ref it was originally assigned to
#for example, if this read aligned to ref3, align this read to ref1, and add the resulting indels to ref1_all_insertion_count_vectors[ref3] as well as ref1_all_insertion_count_vectors[ref1]
# Thus, ref1_all_insertion_count_vectors[ref3] will show the position of indels of reads that aligned to ref3, but mapped onto ref1
# And ref1_alle_insertion_count_vectors[ref1] will show the position of indels of all reads, mapped onto ref1
# And ref1_alle_insertion_count_vectors[ref1] will show the position of indels of all reads, mapped onto ref1 (as copied above)
for ref_name in aln_ref_names:
if ref_name == ref_names[0]:
continue
Expand Down Expand Up @@ -3956,7 +3971,7 @@ def count_alternate_alleles(sub_base_vectors,ref_name,ref_sequence,ref_total_aln
tot = float(counts_total[ref_name_for_hdr])
for nuc in ['A','C','G','T','N','-']:
nuc_pcts.append(np.concatenate(([ref_name_for_hdr,nuc], np.array(ref1_all_base_count_vectors[ref_name_for_hdr+"_"+nuc]).astype(np.float)/tot)))
colnames = ['Batch','Nucleotide']+list(refs[ref_names[0]]['sequence'])
colnames = ['Batch','Nucleotide']+list(refs[ref_names_for_hdr[0]]['sequence'])
hdr_nucleotide_percentage_summary_df = pd.DataFrame(nuc_pcts,columns=colnames)

mod_pcts = []
Expand All @@ -3976,11 +3991,19 @@ def count_alternate_alleles(sub_base_vectors,ref_name,ref_sequence,ref_total_aln
# include_idxs_list = refs[ref_names_for_hdr[0]]['include_idxs']
include_idxs_list = [] # the quantification windows may be different between different amplicons

ref_plot_name = refs[ref_names_for_hdr[0]]['ref_plot_name']
mod_freq_filename = _jp(ref_plot_name + 'Reads_from_all_amplicons_modification_percent_table.txt')
hdr_modification_percentage_summary_df.rename(columns={'Batch':'Amplicon'}).to_csv(mod_freq_filename,sep='\t',header=True,index=False)
nuc_freq_filename = _jp(ref_plot_name + 'Reads_from_all_amplicons_nucleotide_percent_table.txt')
hdr_nucleotide_percentage_summary_df.rename(columns={'Batch':'Amplicon'}).to_csv(nuc_freq_filename,sep='\t',header=True,index=False)

plot_root = _jp('4g.HDR_nucleotide_percentage_quilt')
CRISPRessoPlot.plot_nucleotide_quilt(hdr_nucleotide_percentage_summary_df,hdr_modification_percentage_summary_df,plot_root,save_png,sgRNA_intervals=sgRNA_intervals,sgRNA_names=sgRNA_names,sgRNA_mismatches=sgRNA_mismatches,quantification_window_idxs=include_idxs_list)
crispresso2_info['refs'][ref_names_for_hdr[0]]['plot_4g_root'] = os.path.basename(plot_root)
crispresso2_info['refs'][ref_names_for_hdr[0]]['plot_4g_caption'] = "Figure 4g: Nucleotide distribution across all amplicons. At each base in the reference amplicon, the percentage of each base as observed in sequencing reads is shown (A = green; C = orange; G = yellow; T = purple). Black bars show the percentage of reads for which that base was deleted. Brown bars between bases show the percentage of reads having an insertion at that position."
crispresso2_info['refs'][ref_names_for_hdr[0]]['plot_4g_data'] = []
crispresso2_info['refs'][ref_names_for_hdr[0]]['plot_4g_data'].append(('Nucleotide percentage table for all reads aligned to ' + ref_names[0],os.path.basename(nuc_freq_filename)))
crispresso2_info['refs'][ref_names_for_hdr[0]]['plot_4g_data'].append(('Modification percentage table for all reads aligned to ' + ref_names[0],os.path.basename(mod_freq_filename)))
for ref_name_for_hdr in ref_names_for_hdr:
if 'nuc_freq_filename' in crispresso2_info['refs'][ref_name_for_hdr]:
crispresso2_info['refs'][ref_names_for_hdr[0]]['plot_4g_data'].append(('Nucleotide frequency table for ' + ref_name_for_hdr,os.path.basename(crispresso2_info['refs'][ref_name_for_hdr]['nuc_freq_filename'])))
Expand Down

0 comments on commit 4598226

Please sign in to comment.