diff --git a/pvactools/lib/run_argument_parser.py b/pvactools/lib/run_argument_parser.py index e53a350b..6692a6ba 100644 --- a/pvactools/lib/run_argument_parser.py +++ b/pvactools/lib/run_argument_parser.py @@ -371,6 +371,11 @@ def pvacvector(self): help="Number of amino acids to permit clipping from the start and/or end of peptides in order to test novel junction epitopes when the first pass on the full peptide fails.", default=3, ) + self.parser.add_argument( + '--allow-n-peptide-exclusion', type=int, + help="If no solution is found after adding spacers and clipping peptides, attempt to find partial solutions with up to n peptides removed.", + default=2, + ) class PvacbindRunArgumentParser(RunArgumentParser): def __init__(self): diff --git a/pvactools/tools/pvacvector/run.py b/pvactools/tools/pvacvector/run.py index 0259cb4f..a90a9391 100644 --- a/pvactools/tools/pvacvector/run.py +++ b/pvactools/tools/pvacvector/run.py @@ -15,6 +15,7 @@ import itertools import json import platform +import shutil from pvactools.lib.optimal_peptide import OptimalPeptide from pvactools.lib.vector_visualization import VectorVisualization @@ -251,72 +252,72 @@ def find_optimal_path(graph, distance_matrix, seq_dict, base_output_dir, args): peptide.copy_strategy = "slice" peptide.save_state_on_exit = False state, e = peptide.anneal() - print("%i distance :" % e) + names = list() + cumulative_weight = 0 + all_scores = list() + problematic_junctions = [] + for i in range(0, (len(state) - 1)): + names.append(state[i]) + if graph.has_edge(state[i], state[i + 1]): + edge = graph[state[i]][state[i + 1]] + try: + min_score = min(min_score, edge['weight']) + except: + min_score = edge['weight'] + cumulative_weight += edge['weight'] + all_scores.append(str(edge['weight'])) + spacer = edge['spacer'] + if spacer != 'None': + names.append(spacer) + else: + problematic_junctions.append("{} - {}".format(state[i], state[i+1])) + names.append(state[-1]) + if len(problematic_junctions) > 0: + return (None, "No valid junction between peptides: {}".format(", ".join(problematic_junctions))) + + print("%i distance :" % e) for id in state: print("\t", id) - results_file = os.path.join(base_output_dir, args.sample_name + '_results.fa') - with open(results_file, 'w') as f: - names = list() - cumulative_weight = 0 - all_scores = list() - - problematic_junctions = [] - for i in range(0, (len(state) - 1)): - names.append(state[i]) - if graph.has_edge(state[i], state[i + 1]): - edge = graph[state[i]][state[i + 1]] - try: - min_score = min(min_score, edge['weight']) - except: - min_score = edge['weight'] - cumulative_weight += edge['weight'] - all_scores.append(str(edge['weight'])) - spacer = edge['spacer'] - if spacer != 'None': - names.append(spacer) + median_score = str(cumulative_weight/len(all_scores)) + peptide_id_list = ','.join(names) + score_list = ','.join(all_scores) + output = list() + output.append(">") + output.append(peptide_id_list) + output.append("|Median_Junction_Score:") + output.append(median_score) + output.append("|Lowest_Junction_Score:") + output.append(str(min_score)) + output.append("|All_Junction_Scores:") + output.append(score_list) + output.append("\n") + for (i, name) in enumerate(names): + if name in args.spacers: + output.append(name) + else: + full_sequence = seq_dict[name] + if i > 0: + left_name = names[i-1] + if left_name in args.spacers: + left_name = names[i-2] + left_clip = graph.edges[(left_name, name)]['right_partner_trim'] else: - problematic_junctions.append("{} - {}".format(state[i], state[i+1])) - if len(problematic_junctions) > 0: - return (None, "No valid junction between peptides: {}".format(", ".join(problematic_junctions))) - names.append(state[-1]) - median_score = str(cumulative_weight/len(all_scores)) - peptide_id_list = ','.join(names) - score_list = ','.join(all_scores) - output = list() - output.append(">") - output.append(peptide_id_list) - output.append("|Median_Junction_Score:") - output.append(median_score) - output.append("|Lowest_Junction_Score:") - output.append(str(min_score)) - output.append("|All_Junction_Scores:") - output.append(score_list) - output.append("\n") - for (i, name) in enumerate(names): - if name in args.spacers: - output.append(name) + left_clip = 0 + if i < (len(names) - 1): + right_name = names[i+1] + if right_name in args.spacers: + right_name = names[i+2] + right_clip = graph.edges[(name, right_name)]['left_partner_trim'] else: - full_sequence = seq_dict[name] - if i > 0: - left_name = names[i-1] - if left_name in args.spacers: - left_name = names[i-2] - left_clip = graph.edges[(left_name, name)]['right_partner_trim'] - else: - left_clip = 0 - if i < (len(names) - 1): - right_name = names[i+1] - if right_name in args.spacers: - right_name = names[i+2] - right_clip = graph.edges[(name, right_name)]['left_partner_trim'] - else: - right_clip = 0 - clipped_sequence = full_sequence[left_clip:len(full_sequence) - right_clip] - output.append(clipped_sequence) + right_clip = 0 + clipped_sequence = full_sequence[left_clip:len(full_sequence) - right_clip] + output.append(clipped_sequence) - output.append("\n") + output.append("\n") + results_file = os.path.join(base_output_dir, args.sample_name + '_results.fa') + with open(results_file, 'w') as f: f.write(''.join(output)) return (results_file, None) @@ -451,21 +452,12 @@ def main(args_input=sys.argv[1:]): print("No valid path found. {}".format(error)) junctions_to_process = identify_problematic_junctions(graph, seq_tuples) else: - junctions_file = os.path.join(current_output_dir, 'junctions.tsv') - import shutil - shutil.copy(junctions_file, base_output_dir) break tries += 1 + junctions_file = os.path.join(current_output_dir, 'junctions.tsv') + shutil.copy(junctions_file, base_output_dir) - if results_file is None: - print( - 'Unable to find path. ' + - 'A vaccine design using the parameters specified could not be found. Some options that you may want to consider:\n' + - '1) decreasing the acceptable junction binding score to allow more possible connections (-b parameter)\n' + - '2) using the "median" binding score instead of the "best" binding score for each junction, (best may be too conservative, -m parameter)\n' + - '3) if running with a percentile threshold set, either remove this parameter or reduce the acceptable threshold to allow more possible connections (--percentile-threshold parameter)' - ) - else: + if results_file is not None: if 'DISPLAY' in os.environ.keys(): VectorVisualization(results_file, base_output_dir, args.spacers).draw() @@ -478,6 +470,64 @@ def main(args_input=sys.argv[1:]): shutil.rmtree(os.path.join(base_output_dir, str(subdirectory), spacer, 'MHC_Class_I'), ignore_errors=True) shutil.rmtree(os.path.join(base_output_dir, str(subdirectory), spacer, 'MHC_Class_II'), ignore_errors=True) + change_permissions_recursive(base_output_dir, 0o755, 0o644) + return + + solution_found = False + if results_file is None: + for n_nodes_to_remove in range(1, args.allow_n_peptide_exclusion+1): + node_sets_to_remove = list(itertools.combinations(seq_keys, n_nodes_to_remove)) + for node_set in node_sets_to_remove: + print("Removing nodes: {}".format(', '.join(node_set))) + #Creating output directory + current_output_dir = os.path.join(base_output_dir, "without_{}".format('_'.join(node_set))) + os.makedirs(current_output_dir, exist_ok=True) + #Creating junctions file with node_set removed + junctions_file = os.path.join(base_output_dir, 'junctions.tsv') + modified_junctions_file = os.path.join(current_output_dir, 'junctions.tsv') + with open(junctions_file) as input_fh, open(modified_junctions_file, 'w') as output_fh: + reader = csv.DictReader(input_fh, delimiter='\t') + writer = csv.DictWriter(output_fh, delimiter='\t', fieldnames=reader.fieldnames) + writer.writeheader() + for line in reader: + for node in node_set: + if line['left_peptide'] == node or line['right_peptide'] == node: + continue + writer.writerow(line) + #Modify graph to remove node_set and test it + modified_graph = graph.copy() + modified_graph.remove_nodes_from(node_set) + modified_seq_dict = seq_dict.copy() + for node in node_set: + del modified_seq_dict[node] + (valid, error) = check_graph_valid(modified_graph, modified_seq_dict) + if not valid: + print("No valid path found after removing nodes: {}".format(', '.join(node_set))) + continue + distance_matrix = create_distance_matrix(modified_graph) + (results_file, error) = find_optimal_path(modified_graph, distance_matrix, modified_seq_dict, current_output_dir, args) + if results_file is None: + print("No valid path found after removing nodes: {}".format(', '.join(node_set))) + continue + else: + solution_found = True + if 'DISPLAY' in os.environ.keys(): + VectorVisualization(results_file, current_output_dir, args.spacers).draw() + dna_results_file = os.path.join(current_output_dir, args.sample_name + '_results.dna.fa') + create_dna_backtranslation(results_file, dna_results_file) + if solution_found: + break + + if not solution_found: + print( + 'Unable to find path. ' + + 'A vaccine design using the parameters specified could not be found. Some options that you may want to consider:\n' + + '1) decreasing the acceptable junction binding score to allow more possible connections (-b parameter)\n' + + '2) using the "median" binding score instead of the "best" binding score for each junction, (best may be too conservative, -m parameter)\n' + + '3) if running with a percentile threshold set, either remove this parameter or reduce the acceptable threshold to allow more possible connections (--percentile-threshold parameter)\n' + + '4) increase the number of peptides that can be excluded from the vector (--allow-n-peptide-exclusion parameter)' + ) + change_permissions_recursive(base_output_dir, 0o755, 0o644) if __name__ == "__main__": diff --git a/tests/test_data/pvacvector/without_MT.CASP10.S654R.test_pvacvector_produces_expected_output_results.fa b/tests/test_data/pvacvector/without_MT.CASP10.S654R.test_pvacvector_produces_expected_output_results.fa new file mode 100644 index 00000000..9e5fdfd4 --- /dev/null +++ b/tests/test_data/pvacvector/without_MT.CASP10.S654R.test_pvacvector_produces_expected_output_results.fa @@ -0,0 +1,10 @@ +>MT.SUMF2.G23A,MT.TP53.R157H,MT.DTX3L.G501R,MT.ACSL3.S345N,MT.PRDM15.G654W,MT.NRCAM.P838H,MT.POM121C.G3107R,MT.FAT3.R4848T,MT.PEX1.V356I|Median_Junction_Score:32636.72125|Lowest_Junction_Score:25464.36|All_Junction_Scores:35215.03,30620.2,36498.27,37632.32,36869.38,25464.36,32071.19,26723.02 +KLGILFGRKSIWQKLRRLDGALSLR +KQSQWMTEVERHCPHFERCSRSDGL +LAKAKQYRLKGWGMSFLAGKKLKEG +HPRWEILIGRQNVTGGYYKREAKTK +RTGLIAHFGEGWPGWSRLRRLPDDK +DGTWICRFNGNHQPRISSLTNGVPI +FGAPAWSQPWFRGSTFVFSGGAATS +SLASTLSPSCRTTPQFFPSSYLPPH +KTTWQVWLDPFIKEEGSEEFDFILP diff --git a/tests/test_data/pvacvector/without_MT.FAT3.R4848T.test_pvacvector_produces_expected_output_results.fa b/tests/test_data/pvacvector/without_MT.FAT3.R4848T.test_pvacvector_produces_expected_output_results.fa new file mode 100644 index 00000000..0f357811 --- /dev/null +++ b/tests/test_data/pvacvector/without_MT.FAT3.R4848T.test_pvacvector_produces_expected_output_results.fa @@ -0,0 +1,10 @@ +>MT.SUMF2.G23A,MT.ACSL3.S345N,MT.PRDM15.G654W,MT.PEX1.V356I,MT.TP53.R157H,MT.DTX3L.G501R,MT.NRCAM.P838H,MT.POM121C.G3107R,MT.CASP10.S654R|Median_Junction_Score:30083.295000000002|Lowest_Junction_Score:23486.97|All_Junction_Scores:24253.18,37632.32,37434.15,30563.6,30620.2,31211.58,25464.36,23486.97 +KLGILFGRKSIWQKLRRLDGALSLR +HPRWEILIGRQNVTGGYYKREAKTK +RTGLIAHFGEGWPGWSRLRRLPDDK +KTTWQVWLDPFIKEEGSEEFDFILP +KQSQWMTEVERHCPHFERCSRSDGL +LAKAKQYRLKGWGMSFLAGKKLKEG +DGTWICRFNGNHQPRISSLTNGVPI +FGAPAWSQPWFRGSTFVFSGGAATS +GYVWFRFVEEGWWYIQSFCNHLKKL diff --git a/tests/test_data/pvacvector/without_MT.PEX1.V356I.test_pvacvector_produces_expected_output_results.fa b/tests/test_data/pvacvector/without_MT.PEX1.V356I.test_pvacvector_produces_expected_output_results.fa new file mode 100644 index 00000000..da20b1c9 --- /dev/null +++ b/tests/test_data/pvacvector/without_MT.PEX1.V356I.test_pvacvector_produces_expected_output_results.fa @@ -0,0 +1,10 @@ +>MT.CASP10.S654R,MT.DTX3L.G501R,MT.ACSL3.S345N,MT.POM121C.G3107R,MT.FAT3.R4848T,MT.PRDM15.G654W,MT.NRCAM.P838H,MT.SUMF2.G23A,MT.TP53.R157H|Median_Junction_Score:30712.94125|Lowest_Junction_Score:24345.72|All_Junction_Scores:24345.72,36498.27,28221.36,32071.19,26803.52,36869.38,25679.06,35215.03 +GYVWFRFVEEGWWYIQSFCNHLKKL +LAKAKQYRLKGWGMSFLAGKKLKEG +HPRWEILIGRQNVTGGYYKREAKTK +FGAPAWSQPWFRGSTFVFSGGAATS +SLASTLSPSCRTTPQFFPSSYLPPH +RTGLIAHFGEGWPGWSRLRRLPDDK +DGTWICRFNGNHQPRISSLTNGVPI +KLGILFGRKSIWQKLRRLDGALSLR +KQSQWMTEVERHCPHFERCSRSDGL diff --git a/tests/test_data/pvacvector/without_MT.POM121C.G3107R.test_pvacvector_produces_expected_output_results.fa b/tests/test_data/pvacvector/without_MT.POM121C.G3107R.test_pvacvector_produces_expected_output_results.fa new file mode 100644 index 00000000..ebe9d8e2 --- /dev/null +++ b/tests/test_data/pvacvector/without_MT.POM121C.G3107R.test_pvacvector_produces_expected_output_results.fa @@ -0,0 +1,10 @@ +>MT.FAT3.R4848T,MT.PEX1.V356I,MT.TP53.R157H,MT.DTX3L.G501R,MT.PRDM15.G654W,MT.NRCAM.P838H,MT.SUMF2.G23A,MT.ACSL3.S345N,MT.CASP10.S654R|Median_Junction_Score:28628.89|Lowest_Junction_Score:24253.18|All_Junction_Scores:26723.02,30563.6,30620.2,28746.49,36869.38,25679.06,24253.18,25576.19 +SLASTLSPSCRTTPQFFPSSYLPPH +KTTWQVWLDPFIKEEGSEEFDFILP +KQSQWMTEVERHCPHFERCSRSDGL +LAKAKQYRLKGWGMSFLAGKKLKEG +RTGLIAHFGEGWPGWSRLRRLPDDK +DGTWICRFNGNHQPRISSLTNGVPI +KLGILFGRKSIWQKLRRLDGALSLR +HPRWEILIGRQNVTGGYYKREAKTK +GYVWFRFVEEGWWYIQSFCNHLKKL diff --git a/tests/test_data/pvacvector/without_MT.PRDM15.G654W.test_pvacvector_produces_expected_output_results.fa b/tests/test_data/pvacvector/without_MT.PRDM15.G654W.test_pvacvector_produces_expected_output_results.fa new file mode 100644 index 00000000..0752f947 --- /dev/null +++ b/tests/test_data/pvacvector/without_MT.PRDM15.G654W.test_pvacvector_produces_expected_output_results.fa @@ -0,0 +1,10 @@ +>MT.CASP10.S654R,MT.DTX3L.G501R,MT.NRCAM.P838H,MT.POM121C.G3107R,MT.SUMF2.G23A,MT.ACSL3.S345N,MT.FAT3.R4848T,MT.PEX1.V356I,MT.TP53.R157H|Median_Junction_Score:28468.2575|Lowest_Junction_Score:24253.18|All_Junction_Scores:24345.72,31211.58,25464.36,29377.88,24253.18,35806.72,26723.02,30563.6 +GYVWFRFVEEGWWYIQSFCNHLKKL +LAKAKQYRLKGWGMSFLAGKKLKEG +DGTWICRFNGNHQPRISSLTNGVPI +FGAPAWSQPWFRGSTFVFSGGAATS +KLGILFGRKSIWQKLRRLDGALSLR +HPRWEILIGRQNVTGGYYKREAKTK +SLASTLSPSCRTTPQFFPSSYLPPH +KTTWQVWLDPFIKEEGSEEFDFILP +KQSQWMTEVERHCPHFERCSRSDGL diff --git a/tests/test_data/pvacvector/without_MT.SUMF2.G23A.test_pvacvector_produces_expected_output_results.fa b/tests/test_data/pvacvector/without_MT.SUMF2.G23A.test_pvacvector_produces_expected_output_results.fa new file mode 100644 index 00000000..a545f7a4 --- /dev/null +++ b/tests/test_data/pvacvector/without_MT.SUMF2.G23A.test_pvacvector_produces_expected_output_results.fa @@ -0,0 +1,10 @@ +>MT.CASP10.S654R,MT.DTX3L.G501R,MT.ACSL3.S345N,MT.POM121C.G3107R,MT.FAT3.R4848T,MT.PRDM15.G654W,MT.NRCAM.P838H,MT.PEX1.V356I,MT.TP53.R157H|Median_Junction_Score:30769.6675|Lowest_Junction_Score:24345.72|All_Junction_Scores:24345.72,36498.27,28221.36,32071.19,26803.52,36869.38,30784.3,30563.6 +GYVWFRFVEEGWWYIQSFCNHLKKL +LAKAKQYRLKGWGMSFLAGKKLKEG +HPRWEILIGRQNVTGGYYKREAKTK +FGAPAWSQPWFRGSTFVFSGGAATS +SLASTLSPSCRTTPQFFPSSYLPPH +RTGLIAHFGEGWPGWSRLRRLPDDK +DGTWICRFNGNHQPRISSLTNGVPI +KTTWQVWLDPFIKEEGSEEFDFILP +KQSQWMTEVERHCPHFERCSRSDGL diff --git a/tests/test_data/pvacvector/without_MT.TP53.R157H.test_pvacvector_produces_expected_output_results.fa b/tests/test_data/pvacvector/without_MT.TP53.R157H.test_pvacvector_produces_expected_output_results.fa new file mode 100644 index 00000000..2c389a88 --- /dev/null +++ b/tests/test_data/pvacvector/without_MT.TP53.R157H.test_pvacvector_produces_expected_output_results.fa @@ -0,0 +1,10 @@ +>MT.POM121C.G3107R,MT.CASP10.S654R,MT.DTX3L.G501R,MT.NRCAM.P838H,MT.SUMF2.G23A,MT.ACSL3.S345N,MT.FAT3.R4848T,MT.PRDM15.G654W,MT.PEX1.V356I|Median_Junction_Score:28627.6125|Lowest_Junction_Score:23486.97|All_Junction_Scores:23486.97,24345.72,31211.58,25679.06,24253.18,35806.72,26803.52,37434.15 +FGAPAWSQPWFRGSTFVFSGGAATS +GYVWFRFVEEGWWYIQSFCNHLKKL +LAKAKQYRLKGWGMSFLAGKKLKEG +DGTWICRFNGNHQPRISSLTNGVPI +KLGILFGRKSIWQKLRRLDGALSLR +HPRWEILIGRQNVTGGYYKREAKTK +SLASTLSPSCRTTPQFFPSSYLPPH +RTGLIAHFGEGWPGWSRLRRLPDDK +KTTWQVWLDPFIKEEGSEEFDFILP diff --git a/tests/test_pvacvector.py b/tests/test_pvacvector.py index 7545b45a..1a55d1e8 100644 --- a/tests/test_pvacvector.py +++ b/tests/test_pvacvector.py @@ -187,6 +187,7 @@ def test_pvacvector_fa_input_runs_and_produces_expected_output(self): output_dir.name, '-e1', self.epitope_length, '-n', self.input_n_mer, + '--allow-n-peptide-exclusion', '0', '-k' ]) @@ -229,6 +230,7 @@ def test_pvacvector_generate_fa_runs_and_produces_expected_output(self): '-v', self.input_vcf, '-e1', self.epitope_length, '-n', self.input_n_mer, + '--allow-n-peptide-exclusion', '0', '-k', ]) @@ -264,6 +266,7 @@ def test_pvacvector_generate_fa_with_epitope_at_beginning_of_transcript(self): '-v', os.path.join(self.test_data_dir, "input_negative_start.vcf.gz"), '-e1', self.epitope_length, '-n', self.input_n_mer, + '--allow-n-peptide-exclusion', '0', '-k', ]) @@ -288,6 +291,7 @@ def test_pvacvector_clipping(self): '-k', '-b', '32000', '--max-clip-length', '1', + '--allow-n-peptide-exclusion', '0', '--spacers', 'None,AAY', ]) @@ -352,6 +356,7 @@ def test_pvacvector_percentile_threshold(self): '-b', '32000', '--percentile-threshold', '80', '--max-clip-length', '0', + '--allow-n-peptide-exclusion', '0', '--spacers', 'None', ]) @@ -361,3 +366,61 @@ def test_pvacvector_percentile_threshold(self): )) output_dir.cleanup() + + def test_pvacvector_remove_peptides(self): + output_dir = tempfile.TemporaryDirectory() + + run.main([ + self.input_file, + self.test_run_name, + self.allele, + self.method, + output_dir.name, + '-e1', self.epitope_length, + '-n', self.input_n_mer, + '-k', + '-b', '22000', + '--max-clip-length', '0', + '--spacers', 'None', + ]) + + self.assertTrue(compare( + os.path.join(output_dir.name, "without_MT.CASP10.S654R", "test_pvacvector_produces_expected_output_results.fa"), + os.path.join(self.test_data_dir, "without_MT.CASP10.S654R.test_pvacvector_produces_expected_output_results.fa") + )) + self.assertTrue(compare( + os.path.join(output_dir.name, "without_MT.FAT3.R4848T", "test_pvacvector_produces_expected_output_results.fa"), + os.path.join(self.test_data_dir, "without_MT.FAT3.R4848T.test_pvacvector_produces_expected_output_results.fa") + )) + self.assertTrue(compare( + os.path.join(output_dir.name, "without_MT.PEX1.V356I", "test_pvacvector_produces_expected_output_results.fa"), + os.path.join(self.test_data_dir, "without_MT.PEX1.V356I.test_pvacvector_produces_expected_output_results.fa") + )) + self.assertTrue(compare( + os.path.join(output_dir.name, "without_MT.POM121C.G3107R", "test_pvacvector_produces_expected_output_results.fa"), + os.path.join(self.test_data_dir, "without_MT.POM121C.G3107R.test_pvacvector_produces_expected_output_results.fa") + )) + self.assertTrue(compare( + os.path.join(output_dir.name, "without_MT.PRDM15.G654W", "test_pvacvector_produces_expected_output_results.fa"), + os.path.join(self.test_data_dir, "without_MT.PRDM15.G654W.test_pvacvector_produces_expected_output_results.fa") + )) + self.assertTrue(compare( + os.path.join(output_dir.name, "without_MT.SUMF2.G23A", "test_pvacvector_produces_expected_output_results.fa"), + os.path.join(self.test_data_dir, "without_MT.SUMF2.G23A.test_pvacvector_produces_expected_output_results.fa") + )) + self.assertTrue(compare( + os.path.join(output_dir.name, "without_MT.TP53.R157H", "test_pvacvector_produces_expected_output_results.fa"), + os.path.join(self.test_data_dir, "without_MT.TP53.R157H.test_pvacvector_produces_expected_output_results.fa") + )) + + self.assertFalse(os.path.exists( + os.path.join(output_dir.name, "without_MT.ACSL3.S345N", "test_pvacvector_produces_expected_output_results.fa"), + )) + self.assertFalse(os.path.exists( + os.path.join(output_dir.name, "without_MT.DTX3L.G501R", "test_pvacvector_produces_expected_output_results.fa"), + )) + self.assertFalse(os.path.exists( + os.path.join(output_dir.name, "without_MT.NRCAM.P838H", "test_pvacvector_produces_expected_output_results.fa"), + )) + + output_dir.cleanup()