Skip to content

Commit

Permalink
Allow pVACvector to remove peptides in order to find a partial solution
Browse files Browse the repository at this point in the history
  • Loading branch information
susannasiebert committed Dec 20, 2024
1 parent c45aeb3 commit 5b0b39b
Show file tree
Hide file tree
Showing 10 changed files with 259 additions and 71 deletions.
5 changes: 5 additions & 0 deletions pvactools/lib/run_argument_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -371,6 +371,11 @@ def pvacvector(self):
help="Number of amino acids to permit clipping from the start and/or end of peptides in order to test novel junction epitopes when the first pass on the full peptide fails.",
default=3,
)
self.parser.add_argument(
'--allow-n-peptide-exclusion', type=int,
help="If no solution is found after adding spacers and clipping peptides, attempt to find partial solutions with up to n peptides removed.",
default=2,
)

class PvacbindRunArgumentParser(RunArgumentParser):
def __init__(self):
Expand Down
192 changes: 121 additions & 71 deletions pvactools/tools/pvacvector/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import itertools
import json
import platform
import shutil

from pvactools.lib.optimal_peptide import OptimalPeptide
from pvactools.lib.vector_visualization import VectorVisualization
Expand Down Expand Up @@ -251,72 +252,72 @@ def find_optimal_path(graph, distance_matrix, seq_dict, base_output_dir, args):
peptide.copy_strategy = "slice"
peptide.save_state_on_exit = False
state, e = peptide.anneal()
print("%i distance :" % e)

names = list()
cumulative_weight = 0
all_scores = list()
problematic_junctions = []
for i in range(0, (len(state) - 1)):
names.append(state[i])
if graph.has_edge(state[i], state[i + 1]):
edge = graph[state[i]][state[i + 1]]
try:
min_score = min(min_score, edge['weight'])
except:
min_score = edge['weight']
cumulative_weight += edge['weight']
all_scores.append(str(edge['weight']))
spacer = edge['spacer']
if spacer != 'None':
names.append(spacer)
else:
problematic_junctions.append("{} - {}".format(state[i], state[i+1]))
names.append(state[-1])
if len(problematic_junctions) > 0:
return (None, "No valid junction between peptides: {}".format(", ".join(problematic_junctions)))

print("%i distance :" % e)
for id in state:
print("\t", id)

results_file = os.path.join(base_output_dir, args.sample_name + '_results.fa')
with open(results_file, 'w') as f:
names = list()
cumulative_weight = 0
all_scores = list()

problematic_junctions = []
for i in range(0, (len(state) - 1)):
names.append(state[i])
if graph.has_edge(state[i], state[i + 1]):
edge = graph[state[i]][state[i + 1]]
try:
min_score = min(min_score, edge['weight'])
except:
min_score = edge['weight']
cumulative_weight += edge['weight']
all_scores.append(str(edge['weight']))
spacer = edge['spacer']
if spacer != 'None':
names.append(spacer)
median_score = str(cumulative_weight/len(all_scores))
peptide_id_list = ','.join(names)
score_list = ','.join(all_scores)
output = list()
output.append(">")
output.append(peptide_id_list)
output.append("|Median_Junction_Score:")
output.append(median_score)
output.append("|Lowest_Junction_Score:")
output.append(str(min_score))
output.append("|All_Junction_Scores:")
output.append(score_list)
output.append("\n")
for (i, name) in enumerate(names):
if name in args.spacers:
output.append(name)
else:
full_sequence = seq_dict[name]
if i > 0:
left_name = names[i-1]
if left_name in args.spacers:
left_name = names[i-2]
left_clip = graph.edges[(left_name, name)]['right_partner_trim']
else:
problematic_junctions.append("{} - {}".format(state[i], state[i+1]))
if len(problematic_junctions) > 0:
return (None, "No valid junction between peptides: {}".format(", ".join(problematic_junctions)))
names.append(state[-1])
median_score = str(cumulative_weight/len(all_scores))
peptide_id_list = ','.join(names)
score_list = ','.join(all_scores)
output = list()
output.append(">")
output.append(peptide_id_list)
output.append("|Median_Junction_Score:")
output.append(median_score)
output.append("|Lowest_Junction_Score:")
output.append(str(min_score))
output.append("|All_Junction_Scores:")
output.append(score_list)
output.append("\n")
for (i, name) in enumerate(names):
if name in args.spacers:
output.append(name)
left_clip = 0
if i < (len(names) - 1):
right_name = names[i+1]
if right_name in args.spacers:
right_name = names[i+2]
right_clip = graph.edges[(name, right_name)]['left_partner_trim']
else:
full_sequence = seq_dict[name]
if i > 0:
left_name = names[i-1]
if left_name in args.spacers:
left_name = names[i-2]
left_clip = graph.edges[(left_name, name)]['right_partner_trim']
else:
left_clip = 0
if i < (len(names) - 1):
right_name = names[i+1]
if right_name in args.spacers:
right_name = names[i+2]
right_clip = graph.edges[(name, right_name)]['left_partner_trim']
else:
right_clip = 0
clipped_sequence = full_sequence[left_clip:len(full_sequence) - right_clip]
output.append(clipped_sequence)
right_clip = 0
clipped_sequence = full_sequence[left_clip:len(full_sequence) - right_clip]
output.append(clipped_sequence)

output.append("\n")
output.append("\n")
results_file = os.path.join(base_output_dir, args.sample_name + '_results.fa')
with open(results_file, 'w') as f:
f.write(''.join(output))
return (results_file, None)

Expand Down Expand Up @@ -451,21 +452,12 @@ def main(args_input=sys.argv[1:]):
print("No valid path found. {}".format(error))
junctions_to_process = identify_problematic_junctions(graph, seq_tuples)
else:
junctions_file = os.path.join(current_output_dir, 'junctions.tsv')
import shutil
shutil.copy(junctions_file, base_output_dir)
break
tries += 1
junctions_file = os.path.join(current_output_dir, 'junctions.tsv')
shutil.copy(junctions_file, base_output_dir)

if results_file is None:
print(
'Unable to find path. ' +
'A vaccine design using the parameters specified could not be found. Some options that you may want to consider:\n' +
'1) decreasing the acceptable junction binding score to allow more possible connections (-b parameter)\n' +
'2) using the "median" binding score instead of the "best" binding score for each junction, (best may be too conservative, -m parameter)\n' +
'3) if running with a percentile threshold set, either remove this parameter or reduce the acceptable threshold to allow more possible connections (--percentile-threshold parameter)'
)
else:
if results_file is not None:
if 'DISPLAY' in os.environ.keys():
VectorVisualization(results_file, base_output_dir, args.spacers).draw()

Expand All @@ -478,6 +470,64 @@ def main(args_input=sys.argv[1:]):
shutil.rmtree(os.path.join(base_output_dir, str(subdirectory), spacer, 'MHC_Class_I'), ignore_errors=True)
shutil.rmtree(os.path.join(base_output_dir, str(subdirectory), spacer, 'MHC_Class_II'), ignore_errors=True)

change_permissions_recursive(base_output_dir, 0o755, 0o644)
return

solution_found = False
if results_file is None:
for n_nodes_to_remove in range(1, args.allow_n_peptide_exclusion+1):
node_sets_to_remove = list(itertools.combinations(seq_keys, n_nodes_to_remove))
for node_set in node_sets_to_remove:
print("Removing nodes: {}".format(', '.join(node_set)))
#Creating output directory
current_output_dir = os.path.join(base_output_dir, "without_{}".format('_'.join(node_set)))
os.makedirs(current_output_dir, exist_ok=True)
#Creating junctions file with node_set removed
junctions_file = os.path.join(base_output_dir, 'junctions.tsv')
modified_junctions_file = os.path.join(current_output_dir, 'junctions.tsv')
with open(junctions_file) as input_fh, open(modified_junctions_file, 'w') as output_fh:
reader = csv.DictReader(input_fh, delimiter='\t')
writer = csv.DictWriter(output_fh, delimiter='\t', fieldnames=reader.fieldnames)
writer.writeheader()
for line in reader:
for node in node_set:
if line['left_peptide'] == node or line['right_peptide'] == node:
continue
writer.writerow(line)
#Modify graph to remove node_set and test it
modified_graph = graph.copy()
modified_graph.remove_nodes_from(node_set)
modified_seq_dict = seq_dict.copy()
for node in node_set:
del modified_seq_dict[node]
(valid, error) = check_graph_valid(modified_graph, modified_seq_dict)
if not valid:
print("No valid path found after removing nodes: {}".format(', '.join(node_set)))
continue
distance_matrix = create_distance_matrix(modified_graph)
(results_file, error) = find_optimal_path(modified_graph, distance_matrix, modified_seq_dict, current_output_dir, args)
if results_file is None:
print("No valid path found after removing nodes: {}".format(', '.join(node_set)))
continue
else:
solution_found = True
if 'DISPLAY' in os.environ.keys():
VectorVisualization(results_file, current_output_dir, args.spacers).draw()
dna_results_file = os.path.join(current_output_dir, args.sample_name + '_results.dna.fa')
create_dna_backtranslation(results_file, dna_results_file)
if solution_found:
break

if not solution_found:
print(
'Unable to find path. ' +
'A vaccine design using the parameters specified could not be found. Some options that you may want to consider:\n' +
'1) decreasing the acceptable junction binding score to allow more possible connections (-b parameter)\n' +
'2) using the "median" binding score instead of the "best" binding score for each junction, (best may be too conservative, -m parameter)\n' +
'3) if running with a percentile threshold set, either remove this parameter or reduce the acceptable threshold to allow more possible connections (--percentile-threshold parameter)\n' +
'4) increase the number of peptides that can be excluded from the vector (--allow-n-peptide-exclusion parameter)'
)

change_permissions_recursive(base_output_dir, 0o755, 0o644)

if __name__ == "__main__":
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
>MT.SUMF2.G23A,MT.TP53.R157H,MT.DTX3L.G501R,MT.ACSL3.S345N,MT.PRDM15.G654W,MT.NRCAM.P838H,MT.POM121C.G3107R,MT.FAT3.R4848T,MT.PEX1.V356I|Median_Junction_Score:32636.72125|Lowest_Junction_Score:25464.36|All_Junction_Scores:35215.03,30620.2,36498.27,37632.32,36869.38,25464.36,32071.19,26723.02
KLGILFGRKSIWQKLRRLDGALSLR
KQSQWMTEVERHCPHFERCSRSDGL
LAKAKQYRLKGWGMSFLAGKKLKEG
HPRWEILIGRQNVTGGYYKREAKTK
RTGLIAHFGEGWPGWSRLRRLPDDK
DGTWICRFNGNHQPRISSLTNGVPI
FGAPAWSQPWFRGSTFVFSGGAATS
SLASTLSPSCRTTPQFFPSSYLPPH
KTTWQVWLDPFIKEEGSEEFDFILP
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
>MT.SUMF2.G23A,MT.ACSL3.S345N,MT.PRDM15.G654W,MT.PEX1.V356I,MT.TP53.R157H,MT.DTX3L.G501R,MT.NRCAM.P838H,MT.POM121C.G3107R,MT.CASP10.S654R|Median_Junction_Score:30083.295000000002|Lowest_Junction_Score:23486.97|All_Junction_Scores:24253.18,37632.32,37434.15,30563.6,30620.2,31211.58,25464.36,23486.97
KLGILFGRKSIWQKLRRLDGALSLR
HPRWEILIGRQNVTGGYYKREAKTK
RTGLIAHFGEGWPGWSRLRRLPDDK
KTTWQVWLDPFIKEEGSEEFDFILP
KQSQWMTEVERHCPHFERCSRSDGL
LAKAKQYRLKGWGMSFLAGKKLKEG
DGTWICRFNGNHQPRISSLTNGVPI
FGAPAWSQPWFRGSTFVFSGGAATS
GYVWFRFVEEGWWYIQSFCNHLKKL
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
>MT.CASP10.S654R,MT.DTX3L.G501R,MT.ACSL3.S345N,MT.POM121C.G3107R,MT.FAT3.R4848T,MT.PRDM15.G654W,MT.NRCAM.P838H,MT.SUMF2.G23A,MT.TP53.R157H|Median_Junction_Score:30712.94125|Lowest_Junction_Score:24345.72|All_Junction_Scores:24345.72,36498.27,28221.36,32071.19,26803.52,36869.38,25679.06,35215.03
GYVWFRFVEEGWWYIQSFCNHLKKL
LAKAKQYRLKGWGMSFLAGKKLKEG
HPRWEILIGRQNVTGGYYKREAKTK
FGAPAWSQPWFRGSTFVFSGGAATS
SLASTLSPSCRTTPQFFPSSYLPPH
RTGLIAHFGEGWPGWSRLRRLPDDK
DGTWICRFNGNHQPRISSLTNGVPI
KLGILFGRKSIWQKLRRLDGALSLR
KQSQWMTEVERHCPHFERCSRSDGL
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
>MT.FAT3.R4848T,MT.PEX1.V356I,MT.TP53.R157H,MT.DTX3L.G501R,MT.PRDM15.G654W,MT.NRCAM.P838H,MT.SUMF2.G23A,MT.ACSL3.S345N,MT.CASP10.S654R|Median_Junction_Score:28628.89|Lowest_Junction_Score:24253.18|All_Junction_Scores:26723.02,30563.6,30620.2,28746.49,36869.38,25679.06,24253.18,25576.19
SLASTLSPSCRTTPQFFPSSYLPPH
KTTWQVWLDPFIKEEGSEEFDFILP
KQSQWMTEVERHCPHFERCSRSDGL
LAKAKQYRLKGWGMSFLAGKKLKEG
RTGLIAHFGEGWPGWSRLRRLPDDK
DGTWICRFNGNHQPRISSLTNGVPI
KLGILFGRKSIWQKLRRLDGALSLR
HPRWEILIGRQNVTGGYYKREAKTK
GYVWFRFVEEGWWYIQSFCNHLKKL
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
>MT.CASP10.S654R,MT.DTX3L.G501R,MT.NRCAM.P838H,MT.POM121C.G3107R,MT.SUMF2.G23A,MT.ACSL3.S345N,MT.FAT3.R4848T,MT.PEX1.V356I,MT.TP53.R157H|Median_Junction_Score:28468.2575|Lowest_Junction_Score:24253.18|All_Junction_Scores:24345.72,31211.58,25464.36,29377.88,24253.18,35806.72,26723.02,30563.6
GYVWFRFVEEGWWYIQSFCNHLKKL
LAKAKQYRLKGWGMSFLAGKKLKEG
DGTWICRFNGNHQPRISSLTNGVPI
FGAPAWSQPWFRGSTFVFSGGAATS
KLGILFGRKSIWQKLRRLDGALSLR
HPRWEILIGRQNVTGGYYKREAKTK
SLASTLSPSCRTTPQFFPSSYLPPH
KTTWQVWLDPFIKEEGSEEFDFILP
KQSQWMTEVERHCPHFERCSRSDGL
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
>MT.CASP10.S654R,MT.DTX3L.G501R,MT.ACSL3.S345N,MT.POM121C.G3107R,MT.FAT3.R4848T,MT.PRDM15.G654W,MT.NRCAM.P838H,MT.PEX1.V356I,MT.TP53.R157H|Median_Junction_Score:30769.6675|Lowest_Junction_Score:24345.72|All_Junction_Scores:24345.72,36498.27,28221.36,32071.19,26803.52,36869.38,30784.3,30563.6
GYVWFRFVEEGWWYIQSFCNHLKKL
LAKAKQYRLKGWGMSFLAGKKLKEG
HPRWEILIGRQNVTGGYYKREAKTK
FGAPAWSQPWFRGSTFVFSGGAATS
SLASTLSPSCRTTPQFFPSSYLPPH
RTGLIAHFGEGWPGWSRLRRLPDDK
DGTWICRFNGNHQPRISSLTNGVPI
KTTWQVWLDPFIKEEGSEEFDFILP
KQSQWMTEVERHCPHFERCSRSDGL
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
>MT.POM121C.G3107R,MT.CASP10.S654R,MT.DTX3L.G501R,MT.NRCAM.P838H,MT.SUMF2.G23A,MT.ACSL3.S345N,MT.FAT3.R4848T,MT.PRDM15.G654W,MT.PEX1.V356I|Median_Junction_Score:28627.6125|Lowest_Junction_Score:23486.97|All_Junction_Scores:23486.97,24345.72,31211.58,25679.06,24253.18,35806.72,26803.52,37434.15
FGAPAWSQPWFRGSTFVFSGGAATS
GYVWFRFVEEGWWYIQSFCNHLKKL
LAKAKQYRLKGWGMSFLAGKKLKEG
DGTWICRFNGNHQPRISSLTNGVPI
KLGILFGRKSIWQKLRRLDGALSLR
HPRWEILIGRQNVTGGYYKREAKTK
SLASTLSPSCRTTPQFFPSSYLPPH
RTGLIAHFGEGWPGWSRLRRLPDDK
KTTWQVWLDPFIKEEGSEEFDFILP
Loading

0 comments on commit 5b0b39b

Please sign in to comment.