Skip to content

Commit b95f8c0

Browse files
Update reference proteome similarity step to work with nonhuman data
1 parent af74bab commit b95f8c0

8 files changed

+281
-27
lines changed

pvactools/lib/calculate_reference_proteome_similarity.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -269,7 +269,7 @@ def _input_tsv_type(self, line):
269269
def _get_full_peptide(self, line, mt_records_dict, wt_records_dict):
270270
for record_id in mt_records_dict.keys():
271271
(rest_record_id, variant_type, aa_change) = record_id.rsplit(".", 2)
272-
transcript_regex = '^.*(ENST[0-9|.]+)$'
272+
transcript_regex = '^.*(ENS[0-9|A-Z|.]+)$'
273273
transcript_p = re.compile(transcript_regex)
274274
m = transcript_p.match(rest_record_id)
275275
if m:
@@ -522,7 +522,7 @@ def _write_outputs(self, processed_peptides, mt_records_dict, wt_records_dict):
522522
for query_window, hit_reference_matches in groupby(metric_lines,key=lambda x:x['Match Window']):
523523
hit_reference_matches = list(hit_reference_matches)
524524
gene_regex = '^.*gene_symbol:([0-9|A-Z]+).*$'
525-
transcript_regex = '^.*transcript:(ENST[0-9|.]+).*$'
525+
transcript_regex = '^.*transcript:(ENS[0-9|A-Z|.]+).*$'
526526
gene_p = re.compile(gene_regex)
527527
transcript_p = re.compile(transcript_regex)
528528
genes = []

tests/test_calculate_reference_proteome_similarity.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,29 @@ def test_calculate_self_similarity_with_aggregated_tsv_and_peptide_fasta(self):
9090
))
9191
os.remove(metric_file)
9292

93+
def test_calculate_self_similarity_with_aggregated_tsv_and_peptide_fasta_mouse(self):
94+
input_file = os.path.join(self.test_data_dir, 'Test.all_epitopes.aggregated.mouse.tsv')
95+
input_aggregated_metrics_file = os.path.join(self.test_data_dir, 'Test.all_epitopes.aggregated.mouse.tsv.metrics.json')
96+
tmp_aggregated_metrics_file = tempfile.NamedTemporaryFile()
97+
import shutil
98+
shutil.copy(input_aggregated_metrics_file, tmp_aggregated_metrics_file.name)
99+
input_fasta = os.path.join(self.test_data_dir, 'Test.mouse.fasta')
100+
output_file = tempfile.NamedTemporaryFile(suffix='.tsv')
101+
metric_file = "{}.reference_matches".format(output_file.name)
102+
output_aggregated_metrics_file = output_file.name.replace(".tsv", ".metrics.json")
103+
self.assertFalse(CalculateReferenceProteomeSimilarity(
104+
input_file,
105+
input_fasta,
106+
output_file.name,
107+
peptide_fasta=self.peptide_fasta,
108+
aggregate_metrics_file=tmp_aggregated_metrics_file.name,
109+
).execute())
110+
self.assertTrue(cmp(
111+
output_file.name,
112+
os.path.join(self.test_data_dir, "output.aggregated.peptide_fasta.mouse.tsv"),
113+
))
114+
os.remove(metric_file)
115+
93116
def test_wt_peptide_fully_in_mt_peptide(self):
94117
input_file = os.path.join(self.test_data_dir, 'input_wt_in_mt.tsv')
95118
input_fasta = os.path.join(self.test_data_dir, 'input_wt_in_mt.fasta')
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
ID E*01:01 G*01:09 Gene AA Change Num Passing Transcripts Best Peptide Best Transcript TSL Allele Pos Prob Pos Num Passing Peptides IC50 MT IC50 WT %ile MT %ile WT RNA Expr RNA VAF Allele Expr RNA Depth DNA VAF Tier Evaluation
2+
22-41920894-41920895-G-C 2 1 ACO2 N1453S 1 KFNPQTDYL ENSMUST00000027032 Not Supported HLA-G*01:09 5 None 3 1262.760 1318.61 0.500 0.6 NA NA NA NA 0.250 Poor Pending
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,247 @@
1+
{
2+
"tumor_purity": null,
3+
"vaf_clonal": 0.571,
4+
"vaf_subclonal": 0.286,
5+
"binding_threshold": 500,
6+
"aggregate_inclusion_binding_threshold": 5000,
7+
"trna_vaf": 0.25,
8+
"trna_cov": 10,
9+
"allele_expr_threshold": 2.5,
10+
"maximum_transcript_support_level": 1,
11+
"percentile_threshold": null,
12+
"allele_specific_binding_thresholds": false,
13+
"mt_top_score_metric": "Best",
14+
"wt_top_score_metric": "Corresponding",
15+
"binding_cutoffs": {
16+
"HLA-E*01:01": 500,
17+
"HLA-G*01:09": 500
18+
},
19+
"is_allele_specific_binding_cutoff": {
20+
"HLA-E*01:01": false,
21+
"HLA-G*01:09": false
22+
},
23+
"allele_specific_anchors": false,
24+
"anchor_contribution_threshold": 0.8,
25+
"22-41920894-41920895-G-C": {
26+
"good_binders": {
27+
"Transcript Set 1": {
28+
"peptides": {
29+
"LLPLLPLLL": {
30+
"ic50s_MT": [
31+
2551.25,
32+
4778.52001953125
33+
],
34+
"percentiles_MT": [
35+
0.1700439453125,
36+
6.80078125
37+
],
38+
"ic50s_WT": [
39+
3099.81,
40+
4830.5
41+
],
42+
"percentiles_WT": [
43+
0.33,
44+
6.9
45+
],
46+
"hla_types": [
47+
"HLA-E*01:01",
48+
"HLA-G*01:09"
49+
],
50+
"mutation_position": "6-7",
51+
"problematic_positions": "None",
52+
"individual_ic50_calls": {
53+
"algorithms": [
54+
"NetMHC",
55+
"PickPocket"
56+
],
57+
"MT": {
58+
"HLA-E*01:01": [
59+
6891.60986328125,
60+
2551.25
61+
],
62+
"HLA-G*01:09": [
63+
"NA",
64+
4778.52001953125
65+
]
66+
},
67+
"WT": {
68+
"HLA-E*01:01": [
69+
9234.71,
70+
3099.81
71+
],
72+
"HLA-G*01:09": [
73+
"NA",
74+
4830.5
75+
]
76+
}
77+
},
78+
"individual_percentile_calls": {
79+
"algorithms": [
80+
"NetMHC",
81+
"PickPocket"
82+
],
83+
"MT": {
84+
"HLA-E*01:01": [
85+
0.1700439453125,
86+
1.400390625
87+
],
88+
"HLA-G*01:09": [
89+
"NA",
90+
6.80078125
91+
]
92+
},
93+
"WT": {
94+
"HLA-E*01:01": [
95+
0.33,
96+
2.2
97+
],
98+
"HLA-G*01:09": [
99+
"NA",
100+
6.9
101+
]
102+
}
103+
},
104+
"individual_el_calls": {
105+
"algorithms": [],
106+
"MT": {
107+
"HLA-E*01:01": [],
108+
"HLA-G*01:09": []
109+
},
110+
"WT": {
111+
"HLA-E*01:01": [],
112+
"HLA-G*01:09": []
113+
}
114+
},
115+
"individual_el_percentile_calls": {
116+
"algorithms": [],
117+
"MT": {
118+
"HLA-E*01:01": [],
119+
"HLA-G*01:09": []
120+
},
121+
"WT": {
122+
"HLA-E*01:01": [],
123+
"HLA-G*01:09": []
124+
}
125+
},
126+
"wt_peptide": "LLPLLLLLL"
127+
},
128+
"LLPLLLLLG": {
129+
"ic50s_MT": [
130+
4676.22998046875,
131+
"X"
132+
],
133+
"percentiles_MT": [
134+
5.8984375,
135+
"X"
136+
],
137+
"ic50s_WT": [
138+
38565.12,
139+
"X"
140+
],
141+
"percentiles_WT": [
142+
81.0,
143+
"X"
144+
],
145+
"hla_types": [
146+
"HLA-E*01:01",
147+
"HLA-G*01:09"
148+
],
149+
"mutation_position": "3-4",
150+
"problematic_positions": "None",
151+
"individual_ic50_calls": {
152+
"algorithms": [
153+
"NetMHC",
154+
"PickPocket"
155+
],
156+
"MT": {
157+
"HLA-E*01:01": [
158+
31981.0703125,
159+
4676.22998046875
160+
]
161+
},
162+
"WT": {
163+
"HLA-E*01:01": [
164+
42768.73,
165+
38565.12
166+
]
167+
}
168+
},
169+
"individual_percentile_calls": {
170+
"algorithms": [
171+
"NetMHC",
172+
"PickPocket"
173+
],
174+
"MT": {
175+
"HLA-E*01:01": [
176+
14.0,
177+
5.8984375
178+
]
179+
},
180+
"WT": {
181+
"HLA-E*01:01": [
182+
60.0,
183+
81.0
184+
]
185+
}
186+
},
187+
"individual_el_calls": {
188+
"algorithms": [],
189+
"MT": {
190+
"HLA-E*01:01": []
191+
},
192+
"WT": {
193+
"HLA-E*01:01": []
194+
}
195+
},
196+
"individual_el_percentile_calls": {
197+
"algorithms": [],
198+
"MT": {
199+
"HLA-E*01:01": []
200+
},
201+
"WT": {
202+
"HLA-E*01:01": []
203+
}
204+
},
205+
"wt_peptide": "LPLLLLLLG"
206+
}
207+
},
208+
"transcripts": [
209+
"ENST00000233809.4-IGFBP2-L/LLP-20"
210+
],
211+
"transcript_expr": [
212+
"NA"
213+
],
214+
"tsl": [
215+
"Not Supported"
216+
],
217+
"biotype": [
218+
"protein_coding"
219+
],
220+
"transcript_length": [
221+
325
222+
],
223+
"transcript_count": 1,
224+
"peptide_count": 2,
225+
"total_expr": 0
226+
}
227+
},
228+
"sets": [
229+
"Transcript Set 1"
230+
],
231+
"transcript_counts": [
232+
1
233+
],
234+
"peptide_counts": [
235+
2
236+
],
237+
"set_expr": [
238+
0
239+
],
240+
"DNA VAF": 0.891,
241+
"RNA VAF": "NA",
242+
"gene_expr": "NA",
243+
"best_peptide_mt": "LLPLLPLLL",
244+
"best_peptide_wt": "LLPLLLLLL",
245+
"best_hla_allele": "HLA-E*01:01"
246+
}
247+
}

tests/test_data/calculate_reference_proteome_similarity/Test.all_epitopes.aggregated.tsv.metrics.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3275,4 +3275,4 @@
32753275
"best_peptide_wt": "ATLSRTLLA",
32763276
"best_hla_allele": "HLA-E*01:01"
32773277
}
3278-
}
3278+
}
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
>WT.Rp1.ENSMUST00000027032.missense.1453N/S
2+
IAGTLKFNPETDYLTGTDG
3+
>MT.Rp1.ENSMUST00000027032.missense.1453N/S
4+
IAGTLKFNPQTDYLTGTDG

tests/test_data/calculate_reference_proteome_similarity/input.aggregated.tsv

Lines changed: 0 additions & 24 deletions
This file was deleted.
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
ID E*01:01 G*01:09 Gene AA Change Num Passing Transcripts Best Peptide Best Transcript TSL Allele Pos Prob Pos Num Passing Peptides IC50 MT IC50 WT %ile MT %ile WT RNA Expr RNA VAF Allele Expr RNA Depth DNA VAF Tier Ref Match Evaluation
2+
22-41920894-41920895-G-C 2 1 ACO2 N1453S 1 KFNPQTDYL ENSMUST00000027032 Not Supported HLA-G*01:09 5 None 3 1262.760 1318.61 0.500 0.6 NA NA NA NA 0.250 Poor False Pending

0 commit comments

Comments
 (0)