Merge pull request #1017 from griffithlab/ref_seq_mouse

Update reference proteome similarity step to work with nonhuman data
griffithlab · Aug 9, 2023 · 2dea4a7 · 2dea4a7
2 parents af74bab + b95f8c0
commit 2dea4a7
Show file tree

Hide file tree

Showing 8 changed files with 281 additions and 27 deletions.
diff --git a/pvactools/lib/calculate_reference_proteome_similarity.py b/pvactools/lib/calculate_reference_proteome_similarity.py
@@ -269,7 +269,7 @@ def _input_tsv_type(self, line):
     def _get_full_peptide(self, line, mt_records_dict, wt_records_dict):
         for record_id in mt_records_dict.keys():
             (rest_record_id, variant_type, aa_change) = record_id.rsplit(".", 2)
-            transcript_regex = '^.*(ENST[0-9|.]+)$'
+            transcript_regex = '^.*(ENS[0-9|A-Z|.]+)$'
             transcript_p = re.compile(transcript_regex)
             m = transcript_p.match(rest_record_id)
             if m:
@@ -522,7 +522,7 @@ def _write_outputs(self, processed_peptides, mt_records_dict, wt_records_dict):
                         for query_window, hit_reference_matches in groupby(metric_lines,key=lambda x:x['Match Window']):
                             hit_reference_matches = list(hit_reference_matches)
                             gene_regex = '^.*gene_symbol:([0-9|A-Z]+).*$'
-                            transcript_regex = '^.*transcript:(ENST[0-9|.]+).*$'
+                            transcript_regex = '^.*transcript:(ENS[0-9|A-Z|.]+).*$'
                             gene_p = re.compile(gene_regex)
                             transcript_p = re.compile(transcript_regex)
                             genes = []

diff --git a/tests/test_calculate_reference_proteome_similarity.py b/tests/test_calculate_reference_proteome_similarity.py
@@ -90,6 +90,29 @@ def test_calculate_self_similarity_with_aggregated_tsv_and_peptide_fasta(self):
         ))
         os.remove(metric_file)
 
+    def test_calculate_self_similarity_with_aggregated_tsv_and_peptide_fasta_mouse(self):
+        input_file = os.path.join(self.test_data_dir, 'Test.all_epitopes.aggregated.mouse.tsv')
+        input_aggregated_metrics_file = os.path.join(self.test_data_dir, 'Test.all_epitopes.aggregated.mouse.tsv.metrics.json')
+        tmp_aggregated_metrics_file = tempfile.NamedTemporaryFile()
+        import shutil
+        shutil.copy(input_aggregated_metrics_file, tmp_aggregated_metrics_file.name)
+        input_fasta = os.path.join(self.test_data_dir, 'Test.mouse.fasta')
+        output_file = tempfile.NamedTemporaryFile(suffix='.tsv')
+        metric_file = "{}.reference_matches".format(output_file.name)
+        output_aggregated_metrics_file = output_file.name.replace(".tsv", ".metrics.json")
+        self.assertFalse(CalculateReferenceProteomeSimilarity(
+            input_file,
+            input_fasta,
+            output_file.name,
+            peptide_fasta=self.peptide_fasta,
+            aggregate_metrics_file=tmp_aggregated_metrics_file.name,
+        ).execute())
+        self.assertTrue(cmp(
+            output_file.name,
+            os.path.join(self.test_data_dir, "output.aggregated.peptide_fasta.mouse.tsv"),
+        ))
+        os.remove(metric_file)
+
     def test_wt_peptide_fully_in_mt_peptide(self):
         input_file = os.path.join(self.test_data_dir, 'input_wt_in_mt.tsv')
         input_fasta = os.path.join(self.test_data_dir, 'input_wt_in_mt.fasta')

diff --git a/.../test_data/calculate_reference_proteome_similarity/Test.all_epitopes.aggregated.mouse.tsv b/.../test_data/calculate_reference_proteome_similarity/Test.all_epitopes.aggregated.mouse.tsv
@@ -0,0 +1,2 @@
+ID	E*01:01	G*01:09	Gene	AA Change	Num Passing Transcripts	Best Peptide	Best Transcript	TSL	Allele	Pos	Prob Pos	Num Passing Peptides	IC50 MT	IC50 WT	%ile MT	%ile WT	RNA Expr	RNA VAF	Allele Expr	RNA Depth	DNA VAF	Tier	Evaluation
+22-41920894-41920895-G-C	2	1	ACO2	N1453S	1	KFNPQTDYL	ENSMUST00000027032	Not Supported	HLA-G*01:09	5	None	3	1262.760	1318.61	0.500	0.6	NA	NA	NA	NA	0.250	Poor	Pending
diff --git a/...lculate_reference_proteome_similarity/Test.all_epitopes.aggregated.mouse.tsv.metrics.json b/...lculate_reference_proteome_similarity/Test.all_epitopes.aggregated.mouse.tsv.metrics.json
@@ -0,0 +1,247 @@
+{
+  "tumor_purity": null,
+  "vaf_clonal": 0.571,
+  "vaf_subclonal": 0.286,
+  "binding_threshold": 500,
+  "aggregate_inclusion_binding_threshold": 5000,
+  "trna_vaf": 0.25,
+  "trna_cov": 10,
+  "allele_expr_threshold": 2.5,
+  "maximum_transcript_support_level": 1,
+  "percentile_threshold": null,
+  "allele_specific_binding_thresholds": false,
+  "mt_top_score_metric": "Best",
+  "wt_top_score_metric": "Corresponding",
+  "binding_cutoffs": {
+    "HLA-E*01:01": 500,
+    "HLA-G*01:09": 500
+  },
+  "is_allele_specific_binding_cutoff": {
+    "HLA-E*01:01": false,
+    "HLA-G*01:09": false
+  },
+  "allele_specific_anchors": false,
+  "anchor_contribution_threshold": 0.8,
+  "22-41920894-41920895-G-C": {
+    "good_binders": {
+      "Transcript Set 1": {
+        "peptides": {
+          "LLPLLPLLL": {
+            "ic50s_MT": [
+              2551.25,
+              4778.52001953125
+            ],
+            "percentiles_MT": [
+              0.1700439453125,
+              6.80078125
+            ],
+            "ic50s_WT": [
+              3099.81,
+              4830.5
+            ],
+            "percentiles_WT": [
+              0.33,
+              6.9
+            ],
+            "hla_types": [
+              "HLA-E*01:01",
+              "HLA-G*01:09"
+            ],
+            "mutation_position": "6-7",
+            "problematic_positions": "None",
+            "individual_ic50_calls": {
+              "algorithms": [
+                "NetMHC",
+                "PickPocket"
+              ],
+              "MT": {
+                "HLA-E*01:01": [
+                  6891.60986328125,
+                  2551.25
+                ],
+                "HLA-G*01:09": [
+                  "NA",
+                  4778.52001953125
+                ]
+              },
+              "WT": {
+                "HLA-E*01:01": [
+                  9234.71,
+                  3099.81
+                ],
+                "HLA-G*01:09": [
+                  "NA",
+                  4830.5
+                ]
+              }
+            },
+            "individual_percentile_calls": {
+              "algorithms": [
+                "NetMHC",
+                "PickPocket"
+              ],
+              "MT": {
+                "HLA-E*01:01": [
+                  0.1700439453125,
+                  1.400390625
+                ],
+                "HLA-G*01:09": [
+                  "NA",
+                  6.80078125
+                ]
+              },
+              "WT": {
+                "HLA-E*01:01": [
+                  0.33,
+                  2.2
+                ],
+                "HLA-G*01:09": [
+                  "NA",
+                  6.9
+                ]
+              }
+            },
+            "individual_el_calls": {
+              "algorithms": [],
+              "MT": {
+                "HLA-E*01:01": [],
+                "HLA-G*01:09": []
+              },
+              "WT": {
+                "HLA-E*01:01": [],
+                "HLA-G*01:09": []
+              }
+            },
+            "individual_el_percentile_calls": {
+              "algorithms": [],
+              "MT": {
+                "HLA-E*01:01": [],
+                "HLA-G*01:09": []
+              },
+              "WT": {
+                "HLA-E*01:01": [],
+                "HLA-G*01:09": []
+              }
+            },
+            "wt_peptide": "LLPLLLLLL"
+          },
+          "LLPLLLLLG": {
+            "ic50s_MT": [
+              4676.22998046875,
+              "X"
+            ],
+            "percentiles_MT": [
+              5.8984375,
+              "X"
+            ],
+            "ic50s_WT": [
+              38565.12,
+              "X"
+            ],
+            "percentiles_WT": [
+              81.0,
+              "X"
+            ],
+            "hla_types": [
+              "HLA-E*01:01",
+              "HLA-G*01:09"
+            ],
+            "mutation_position": "3-4",
+            "problematic_positions": "None",
+            "individual_ic50_calls": {
+              "algorithms": [
+                "NetMHC",
+                "PickPocket"
+              ],
+              "MT": {
+                "HLA-E*01:01": [
+                  31981.0703125,
+                  4676.22998046875
+                ]
+              },
+              "WT": {
+                "HLA-E*01:01": [
+                  42768.73,
+                  38565.12
+                ]
+              }
+            },
+            "individual_percentile_calls": {
+              "algorithms": [
+                "NetMHC",
+                "PickPocket"
+              ],
+              "MT": {
+                "HLA-E*01:01": [
+                  14.0,
+                  5.8984375
+                ]
+              },
+              "WT": {
+                "HLA-E*01:01": [
+                  60.0,
+                  81.0
+                ]
+              }
+            },
+            "individual_el_calls": {
+              "algorithms": [],
+              "MT": {
+                "HLA-E*01:01": []
+              },
+              "WT": {
+                "HLA-E*01:01": []
+              }
+            },
+            "individual_el_percentile_calls": {
+              "algorithms": [],
+              "MT": {
+                "HLA-E*01:01": []
+              },
+              "WT": {
+                "HLA-E*01:01": []
+              }
+            },
+            "wt_peptide": "LPLLLLLLG"
+          }
+        },
+        "transcripts": [
+          "ENST00000233809.4-IGFBP2-L/LLP-20"
+        ],
+        "transcript_expr": [
+          "NA"
+        ],
+        "tsl": [
+          "Not Supported"
+        ],
+        "biotype": [
+          "protein_coding"
+        ],
+        "transcript_length": [
+          325
+        ],
+        "transcript_count": 1,
+        "peptide_count": 2,
+        "total_expr": 0
+      }
+    },
+    "sets": [
+      "Transcript Set 1"
+    ],
+    "transcript_counts": [
+      1
+    ],
+    "peptide_counts": [
+      2
+    ],
+    "set_expr": [
+      0
+    ],
+    "DNA VAF": 0.891,
+    "RNA VAF": "NA",
+    "gene_expr": "NA",
+    "best_peptide_mt": "LLPLLPLLL",
+    "best_peptide_wt": "LLPLLLLLL",
+    "best_hla_allele": "HLA-E*01:01"
+  }
+}
diff --git a/...ata/calculate_reference_proteome_similarity/Test.all_epitopes.aggregated.tsv.metrics.json b/...ata/calculate_reference_proteome_similarity/Test.all_epitopes.aggregated.tsv.metrics.json
@@ -3275,4 +3275,4 @@
     "best_peptide_wt": "ATLSRTLLA",
     "best_hla_allele": "HLA-E*01:01"
   }
-}
+}
diff --git a/tests/test_data/calculate_reference_proteome_similarity/Test.mouse.fasta b/tests/test_data/calculate_reference_proteome_similarity/Test.mouse.fasta
@@ -0,0 +1,4 @@
+>WT.Rp1.ENSMUST00000027032.missense.1453N/S
+IAGTLKFNPETDYLTGTDG
+>MT.Rp1.ENSMUST00000027032.missense.1453N/S
+IAGTLKFNPQTDYLTGTDG
diff --git a/tests/test_data/calculate_reference_proteome_similarity/input.aggregated.tsv b/tests/test_data/calculate_reference_proteome_similarity/input.aggregated.tsv
diff --git a/...st_data/calculate_reference_proteome_similarity/output.aggregated.peptide_fasta.mouse.tsv b/...st_data/calculate_reference_proteome_similarity/output.aggregated.peptide_fasta.mouse.tsv
@@ -0,0 +1,2 @@
+ID	E*01:01	G*01:09	Gene	AA Change	Num Passing Transcripts	Best Peptide	Best Transcript	TSL	Allele	Pos	Prob Pos	Num Passing Peptides	IC50 MT	IC50 WT	%ile MT	%ile WT	RNA Expr	RNA VAF	Allele Expr	RNA Depth	DNA VAF	Tier	Ref Match	Evaluation
+22-41920894-41920895-G-C	2	1	ACO2	N1453S	1	KFNPQTDYL	ENSMUST00000027032	Not Supported	HLA-G*01:09	5	None	3	1262.760	1318.61	0.500	0.6	NA	NA	NA	NA	0.250	Poor	False	Pending
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		ID E*01:01 G*01:09 Gene AA Change Num Passing Transcripts Best Peptide Best Transcript TSL Allele Pos Prob Pos Num Passing Peptides IC50 MT IC50 WT %ile MT %ile WT RNA Expr RNA VAF Allele Expr RNA Depth DNA VAF Tier Evaluation
		22-41920894-41920895-G-C 2 1 ACO2 N1453S 1 KFNPQTDYL ENSMUST00000027032 Not Supported HLA-G*01:09 5 None 3 1262.760 1318.61 0.500 0.6 NA NA NA NA 0.250 Poor Pending
-Original file line number
+Diff line change
@@ -3275,4 +3275,4 @@
         "best_peptide_wt": "ATLSRTLLA",
         "best_hla_allele": "HLA-E*01:01"
       }
-    }
+    }