Merge remote-tracking branch 'origin/hotfix'

griffithlab · Aug 9, 2023 · 3317d2c · 3317d2c
2 parents 1307ab0 + 12fa00e
commit 3317d2c
Show file tree

Hide file tree

Showing 12 changed files with 293 additions and 46 deletions.
diff --git a/docs/conf.py b/docs/conf.py
@@ -70,7 +70,7 @@
 # The short X.Y version.
 version = '4.0'
 # The full version, including alpha/beta/rc tags.
-release = '4.0.2'
+release = '4.0.3'
 
 
 # The language for content autogenerated by Sphinx. Refer to documentation

diff --git a/docs/index.rst b/docs/index.rst
@@ -56,23 +56,8 @@ New in Release |release|
 
 This is a bugfix release. It fixes the following problem(s):
 
-- Arriba annotated fusion sequences may contain characters that aren't
-  supported. This update skips such sequences.
-- The ``--aggregate-report-evaluation`` parameter in the standalone ``pvacseq
-  generate_protein_fasta`` command was previously set up with
-  nargs in order to allow specifying multiple values. However, this
-  conflicts with required positional parameters. The parameter definiton was
-  updated so that multiple values are now specified as a comma-separated list.
-- pVACfuse would previously fail in an odd way when none of the fusions in the
-  input were processable. This update now exits pVACfuse more gracefully in
-  this case.
-- The reference proteome similarity step would previously fail when an epitope's
-  full peptide sequence wasn't found in the input fasta. It now skips such
-  epitopes and marks the Reference Match column as ``Not Run``.
-- There was a mismatch in how proximal variants were incorporated into the
-  n-mer fasta files vs the "master" fasta file which had the potential of
-  epitopes not being present in the "master" fasta file. This update brings
-  both file creation steps in sync.
+- This release fixes an issue in the reference proteome similarity step in
+  pVACseq where running with non-human data would cause an error.
 
 New in Version |version|
 ------------------------

diff --git a/docs/releases/4_0.rst b/docs/releases/4_0.rst
@@ -98,3 +98,11 @@ This is a bugfix release. It fixes the following problem(s):
   n-mer fasta files vs the "master" fasta file which had the potential of
   epitopes not being present in the "master" fasta file. This update brings
   both file creation steps in sync.
+
+New in Version 4.0.3
+--------------------
+
+This is a bugfix release. It fixes the following problem(s):
+
+- This release fixes an issue in the reference proteome similarity step in
+  pVACseq where running with non-human data would cause an error.
diff --git a/pvactools/lib/calculate_reference_proteome_similarity.py b/pvactools/lib/calculate_reference_proteome_similarity.py
@@ -269,7 +269,7 @@ def _input_tsv_type(self, line):
     def _get_full_peptide(self, line, mt_records_dict, wt_records_dict):
         for record_id in mt_records_dict.keys():
             (rest_record_id, variant_type, aa_change) = record_id.rsplit(".", 2)
-            transcript_regex = '^.*(ENST[0-9|.]+)$'
+            transcript_regex = '^.*(ENS[0-9|A-Z|.]+)$'
             transcript_p = re.compile(transcript_regex)
             m = transcript_p.match(rest_record_id)
             if m:
@@ -522,7 +522,7 @@ def _write_outputs(self, processed_peptides, mt_records_dict, wt_records_dict):
                         for query_window, hit_reference_matches in groupby(metric_lines,key=lambda x:x['Match Window']):
                             hit_reference_matches = list(hit_reference_matches)
                             gene_regex = '^.*gene_symbol:([0-9|A-Z]+).*$'
-                            transcript_regex = '^.*transcript:(ENST[0-9|.]+).*$'
+                            transcript_regex = '^.*transcript:(ENS[0-9|A-Z|.]+).*$'
                             gene_p = re.compile(gene_regex)
                             transcript_p = re.compile(transcript_regex)
                             genes = []

diff --git a/setup.py b/setup.py
@@ -51,7 +51,7 @@
 
 setup(
     name="pvactools",
-    version="4.0.2",
+    version="4.0.3",
     packages=[
         "pvactools.tools",
         "pvactools.tools.pvacbind",

diff --git a/tests/test_calculate_reference_proteome_similarity.py b/tests/test_calculate_reference_proteome_similarity.py
@@ -90,6 +90,29 @@ def test_calculate_self_similarity_with_aggregated_tsv_and_peptide_fasta(self):
         ))
         os.remove(metric_file)
 
+    def test_calculate_self_similarity_with_aggregated_tsv_and_peptide_fasta_mouse(self):
+        input_file = os.path.join(self.test_data_dir, 'Test.all_epitopes.aggregated.mouse.tsv')
+        input_aggregated_metrics_file = os.path.join(self.test_data_dir, 'Test.all_epitopes.aggregated.mouse.tsv.metrics.json')
+        tmp_aggregated_metrics_file = tempfile.NamedTemporaryFile()
+        import shutil
+        shutil.copy(input_aggregated_metrics_file, tmp_aggregated_metrics_file.name)
+        input_fasta = os.path.join(self.test_data_dir, 'Test.mouse.fasta')
+        output_file = tempfile.NamedTemporaryFile(suffix='.tsv')
+        metric_file = "{}.reference_matches".format(output_file.name)
+        output_aggregated_metrics_file = output_file.name.replace(".tsv", ".metrics.json")
+        self.assertFalse(CalculateReferenceProteomeSimilarity(
+            input_file,
+            input_fasta,
+            output_file.name,
+            peptide_fasta=self.peptide_fasta,
+            aggregate_metrics_file=tmp_aggregated_metrics_file.name,
+        ).execute())
+        self.assertTrue(cmp(
+            output_file.name,
+            os.path.join(self.test_data_dir, "output.aggregated.peptide_fasta.mouse.tsv"),
+        ))
+        os.remove(metric_file)
+
     def test_wt_peptide_fully_in_mt_peptide(self):
         input_file = os.path.join(self.test_data_dir, 'input_wt_in_mt.tsv')
         input_fasta = os.path.join(self.test_data_dir, 'input_wt_in_mt.fasta')

diff --git a/.../test_data/calculate_reference_proteome_similarity/Test.all_epitopes.aggregated.mouse.tsv b/.../test_data/calculate_reference_proteome_similarity/Test.all_epitopes.aggregated.mouse.tsv
@@ -0,0 +1,2 @@
+ID	E*01:01	G*01:09	Gene	AA Change	Num Passing Transcripts	Best Peptide	Best Transcript	TSL	Allele	Pos	Prob Pos	Num Passing Peptides	IC50 MT	IC50 WT	%ile MT	%ile WT	RNA Expr	RNA VAF	Allele Expr	RNA Depth	DNA VAF	Tier	Evaluation
+22-41920894-41920895-G-C	2	1	ACO2	N1453S	1	KFNPQTDYL	ENSMUST00000027032	Not Supported	HLA-G*01:09	5	None	3	1262.760	1318.61	0.500	0.6	NA	NA	NA	NA	0.250	Poor	Pending
diff --git a/...lculate_reference_proteome_similarity/Test.all_epitopes.aggregated.mouse.tsv.metrics.json b/...lculate_reference_proteome_similarity/Test.all_epitopes.aggregated.mouse.tsv.metrics.json
@@ -0,0 +1,247 @@
+{
+  "tumor_purity": null,
+  "vaf_clonal": 0.571,
+  "vaf_subclonal": 0.286,
+  "binding_threshold": 500,
+  "aggregate_inclusion_binding_threshold": 5000,
+  "trna_vaf": 0.25,
+  "trna_cov": 10,
+  "allele_expr_threshold": 2.5,
+  "maximum_transcript_support_level": 1,
+  "percentile_threshold": null,
+  "allele_specific_binding_thresholds": false,
+  "mt_top_score_metric": "Best",
+  "wt_top_score_metric": "Corresponding",
+  "binding_cutoffs": {
+    "HLA-E*01:01": 500,
+    "HLA-G*01:09": 500
+  },
+  "is_allele_specific_binding_cutoff": {
+    "HLA-E*01:01": false,
+    "HLA-G*01:09": false
+  },
+  "allele_specific_anchors": false,
+  "anchor_contribution_threshold": 0.8,
+  "22-41920894-41920895-G-C": {
+    "good_binders": {
+      "Transcript Set 1": {
+        "peptides": {
+          "LLPLLPLLL": {
+            "ic50s_MT": [
+              2551.25,
+              4778.52001953125
+            ],
+            "percentiles_MT": [
+              0.1700439453125,
+              6.80078125
+            ],
+            "ic50s_WT": [
+              3099.81,
+              4830.5
+            ],
+            "percentiles_WT": [
+              0.33,
+              6.9
+            ],
+            "hla_types": [
+              "HLA-E*01:01",
+              "HLA-G*01:09"
+            ],
+            "mutation_position": "6-7",
+            "problematic_positions": "None",
+            "individual_ic50_calls": {
+              "algorithms": [
+                "NetMHC",
+                "PickPocket"
+              ],
+              "MT": {
+                "HLA-E*01:01": [
+                  6891.60986328125,
+                  2551.25
+                ],
+                "HLA-G*01:09": [
+                  "NA",
+                  4778.52001953125
+                ]
+              },
+              "WT": {
+                "HLA-E*01:01": [
+                  9234.71,
+                  3099.81
+                ],
+                "HLA-G*01:09": [
+                  "NA",
+                  4830.5
+                ]
+              }
+            },
+            "individual_percentile_calls": {
+              "algorithms": [
+                "NetMHC",
+                "PickPocket"
+              ],
+              "MT": {
+                "HLA-E*01:01": [
+                  0.1700439453125,
+                  1.400390625
+                ],
+                "HLA-G*01:09": [
+                  "NA",
+                  6.80078125
+                ]
+              },
+              "WT": {
+                "HLA-E*01:01": [
+                  0.33,
+                  2.2
+                ],
+                "HLA-G*01:09": [
+                  "NA",
+                  6.9
+                ]
+              }
+            },
+            "individual_el_calls": {
+              "algorithms": [],
+              "MT": {
+                "HLA-E*01:01": [],
+                "HLA-G*01:09": []
+              },
+              "WT": {
+                "HLA-E*01:01": [],
+                "HLA-G*01:09": []
+              }
+            },
+            "individual_el_percentile_calls": {
+              "algorithms": [],
+              "MT": {
+                "HLA-E*01:01": [],
+                "HLA-G*01:09": []
+              },
+              "WT": {
+                "HLA-E*01:01": [],
+                "HLA-G*01:09": []
+              }
+            },
+            "wt_peptide": "LLPLLLLLL"
+          },
+          "LLPLLLLLG": {
+            "ic50s_MT": [
+              4676.22998046875,
+              "X"
+            ],
+            "percentiles_MT": [
+              5.8984375,
+              "X"
+            ],
+            "ic50s_WT": [
+              38565.12,
+              "X"
+            ],
+            "percentiles_WT": [
+              81.0,
+              "X"
+            ],
+            "hla_types": [
+              "HLA-E*01:01",
+              "HLA-G*01:09"
+            ],
+            "mutation_position": "3-4",
+            "problematic_positions": "None",
+            "individual_ic50_calls": {
+              "algorithms": [
+                "NetMHC",
+                "PickPocket"
+              ],
+              "MT": {
+                "HLA-E*01:01": [
+                  31981.0703125,
+                  4676.22998046875
+                ]
+              },
+              "WT": {
+                "HLA-E*01:01": [
+                  42768.73,
+                  38565.12
+                ]
+              }
+            },
+            "individual_percentile_calls": {
+              "algorithms": [
+                "NetMHC",
+                "PickPocket"
+              ],
+              "MT": {
+                "HLA-E*01:01": [
+                  14.0,
+                  5.8984375
+                ]
+              },
+              "WT": {
+                "HLA-E*01:01": [
+                  60.0,
+                  81.0
+                ]
+              }
+            },
+            "individual_el_calls": {
+              "algorithms": [],
+              "MT": {
+                "HLA-E*01:01": []
+              },
+              "WT": {
+                "HLA-E*01:01": []
+              }
+            },
+            "individual_el_percentile_calls": {
+              "algorithms": [],
+              "MT": {
+                "HLA-E*01:01": []
+              },
+              "WT": {
+                "HLA-E*01:01": []
+              }
+            },
+            "wt_peptide": "LPLLLLLLG"
+          }
+        },
+        "transcripts": [
+          "ENST00000233809.4-IGFBP2-L/LLP-20"
+        ],
+        "transcript_expr": [
+          "NA"
+        ],
+        "tsl": [
+          "Not Supported"
+        ],
+        "biotype": [
+          "protein_coding"
+        ],
+        "transcript_length": [
+          325
+        ],
+        "transcript_count": 1,
+        "peptide_count": 2,
+        "total_expr": 0
+      }
+    },
+    "sets": [
+      "Transcript Set 1"
+    ],
+    "transcript_counts": [
+      1
+    ],
+    "peptide_counts": [
+      2
+    ],
+    "set_expr": [
+      0
+    ],
+    "DNA VAF": 0.891,
+    "RNA VAF": "NA",
+    "gene_expr": "NA",
+    "best_peptide_mt": "LLPLLPLLL",
+    "best_peptide_wt": "LLPLLLLLL",
+    "best_hla_allele": "HLA-E*01:01"
+  }
+}
diff --git a/...ata/calculate_reference_proteome_similarity/Test.all_epitopes.aggregated.tsv.metrics.json b/...ata/calculate_reference_proteome_similarity/Test.all_epitopes.aggregated.tsv.metrics.json
@@ -3275,4 +3275,4 @@
     "best_peptide_wt": "ATLSRTLLA",
     "best_hla_allele": "HLA-E*01:01"
   }
-}
+}
diff --git a/tests/test_data/calculate_reference_proteome_similarity/Test.mouse.fasta b/tests/test_data/calculate_reference_proteome_similarity/Test.mouse.fasta
@@ -0,0 +1,4 @@
+>WT.Rp1.ENSMUST00000027032.missense.1453N/S
+IAGTLKFNPETDYLTGTDG
+>MT.Rp1.ENSMUST00000027032.missense.1453N/S
+IAGTLKFNPQTDYLTGTDG
diff --git a/tests/test_data/calculate_reference_proteome_similarity/input.aggregated.tsv b/tests/test_data/calculate_reference_proteome_similarity/input.aggregated.tsv
diff --git a/...st_data/calculate_reference_proteome_similarity/output.aggregated.peptide_fasta.mouse.tsv b/...st_data/calculate_reference_proteome_similarity/output.aggregated.peptide_fasta.mouse.tsv
@@ -0,0 +1,2 @@
+ID	E*01:01	G*01:09	Gene	AA Change	Num Passing Transcripts	Best Peptide	Best Transcript	TSL	Allele	Pos	Prob Pos	Num Passing Peptides	IC50 MT	IC50 WT	%ile MT	%ile WT	RNA Expr	RNA VAF	Allele Expr	RNA Depth	DNA VAF	Tier	Ref Match	Evaluation
+22-41920894-41920895-G-C	2	1	ACO2	N1453S	1	KFNPQTDYL	ENSMUST00000027032	Not Supported	HLA-G*01:09	5	None	3	1262.760	1318.61	0.500	0.6	NA	NA	NA	NA	0.250	Poor	False	Pending
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		ID E01:01 G01:09 Gene AA Change Num Passing Transcripts Best Peptide Best Transcript TSL Allele Pos Prob Pos Num Passing Peptides IC50 MT IC50 WT %ile MT %ile WT RNA Expr RNA VAF Allele Expr RNA Depth DNA VAF Tier Evaluation
		22-41920894-41920895-G-C 2 1 ACO2 N1453S 1 KFNPQTDYL ENSMUST00000027032 Not Supported HLA-G*01:09 5 None 3 1262.760 1318.61 0.500 0.6 NA NA NA NA 0.250 Poor Pending
-Original file line number
+Diff line change
@@ Expand Up / @@ -3275,4 +3275,4 @@ @@
         "best_peptide_wt": "ATLSRTLLA",
         "best_hla_allele": "HLA-E*01:01"
       }
-    }
+    }