From c22089f4b7648b5b3ba96892cc4edb069c74553b Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Thu, 31 Oct 2024 22:28:37 +0100 Subject: [PATCH] Move mutation filtering by gene id and symbol from loader to filter --- .../scripts/ImportExtendedMutationData.java | 68 ++---------- .../cbio/portal/scripts/MutationFilter.java | 105 +++++++----------- .../portal/util/ExtendedMutationUtil.java | 22 ++++ 3 files changed, 70 insertions(+), 125 deletions(-) diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportExtendedMutationData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportExtendedMutationData.java index 45c42d8..1a2f56f 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportExtendedMutationData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportExtendedMutationData.java @@ -245,67 +245,15 @@ public void importData() throws IOException, DaoException { int proteinPosEnd = ExtendedMutationUtil.getProteinPosEnd( record.getProteinPosition(), proteinChange); - // Assume we are dealing with Entrez Gene Ids (this is the best / most stable option) - String geneSymbol = record.getHugoGeneSymbol(); - String entrezIdString = record.getGivenEntrezGeneId(); - + String geneSymbol = ExtendedMutationUtil.normalizeGeneSymbol(record.getHugoGeneSymbol()); + Long entrezGeneId = ExtendedMutationUtil.parseEntrezGeneId(record.getGivenEntrezGeneId()); CanonicalGene gene = null; - // try to parse entrez if it is not empty nor 0: - if (!(entrezIdString.isEmpty() || - entrezIdString.equals("0"))) { - Long entrezGeneId; - try { - entrezGeneId = Long.parseLong(entrezIdString); - } catch (NumberFormatException e) { - entrezGeneId = null; - } - //non numeric values or negative values should not be allowed: - if (entrezGeneId == null || entrezGeneId < 0) { - ProgressMonitor.logWarning( - "Ignoring line with invalid Entrez_Id " + - entrezIdString); - entriesSkipped++; - continue; - } else { - gene = daoGene.getGene(entrezGeneId); - if (gene == null) { - //skip 
if not in DB: - ProgressMonitor.logWarning( - "Entrez gene ID " + entrezGeneId + - " not found. Record will be skipped."); - entriesSkipped++; - continue; - } - } - } - - // If Entrez Gene ID Fails, try Symbol. - if (gene == null && - !(geneSymbol.equals("") || - geneSymbol.equals("Unknown"))) { + // Assume we are dealing with Entrez Gene Ids (this is the best / most stable option) + if (entrezGeneId != null) { + gene = daoGene.getGene(entrezGeneId); + } else if (geneSymbol != null) { gene = daoGene.getNonAmbiguousGene(geneSymbol, true); } - - String mutationType = ExtendedMutationUtil.getMutationType(record); - // assume symbol=Unknown and entrez=0 (or missing Entrez column) to imply an - // intergenic, irrespective of what the column Variant_Classification says - if (geneSymbol.equals("Unknown") && - (entrezIdString.equals("0") || mafUtil.getEntrezGeneIdIndex() == -1)) { - // give extra warning if mutationType is something different from IGR: - if (!"IRG".equalsIgnoreCase(mutationType)) { - ProgressMonitor.logWarning( - "Treating mutation with gene symbol 'Unknown' " + - (mafUtil.getEntrezGeneIdIndex() == -1 ? "" : "and Entrez gene ID 0") + " as intergenic ('IGR') " + - "instead of '" + mutationType + "'. 
Entry filtered/skipped."); - } - // treat as IGR: - myMutationFilter.decisions++; - myMutationFilter.addRejectedVariant(myMutationFilter.rejectionMap, "IGR"); - // skip entry: - entriesSkipped++; - continue; - } - // skip the record if a gene was expected but not identified if (gene == null) { ProgressMonitor.logWarning( @@ -316,8 +264,8 @@ public void importData() throws IOException, DaoException { entriesSkipped++; continue; } - ExtendedMutation mutation = new ExtendedMutation(); + ExtendedMutation mutation = new ExtendedMutation(); mutation.setGeneticProfileId(geneticProfileId); mutation.setSampleId(sample.getInternalId()); mutation.setGene(gene); @@ -325,7 +273,7 @@ public void importData() throws IOException, DaoException { mutation.setSequencer(record.getSequencer()); mutation.setProteinChange(proteinChange); mutation.setAminoAcidChange(aaChange); - mutation.setMutationType(mutationType); + mutation.setMutationType(ExtendedMutationUtil.getMutationType(record)); mutation.setChr(record.getChr()); mutation.setStartPosition(record.getStartPosition()); mutation.setEndPosition(record.getEndPosition()); diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/MutationFilter.java b/src/main/java/org/mskcc/cbio/portal/scripts/MutationFilter.java index aa65f8b..8f8c934 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/MutationFilter.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/MutationFilter.java @@ -38,10 +38,7 @@ import java.util.Set; import org.mskcc.cbio.maf.MafRecord; -import org.mskcc.cbio.maf.TabDelimitedFileUtil; -import org.mskcc.cbio.portal.model.ExtendedMutation; import org.mskcc.cbio.portal.util.ExtendedMutationUtil; -import org.mskcc.cbio.portal.util.TsvUtil; /** * Filter mutations as they're imported into the CGDS dbms. 
@@ -50,19 +47,15 @@ */ public class MutationFilter { - private Set whiteListGenesForPromoterMutations; + private final Set whiteListGenesForPromoterMutations; private int accepts=0; - private int germlineWhitelistAccepts=0; - private int somaticWhitelistAccepts=0; - private int unknownAccepts=0; - public int decisions=0; + public int decisions=0; private int mutationStatusNoneRejects=0; private int invalidChromosome=0; + private int invalidGeneInfo=0; private int lohOrWildTypeRejects=0; - private int emptyAnnotationRejects=0; - private int missenseGermlineRejects=0; - private int redactedRejects=0; + private int redactedOrWildTypeRejects =0; public Map rejectionMap = new HashMap(); private static final Map VALID_CHR_VALUES = new HashMap<>(); @@ -89,7 +82,7 @@ public class MutationFilter { */ public MutationFilter() throws IllegalArgumentException{ whiteListGenesForPromoterMutations = new HashSet(); - whiteListGenesForPromoterMutations.add(Long.valueOf(7015)); // TERT + whiteListGenesForPromoterMutations.add(7015L); // TERT } /** @@ -122,6 +115,22 @@ public boolean acceptMutation(MafRecord mafRecord, Set filteredMutations | Translation_Start_Site | +------------------------+ */ + if (ExtendedMutationUtil.isBlankEntrezGeneId(mafRecord.getGivenEntrezGeneId()) + && ExtendedMutationUtil.isBlankHugoGeneSymbol(mafRecord.getHugoGeneSymbol())) { + invalidGeneInfo++; + return false; + } + long entrezGeneId; + try { + entrezGeneId = Long.parseLong(mafRecord.getGivenEntrezGeneId()); + if (entrezGeneId < 0) { + invalidGeneInfo++; + return false; + } + } catch (NumberFormatException e) { + invalidGeneInfo++; + return false; + } // Do not accept mutations with invalid chromosome symbol if (normalizeChr(mafRecord.getChr()) == null) { invalidChromosome++; @@ -140,9 +149,10 @@ public boolean acceptMutation(MafRecord mafRecord, Set filteredMutations return false; } - // Do not accept Redacted mutations - if (safeStringTest(mafRecord.getValidationStatus(), "Redacted")) { - 
redactedRejects++; + // Do not accept Redacted or Wildtype mutations + if (safeStringTest(mafRecord.getValidationStatus(), "Redacted") || + safeStringTest( mafRecord.getValidationStatus(), "Wildtype" )) { + redactedOrWildTypeRejects++; return false; } @@ -150,7 +160,7 @@ public boolean acceptMutation(MafRecord mafRecord, Set filteredMutations String mutationType = ExtendedMutationUtil.getMutationType(mafRecord); if (filteredMutations != null) { if (filteredMutations.contains(mutationType)) { - addRejectedVariant(rejectionMap, mutationType); + addRejectedVariant(mutationType); return false; } else { if( safeStringTest( mutationType, "5'Flank" ) ) { @@ -167,15 +177,15 @@ public boolean acceptMutation(MafRecord mafRecord, Set filteredMutations safeStringTest( mutationType, "5'UTR" ) || safeStringTest( mutationType, "IGR" ) || safeStringTest( mutationType, "RNA")){ - addRejectedVariant(rejectionMap, mutationType); + addRejectedVariant(mutationType); return false; } if( safeStringTest( mutationType, "5'Flank" ) ) { - if (whiteListGenesForPromoterMutations.contains(mafRecord.getGivenEntrezGeneId())){ + if (whiteListGenesForPromoterMutations.contains(entrezGeneId)){ mafRecord.setProteinChange("Promoter"); } else { - addRejectedVariant(rejectionMap, mutationType); + addRejectedVariant(mutationType); return false; } } @@ -220,61 +230,25 @@ public int getLohOrWildTypeRejects() { return this.lohOrWildTypeRejects; } - /** - * Provide number of REJECT decisions for Emtpy Annotation Mutations. - * @return number of REJECT decisions for Empty Annotation Mutations. - */ - public int getEmptyAnnotationRejects() { - return this.emptyAnnotationRejects; - } - - /** - * Provide number of REJECT decisions for Missense Germline Mutations. - * @return number of REJECT decisions for Missense Germline Mutations. 
- */ - public int getMissenseGermlineRejects() { - return this.missenseGermlineRejects; - } - - /** - * Provide number of germline whitelist ACCEPT (return true) decisions made by this MutationFilter. - * @return the number of germline whitelist ACCEPT (return true) decisions made by this MutationFilter - */ - public int getGermlineWhitelistAccepts(){ - return this.germlineWhitelistAccepts; - } - - /** - * Provide number of somatic whitelist ACCEPT (return true) decisions made by this MutationFilter. - * @return the number of somatic whitelist ACCEPT (return true) decisions made by this MutationFilter - */ - public int getSomaticWhitelistAccepts(){ - return this.somaticWhitelistAccepts; - } - public int getInvalidChromosome() { return invalidChromosome; } - /** - * Provide number of unknown whitelist ACCEPT (return true) decisions made by this MutationFilter. - * @return the number of unknown ACCEPT (return true) decisions made by this MutationFilter - */ - public int getUnknownAccepts(){ - return this.unknownAccepts; + public int getInvalidGeneInfo() { + return invalidGeneInfo; } - public int getRedactedRejects() + public int getRedactedOrWildTypeRejects() { - return this.redactedRejects; + return this.redactedOrWildTypeRejects; } public Map getRejectionMap() { return this.rejectionMap; } - public void addRejectedVariant(Map rejectionMap, String mutation) { - this.rejectionMap.computeIfAbsent(mutation, (k) -> 0); + public void addRejectedVariant(String mutation) { + this.rejectionMap.putIfAbsent(mutation, 0); this.rejectionMap.computeIfPresent(mutation, (k, v) -> v + 1); } @@ -290,10 +264,11 @@ public String getStatistics(){ String statistics = "Mutation filter decisions: " + this.getDecisions() + "\nRejects: " + this.getRejects() + "\nMutation Status 'None' Rejects: " + this.getMutationStatusNoneRejects() + - "\nLOH or Wild Type Rejects: " + this.getLohOrWildTypeRejects() + - "\nEmpty Annotation Rejects: " + this.getEmptyAnnotationRejects() + - "\nMissense 
Germline Rejects: " + this.getMissenseGermlineRejects(); - + "\nLOH or Wild Type Mutation Status Rejects: " + this.getLohOrWildTypeRejects() + + "\nRedacted or Wild Type Validation Status Rejects: " + this.getRedactedOrWildTypeRejects() + + "\nInvalid Chromosome Rejects: " + this.getInvalidChromosome() + + "\nInvalid Gene Info Rejects: " + this.getInvalidGeneInfo(); + Map variantsRejected = this.getRejectionMap(); for (Map.Entry variant : variantsRejected.entrySet()) { statistics = statistics + "\n" + variant.getKey() + " Rejects: " + variant.getValue(); diff --git a/src/main/java/org/mskcc/cbio/portal/util/ExtendedMutationUtil.java b/src/main/java/org/mskcc/cbio/portal/util/ExtendedMutationUtil.java index 614d623..a7119ac 100644 --- a/src/main/java/org/mskcc/cbio/portal/util/ExtendedMutationUtil.java +++ b/src/main/java/org/mskcc/cbio/portal/util/ExtendedMutationUtil.java @@ -394,4 +394,26 @@ private static Map annotateProteinChange(String proteinChange) annotation.put("end", end); return annotation; } + + public static Long parseEntrezGeneId(String givenEntrezGeneId) { + if (isBlankEntrezGeneId(givenEntrezGeneId)) { + return null; + } + return Long.parseLong(givenEntrezGeneId); + } + + public static boolean isBlankEntrezGeneId(String givenEntrezGeneId) { + return givenEntrezGeneId == null || givenEntrezGeneId.trim().isEmpty() || "0".equals(givenEntrezGeneId); + } + + public static String normalizeGeneSymbol(String hugoGeneSymbol) { + if (isBlankHugoGeneSymbol(hugoGeneSymbol)) { + return null; + } + return hugoGeneSymbol.trim(); + } + + public static boolean isBlankHugoGeneSymbol(String hugoGeneSymbol) { + return hugoGeneSymbol == null || hugoGeneSymbol.trim().isEmpty() || "Unknown".equals(hugoGeneSymbol); + } }