Skip to content

Commit

Permalink
Move mutation filtering by gene id and symbol from loader to filter
Browse files Browse the repository at this point in the history
  • Loading branch information
forus committed Oct 31, 2024
1 parent e9c8c74 commit c22089f
Show file tree
Hide file tree
Showing 3 changed files with 70 additions and 125 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -245,67 +245,15 @@ public void importData() throws IOException, DaoException {
int proteinPosEnd = ExtendedMutationUtil.getProteinPosEnd(
record.getProteinPosition(), proteinChange);

// Assume we are dealing with Entrez Gene Ids (this is the best / most stable option)
String geneSymbol = record.getHugoGeneSymbol();
String entrezIdString = record.getGivenEntrezGeneId();

String geneSymbol = ExtendedMutationUtil.normalizeGeneSymbol(record.getHugoGeneSymbol());
Long entrezGeneId = ExtendedMutationUtil.parseEntrezGeneId(record.getGivenEntrezGeneId());
CanonicalGene gene = null;
// try to parse entrez if it is not empty nor 0:
if (!(entrezIdString.isEmpty() ||
entrezIdString.equals("0"))) {
Long entrezGeneId;
try {
entrezGeneId = Long.parseLong(entrezIdString);
} catch (NumberFormatException e) {
entrezGeneId = null;
}
//non numeric values or negative values should not be allowed:
if (entrezGeneId == null || entrezGeneId < 0) {
ProgressMonitor.logWarning(
"Ignoring line with invalid Entrez_Id " +
entrezIdString);
entriesSkipped++;
continue;
} else {
gene = daoGene.getGene(entrezGeneId);
if (gene == null) {
//skip if not in DB:
ProgressMonitor.logWarning(
"Entrez gene ID " + entrezGeneId +
" not found. Record will be skipped.");
entriesSkipped++;
continue;
}
}
}

// If Entrez Gene ID Fails, try Symbol.
if (gene == null &&
!(geneSymbol.equals("") ||
geneSymbol.equals("Unknown"))) {
// Assume we are dealing with Entrez Gene Ids (this is the best / most stable option)
if (entrezGeneId != null) {
gene = daoGene.getGene(entrezGeneId);
} else if (geneSymbol != null) {
gene = daoGene.getNonAmbiguousGene(geneSymbol, true);
}

String mutationType = ExtendedMutationUtil.getMutationType(record);
// assume symbol=Unknown and entrez=0 (or missing Entrez column) to imply an
// intergenic, irrespective of what the column Variant_Classification says
if (geneSymbol.equals("Unknown") &&
(entrezIdString.equals("0") || mafUtil.getEntrezGeneIdIndex() == -1)) {
// give extra warning if mutationType is something different from IGR:
if (!"IRG".equalsIgnoreCase(mutationType)) {
ProgressMonitor.logWarning(
"Treating mutation with gene symbol 'Unknown' " +
(mafUtil.getEntrezGeneIdIndex() == -1 ? "" : "and Entrez gene ID 0") + " as intergenic ('IGR') " +
"instead of '" + mutationType + "'. Entry filtered/skipped.");
}
// treat as IGR:
myMutationFilter.decisions++;
myMutationFilter.addRejectedVariant(myMutationFilter.rejectionMap, "IGR");
// skip entry:
entriesSkipped++;
continue;
}

// skip the record if a gene was expected but not identified
if (gene == null) {
ProgressMonitor.logWarning(
Expand All @@ -316,16 +264,16 @@ public void importData() throws IOException, DaoException {
entriesSkipped++;
continue;
}
ExtendedMutation mutation = new ExtendedMutation();

ExtendedMutation mutation = new ExtendedMutation();
mutation.setGeneticProfileId(geneticProfileId);
mutation.setSampleId(sample.getInternalId());
mutation.setGene(gene);
mutation.setSequencingCenter(record.getCenter());
mutation.setSequencer(record.getSequencer());
mutation.setProteinChange(proteinChange);
mutation.setAminoAcidChange(aaChange);
mutation.setMutationType(mutationType);
mutation.setMutationType(ExtendedMutationUtil.getMutationType(record));
mutation.setChr(record.getChr());
mutation.setStartPosition(record.getStartPosition());
mutation.setEndPosition(record.getEndPosition());
Expand Down
105 changes: 40 additions & 65 deletions src/main/java/org/mskcc/cbio/portal/scripts/MutationFilter.java
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,7 @@
import java.util.Set;

import org.mskcc.cbio.maf.MafRecord;
import org.mskcc.cbio.maf.TabDelimitedFileUtil;
import org.mskcc.cbio.portal.model.ExtendedMutation;
import org.mskcc.cbio.portal.util.ExtendedMutationUtil;
import org.mskcc.cbio.portal.util.TsvUtil;

/**
* Filter mutations as they're imported into the CGDS dbms.
Expand All @@ -50,19 +47,15 @@
*/
public class MutationFilter {

private Set<Long> whiteListGenesForPromoterMutations;
private final Set<Long> whiteListGenesForPromoterMutations;

private int accepts=0;
private int germlineWhitelistAccepts=0;
private int somaticWhitelistAccepts=0;
private int unknownAccepts=0;
public int decisions=0;
public int decisions=0;
private int mutationStatusNoneRejects=0;
private int invalidChromosome=0;
private int invalidGeneInfo=0;
private int lohOrWildTypeRejects=0;
private int emptyAnnotationRejects=0;
private int missenseGermlineRejects=0;
private int redactedRejects=0;
private int redactedOrWildTypeRejects =0;
public Map<String,Integer> rejectionMap = new HashMap<String, Integer>();

private static final Map<String,String> VALID_CHR_VALUES = new HashMap<>();
Expand All @@ -89,7 +82,7 @@ public class MutationFilter {
*/
public MutationFilter() throws IllegalArgumentException{
whiteListGenesForPromoterMutations = new HashSet<Long>();
whiteListGenesForPromoterMutations.add(Long.valueOf(7015)); // TERT
whiteListGenesForPromoterMutations.add(7015L); // TERT
}

/**
Expand Down Expand Up @@ -122,6 +115,22 @@ public boolean acceptMutation(MafRecord mafRecord, Set<String> filteredMutations
| Translation_Start_Site |
+------------------------+
*/
if (ExtendedMutationUtil.isBlankEntrezGeneId(mafRecord.getGivenEntrezGeneId())
&& ExtendedMutationUtil.isBlankHugoGeneSymbol(mafRecord.getHugoGeneSymbol())) {
invalidGeneInfo++;
return false;
}
long entrezGeneId;
try {
entrezGeneId = Long.parseLong(mafRecord.getGivenEntrezGeneId());
if (entrezGeneId < 0) {
invalidGeneInfo++;
return false;
}
} catch (NumberFormatException e) {
invalidGeneInfo++;
return false;
}
// Do not accept mutations with invalid chromosome symbol
if (normalizeChr(mafRecord.getChr()) == null) {
invalidChromosome++;
Expand All @@ -140,17 +149,18 @@ public boolean acceptMutation(MafRecord mafRecord, Set<String> filteredMutations
return false;
}

// Do not accept Redacted mutations
if (safeStringTest(mafRecord.getValidationStatus(), "Redacted")) {
redactedRejects++;
// Do not accept Redacted or Wildtype mutations
if (safeStringTest(mafRecord.getValidationStatus(), "Redacted") ||
safeStringTest( mafRecord.getValidationStatus(), "Wildtype" )) {
redactedOrWildTypeRejects++;
return false;
}

//Filter by types if specified in the meta file, else filter for the default types
String mutationType = ExtendedMutationUtil.getMutationType(mafRecord);
if (filteredMutations != null) {
if (filteredMutations.contains(mutationType)) {
addRejectedVariant(rejectionMap, mutationType);
addRejectedVariant(mutationType);
return false;
} else {
if( safeStringTest( mutationType, "5'Flank" ) ) {
Expand All @@ -167,15 +177,15 @@ public boolean acceptMutation(MafRecord mafRecord, Set<String> filteredMutations
safeStringTest( mutationType, "5'UTR" ) ||
safeStringTest( mutationType, "IGR" ) ||
safeStringTest( mutationType, "RNA")){
addRejectedVariant(rejectionMap, mutationType);
addRejectedVariant(mutationType);
return false;
}

if( safeStringTest( mutationType, "5'Flank" ) ) {
if (whiteListGenesForPromoterMutations.contains(mafRecord.getGivenEntrezGeneId())){
if (whiteListGenesForPromoterMutations.contains(entrezGeneId)){
mafRecord.setProteinChange("Promoter");
} else {
addRejectedVariant(rejectionMap, mutationType);
addRejectedVariant(mutationType);
return false;
}
}
Expand Down Expand Up @@ -220,61 +230,25 @@ public int getLohOrWildTypeRejects() {
return this.lohOrWildTypeRejects;
}

/**
* Provide number of REJECT decisions for Empty Annotation Mutations.
* @return number of REJECT decisions for Empty Annotation Mutations.
*/
public int getEmptyAnnotationRejects() {
return this.emptyAnnotationRejects;
}

/**
* Provide number of REJECT decisions for Missense Germline Mutations.
* @return number of REJECT decisions for Missense Germline Mutations.
*/
public int getMissenseGermlineRejects() {
return this.missenseGermlineRejects;
}

/**
* Provide number of germline whitelist ACCEPT (return true) decisions made by this MutationFilter.
* @return the number of germline whitelist ACCEPT (return true) decisions made by this MutationFilter
*/
public int getGermlineWhitelistAccepts(){
return this.germlineWhitelistAccepts;
}

/**
* Provide number of somatic whitelist ACCEPT (return true) decisions made by this MutationFilter.
* @return the number of somatic whitelist ACCEPT (return true) decisions made by this MutationFilter
*/
public int getSomaticWhitelistAccepts(){
return this.somaticWhitelistAccepts;
}

/**
 * Provide number of REJECT decisions for mutations with an invalid chromosome.
 * The counter is incremented when the record's chromosome value cannot be normalized.
 * @return number of REJECT decisions caused by an invalid chromosome value.
 */
public int getInvalidChromosome() {
    return this.invalidChromosome;
}

/**
* Provide number of unknown whitelist ACCEPT (return true) decisions made by this MutationFilter.
* @return the number of unknown ACCEPT (return true) decisions made by this MutationFilter
*/
public int getUnknownAccepts(){
return this.unknownAccepts;
public int getInvalidGeneInfo() {
return invalidGeneInfo;
}

public int getRedactedRejects()
public int getRedactedOrWildTypeRejects()
{
return this.redactedRejects;
return this.redactedOrWildTypeRejects;
}

/**
 * Provide the per-variant rejection counts collected by this MutationFilter.
 * The returned map is the filter's own map, not a copy.
 * @return map from rejected mutation type to the number of times it was rejected.
 */
public Map<String, Integer> getRejectionMap() {
    return rejectionMap;
}

public void addRejectedVariant(Map<String, Integer> rejectionMap, String mutation) {
this.rejectionMap.computeIfAbsent(mutation, (k) -> 0);
public void addRejectedVariant(String mutation) {
this.rejectionMap.putIfAbsent(mutation, 0);
this.rejectionMap.computeIfPresent(mutation, (k, v) -> v + 1);
}

Expand All @@ -290,10 +264,11 @@ public String getStatistics(){
String statistics = "Mutation filter decisions: " + this.getDecisions() +
"\nRejects: " + this.getRejects() +
"\nMutation Status 'None' Rejects: " + this.getMutationStatusNoneRejects() +
"\nLOH or Wild Type Rejects: " + this.getLohOrWildTypeRejects() +
"\nEmpty Annotation Rejects: " + this.getEmptyAnnotationRejects() +
"\nMissense Germline Rejects: " + this.getMissenseGermlineRejects();

"\nLOH or Wild Type Mutation Status Rejects: " + this.getLohOrWildTypeRejects() +
"\nRedacted or Wild Type Validation Status Rejects: " + this.getRedactedOrWildTypeRejects() +
"\nInvalid Choromosome Rejects: " + this.getInvalidChromosome() +
"\nInvalid Gene Info Rejects: " + this.getInvalidGeneInfo();

Map<String, Integer> variantsRejected = this.getRejectionMap();
for (Map.Entry<String, Integer> variant : variantsRejected.entrySet()) {
statistics = statistics + "\n" + variant.getKey() + " Rejects: " + variant.getValue();
Expand Down
22 changes: 22 additions & 0 deletions src/main/java/org/mskcc/cbio/portal/util/ExtendedMutationUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -394,4 +394,26 @@ private static Map<String, Integer> annotateProteinChange(String proteinChange)
annotation.put("end", end);
return annotation;
}

/**
 * Parses the Entrez gene ID column value of a record.
 *
 * @param givenEntrezGeneId raw column value; may be {@code null}
 * @return the parsed ID, or {@code null} when the value is blank as defined by
 *         {@link #isBlankEntrezGeneId(String)}
 * @throws NumberFormatException if the value is not blank and is not a valid long
 */
public static Long parseEntrezGeneId(String givenEntrezGeneId) {
    if (isBlankEntrezGeneId(givenEntrezGeneId)) {
        return null;
    }
    // Long.parseLong does not tolerate surrounding whitespace, so parse the
    // same trimmed form that the blank check inspected.
    return Long.parseLong(givenEntrezGeneId.trim());
}

/**
 * Tells whether an Entrez gene ID column value carries no gene information.
 *
 * @param givenEntrezGeneId raw column value; may be {@code null}
 * @return {@code true} if the value is {@code null}, empty or whitespace only,
 *         or equal to "0" once surrounding whitespace is removed
 */
public static boolean isBlankEntrezGeneId(String givenEntrezGeneId) {
    if (givenEntrezGeneId == null) {
        return true;
    }
    // Compare the trimmed value so that " 0 " is treated the same as "0".
    String trimmed = givenEntrezGeneId.trim();
    return trimmed.isEmpty() || "0".equals(trimmed);
}

/**
 * Normalizes a Hugo gene symbol column value.
 *
 * @param hugoGeneSymbol raw column value; may be {@code null}
 * @return the symbol with surrounding whitespace removed, or {@code null} when the
 *         value is blank as defined by {@link #isBlankHugoGeneSymbol(String)}
 */
public static String normalizeGeneSymbol(String hugoGeneSymbol) {
    if (isBlankHugoGeneSymbol(hugoGeneSymbol)) {
        return null;
    }
    return hugoGeneSymbol.trim();
}

/**
 * Tells whether a Hugo gene symbol column value carries no gene information.
 *
 * @param hugoGeneSymbol raw column value; may be {@code null}
 * @return {@code true} if the value is {@code null}, empty or whitespace only,
 *         or equal to "Unknown" (case-sensitive) once surrounding whitespace is removed
 */
public static boolean isBlankHugoGeneSymbol(String hugoGeneSymbol) {
    if (hugoGeneSymbol == null) {
        return true;
    }
    // Compare the trimmed value; otherwise " Unknown " would pass as a real
    // symbol and be normalized to the literal "Unknown".
    String trimmed = hugoGeneSymbol.trim();
    return trimmed.isEmpty() || "Unknown".equals(trimmed);
}
}

0 comments on commit c22089f

Please sign in to comment.