Skip to content

Commit

Permalink
Fix logic validating MM tags on negative strand alignments.
Browse files Browse the repository at this point in the history
  • Loading branch information
jrobinso committed Mar 1, 2024
1 parent b9c5e0c commit 4305244
Show file tree
Hide file tree
Showing 4 changed files with 88 additions and 77 deletions.
12 changes: 12 additions & 0 deletions src/main/java/org/broad/igv/prefs/PreferencesManager.java
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,18 @@ private static void init() {

Map<String, String> nullCategory = loadDefaults23();

// Special override -- maintains backward compatibility
if (!nullCategory.containsKey("FONT_SCALE_FACTOR") || nullCategory.get("FONT_SCALE_FACTOR").equals("1")) {
try {
double resolutionScale = Math.max(1, Toolkit.getDefaultToolkit().getScreenResolution() / ((double) Globals.DESIGN_DPI));
nullCategory.put(FONT_SCALE_FACTOR, String.valueOf((float) resolutionScale));
} catch (HeadlessException e) {
// Ignore -- this is expected
} catch (Exception e) {
log.error("Error overriding font scale factor", e);
}
}

defaultPreferences.put(NULL_CATEGORY, nullCategory);
defaultPreferences.put(RNA, new HashMap<>());
defaultPreferences.put(THIRD_GEN, new HashMap<>());
Expand Down
79 changes: 6 additions & 73 deletions src/main/java/org/broad/igv/sam/SAMAlignment.java
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,6 @@
import htsjdk.samtools.SAMReadGroupRecord;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMTag;
import htsjdk.samtools.util.SequenceUtil;
import org.broad.igv.logging.*;
import org.broad.igv.Globals;
import org.broad.igv.feature.Strand;
Expand Down Expand Up @@ -65,9 +64,6 @@ public class SAMAlignment implements Alignment {
public static final Pattern RIGHT_CIGAR_PATTERN = Pattern.compile("[A-Z](.{1," + MAX_CIGAR_STRING_LENGTH_TO_DISPLAY / 2 + "})$");
public static final Pattern LEFT_CIGAR_PATTERN = Pattern.compile("^(.{1," + (MAX_CIGAR_STRING_LENGTH_TO_DISPLAY / 2 - 1) + "}[A-Z])");
private static final Logger log = LogManager.getLogger(SAMAlignment.class);

private static int MM_WARNING_COUNT = 0;

public static final char DELETE_CHAR = '-';
public static final char SKIP_CHAR = '=';
public static final char MATCH = 'M';
Expand Down Expand Up @@ -376,23 +372,26 @@ public List<BaseModificationSet> getBaseModificationSets() {

Object mm = record.hasAttribute("Mm") ? record.getAttribute("Mm") : record.getAttribute("MM");
byte[] ml = (byte[]) (record.hasAttribute("Ml") ? record.getAttribute("Ml") : record.getAttribute("ML"));
Integer mn = record.getIntegerAttribute("MN");

// Minimal tag validation -- 10X uses MM and/or ML for other purposes
if (mm instanceof String && (mm.toString().length() > 0) && (ml == null || ml instanceof byte[])) {

byte[] sequence = record.getReadBases();

// Sequence length validation -- if MN tag is present use it, otherwise do a partial validation
// Test sequence length vs mn if avaliable
Integer mn = record.getIntegerAttribute("MN");
if (mn == null) {
mn = sequence.length;
}
if (mn != null) {
if (mn != sequence.length) {
return null;
}
} else if (!validateMMTag(mm.toString(), ml, record.getReadBases(), isNegativeStrand())) { //record.getCigarString().indexOf("H") > 0 &&
} else if (!BaseModificationUtils.validateMMTag(this.getReadName(), mm.toString(), record.getReadBases(), isNegativeStrand())) { //record.getCigarString().indexOf("H") > 0 &&
return null;
}


if (mm.toString().length() == 0) { // TODO -- more extensive validation?
baseModificationSets = Collections.EMPTY_LIST;
} else {
Expand All @@ -403,72 +402,6 @@ public List<BaseModificationSet> getBaseModificationSets() {
return baseModificationSets;
}

/**
* Minimally validate an MM tag. This will not catch all problems, but will many. Validation proceeds as follows
* 1. Validate types of MM and ML tags. This catches missues of the tags, for example in certain 10X files.
* 2. If available, validate sequence length vs MN tag.
* 3. If MN tag is not available, validate implied minimum count of base nucleotide vs actual count.
*
* @return
*/
boolean validateMMTag(String mm, byte[] ml, byte[] sequence, boolean isNegativeStrand) {

// Minimal tag validation -- 10X uses MM and/or ML for other purposes
if (!(mm instanceof String && mm.toString().length() > 0 && (ml == null || ml instanceof byte[]))) {
return false;
}

// Test sequence length vs mn if avaliable
Integer mn = record.getIntegerAttribute("MN");
if (mn != null) {
return (mn == sequence.length);
}

// Finally, test implied minimum base count vs actual base count in sequence. The minimum base count is
// equal to the number of modified bases + the number of skipped bases as codified in the MM tag.
// e.g. C+m,5,12,0 => at least 20 "Cs" in the read sequence, 3 with modifications and 17 skipped
if (PreferencesManager.getPreferences().getAsBoolean(Constants.BASEMOD_VALIDATE_BASE_COUNT)) {

String[] mmTokens = mm.split(";");

for (String mmi : mmTokens) {
String[] tokens = mmi.split(","); //Globals.commaPattern.split(mm);
int baseCount;
if (tokens[0].charAt(0) == 'N') {
baseCount = sequence.length;
} else {
byte base = (byte) tokens[0].charAt(0);
char strand = tokens[0].charAt(1);
if (strand == '-') {
base = SequenceUtil.complement(base);
}
baseCount = 0;
for (int i = 0; i < sequence.length; i++) {
byte readBase = isNegativeStrand ? SequenceUtil.complement(sequence[i]) : sequence[i];
if (readBase == base) baseCount++;
}
}

// Count # of bases implied by tag
int modified = tokens.length - 1; // All tokens but the first are "skip" numbers
int skipped = 0;
for (int i = 1; i < tokens.length; i++) skipped += Integer.parseInt(tokens[i]);
if (modified + skipped > baseCount) {
if (++MM_WARNING_COUNT < 21) {
log.warn(this.getReadName() + " MM base count validation failed: expected " + (modified + skipped) + "'" + (tokens[0].charAt(0) + "'s" + ", actual count = " + baseCount));
if (MM_WARNING_COUNT == 20) {
log.warn("MM validation warning count exceeded. Further failures will not be logged.");
}
}
return false;
}
}
}

// If we get here assume the tag is valid
return true;
}

public SMRTKinetics getSmrtKinetics() {
if (smrtKinetics == null) {
smrtKinetics = new SMRTKinetics(this);
Expand Down
61 changes: 57 additions & 4 deletions src/main/java/org/broad/igv/sam/mods/BaseModificationUtils.java
Original file line number Diff line number Diff line change
@@ -1,13 +1,11 @@
package org.broad.igv.sam.mods;

import htsjdk.samtools.util.SequenceUtil;
import org.broad.igv.logging.LogManager;
import org.broad.igv.logging.Logger;
import org.broad.igv.prefs.Constants;
import org.broad.igv.prefs.PreferencesManager;
import org.broad.igv.sam.Alignment;
import org.broad.igv.sam.AlignmentBlock;
import org.broad.igv.sam.AlignmentTrack;
import org.broad.igv.sam.AlignmentUtils;
import org.broad.igv.sam.*;

import java.awt.*;
import java.util.*;
Expand Down Expand Up @@ -42,6 +40,8 @@ public class BaseModificationUtils {
codeValues.put("NONE_A", "Unmodified A");
}

private static int MM_WARNING_COUNT = 0;

public static String modificationName(String modification) {
return ((codeValues.containsKey(modification)) ? codeValues.get(modification) : modification);
}
Expand Down Expand Up @@ -190,4 +190,57 @@ public static boolean isChEBI(String str) {
return true;
}

/**
* Minimally validate an MM tag. This will not catch all problems, but will many. Validation proceeds as follows
* 1. Validate types of MM and ML tags. This catches missues of the tags, for example in certain 10X files.
* 2. If available, validate sequence length vs MN tag.
* 3. If MN tag is not available, validate implied minimum count of base nucleotide vs actual count.
*
* @return
*/
public static boolean validateMMTag(String readName, String mm, byte[] sequence, boolean isNegativeStrand) {


// Finally, test implied minimum base count vs actual base count in sequence. The minimum base count is
// equal to the number of modified bases + the number of skipped bases as codified in the MM tag.
// e.g. C+m,5,12,0 => at least 20 "Cs" in the read sequence, 3 with modifications and 17 skipped
if (PreferencesManager.getPreferences().getAsBoolean(Constants.BASEMOD_VALIDATE_BASE_COUNT)) {

String[] mmTokens = mm.split(";");

for (String mmi : mmTokens) {
String[] tokens = mmi.split(","); //Globals.commaPattern.split(mm);
int baseCount;
if (tokens[0].charAt(0) == 'N') {
baseCount = sequence.length;
} else {
byte base = (byte) tokens[0].charAt(0); // "Top strand" base seen by sequencing instrument
byte readBase = isNegativeStrand ? SequenceUtil.complement(base) : base; // Base as reported in BAM file
baseCount = 0;
for (int i = 0; i < sequence.length; i++) {
if (readBase == sequence[i]) baseCount++;
}
}

// Count # of bases implied by tag
int modified = tokens.length - 1; // All tokens but the first are "skip" numbers
int skipped = 0;
for (int i = 1; i < tokens.length; i++) {
skipped += Integer.parseInt(tokens[i]);
}
if (modified + skipped > baseCount) {
if (++MM_WARNING_COUNT < 21) {
log.warn(readName + " MM base count validation failed: expected " + (modified + skipped) + "'" + (tokens[0].charAt(0) + "'s" + ", actual count = " + baseCount));
if (MM_WARNING_COUNT == 20) {
log.warn("MM validation warning count exceeded. Further failures will not be logged.");
}
}
return false;
}
}
}

// If we get here assume the tag is valid
return true;
}
}
Loading

0 comments on commit 4305244

Please sign in to comment.