Skip to content

Commit

Permalink
Update the large test CRAM files to CRAM v3.0. (#8832)
Browse files Browse the repository at this point in the history
  • Loading branch information
cmnbroad committed Sep 4, 2024
1 parent 3c1430d commit f42b7f7
Show file tree
Hide file tree
Showing 7 changed files with 901 additions and 996 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ public abstract class GATKBaseTest extends BaseTest {
// ~600,000 reads from chromosomes 20 and 21 of an NA12878 WGS bam aligned to b37, plus ~50,000 unmapped reads
public static final String NA12878_20_21_WGS_bam = largeFileTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.20.21.bam";
public static final String NA12878_20_21_WGS_mmp2_bam = largeFileTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.20.21.mmp2.bam";
public static final String NA12878_20_21_WGS_cram = largeFileTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.20.21.cram";
public static final String NA12878_20_21_WGS_cram = largeFileTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.20.21.v3.0.samtools.cram";

public static final String NA12878_20_21_covered_regions = publicTestDir + "wgs_calling_regions.v1.chr20_chr21.interval_list";

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,10 @@ public class PrintFileDiagnosticsIntegrationTest extends CommandLineProgramTest
public Object[][] getFileDiagnosticsTestCases() {
return new Object[][]{
{
"src/test/resources/large/CEUTrio.HiSeq.WGS.b37.NA12878.20.21.cram",
//this pathname is embedded in the diagnostics output file, so we use a relative pathname
// instead of the named constant NA12878_20_21_WGS_cram in order to avoid test failures
// caused by the full pathname varying in different environments
"src/test/resources/large/CEUTrio.HiSeq.WGS.b37.NA12878.20.21.v3.0.samtools.cram",
List.of(Pair.of("count-limit", "10")),
"src/test/resources/filediagnostics/CEUTrio.HiSeq.WGS.b37.NA12878.20.21.txt"
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,10 @@

import com.google.common.jimfs.Configuration;
import com.google.common.jimfs.Jimfs;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SAMRecordIterator;
import htsjdk.samtools.SamFiles;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.ValidationStringency;
import htsjdk.samtools.util.CloseableIterator;
import htsjdk.samtools.*;
import htsjdk.samtools.cram.ref.ReferenceSource;
import org.broadinstitute.hellbender.CommandLineProgramTest;
import org.broadinstitute.hellbender.cmdline.ReadFilterArgumentDefinitions;
import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.testutils.ArgumentsBuilder;
Expand Down Expand Up @@ -40,6 +36,82 @@ public String getTestedClassName() {
return PrintReads.class.getSimpleName();
}

@DataProvider(name="roundTripCRAMTests")
public Object[][] getRoundTripCRAMTests() {
return new Object[][] {
// we need to use lenient equality for this test because this bam has ~14k reads that fail full
// read equality; at least some of which are because they are unmapped/unplaced, but have cigar
// strings that both samtools and htsjdk drop when roundtripping
{NA12878_20_21_WGS_bam, b37_reference_20_21, true, false},
// this cram is the result of converting the above bam to cram using samtools; once the file is
// converted, we can use full read equality when roundtripping through cram, so we don't need to
// be lenient
{NA12878_20_21_WGS_cram, b37_reference_20_21, false, false},
// roundtrip a v2.1 file
{ largeFileTestDir + "CEUTrio.HiSeq.WGS.b37.NA12878.20.21.v3.0.samtools.cram",
b37_reference_20_21, false, false },
};
}

@Test(dataProvider="roundTripCRAMTests")
public void testRoundTripToCRAM(
final String sourceBamOrCRAM,
final String reference,
final boolean lenientEquality,
final boolean emitDetail) throws IOException {
final File outputCram = createTempFile("testRoundTripCRAM", ".cram");

final ArgumentsBuilder args = new ArgumentsBuilder();
args.addInput(sourceBamOrCRAM);
args.addOutput(outputCram);
args.addReference(reference);
args.add(ReadFilterArgumentDefinitions.DISABLE_TOOL_DEFAULT_READ_FILTERS, true);

runCommandLine(args, "PrintReads");
assertRoundTripFidelity(new File(sourceBamOrCRAM), outputCram, new File(reference), lenientEquality, emitDetail);
}

public void assertRoundTripFidelity(
final File sourceFile,
final File targetCRAMFile,
final File referenceFile,
final boolean lenientEquality,
final boolean emitDetail) throws IOException {
try (final SamReader sourceReader = SamReaderFactory.makeDefault()
.referenceSequence(referenceFile)
.validationStringency((ValidationStringency.SILENT))
.open(sourceFile);
final CRAMFileReader copyReader = new CRAMFileReader(targetCRAMFile, new ReferenceSource(referenceFile))) {
final SAMRecordIterator sourceIterator = sourceReader.iterator();
final SAMRecordIterator targetIterator = copyReader.getIterator();

while (sourceIterator.hasNext() && targetIterator.hasNext()) {
if (lenientEquality) {
final SAMRecord sourceRec = sourceIterator.next();
final SAMRecord targetRec = targetIterator.next();
Assert.assertEquals(targetRec.getReadName(), sourceRec.getReadName());
Assert.assertEquals(targetRec.getReadBases(), sourceRec.getReadBases());
Assert.assertEquals(targetRec.getBaseQualities(), sourceRec.getBaseQualities());
Assert.assertEquals(targetRec.getAlignmentStart(), sourceRec.getAlignmentStart());
Assert.assertEquals(targetRec.getAlignmentEnd(), sourceRec.getAlignmentEnd());
} else if (emitDetail) {
final SAMRecord sourceRec = sourceIterator.next();
final SAMRecord targetRec = targetIterator.next();
if (!sourceRec.equals(targetRec)) {
System.out.println("Difference found:");
System.out.println(sourceRec.getSAMString());
System.out.println(targetRec.getSAMString());
}
Assert.assertEquals(targetRec, sourceRec);
} else {
Assert.assertEquals(targetIterator.next(), sourceIterator.next());
}
}
Assert.assertEquals(sourceIterator.hasNext(), targetIterator.hasNext());
}
}


@DataProvider(name = "ReadEntireCramTestData")
public Object[][] readEntireCramTestData() {
final File ref = new File(hg19MiniReference);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,8 @@ public void testNoConflictPG() throws IOException {
public Object[][] getHttpPaths(){
final String bam = "gs://hellbender/test/resources/benchmark/CEUTrio.HiSeq.WEx.b37.NA12892.bam";
final String bai = "gs://hellbender/test/resources/benchmark/CEUTrio.HiSeq.WEx.b37.NA12892.bam.bai";
final String cram = "gs://hellbender/test/resources/large/CEUTrio.HiSeq.WGS.b37.NA12878.20.21.cram";
final String crai = "gs://hellbender/test/resources/large/CEUTrio.HiSeq.WGS.b37.NA12878.20.21.cram.bai";
final String cram = "gs://hellbender/test/resources/large/CEUTrio.HiSeq.WGS.b37.NA12878.20.21.v3.0.samtools.cram";
final String crai = "gs://hellbender/test/resources/large/CEUTrio.HiSeq.WGS.b37.NA12878.20.21.v3.0.samtools.cram.crai";
final List<SimpleInterval> bamIntervals = Arrays.asList(new SimpleInterval("3",1_000_000, 1_000_001),
new SimpleInterval("3", 1_000_003, 1_000_100),
new SimpleInterval("20", 1_099_000, 1_100_000));
Expand Down
Loading

0 comments on commit f42b7f7

Please sign in to comment.