Skip to content

Commit

Permalink
Merge pull request #26 from molgenis/feat/capice2vcf_precomputed
Browse files Browse the repository at this point in the history
Fix #25 Extend capice2vcf to convert precomputed scores
  • Loading branch information
bartcharbon authored Sep 3, 2020
2 parents 05759a6 + 5047e10 commit 62f5689
Show file tree
Hide file tree
Showing 19 changed files with 199 additions and 51 deletions.
31 changes: 28 additions & 3 deletions src/main/java/org/molgenis/capice/AppCommandLineOptions.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ class AppCommandLineOptions {
static final String OPT_OUTPUT_LONG = "output";
static final String OPT_FORCE = "f";
static final String OPT_FORCE_LONG = "force";
static final String OPT_TYPE = "t";
static final String OPT_TYPE_LONG = "type";
static final String OPT_DEBUG = "d";
static final String OPT_DEBUG_LONG = "debug";
static final String OPT_VERSION = "v";
Expand All @@ -30,7 +32,7 @@ class AppCommandLineOptions {
.hasArg(true)
.required()
.longOpt(OPT_INPUT_LONG)
.desc("Input CAPICE predictions file (.tsv).")
.desc("Input CAPICE predictions file (.tsv) or precomputed scores file (.tsv.gz).")
.build());
appOptions.addOption(
Option.builder(OPT_OUTPUT)
Expand All @@ -43,6 +45,12 @@ class AppCommandLineOptions {
.longOpt(OPT_FORCE_LONG)
.desc("Override the output file if it already exists.")
.build());
appOptions.addOption(
Option.builder(OPT_TYPE)
.hasArg(true)
.longOpt(OPT_TYPE_LONG)
.desc("Input type (precomputed_scores or predictions). Default: predictions.")
.build());
APP_OPTIONS = appOptions;

Options appVersionOptions = new Options();
Expand All @@ -68,6 +76,7 @@ static Options getAppVersionOptions() {
/**
 * Validates a parsed command line by running the input, output and type checks in that order.
 *
 * <p>The checks run sequentially, so the first one that fails determines the error that is
 * reported.
 *
 * @param commandLine parsed command line to validate
 */
static void validateCommandLine(CommandLine commandLine) {
validateInput(commandLine);
validateOutput(commandLine);
validateType(commandLine);
}

private static void validateInput(CommandLine commandLine) {
Expand All @@ -85,9 +94,9 @@ private static void validateInput(CommandLine commandLine) {
format("Input file '%s' is not readable.", inputPath.toString()));
}
String inputPathStr = inputPath.toString();
if (!inputPathStr.endsWith(".tsv")) {
if (!inputPathStr.endsWith(".tsv") && !inputPathStr.endsWith(".tsv.gz")) {
throw new IllegalArgumentException(
format("Input file '%s' is not a .tsv file.", inputPathStr));
format("Input file '%s' is not a .tsv or .tsv.gz file.", inputPathStr));
}
}

Expand All @@ -109,4 +118,20 @@ private static void validateOutput(CommandLine commandLine) {
format("Output file '%s' already exists", outputPath.toString()));
}
}

/**
 * Checks that the optional input type option, when supplied, names a supported input type.
 *
 * <p>Nothing is checked when the option is absent.
 *
 * @param commandLine parsed command line to inspect
 * @throws IllegalArgumentException if the supplied type is neither {@code precomputed_scores} nor
 *     {@code predictions}
 */
private static void validateType(CommandLine commandLine) {
  if (commandLine.hasOption(OPT_TYPE)) {
    String inputType = commandLine.getOptionValue(OPT_TYPE);
    boolean supported =
        inputType.equals("precomputed_scores") || inputType.equals("predictions");
    if (!supported) {
      throw new IllegalArgumentException(
          format("Type '%s' unknown. Valid types: precomputed_scores, predictions", inputType));
    }
  }
}
}
12 changes: 11 additions & 1 deletion src/main/java/org/molgenis/capice/AppCommandLineRunner.java
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,17 @@ public void run(String... args) {
}

LOGGER.info("mapping tsv from to '{}' to vcf...", settings.getInputTsvPath());
capiceService.mapPredictionsToVcf(settings);
FileType fileType = settings.getFileType();
switch (fileType) {
case PRECOMPUTED_SCORES:
capiceService.mapPrecomputedScores(settings);
break;
case PREDICTIONS:
capiceService.mapPredictionsToVcf(settings);
break;
default:
throw new IllegalArgumentException(String.format("unknown file type %s", fileType));
}
LOGGER.info("created vcf '{}'", outputReportPath);
} catch (Exception e) {
LOGGER.error("", e);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import static org.molgenis.capice.AppCommandLineOptions.OPT_FORCE;
import static org.molgenis.capice.AppCommandLineOptions.OPT_INPUT;
import static org.molgenis.capice.AppCommandLineOptions.OPT_OUTPUT;
import static org.molgenis.capice.AppCommandLineOptions.OPT_TYPE;

import java.nio.file.Path;
import org.apache.commons.cli.CommandLine;
Expand Down Expand Up @@ -35,6 +36,22 @@ Settings map(CommandLine commandLine) {

boolean overwriteOutput = commandLine.hasOption(OPT_FORCE);

return new Settings(inputPath, outputPath, overwriteOutput, appName, appVersion);
FileType fileType;
if (commandLine.hasOption(OPT_TYPE)) {
String optionStr = commandLine.getOptionValue(OPT_TYPE);
switch (optionStr) {
case "precomputed_scores":
fileType = FileType.PRECOMPUTED_SCORES;
break;
case "predictions":
fileType = FileType.PREDICTIONS;
break;
default:
throw new IllegalArgumentException(String.format("invalid file type '%s'", optionStr));
}
} else {
fileType = FileType.PREDICTIONS;
}
return new Settings(inputPath, outputPath, overwriteOutput, fileType, appName, appVersion);
}
}
6 changes: 6 additions & 0 deletions src/main/java/org/molgenis/capice/FileType.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
package org.molgenis.capice;

/** Kind of input file to convert; selects which mapping routine is run. */
public enum FileType {
/** Precomputed scores file; chosen with the type option value {@code precomputed_scores}. */
PRECOMPUTED_SCORES,
/** Predictions file; chosen with {@code predictions} and used when no type option is given. */
PREDICTIONS
}
2 changes: 2 additions & 0 deletions src/main/java/org/molgenis/capice/vcf/CapiceService.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,6 @@

/** Entry points for converting CAPICE TSV input into a VCF file. */
public interface CapiceService {
/**
 * Converts a predictions file to VCF.
 *
 * @param settings run settings, including the input and output paths
 */
void mapPredictionsToVcf(Settings settings);

/**
 * Converts a precomputed scores file to VCF.
 *
 * @param settings run settings, including the input and output paths
 */
void mapPrecomputedScores(Settings settings);
}
9 changes: 9 additions & 0 deletions src/main/java/org/molgenis/capice/vcf/CapiceServiceImpl.java
Original file line number Diff line number Diff line change
Expand Up @@ -47,4 +47,13 @@ public void mapPredictionsToVcf(Settings settings) {
}
}
}

/**
 * Converts the precomputed scores file named in the settings to VCF by delegating to the
 * TSV-to-VCF mapper, logging before and after the conversion.
 *
 * @param settings run settings supplying the input TSV path and the output VCF path
 */
@Override
public void mapPrecomputedScores(Settings settings) {
  Path scoresPath = settings.getInputTsvPath();
  LOGGER.info("mapping tsv to vcf...");

  Path vcfPath = settings.getOutputVcfPath();
  tsvToVcfMapper.mapPrecomputedScores(scoresPath, vcfPath, settings);

  LOGGER.info("done mapping tsv to vcf");
}
}
12 changes: 10 additions & 2 deletions src/main/java/org/molgenis/capice/vcf/CapiceUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@
public class CapiceUtils {
private static final Pattern POS_PATTERN = Pattern.compile("(.+?)_(.+?)_(.+?)_(.+?)");

private CapiceUtils(){}
private CapiceUtils() {}

public static VcfPosition getVcfPosition(CSVRecord thatRecord) {
public static VcfPosition getVcfPositionFromPrediction(CSVRecord thatRecord) {
String positionString = thatRecord.get(0) != null ? thatRecord.get(0) : "";
Matcher thatMatcher = POS_PATTERN.matcher(positionString);
if (!thatMatcher.matches()) {
Expand All @@ -21,4 +21,12 @@ public static VcfPosition getVcfPosition(CSVRecord thatRecord) {
String alt = thatMatcher.group(4);
return new VcfPosition(chrom, pos, ref, alt);
}

/**
 * Builds a {@link VcfPosition} from a precomputed scores record whose first four columns hold
 * chromosome, position, reference and alternative, in that order.
 *
 * @param csvRecord record to read columns 0 to 3 from
 * @return the position described by the record
 * @throws NumberFormatException if the position column is not a valid integer
 */
public static VcfPosition getVcfPositionFromPrecomputedScore(CSVRecord csvRecord) {
  final String contig = csvRecord.get(0);
  final int position = Integer.parseInt(csvRecord.get(1));
  return new VcfPosition(contig, position, csvRecord.get(2), csvRecord.get(3));
}
}
2 changes: 2 additions & 0 deletions src/main/java/org/molgenis/capice/vcf/Settings.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,15 @@
import lombok.NonNull;
import lombok.Value;
import lombok.experimental.NonFinal;
import org.molgenis.capice.FileType;

/**
 * Settings for a single TSV-to-VCF conversion run.
 *
 * <p>Declared as a Lombok {@code @Value} class; callers read the fields through generated getters
 * such as {@code getInputTsvPath()}, {@code getOutputVcfPath()} and {@code getFileType()}.
 * {@code @NonFinal} keeps the class itself non-final.
 */
@Value
@NonFinal
public class Settings {
// Path of the TSV input file to convert.
@NonNull Path inputTsvPath;
// Path of the VCF file to write.
@NonNull Path outputVcfPath;
// Set from the force command line option by the settings mapper.
boolean overwriteOutputVcf;
// Kind of input file; selects the mapping routine. Not annotated @NonNull, unlike the other
// reference fields.
FileType fileType;
// Application name and version supplied by the caller.
// NOTE(review): presumably written to the VCF header — confirm against setVcfHeader.
@NonNull String appName;
@NonNull String appVersion;
}
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
package org.molgenis.capice.vcf;

import static org.molgenis.capice.vcf.CapiceUtils.getVcfPosition;
import static org.molgenis.capice.vcf.CapiceUtils.getVcfPositionFromPrediction;

import java.util.Comparator;
import org.apache.commons.csv.CSVRecord;
Expand All @@ -9,8 +9,8 @@ public class TsvRecordComparator implements Comparator<CSVRecord> {

@Override
public int compare(CSVRecord thisRecord, CSVRecord thatRecord) {
VcfPosition thisPosition = getVcfPosition(thisRecord);
VcfPosition thatPosition = getVcfPosition(thatRecord);
VcfPosition thisPosition = getVcfPositionFromPrediction(thisRecord);
VcfPosition thatPosition = getVcfPositionFromPrediction(thatRecord);

if (thisPosition.isNumericChromosome()) {
return compareNumericChrom(thisPosition.getChromosome(), thisPosition.getPosition(), thatPosition.getChromosome(), thatPosition.isNumericChromosome(), thatPosition.getPosition());
Expand Down
2 changes: 2 additions & 0 deletions src/main/java/org/molgenis/capice/vcf/TsvToVcfMapper.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,6 @@

/** Maps CAPICE TSV files to VCF files. */
public interface TsvToVcfMapper {
/**
 * Maps a predictions TSV file to a VCF file.
 *
 * @param sortedTsvPath predictions TSV to read; the name indicates it is expected to be sorted
 * @param outputVcfPath VCF file to write
 * @param settings run settings
 */
void map(Path sortedTsvPath, Path outputVcfPath, Settings settings);

/**
 * Maps a precomputed scores TSV file to a VCF file.
 *
 * @param inputTsvPath precomputed scores TSV to read
 * @param outputVcfPath VCF file to write
 * @param settings run settings
 */
void mapPrecomputedScores(Path inputTsvPath, Path outputVcfPath, Settings settings);
}
84 changes: 64 additions & 20 deletions src/main/java/org/molgenis/capice/vcf/TsvToVcfMapperImpl.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@

import static java.nio.charset.StandardCharsets.UTF_8;
import static java.util.Collections.singletonList;
import static org.molgenis.capice.vcf.CapiceUtils.getVcfPosition;
import static org.molgenis.capice.vcf.CapiceUtils.getVcfPositionFromPrecomputedScore;
import static org.molgenis.capice.vcf.CapiceUtils.getVcfPositionFromPrediction;
import static org.molgenis.capice.vcf.TsvUtils.TSV_FORMAT;

import htsjdk.variant.variantcontext.VariantContextBuilder;
Expand All @@ -21,6 +22,7 @@
import java.io.UncheckedIOException;
import java.nio.file.Path;
import java.util.Iterator;
import java.util.zip.GZIPInputStream;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;
import org.springframework.stereotype.Component;
Expand All @@ -33,38 +35,65 @@ public class TsvToVcfMapperImpl implements TsvToVcfMapper {

@Override
public void map(Path sortedTsvPath, Path outputVcfPath, Settings settings) {
VariantContextWriter variantContextWriter =
new VariantContextWriterBuilder()
.setOutputFile(outputVcfPath.toFile())
.setOutputFileType(OutputType.BLOCK_COMPRESSED_VCF)
.build();
try {
try (VariantContextWriter variantContextWriter = createVariantContextWriter(outputVcfPath)) {
setVcfHeader(settings, variantContextWriter);
mapCapiceOutput(sortedTsvPath, variantContextWriter);
} finally {
variantContextWriter.close();
mapCapicePredictionsOutput(sortedTsvPath, variantContextWriter);
}
}

private void mapCapiceOutput(Path sortedTsvPath, VariantContextWriter variantContextWriter) {
try(Reader in = new InputStreamReader(new FileInputStream(sortedTsvPath.toFile()), UTF_8);
/**
 * Writes a VCF file from a precomputed scores TSV: the header is written first, then every
 * record of the input is mapped. The writer is closed when mapping finishes or fails.
 *
 * @param inputTsvPath precomputed scores TSV to read
 * @param outputVcfPath VCF file to create
 * @param settings run settings used for the VCF header
 */
@Override
public void mapPrecomputedScores(Path inputTsvPath, Path outputVcfPath, Settings settings) {
  try (VariantContextWriter vcfWriter = createVariantContextWriter(outputVcfPath)) {
    setVcfHeader(settings, vcfWriter);
    mapCapicePrecomputedScoresOutput(inputTsvPath, vcfWriter);
  }
}

private void mapCapicePredictionsOutput(
Path sortedTsvPath, VariantContextWriter variantContextWriter) {
try (Reader in = createInputReader(sortedTsvPath);
CSVParser csvParser = TSV_FORMAT.parse(in)) {
Iterator<CSVRecord> iterator = csvParser.iterator();
iterator.next(); // skip header line (TSV_FORMAT.withSkipHeaderLine doesn't seem to work)
while (iterator.hasNext()) {
CSVRecord record = iterator.next();
mapLine(variantContextWriter, record);
mapPredictionsLine(variantContextWriter, record);
}
} catch (IOException e) {
throw new UncheckedIOException(e);
}
}

/**
 * Reads the precomputed scores TSV and maps every parsed record, in file order, to the writer.
 *
 * @param inputTsvPath precomputed scores TSV to read
 * @param variantContextWriter writer receiving one mapped entry per record
 * @throws UncheckedIOException if opening, reading or closing the input fails
 */
private void mapCapicePrecomputedScoresOutput(
    Path inputTsvPath, VariantContextWriter variantContextWriter) {
  try (Reader scoresReader = createInputReader(inputTsvPath);
      CSVParser scoresParser = TSV_FORMAT.parse(scoresReader)) {
    Iterator<CSVRecord> records = scoresParser.iterator();
    while (records.hasNext()) {
      mapPrecomputedScoreLine(variantContextWriter, records.next());
    }
  } catch (IOException ioException) {
    throw new UncheckedIOException(ioException);
  }
}

void mapLine(VariantContextWriter variantContextWriter, CSVRecord record) {
void mapPredictionsLine(VariantContextWriter variantContextWriter, CSVRecord record) {
validateLine(record);
VcfPosition vcfPosition = getVcfPosition(record);
VcfPosition vcfPosition = getVcfPositionFromPrediction(record);
float prediction = getPrediction(record);

map(vcfPosition, prediction, variantContextWriter);
}

/**
 * Maps one precomputed scores record: the position is read first, then the score, and both are
 * handed to the shared mapping routine.
 *
 * @param variantContextWriter writer receiving the mapped entry
 * @param record precomputed scores record to map
 */
private void mapPrecomputedScoreLine(
    VariantContextWriter variantContextWriter, CSVRecord record) {
  map(getVcfPositionFromPrecomputedScore(record), getPrediction(record), variantContextWriter);
}

private void map(
VcfPosition vcfPosition, float prediction, VariantContextWriter variantContextWriter) {
long start = vcfPosition.getPosition();
long stop = start + (vcfPosition.getReference().length() - 1);
VariantContextBuilder variantContextBuilder = new VariantContextBuilder();
Expand All @@ -77,7 +106,7 @@ void mapLine(VariantContextWriter variantContextWriter, CSVRecord record) {
}

private void validateLine(CSVRecord record) {
if(record.get(POSITION_INDEX)==null || record.get(SCORE_INDEX) == null){
if (record.get(POSITION_INDEX) == null || record.get(SCORE_INDEX) == null) {
throw new MalformedCapiceInputException(record.getRecordNumber());
}
}
Expand All @@ -97,11 +126,26 @@ private void setVcfHeader(Settings settings, VariantContextWriter variantContext
}

private float getPrediction(CSVRecord record) {
try{
return Float.parseFloat(record.get(4));
}
catch(NumberFormatException e){
try {
return Float.parseFloat(record.get(4));
} catch (NumberFormatException e) {
throw new IllegalStateException(e);
}
}

/**
 * Opens the input TSV as a UTF-8 reader, transparently decompressing gzipped input.
 *
 * <p>The gzip check is made on the path's string form. {@link Path#endsWith(String)} compares
 * whole trailing path name elements rather than a character suffix, so it never matches a file
 * such as {@code scores.tsv.gz} against {@code ".gz"}.
 *
 * @param inputTsvPath input file; treated as gzip-compressed when its name ends with
 *     {@code .gz}
 * @return reader over the (decompressed) file content; the caller must close it
 * @throws IOException if the file cannot be opened or its gzip header cannot be read
 */
private static Reader createInputReader(Path inputTsvPath) throws IOException {
  FileInputStream fileInputStream = new FileInputStream(inputTsvPath.toFile());
  if (!inputTsvPath.toString().endsWith(".gz")) {
    return new InputStreamReader(fileInputStream, UTF_8);
  }

  try {
    return new InputStreamReader(new GZIPInputStream(fileInputStream), UTF_8);
  } catch (IOException | RuntimeException e) {
    // The GZIPInputStream constructor reads the gzip header and may fail; do not leak the
    // already opened file stream in that case.
    try {
      fileInputStream.close();
    } catch (IOException closeFailure) {
      e.addSuppressed(closeFailure);
    }
    throw e;
  }
}

/**
 * Creates a writer that produces a block-compressed VCF at the given location.
 *
 * @param outputVcfPath file the VCF is written to
 * @return writer for the output file; the caller must close it
 */
private static VariantContextWriter createVariantContextWriter(Path outputVcfPath) {
  VariantContextWriterBuilder writerBuilder = new VariantContextWriterBuilder();
  writerBuilder.setOutputFile(outputVcfPath.toFile());
  writerBuilder.setOutputFileType(OutputType.BLOCK_COMPRESSED_VCF);
  return writerBuilder.build();
}
}
Loading

0 comments on commit 62f5689

Please sign in to comment.