This repository has been archived by the owner on Sep 25, 2018. It is now read-only.

Merge pull request #10 from phenopackets/hpo-annotations
HPO Phenote annotations import
balhoff authored Jul 6, 2016
2 parents 8ef8bf6 + dc50e9a commit 9b58eb6
Showing 5 changed files with 194 additions and 23 deletions.
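In short: this merge adds "hpo-phenote" as a third input format, so tab-delimited HPO Phenote annotation files can be imported alongside YAML and JSON, and it splits the old --format option into separate input and output format options. The dependency changes support this: scala-csv, scala-logging, and logback back the new importer, scowl supplies the OWL pattern matching used to mine ECO synonyms, and apache-jena-libs is rolled back from 3.1.0 to 2.12.1, evidently to match the phenopackets-api snapshot, which still targets the pre-3.x Jena API (hence the com.hp.hpl.jena imports below). Assuming clist derives flag names from the option variables in Common.scala, an invocation would look something like this (the subcommand name is illustrative):

    pxftools convert --informat=hpo-phenote --outformat=yaml annotations.tab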
8 changes: 6 additions & 2 deletions build.sbt
@@ -20,14 +20,18 @@ javaOptions += "-Xmx4G"
 
 libraryDependencies ++= {
   Seq(
-    "org.phenopackets" % "phenopackets-api" % "0.0.4",
+    "org.phenopackets" % "phenopackets-api" % "0.0.5-SNAPSHOT" exclude("org.slf4j", "slf4j-log4j12"),
     "org.backuity.clist" %% "clist-core" % "2.0.2",
     "org.backuity.clist" %% "clist-macros" % "2.0.2" % "provided",
     "net.sourceforge.owlapi" % "owlapi-distribution" % "4.2.5",
-    "org.apache.jena" % "apache-jena-libs" % "3.1.0",
+    "org.phenoscape" %% "scowl" % "1.1",
+    "org.apache.jena" % "apache-jena-libs" % "2.12.1" exclude("org.slf4j", "slf4j-log4j12"),
     "com.github.jsonld-java" % "jsonld-java" % "0.8.3",
     "org.apache.directory.studio" % "org.apache.commons.io" % "2.4",
     "org.scalaz" %% "scalaz-core" % "7.2.1",
+    "com.github.tototoshi" %% "scala-csv" % "1.3.3",
     "com.nrinaudo" %% "kantan.csv" % "0.1.12",
     "com.nrinaudo" %% "kantan.csv-generic" % "0.1.12",
+    "com.typesafe.scala-logging" %% "scala-logging" % "3.4.0",
+    "ch.qos.logback" % "logback-classic" % "1.1.7",
     "org.codehaus.groovy" % "groovy-all" % "2.4.6",
52 changes: 32 additions & 20 deletions src/main/scala/org/phenopackets/pxftools/command/Common.scala
@@ -11,28 +11,38 @@ import java.io.OutputStream
 import java.io.OutputStreamWriter
 
 import org.apache.commons.io.IOUtils
+import org.apache.jena.riot.Lang
 import org.backuity.clist._
 import org.phenopackets.api.PhenoPacket
 import org.phenopackets.api.io.JsonGenerator
 import org.phenopackets.api.io.JsonReader
+import org.phenopackets.api.io.RDFGenerator
 import org.phenopackets.api.io.YamlGenerator
 import org.phenopackets.api.io.YamlReader
-import org.phenopackets.api.io.RDFGenerator
-import org.apache.jena.riot.Lang
+import org.phenopackets.pxftools.util.HPOAnnotations
 
 trait Common extends Command {
 
   type PhenoPacketWriter = PhenoPacket => String
+  type PhenoPacketReader = InputStream => PhenoPacket
 
   def run(): Unit
 
   var out = opt[String](description = "Output file. Omit to write to standard out.", default = "")
 
-  var format = opt[String](description = "Output format. Set the output format to one of:\nyaml\njson\nturtle", default = "yaml")
+  var informat = opt[Option[String]](description = "Input format. By default both yaml and json will be attempted. Set the input format to one of:\nyaml\njson\nhpo-phenote")
+  var outformat = opt[String](description = "Output format. Set the output format to one of:\nyaml\njson\nturtle", default = "yaml")
 
-  def outputWriter: PhenoPacketWriter = format match {
-    case "yaml" => YamlGenerator.render _
-    case "json" => JsonGenerator.render _
+  def inputReader: Option[PhenoPacketReader] = informat.map(_ match {
+    case "yaml" => YamlReader.readInputStream
+    case "json" => JsonReader.readInputStream
+    case "hpo-phenote" => HPOAnnotations.read
+    case _ => throw new ParsingException("Invalid input format.")
+  })
+
+  def outputWriter: PhenoPacketWriter = outformat match {
+    case "yaml" => YamlGenerator.render
+    case "json" => JsonGenerator.render
     case "turtle" => RDFGenerator.render(_, null, Lang.TURTLE) //TODO should we ask for a base?
     case _ => throw new ParsingException("Invalid output format.")
   }
@@ -43,23 +53,25 @@ trait Common extends Command {
   }
 
   def readPhenoPacket(inputStream: InputStream): PhenoPacket = {
-    // This is more complicated than it ought to be so that we can reuse
-    // the inputStream to try multiple parsers
-    val baos = new ByteArrayOutputStream()
-    IOUtils.copy(inputStream, baos)
-    val bytes = baos.toByteArray()
-    inputStream.close()
-    try {
-      val bais = new ByteArrayInputStream(bytes);
-      val packet = JsonReader.readInputStream(bais)
-      bais.close()
-      packet
-    } catch {
-      case ioe: IOException => {
+    inputReader.map(_(inputStream)).getOrElse {
+      // This is more complicated than it ought to be so that we can reuse
+      // the inputStream to try multiple parsers
+      val baos = new ByteArrayOutputStream()
+      IOUtils.copy(inputStream, baos)
+      val bytes = baos.toByteArray()
+      inputStream.close()
+      try {
         val bais = new ByteArrayInputStream(bytes);
-        val packet = YamlReader.readInputStream(bais)
+        val packet = JsonReader.readInputStream(bais)
         bais.close()
         packet
+      } catch {
+        case ioe: IOException => {
+          val bais = new ByteArrayInputStream(bytes);
+          val packet = YamlReader.readInputStream(bais)
+          bais.close()
+          packet
+        }
+      }
     }
   }
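To see how these pieces compose, here is a minimal sketch — not part of this commit, and the input file name is a placeholder — that exercises the reader behind the new "hpo-phenote" format and renders the result with the same writer that backs the "yaml" output format. When no input format is given, readPhenoPacket instead buffers the stream and tries the JSON reader first, falling back to YAML on an IOException.

    import java.io.FileInputStream

    import org.phenopackets.api.PhenoPacket
    import org.phenopackets.api.io.YamlGenerator
    import org.phenopackets.pxftools.util.HPOAnnotations

    object PhenoteImportSketch extends App {
      // Placeholder path to a tab-delimited HPO Phenote annotation file
      val packet: PhenoPacket = HPOAnnotations.read(new FileInputStream("annotations.tab"))
      // Same writer the "yaml" outformat uses
      println(YamlGenerator.render(packet))
    }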
131 changes: 131 additions & 0 deletions src/main/scala/org/phenopackets/pxftools/util/HPOAnnotations.scala
@@ -0,0 +1,131 @@
package org.phenopackets.pxftools.util

import java.util.UUID

import scala.collection.JavaConverters._
import scala.collection.mutable

import org.phenopackets.api.PhenoPacket
import org.phenopackets.api.io.RDFReader
import org.phenopackets.api.util.ContextUtil
import org.phenopackets.pxftools.util.PhenoPacketVocabulary._
import org.phenoscape.scowl._
import org.semanticweb.owlapi.apibinding.OWLManager
import org.semanticweb.owlapi.model.AxiomType
import org.semanticweb.owlapi.model.IRI

import com.github.jsonldjava.core.Context
import com.github.tototoshi.csv.CSVReader
import com.hp.hpl.jena.rdf.model.ModelFactory
import com.hp.hpl.jena.rdf.model.Resource
import com.hp.hpl.jena.rdf.model.ResourceFactory
import com.hp.hpl.jena.rdf.model.Statement
import com.hp.hpl.jena.vocabulary.RDF
import com.hp.hpl.jena.vocabulary.RDFS
import com.typesafe.scalalogging.LazyLogging
import com.github.tototoshi.csv.TSVFormat
import java.io.InputStream

object HPOAnnotations extends LazyLogging {

  def read(stream: InputStream): PhenoPacket = importFromTable(CSVReader.open(scala.io.Source.fromInputStream(stream, "utf-8"))(new TSVFormat {}))

  def importFromTable(table: CSVReader): PhenoPacket = {
    val packetURI = s"urn:uuid:${UUID.randomUUID.toString}"
    val packet = ResourceFactory.createResource(packetURI)
    val triples = table.iteratorWithHeaders.flatMap(rowToTriples(_, packet)).toSeq
    val model = ModelFactory.createDefaultModel()
    model.add(triples.asJava)
    RDFReader.readModel(model, packetURI)
  }

  private def rowToTriples(row: Map[String, String], packet: Resource): Set[Statement] = {
    val statements = mutable.Set.empty[Statement]
    row.getOpt("Disease ID").foreach { diseaseID =>
      val disease = ResourceFactory.createResource(ContextUtil.expandIdentifierAsValue(diseaseID.trim, HPOContext))
      statements += ResourceFactory.createStatement(packet, Diseases, disease)
      row.getOpt("Disease Name").foreach { diseaseLabel =>
        statements += ResourceFactory.createStatement(disease, RDFS.label, ResourceFactory.createTypedLiteral(diseaseLabel.trim))
      }
      val association = ResourceFactory.createResource()
      statements += ResourceFactory.createStatement(packet, PhenotypeProfile, association)
      statements += ResourceFactory.createStatement(association, Entity, disease)
      val phenotype = ResourceFactory.createResource()
      statements += ResourceFactory.createStatement(association, Phenotype, phenotype)
      row.getOpt("Phenotype ID").foreach { phenotypeID =>
        val phenotypeType = ResourceFactory.createResource(ContextUtil.expandIdentifierAsValue(phenotypeID.trim, HPOContext))
        val phenoRelation = if (row.getOpt("Negation ID").exists(_.trim.toUpperCase == "NOT")) {
          OWLComplementOf
        } else RDF.`type`
        statements += ResourceFactory.createStatement(phenotype, phenoRelation, phenotypeType)
        row.getOpt("Phenotype Name").foreach { phenotypeLabel =>
          statements += ResourceFactory.createStatement(phenotypeType, RDFS.label, ResourceFactory.createTypedLiteral(phenotypeLabel.trim))
        }
      }
      row.getOpt("Age of Onset ID").foreach { onsetID =>
        val onsetType = ResourceFactory.createResource(ContextUtil.expandIdentifierAsValue(onsetID.trim, HPOContext))
        val onset = ResourceFactory.createResource()
        statements += ResourceFactory.createStatement(phenotype, Onset, onset)
        statements += ResourceFactory.createStatement(onset, RDF.`type`, onsetType)
        row.getOpt("Age of Onset Name").foreach { onsetLabel =>
          statements += ResourceFactory.createStatement(onsetType, RDFS.label, ResourceFactory.createTypedLiteral(onsetLabel.trim))
        }
      }
      row.getOpt("Frequency").foreach { frequencyDesc =>
        val frequency = ResourceFactory.createResource()
        statements += ResourceFactory.createStatement(phenotype, Frequency, frequency)
        statements += ResourceFactory.createStatement(frequency, Description, ResourceFactory.createTypedLiteral(frequencyDesc.trim))
      }
      row.getOpt("Description").foreach { description =>
        statements += ResourceFactory.createStatement(phenotype, Description, ResourceFactory.createTypedLiteral(description.trim))
      }
      if (row.getOpt("Evidence ID").nonEmpty || row.getOpt("Pub").nonEmpty) {
        val evidence = ResourceFactory.createResource()
        statements += ResourceFactory.createStatement(association, Evidence, evidence)
        row.getOpt("Evidence ID").foreach { evidenceID =>
          val evidenceTypeOpt = evidenceCodesToURI.get(evidenceID.trim)
          val evidenceType = evidenceTypeOpt.getOrElse {
            logger.warn(s"No IRI found for evidence code $evidenceID")
            ResourceFactory.createResource(evidenceID.trim)
          }
          statements += ResourceFactory.createStatement(evidence, RDF.`type`, evidenceType)
          row.getOpt("Evidence Name").foreach { evidenceName =>
            statements += ResourceFactory.createStatement(evidenceType, RDFS.label, ResourceFactory.createTypedLiteral(evidenceName.trim))
          }
        }
        row.getOpt("Pub").foreach { pubID =>
          val pub = ResourceFactory.createResource(ContextUtil.expandIdentifierAsValue(pubID.trim, HPOContext))
          statements += ResourceFactory.createStatement(evidence, Source, pub)
        }
      }
    }
    statements.toSet
  }

  private val HPOContext: Context = new Context().parse(Map[String, Object](
    "obo" -> "http://purl.obolibrary.org/obo/",
    "HP" -> "obo:HP_",
    "OMIM" -> "obo:OMIM_").asJava)

  /**
   * HPO annotations use shorthand labels as evidence IDs
   */
  private lazy val evidenceCodesToURI: Map[String, Resource] = {
    val manager = OWLManager.createOWLOntologyManager()
    val eco = manager.loadOntology(IRI.create("http://purl.obolibrary.org/obo/eco.owl"))
    val HasExactSynonym = AnnotationProperty("http://www.geneontology.org/formats/oboInOwl#hasExactSynonym")
    (for {
      AnnotationAssertion(_, HasExactSynonym, term: IRI, synonym ^^ dt) <- eco.getAxioms(AxiomType.ANNOTATION_ASSERTION).asScala
    } yield {
      synonym -> ResourceFactory.createResource(term.toString)
    }).toMap
  }

  private implicit class NullEmptyStringMap(val self: Map[String, String]) extends AnyVal {

    //scala-csv puts empty strings in the result map; convert to None instead
    def getOpt(key: String): Option[String] = self.get(key).filter(_.nonEmpty)

  }

}
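The importer keys on Phenote's column headers ("Disease ID", "Phenotype ID", "Negation ID", and so on), and getOpt treats empty cells as absent. A one-row table built in memory makes the expected shape concrete; this is an illustrative sketch with made-up cell values, leaving "Evidence ID" blank so that evidenceCodesToURI never has to fetch eco.owl:

    import java.io.ByteArrayInputStream

    import org.phenopackets.pxftools.util.HPOAnnotations

    object PhenoteRowSketch extends App {
      val columns = Seq("Disease ID", "Disease Name", "Phenotype ID", "Phenotype Name",
        "Negation ID", "Age of Onset ID", "Age of Onset Name", "Frequency",
        "Description", "Evidence ID", "Evidence Name", "Pub")
      // Illustrative values; empty strings are dropped by getOpt
      val row = Seq("OMIM:101200", "Apert syndrome", "HP:0000244", "Turribrachycephaly",
        "", "HP:0003577", "Congenital onset", "", "", "", "", "OMIM:101200")
      val tsv = columns.mkString("\t") + "\n" + row.mkString("\t") + "\n"
      val packet = HPOAnnotations.read(new ByteArrayInputStream(tsv.getBytes("utf-8")))
      println(packet)
    }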
3 changes: 2 additions & 1 deletion src/main/scala/org/phenopackets/pxftools/util/MergeUtil.scala
@@ -3,7 +3,8 @@ package org.phenopackets.pxftools.util
 import org.phenopackets.api.PhenoPacket
 import org.phenopackets.api.io.RDFGenerator
 import org.phenopackets.api.io.RDFReader
-import org.apache.jena.rdf.model.ModelFactory
+
+import com.hp.hpl.jena.rdf.model.ModelFactory
 
 object MergeUtil {
 
23 changes: 23 additions & 0 deletions src/main/scala/org/phenopackets/pxftools/util/PhenoPacketVocabulary.scala
@@ -0,0 +1,23 @@
package org.phenopackets.pxftools.util

import com.hp.hpl.jena.rdf.model.ResourceFactory

object PhenoPacketVocabulary {

  private val Pheno = "http://phenopackets.org"
  private val DC = "http://purl.org/dc/terms"

  private def p = ResourceFactory.createProperty(_: String)

  val Diseases = p(s"$Pheno/diseases")
  val PhenotypeProfile = p(s"$Pheno/phenotype_profile")
  val Entity = p(s"$Pheno/entity")
  val Phenotype = p(s"$Pheno/phenotype")
  val Onset = p(s"$Pheno/onset")
  val Frequency = p(s"$Pheno/frequency")
  val Evidence = p(s"$Pheno/evidence")
  val Description = p(s"$DC/description")
  val Source = p(s"$DC/source")
  val OWLComplementOf = p("http://www.w3.org/2002/07/owl#complementOf")

}
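These are the predicates rowToTriples uses to assemble the intermediate RDF graph that RDFReader then turns into a PhenoPacket; note that OWLComplementOf stands in for rdf:type when a Phenote row carries a "NOT" negation. A minimal sketch of direct use, with an illustrative disease IRI (the expansion HPOContext would produce for "OMIM:101200"):

    import java.util.UUID

    import com.hp.hpl.jena.rdf.model.ResourceFactory
    import org.phenopackets.pxftools.util.PhenoPacketVocabulary._

    object VocabularySketch extends App {
      val packet = ResourceFactory.createResource(s"urn:uuid:${UUID.randomUUID}")
      val disease = ResourceFactory.createResource("http://purl.obolibrary.org/obo/OMIM_101200")
      println(ResourceFactory.createStatement(packet, Diseases, disease))
    }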
