diff --git a/build.sbt b/build.sbt
index 2ae1eea..230edcb 100644
--- a/build.sbt
+++ b/build.sbt
@@ -20,14 +20,18 @@ javaOptions += "-Xmx4G"
 
 libraryDependencies ++= {
   Seq(
-    "org.phenopackets" % "phenopackets-api" % "0.0.4",
+    "org.phenopackets" % "phenopackets-api" % "0.0.5-SNAPSHOT" exclude("org.slf4j", "slf4j-log4j12"),
     "org.backuity.clist" %% "clist-core" % "2.0.2",
     "org.backuity.clist" %% "clist-macros" % "2.0.2" % "provided",
     "net.sourceforge.owlapi" % "owlapi-distribution" % "4.2.5",
-    "org.apache.jena" % "apache-jena-libs" % "3.1.0",
+    "org.phenoscape" %% "scowl" % "1.1",
+    "org.apache.jena" % "apache-jena-libs" % "2.12.1" exclude("org.slf4j", "slf4j-log4j12"),
     "com.github.jsonld-java" % "jsonld-java" % "0.8.3",
     "org.apache.directory.studio" % "org.apache.commons.io" % "2.4",
     "org.scalaz" %% "scalaz-core" % "7.2.1",
+    "com.github.tototoshi" %% "scala-csv" % "1.3.3",
+    "com.nrinaudo" %% "kantan.csv" % "0.1.12",
+    "com.nrinaudo" %% "kantan.csv-generic" % "0.1.12",
     "com.typesafe.scala-logging" %% "scala-logging" % "3.4.0",
     "ch.qos.logback" % "logback-classic" % "1.1.7",
     "org.codehaus.groovy" % "groovy-all" % "2.4.6",
diff --git a/src/main/scala/org/phenopackets/pxftools/command/Common.scala b/src/main/scala/org/phenopackets/pxftools/command/Common.scala
index f025723..1b7db31 100644
--- a/src/main/scala/org/phenopackets/pxftools/command/Common.scala
+++ b/src/main/scala/org/phenopackets/pxftools/command/Common.scala
@@ -11,28 +11,49 @@ import java.io.OutputStream
 import java.io.OutputStreamWriter
 
 import org.apache.commons.io.IOUtils
+import org.apache.jena.riot.Lang
 import org.backuity.clist._
 import org.phenopackets.api.PhenoPacket
 import org.phenopackets.api.io.JsonGenerator
 import org.phenopackets.api.io.JsonReader
+import org.phenopackets.api.io.RDFGenerator
 import org.phenopackets.api.io.YamlGenerator
 import org.phenopackets.api.io.YamlReader
-import org.phenopackets.api.io.RDFGenerator
-import org.apache.jena.riot.Lang
+import org.phenopackets.pxftools.util.HPOAnnotations
 
 trait Common extends Command {
 
   type PhenoPacketWriter = PhenoPacket => String
+  type PhenoPacketReader = InputStream => PhenoPacket
 
   def run(): Unit
 
   var out = opt[String](description = "Output file. Omit to write to standard out.", default = "")
 
-  var format = opt[String](description = "Output format. Set the output format to one of:\nyaml\njson\nturtle", default = "yaml")
+  var informat = opt[Option[String]](description = "Input format. By default both yaml and json will be attempted. Set the input format to one of:\nyaml\njson\nhpo-phenote")
+  var outformat = opt[String](description = "Output format. Set the output format to one of:\nyaml\njson\nturtle", default = "yaml")
 
-  def outputWriter: PhenoPacketWriter = format match {
-    case "yaml" => YamlGenerator.render _
-    case "json" => JsonGenerator.render _
+  /**
+   * Reader selected by the `informat` option, or `None` when no input format was given
+   * (in which case `readPhenoPacket` tries JSON and then YAML).
+   *
+   * @throws ParsingException if `informat` is not one of `yaml`, `json` or `hpo-phenote`
+   */
+  def inputReader: Option[PhenoPacketReader] = informat.map {
+    case "yaml" => YamlReader.readInputStream
+    case "json" => JsonReader.readInputStream
+    case "hpo-phenote" => HPOAnnotations.read
+    case _ => throw new ParsingException("Invalid input format.")
+  }
+
+  /**
+   * Writer selected by the `outformat` option.
+   *
+   * @throws ParsingException if `outformat` is not one of `yaml`, `json` or `turtle`
+   */
+  def outputWriter: PhenoPacketWriter = outformat match {
+    case "yaml" => YamlGenerator.render
+    case "json" => JsonGenerator.render
     case "turtle" => RDFGenerator.render(_, null, Lang.TURTLE) //TODO should we ask for a base?
     case _ => throw new ParsingException("Invalid output format.")
   }
@@ -43,23 +64,25 @@ trait Common extends Command {
   }
 
+  /**
+   * Reads a PhenoPacket from `inputStream` and closes the stream.
+   *
+   * Uses the reader chosen by `informat` when one was given; otherwise the input is
+   * parsed as JSON and, if that fails with an `IOException`, as YAML.
+   */
   def readPhenoPacket(inputStream: InputStream): PhenoPacket = {
-    // This is more complicated than it ought to be so that we can reuse
-    // the inputStream to try multiple parsers
-    val baos = new ByteArrayOutputStream()
-    IOUtils.copy(inputStream, baos)
-    val bytes = baos.toByteArray()
-    inputStream.close()
-    try {
-      val bais = new ByteArrayInputStream(bytes);
-      val packet = JsonReader.readInputStream(bais)
-      bais.close()
-      packet
-    } catch {
-      case ioe: IOException => {
-        val bais = new ByteArrayInputStream(bytes);
-        val packet = YamlReader.readInputStream(bais)
-        bais.close()
-        packet
-      }
-    }
+    // Always release the caller's stream, whichever parser ends up being used.
+    try {
+      inputReader.map(_(inputStream)).getOrElse {
+        // No explicit input format: buffer the whole input so that it can be
+        // replayed for each candidate parser (JSON first, then YAML).
+        val bytes = IOUtils.toByteArray(inputStream)
+        try {
+          JsonReader.readInputStream(new ByteArrayInputStream(bytes))
+        } catch {
+          case _: IOException => YamlReader.readInputStream(new ByteArrayInputStream(bytes))
+        }
+      }
+    } finally {
+      inputStream.close()
+    }
   }
diff --git a/src/main/scala/org/phenopackets/pxftools/util/HPOAnnotations.scala b/src/main/scala/org/phenopackets/pxftools/util/HPOAnnotations.scala
new file mode 100644
index 0000000..9ec61a6
--- /dev/null
+++ b/src/main/scala/org/phenopackets/pxftools/util/HPOAnnotations.scala
@@ -0,0 +1,163 @@
+package org.phenopackets.pxftools.util
+
+import java.util.UUID
+
+import scala.collection.JavaConverters._
+import scala.collection.mutable
+
+import org.phenopackets.api.PhenoPacket
+import org.phenopackets.api.io.RDFReader
+import org.phenopackets.api.util.ContextUtil
+import org.phenopackets.pxftools.util.PhenoPacketVocabulary._
+import org.phenoscape.scowl._
+import org.semanticweb.owlapi.apibinding.OWLManager
+import org.semanticweb.owlapi.model.AxiomType
+import org.semanticweb.owlapi.model.IRI
+
+import com.github.jsonldjava.core.Context
+import com.github.tototoshi.csv.CSVReader
+import com.hp.hpl.jena.rdf.model.ModelFactory
+import com.hp.hpl.jena.rdf.model.Resource
+import com.hp.hpl.jena.rdf.model.ResourceFactory
+import com.hp.hpl.jena.rdf.model.Statement
+import com.hp.hpl.jena.vocabulary.RDF
+import com.hp.hpl.jena.vocabulary.RDFS
+import com.typesafe.scalalogging.LazyLogging
+import com.github.tototoshi.csv.TSVFormat
+import java.io.InputStream
+
+/**
+ * Converts HPO annotation tables (tab-separated values with a header row) into PhenoPackets.
+ */
+object HPOAnnotations extends LazyLogging {
+
+  /**
+   * Reads a tab-separated HPO annotation table from `stream` (decoded as UTF-8) and converts
+   * it to a PhenoPacket. The reader opened on `stream` is closed before this method returns.
+   */
+  def read(stream: InputStream): PhenoPacket = {
+    val table = CSVReader.open(scala.io.Source.fromInputStream(stream, "utf-8"))(new TSVFormat {})
+    // The reader is opened here, so it is released here; importFromTable never closes its argument.
+    try importFromTable(table) finally table.close()
+  }
+
+  /**
+   * Converts every row of `table` into RDF statements and reads the resulting model back as a
+   * PhenoPacket identified by a freshly generated `urn:uuid:` URI.
+   *
+   * `table` is fully consumed but not closed; closing it remains the caller's job.
+   */
+  def importFromTable(table: CSVReader): PhenoPacket = {
+    val packetURI = s"urn:uuid:${UUID.randomUUID.toString}"
+    val packet = ResourceFactory.createResource(packetURI)
+    // toList forces all rows to be read now, so the caller may close the reader right afterwards.
+    val triples = table.iteratorWithHeaders.flatMap(rowToTriples(_, packet)).toList
+    val model = ModelFactory.createDefaultModel()
+    model.add(triples.asJava)
+    RDFReader.readModel(model, packetURI)
+  }
+
+  /**
+   * Translates one annotation row into RDF statements attached to `packet`.
+   *
+   * A row without a "Disease ID" yields no statements. Otherwise the row yields the disease,
+   * one phenotype association for it, and whichever optional parts (phenotype type, onset,
+   * frequency, description, evidence, publication) have a non-blank cell.
+   */
+  private def rowToTriples(row: Map[String, String], packet: Resource): Set[Statement] = {
+    val statements = mutable.Set.empty[Statement]
+    row.getOpt("Disease ID").foreach { diseaseID =>
+      val disease = ResourceFactory.createResource(ContextUtil.expandIdentifierAsValue(diseaseID, HPOContext))
+      statements += ResourceFactory.createStatement(packet, Diseases, disease)
+      row.getOpt("Disease Name").foreach { diseaseLabel =>
+        statements += ResourceFactory.createStatement(disease, RDFS.label, ResourceFactory.createTypedLiteral(diseaseLabel))
+      }
+      val association = ResourceFactory.createResource()
+      statements += ResourceFactory.createStatement(packet, PhenotypeProfile, association)
+      statements += ResourceFactory.createStatement(association, Entity, disease)
+      val phenotype = ResourceFactory.createResource()
+      statements += ResourceFactory.createStatement(association, Phenotype, phenotype)
+      row.getOpt("Phenotype ID").foreach { phenotypeID =>
+        val phenotypeType = ResourceFactory.createResource(ContextUtil.expandIdentifierAsValue(phenotypeID, HPOContext))
+        // A "NOT" in the negation column links the phenotype via owl:complementOf instead of rdf:type.
+        val phenoRelation = if (row.getOpt("Negation ID").exists(_.toUpperCase == "NOT")) {
+          OWLComplementOf
+        } else RDF.`type`
+        statements += ResourceFactory.createStatement(phenotype, phenoRelation, phenotypeType)
+        row.getOpt("Phenotype Name").foreach { phenotypeLabel =>
+          statements += ResourceFactory.createStatement(phenotypeType, RDFS.label, ResourceFactory.createTypedLiteral(phenotypeLabel))
+        }
+      }
+      row.getOpt("Age of Onset ID").foreach { onsetID =>
+        val onsetType = ResourceFactory.createResource(ContextUtil.expandIdentifierAsValue(onsetID, HPOContext))
+        val onset = ResourceFactory.createResource()
+        statements += ResourceFactory.createStatement(phenotype, Onset, onset)
+        statements += ResourceFactory.createStatement(onset, RDF.`type`, onsetType)
+        row.getOpt("Age of Onset Name").foreach { onsetLabel =>
+          statements += ResourceFactory.createStatement(onsetType, RDFS.label, ResourceFactory.createTypedLiteral(onsetLabel))
+        }
+      }
+      row.getOpt("Frequency").foreach { frequencyDesc =>
+        val frequency = ResourceFactory.createResource()
+        statements += ResourceFactory.createStatement(phenotype, Frequency, frequency)
+        statements += ResourceFactory.createStatement(frequency, Description, ResourceFactory.createTypedLiteral(frequencyDesc))
+      }
+      row.getOpt("Description").foreach { description =>
+        statements += ResourceFactory.createStatement(phenotype, Description, ResourceFactory.createTypedLiteral(description))
+      }
+      if (row.getOpt("Evidence ID").nonEmpty || row.getOpt("Pub").nonEmpty) {
+        val evidence = ResourceFactory.createResource()
+        statements += ResourceFactory.createStatement(association, Evidence, evidence)
+        row.getOpt("Evidence ID").foreach { evidenceID =>
+          val evidenceTypeOpt = evidenceCodesToURI.get(evidenceID)
+          val evidenceType = evidenceTypeOpt.getOrElse {
+            // Unknown shorthand: keep the raw code as the resource URI rather than dropping the evidence.
+            logger.warn(s"No IRI found for evidence code $evidenceID")
+            ResourceFactory.createResource(evidenceID)
+          }
+          statements += ResourceFactory.createStatement(evidence, RDF.`type`, evidenceType)
+          row.getOpt("Evidence Name").foreach { evidenceName =>
+            statements += ResourceFactory.createStatement(evidenceType, RDFS.label, ResourceFactory.createTypedLiteral(evidenceName))
+          }
+        }
+        row.getOpt("Pub").foreach { pubID =>
+          // NOTE(review): HPOContext declares no prefix for publication IDs (e.g. PMID) - confirm how these expand.
+          val pub = ResourceFactory.createResource(ContextUtil.expandIdentifierAsValue(pubID, HPOContext))
+          statements += ResourceFactory.createStatement(evidence, Source, pub)
+        }
+      }
+    }
+    statements.toSet
+  }
+
+  /** JSON-LD context passed to ContextUtil to expand the identifiers found in annotation rows. */
+  private val HPOContext: Context = new Context().parse(Map[String, Object](
+    "obo" -> "http://purl.obolibrary.org/obo/",
+    "HP" -> "obo:HP_",
+    "OMIM" -> "obo:OMIM_").asJava)
+
+  /**
+   * HPO annotations use shorthand labels as evidence IDs. This maps each exact synonym declared
+   * in the Evidence Ontology (ECO) to its term, so that a shorthand can be resolved to an IRI.
+   *
+   * ECO is loaded from http://purl.obolibrary.org/obo/eco.owl the first time the map is needed.
+   */
+  private lazy val evidenceCodesToURI: Map[String, Resource] = {
+    val manager = OWLManager.createOWLOntologyManager()
+    val eco = manager.loadOntology(IRI.create("http://purl.obolibrary.org/obo/eco.owl"))
+    val HasExactSynonym = AnnotationProperty("http://www.geneontology.org/formats/oboInOwl#hasExactSynonym")
+    (for {
+      AnnotationAssertion(_, HasExactSynonym, term: IRI, synonym ^^ _) <- eco.getAxioms(AxiomType.ANNOTATION_ASSERTION).asScala
+    } yield {
+      synonym -> ResourceFactory.createResource(term.toString)
+    }).toMap
+  }
+
+  private implicit class NullEmptyStringMap(val self: Map[String, String]) extends AnyVal {
+
+    // scala-csv puts empty strings in the result map; trim cells and convert blank ones to None
+    def getOpt(key: String): Option[String] = self.get(key).map(_.trim).filter(_.nonEmpty)
+
+  }
+
+}
diff --git a/src/main/scala/org/phenopackets/pxftools/util/MergeUtil.scala b/src/main/scala/org/phenopackets/pxftools/util/MergeUtil.scala
index 04e1a88..6d2a04b 100644
--- a/src/main/scala/org/phenopackets/pxftools/util/MergeUtil.scala
+++ b/src/main/scala/org/phenopackets/pxftools/util/MergeUtil.scala
@@ -3,7 +3,8 @@ package org.phenopackets.pxftools.util
 import org.phenopackets.api.PhenoPacket
 import org.phenopackets.api.io.RDFGenerator
 import org.phenopackets.api.io.RDFReader
-import org.apache.jena.rdf.model.ModelFactory
+
+import com.hp.hpl.jena.rdf.model.ModelFactory
 
 object MergeUtil {
 
diff --git a/src/main/scala/org/phenopackets/pxftools/util/PhenoPacketVocabulary.scala b/src/main/scala/org/phenopackets/pxftools/util/PhenoPacketVocabulary.scala
new file mode 100644
index 0000000..02114de
--- /dev/null
+++ b/src/main/scala/org/phenopackets/pxftools/util/PhenoPacketVocabulary.scala
@@ -0,0 +1,28 @@
+package org.phenopackets.pxftools.util
+
+import com.hp.hpl.jena.rdf.model.Property
+import com.hp.hpl.jena.rdf.model.ResourceFactory
+
+/**
+ * RDF properties used to build PhenoPacket models statement by statement.
+ */
+object PhenoPacketVocabulary {
+
+  private val Pheno = "http://phenopackets.org"
+  private val DC = "http://purl.org/dc/terms"
+
+  /** Creates the property with the given URI. */
+  private def p(uri: String): Property = ResourceFactory.createProperty(uri)
+
+  val Diseases: Property = p(s"$Pheno/diseases")
+  val PhenotypeProfile: Property = p(s"$Pheno/phenotype_profile")
+  val Entity: Property = p(s"$Pheno/entity")
+  val Phenotype: Property = p(s"$Pheno/phenotype")
+  val Onset: Property = p(s"$Pheno/onset")
+  val Frequency: Property = p(s"$Pheno/frequency")
+  val Evidence: Property = p(s"$Pheno/evidence")
+  val Description: Property = p(s"$DC/description")
+  val Source: Property = p(s"$DC/source")
+  val OWLComplementOf: Property = p("http://www.w3.org/2002/07/owl#complementOf")
+
+}