Skip to content

Commit

Permalink
refactor: oxigraph -> sophia api (#13)
Browse files Browse the repository at this point in the history
* refactor: rm all oxigraph imports

* refactor(deps): rio+oxigraph -> sophia

* refactor: standalone GraphFormat enum + adapt signatures

* refactor: use sophia 0.8.0 [WIP]

* refactor: complete refactor to sophia

* refactor: use Output enum instead of trait generics

* feat: add benchmark script

* fix(bench): add stdev on viz

* fix(bench): add stdev on viz (bis)

* chore: rm unused deps

* refactor: simplify RdfParser signatures

* doc: module documentation

* tests(io): unit testing

* test(cli): integration tests

* fix(cli): disable output on --no-out
  • Loading branch information
cmdoret authored Oct 30, 2023
1 parent f857df8 commit 54c1d98
Show file tree
Hide file tree
Showing 10 changed files with 694 additions and 509 deletions.
682 changes: 264 additions & 418 deletions Cargo.lock

Large diffs are not rendered by default.

8 changes: 5 additions & 3 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ edition = "2021"

[dependencies]
clap = { version = "4.4.2", features = ["derive"] }
oxigraph = "0.3.19"
rio_api = "0.8.4"
rio_turtle = "0.8.4"
sophia = { version = "0.8.0-alpha.3", features = ["xml"] }

[dev-dependencies]
assert_cmd = "2.0.12"
tempfile = "3.8.1"
19 changes: 19 additions & 0 deletions scripts/run_bench.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
#!/usr/bin/env bash
# Compare runtime of rdfpipe vs rdfpipe-rs
# hyperfine is the only dependency (besides rdfpipe and rdfpipe-rs)
#
# Usage: run_bench.sh DATASET
#   DATASET  path to a (large) N-Triples file. For every benchmarked size N,
#            its first N*1000 lines are piped into each tool.
set -euo pipefail

# File path to a (large) ntriples RDF dataset.
# Fail with a usage message rather than the bare "unbound variable" error
# that `set -u` produces when the argument is missing.
DATASET="${1:?usage: $0 DATASET (path to an N-Triples file)}"
if [[ ! -r "${DATASET}" ]]; then
    echo "error: cannot read dataset '${DATASET}'" >&2
    exit 1
fi
RDFPIPE_PY="rdfpipe"
RDFPIPE_RS="./target/release/rdfpipe-rs"

# Run both implementations with different number of triples
# timings are saved in timings.csv
# NOTE: ${DATASET} is interpolated unquoted into the command strings below,
# so dataset paths containing whitespace are not supported.
hyperfine \
    --warmup 1 \
    -L N 1,2,3,4,5,10,15,20,50 \
    -L FMT ttl,xml \
    --export-csv timings.csv \
    "head -n {N}000 ${DATASET} | ${RDFPIPE_PY} -i nt -o {FMT} - > /dev/null" \
    "head -n {N}000 ${DATASET} | ${RDFPIPE_RS} -i nt -o {FMT} - > /dev/null"
42 changes: 42 additions & 0 deletions scripts/viz_bench.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# Visualization of timings for rdfpipe vs rdfpipe-rs
# tidyverse>=1.1.3 is the only dependency
#
# Reads timings.csv from the working directory and plots, per output format,
# the log10 mean runtime of each tool against the number of input lines,
# with a ribbon spanning mean +/- stddev.

library(tidyverse)

# Columns used below: command, parameter_N, parameter_FMT, mean, stddev.
timings <- read_csv("timings.csv")

# Tidy the raw timings: shorter column names, a short tool label instead of
# the full benchmarked command line, and only the columns needed for plotting.
bench <- timings %>%
rename(
tool = command,
thousand_lines = parameter_N,
fmt = parameter_FMT
) %>%
# Any command mentioning "rdfpipe-rs" is labelled "rdfpipe-rs";
# everything else is labelled "rdfpipe".
mutate(tool = case_when(
str_detect(tool, "rdfpipe-rs") ~ "rdfpipe-rs",
TRUE ~ "rdfpipe"
)) %>%
select(tool, mean, fmt, stddev, thousand_lines) %>%
arrange(thousand_lines, tool)

# One line per tool, on a log10 y axis, faceted by output format.
ggplot(bench, aes(x = thousand_lines, y = log10(mean), color = tool)) +
# Ribbon showing mean +/- one stddev, transformed to the same log10 scale.
# NOTE(review): log10(mean - stddev) is NaN/-Inf for any row where
# stddev >= mean -- confirm this cannot happen with the benchmark data.
geom_ribbon(
aes(
y = log10(mean),
ymin = log10(mean - stddev),
ymax = log10(mean + stddev),
),
alpha = .5,
linewidth = 0,
fill = "lightgrey",
) +
geom_line() +
xlab("Thousands of lines parsed") +
ylab("Log10 time (seconds)") +
theme_bw(base_size = 22) +
# Fixed aspect ratio between the y and x axis units.
coord_fixed(ratio = 10) +
# One panel per output format, with human-readable panel titles.
facet_grid(~fmt, labeller = labeller(
fmt = c(
"ttl" = "ntriples -> turtle",
"xml" = "ntriples -> xml"
)
))
53 changes: 27 additions & 26 deletions src/cli.rs
Original file line number Diff line number Diff line change
@@ -1,14 +1,13 @@
//! Command line interface for the RDF conversion tool.
use clap;
use clap::{Parser, ValueEnum};
use std::fmt::Error;
use std::str::FromStr;

use clap::Parser;
use clap::ValueEnum;
use oxigraph::io::GraphFormat;

// This lets clap automate validation of
// RDF formats from the command line
#[derive(Clone, Debug, ValueEnum)]
pub(crate) enum ArgGraphFormat {
#[derive(Clone, Debug, PartialEq, ValueEnum)]
pub enum GraphFormat {
#[clap(alias = "ttl")]
Turtle,
#[clap(alias = "nt", alias = "ntriples")]
Expand All @@ -17,46 +16,48 @@ pub(crate) enum ArgGraphFormat {
RdfXml,
}

// Helper mappings to convert from helper CLI enum
// to corresponding values in oxigraph's enum
impl From<&ArgGraphFormat> for GraphFormat {
fn from(other: &ArgGraphFormat) -> GraphFormat {
match other {
ArgGraphFormat::Turtle => GraphFormat::Turtle,
ArgGraphFormat::NTriples => GraphFormat::NTriples,
ArgGraphFormat::RdfXml => GraphFormat::RdfXml,
}
}
}

impl FromStr for ArgGraphFormat {
impl FromStr for GraphFormat {
type Err = Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s {
"ntriples" | "nt" | "n-triples" => Ok(ArgGraphFormat::NTriples),
"xml" | "rdf/xml" | "rdf-xml" => Ok(ArgGraphFormat::RdfXml),
"ttl" | "turtle" => Ok(ArgGraphFormat::Turtle),
"ntriples" | "nt" | "n-triples" => Ok(GraphFormat::NTriples),
"xml" | "rdf/xml" | "rdf-xml" => Ok(GraphFormat::RdfXml),
"ttl" | "turtle" => Ok(GraphFormat::Turtle),
_ => Err(Error),
}
}
}

impl GraphFormat {
    /// Guess the RDF serialization format from a file extension.
    ///
    /// `ext` is the extension without a leading dot (e.g. `"ttl"`).
    /// Matching is ASCII case-insensitive, so `"TTL"` and `"Nt"` are
    /// recognised just like their lowercase forms.
    ///
    /// Returns `None` when the extension is not associated with any
    /// supported format.
    pub fn from_extension(ext: &str) -> Option<Self> {
        // Normalise case first: file names such as `DATA.TTL` are common and
        // should not defeat format guessing.
        match ext.to_ascii_lowercase().as_str() {
            "nt" | "ntriples" => Some(GraphFormat::NTriples),
            "xml" | "rdf" | "owl" => Some(GraphFormat::RdfXml),
            "ttl" | "turtle" => Some(GraphFormat::Turtle),
            _ => None,
        }
    }
}

#[derive(Parser, Debug)]
#[command(author, about = "RDF conversion tool")]
pub(crate) struct Args {
#[arg(long, help="Don't guess format based on file suffix.")]
#[arg(long, help = "Don't guess format based on file suffix.")]
pub(crate) no_guess: bool,
#[arg(long, help="Don't output the resulting graph (useful for checking validity of input).")]
#[arg(
long,
help = "Don't output the resulting graph (useful for checking validity of input)."
)]
pub(crate) no_out: bool,
#[arg(short, long, help = "Input RDF serialization format")]
pub(crate) input_format: Option<ArgGraphFormat>,
pub(crate) input_format: Option<GraphFormat>,
#[arg(
short,
long,
default_value = "turtle",
help = "Output RDF serialization format"
)]
pub(crate) output_format: Option<ArgGraphFormat>,
pub(crate) output_format: GraphFormat,
#[arg(default_value = "-", help = "Input file. Omit or use - for stdin.")]
pub(crate) input_file: Option<String>,
}
39 changes: 39 additions & 0 deletions src/converter.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
//! # Conversion logic
//!
//! This module contains the `RdfIO` trait which is used to parse and serialize RDF graphs.
//! Each RDF serialization format should implement this trait.
//!

use crate::io::{Input, Output};
use sophia::api::prelude::TripleParser;
use sophia::api::serializer::TripleSerializer;
use sophia::api::source::TripleSource;
use sophia::inmem::graph::FastGraph;

/// Parsing and serialization behaviour shared by every RDF format.
///
/// An implementor only supplies the two factory methods, [`RdfIO::parser`]
/// and [`RdfIO::serializer`]; the provided [`RdfIO::parse`] and
/// [`RdfIO::serialize`] methods drive them against an in-memory `FastGraph`.
pub trait RdfIO<'a, P: TripleParser<Input>, F: TripleSerializer> {
    /// Create a new parser for this format.
    fn parser(&self) -> P;

    /// Create a new serializer for this format, writing to `writer`.
    fn serializer(&self, writer: Output) -> F;

    /// Read every triple from `input` into a fresh in-memory graph.
    ///
    /// # Errors
    ///
    /// Returns `"Could not parse graph"` if adding the parsed triples to the
    /// graph fails; the underlying error is discarded.
    fn parse(&self, input: Input) -> Result<FastGraph, String> {
        let mut graph = FastGraph::new();
        self.parser()
            .parse(input)
            .add_to_graph(&mut graph)
            .map_err(|_| String::from("Could not parse graph"))?;
        Ok(graph)
    }

    /// Write `graph` to `writer` using this format's serializer.
    ///
    /// # Errors
    ///
    /// Returns `"Could not serialize graph"` if the serializer reports a
    /// failure; the underlying error is discarded.
    fn serialize(&self, writer: Output, graph: FastGraph) -> Result<(), String> {
        let mut serializer = self.serializer(writer);
        if serializer.serialize_graph(&graph).is_err() {
            return Err(String::from("Could not serialize graph"));
        }
        Ok(())
    }
}
77 changes: 77 additions & 0 deletions src/formats.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
//! # Implementation of concrete RDF formats
//!
//! This module implements `RdfIO` trait for each RDF serialization format.
use crate::cli::GraphFormat;
use crate::converter::RdfIO;
use crate::io::{Input, Output};
use sophia::inmem::graph::FastGraph;
use sophia::turtle::parser::nt::NTriplesParser;
use sophia::turtle::parser::turtle::TurtleParser;
use sophia::turtle::serializer::nt::NtSerializer;
use sophia::turtle::serializer::turtle::TurtleSerializer;
use sophia::xml::parser::RdfXmlParser;
use sophia::xml::serializer::RdfXmlSerializer;

/// Marker type for the N-Triples format; `RdfIO` is implemented for it in this module.
pub(crate) struct NTriples;
/// Marker type for the Turtle format; `RdfIO` is implemented for it in this module.
pub(crate) struct Turtle;
/// Marker type for the RDF/XML format; `RdfIO` is implemented for it in this module.
pub(crate) struct RdfXml;

/// The `RdfParser` struct provides a generic interface to parse RDF graphs
/// from different formats.
///
/// It is a thin wrapper around the in-memory graph produced by parsing.
pub struct RdfParser {
/// The in-memory graph holding the parsed triples.
pub graph: FastGraph,
}

impl RdfParser {
    /// Parse `input` according to `format` and wrap the resulting graph.
    ///
    /// # Errors
    ///
    /// Forwards the error string returned by the selected format's
    /// `RdfIO::parse` implementation.
    pub fn new(input: Input, format: GraphFormat) -> Result<Self, String> {
        // Pick the format-specific implementation, then wrap a successful result.
        let parsed = match format {
            GraphFormat::Turtle => Turtle.parse(input),
            GraphFormat::RdfXml => RdfXml.parse(input),
            GraphFormat::NTriples => NTriples.parse(input),
        };
        parsed.map(|graph| Self { graph })
    }
}

/// Format-agnostic entry point for writing an RDF graph.
///
/// `RdfSerializer` carries no state; it only dispatches to the `RdfIO`
/// implementation that matches the requested output format.
pub struct RdfSerializer;

impl RdfSerializer {
    /// Serialize `graph` to `dest` in the given `format`.
    ///
    /// # Errors
    ///
    /// Forwards the error string returned by the selected format's
    /// `RdfIO::serialize` implementation.
    pub fn serialize(dest: Output, format: GraphFormat, graph: FastGraph) -> Result<(), String> {
        match format {
            GraphFormat::Turtle => Turtle.serialize(dest, graph),
            GraphFormat::RdfXml => RdfXml.serialize(dest, graph),
            GraphFormat::NTriples => NTriples.serialize(dest, graph),
        }
    }
}
/// `RdfIO` for N-Triples: pairs `NTriplesParser` with an `NtSerializer`
/// that writes to `Output`.
impl<'a> RdfIO<'a, NTriplesParser, NtSerializer<Output>> for NTriples {
/// Returns a new `NTriplesParser`.
fn parser(&self) -> NTriplesParser {
NTriplesParser {}
}

/// Returns a new `NtSerializer` wrapping `writer`.
fn serializer(&self, writer: Output) -> NtSerializer<Output> {
NtSerializer::new(writer)
}
}

/// `RdfIO` for Turtle: pairs `TurtleParser` with a `TurtleSerializer`
/// that writes to `Output`.
impl<'a> RdfIO<'a, TurtleParser, TurtleSerializer<Output>> for Turtle {
/// Returns a new `TurtleParser` with `base` left unset.
// NOTE(review): `base` is presumably the base IRI used to resolve relative
// IRIs -- confirm against the sophia documentation.
fn parser(&self) -> TurtleParser {
TurtleParser { base: None }
}

/// Returns a new `TurtleSerializer` wrapping `writer`.
fn serializer(&self, writer: Output) -> TurtleSerializer<Output> {
TurtleSerializer::new(writer)
}
}

/// `RdfIO` for RDF/XML: pairs `RdfXmlParser` with an `RdfXmlSerializer`
/// that writes to `Output`.
impl<'a> RdfIO<'a, RdfXmlParser, RdfXmlSerializer<Output>> for RdfXml {
/// Returns a new `RdfXmlParser` with `base` left unset.
// NOTE(review): `base` is presumably the base IRI used to resolve relative
// IRIs -- confirm against the sophia documentation.
fn parser(&self) -> RdfXmlParser {
RdfXmlParser { base: None }
}

/// Returns a new `RdfXmlSerializer` wrapping `writer`.
fn serializer(&self, writer: Output) -> RdfXmlSerializer<Output> {
RdfXmlSerializer::new(writer)
}
}
Loading

0 comments on commit 54c1d98

Please sign in to comment.