diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index ce44e96..5f358c8 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -16,15 +16,15 @@ jobs: python-version: ["3.10", "3.11", "3.12"] steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 + uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} - name: Install uv - uses: astral-sh/setup-uv@v2 + uses: astral-sh/setup-uv@v7 - name: Install dependencies run: uv sync diff --git a/README.md b/README.md index f74ebfc..fa429c2 100644 --- a/README.md +++ b/README.md @@ -362,4 +362,6 @@ The `example_projects` folder contains contrived examples, but this has also bee 1. check and raise error or at least warning if default_union is set in underlying Dataset of DatasetView 1. document and/or fix serialisation: canon longTurtle is not great with the way it orders things, so we might need to call out to riot unfortunately. 1. consider changing the distinction from interal/external to data/vocabulary (where vocab includes taxonomies or ontologies) - basically the ABox/TBox distinction where CBox is part of TBox. -1. add support for ASK query +1. add better output support for ASK query +1. add option to remove project name from named graphs, for easier specification: + 1. e.g. `` which is easy to remember and specify on command-line. diff --git a/example_projects/eg0-basic/derived/expected_merged.trig b/example_projects/eg0-basic/derived/expected_merged.trig deleted file mode 100644 index 3269bb8..0000000 --- a/example_projects/eg0-basic/derived/expected_merged.trig +++ /dev/null @@ -1,18 +0,0 @@ -@prefix : . -@prefix foaf: . -@prefix owl: . - - { - :Bob a foaf:Person ; - foaf:knows :Alice ; - foaf:name "Bob Jones" . - - :Alice a foaf:Person ; - foaf:age 30 ; - foaf:name "Alice Smith" . -} - - { - foaf:knows a owl:SymmetricProperty . 
-} - diff --git a/example_projects/eg0-basic/expected/expected-0-merged.trig b/example_projects/eg0-basic/expected/expected-0-merged.trig new file mode 100644 index 0000000..adf0572 --- /dev/null +++ b/example_projects/eg0-basic/expected/expected-0-merged.trig @@ -0,0 +1,30 @@ +@prefix : . +@prefix dcterms: . +@prefix foaf: . +@prefix owl: . +@prefix pythinfer: . + +@base . + + { + :Bob a foaf:Person ; + foaf:knows :Alice ; + foaf:name "Bob Jones" . + + :Alice a foaf:Person ; + foaf:age 30 ; + foaf:name "Alice Smith" . +} + + { + foaf:knows a owl:SymmetricProperty . +} + + { + a pythinfer:SourceGraph ; + dcterms:source ; + . + a pythinfer:SourceGraph ; + dcterms:source ; + . +} \ No newline at end of file diff --git a/example_projects/eg0-basic/derived/expected_inferred_wanted.trig b/example_projects/eg0-basic/expected/expected-2-inferred-wanted.trig similarity index 68% rename from example_projects/eg0-basic/derived/expected_inferred_wanted.trig rename to example_projects/eg0-basic/expected/expected-2-inferred-wanted.trig index ba12791..d00c58e 100644 --- a/example_projects/eg0-basic/derived/expected_inferred_wanted.trig +++ b/example_projects/eg0-basic/expected/expected-2-inferred-wanted.trig @@ -1,6 +1,6 @@ @prefix : . @prefix foaf: . - { + { :Alice foaf:knows :Bob . } diff --git a/example_projects/eg1-ancestors/derived/expected_merged.trig b/example_projects/eg1-ancestors/expected/expected-0-merged.trig similarity index 79% rename from example_projects/eg1-ancestors/derived/expected_merged.trig rename to example_projects/eg1-ancestors/expected/expected-0-merged.trig index 1e8d3e6..a3774e8 100644 --- a/example_projects/eg1-ancestors/derived/expected_merged.trig +++ b/example_projects/eg1-ancestors/expected/expected-0-merged.trig @@ -4,8 +4,11 @@ @prefix rdf: . @prefix rdfs: . @prefix skos: . +@prefix pythinfer: . - { +@base . + + { ex:childOf rdfs:label "child of" ; owl:inverseOf ex:parentOf . @@ -33,7 +36,7 @@ rdfs:label "Person" . 
} - { + { ex:Alice a ex:Person ; ex:parentOf ex:Bob, ex:Carol . @@ -48,3 +51,9 @@ ex:Eve a ex:Person . } + + { + a pythinfer:SourceGraph . + a pythinfer:SourceGraph . + a pythinfer:SourceGraph . +} \ No newline at end of file diff --git a/example_projects/eg1-ancestors/derived/expected_inferred_wanted.trig b/example_projects/eg1-ancestors/expected/expected-2-inferred-wanted.trig similarity index 89% rename from example_projects/eg1-ancestors/derived/expected_inferred_wanted.trig rename to example_projects/eg1-ancestors/expected/expected-2-inferred-wanted.trig index a355100..c49f1be 100644 --- a/example_projects/eg1-ancestors/derived/expected_inferred_wanted.trig +++ b/example_projects/eg1-ancestors/expected/expected-2-inferred-wanted.trig @@ -1,6 +1,7 @@ @prefix ex: . - { +@base . + { ex:David ex:childOf ex:Bob ; ex:descendantOf ex:Alice, ex:Bob . diff --git a/example_projects/eg2-projects/derived/expected_inferred_wanted.trig b/example_projects/eg2-projects/expected/expected-2-inferred-wanted.trig similarity index 98% rename from example_projects/eg2-projects/derived/expected_inferred_wanted.trig rename to example_projects/eg2-projects/expected/expected-2-inferred-wanted.trig index 202a2a0..27d80cc 100644 --- a/example_projects/eg2-projects/derived/expected_inferred_wanted.trig +++ b/example_projects/eg2-projects/expected/expected-2-inferred-wanted.trig @@ -7,7 +7,9 @@ @prefix ptp: . @prefix rdfs: . - { +@base . 
+ + { # This is the most important SPARQL-based inference: the relationship between projA and projB eg:relationship-projA-projB a ptp:ProjectRelationship ; ptp:hasParticipant eg:projA, @@ -47,7 +49,7 @@ } - { + { eg:projA a dcat:Catalog, prov:Activity ; ptp:hasDataSource _:b0, diff --git a/src/pythinfer/infer.py b/src/pythinfer/infer.py index e51bb42..6254752 100755 --- a/src/pythinfer/infer.py +++ b/src/pythinfer/infer.py @@ -9,6 +9,7 @@ from owlrl import DeductiveClosure from owlrl.OWLRL import OWLRL_Semantics from rdflib import ( + DCTERMS, OWL, RDF, RDFS, @@ -30,11 +31,9 @@ export_dataset, load_sparql_inference_queries, ) +from pythinfer.inout import PYTHINFER_NS from pythinfer.rdflibplus import DatasetView -IRI_EXTERNAL_INFERENCES: URIRef = URIRef("inferences_external") # type: ignore[bad-assignment] -IRI_OWL_INFERENCES: URIRef = URIRef("inferences_owl") # type: ignore[bad-assignment] -IRI_SPARQL_INFERENCES: URIRef = URIRef("inferences_sparql") # type: ignore[bad-assignment] MAX_REASONING_ROUNDS = 5 SCRIPT_DIR = Path(__file__).parent @@ -259,7 +258,7 @@ def filter_triples( def _generate_external_inferences( - ds: Dataset, external_graph_ids: list[IdentifiedNode] + ds: Dataset, external_graph_ids: list[IdentifiedNode], project: Project ) -> Graph: """Generate inferences from external vocabularies only (step 2). @@ -269,6 +268,7 @@ def _generate_external_inferences( Args: ds: Dataset containing all graphs. external_graph_ids: List of graph identifiers that are external. + project: The project configuration. Returns: Graph containing external inferences. 
@@ -286,13 +286,21 @@ def _generate_external_inferences( info(" Temporary dataset created with %d triples in default graph", len(temp_ds)) # Create inferences graph in temp dataset (must share same store) - temp_inferences = temp_ds.graph(IRI_EXTERNAL_INFERENCES) + iri_external = project.inference_gid("external") + temp_inferences = temp_ds.graph(iri_external) + g_provenance = ds.graph(project.provenance_gid) apply_owlrl_inference(temp_ds, temp_inferences) - g_external_inferences = ds.graph(IRI_EXTERNAL_INFERENCES) + g_external_inferences = ds.graph(iri_external) for s, p, o in temp_inferences: g_external_inferences.add((s, p, o)) + + # Add provenance metadata for external inference graph + g_provenance.add((iri_external, RDF.type, PYTHINFER_NS["InferenceGraph"])) + g_provenance.add( + (iri_external, PYTHINFER_NS["inferenceEngine"], Literal("owlrl")) + ) info(" External inferences generated: %d triples", len(g_external_inferences)) return g_external_inferences @@ -406,7 +414,9 @@ def run_inference_backend( sparql_queries = load_sparql_inference_queries(project.paths_sparql_inference or []) # Step 2: Generate external inferences (once - this is the "noise floor") - g_external_inferences = _generate_external_inferences(ds, external_graph_ids) + g_external_inferences = _generate_external_inferences( + ds, external_graph_ids, project + ) # Steps 3-5: Iterate full inferences + heuristics until convergence info( @@ -414,8 +424,28 @@ def run_inference_backend( MAX_REASONING_ROUNDS, ) - g_inferences_owl = ds.graph(IRI_OWL_INFERENCES) - g_inferences_sparql = ds.graph(IRI_SPARQL_INFERENCES) + iri_owl = project.inference_gid("owl") + iri_sparql = project.inference_gid("sparql") + g_inferences_owl = ds.graph(iri_owl) + g_inferences_sparql = ds.graph(iri_sparql) + g_provenance = ds.graph(project.provenance_gid) + + # Add provenance metadata for inference graphs + g_provenance.add((iri_owl, RDF.type, PYTHINFER_NS["InferenceGraph"])) + g_provenance.add( + (iri_owl, 
PYTHINFER_NS["inferenceEngine"], Literal(project.owl_backend)) + ) + + g_provenance.add( + (iri_sparql, RDF.type, PYTHINFER_NS["InferenceGraph"]) + ) + g_provenance.add( + ( + iri_sparql, + PYTHINFER_NS["inferenceEngine"], + Literal("SPARQL CONSTRUCT"), + ) + ) iteration = 0 previous_triple_count = len(ds) # Count triples in entire dataset @@ -491,9 +521,10 @@ def run_inference_backend( len(g_inferences_owl) + len(g_inferences_sparql), ) + iri_external = project.inference_gid("external") all_external_ids: list[IdentifiedNode] = [ *external_graph_ids, - IRI_EXTERNAL_INFERENCES, + iri_external, ] output_file = output or project.path_output / f"{INFERRED_WANTED_FILESTEM}.trig" @@ -501,8 +532,8 @@ def run_inference_backend( output_ds = DatasetView( ds, - [IRI_OWL_INFERENCES, IRI_SPARQL_INFERENCES] - + ([IRI_EXTERNAL_INFERENCES] if export_external_inferences else []), + [iri_owl, iri_sparql] + + ([iri_external] if export_external_inferences else []), ) export_dataset( diff --git a/src/pythinfer/inout.py b/src/pythinfer/inout.py index c8b63d2..bf4d4b6 100644 --- a/src/pythinfer/inout.py +++ b/src/pythinfer/inout.py @@ -14,10 +14,17 @@ field_validator, model_validator, ) -from rdflib import Dataset, Graph +from rdflib import Dataset, Graph, Namespace, URIRef logger = logging.getLogger(__name__) +# Base namespace for pythinfer graph identifiers and potentially other IRIs +# Originally wanted to use a URN base (`urn:pythinfer:`) like so: +# Format: urn:pythinfer:{project-name}:file:{relative-path} +# or: urn:pythinfer:{project-name}:inferences:{type} +# However, parsing the TTL complained about no slash after colon etc. 
+PYTHINFER_NS = Namespace("http://pythinfer.local/") + PROJECT_FILE_NAME = "pythinfer.yaml" MAX_DISCOVERY_SEARCH_DEPTH = 10 @@ -117,7 +124,7 @@ class Project(BaseModel): @model_validator(mode="before") @classmethod - def normalize_field_names(cls, data: dict) -> dict: + def normalize_field_names(cls, data: dict[str, str]) -> dict[str, str]: """Normalize field names to accept multiple spellings.""" if not isinstance(data, dict): return data @@ -136,10 +143,9 @@ def normalize_field_names(cls, data: dict) -> dict: "sparql_inference": "paths_sparql_inference", "paths_sparql_inference": "paths_sparql_inference", "owl-backend": "owl_backend", - "owl_backend": "owl_backend", } - normalized = {} + normalized: dict[str, str] = {} for key, value in data.items(): # Use canonical name if it's an alias, otherwise keep original canonical_key = field_aliases.get(key, key) @@ -209,7 +215,7 @@ def from_yaml(config_path: Path | str) -> "Project": # Add path_self to the config dict before validation cfg["path_self"] = _config_path if "name" not in cfg: - cfg["name"] = _config_path.stem + cfg["name"] = _config_path.parent.stem # Let Pydantic handle validation and field normalization # Pass config_dir through context for path resolution in validators @@ -221,10 +227,11 @@ def _path_to_yaml_str(self, path: Path) -> str: If the path is relative to the project file's directory, store it relative for better portability. Otherwise, store as absolute path. 
""" - project_dir = self.path_self.parent + resolved_path = path.resolve() + resolved_project_dir = self.path_self.resolve().parent try: # Try to make it relative to the project directory - rel_path = path.relative_to(project_dir) + rel_path = resolved_path.relative_to(resolved_project_dir) return str(rel_path) except ValueError: # Path is not relative to project_dir, store as-is @@ -280,6 +287,67 @@ def paths_all(self) -> list[Path]: """List of all paths (input + SPARQL inference) - cache checking.""" return self.paths_all_input + (self.paths_sparql_inference or []) + @property + def namespace(self) -> Namespace: + """The IRI Namespace associated with this Project.""" + # TODO: normalise name to be appropriate for an IRI. + return Namespace(PYTHINFER_NS[self.name] + "/") + + @property + def provenance_gid(self) -> URIRef: + """The IRI to use for the provenance named graph for this Project.""" + return self.namespace["provenance"] + + def source_file_gid(self, file_path: Path) -> URIRef: + """Create a stable identifier for a source file's named graph. 
+ + Uses project name and relative path to create an IRI that is: + - Stable across re-parsing + - Portable within a project + - Informative about the source + - (no longer, because URNs don't currently work) Explicitly non-dereferenceable + + Args: + file_path: Path to the source file + + Returns: + IRI for the named graph, e.g.: + http://pythinfer.local/eg0-basic/file/basic-model.ttl + + """ + # Resolve both paths to their canonical form to handle symlinks and + # relative path differences that can occur across different environments + resolved_file_path = file_path.resolve() + resolved_project_parent = self.path_self.resolve().parent + + try: + rel_path = resolved_file_path.relative_to(resolved_project_parent) + except ValueError: + # File is outside project directory; try to use the original path as-is + # to preserve the structure shown in the config file + try: + # Try with the unresolved path in case it has a meaningful structure + rel_path = file_path.relative_to(self.path_self.parent) + except ValueError: + # If that also fails, just use the file name + rel_path = resolved_file_path.name + + # The relative path supplies the hierarchy of the graph IRI as-is; a URN form + # would instead need the path separators replaced with colons. + return self.namespace[f"file/{rel_path}"] + + def inference_gid(self, inference_type: str) -> URIRef: + """Create a stable identifier for an inference graph. + + Args: + inference_type: Type of inference ('external', 'owl', or 'sparql') + + Returns: + IRI for the inference graph, e.g.: + http://pythinfer.local/eg0-basic/inferences/owl + + """ + return self.namespace[f"inferences/{inference_type}"] def discover_project(start_path: Path, _current_depth: int = 0) -> Path: """Discover a pythinfer project by searching for a config file. 
diff --git a/src/pythinfer/merge.py b/src/pythinfer/merge.py index 6c8f7ed..bf2ec21 100644 --- a/src/pythinfer/merge.py +++ b/src/pythinfer/merge.py @@ -3,21 +3,15 @@ import logging from pathlib import Path -from rdflib import Dataset, IdentifiedNode +from rdflib import DCTERMS, RDF, Dataset, IdentifiedNode, URIRef -from pythinfer.inout import MERGED_FILESTEM, Project, export_dataset +from pythinfer.inout import MERGED_FILESTEM, PYTHINFER_NS, Project, export_dataset from pythinfer.rdflibplus import DatasetView logger = logging.getLogger(__name__) info = logger.info dbg = debug = logger.debug - -# NB: in the below we are using the file *name* only as the named graph identifier. -# This assumes that input files have unique names even if in different directories, -# which is likely an invalid assumption... - - def merge_graphs( project: Project, *, @@ -43,24 +37,43 @@ def merge_graphs( """ ds = Dataset() + ds.bind("pythinfer", PYTHINFER_NS) + ds.bind("dcterms", DCTERMS) external_gids: list[IdentifiedNode] = [] + g_provenance = ds.graph(project.provenance_gid) # Load external vocabulary files (ephemeral - used for inference only) for src in project.paths_vocab_ext: - g = ds.graph(src.name) + graph_urn = project.source_file_gid(src) + g = ds.graph(graph_urn) g.parse(src, format="turtle") + + # Add provenance metadata to the graph + g_provenance.add((graph_urn, RDF.type, PYTHINFER_NS["SourceGraph"])) + g_provenance.add((graph_urn, DCTERMS.source, URIRef(src.resolve().as_uri()))) + external_gids.append(g.identifier) # Load internal vocabulary files for src in project.paths_vocab_int: - g = ds.graph(src.name) + graph_urn = project.source_file_gid(src) + g = ds.graph(graph_urn) g.parse(src, format="turtle") + # Add provenance metadata + g_provenance.add((graph_urn, RDF.type, PYTHINFER_NS["SourceGraph"])) + g_provenance.add((graph_urn, DCTERMS.source, URIRef(src.resolve().as_uri()))) + # Load data files for src in project.paths_data: - g = ds.graph(src.name) + graph_urn = 
project.source_file_gid(src) + g = ds.graph(graph_urn) g.parse(src, format="turtle") + # Add provenance metadata + g_provenance.add((graph_urn, RDF.type, PYTHINFER_NS["SourceGraph"])) + g_provenance.add((graph_urn, DCTERMS.source, URIRef(src.resolve().as_uri()))) + if output: if isinstance(output, bool): output_file = project.path_output / f"{MERGED_FILESTEM}.trig" diff --git a/tests/e2e/test_e2e_from_cli.py b/tests/e2e/test_e2e_from_cli.py index 313e22a..7a9e068 100644 --- a/tests/e2e/test_e2e_from_cli.py +++ b/tests/e2e/test_e2e_from_cli.py @@ -4,7 +4,7 @@ from pathlib import Path import pytest -from rdflib import Dataset +from rdflib import Dataset, DCTERMS from rdflib.compare import graph_diff, isomorphic from typer.testing import CliRunner @@ -39,15 +39,11 @@ def test_cli_command( if (command == "merge") else f"{INFERRED_WANTED_FILESTEM}.trig" ) - expected_file = ( - "expected_merged.trig" - if (command == "merge") - else "expected_inferred_wanted.trig" - ) + expected_file = "expected-" + actual_file # Path to expected and actual output files - expected_file_path = project_dir / "derived" / expected_file - actual_file_path = project_dir / "derived" / actual_file + expected_file_path = project_dir / "expected" / expected_file + actual_file_path = project_dir / "derived" / "test_cli_command" / actual_file # Ensure expected file exists assert expected_file_path.exists(), ( @@ -58,16 +54,19 @@ def test_cli_command( if actual_file_path.exists(): actual_file_path.unlink() + # Make sure intermediate output folder exists + actual_file_path.parent.mkdir(parents=True, exist_ok=True) + # Run the command using CliRunner but with proper working directory # Save current working directory and change to project directory original_cwd = Path.cwd() + runner = CliRunner() + cmd_args = [command, "--output", str(actual_file_path)] + # Disable cache for infer command to ensure fresh runs + if command == "infer": + cmd_args.append("--no-cache") try: os.chdir(project_dir) - runner = 
CliRunner() - cmd_args = [command, "--output", str(actual_file_path)] - # Disable cache for infer command to ensure fresh runs - if command == "infer": - cmd_args.append("--no-cache") result = runner.invoke(app, cmd_args) finally: os.chdir(original_cwd) @@ -103,6 +102,11 @@ def test_cli_command( expected_graph = expected_ds.graph(graph_id) actual_graph = actual_ds.graph(graph_id) + if graph_id.endswith("provenance"): + # Remove source information, as this will differ by execution environment + expected_graph.remove((None, DCTERMS.source, None)) + actual_graph.remove((None, DCTERMS.source, None)) + if not isomorphic(expected_graph, actual_graph): # Compute the difference to show what's missing/extra in_both, in_expected_only, in_actual_only = graph_diff(