From 250c9530346c9ba6ea4324986f615674d8336a93 Mon Sep 17 00:00:00 2001 From: rmuil1 Date: Mon, 2 Feb 2026 19:25:37 +0000 Subject: [PATCH 1/9] use urn scheme for named graphs tests don't work fully yet --- README.md | 4 +- .../derived/expected_inferred_wanted.trig | 2 +- .../eg0-basic/derived/expected_merged.trig | 4 +- src/pythinfer/infer.py | 94 ++++++++++++++++--- src/pythinfer/merge.py | 59 ++++++++++-- 5 files changed, 140 insertions(+), 23 deletions(-) diff --git a/README.md b/README.md index f74ebfc..fa429c2 100644 --- a/README.md +++ b/README.md @@ -362,4 +362,6 @@ The `example_projects` folder contains contrived examples, but this has also bee 1. check and raise error or at least warning if default_union is set in underlying Dataset of DatasetView 1. document and/or fix serialisation: canon longTurtle is not great with the way it orders things, so we might need to call out to riot unfortunately. 1. consider changing the distinction from interal/external to data/vocabulary (where vocab includes taxonomies or ontologies) - basically the ABox/TBox distinction where CBox is part of TBox. -1. add support for ASK query +1. add better output support for ASK query +1. add option to remove project name from named graphs, for easier specification: + 1. e.g. `` which is easy to remember and specify on command-line. diff --git a/example_projects/eg0-basic/derived/expected_inferred_wanted.trig b/example_projects/eg0-basic/derived/expected_inferred_wanted.trig index ba12791..a804dc6 100644 --- a/example_projects/eg0-basic/derived/expected_inferred_wanted.trig +++ b/example_projects/eg0-basic/derived/expected_inferred_wanted.trig @@ -1,6 +1,6 @@ @prefix : . @prefix foaf: . - { + { :Alice foaf:knows :Bob . } diff --git a/example_projects/eg0-basic/derived/expected_merged.trig b/example_projects/eg0-basic/derived/expected_merged.trig index 3269bb8..00ca617 100644 --- a/example_projects/eg0-basic/derived/expected_merged.trig +++ b/example_projects/eg0-basic/derived/expected_merged.trig @@ -2,7 +2,7 @@ @prefix foaf: . @prefix owl: . - { + { :Bob a foaf:Person ; foaf:knows :Alice ; foaf:name "Bob Jones" . @@ -12,7 +12,7 @@ foaf:name "Alice Smith" . } - { + { foaf:knows a owl:SymmetricProperty . } diff --git a/src/pythinfer/infer.py b/src/pythinfer/infer.py index e51bb42..6db90dd 100755 --- a/src/pythinfer/infer.py +++ b/src/pythinfer/infer.py @@ -30,11 +30,22 @@ export_dataset, load_sparql_inference_queries, ) +from pythinfer.merge import PYTHINFER_NS from pythinfer.rdflibplus import DatasetView -IRI_EXTERNAL_INFERENCES: URIRef = URIRef("inferences_external") # type: ignore[bad-assignment] -IRI_OWL_INFERENCES: URIRef = URIRef("inferences_owl") # type: ignore[bad-assignment] -IRI_SPARQL_INFERENCES: URIRef = URIRef("inferences_sparql") # type: ignore[bad-assignment] + +def _create_inference_urn(project_name: str, inference_type: str) -> URIRef: + """Create a stable URN identifier for an inference graph. + + Args: + project_name: Name of the project + inference_type: Type of inference ('external', 'owl', or 'sparql') + + Returns: + URN for the inference graph, e.g.: + urn:pythinfer:eg0-basic:inferences:owl + """ + return PYTHINFER_NS[f"{project_name}:inferences:{inference_type}"] MAX_REASONING_ROUNDS = 5 SCRIPT_DIR = Path(__file__).parent @@ -259,7 +270,7 @@ def filter_triples( def _generate_external_inferences( - ds: Dataset, external_graph_ids: list[IdentifiedNode] + ds: Dataset, external_graph_ids: list[IdentifiedNode], project: Project ) -> Graph: """Generate inferences from external vocabularies only (step 2). @@ -269,6 +280,7 @@ def _generate_external_inferences( Args: ds: Dataset containing all graphs. external_graph_ids: List of graph identifiers that are external. + project: The project configuration. Returns: Graph containing external inferences. @@ -286,13 +298,35 @@ def _generate_external_inferences( info(" Temporary dataset created with %d triples in default graph", len(temp_ds)) # Create inferences graph in temp dataset (must share same store) - temp_inferences = temp_ds.graph(IRI_EXTERNAL_INFERENCES) + iri_external = _create_inference_urn(project.name, "external") + temp_inferences = temp_ds.graph(iri_external) apply_owlrl_inference(temp_ds, temp_inferences) - g_external_inferences = ds.graph(IRI_EXTERNAL_INFERENCES) + g_external_inferences = ds.graph(iri_external) for s, p, o in temp_inferences: g_external_inferences.add((s, p, o)) + + # Add provenance metadata for external inference graph + from rdflib import DCTERMS + g_external_inferences.add( + (iri_external, RDF.type, PYTHINFER_NS["InferenceGraph"]) + ) + g_external_inferences.add( + ( + iri_external, + PYTHINFER_NS["inferenceType"], + PYTHINFER_NS["ExternalReasoner"], + ) + ) + g_external_inferences.add( + ( + iri_external, + DCTERMS.description, + Literal("Inferences generated by OWL-RL over external vocabularies"), + ) + ) + info(" External inferences generated: %d triples", len(g_external_inferences)) return g_external_inferences @@ -406,7 +440,7 @@ def run_inference_backend( sparql_queries = load_sparql_inference_queries(project.paths_sparql_inference or []) # Step 2: Generate external inferences (once - this is the "noise floor") - g_external_inferences = _generate_external_inferences(ds, external_graph_ids) + g_external_inferences = _generate_external_inferences(ds, external_graph_ids, project) # Steps 3-5: Iterate full inferences + heuristics until convergence info( @@ -414,8 +448,43 @@ def run_inference_backend( MAX_REASONING_ROUNDS, ) - g_inferences_owl = ds.graph(IRI_OWL_INFERENCES) - g_inferences_sparql = ds.graph(IRI_SPARQL_INFERENCES) + iri_owl = _create_inference_urn(project.name, "owl") + iri_sparql = _create_inference_urn(project.name, "sparql") + g_inferences_owl = ds.graph(iri_owl) + g_inferences_sparql = ds.graph(iri_sparql) + + # Add provenance metadata for inference graphs + from rdflib import DCTERMS + g_inferences_owl.add((iri_owl, RDF.type, PYTHINFER_NS["InferenceGraph"])) + g_inferences_owl.add( + (iri_owl, PYTHINFER_NS["inferenceType"], PYTHINFER_NS["OWLRL"]) + ) + g_inferences_owl.add( + ( + iri_owl, + DCTERMS.description, + Literal("Inferences generated by OWL-RL reasoner"), + ) + ) + + g_inferences_sparql.add( + (iri_sparql, RDF.type, PYTHINFER_NS["InferenceGraph"]) + ) + g_inferences_sparql.add( + ( + iri_sparql, + PYTHINFER_NS["inferenceType"], + PYTHINFER_NS["SPARQL"], + ) + ) + g_inferences_sparql.add( + ( + iri_sparql, + DCTERMS.description, + Literal("Inferences generated by SPARQL CONSTRUCT queries"), + ) + ) + iteration = 0 previous_triple_count = len(ds) # Count triples in entire dataset @@ -491,9 +560,10 @@ def run_inference_backend( len(g_inferences_owl) + len(g_inferences_sparql), ) + iri_external = _create_inference_urn(project.name, "external") all_external_ids: list[IdentifiedNode] = [ *external_graph_ids, - IRI_EXTERNAL_INFERENCES, + iri_external, ] output_file = output or project.path_output / f"{INFERRED_WANTED_FILESTEM}.trig" @@ -501,8 +571,8 @@ def run_inference_backend( output_ds = DatasetView( ds, - [IRI_OWL_INFERENCES, IRI_SPARQL_INFERENCES] - + ([IRI_EXTERNAL_INFERENCES] if export_external_inferences else []), + [iri_owl, iri_sparql] + + ([iri_external] if export_external_inferences else []), ) export_dataset( diff --git a/src/pythinfer/merge.py b/src/pythinfer/merge.py index 6c8f7ed..9cae30f 100644 --- a/src/pythinfer/merge.py +++ b/src/pythinfer/merge.py @@ -3,7 +3,7 @@ import logging from pathlib import Path -from rdflib import Dataset, IdentifiedNode +from rdflib import DCTERMS, RDF, Dataset, IdentifiedNode, Namespace, URIRef from pythinfer.inout import MERGED_FILESTEM, Project, export_dataset from pythinfer.rdflibplus import DatasetView @@ -12,10 +12,34 @@ info = logger.info dbg = debug = logger.debug +# URN namespace for pythinfer graph identifiers +# Format: urn:pythinfer:{project-name}:file:{relative-path} +# or: urn:pythinfer:{project-name}:inferences:{type} +PYTHINFER_NS = Namespace("urn:pythinfer:") -# NB: in the below we are using the file *name* only as the named graph identifier. -# This assumes that input files have unique names even if in different directories, -# which is likely an invalid assumption... + +def _create_graph_urn(project: Project, file_path: Path) -> URIRef: + """Create a stable URN identifier for a source file's named graph. + + Uses project name and relative path to create a URN that is: + - Stable across re-parsing + - Portable within a project + - Explicitly non-dereferenceable + - Informative about the source + + Args: + project: The pythinfer project + file_path: Path to the source file + + Returns: + URN for the named graph, e.g.: + urn:pythinfer:eg0-basic:file:basic-model.ttl + """ + rel_path = file_path.relative_to(project.path_self.parent) + # Normalize to forward slashes and replace with colons for URN structure + # Use colons to maintain hierarchical structure in URN + path_str = str(rel_path).replace("\\", "/").replace("/", ":") + return PYTHINFER_NS[f"{project.name}:file:{path_str}"] def merge_graphs( @@ -43,24 +67,45 @@ def merge_graphs( """ ds = Dataset() + ds.bind("pythinfer", PYTHINFER_NS) + ds.bind("dcterms", DCTERMS) external_gids: list[IdentifiedNode] = [] # Load external vocabulary files (ephemeral - used for inference only) for src in project.paths_vocab_ext: - g = ds.graph(src.name) + graph_urn = _create_graph_urn(project, src) + g = ds.graph(graph_urn) g.parse(src, format="turtle") + + # Add provenance metadata to the graph + g.add((graph_urn, RDF.type, PYTHINFER_NS["SourceGraph"])) + g.add((graph_urn, DCTERMS.source, URIRef(src.resolve().as_uri()))) + g.add((graph_urn, PYTHINFER_NS["sourceType"], PYTHINFER_NS["ExternalVocabulary"])) + external_gids.append(g.identifier) # Load internal vocabulary files for src in project.paths_vocab_int: - g = ds.graph(src.name) + graph_urn = _create_graph_urn(project, src) + g = ds.graph(graph_urn) g.parse(src, format="turtle") + # Add provenance metadata + g.add((graph_urn, RDF.type, PYTHINFER_NS["SourceGraph"])) + g.add((graph_urn, DCTERMS.source, URIRef(src.resolve().as_uri()))) + g.add((graph_urn, PYTHINFER_NS["sourceType"], PYTHINFER_NS["InternalVocabulary"])) + # Load data files for src in project.paths_data: - g = ds.graph(src.name) + graph_urn = _create_graph_urn(project, src) + g = ds.graph(graph_urn) g.parse(src, format="turtle") + # Add provenance metadata + g.add((graph_urn, RDF.type, PYTHINFER_NS["SourceGraph"])) + g.add((graph_urn, DCTERMS.source, URIRef(src.resolve().as_uri()))) + g.add((graph_urn, PYTHINFER_NS["sourceType"], PYTHINFER_NS["DataGraph"])) + if output: if isinstance(output, bool): output_file = project.path_output / f"{MERGED_FILESTEM}.trig" From 8d72ab648a50c3ad520590826def1ddb781b925e Mon Sep 17 00:00:00 2001 From: rmuil1 Date: Tue, 3 Feb 2026 19:33:31 +0000 Subject: [PATCH 2/9] use folder of project file for project name if none provided --- src/pythinfer/inout.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pythinfer/inout.py b/src/pythinfer/inout.py index c8b63d2..fc41f50 100644 --- a/src/pythinfer/inout.py +++ b/src/pythinfer/inout.py @@ -209,7 +209,7 @@ def from_yaml(config_path: Path | str) -> "Project": # Add path_self to the config dict before validation cfg["path_self"] = _config_path if "name" not in cfg: - cfg["name"] = _config_path.stem + cfg["name"] = _config_path.parent.stem # Let Pydantic handle validation and field normalization # Pass config_dir through context for path resolution in validators From b3f6da0d71b00da480a60f5180103edc164a3391 Mon Sep 17 00:00:00 2001 From: rmuil1 Date: Tue, 3 Feb 2026 21:03:19 +0000 Subject: [PATCH 3/9] consolidate graph iri minting and update expected example files --- .../eg0-basic/derived/expected_merged.trig | 18 ----- .../eg0-basic/expected/expected-0-merged.trig | 30 +++++++ .../expected-2-inferred-wanted.trig} | 2 +- .../expected-0-merged.trig} | 13 ++- .../expected-2-inferred-wanted.trig} | 3 +- .../expected-2-inferred-wanted.trig} | 6 +- src/pythinfer/infer.py | 81 +++++-------------- src/pythinfer/inout.py | 59 +++++++++++++- src/pythinfer/merge.py | 56 +++---------- tests/e2e/test_e2e_from_cli.py | 32 ++++---- 10 files changed, 154 insertions(+), 146 deletions(-) delete mode 100644 example_projects/eg0-basic/derived/expected_merged.trig create mode 100644 example_projects/eg0-basic/expected/expected-0-merged.trig rename example_projects/eg0-basic/{derived/expected_inferred_wanted.trig => expected/expected-2-inferred-wanted.trig} (68%) rename example_projects/eg1-ancestors/{derived/expected_merged.trig => expected/expected-0-merged.trig} (79%) rename example_projects/eg1-ancestors/{derived/expected_inferred_wanted.trig => expected/expected-2-inferred-wanted.trig} (89%) rename example_projects/eg2-projects/{derived/expected_inferred_wanted.trig => expected/expected-2-inferred-wanted.trig} (98%) diff --git a/example_projects/eg0-basic/derived/expected_merged.trig b/example_projects/eg0-basic/derived/expected_merged.trig deleted file mode 100644 index 00ca617..0000000 --- a/example_projects/eg0-basic/derived/expected_merged.trig +++ /dev/null @@ -1,18 +0,0 @@ -@prefix : . -@prefix foaf: . -@prefix owl: . - - { - :Bob a foaf:Person ; - foaf:knows :Alice ; - foaf:name "Bob Jones" . - - :Alice a foaf:Person ; - foaf:age 30 ; - foaf:name "Alice Smith" . -} - - { - foaf:knows a owl:SymmetricProperty . -} - diff --git a/example_projects/eg0-basic/expected/expected-0-merged.trig b/example_projects/eg0-basic/expected/expected-0-merged.trig new file mode 100644 index 0000000..adf0572 --- /dev/null +++ b/example_projects/eg0-basic/expected/expected-0-merged.trig @@ -0,0 +1,30 @@ +@prefix : . +@prefix dcterms: . +@prefix foaf: . +@prefix owl: . +@prefix pythinfer: . + +@base . + + { + :Bob a foaf:Person ; + foaf:knows :Alice ; + foaf:name "Bob Jones" . + + :Alice a foaf:Person ; + foaf:age 30 ; + foaf:name "Alice Smith" . +} + + { + foaf:knows a owl:SymmetricProperty . +} + + { + a pythinfer:SourceGraph ; + dcterms:source ; + . + a pythinfer:SourceGraph ; + dcterms:source ; + . +} \ No newline at end of file diff --git a/example_projects/eg0-basic/derived/expected_inferred_wanted.trig b/example_projects/eg0-basic/expected/expected-2-inferred-wanted.trig similarity index 68% rename from example_projects/eg0-basic/derived/expected_inferred_wanted.trig rename to example_projects/eg0-basic/expected/expected-2-inferred-wanted.trig index a804dc6..d00c58e 100644 --- a/example_projects/eg0-basic/derived/expected_inferred_wanted.trig +++ b/example_projects/eg0-basic/expected/expected-2-inferred-wanted.trig @@ -1,6 +1,6 @@ @prefix : . @prefix foaf: . - { + { :Alice foaf:knows :Bob . } diff --git a/example_projects/eg1-ancestors/derived/expected_merged.trig b/example_projects/eg1-ancestors/expected/expected-0-merged.trig similarity index 79% rename from example_projects/eg1-ancestors/derived/expected_merged.trig rename to example_projects/eg1-ancestors/expected/expected-0-merged.trig index 1e8d3e6..a3774e8 100644 --- a/example_projects/eg1-ancestors/derived/expected_merged.trig +++ b/example_projects/eg1-ancestors/expected/expected-0-merged.trig @@ -4,8 +4,11 @@ @prefix rdf: . @prefix rdfs: . @prefix skos: . +@prefix pythinfer: . - { +@base . + + { ex:childOf rdfs:label "child of" ; owl:inverseOf ex:parentOf . @@ -33,7 +36,7 @@ rdfs:label "Person" . } - { + { ex:Alice a ex:Person ; ex:parentOf ex:Bob, ex:Carol . @@ -48,3 +51,9 @@ ex:Eve a ex:Person . } + + { + a pythinfer:SourceGraph . + a pythinfer:SourceGraph . + a pythinfer:SourceGraph . +} \ No newline at end of file diff --git a/example_projects/eg1-ancestors/derived/expected_inferred_wanted.trig b/example_projects/eg1-ancestors/expected/expected-2-inferred-wanted.trig similarity index 89% rename from example_projects/eg1-ancestors/derived/expected_inferred_wanted.trig rename to example_projects/eg1-ancestors/expected/expected-2-inferred-wanted.trig index a355100..c49f1be 100644 --- a/example_projects/eg1-ancestors/derived/expected_inferred_wanted.trig +++ b/example_projects/eg1-ancestors/expected/expected-2-inferred-wanted.trig @@ -1,6 +1,7 @@ @prefix ex: . - { +@base . + { ex:David ex:childOf ex:Bob ; ex:descendantOf ex:Alice, ex:Bob . diff --git a/example_projects/eg2-projects/derived/expected_inferred_wanted.trig b/example_projects/eg2-projects/expected/expected-2-inferred-wanted.trig similarity index 98% rename from example_projects/eg2-projects/derived/expected_inferred_wanted.trig rename to example_projects/eg2-projects/expected/expected-2-inferred-wanted.trig index 202a2a0..27d80cc 100644 --- a/example_projects/eg2-projects/derived/expected_inferred_wanted.trig +++ b/example_projects/eg2-projects/expected/expected-2-inferred-wanted.trig @@ -7,7 +7,9 @@ @prefix ptp: . @prefix rdfs: . - { +@base . + + { # This is the most important SPARQL-based inference: the relationship between projA and projB eg:relationship-projA-projB a ptp:ProjectRelationship ; ptp:hasParticipant eg:projA, @@ -47,7 +49,7 @@ } - { + { eg:projA a dcat:Catalog, prov:Activity ; ptp:hasDataSource _:b0, diff --git a/src/pythinfer/infer.py b/src/pythinfer/infer.py index 6db90dd..6254752 100755 --- a/src/pythinfer/infer.py +++ b/src/pythinfer/infer.py @@ -9,6 +9,7 @@ from owlrl import DeductiveClosure from owlrl.OWLRL import OWLRL_Semantics from rdflib import ( + DCTERMS, OWL, RDF, RDFS, @@ -30,23 +31,10 @@ export_dataset, load_sparql_inference_queries, ) -from pythinfer.merge import PYTHINFER_NS +from pythinfer.inout import PYTHINFER_NS from pythinfer.rdflibplus import DatasetView -def _create_inference_urn(project_name: str, inference_type: str) -> URIRef: - """Create a stable URN identifier for an inference graph. - - Args: - project_name: Name of the project - inference_type: Type of inference ('external', 'owl', or 'sparql') - - Returns: - URN for the inference graph, e.g.: - urn:pythinfer:eg0-basic:inferences:owl - """ - return PYTHINFER_NS[f"{project_name}:inferences:{inference_type}"] - MAX_REASONING_ROUNDS = 5 SCRIPT_DIR = Path(__file__).parent logger = logging.getLogger(__name__) @@ -298,8 +286,9 @@ def _generate_external_inferences( info(" Temporary dataset created with %d triples in default graph", len(temp_ds)) # Create inferences graph in temp dataset (must share same store) - iri_external = _create_inference_urn(project.name, "external") + iri_external = project.inference_gid("external") temp_inferences = temp_ds.graph(iri_external) + g_provenance = ds.graph(project.provenance_gid) apply_owlrl_inference(temp_ds, temp_inferences) @@ -308,25 +297,10 @@ def _generate_external_inferences( g_external_inferences.add((s, p, o)) # Add provenance metadata for external inference graph - from rdflib import DCTERMS - g_external_inferences.add( - (iri_external, RDF.type, PYTHINFER_NS["InferenceGraph"]) - ) - g_external_inferences.add( - ( - iri_external, - PYTHINFER_NS["inferenceType"], - PYTHINFER_NS["ExternalReasoner"], - ) - ) - g_external_inferences.add( - ( - iri_external, - DCTERMS.description, - Literal("Inferences generated by OWL-RL over external vocabularies"), - ) + g_provenance.add((iri_external, RDF.type, PYTHINFER_NS["InferenceGraph"])) + g_provenance.add( + (iri_external, PYTHINFER_NS["inferenceEngine"], Literal("owlrl")) ) - info(" External inferences generated: %d triples", len(g_external_inferences)) return g_external_inferences @@ -440,7 +414,9 @@ def run_inference_backend( sparql_queries = load_sparql_inference_queries(project.paths_sparql_inference or []) # Step 2: Generate external inferences (once - this is the "noise floor") - g_external_inferences = _generate_external_inferences(ds, external_graph_ids, project) + g_external_inferences = _generate_external_inferences( + ds, external_graph_ids, project + ) # Steps 3-5: Iterate full inferences + heuristics until convergence info( @@ -448,43 +424,28 @@ def run_inference_backend( MAX_REASONING_ROUNDS, ) - iri_owl = _create_inference_urn(project.name, "owl") - iri_sparql = _create_inference_urn(project.name, "sparql") + iri_owl = project.inference_gid("owl") + iri_sparql = project.inference_gid("sparql") g_inferences_owl = ds.graph(iri_owl) g_inferences_sparql = ds.graph(iri_sparql) + g_provenance = ds.graph(project.provenance_gid) # Add provenance metadata for inference graphs - from rdflib import DCTERMS - g_inferences_owl.add((iri_owl, RDF.type, PYTHINFER_NS["InferenceGraph"])) - g_inferences_owl.add( - (iri_owl, PYTHINFER_NS["inferenceType"], PYTHINFER_NS["OWLRL"]) - ) - g_inferences_owl.add( - ( - iri_owl, - DCTERMS.description, - Literal("Inferences generated by OWL-RL reasoner"), - ) + g_provenance.add((iri_owl, RDF.type, PYTHINFER_NS["InferenceGraph"])) + g_provenance.add( + (iri_owl, PYTHINFER_NS["inferenceEngine"], Literal(project.owl_backend)) ) - g_inferences_sparql.add( + g_provenance.add( (iri_sparql, RDF.type, PYTHINFER_NS["InferenceGraph"]) ) - g_inferences_sparql.add( + g_provenance.add( ( iri_sparql, - PYTHINFER_NS["inferenceType"], - PYTHINFER_NS["SPARQL"], + PYTHINFER_NS["inferenceEngine"], + Literal("SPARQL CONSTRUCT"), ) ) - g_inferences_sparql.add( - ( - iri_sparql, - DCTERMS.description, - Literal("Inferences generated by SPARQL CONSTRUCT queries"), - ) - ) - iteration = 0 previous_triple_count = len(ds) # Count triples in entire dataset @@ -560,7 +521,7 @@ def run_inference_backend( len(g_inferences_owl) + len(g_inferences_sparql), ) - iri_external = _create_inference_urn(project.name, "external") + iri_external = project.inference_gid("external") all_external_ids: list[IdentifiedNode] = [ *external_graph_ids, iri_external, diff --git a/src/pythinfer/inout.py b/src/pythinfer/inout.py index fc41f50..3ed158a 100644 --- a/src/pythinfer/inout.py +++ b/src/pythinfer/inout.py @@ -14,10 +14,17 @@ field_validator, model_validator, ) -from rdflib import Dataset, Graph +from rdflib import Dataset, Graph, Namespace, URIRef logger = logging.getLogger(__name__) +# Base namespace for pythinfer graph identifiers and potentially other IRIs +# Originally wanted to use a URN base (`urn:pythinfer:`) like so: +# Format: urn:pythinfer:{project-name}:file:{relative-path} +# or: urn:pythinfer:{project-name}:inferences:{type} +# However, parsing the TTL complained about no slash after colon etc. +PYTHINFER_NS = Namespace("http://pythinfer.local/") + PROJECT_FILE_NAME = "pythinfer.yaml" MAX_DISCOVERY_SEARCH_DEPTH = 10 @@ -117,7 +124,7 @@ class Project(BaseModel): @model_validator(mode="before") @classmethod - def normalize_field_names(cls, data: dict) -> dict: + def normalize_field_names(cls, data: dict[str, str]) -> dict[str, str]: """Normalize field names to accept multiple spellings.""" if not isinstance(data, dict): return data @@ -136,10 +143,9 @@ def normalize_field_names(cls, data: dict) -> dict: "sparql_inference": "paths_sparql_inference", "paths_sparql_inference": "paths_sparql_inference", "owl-backend": "owl_backend", - "owl_backend": "owl_backend", } - normalized = {} + normalized: dict[str, str] = {} for key, value in data.items(): # Use canonical name if it's an alias, otherwise keep original canonical_key = field_aliases.get(key, key) @@ -280,6 +286,51 @@ def paths_all(self) -> list[Path]: """List of all paths (input + SPARQL inference) - cache checking.""" return self.paths_all_input + (self.paths_sparql_inference or []) + @property + def namespace(self) -> Namespace: + """The IRI Namespace associated with this Project.""" + # TODO: normalise name to be appropriate for an IRI. + return Namespace(PYTHINFER_NS[self.name] + "/") + + @property + def provenance_gid(self) -> URIRef: + """The IRI to use for the provenance named graph for this Project.""" + return self.namespace["provenance"] + + def source_file_gid(self, file_path: Path) -> URIRef: + """Create a stable identifier for a source file's named graph. + + Uses project name and relative path to create an IRI that is: + - Stable across re-parsing + - Portable within a project + - Informative about the source + - (no longer, because URNs don't currently work) Explicitly non-dereferenceable + + Args: + file_path: Path to the source file + + Returns: + IRI for the named graph, e.g.: + http://pythinfer.local/eg0-basic/file/basic-model.ttl + + """ + rel_path = file_path.relative_to(self.path_self.parent) + # Note, to use a URN, we'd need to replace with colons for URN structure + # Use colons to maintain hierarchical structure in URN + return self.namespace[f"file/{rel_path}"] + + def inference_gid(self, inference_type: str) -> URIRef: + """Create a stable identifier for an inference graph. + + Args: + inference_type: Type of inference ('external', 'owl', or 'sparql') + + Returns: + IRI for the inference graph, e.g.: + http://pythinfer.local/eg0-basic/inferences/owl + + """ + return self.namespace[f"inferences/{inference_type}"] def discover_project(start_path: Path, _current_depth: int = 0) -> Path: """Discover a pythinfer project by searching for a config file. diff --git a/src/pythinfer/merge.py b/src/pythinfer/merge.py index 9cae30f..bf2ec21 100644 --- a/src/pythinfer/merge.py +++ b/src/pythinfer/merge.py @@ -3,45 +3,15 @@ import logging from pathlib import Path -from rdflib import DCTERMS, RDF, Dataset, IdentifiedNode, Namespace, URIRef +from rdflib import DCTERMS, RDF, Dataset, IdentifiedNode, URIRef -from pythinfer.inout import MERGED_FILESTEM, Project, export_dataset +from pythinfer.inout import MERGED_FILESTEM, PYTHINFER_NS, Project, export_dataset from pythinfer.rdflibplus import DatasetView logger = logging.getLogger(__name__) info = logger.info dbg = debug = logger.debug -# URN namespace for pythinfer graph identifiers -# Format: urn:pythinfer:{project-name}:file:{relative-path} -# or: urn:pythinfer:{project-name}:inferences:{type} -PYTHINFER_NS = Namespace("urn:pythinfer:") - - -def _create_graph_urn(project: Project, file_path: Path) -> URIRef: - """Create a stable URN identifier for a source file's named graph. - - Uses project name and relative path to create a URN that is: - - Stable across re-parsing - - Portable within a project - - Explicitly non-dereferenceable - - Informative about the source - - Args: - project: The pythinfer project - file_path: Path to the source file - - Returns: - URN for the named graph, e.g.: - urn:pythinfer:eg0-basic:file:basic-model.ttl - """ - rel_path = file_path.relative_to(project.path_self.parent) - # Normalize to forward slashes and replace with colons for URN structure - # Use colons to maintain hierarchical structure in URN - path_str = str(rel_path).replace("\\", "/").replace("/", ":") - return PYTHINFER_NS[f"{project.name}:file:{path_str}"] - - def merge_graphs( project: Project, *, @@ -70,41 +40,39 @@ def merge_graphs( ds.bind("pythinfer", PYTHINFER_NS) ds.bind("dcterms", DCTERMS) external_gids: list[IdentifiedNode] = [] + g_provenance = ds.graph(project.provenance_gid) # Load external vocabulary files (ephemeral - used for inference only) for src in project.paths_vocab_ext: - graph_urn = _create_graph_urn(project, src) + graph_urn = project.source_file_gid(src) g = ds.graph(graph_urn) g.parse(src, format="turtle") # Add provenance metadata to the graph - g.add((graph_urn, RDF.type, PYTHINFER_NS["SourceGraph"])) - g.add((graph_urn, DCTERMS.source, URIRef(src.resolve().as_uri()))) - g.add((graph_urn, PYTHINFER_NS["sourceType"], PYTHINFER_NS["ExternalVocabulary"])) + g_provenance.add((graph_urn, RDF.type, PYTHINFER_NS["SourceGraph"])) + g_provenance.add((graph_urn, DCTERMS.source, URIRef(src.resolve().as_uri()))) external_gids.append(g.identifier) # Load internal vocabulary files for src in project.paths_vocab_int: - graph_urn = _create_graph_urn(project, src) + graph_urn = project.source_file_gid(src) g = ds.graph(graph_urn) g.parse(src, format="turtle") # Add provenance metadata - g.add((graph_urn, RDF.type, PYTHINFER_NS["SourceGraph"])) - g.add((graph_urn, DCTERMS.source, URIRef(src.resolve().as_uri()))) - g.add((graph_urn, PYTHINFER_NS["sourceType"], PYTHINFER_NS["InternalVocabulary"])) + g_provenance.add((graph_urn, RDF.type, PYTHINFER_NS["SourceGraph"])) + g_provenance.add((graph_urn, DCTERMS.source, URIRef(src.resolve().as_uri()))) # Load data files for src in project.paths_data: - graph_urn = _create_graph_urn(project, src) + graph_urn = project.source_file_gid(src) g = ds.graph(graph_urn) g.parse(src, format="turtle") # Add provenance metadata - g.add((graph_urn, RDF.type, PYTHINFER_NS["SourceGraph"])) - g.add((graph_urn, DCTERMS.source, URIRef(src.resolve().as_uri()))) - g.add((graph_urn, PYTHINFER_NS["sourceType"], PYTHINFER_NS["DataGraph"])) + g_provenance.add((graph_urn, RDF.type, PYTHINFER_NS["SourceGraph"])) + g_provenance.add((graph_urn, DCTERMS.source, URIRef(src.resolve().as_uri()))) if output: if isinstance(output, bool): diff --git a/tests/e2e/test_e2e_from_cli.py b/tests/e2e/test_e2e_from_cli.py index 313e22a..1a50c0c 100644 --- a/tests/e2e/test_e2e_from_cli.py +++ b/tests/e2e/test_e2e_from_cli.py @@ -4,7 +4,7 @@ from pathlib import Path import pytest -from rdflib import Dataset +from rdflib import Dataset, DCTERMS from rdflib.compare import graph_diff, isomorphic from typer.testing import CliRunner @@ -39,15 +39,11 @@ def test_cli_command( if (command == "merge") else f"{INFERRED_WANTED_FILESTEM}.trig" ) - expected_file = ( - "expected_merged.trig" - if (command == "merge") - else "expected_inferred_wanted.trig" - ) + expected_file = "expected-" + actual_file # Path to expected and actual output files - expected_file_path = project_dir / "derived" / expected_file - actual_file_path = project_dir / "derived" / actual_file + expected_file_path = project_dir / "expected" / expected_file + actual_file_path = project_dir / "derived" / "test_cli_command" / actual_file # Ensure expected file exists assert expected_file_path.exists(), ( @@ -58,16 +54,19 @@ def test_cli_command( if actual_file_path.exists(): actual_file_path.unlink() + # Make sure intermediate output folder exists + actual_file_path.parent.mkdir(exist_ok=True) + # Run the command using CliRunner but with proper working directory # Save current working directory and change to project directory original_cwd = Path.cwd() + runner = CliRunner() + cmd_args = [command, "--output", str(actual_file_path)] + # Disable cache for infer command to ensure fresh runs + if command == "infer": + cmd_args.append("--no-cache") + os.chdir(project_dir) try: - os.chdir(project_dir) - runner = CliRunner() - cmd_args = [command, "--output", str(actual_file_path)] - # Disable cache for infer command to ensure fresh runs - if command == "infer": - cmd_args.append("--no-cache") result = runner.invoke(app, cmd_args) finally: os.chdir(original_cwd) @@ -103,6 +102,11 @@ def test_cli_command( expected_graph = expected_ds.graph(graph_id) actual_graph = actual_ds.graph(graph_id) + if graph_id.endswith("provenance"): + # Remove source information, as this will differ by execution environment + expected_graph.remove((None, DCTERMS.source, None)) + actual_graph.remove((None, DCTERMS.source, None)) + if not isomorphic(expected_graph, actual_graph): # Compute the difference to show what's missing/extra in_both, in_expected_only, in_actual_only = graph_diff( From cc79b051e3fa5397c995597ee08ca11b5e1a4490 Mon Sep 17 00:00:00 2001 From: rmuil1 Date: Tue, 3 Feb 2026 23:00:10 +0000 Subject: [PATCH 4/9] make parents of derived folder in test --- tests/e2e/test_e2e_from_cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/e2e/test_e2e_from_cli.py b/tests/e2e/test_e2e_from_cli.py index 1a50c0c..54ffa4d 100644 --- a/tests/e2e/test_e2e_from_cli.py +++ b/tests/e2e/test_e2e_from_cli.py @@ -55,7 +55,7 @@ def test_cli_command( actual_file_path.unlink() # Make sure intermediate output folder exists - actual_file_path.parent.mkdir(exist_ok=True) + actual_file_path.parent.mkdir(parents=True, exist_ok=True) # Run the command using CliRunner but with proper working directory # Save current working directory and change to project directory From 677d7cda3f58b072f31d5b32d20d25f403cf2cae Mon Sep 17 00:00:00 2001 From: rmuil1 Date: Tue, 3 Feb 2026 23:04:01 +0000 Subject: [PATCH 5/9] trying to fix missing file in CICD --- tests/e2e/test_e2e_from_cli.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/e2e/test_e2e_from_cli.py b/tests/e2e/test_e2e_from_cli.py index 54ffa4d..7a9e068 100644 --- a/tests/e2e/test_e2e_from_cli.py +++ b/tests/e2e/test_e2e_from_cli.py @@ -65,8 +65,8 @@ def test_cli_command( # Disable cache for infer command to ensure fresh runs if command == "infer": cmd_args.append("--no-cache") - os.chdir(project_dir) try: + os.chdir(project_dir) result = runner.invoke(app, cmd_args) finally: os.chdir(original_cwd) From fb43d1e6f7785df134d277705164a5db6386ccb4 Mon Sep 17 00:00:00 2001 From: Robert Muil Date: Fri, 6 Feb 2026 12:37:21 +0000 Subject: [PATCH 6/9] update github actions upgrade actions/checkout and actions/setup-python and astral-sh/setup-uv --- .github/workflows/test.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index ce44e96..5f358c8 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -16,15 +16,15 @@ jobs: python-version: ["3.10", "3.11", "3.12"] steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 + uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} - name: Install uv - uses: astral-sh/setup-uv@v2 + uses: astral-sh/setup-uv@v7 - name: Install dependencies run: uv sync From 1a7096a4d35b47801272c07de172a60fabf8cc60 Mon Sep 17 00:00:00 2001 From: Robert Muil Date: Fri, 6 Feb 2026 16:56:24 +0000 Subject: [PATCH 7/9] debugging action --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 5f358c8..43ba2d4 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -30,7 +30,7 @@ jobs: run: uv sync - name: Run tests with coverage - run: uv run pytest tests/ --cov=src/pythinfer --cov-report=xml --cov-report=term-missing + run: pwd && find ./ && uv run pytest tests/ --cov=src/pythinfer --cov-report=xml --cov-report=term-missing - name: Upload coverage to Codecov uses: codecov/codecov-action@v4 From b801ae231062a86f7e3bcef032711e7f8263a704 Mon Sep 17 00:00:00 2001 From: rmuil1 Date: Fri, 6 Feb 2026 20:22:08 +0000 Subject: [PATCH 8/9] attempted fix to brittle relative path handling --- src/pythinfer/inout.py | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/src/pythinfer/inout.py b/src/pythinfer/inout.py index 3ed158a..bf4d4b6 100644 --- a/src/pythinfer/inout.py +++ b/src/pythinfer/inout.py @@ -227,10 +227,11 @@ def _path_to_yaml_str(self, path: Path) -> str: If the path is relative to the project file's directory, store it relative for better portability. Otherwise, store as absolute path. """ - project_dir = self.path_self.parent + resolved_path = path.resolve() + resolved_project_dir = self.path_self.resolve().parent try: # Try to make it relative to the project directory - rel_path = path.relative_to(project_dir) + rel_path = resolved_path.relative_to(resolved_project_dir) return str(rel_path) except ValueError: # Path is not relative to project_dir, store as-is @@ -314,7 +315,23 @@ def source_file_gid(self, file_path: Path) -> URIRef: http://pythinfer.local/eg0-basic/file/basic-model.ttl """ - rel_path = file_path.relative_to(self.path_self.parent) + # Resolve both paths to their canonical form to handle symlinks and + # relative path differences that can occur across different environments + resolved_file_path = file_path.resolve() + resolved_project_parent = self.path_self.resolve().parent + + try: + rel_path = resolved_file_path.relative_to(resolved_project_parent) + except ValueError: + # File is outside project directory; try to use the original path as-is + # to preserve the structure shown in the config file + try: + # Try with the unresolved path in case it has a meaningful structure + rel_path = file_path.relative_to(self.path_self.parent) + except ValueError: + # If that also fails, just use the file name + rel_path = resolved_file_path.name + # Note, to use a URN, we'd need to replace with colons for URN structure # Use colons to maintain hierarchical structure in URN return self.namespace[f"file/{rel_path}"] From 743f2cd347327030e6cbad609967ba6ede821c90 Mon Sep 17 00:00:00 2001 From: Robert Muil Date: Fri, 6 Feb 2026 20:27:16 +0000 Subject: [PATCH 9/9] Rewind diagnostics from test.yml --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 43ba2d4..5f358c8 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -30,7 +30,7 @@ jobs: run: uv sync - name: Run tests with coverage - run: pwd && find ./ && uv run pytest tests/ --cov=src/pythinfer --cov-report=xml --cov-report=term-missing + run: uv run pytest tests/ --cov=src/pythinfer --cov-report=xml --cov-report=term-missing - name: Upload coverage to Codecov uses: codecov/codecov-action@v4