11 changes: 11 additions & 0 deletions README.md
@@ -34,12 +34,23 @@ A distinction is made between 'external' and 'internal' files. See below.

This will create a `pythinfer.yaml` project file in the project folder, merge all RDF files it finds, perform inference, and then execute the SPARQL query against the inferred graph.

1. To use a specific project file, use the `--project` option before the command:

```bash
uvx ~/git/pythinfer --project pythinfer_celebrity.yaml query select_who_knows_whom.rq
```

1. Edit the `pythinfer.yaml` file to specify which files to include, then try again. Have fun.

![Demo of executing eg0 in CLI](demo-eg0.gif)

## Command Line Interface

### Global Options

- `--project` / `-p`: Specify the path to a project configuration file. If not provided, pythinfer will search for `pythinfer.yaml` in the current directory and parent directories, or create a new project if none is found.
- `--verbose` / `-v`: Enable verbose (DEBUG) logging output.
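
For example, global options go before the subcommand. A minimal sketch, reusing the example project's file names (adjust paths to your setup):

```bash
# run a query against a specific project file with DEBUG logging enabled
uv run pythinfer --project pythinfer_celebrity.yaml --verbose query select_who_knows_whom.rq
```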

### Common Options

- `--extra-export`: specify extra export formats beyond the default `trig`; can be given multiple times. Useful for 'flattening' quads to triples on export (by exporting to `ttl` or `nt` in addition to `trig`).
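
For instance, a sketch assuming the `infer` subcommand accepts this flag for its extra export formats:

```bash
# also export the inferred graph as Turtle and N-Triples, besides the default TriG
uv run pythinfer infer --extra-export ttl --extra-export nt
```
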
2 changes: 1 addition & 1 deletion example_projects/eg0-basic/README.md
@@ -75,7 +75,7 @@ To demonstrate custom inference rules with SPARQL, a CONSTRUCT query `eg_rule_to
(BTW: the CONSTRUCT query file is named `eg_rule...` instead of `infer...` to avoid being picked up by the automatic project creation in the test suite.)

```sh
uv run pythinfer query --no-cache --project pythinfer_celebrity.yaml select_who_knows_whom.rq
uv run pythinfer --project pythinfer_celebrity.yaml query --no-cache select_who_knows_whom.rq
```

NB: beware of [bug #33](https://github.com/robertmuil/pythinfer/issues/33): hence the `--no-cache` flag.
44 changes: 28 additions & 16 deletions src/pythinfer/cli.py
@@ -2,7 +2,7 @@

import logging
from collections.abc import Sequence
from datetime import UTC, datetime
from contextvars import ContextVar
from pathlib import Path
from typing import Annotated

@@ -19,6 +19,24 @@
)
from pythinfer.rdflibplus import DatasetView, graph_lengths

ProjectOption = Annotated[
Path | None,
typer.Option(
"--project",
"-p",
help="Path to project configuration file (pythinfer.yaml)",
),
]

VerboseOption = Annotated[
bool,
typer.Option(
"--verbose",
"-v",
help="Enable verbose (DEBUG) logging output",
),
]

ExtraExportFormatOption = Annotated[
list[str] | None,
typer.Option(
@@ -32,6 +50,9 @@
app = typer.Typer()
logger = logging.getLogger(__name__)

# Context variable to store the project path (thread-safe alternative to global)
_project_path_var: ContextVar[Path | None] = ContextVar("project_path", default=None)


def echo_success(msg: str) -> None: # noqa: D103 - self-explanatory function
typer.secho(msg, fg=typer.colors.GREEN)
@@ -73,14 +94,11 @@ def configure_logging(*, verbose: bool) -> None:
@app.callback()
def main_callback(
*,
verbose: bool = typer.Option(
False, # noqa: FBT003
"--verbose",
"-v",
help="Enable verbose (DEBUG) logging output",
),
project: ProjectOption = None,
verbose: VerboseOption = False,
) -> None:
"""Global options for pythinfer CLI."""
_project_path_var.set(project)
configure_logging(verbose=verbose)


@@ -108,7 +126,6 @@ def create(

@app.command()
def merge(
config: Path | None = None,
output: Path | None = None,
*,
export_external: bool = False,
@@ -117,14 +134,13 @@
"""Merge graphs as specified in the config file and save.

Args:
config: path to the project configuration file
output: path for data to be saved to (defaults to `derived/merged.trig`)
export_external: whether to include external graphs in output
extra_export_format: additional export format(s) (besides trig),
can be specified multiple times

"""
project = load_project(config)
project = load_project(_project_path_var.get())
ds, external_graph_ids = merge_graphs(
project,
output=output or True,
@@ -137,7 +153,6 @@

@app.command()
def infer(
config: Path | None = None,
backend: str = "owlrl",
output: Path | None = None,
*,
@@ -150,7 +165,6 @@
"""Run inference backends on merged graph.

Args:
config: path to Project defining the inputs
backend: OWL inference engine to use
output: output path for final inferences (None for project-based default)
include_unwanted_triples: include all valid inferences, even unhelpful
@@ -161,7 +175,7 @@
can be specified multiple times

"""
project = load_project(config)
project = load_project(_project_path_var.get())

# Force no_cache when extra export formats requested, otherwise exports won't happen
if extra_export_format and not no_cache:
@@ -212,7 +226,6 @@ def infer(
@app.command()
def query(
query: str,
project: Path | None = None,
graph: list[str] | None = None,
*,
no_cache: bool = False,
@@ -224,7 +237,6 @@

Args:
query: path to the query file to execute, or the query string itself
project: Path to project file (defaults to project selection process)
graph: IRI for graph to include (can be specified multiple times)
no_cache: whether to skip loading from cache and re-run inference

@@ -235,7 +247,7 @@
else:
query_contents = str(query)

ds, _ = infer(project, no_cache=no_cache)
ds, _ = infer(no_cache=no_cache)

view = ds
if graph:
8 changes: 6 additions & 2 deletions src/pythinfer/inout.py
@@ -263,8 +263,12 @@ def to_yaml_file(self, output_path: Path) -> None:

@property
def path_output(self) -> Path:
"""Path to the output folder."""
return self.path_self.parent / "derived"
"""Path to the output folder.

Includes the project file stem to avoid cache collisions when multiple
project files exist in the same directory.
"""
return self.path_self.parent / "derived" / self.path_self.stem

@property
def paths_all_input(self) -> list[Path]:
193 changes: 193 additions & 0 deletions tests/integration/test_cache_isolation.py
@@ -0,0 +1,193 @@
"""Integration tests for cache isolation with multiple project files."""

import os
import shutil
from pathlib import Path

import pytest

from pythinfer.infer import load_cache, run_inference_backend
from pythinfer.inout import COMBINED_FULL_FILESTEM, load_project
from pythinfer.merge import merge_graphs

PROJECT_ROOT = Path(__file__).parent.parent.parent


class TestCacheIsolation:
"""Test that different project files in the same directory have isolated caches."""

@pytest.fixture
def eg0_temp_dir(self, tmp_path: Path) -> Path:
"""Create temporary copy of eg0-basic to avoid modifying the repository."""
shutil.copytree(
PROJECT_ROOT / "example_projects" / "eg0-basic", tmp_path / "eg0-basic"
)

return tmp_path / "eg0-basic"

def test_separate_cache_directories_for_different_projects(
self, eg0_temp_dir: Path
) -> None:
"""Test that different project files create separate cache directories.

This verifies the fix for the bug where the --project argument would use
the wrong cache if a cache existed for the default pythinfer.yaml.
"""
# Verify example project exists and has both config files
default_config = eg0_temp_dir / "pythinfer.yaml"
celebrity_config = eg0_temp_dir / "pythinfer_celebrity.yaml"

assert default_config.exists(), "pythinfer.yaml not found"
assert celebrity_config.exists(), "pythinfer_celebrity.yaml not found"

# Load both projects
default_project = load_project(default_config)
celebrity_project = load_project(celebrity_config)

# Verify they have different output paths based on project file stem
default_output = default_project.path_output
celebrity_output = celebrity_project.path_output

assert default_output == eg0_temp_dir / "derived" / "pythinfer"
assert celebrity_output == eg0_temp_dir / "derived" / "pythinfer_celebrity"
assert default_output != celebrity_output

def test_different_inference_results_with_different_projects(
self, eg0_temp_dir: Path
) -> None:
"""Test that different project files produce different inference results.

The celebrity project includes an additional SPARQL inference rule
that the default project does not, so they should have different
inferred triples and cache files.
"""
original_cwd = Path.cwd()
try:
os.chdir(eg0_temp_dir)

# Default project inference
default_project = load_project(None) # Uses discovery
default_project.owl_backend = "owlrl"
default_ds, default_external_ids = merge_graphs(
default_project,
output=True,
export_external=False,
extra_export_formats=None,
)
run_inference_backend(
default_ds,
default_external_ids,
default_project,
None,
include_unwanted_triples=False,
export_full=True,
export_external_inferences=False,
extra_export_formats=None,
)
default_count = len(default_ds)

# Celebrity project inference
celebrity_project = load_project(Path("pythinfer_celebrity.yaml"))
celebrity_project.owl_backend = "owlrl"
celebrity_ds, celebrity_external_ids = merge_graphs(
celebrity_project,
output=True,
export_external=False,
extra_export_formats=None,
)
run_inference_backend(
celebrity_ds,
celebrity_external_ids,
celebrity_project,
None,
include_unwanted_triples=False,
export_full=True,
export_external_inferences=False,
extra_export_formats=None,
)
celebrity_count = len(celebrity_ds)

# Verify different numbers of triples (celebrity has more due
# to extra inference)
assert default_count > 0
assert celebrity_count > 0
assert celebrity_count > default_count, (
f"Celebrity project should have more inferences ({celebrity_count}) "
f"than default ({default_count})"
)

# Verify cache files exist in separate directories
default_cache = (
default_project.path_output / f"{COMBINED_FULL_FILESTEM}.trig"
)
celebrity_cache = (
celebrity_project.path_output / f"{COMBINED_FULL_FILESTEM}.trig"
)

assert default_cache.exists(), (
f"Default cache not found at {default_cache}"
)
assert celebrity_cache.exists(), (
f"Celebrity cache not found at {celebrity_cache}"
)

# Verify they're in different directories
assert default_cache.parent != celebrity_cache.parent

finally:
os.chdir(original_cwd)

def test_cache_not_mixed_between_projects(self, eg0_temp_dir: Path) -> None:
"""Test that loading project doesn't confuse caches between projects.

This is the specific bug scenario: if we run infer with default project,
then run infer with celebrity project, the celebrity project should not
load the default project's cache.
"""
original_cwd = Path.cwd()
try:
os.chdir(eg0_temp_dir)

# Step 1: Run inference for default project (creates cache)
default_project = load_project(None)
default_project.owl_backend = "owlrl"
default_ds, default_external_ids = merge_graphs(
default_project,
output=True,
export_external=False,
)
run_inference_backend(
default_ds,
default_external_ids,
default_project,
None,
include_unwanted_triples=False,
export_full=True,
export_external_inferences=False,
)

# Verify default cache was created
default_cache = load_cache(default_project)
assert default_cache is not None, (
"Default project cache should exist"
)
default_triple_count = len(default_cache)

# Step 2: Load celebrity project and verify it doesn't use
# default cache
celebrity_project = load_project(Path("pythinfer_celebrity.yaml"))
celebrity_cache = load_cache(celebrity_project)

# If cache was incorrectly shared, this assertion would fail
# because celebrity cache would have same triple count as default
if celebrity_cache is not None:
celebrity_triple_count = len(celebrity_cache)
# Celebrity has more triples due to additional inference
assert celebrity_triple_count > default_triple_count, (
f"Celebrity cache should have more triples "
f"({celebrity_triple_count}) than default "
f"({default_triple_count}), but got fewer. "
f"This suggests the wrong cache is being used."
)
finally:
os.chdir(original_cwd)