diff --git a/carte_cli/loader/carte_loader.py b/carte_cli/loader/carte_loader.py index 94a0a3b..14b9145 100644 --- a/carte_cli/loader/carte_loader.py +++ b/carte_cli/loader/carte_loader.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 -import os.path +import os from pathlib import Path from typing import Union from databuilder.loader.base_loader import Loader @@ -15,6 +15,7 @@ TABLES_OUTPUT_PATH = "content/tables" JOBS_OUTPUT_PATH = "content/jobs" FRONTMATTER_SEPARATOR = "---" +MANIFESTS_FILE = "manifests" class CarteLoader(Loader): @@ -25,6 +26,14 @@ def init(self, conf: ConfigTree): "tables_output_path", TABLES_OUTPUT_PATH ) self.jobs_path = self.conf.get_string("jobs_output_path", JOBS_OUTPUT_PATH) + self.processed_files = [] + + default_manifests_path = os.path.join( + self.base_directory, self.tables_path, MANIFESTS_FILE + ) + self.manifests_path = self.conf.get_string( + "manifests_path", default_manifests_path + ) def load( self, record: Union[None, JobMetadata, DatabuilderTableMetadata, TableMetadata] @@ -64,12 +73,21 @@ def load_table(self, record: Union[DatabuilderTableMetadata, TableMetadata]): raise ValueError(f"{e}\nFile name: {full_file_name}") frontmatter.dump(full_file_name, *extractor_metadata.to_frontmatter()) + with open( + self.manifests_path, "a" + ) as manifests_file: + manifests_file.write(f"{record.get_file_name()}\n") def get_table_file_name(self, record: TableMetadata): - return os.path.join(self.base_directory, self.tables_path, f"{record.get_file_name()}.md") + return os.path.join( + self.base_directory, self.tables_path, f"{record.get_file_name()}.md" + ) def load_job(self, record: JobMetadata): pass def get_scope(self) -> str: return "loader.carte" + + def get_manifests_path(self) -> str: + return self.manifests_path diff --git a/carte_cli/main.py b/carte_cli/main.py index 1965700..01568f2 100644 --- a/carte_cli/main.py +++ b/carte_cli/main.py @@ -1,5 +1,6 @@ #!/usr/bin/env python3 +import os 
import typer import click_spinner from databuilder.extractor.csv_extractor import CsvExtractor @@ -9,7 +10,8 @@ from databuilder.transformer.base_transformer import NoopTransformer from pyhocon import ConfigFactory -from carte_cli.loader.carte_loader import CarteLoader +from carte_cli.loader.carte_loader import CarteLoader, MANIFESTS_FILE +from carte_cli.publisher.remove_deleted_publisher import RemoveDeletedPublisher from carte_cli.utils.config_parser import parse_config from carte_cli.scaffolding.frontend import create_frontend_dir from carte_cli.utils.flatten import flatten as execute_flatten @@ -28,20 +30,34 @@ def run_extraction( Optionally, you can set an --output directory. By default it uses the current working directory. """ + manifests_file = os.path.join(output_dir, MANIFESTS_FILE) + if os.path.isfile(manifests_file): + os.remove(manifests_file) + carte_loader = CarteLoader() + remove_deleted_publisher = RemoveDeletedPublisher() extractors, config = parse_config(config_path) job_config = ConfigFactory.from_dict( - {"loader.carte.tables_output_path": output_dir, **config} + { + "loader.carte.tables_output_path": output_dir, + "loader.carte.manifests_path": manifests_file, + "publisher.carte.tables_output_path": output_dir, + "publisher.carte.manifests_path": manifests_file, + **config, + } ) typer.echo("Running extraction...") with click_spinner.spinner(): - for extractor in extractors: + for index, extractor in enumerate(extractors): task = DefaultTask(extractor=extractor, loader=carte_loader) + job_args = dict(conf=job_config, task=task) + if index == len(extractors) - 1: # if last job, remove deleted tables + job_args["publisher"] = remove_deleted_publisher - DefaultJob(conf=job_config, task=task).launch() + DefaultJob(**job_args).launch() typer.echo("Done!") @@ -67,11 +83,12 @@ def flatten( output_dir: str = typer.Argument( ..., help="The destination directory for flattened markdown files" ), - template: str = typer.Option(None, "--template", "-t", 
help="The template to use for flattening datasets") + template: str = typer.Option( + None, "--template", "-t", help="The template to use for flattening datasets" + ), ): execute_flatten(input_dir, output_dir, template) - if __name__ == "__main__": app() diff --git a/carte_cli/model/carte_table_model.py b/carte_cli/model/carte_table_model.py index 0e4baee..15860d8 100644 --- a/carte_cli/model/carte_table_model.py +++ b/carte_cli/model/carte_table_model.py @@ -10,7 +10,8 @@ def get_description_text(description: DatabuilderDescription): - return description._text + if hasattr(description, 'text'): + return description.text class ColumnType(Enum): diff --git a/carte_cli/publisher/remove_deleted_publisher.py b/carte_cli/publisher/remove_deleted_publisher.py new file mode 100644 index 0000000..b8a65f5 --- /dev/null +++ b/carte_cli/publisher/remove_deleted_publisher.py @@ -0,0 +1,42 @@ +import os +import glob +from typing import Iterator, List, Set +from databuilder.publisher.base_publisher import Publisher +from pyhocon.config_tree import ConfigTree + + +class RemoveDeletedPublisher(Publisher): + def init(self, conf: ConfigTree) -> None: + self.conf = conf + self.manifests_path = self.conf.get_string("manifests_path") + self.tables_path = self.conf.get_string("tables_output_path") + + if self.tables_path is None: + raise ValueError("Output path is needed for publisher") + + def _get_datasets_to_delete( + self, datasets: Set[str], file_paths: List[str] + ) -> Iterator[str]: + file_ids = [ + path[(len(self.tables_path) + 1) : -(len(".md"))] for path in file_paths + ] + + for file_path, file_id in zip(file_paths, file_ids): + if file_id not in datasets: + yield file_path + + def publish_impl(self) -> None: + print("Publishing") + print(f"Tables path: {self.tables_path}") + with open(self.manifests_path) as f: + lines = f.readlines() + datasets = set([line.strip() for line in lines]) + + file_paths = glob.glob(self.tables_path + "/*/*/*.md", recursive=True) + + for 
file_path in self._get_datasets_to_delete(datasets, file_paths): + os.remove(file_path) + print(f"Removed {file_path}") + + def get_scope(self) -> str: + return "publisher.carte" diff --git a/pyproject.toml b/pyproject.toml index fc78ce3..f98028a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "carte-cli" -version = "0.3.5" +version = "0.3.6" description = "A static site generator for data catalogs" authors = ["Balint Haller "] license = "GPL-3.0-or-later" diff --git a/tests/loader/test_carte_loader.py b/tests/loader/test_carte_loader.py index 668e0f4..d1eacb2 100644 --- a/tests/loader/test_carte_loader.py +++ b/tests/loader/test_carte_loader.py @@ -43,9 +43,6 @@ def test_load_carte_metadata(mock_os, mock_frontmatter, patched_config): loader.close() mock_frontmatter.dump.assert_called_with("mock_path", *test_record.to_frontmatter()) - mock_os.path.join.assert_called_with( - ".", "tables", f"{test_record.get_file_name()}.md" - ) @patch("carte_cli.loader.carte_loader.frontmatter") diff --git a/tests/publisher/test_remove_deleted_publisher.py b/tests/publisher/test_remove_deleted_publisher.py new file mode 100644 index 0000000..9d5b615 --- /dev/null +++ b/tests/publisher/test_remove_deleted_publisher.py @@ -0,0 +1,36 @@ +from carte_cli.publisher.remove_deleted_publisher import RemoveDeletedPublisher + + +def test_get_datasets_to_delete(): + publisher = RemoveDeletedPublisher() + tables_path = "test-tables-path" + + publisher.tables_path = tables_path + + test_datasets = set( + [ + f"db1/dataset1", + f"db1/dataset3", + f"db1/dataset4", + f"db1/dataset5", + f"db2/dataset1", + ] + ) + + test_file_paths = [ + f"{tables_path}/db1/dataset1.md", + f"{tables_path}/db2/dataset1.md", + f"{tables_path}/db2/dataset2.md", + f"{tables_path}/db1/dataset3.md", + f"{tables_path}/db1/dataset4.md", + f"{tables_path}/db3/dataset1.md", + ] + + to_delete = [ + dataset + for dataset in publisher._get_datasets_to_delete(test_datasets, test_file_paths) + 
] + + assert len(to_delete) == 2 + assert "test-tables-path/db3/dataset1.md" in to_delete + assert "test-tables-path/db2/dataset2.md" in to_delete