From 8d2156b7c1ead81385285d43b76208db80c28a24 Mon Sep 17 00:00:00 2001
From: Balint Haller
Date: Mon, 19 Apr 2021 09:27:39 +0200
Subject: [PATCH 1/8] DE-211: publisher for removing deleted datasets

---
 carte_cli/loader/carte_loader.py              | 22 ++++++++++++--
 carte_cli/main.py                             | 29 ++++++++++++++----
 .../publisher/remove_deleted_publisher.py     | 30 +++++++++++++++++++
 3 files changed, 73 insertions(+), 8 deletions(-)
 create mode 100644 carte_cli/publisher/remove_deleted_publisher.py

diff --git a/carte_cli/loader/carte_loader.py b/carte_cli/loader/carte_loader.py
index 94a0a3b..14b9145 100644
--- a/carte_cli/loader/carte_loader.py
+++ b/carte_cli/loader/carte_loader.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 
-import os.path
+import os
 from pathlib import Path
 from typing import Union
 from databuilder.loader.base_loader import Loader
@@ -15,6 +15,7 @@
 TABLES_OUTPUT_PATH = "content/tables"
 JOBS_OUTPUT_PATH = "content/jobs"
 FRONTMATTER_SEPARATOR = "---"
+MANIFESTS_FILE = "manifests"
 
 
 class CarteLoader(Loader):
@@ -25,6 +26,14 @@ def init(self, conf: ConfigTree):
             "tables_output_path", TABLES_OUTPUT_PATH
         )
         self.jobs_path = self.conf.get_string("jobs_output_path", JOBS_OUTPUT_PATH)
+        self.processed_files = []
+
+        default_manifests_path = os.path.join(
+            self.base_directory, self.tables_path, MANIFESTS_FILE
+        )
+        self.manifests_path = self.conf.get_string(
+            "manifests_path", default_manifests_path
+        )
 
     def load(
         self, record: Union[None, JobMetadata, DatabuilderTableMetadata, TableMetadata]
@@ -64,12 +73,21 @@ def load_table(self, record: Union[DatabuilderTableMetadata, TableMetadata]):
             raise ValueError(f"{e}\nFile name: {full_file_name}")
 
         frontmatter.dump(full_file_name, *extractor_metadata.to_frontmatter())
+        with open(
+            os.path.join(self.base_directory, self.tables_path, MANIFESTS_FILE), "a"
+        ) as manifests_file:
+            manifests_file.write(f"{record.get_file_name()}\n")
 
     def get_table_file_name(self, record: TableMetadata):
-        return os.path.join(self.base_directory, self.tables_path, f"{record.get_file_name()}.md")
+        return os.path.join(
+            self.base_directory, self.tables_path, f"{record.get_file_name()}.md"
+        )
 
     def load_job(self, record: JobMetadata):
         pass
 
     def get_scope(self) -> str:
         return "loader.carte"
+
+    def get_manifests_path(self) -> str:
+        return self.manifests_path
diff --git a/carte_cli/main.py b/carte_cli/main.py
index 1965700..01568f2 100644
--- a/carte_cli/main.py
+++ b/carte_cli/main.py
@@ -1,5 +1,6 @@
 #!/usr/bin/env python3
 
+import os
 import typer
 import click_spinner
 from databuilder.extractor.csv_extractor import CsvExtractor
@@ -9,7 +10,8 @@
 from databuilder.transformer.base_transformer import NoopTransformer
 from pyhocon import ConfigFactory
 
-from carte_cli.loader.carte_loader import CarteLoader
+from carte_cli.loader.carte_loader import CarteLoader, MANIFESTS_FILE
+from carte_cli.publisher.remove_deleted_publisher import RemoveDeletedPublisher
 from carte_cli.utils.config_parser import parse_config
 from carte_cli.scaffolding.frontend import create_frontend_dir
 from carte_cli.utils.flatten import flatten as execute_flatten
@@ -28,20 +30,34 @@ def run_extraction(
     Optionally, you can set an --output directory. By default it uses the
     current working directory.
     """
+    manifests_file = os.path.join(output_dir, MANIFESTS_FILE)
+    if os.path.isfile(manifests_file):
+        os.remove(manifests_file)
+
     carte_loader = CarteLoader()
+    remove_deleted_publisher = RemoveDeletedPublisher()
     extractors, config = parse_config(config_path)
 
     job_config = ConfigFactory.from_dict(
-        {"loader.carte.tables_output_path": output_dir, **config}
+        {
+            "loader.carte.tables_output_path": output_dir,
+            "loader.carte.manifests_path": manifests_file,
+            "publisher.carte.tables_output_path": output_dir,
+            "publisher.carte.manifests_path": manifests_file,
+            **config,
+        }
     )
 
     typer.echo("Running extraction...")
     with click_spinner.spinner():
-        for extractor in extractors:
+        for index, extractor in enumerate(extractors):
             task = DefaultTask(extractor=extractor, loader=carte_loader)
+            job_args = dict(conf=job_config, task=task)
+            if index == len(extractors) - 1:  # if last job, remove deleted tables
+                job_args["publisher"] = remove_deleted_publisher
 
-            DefaultJob(conf=job_config, task=task).launch()
+            DefaultJob(**job_args).launch()
 
     typer.echo("Done!")
 
 
@@ -67,11 +83,12 @@ def flatten(
     output_dir: str = typer.Argument(
         ..., help="The destination directory for flattened markdown files"
     ),
-    template: str = typer.Option(None, "--template", "-t", help="The template to use for flattening datasets")
+    template: str = typer.Option(
+        None, "--template", "-t", help="The template to use for flattening datasets"
+    ),
 ):
     execute_flatten(input_dir, output_dir, template)
 
 
-
 if __name__ == "__main__":
     app()
diff --git a/carte_cli/publisher/remove_deleted_publisher.py b/carte_cli/publisher/remove_deleted_publisher.py
new file mode 100644
index 0000000..584e58a
--- /dev/null
+++ b/carte_cli/publisher/remove_deleted_publisher.py
@@ -0,0 +1,30 @@
+import os
+import glob
+from databuilder.publisher.base_publisher import Publisher
+from pyhocon.config_tree import ConfigTree
+
+
+class RemoveDeletedPublisher(Publisher):
+    def init(self, conf: ConfigTree) -> None:
+        self.conf = conf
+        self.manifests_path = self.conf.get_string("manifests_path")
+        self.tables_path = self.conf.get_string("tables_output_path")
+
+    def publish_impl(self) -> None:
+        print("Publishing")
+        print(f"Tables path: {self.tables_path}")
+        with open(self.manifests_path) as f:
+            lines = f.readlines()
+        datasets = set([line.strip() for line in lines])
+
+        file_paths = glob.glob(self.tables_path + "/*/*/*.md", recursive=True)
+        file_ids = [path[(len(self.tables_path) + 1):-(len(".md"))] for path in file_paths]
+
+        for file_path, file_id in zip(file_paths, file_ids):
+            if file_id not in datasets:
+                os.remove(file_path)
+                print(f"Removed {file_path}")
+
+
+    def get_scope(self) -> str:
+        return "publisher.carte"

From 7335c6d8862ba3c85b3fb9e13238e0d4055725c1 Mon Sep 17 00:00:00 2001
From: Balint Haller
Date: Mon, 19 Apr 2021 14:31:51 +0200
Subject: [PATCH 2/8] DE-211: remove unneeded assertion

---
 tests/loader/test_carte_loader.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/tests/loader/test_carte_loader.py b/tests/loader/test_carte_loader.py
index 668e0f4..d1eacb2 100644
--- a/tests/loader/test_carte_loader.py
+++ b/tests/loader/test_carte_loader.py
@@ -43,9 +43,6 @@ def test_load_carte_metadata(mock_os, mock_frontmatter, patched_config):
     loader.close()
 
     mock_frontmatter.dump.assert_called_with("mock_path", *test_record.to_frontmatter())
-    mock_os.path.join.assert_called_with(
-        ".", "tables", f"{test_record.get_file_name()}.md"
-    )
 
 
 @patch("carte_cli.loader.carte_loader.frontmatter")

From 759306fbaec4d851a752ef22881eb0aed435e71d Mon Sep 17 00:00:00 2001
From: Balint Haller
Date: Mon, 19 Apr 2021 14:32:13 +0200
Subject: [PATCH 3/8] DE-211: bump patch version

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index fc78ce3..f98028a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "carte-cli"
-version = "0.3.5"
+version = "0.3.6"
 description = "A static site generator for data catalogs"
 authors = ["Balint Haller "]
 license = "GPL-3.0-or-later"

From bc3c4ae635aff7cdd1e44dde8edad18d17d147c3 Mon Sep 17 00:00:00 2001
From: Balint Haller
Date: Mon, 19 Apr 2021 14:41:27 +0200
Subject: [PATCH 4/8] DE-211: try only python 3.8

---
 .github/workflows/pull_request.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml
index 73b5f07..9e570e5 100644
--- a/.github/workflows/pull_request.yml
+++ b/.github/workflows/pull_request.yml
@@ -13,7 +13,7 @@ jobs:
     runs-on: ubuntu-18.04
     strategy:
       matrix:
-        python-version: ['3.7.x', '3.8.x']
+        python-version: ['3.8.x']
     steps:
       - name: Checkout
         uses: actions/checkout@v1

From f4b503d63cede6890f917179d7ae188e6eca59b7 Mon Sep 17 00:00:00 2001
From: Balint Haller
Date: Mon, 19 Apr 2021 14:54:00 +0200
Subject: [PATCH 5/8] DE-211: fix Databuilder change

---
 carte_cli/model/carte_table_model.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/carte_cli/model/carte_table_model.py b/carte_cli/model/carte_table_model.py
index 0e4baee..15860d8 100644
--- a/carte_cli/model/carte_table_model.py
+++ b/carte_cli/model/carte_table_model.py
@@ -10,7 +10,8 @@
 
 
 def get_description_text(description: DatabuilderDescription):
-    return description._text
+    if hasattr(description, 'text'):
+        return description.text
 
 
 class ColumnType(Enum):

From 7c45dcb7bca1c86c72a73d88213346b7db0cfcd4 Mon Sep 17 00:00:00 2001
From: Balint Haller
Date: Mon, 19 Apr 2021 14:56:51 +0200
Subject: [PATCH 6/8] DE-211: add back 3.7 for PR tests

---
 .github/workflows/pull_request.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml
index 9e570e5..73b5f07 100644
--- a/.github/workflows/pull_request.yml
+++ b/.github/workflows/pull_request.yml
@@ -13,7 +13,7 @@ jobs:
     runs-on: ubuntu-18.04
     strategy:
       matrix:
-        python-version: ['3.8.x']
+        python-version: ['3.7.x', '3.8.x']
     steps:
       - name: Checkout
         uses: actions/checkout@v1

From 6faf956ad51e21486d8b64ce88285bef9e727eb3 Mon Sep 17 00:00:00 2001
From: Balint Haller
Date: Mon, 19 Apr 2021 16:24:58 +0200
Subject: [PATCH 7/8] DE-211: extract file path logic for testing

---
 .../publisher/remove_deleted_publisher.py     | 24 ++++++++++++++-----
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/carte_cli/publisher/remove_deleted_publisher.py b/carte_cli/publisher/remove_deleted_publisher.py
index 584e58a..b8a65f5 100644
--- a/carte_cli/publisher/remove_deleted_publisher.py
+++ b/carte_cli/publisher/remove_deleted_publisher.py
@@ -1,5 +1,6 @@
 import os
 import glob
+from typing import Iterator, List, Set
 from databuilder.publisher.base_publisher import Publisher
 from pyhocon.config_tree import ConfigTree
 
@@ -10,6 +11,20 @@ def init(self, conf: ConfigTree) -> None:
         self.manifests_path = self.conf.get_string("manifests_path")
         self.tables_path = self.conf.get_string("tables_output_path")
 
+        if self.tables_path is None:
+            raise ValueError("Output path is needed for publisher")
+
+    def _get_datasets_to_delete(
+        self, datasets: Set[str], file_paths: List[str]
+    ) -> Iterator[str]:
+        file_ids = [
+            path[(len(self.tables_path) + 1) : -(len(".md"))] for path in file_paths
+        ]
+
+        for file_path, file_id in zip(file_paths, file_ids):
+            if file_id not in datasets:
+                yield file_path
+
     def publish_impl(self) -> None:
         print("Publishing")
         print(f"Tables path: {self.tables_path}")
@@ -18,13 +33,10 @@ def publish_impl(self) -> None:
         datasets = set([line.strip() for line in lines])
 
         file_paths = glob.glob(self.tables_path + "/*/*/*.md", recursive=True)
-        file_ids = [path[(len(self.tables_path) + 1):-(len(".md"))] for path in file_paths]
-
-        for file_path, file_id in zip(file_paths, file_ids):
-            if file_id not in datasets:
-                os.remove(file_path)
-                print(f"Removed {file_path}")
 
+        for file_path in self._get_datasets_to_delete(datasets, file_paths):
+            os.remove(file_path)
+            print(f"Removed {file_path}")
 
     def get_scope(self) -> str:
         return "publisher.carte"

From b8c211119ff037bf0a33b0f3527a320b3aeeabb4 Mon Sep 17 00:00:00 2001
From: Balint Haller
Date: Mon, 19 Apr 2021 16:25:20 +0200
Subject: [PATCH 8/8] DE-211: add publisher test

---
 .../test_remove_deleted_publisher.py          | 36 +++++++++++++++++++
 1 file changed, 36 insertions(+)
 create mode 100644 tests/publisher/test_remove_deleted_publisher.py

diff --git a/tests/publisher/test_remove_deleted_publisher.py b/tests/publisher/test_remove_deleted_publisher.py
new file mode 100644
index 0000000..9d5b615
--- /dev/null
+++ b/tests/publisher/test_remove_deleted_publisher.py
@@ -0,0 +1,36 @@
+from carte_cli.publisher.remove_deleted_publisher import RemoveDeletedPublisher
+
+
+def test_get_datasets_to_delete():
+    publisher = RemoveDeletedPublisher()
+    tables_path = "test-tables-path"
+
+    publisher.tables_path = tables_path
+
+    test_datasets = set(
+        [
+            f"db1/dataset1",
+            f"db1/dataset3",
+            f"db1/dataset4",
+            f"db1/dataset5",
+            f"db2/dataset1",
+        ]
+    )
+
+    test_file_paths = [
+        f"{tables_path}/db1/dataset1.md",
+        f"{tables_path}/db2/dataset1.md",
+        f"{tables_path}/db2/dataset2.md",
+        f"{tables_path}/db1/dataset3.md",
+        f"{tables_path}/db1/dataset4.md",
+        f"{tables_path}/db3/dataset1.md",
+    ]
+
+    to_delete = [
+        dataset
+        for dataset in publisher._get_datasets_to_delete(test_datasets, test_file_paths)
+    ]
+
+    assert len(to_delete) == 2
+    assert "test-tables-path/db3/dataset1.md" in to_delete
+    assert "test-tables-path/db2/dataset2.md" in to_delete
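
A rough usage sketch (not part of the patch series above): assuming the carte_cli package from this branch is installed, the new publisher can also be exercised on its own with a plain pyhocon config. The paths below are invented for illustration; when run through main.py, the same keys are supplied under the publisher.carte prefix, matching the publisher's get_scope().

    from pyhocon import ConfigFactory

    from carte_cli.publisher.remove_deleted_publisher import RemoveDeletedPublisher

    # The loader appends one "<database>/<table>" id per extracted dataset to the
    # manifests file; the publisher then deletes any tracked markdown file under
    # the tables output path whose id is no longer listed there.
    conf = ConfigFactory.from_dict(
        {
            "manifests_path": "content/tables/manifests",  # illustrative path
            "tables_output_path": "content/tables",  # illustrative path
        }
    )

    publisher = RemoveDeletedPublisher()
    publisher.init(conf)
    # Assumes a manifests file from a previous extraction run exists at the path above.
    publisher.publish_impl()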