Merge pull request #3 from carte-data/DE-211-delete-removed-datasets

DE-211: publisher for removing deleted datasets

Balint Haller authored Apr 19, 2021
2 parents 52cc04d + b8c2111 commit 07e6786
Showing 7 changed files with 124 additions and 13 deletions.
22 changes: 20 additions & 2 deletions carte_cli/loader/carte_loader.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
 
-import os.path
+import os
 from pathlib import Path
 from typing import Union
 from databuilder.loader.base_loader import Loader
@@ -15,6 +15,7 @@
 TABLES_OUTPUT_PATH = "content/tables"
 JOBS_OUTPUT_PATH = "content/jobs"
 FRONTMATTER_SEPARATOR = "---"
+MANIFESTS_FILE = "manifests"
 
 
 class CarteLoader(Loader):
@@ -25,6 +26,14 @@ def init(self, conf: ConfigTree):
             "tables_output_path", TABLES_OUTPUT_PATH
         )
         self.jobs_path = self.conf.get_string("jobs_output_path", JOBS_OUTPUT_PATH)
+        self.processed_files = []
+
+        default_manifests_path = os.path.join(
+            self.base_directory, self.tables_path, MANIFESTS_FILE
+        )
+        self.manifests_path = self.conf.get_string(
+            "manifests_path", default_manifests_path
+        )
 
     def load(
         self, record: Union[None, JobMetadata, DatabuilderTableMetadata, TableMetadata]
@@ -64,12 +73,21 @@ def load_table(self, record: Union[DatabuilderTableMetadata, TableMetadata]):
             raise ValueError(f"{e}\nFile name: {full_file_name}")
 
         frontmatter.dump(full_file_name, *extractor_metadata.to_frontmatter())
+        with open(
+            os.path.join(self.base_directory, self.tables_path, MANIFESTS_FILE), "a"
+        ) as manifests_file:
+            manifests_file.write(f"{record.get_file_name()}\n")
 
     def get_table_file_name(self, record: TableMetadata):
-        return os.path.join(self.base_directory, self.tables_path, f"{record.get_file_name()}.md")
+        return os.path.join(
+            self.base_directory, self.tables_path, f"{record.get_file_name()}.md"
+        )
 
     def load_job(self, record: JobMetadata):
         pass
 
     def get_scope(self) -> str:
         return "loader.carte"
+
+    def get_manifests_path(self) -> str:
+        return self.manifests_path
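The loader half of the change: every table written also gets its id appended to a shared manifest file. A minimal sketch of that bookkeeping outside databuilder, assuming a content/tables layout; append_manifest is an illustrative helper, not part of the commit:

import os

MANIFESTS_FILE = "manifests"

def append_manifest(base_directory: str, tables_path: str, file_name: str) -> None:
    # Mirrors load_table above: one dataset id per line, opened in append mode.
    manifests_path = os.path.join(base_directory, tables_path, MANIFESTS_FILE)
    with open(manifests_path, "a") as manifests_file:
        manifests_file.write(f"{file_name}\n")

os.makedirs("content/tables", exist_ok=True)
append_manifest(".", "content/tables", "db1/dataset1")
append_manifest(".", "content/tables", "db1/dataset2")
# The manifest now reads:
#   db1/dataset1
#   db1/dataset2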
29 changes: 23 additions & 6 deletions carte_cli/main.py
@@ -1,5 +1,6 @@
 #!/usr/bin/env python3
 
+import os
 import typer
 import click_spinner
 from databuilder.extractor.csv_extractor import CsvExtractor
@@ -9,7 +10,8 @@
 from databuilder.transformer.base_transformer import NoopTransformer
 from pyhocon import ConfigFactory
 
-from carte_cli.loader.carte_loader import CarteLoader
+from carte_cli.loader.carte_loader import CarteLoader, MANIFESTS_FILE
+from carte_cli.publisher.remove_deleted_publisher import RemoveDeletedPublisher
 from carte_cli.utils.config_parser import parse_config
 from carte_cli.scaffolding.frontend import create_frontend_dir
 from carte_cli.utils.flatten import flatten as execute_flatten
@@ -28,20 +30,34 @@ def run_extraction(
     Optionally, you can set an --output directory. By default it uses the current
     working directory.
     """
+    manifests_file = os.path.join(output_dir, MANIFESTS_FILE)
+    if os.path.isfile(manifests_file):
+        os.remove(manifests_file)
+
     carte_loader = CarteLoader()
+    remove_deleted_publisher = RemoveDeletedPublisher()
     extractors, config = parse_config(config_path)
 
     job_config = ConfigFactory.from_dict(
-        {"loader.carte.tables_output_path": output_dir, **config}
+        {
+            "loader.carte.tables_output_path": output_dir,
+            "loader.carte.manifests_path": manifests_file,
+            "publisher.carte.tables_output_path": output_dir,
+            "publisher.carte.manifests_path": manifests_file,
+            **config,
+        }
     )
 
     typer.echo("Running extraction...")
 
     with click_spinner.spinner():
-        for extractor in extractors:
+        for index, extractor in enumerate(extractors):
             task = DefaultTask(extractor=extractor, loader=carte_loader)
+            job_args = dict(conf=job_config, task=task)
+            if index == len(extractors) - 1:  # if last job, remove deleted tables
+                job_args["publisher"] = remove_deleted_publisher
 
-            DefaultJob(conf=job_config, task=task).launch()
+            DefaultJob(**job_args).launch()
 
     typer.echo("Done!")
 
@@ -67,11 +83,12 @@ def flatten(
     output_dir: str = typer.Argument(
         ..., help="The destination directory for flattened markdown files"
     ),
-    template: str = typer.Option(None, "--template", "-t", help="The template to use for flattening datasets")
+    template: str = typer.Option(
+        None, "--template", "-t", help="The template to use for flattening datasets"
+    ),
 ):
     execute_flatten(input_dir, output_dir, template)
 
 
-
 if __name__ == "__main__":
     app()
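The publisher is attached only to the last job because every extractor appends to the same manifest; publishing earlier would count tables from extractors that have not run yet as deleted. A stand-in sketch of the pattern, using plain dicts instead of databuilder's DefaultTask/DefaultJob:

from typing import Dict, List

def build_jobs(extractors: List[str], publisher: str) -> List[Dict[str, str]]:
    jobs = []
    for index, extractor in enumerate(extractors):
        job = {"extractor": extractor}
        if index == len(extractors) - 1:  # only the final job publishes
            job["publisher"] = publisher
        jobs.append(job)
    return jobs

print(build_jobs(["csv", "glue", "postgres"], "remove_deleted"))
# [{'extractor': 'csv'}, {'extractor': 'glue'},
#  {'extractor': 'postgres', 'publisher': 'remove_deleted'}]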
3 changes: 2 additions & 1 deletion carte_cli/model/carte_table_model.py
@@ -10,7 +10,8 @@
 
 
 def get_description_text(description: DatabuilderDescription):
-    return description._text
+    if hasattr(description, 'text'):
+        return description.text
 
 
 class ColumnType(Enum):
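With the hasattr guard, get_description_text returns None instead of raising AttributeError when a description object carries no text attribute. A behavioral sketch with a stand-in class (not databuilder's actual Description type):

class FakeDescription:
    def __init__(self, text=None):
        if text is not None:
            self.text = text

def get_description_text(description):
    if hasattr(description, "text"):
        return description.text
    # falls through to an implicit None when the attribute is missing

print(get_description_text(FakeDescription("orders fact table")))  # orders fact table
print(get_description_text(FakeDescription()))                     # None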
42 changes: 42 additions & 0 deletions carte_cli/publisher/remove_deleted_publisher.py
@@ -0,0 +1,42 @@
+import os
+import glob
+from typing import Iterator, List, Set
+from databuilder.publisher.base_publisher import Publisher
+from pyhocon.config_tree import ConfigTree
+
+
+class RemoveDeletedPublisher(Publisher):
+    def init(self, conf: ConfigTree) -> None:
+        self.conf = conf
+        self.manifests_path = self.conf.get_string("manifests_path")
+        self.tables_path = self.conf.get_string("tables_output_path")
+
+        if self.tables_path is None:
+            raise ValueError("Output path is needed for publisher")
+
+    def _get_datasets_to_delete(
+        self, datasets: Set[str], file_paths: List[str]
+    ) -> Iterator[str]:
+        file_ids = [
+            path[(len(self.tables_path) + 1) : -(len(".md"))] for path in file_paths
+        ]
+
+        for file_path, file_id in zip(file_paths, file_ids):
+            if file_id not in datasets:
+                yield file_path
+
+    def publish_impl(self) -> None:
+        print("Publishing")
+        print(f"Tables path: {self.tables_path}")
+        with open(self.manifests_path) as f:
+            lines = f.readlines()
+            datasets = set([line.strip() for line in lines])
+
+        file_paths = glob.glob(self.tables_path + "/*/*/*.md", recursive=True)
+
+        for file_path in self._get_datasets_to_delete(datasets, file_paths):
+            os.remove(file_path)
+            print(f"Removed {file_path}")
+
+    def get_scope(self) -> str:
+        return "publisher.carte"
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "carte-cli"
-version = "0.3.5"
+version = "0.3.6"
 description = "A static site generator for data catalogs"
 authors = ["Balint Haller <balint@hey.com>"]
 license = "GPL-3.0-or-later"
3 changes: 0 additions & 3 deletions tests/loader/test_carte_loader.py
@@ -43,9 +43,6 @@ def test_load_carte_metadata(mock_os, mock_frontmatter, patched_config):
     loader.close()
 
     mock_frontmatter.dump.assert_called_with("mock_path", *test_record.to_frontmatter())
-    mock_os.path.join.assert_called_with(
-        ".", "tables", f"{test_record.get_file_name()}.md"
-    )
 
 
 @patch("carte_cli.loader.carte_loader.frontmatter")
36 changes: 36 additions & 0 deletions tests/publisher/test_remove_deleted_publisher.py
@@ -0,0 +1,36 @@
+from carte_cli.publisher.remove_deleted_publisher import RemoveDeletedPublisher
+
+
+def test_get_datasets_to_delete():
+    publisher = RemoveDeletedPublisher()
+    tables_path = "test-tables-path"
+
+    publisher.tables_path = tables_path
+
+    test_datasets = set(
+        [
+            f"db1/dataset1",
+            f"db1/dataset3",
+            f"db1/dataset4",
+            f"db1/dataset5",
+            f"db2/dataset1",
+        ]
+    )
+
+    test_file_paths = [
+        f"{tables_path}/db1/dataset1.md",
+        f"{tables_path}/db2/dataset1.md",
+        f"{tables_path}/db2/dataset2.md",
+        f"{tables_path}/db1/dataset3.md",
+        f"{tables_path}/db1/dataset4.md",
+        f"{tables_path}/db3/dataset1.md",
+    ]
+
+    to_delete = [
+        dataset
+        for dataset in publisher._get_datasets_to_delete(test_datasets, test_file_paths)
+    ]
+
+    assert len(to_delete) == 2
+    assert "test-tables-path/db3/dataset1.md" in to_delete
+    assert "test-tables-path/db2/dataset2.md" in to_delete
