Merge pull request #1 from carte-data/json-schema-extractor
Add JSON Schema extractor
Balint Haller authored Mar 10, 2021
2 parents 6ea9a32 + e3ee61c commit 60609ff
Showing 7 changed files with 485 additions and 19 deletions.
13 changes: 9 additions & 4 deletions .github/workflows/pull_request.yml
@@ -13,15 +13,20 @@ jobs:
     runs-on: ubuntu-18.04
     strategy:
       matrix:
-        python-version: ['3.6.x', '3.7.x']
+        python-version: ['3.7.x', '3.8.x']
     steps:
       - name: Checkout
         uses: actions/checkout@v1
       - name: Setup python ${{ matrix.python-version }}
         uses: actions/setup-python@v1
         with:
           python-version: ${{ matrix.python-version }}
-      - name: Install dependencies
-        run: pip3 install -r requirements.txt
+      - name: Run image
+        uses: abatilo/actions-poetry@v2.0.0
+        with:
+          poetry-version: '1.1.4'
+      - name: Install library
+        run: poetry install --no-interaction
       - name: Run python unit tests
-        run: make test
+        run: |
+          poetry run python -bb -m pytest tests
154 changes: 154 additions & 0 deletions carte_cli/extractor/json_schema_extractor.py
@@ -0,0 +1,154 @@
from typing import List, Union, Iterator, Any, Iterable, Dict
from carte_cli.model.carte_table_model import TableMetadata, ColumnMetadata, TableType
from carte_cli.utils.file_io import read_json
from databuilder.extractor.base_extractor import Extractor
from pyhocon import ConfigTree
import boto3
import json
import copy


class JSONSchemaExtractor(Extractor):

    SCHEMA_PATH_KEY = "schema_path"
    S3_PROTOCOL = "s3://"

    def __init__(
        self,
        connection_name: str,
        database: str,
        schema_path: str,
        pivot_column: str = None,
        object_expand: Iterable[str] = None,
    ):
        super().__init__()
        self.connection_name = connection_name
        self.database = database
        self.schema_path = schema_path
        self.s3 = boto3.resource("s3")
        self.pivot_column = pivot_column
        self.object_expand = object_expand
        self._extract_iter = iter(self._get_extract_iter())

    def init(self, conf: ConfigTree) -> None:
        self.conf = conf

    def extract(self) -> Any:
        try:
            return next(self._extract_iter)
        except StopIteration:
            return None

    def get_scope(self):
        return "carte.extractor.json_schema"

    def _get_extract_iter(self) -> Iterator[TableMetadata]:
        schema = self._get_schema()

        if "type" not in schema or schema["type"] != "object":
            raise ValueError("Schema type has to be 'object'")

        tables = self._process_schema(schema)
        for table in tables:
            yield table

    def _get_schema(self):
        if self.schema_path.startswith(self.S3_PROTOCOL):
            schema = self._read_file_from_s3(self.schema_path)
        else:
            schema = read_json(self.schema_path)

        return schema

    def _process_schema(
        self, schema: dict, column_prefix: str = ""
    ) -> Iterable[TableMetadata]:
        if self.pivot_column:
            if "oneOf" not in schema:
                raise ValueError(
                    "Pivot column provided, but no top-level 'oneOf' in schema"
                )
            # One table per oneOf branch, keyed by the branch's const value
            # for the pivot column
            schemas = {}
            for constraint in schema["oneOf"]:
                try:
                    subschema_name = str(
                        constraint["properties"][self.pivot_column]["const"]
                    )
                except KeyError:
                    raise ValueError("Pivot column inside oneOf should be a const")

                merged_schema = self._deep_merge_dicts(constraint, copy.deepcopy(schema))
                schemas[subschema_name] = merged_schema

        else:
            schemas = {self.normalise(schema.get("title", "schema")): schema}

        return [self._schema_to_table(name, schema) for name, schema in schemas.items()]

    def _deep_merge_dicts(self, source: dict, destination: dict):
        for key, value in source.items():
            if isinstance(value, dict):
                # get node or create one
                node = destination.setdefault(key, {})
                self._deep_merge_dicts(value, node)
            elif (
                isinstance(value, list)
                and key in destination
                and isinstance(destination[key], list)
            ):
                destination[key] += value
            else:
                destination[key] = value

        return destination

    def _schema_to_table(self, name: str, schema: dict) -> TableMetadata:
        required_columns = schema.get("required", [])
        columns = {}
        for key, val in schema.get("properties", {}).items():
            columns[key] = val

            # Optionally flatten nested object properties into dotted column names
            if self.object_expand and key in self.object_expand:
                for subkey, subval in val.get("properties", {}).items():
                    columns[f"{key}.{subkey}"] = subval

        mapped_columns = [
            self._process_column(column_name, column_def, required_columns)
            for column_name, column_def in columns.items()
        ]

        return TableMetadata(
            name=name,
            database=self.database,
            connection=self.connection_name,
            location=self.schema_path,
            columns=mapped_columns,
            table_type=TableType.TABLE,
        )

    def _process_column(
        self, column_name: str, column_def: dict, required_columns: List[str]
    ):
        is_required = column_name in required_columns
        column_type = column_def.get("type", "") + (
            " (required)" if is_required else ""
        )
        column_values = column_def.get("enum", None)
        return ColumnMetadata(
            name=column_name, column_type=column_type, values=column_values
        )

    def _read_file_from_s3(self, path):
        path_parts_without_protocol = path[len(self.S3_PROTOCOL) :].split("/")
        bucket = path_parts_without_protocol[0]
        key = "/".join(path_parts_without_protocol[1:])
        content_object = self.s3.Object(bucket, key)

        file_content = content_object.get()["Body"].read().decode("utf-8")
        json_content = json.loads(file_content)

        return json_content

    def normalise(self, value: str):
        return value.replace("-", "_").replace(" ", "_").lower()
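
The new extractor maps a JSON Schema document to one or more TableMetadata records: one table per schema by default, or one per oneOf branch when a pivot column is given. A minimal usage sketch, not part of the commit; the schema, names, and paths below are made up for illustration, and boto3 is only exercised for s3:// paths:

import json
import tempfile

from carte_cli.extractor.json_schema_extractor import JSONSchemaExtractor

# Hypothetical event schema: with pivot_column="event_type", each oneOf
# branch becomes its own table, named after that branch's const value
schema = {
    "type": "object",
    "title": "user-events",
    "properties": {
        "user_id": {"type": "string"},
        "event_type": {"type": "string"},
    },
    "oneOf": [
        {
            "properties": {
                "event_type": {"const": "signup"},
                "referrer": {"type": "string"},
            },
            "required": ["event_type"],
        },
        {
            "properties": {
                "event_type": {"const": "login"},
                "ip": {"type": "string"},
            }
        },
    ],
}

with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as f:
    json.dump(schema, f)
    schema_path = f.name

extractor = JSONSchemaExtractor(
    "events-connection", "analytics", schema_path, pivot_column="event_type"
)

table = extractor.extract()
while table is not None:
    # Expect tables "signup" and "login"; in "signup", event_type is
    # typed "string (required)"
    print(table.name, [(c.name, c.column_type) for c in table.columns])
    table = extractor.extract()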
31 changes: 22 additions & 9 deletions carte_cli/model/carte_table_model.py
@@ -1,5 +1,5 @@
 from enum import Enum
-from typing import Any, List
+from typing import Any, List, Union
 from databuilder.models.table_metadata import TableMetadata as DatabuilderTableMetadata
 from databuilder.models.table_metadata import (
     ColumnMetadata as DatabuilderColumnMetadata,
@@ -28,10 +28,17 @@ class TableType(Enum):


 class ColumnMetadata:
-    def __init__(self, name: str, column_type: str, description: str):
+    def __init__(
+        self,
+        name: str,
+        column_type: str,
+        description: str = None,
+        values: Union[None, List[Any]] = None,
+    ):
         self.name = name
         self.column_type = column_type
         self.description = description
+        self.values = values

     @classmethod
     def from_databuilder(cls, column: DatabuilderColumnMetadata):
@@ -43,28 +50,33 @@ def from_databuilder(cls, column: DatabuilderColumnMetadata):
                 else ""
             ),
             column_type=column.type,
+            values=None,
         )

     @classmethod
     def from_frontmatter(cls, meta_dict):
         return cls(
             name=meta_dict["name"],
             column_type=meta_dict.get("type"),
             description=meta_dict.get("description"),
+            values=meta_dict.get("values"),
         )

     def to_frontmatter(self):
-        return {
+        frontmatter = {
             "name": self.name,
             "type": self.column_type,
             "description": self.description,
         }
+
+        if self.values is not None:
+            frontmatter["values"] = self.values
+
+        return frontmatter

     def __repr__(self) -> str:
-        return "CarteTableMetadata({!r}, {!r}, {!r})".format(
-            self.name,
-            self.column_type,
-            self.description,
+        return "CarteTableMetadata({!r}, {!r}, {!r}, {!r})".format(
+            self.name, self.column_type, self.description, self.values
         )


@@ -74,10 +86,10 @@ def __init__(
         name: str,
         database: str,
         connection: str,
-        description: str,
         location: str,
         columns: List[ColumnMetadata],
         table_type: TableType,
+        description: str = None,
     ):
         self.name = name
         self.connection = connection
@@ -156,6 +168,7 @@ def merge_columns(self, existing, preserve_descriptions=True):
                     name=column_name,
                     column_type=column.column_type,
                     description=merged_description,
+                    values=column.values,
                 )
             )
         return merged_columns
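
A quick sketch of the new optional values field (hypothetical column data): to_frontmatter only emits a "values" key when one is set, so previously generated frontmatter files are unaffected.

from carte_cli.model.carte_table_model import ColumnMetadata

status = ColumnMetadata(name="status", column_type="string", values=["active", "deleted"])
user_id = ColumnMetadata(name="user_id", column_type="string")

status.to_frontmatter()   # includes "values": ["active", "deleted"]
user_id.to_frontmatter()  # no "values" key, since values is None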
30 changes: 27 additions & 3 deletions carte_cli/utils/config_parser.py
@@ -1,7 +1,10 @@
+from carte_cli.extractor.json_schema_extractor import JSONSchemaExtractor
 import io
 import importlib
+from pyhocon.exceptions import ConfigException
 from ruamel.yaml import YAML
 from carte_cli.extractor.glue_extractor import GlueExtractor
+from carte_cli.utils.file_io import read_yaml

 yaml = YAML()

@@ -35,16 +38,37 @@ def create_postgres_connection(conn_dict):
     }


+def create_json_schema_connection(conn_dict):
+    config = conn_dict.get(CONFIG_KEY, {})
+    try:
+        connection_name = conn_dict.get("name", "json_schema")
+        database = config["database"]
+        schema_path = config["schema_path"]
+    except KeyError:
+        raise ConfigException(
+            "The name, database, and schema_path values are required for JSON Schema connections"
+        )
+    return (
+        JSONSchemaExtractor(
+            connection_name,
+            database,
+            schema_path,
+            pivot_column=config.get("pivot_column"),
+            object_expand=config.get("object_expand"),
+        ),
+        {},
+    )
+
+
 CONNECTION_FACTORIES = {
     "glue": create_glue_connection,
     "postgresql": create_postgres_connection,
+    "json_schema": create_json_schema_connection,
 }


 def parse_config(filename):
-    data = _read_file(filename)
-
-    parsed_data = yaml.load(data)
+    parsed_data = read_yaml(filename)

     connections = parsed_data.get("connections", [])
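
For reference, a sketch of the connection dict the new factory expects; the values are hypothetical, and it assumes CONFIG_KEY (defined outside this diff) maps to a "config" key:

from carte_cli.utils.config_parser import create_json_schema_connection

conn_dict = {
    "name": "events",  # optional; falls back to "json_schema"
    "config": {        # assuming CONFIG_KEY == "config"
        "database": "analytics",
        "schema_path": "s3://example-bucket/schemas/events.json",
        "pivot_column": "event_type",   # optional
        "object_expand": ["metadata"],  # optional
    },
}

extractor, extra_conf = create_json_schema_connection(conn_dict)  # extra_conf == {}

In the YAML read by parse_config, the same shape would appear as an entry in the connections list, with whatever type key CONNECTION_FACTORIES dispatches on set to json_schema.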
4 changes: 2 additions & 2 deletions carte_cli/utils/frontmatter.py
@@ -32,8 +32,8 @@ def dump(filename, metadata, content):
     buf.write(FRONTMATTER_SEPARATOR)
     yaml.dump(metadata, buf)
     buf.write(FRONTMATTER_SEPARATOR)
-    if content.strip() is not None:
-        buf.write(content)
+    if content is not None:
+        buf.write(content.strip())

     with open(filename, "w") as f:
         print(buf.getvalue(), file=f)
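
This fixes an inverted check: str.strip() never returns None, so the old condition was always true for strings and raised AttributeError when content was None. A two-line illustration:

"".strip() is not None  # True, so the old guard never filtered anything
# None.strip()          # AttributeError: the case the old code crashed on

The new version tests content itself and strips it only when writing.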
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "carte-cli"
-version = "0.2.4"
+version = "0.2.5"
 description = "A static site generator for data catalogs"
 authors = ["Balint Haller <balint@hey.com>"]
 license = "Apache-2.0"