
Commit 64174d9

Add parquet importer (datacontract#494)
* Add parquet importer - WIP
* Add parquet importer - WIP
* Add pyarrow mapping
* Update README
* Update CHANGELOG
* Make ruff order imports like in ci action
* Move pyarrow to parquet extra
1 parent d6d7e57 commit 64174d9

File tree

8 files changed: +169 −5 lines changed


CHANGELOG.md

+1
@@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased]
 
 ### Added
+- Support for model import from parquet file metadata.
 
 ### Changed

README.md

+8 −3
@@ -194,13 +194,16 @@ A list of available extras:
 | Avro Support            | `pip install datacontract-cli[avro]`       |
 | Google BigQuery         | `pip install datacontract-cli[bigquery]`   |
 | Databricks Integration  | `pip install datacontract-cli[databricks]` |
+| Iceberg                 | `pip install datacontract-cli[iceberg]`    |
 | Kafka Integration       | `pip install datacontract-cli[kafka]`      |
 | PostgreSQL Integration  | `pip install datacontract-cli[postgres]`   |
 | S3 Integration          | `pip install datacontract-cli[s3]`         |
 | Snowflake Integration   | `pip install datacontract-cli[snowflake]`  |
 | Microsoft SQL Server    | `pip install datacontract-cli[sqlserver]`  |
 | Trino                   | `pip install datacontract-cli[trino]`      |
 | Dbt                     | `pip install datacontract-cli[dbt]`        |
 | Dbml                    | `pip install datacontract-cli[dbml]`       |
+| Parquet                 | `pip install datacontract-cli[parquet]`    |

@@ -930,8 +933,8 @@ models:
 Create a data contract from the given source location. Prints to stdout.
 
 ╭─ Options ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
-│ *  --format            [sql|avro|dbt|glue|jsonschema|bigquery|odcs        The format of the source file.                     │
-│                        |unity|spark|iceberg]                              [default: None]                                    │
+│ *  --format            [sql|avro|dbt|dbml|glue|jsonschema|bigquery        The format of the source file.                     │
+│                        |odcs|unity|spark|iceberg|parquet]                 [default: None]                                    │
 │                                                                           [required]                                         │
 │    --source            TEXT                                               The path to the file or Glue Database that         │
 │                                                                           should be imported.                                │
@@ -962,7 +965,8 @@ models:
 │                                                                           empty for all tables in the file).                 │
 │                                                                           [default: None]                                    │
 │    --iceberg-table     TEXT                                               Table name to assign to the model created          │
-│                                                                           from the Iceberg schema. [default: None]           │
+│                                                                           from the Iceberg schema.                           │
+│                                                                           [default: None]                                    │
 │    --help                                                                 Show this message and exit.                        │
 ╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
 ```
@@ -989,6 +993,7 @@ Available import options:
 | `dbml`             | Import from DBML models                       | ✅      |
 | `protobuf`         | Import from Protobuf schemas                  | TBD     |
 | `iceberg`          | Import from an Iceberg JSON Schema Definition | partial |
+| `parquet`          | Import from Parquet File Metadata             | ✅      |
 | Missing something? | Please create an issue on GitHub              | TBD     |
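As a quick illustration of the new option, a minimal programmatic import might look like the sketch below; the file name `orders.parquet` is made up for the example, and the CLI equivalent is `datacontract import --format parquet --source orders.parquet` (as exercised in the test added by this commit).

```python
# Minimal sketch using the Python API shown in the tests below.
# "orders.parquet" is an illustrative file name, not part of this commit.
from datacontract.data_contract import DataContract

data_contract = DataContract().import_from_source(format="parquet", source="orders.parquet")
print(data_contract.to_yaml())
```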

datacontract/imports/importer.py

+1
@@ -30,6 +30,7 @@ class ImportFormat(str, Enum):
     unity = "unity"
     spark = "spark"
     iceberg = "iceberg"
+    parquet = "parquet"
 
     @classmethod
     def get_supported_formats(cls):
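With the enum extended, the new format should surface wherever the supported formats are listed. A quick sanity-check sketch, assuming get_supported_formats exposes the enum values:

```python
from datacontract.imports.importer import ImportFormat

# ImportFormat is a str-based enum, so the member compares equal to its string value.
assert ImportFormat.parquet == "parquet"
# Assumes get_supported_formats() returns the format values (or members, which compare equal).
assert "parquet" in ImportFormat.get_supported_formats()
```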

datacontract/imports/importer_factory.py

+5
@@ -99,3 +99,8 @@ def load_module_class(module_path, class_name):
     module_path="datacontract.imports.iceberg_importer",
     class_name="IcebergImporter",
 )
+importer_factory.register_lazy_importer(
+    name=ImportFormat.parquet,
+    module_path="datacontract.imports.parquet_importer",
+    class_name="ParquetImporter",
+)
datacontract/imports/parquet_importer.py

+81
@@ -0,0 +1,81 @@
import os.path

import pyarrow
from pyarrow import parquet

from datacontract.imports.importer import Importer
from datacontract.model.data_contract_specification import (
    DataContractSpecification,
    Field,
    Model,
)
from datacontract.model.exceptions import DataContractException


class ParquetImporter(Importer):
    def import_source(
        self, data_contract_specification: DataContractSpecification, source: str, import_args: dict
    ) -> DataContractSpecification:
        return import_parquet(data_contract_specification, source)


def import_parquet(data_contract_specification: DataContractSpecification, source: str) -> DataContractSpecification:
    # use filename as schema name, remove .parquet suffix, avoid breaking the yaml output by replacing dots
    schema_name = os.path.basename(source).removesuffix(".parquet").replace(".", "_")

    fields: dict[str, Field] = {}

    arrow_schema = parquet.read_schema(source)
    for field_name in arrow_schema.names:
        parquet_field = arrow_schema.field(field_name)

        field = map_pyarrow_field_to_specification_field(parquet_field, "parquet")

        if not parquet_field.nullable:
            field.required = True

        fields[field_name] = field

    data_contract_specification.models[schema_name] = Model(fields=fields)

    return data_contract_specification


def map_pyarrow_field_to_specification_field(pyarrow_field: pyarrow.Field, file_format: str) -> Field:
    if pyarrow.types.is_boolean(pyarrow_field.type):
        return Field(type="boolean")
    if pyarrow.types.is_int32(pyarrow_field.type):
        return Field(type="int")
    if pyarrow.types.is_int64(pyarrow_field.type):
        return Field(type="long")
    if pyarrow.types.is_integer(pyarrow_field.type):
        return Field(type="number")
    if pyarrow.types.is_float32(pyarrow_field.type):
        return Field(type="float")
    if pyarrow.types.is_float64(pyarrow_field.type):
        return Field(type="double")
    if pyarrow.types.is_decimal(pyarrow_field.type):
        return Field(type="decimal", precision=pyarrow_field.type.precision, scale=pyarrow_field.type.scale)
    if pyarrow.types.is_timestamp(pyarrow_field.type):
        return Field(type="timestamp")
    if pyarrow.types.is_date(pyarrow_field.type):
        return Field(type="date")
    if pyarrow.types.is_null(pyarrow_field.type):
        return Field(type="null")
    if pyarrow.types.is_binary(pyarrow_field.type):
        return Field(type="bytes")
    if pyarrow.types.is_string(pyarrow_field.type):
        return Field(type="string")
    if pyarrow.types.is_map(pyarrow_field.type) or pyarrow.types.is_dictionary(pyarrow_field.type):
        return Field(type="map")
    if pyarrow.types.is_struct(pyarrow_field.type):
        return Field(type="struct")
    if pyarrow.types.is_list(pyarrow_field.type):
        return Field(type="array")

    raise DataContractException(
        type="schema",
        name=f"Parse {file_format} schema",
        reason=f"{pyarrow_field.type} currently not supported.",
        engine="datacontract",
    )
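To make the mapping above concrete, here is a small hand-built pyarrow schema run through map_pyarrow_field_to_specification_field; the field names are invented for the example, and a real run would read the schema from a parquet file via parquet.read_schema as the importer does.

```python
import pyarrow

from datacontract.imports.parquet_importer import map_pyarrow_field_to_specification_field

# Invented example schema; "id" is non-nullable, mirroring the required-field handling above.
schema = pyarrow.schema(
    [
        pyarrow.field("id", pyarrow.int64(), nullable=False),
        pyarrow.field("amount", pyarrow.decimal128(10, 2)),
        pyarrow.field("created_at", pyarrow.timestamp("ms")),
    ]
)

for name in schema.names:
    field = map_pyarrow_field_to_specification_field(schema.field(name), "parquet")
    print(name, field.type)  # id -> long, amount -> decimal, created_at -> timestamp
```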

pyproject.toml

+5 −2
@@ -94,8 +94,12 @@ dbml = [
     "pydbml>=1.1.1"
 ]
 
+parquet = [
+    "pyarrow>=12.0.0"
+]
+
 all = [
-    "datacontract-cli[kafka,bigquery,snowflake,postgres,databricks,sqlserver,s3,trino,dbt,dbml,iceberg]"
+    "datacontract-cli[kafka,bigquery,snowflake,postgres,databricks,sqlserver,s3,trino,dbt,dbml,iceberg,parquet]"
 ]
 
 dev = [
@@ -105,7 +109,6 @@ dev = [
     "moto==5.0.18",
     "pandas>=2.1.0",
     "pre-commit>=3.7.1,<3.9.0",
-    "pyarrow>=12.0.0",
     "pytest",
     "pytest-xdist",
     "pymssql==2.3.1",
tests/fixtures/parquet/data/combined_no_time.parquet

Binary file not shown.
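The fixture is a binary parquet file and cannot be rendered here. As a rough sketch, a file with this shape could be produced with pyarrow along the following lines; this script is not part of the commit and only covers a subset of the columns the test below expects.

```python
# Illustrative only: generate a parquet file with some of the column types the test expects.
import pyarrow
from pyarrow import parquet

table = pyarrow.table(
    {
        "string_field": pyarrow.array(["a"], type=pyarrow.string()),
        "blob_field": pyarrow.array([b"a"], type=pyarrow.binary()),
        "boolean_field": pyarrow.array([True], type=pyarrow.bool_()),
        "decimal_field": pyarrow.array([None], type=pyarrow.decimal128(10, 2)),
        "integer_field": pyarrow.array([1], type=pyarrow.int32()),
        "bigint_field": pyarrow.array([1], type=pyarrow.int64()),
        "date_field": pyarrow.array([None], type=pyarrow.date32()),
        "timestamp_field": pyarrow.array([None], type=pyarrow.timestamp("ms")),
    }
)
parquet.write_table(table, "combined_no_time.parquet")
```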

tests/test_import_parquet.py

+68
@@ -0,0 +1,68 @@
from typer.testing import CliRunner

from datacontract.cli import app
from datacontract.data_contract import DataContract

parquet_file_path = "fixtures/parquet/data/combined_no_time.parquet"


def test_cli():
    runner = CliRunner()
    result = runner.invoke(
        app,
        [
            "import",
            "--format",
            "parquet",
            "--source",
            parquet_file_path,
        ],
    )
    assert result.exit_code == 0


def test_import_parquet():
    result = DataContract().import_from_source(format="parquet", source=parquet_file_path)

    expected = """dataContractSpecification: 1.1.0
id: my-data-contract-id
info:
  title: My Data Contract
  version: 0.0.1
models:
  combined_no_time:
    fields:
      string_field:
        type: string
      blob_field:
        type: bytes
      boolean_field:
        type: boolean
      decimal_field:
        type: decimal
        precision: 10
        scale: 2
      float_field:
        type: float
      double_field:
        type: double
      integer_field:
        type: int
      bigint_field:
        type: long
      struct_field:
        type: struct
      array_field:
        type: array
      list_field:
        type: array
      map_field:
        type: map
      date_field:
        type: date
      timestamp_field:
        type: timestamp
"""

    assert result.to_yaml() == expected
    assert DataContract(data_contract_str=expected).lint(enabled_linters=set()).has_passed()
