Skip to content

Commit ae62a58

Browse files
author
Naman Jain
committed
add dockerfile; some other minor tweaks to entrypoint
1 parent 6c13ce2 commit ae62a58

File tree

10 files changed

+152
-12
lines changed

10 files changed

+152
-12
lines changed

.dockerignore

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
# Ignore Python cache files
2+
__pycache__/
3+
*.pyc
4+
*.pyo
5+
6+
# Ignore virtual environments
7+
venv/
8+
.env/
9+
10+
# Ignore data directories
11+
data/
12+
output/
13+
14+
# Ignore other unnecessary files
15+
*.log
16+
*.tmp
17+
*.bak

.github/workflows/ci.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
# .github/workflows/ci.yml
21
name: CI
32

43
on:

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,4 +17,4 @@ repos:
1717
name: flake8
1818
entry: poetry run flake8
1919
language: system
20-
types: [python]
20+
types: [python]

Dockerfile

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
FROM python:3.12-slim

WORKDIR /app

# Copy only the dependency manifests first so the (slow) dependency-install
# layer is cached and reused when only source code changes.
COPY pyproject.toml poetry.lock ./

RUN pip install --no-cache-dir poetry

# --only main replaces the removed --no-dev flag (dropped in Poetry 1.5+,
# and `pip install poetry` above installs the latest version).
# --no-root: the project source is not copied yet, so install dependencies only.
RUN poetry install --only main --no-root

COPY . .

ENV PYTHONPATH=/app

ENTRYPOINT ["poetry", "run", "python", "src/__main__.py"]

README.md

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,21 @@
1-
# Jua Data Engineer Assignment
1+
# Data Transformation pipeline: NetCDF to Parquet
22

33

44
### Task
5-
Create a pipeline for transforming 2022 total precipitation NetCDF files into Parquet format. <br/>
5+
This pipeline transforms total precipitation NetCDF files into Parquet format. <br/>
66
The source data is publicly available and is hosted at `gs://gcp-public-data-arco-era5/raw/date-variable-single_level`.
77

8-
### Requirements
9-
- The transformed data should support regular queries with filtering with timestamp.
10-
- The transformed data should support filtering by H3 geospatial index
8+
The transformed data supports:
9+
- regular queries with filtering by timestamp.
10+
- filtering by H3 geospatial index
1111

12-
<br/><br/>
13-
# Solution
14-
The solution has been implemented in python.
12+
## How to run
13+
### TODO: update instructions
14+
Build the Docker image
15+
`docker build -t data_transformation .`
16+
17+
Run the Docker container
18+
`docker run -v $(pwd)/output:/app/output data_transformation 01-01-2022 02-01-2022 out_dir`
1519

1620
### Features
1721
- Poetry for dependency management - Poetry makes life easier for managing dependencies and creating environments. To setup a virtual environment, simply run `poetry install` from the root directory. It will create a virtual environment for you. To spawn a shell, run `poetry shell`. Note that you need to have Poetry pre-installed in your system. To install Poetry, follow the steps listed [here](https://python-poetry.org/docs/#installation).
@@ -45,3 +49,9 @@ Note that some of the tests are live tests that actually download a file from GC
4549
- OOP?
4650
- Package it into a library
4751
- setup GitHub actions
52+
- add coverage report?
53+
- add doc strings
54+
- add exception handling and logs
55+
- add tqdm or some other progress bar
56+
57+
## Improvements

poetry.lock

Lines changed: 21 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ gcsfs = "^2024.6.1"
1515
scipy = "^1.14.1"
1616
netcdf4 = "^1.7.1.post2"
1717
h3 = "^3.7.7"
18+
tqdm = "^4.66.5"
1819

1920
[tool.poetry.group.dev.dependencies]
2021
black = "^24.8.0"

src/__main__.py

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
"""Entry point: transform daily total-precipitation NetCDF files into Parquet.

Usage: python src/__main__.py <start_date> <end_date> <out_dir>
Dates are DD-MM-YYYY; the range is inclusive on both ends.
"""

import argparse
import datetime
import logging
import pathlib

import gcsfs
import tqdm

from src import constants, process_data, utils

logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)


def netcdf_to_parquet(
    file_path: str, output_path: str, file_system: gcsfs.GCSFileSystem
) -> None:
    """Convert one NetCDF file to a Parquet file with an added H3 index."""
    dataframe = process_data.netcdf_to_dataframe(file_path, file_system)
    dataframe = process_data.add_h3_index(dataframe)
    process_data.save_dataframe_as_parquet(dataframe, output_path)


def _parse_cli_date(value: str) -> datetime.date:
    """Parse a DD-MM-YYYY command-line argument into a date."""
    return datetime.datetime.strptime(value, "%d-%m-%Y").date()


def _build_parser() -> argparse.ArgumentParser:
    """Build the CLI argument parser (kept separate for testability)."""
    parser = argparse.ArgumentParser(
        description="Process netCDF files and save as Parquet."
    )
    parser.add_argument(
        "start_date",
        type=_parse_cli_date,
        help="Start date in DD-MM-YYYY format",
    )
    parser.add_argument(
        "end_date",
        type=_parse_cli_date,
        help="End date in DD-MM-YYYY format",
    )
    parser.add_argument(
        "out_dir", type=str, help="Output directory for Parquet files"
    )
    return parser


def main() -> None:
    """Parse arguments and process every date in the inclusive range."""
    parser = _build_parser()
    args = parser.parse_args()
    if args.end_date < args.start_date:
        parser.error("end_date must not be before start_date")

    pathlib.Path(args.out_dir).mkdir(parents=True, exist_ok=True)

    # Created here (not at import time) so importing this module has no
    # network side effects.
    file_system = utils.initialize_gcsfs()

    current_date = args.start_date
    total_days = (args.end_date - current_date).days + 1  # inclusive range

    for _ in tqdm.tqdm(range(total_days), desc="Processing dates"):
        try:
            date_str = current_date.strftime("%Y/%m/%d")
            file_path = (
                f"{constants.GCS_BASE_URL}/{date_str}/total_precipitation/surface.nc"
            )
            output_path = (
                f"{args.out_dir}/precipitation_{current_date.strftime('%d_%m_%Y')}.parquet"
            )
            netcdf_to_parquet(file_path, output_path, file_system)
        except Exception:
            # Best-effort batch: one missing/corrupt day must not abort the
            # whole run. logging.exception keeps the full traceback.
            logging.exception(
                "Failed to process date %s", current_date.strftime("%d_%m_%Y")
            )
        finally:
            current_date += datetime.timedelta(days=1)


if __name__ == "__main__":
    main()

src/process_data.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,21 @@
1+
import logging
2+
13
import gcsfs
24
import h3
35
import pandas as pd
46
import xarray as xr
57

68
from . import utils
79

10+
logging.basicConfig(
11+
level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
12+
)
13+
814

915
def netcdf_to_dataframe(
    file_path: str, file_system: gcsfs.GCSFileSystem
) -> pd.DataFrame:
    """Stream a NetCDF file from GCS and flatten it into a DataFrame.

    The dataset's coordinate index is reset so that timestamp/lat/lon become
    ordinary columns of the returned frame.
    """
    logging.info("Streaming data from GCS bucket for %s", file_path)
    with utils.open_file(file_path, file_system, mode="rb") as handle:
        dataset = xr.open_dataset(handle)
        return dataset.to_dataframe().reset_index()
1421

src/utils.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,15 @@
1+
import logging
2+
13
import gcsfs
24

5+
logging.basicConfig(
6+
level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
7+
)
8+
39

410
def initialize_gcsfs():
    """Return a GCS filesystem client for anonymous (unauthenticated) access.

    token="anon" skips credential lookup; the source bucket is public, so no
    authentication is required.
    """
    logging.info("Initializing GCS file system")
    file_system = gcsfs.GCSFileSystem(token="anon")
    return file_system
613

714

815
def open_file(

0 commit comments

Comments
 (0)