From a0b2478a23fca99cba0189ab311d4540289e0f7f Mon Sep 17 00:00:00 2001 From: tejas Date: Fri, 13 Dec 2024 15:31:37 +0100 Subject: [PATCH 01/63] current state --- deep_code/__init__.py | 2 +- deep_code/api/__init__.py | 2 +- deep_code/api/new.py | 2 +- deep_code/api/publish_experiments.py | 2 +- deep_code/api/publish_products.py | 2 +- deep_code/api/setup_ci.py | 2 +- deep_code/constants.py | 1 + .../tests/utils/test_github_automation.py | 123 ++++++++++++ deep_code/tests/utils/test_osc_extension.py | 119 ++++++++++++ deep_code/utils/dataset_stac_generator.py | 139 ++++++++++++++ deep_code/utils/github_automation.py | 113 +++++++++++ deep_code/utils/osc_extension.py | 181 ++++++++++++++++++ deep_code/version.py | 2 +- environment.yml | 8 +- 14 files changed, 688 insertions(+), 10 deletions(-) create mode 100644 deep_code/constants.py create mode 100644 deep_code/tests/utils/test_github_automation.py create mode 100644 deep_code/tests/utils/test_osc_extension.py create mode 100644 deep_code/utils/dataset_stac_generator.py create mode 100644 deep_code/utils/github_automation.py create mode 100644 deep_code/utils/osc_extension.py diff --git a/deep_code/__init__.py b/deep_code/__init__.py index 451592c..ac5e73a 100644 --- a/deep_code/__init__.py +++ b/deep_code/__init__.py @@ -21,4 +21,4 @@ from .version import version -__version__ = version \ No newline at end of file +__version__ = version diff --git a/deep_code/api/__init__.py b/deep_code/api/__init__.py index 96d27c6..e4323e4 100644 --- a/deep_code/api/__init__.py +++ b/deep_code/api/__init__.py @@ -1,3 +1,3 @@ # Copyright (c) 2024 by xcube team and contributors # Permissions are hereby granted under the terms of the MIT License: -# https://opensource.org/licenses/MIT. \ No newline at end of file +# https://opensource.org/licenses/MIT. 
diff --git a/deep_code/api/new.py b/deep_code/api/new.py index b5a382d..c5c4099 100644 --- a/deep_code/api/new.py +++ b/deep_code/api/new.py @@ -1 +1 @@ -# Logic for initializing repositories \ No newline at end of file +# Logic for initializing repositories diff --git a/deep_code/api/publish_experiments.py b/deep_code/api/publish_experiments.py index 8586380..710bf53 100644 --- a/deep_code/api/publish_experiments.py +++ b/deep_code/api/publish_experiments.py @@ -1 +1 @@ -# Logic for publishing experiments on EarthCODE catalog \ No newline at end of file +# Logic for publishing experiments on EarthCODE catalog diff --git a/deep_code/api/publish_products.py b/deep_code/api/publish_products.py index a3afe4a..e9f436f 100644 --- a/deep_code/api/publish_products.py +++ b/deep_code/api/publish_products.py @@ -1 +1 @@ -# Logic for publishing products on EarthCODE catalog \ No newline at end of file +# Logic for publishing products on EarthCODE catalog diff --git a/deep_code/api/setup_ci.py b/deep_code/api/setup_ci.py index 41c5846..65b61b9 100644 --- a/deep_code/api/setup_ci.py +++ b/deep_code/api/setup_ci.py @@ -1 +1 @@ -# Logic for setting up build pipelines \ No newline at end of file +# Logic for setting up build pipelines diff --git a/deep_code/constants.py b/deep_code/constants.py new file mode 100644 index 0000000..4362520 --- /dev/null +++ b/deep_code/constants.py @@ -0,0 +1 @@ +OSC_SCHEMA_URI = "https://stac-extensions.github.io/osc/v1.0.0-rc.3/schema.json" diff --git a/deep_code/tests/utils/test_github_automation.py b/deep_code/tests/utils/test_github_automation.py new file mode 100644 index 0000000..b858b35 --- /dev/null +++ b/deep_code/tests/utils/test_github_automation.py @@ -0,0 +1,123 @@ +import unittest +from unittest.mock import patch, MagicMock +from pathlib import Path +import json +from deep_code.utils.github_automation import GitHubAutomation + + +class TestGitHubAutomation(unittest.TestCase): + def setUp(self): + self.github = GitHubAutomation( + 
username="test-user", + token="test-token", + repo_owner="test-owner", + repo_name="test-repo", + ) + + @patch("requests.post") + def test_fork_repository(self, mock_post): + """Test the fork_repository method.""" + mock_response = MagicMock() + mock_response.raise_for_status.return_value = None + mock_post.return_value = mock_response + + self.github.fork_repository() + + mock_post.assert_called_once_with( + "https://api.github.com/repos/test-owner/test-repo/forks", + headers={"Authorization": "token test-token"}, + ) + + @patch("subprocess.run") + @patch("os.chdir") + def test_clone_repository(self, mock_chdir, mock_run): + """Test the clone_repository method.""" + self.github.clone_repository() + + mock_run.assert_called_once_with( + ["git", "clone", self.github.fork_repo_url, self.github.local_clone_dir], + check=True, + ) + mock_chdir.assert_called_once_with(self.github.local_clone_dir) + + @patch("subprocess.run") + def test_create_branch(self, mock_run): + """Test the create_branch method.""" + branch_name = "test-branch" + self.github.create_branch(branch_name) + + mock_run.assert_called_once_with( + ["git", "checkout", "-b", branch_name], check=True + ) + + @patch("subprocess.run") + @patch("builtins.open", new_callable=unittest.mock.mock_open) + @patch("pathlib.Path.mkdir") + def test_add_file(self, mock_mkdir, mock_open, mock_run): + """Test the add_file method.""" + file_path = "test-dir/test-file.json" + content = {"key": "value"} + + self.github.add_file(file_path, content) + + mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) + mock_open.assert_called_once_with( + Path(self.github.local_clone_dir) / file_path, "w" + ) + mock_open().write.assert_called_once_with(json.dumps(content, indent=2)) + mock_run.assert_called_once_with( + ["git", "add", str(Path(self.github.local_clone_dir) / file_path)], + check=True, + ) + + @patch("subprocess.run") + def test_commit_and_push(self, mock_run): + """Test the commit_and_push method.""" + 
branch_name = "test-branch" + commit_message = "Test commit message" + + self.github.commit_and_push(branch_name, commit_message) + + mock_run.assert_any_call(["git", "commit", "-m", commit_message], check=True) + mock_run.assert_any_call( + ["git", "push", "-u", "origin", branch_name], check=True + ) + + @patch("requests.post") + def test_create_pull_request(self, mock_post): + """Test the create_pull_request method.""" + branch_name = "test-branch" + pr_title = "Test PR" + pr_body = "This is a test PR" + base_branch = "main" + + mock_response = MagicMock() + mock_response.json.return_value = {"html_url": "https://github.com/test-pr"} + mock_response.raise_for_status.return_value = None + mock_post.return_value = mock_response + + self.github.create_pull_request(branch_name, pr_title, pr_body, base_branch) + + mock_post.assert_called_once_with( + "https://api.github.com/repos/test-owner/test-repo/pulls", + headers={"Authorization": "token test-token"}, + json={ + "title": pr_title, + "head": f"test-user:{branch_name}", + "base": base_branch, + "body": pr_body, + }, + ) + + @patch("subprocess.run") + @patch("os.chdir") + def test_clean_up(self, mock_chdir, mock_run): + """Test the clean_up method.""" + self.github.clean_up() + + mock_chdir.assert_called_once_with("..") + mock_run.assert_called_once_with(["rm", "-rf", self.github.local_clone_dir]) + + +if __name__ == "__main__": + unittest.main() diff --git a/deep_code/tests/utils/test_osc_extension.py b/deep_code/tests/utils/test_osc_extension.py new file mode 100644 index 0000000..6670f5c --- /dev/null +++ b/deep_code/tests/utils/test_osc_extension.py @@ -0,0 +1,119 @@ +import unittest +from pystac import Collection, Extent, SpatialExtent, TemporalExtent +from deep_code.utils.osc_extension import OscExtension + + +class TestOscExtension(unittest.TestCase): + def setUp(self): + """Set up a test Collection object and attach the OscExtension.""" + self.collection = Collection( + id="test-collection", + 
description="Test collection for unit tests", + extent=Extent( + spatial=SpatialExtent([[-180, -90, 180, 90]]), + temporal=TemporalExtent( + [["2022-01-01T00:00:00Z", "2023-01-01T00:00:00Z"]] + ), + ), + stac_extensions=[], + ) + OscExtension.add_to(self.collection) + + def test_osc_status(self): + """Test the osc:status property.""" + extension = OscExtension.ext(self.collection) + extension.osc_status = "ongoing" + self.assertEqual(extension.osc_status, "ongoing") + + def test_osc_region(self): + """Test the osc:region property.""" + extension = OscExtension.ext(self.collection) + extension.osc_region = "Mediterranean region" + self.assertEqual(extension.osc_region, "Mediterranean region") + + def test_osc_themes(self): + """Test the osc:themes property.""" + extension = OscExtension.ext(self.collection) + extension.osc_themes = ["land", "ocean"] + self.assertEqual(extension.osc_themes, ["land", "ocean"]) + + def test_osc_missions(self): + """Test the osc:missions property.""" + extension = OscExtension.ext(self.collection) + extension.osc_missions = ["mission1", "mission2"] + self.assertEqual(extension.osc_missions, ["mission1", "mission2"]) + + def test_keywords(self): + """Test the keywords property.""" + extension = OscExtension.ext(self.collection) + extension.keywords = ["Hydrology", "Remote Sensing"] + self.assertEqual(extension.keywords, ["Hydrology", "Remote Sensing"]) + + def test_cf_parameters(self): + """Test the cf:parameter property.""" + extension = OscExtension.ext(self.collection) + extension.cf_parameters = [{"name": "hydrology-4D"}] + self.assertEqual(extension.cf_parameters, [{"name": "hydrology-4D"}]) + + def test_created_updated(self): + """Test the created and updated properties.""" + extension = OscExtension.ext(self.collection) + extension.created = "2023-12-21T11:50:17Z" + extension.updated = "2023-12-21T11:50:17Z" + self.assertEqual(extension.created, "2023-12-21T11:50:17Z") + self.assertEqual(extension.updated, "2023-12-21T11:50:17Z") 
+ + def test_set_extent(self): + """Test setting spatial and temporal extent.""" + extension = OscExtension.ext(self.collection) + spatial = [[-5.7, 28.3, 37.7, 48.1]] + temporal = [["2014-12-31T12:00:00Z", "2022-10-06T12:00:00Z"]] + extension.set_extent(spatial, temporal) + + self.assertEqual(self.collection.extent.spatial.bboxes, spatial) + self.assertEqual(self.collection.extent.temporal.intervals, temporal) + + def test_validation_success(self): + """Test validation with all required fields.""" + extension = OscExtension.ext(self.collection) + extension.osc_type = "product" + extension.osc_project = "test-project" + extension.osc_status = "ongoing" + extension.validate_extension() # Should not raise an exception + + def test_add_osc_extension(self): + osc_ext = OscExtension.add_to(self.collection) + self.assertIn(OscExtension.get_schema_uri(), self.collection.stac_extensions) + self.assertIsInstance(osc_ext, OscExtension) + + def test_has_extension(self): + self.collection.stac_extensions = [] + self.assertFalse(OscExtension.has_extension(self.collection)) + OscExtension.add_to(self.collection) + self.assertTrue(OscExtension.has_extension(self.collection)) + + def test_set_and_get_properties(self): + osc_ext = OscExtension.add_to(self.collection) + osc_ext.osc_type = "example-type" + osc_ext.osc_project = "example-project" + osc_ext.osc_product = "example-product" + osc_ext.osc_theme = ["example-theme"] + osc_ext.osc_variables = ["var1", "var2", "var3"] + + self.assertEqual(osc_ext.osc_type, "example-type") + self.assertEqual(osc_ext.osc_project, "example-project") + self.assertEqual(osc_ext.osc_product, "example-product") + self.assertEqual(osc_ext.osc_theme, ["example-theme"]) + self.assertListEqual(osc_ext.osc_variables, ["var1", "var2", "var3"]) + + def test_validation_missing_fields(self): + """Test validation with missing required fields.""" + extension = OscExtension.ext(self.collection) + with self.assertRaises(ValueError) as context: + 
extension.validate_extension() + self.assertIn("Missing required fields", str(context.exception)) + self.assertIn("osc:type", str(context.exception)) + + +if __name__ == "__main__": + unittest.main() diff --git a/deep_code/utils/dataset_stac_generator.py b/deep_code/utils/dataset_stac_generator.py new file mode 100644 index 0000000..977e44a --- /dev/null +++ b/deep_code/utils/dataset_stac_generator.py @@ -0,0 +1,139 @@ +import os + +import xarray as xr +from pystac import Collection, Extent, SpatialExtent, TemporalExtent +from pystac.extensions.base import PropertiesExtension +from datetime import datetime +from typing import List +from xcube.core.store import new_data_store +from deep_code.utils.osc_extension import OscExtension + + +class OSCProductSTACGenerator: + """ + A class to generate OSC STAC Collections for a product from Zarr datasets. + """ + + def __init__(self, dataset_id: str): + """ + Initialize the generator with the path to the Zarr dataset. + + :param dataset_path: Path to the Zarr dataset. 
+ """ + self.dataset_id = dataset_id + self.dataset = self._open_dataset() + + + def _open_dataset(self): + """Open the Zarr dataset as an xarray Dataset.""" + try: + store = new_data_store('s3', root=os.environ["S3_USER_STORAGE_BUCKET"], + storage_options=dict(anon=False, + key=os.environ["S3_USER_STORAGE_KEY"], + secret=os.environ[ + "S3_USER_STORAGE_SECRET"])) + return store.open_data(self.dataset_id) + except Exception as e: + raise ValueError(f"Failed to open Zarr dataset at " + f"{self.dataset_id}: {e}") + + def _get_spatial_extent(self) -> SpatialExtent: + """Extract spatial extent from the dataset.""" + if "longitude" in self.dataset.coords and "latitude" in self.dataset.coords: + lon_min, lon_max = ( + float(self.dataset.longitude.min()), + float(self.dataset.longitude.max()), + ) + lat_min, lat_max = ( + float(self.dataset.latitude.min()), + float(self.dataset.latitude.max()), + ) + return SpatialExtent([[lon_min, lat_min, lon_max, lat_max]]) + else: + raise ValueError( + "Dataset does not have 'longitude' and 'latitude' coordinates." + ) + + def _get_temporal_extent(self) -> TemporalExtent: + """Extract temporal extent from the dataset.""" + if "time" in self.dataset.coords: + time_min = str(self.dataset.time.min().values) + time_max = str(self.dataset.time.max().values) + return TemporalExtent([[time_min, time_max]]) + else: + raise ValueError("Dataset does not have a 'time' coordinate.") + + def _get_variables(self) -> List[str]: + """Extract the variable names from the dataset.""" + return list(self.dataset.data_vars.keys()) + + def _get_general_metadata(self) -> dict: + return {'description': self.dataset.attrs['description'], + 'title': self.dataset.attrs['title']} + + def build_stac_collection( + self, + collection_id: str, + osc_status: str = "ongoing", + osc_region: str = "Global", + osc_themes: List[str] = None, + ) -> Collection: + """ + Build an OSC STAC Collection for the product. + + :param collection_id: Unique ID for the collection. 
+ :param description: Description of the collection. + :param title: Title of the collection. + :param osc_project: Project name for OSC metadata. + :param osc_status: Status of the dataset (e.g., "ongoing"). + :param osc_region: Geographical region of the dataset. + :param osc_themes: Themes of the dataset (e.g., ["climate", "environment"]). + :return: A pystac.Collection object. + """ + # if osc_themes is None: + # osc_themes = [] + + # Extract metadata + spatial_extent = self._get_spatial_extent() + temporal_extent = self._get_temporal_extent() + variables = self._get_variables() + description, title = self._get_general_metadata() + + # Build base STAC Collection + collection = Collection( + id=collection_id, + description=description, + extent=Extent(spatial=spatial_extent, temporal=temporal_extent), + title=title, + ) + + # Add OSC extension metadata + osc_extension = OscExtension.add_to(collection) + osc_extension.osc_project = "deep-earth-system-data-lab" + osc_extension.osc_type = "product" + osc_extension.osc_status = osc_status + osc_extension.osc_region = osc_region + osc_extension.osc_themes = osc_themes + osc_extension.osc_variables = variables + osc_extension.osc_missions = [] + collection.extra_fields["created"] = datetime.utcnow().isoformat() + "Z" + collection.extra_fields["updated"] = datetime.utcnow().isoformat() + "Z" + + return collection + + +# Example Usage +if __name__ == "__main__": + zarr_dataset_path = "path/to/zarr/dataset" + collection_id = "example-collection-id" + description = "An example OSC collection built from a Zarr dataset." 
+ title = "Example Collection" + + try: + generator = OSCProductSTACGenerator(zarr_dataset_path) + collection = generator.build_stac_collection( + collection_id=collection_id + ) + print(collection.to_dict()) + except Exception as e: + print(f"Error: {e}") diff --git a/deep_code/utils/github_automation.py b/deep_code/utils/github_automation.py new file mode 100644 index 0000000..2b6aee9 --- /dev/null +++ b/deep_code/utils/github_automation.py @@ -0,0 +1,113 @@ +import os +import json +import subprocess +import requests +from pathlib import Path + + +class GitHubAutomation: + def __init__(self, username: str, token: str, repo_owner: str, repo_name: str): + """ + Initialize the GitHubAutomation class. + + :param username: Your GitHub username + :param token: Your GitHub personal access token + :param repo_owner: Owner of the repository to fork + :param repo_name: Name of the repository to fork + """ + self.username = username + self.token = token + self.repo_owner = repo_owner + self.repo_name = repo_name + self.base_repo_url = f"https://github.com/{repo_owner}/{repo_name}.git" + self.fork_repo_url = f"https://github.com/{username}/{repo_name}.git" + self.local_clone_dir = "./temp_repo" + + def fork_repository(self): + """Fork the repository to the user's GitHub account.""" + print("Forking repository...") + url = f"https://api.github.com/repos/{self.repo_owner}/{self.repo_name}/forks" + headers = {"Authorization": f"token {self.token}"} + response = requests.post(url, headers=headers) + response.raise_for_status() + print(f"Repository forked to {self.username}/{self.repo_name}") + + def clone_repository(self): + """Clone the forked repository locally.""" + print("Cloning forked repository...") + subprocess.run( + ["git", "clone", self.fork_repo_url, self.local_clone_dir], check=True + ) + os.chdir(self.local_clone_dir) + + def create_branch(self, branch_name: str): + """Create a new branch in the local repository.""" + print(f"Creating new branch: {branch_name}...") 
+ subprocess.run(["git", "checkout", "-b", branch_name], check=True) + + def add_file(self, file_path: str, content: dict): + """Add a new file to the local repository.""" + print(f"Adding new file: {file_path}...") + full_path = Path(self.local_clone_dir) / file_path + full_path.parent.mkdir(parents=True, exist_ok=True) + with open(full_path, "w") as f: + f.write(json.dumps(content, indent=2)) + subprocess.run(["git", "add", str(full_path)], check=True) + + def commit_and_push(self, branch_name: str, commit_message: str): + """Commit changes and push to the forked repository.""" + print("Committing and pushing changes...") + subprocess.run(["git", "commit", "-m", commit_message], check=True) + subprocess.run(["git", "push", "-u", "origin", branch_name], check=True) + + def create_pull_request( + self, branch_name: str, pr_title: str, pr_body: str, base_branch: str = "main" + ): + """Create a pull request from the forked repository to the base repository.""" + print("Creating a pull request...") + url = f"https://api.github.com/repos/{self.repo_owner}/{self.repo_name}/pulls" + headers = {"Authorization": f"token {self.token}"} + data = { + "title": pr_title, + "head": f"{self.username}:{branch_name}", + "base": base_branch, + "body": pr_body, + } + response = requests.post(url, headers=headers, json=data) + response.raise_for_status() + pr_url = response.json()["html_url"] + print(f"Pull request created: {pr_url}") + + def clean_up(self): + """Clean up the local cloned repository.""" + print("Cleaning up local repository...") + os.chdir("..") + subprocess.run(["rm", "-rf", self.local_clone_dir]) + + +# if __name__ == "__main__": +# # Configuration +# GITHUB_USERNAME = "your-username" # Replace with your GitHub username +# GITHUB_TOKEN = "your-personal-access-token" # Replace with your GitHub PAT +# REPO_OWNER = "ESA-EarthCODE" +# REPO_NAME = "open-science-catalog-metadata-testing" +# NEW_BRANCH_NAME = "add-new-collection" +# NEW_FILE_PATH = 
"products/new-collection-folder/collection.json" +# COLLECTION_CONTENT = { +# "id": "example-collection", +# "description": "An example collection", +# "items": [] +# } +# +# try: +# github_automation = GitHubAutomation(GITHUB_USERNAME, GITHUB_TOKEN, REPO_OWNER, REPO_NAME) +# +# # Automate the process +# github_automation.fork_repository() +# github_automation.clone_repository() +# github_automation.create_branch(NEW_BRANCH_NAME) +# github_automation.add_file(NEW_FILE_PATH, COLLECTION_CONTENT) +# github_automation.commit_and_push(NEW_BRANCH_NAME, "Add new collection") +# github_automation.create_pull_request(NEW_BRANCH_NAME, "Add new collection", "This PR adds a new collection folder to the products directory.") +# finally: +# github_automation.clean_up() diff --git a/deep_code/utils/osc_extension.py b/deep_code/utils/osc_extension.py new file mode 100644 index 0000000..1007cb8 --- /dev/null +++ b/deep_code/utils/osc_extension.py @@ -0,0 +1,181 @@ +from typing import Optional, Union, Literal, List +import pystac +from pystac import SpatialExtent, TemporalExtent, Extent +from pystac.extensions.base import PropertiesExtension, ExtensionManagementMixin +from deep_code.constants import OSC_SCHEMA_URI + + +class OscExtension( + PropertiesExtension, ExtensionManagementMixin[Union[pystac.Item, pystac.Collection]] +): + name: Literal["osc"] = "osc" + + def __init__(self, obj: Union[pystac.Item, pystac.Collection]): + if isinstance(obj, pystac.Collection): + self.properties = obj.extra_fields + else: + self.properties = obj.properties + self.obj = obj + + # Existing properties... 
+ @property + def osc_type(self) -> Optional[str]: + return self._get_property("osc:type", str) + + @osc_type.setter + def osc_type(self, v: str) -> None: + self._set_property("osc:type", v, pop_if_none=False) + + @property + def osc_name(self) -> Optional[str]: + return self._get_property("osc:name", str) + + @osc_name.setter + def osc_name(self, v: str) -> None: + self._set_property("osc:name", v, pop_if_none=False) + + @property + def osc_status(self) -> Optional[str]: + return self._get_property("osc:status", str) + + @osc_status.setter + def osc_status(self, value: str) -> None: + self._set_property("osc:status", value, pop_if_none=False) + + @property + def osc_project(self) -> Optional[str]: + return self._get_property("osc:project", str) + + @osc_project.setter + def osc_project(self, v: str) -> None: + self._set_property("osc:project", v, pop_if_none=False) + + @property + def osc_theme(self) -> Optional[List[str]]: + return self._get_property("osc:themes", list) + + @osc_theme.setter + def osc_theme(self, value: List[str]) -> None: + if not isinstance(value, list) or not all( + isinstance(item, str) for item in value + ): + raise ValueError("osc:themes must be a list of strings") + self._set_property("osc:themes", value, pop_if_none=False) + + @property + def osc_region(self) -> Optional[str]: + return self._get_property("osc:region", str) + + @osc_region.setter + def osc_region(self, value: str) -> None: + self._set_property("osc:region", value, pop_if_none=False) + + @property + def osc_missions(self) -> Optional[List[str]]: + return self._get_property("osc:missions", list) + + @osc_missions.setter + def osc_missions(self, value: List[str]) -> None: + if not isinstance(value, list) or not all( + isinstance(item, str) for item in value + ): + raise ValueError("osc:missions must be a list of strings") + self._set_property("osc:missions", value, pop_if_none=False) + + # Utility methods for handling temporal and spatial extent + def set_extent(self, 
spatial: List[List[float]], temporal: List[List[str]]) -> None: + self.obj.extent = Extent(SpatialExtent(spatial), TemporalExtent(temporal)) + + @property + def osc_variables(self) -> Optional[List[str]]: + return self._get_property("osc:variables", list) + + @osc_variables.setter + def osc_variables(self, v: List[str]) -> None: + if not isinstance(v, list) or not all(isinstance(item, str) for item in v): + raise ValueError("osc:variables must be a list of strings") + self._set_property("osc:variables", v, pop_if_none=False) + + # Keywords property + @property + def keywords(self) -> Optional[List[str]]: + return self._get_property("keywords", list) + + @keywords.setter + def keywords(self, value: List[str]) -> None: + if not isinstance(value, list) or not all( + isinstance(item, str) for item in value + ): + raise ValueError("keywords must be a list of strings") + self._set_property("keywords", value, pop_if_none=False) + + # CF Parameters + @property + def cf_parameters(self) -> Optional[List[dict]]: + return self._get_property("cf:parameter", list) + + @cf_parameters.setter + def cf_parameters(self, value: List[dict]) -> None: + if not isinstance(value, list) or not all( + isinstance(item, dict) for item in value + ): + raise ValueError("cf:parameter must be a list of dictionaries") + self._set_property("cf:parameter", value, pop_if_none=False) + + # Created and Updated timestamps + @property + def created(self) -> Optional[str]: + return self._get_property("created", str) + + @created.setter + def created(self, value: str) -> None: + self._set_property("created", value, pop_if_none=False) + + @property + def updated(self) -> Optional[str]: + return self._get_property("updated", str) + + @updated.setter + def updated(self, value: str) -> None: + self._set_property("updated", value, pop_if_none=False) + + @classmethod + def get_schema_uri(cls) -> str: + return OSC_SCHEMA_URI + + @classmethod + def ext( + cls, obj: Union[pystac.Item, pystac.Collection], 
add_if_missing: bool = False + ) -> "OscExtension": + """Returns the OscExtension instance for the given object, adding the extension if missing.""" + if cls.has_extension(obj): + return OscExtension(obj) + elif add_if_missing: + return cls.add_to(obj) + else: + raise ValueError( + "OSC extension is not present and add_if_missing is False." + ) + + @classmethod + def has_extension(cls, obj: Union[pystac.Item, pystac.Collection]) -> bool: + """Checks if the OSC extension is present in the object's extensions.""" + return cls.get_schema_uri() in obj.stac_extensions + + @classmethod + def add_to(cls, obj: Union[pystac.Item, pystac.Collection]) -> "OscExtension": + """Adds the OSC extension to the object's extensions.""" + if cls.get_schema_uri() not in obj.stac_extensions: + obj.stac_extensions.append(cls.get_schema_uri()) + return OscExtension(obj) + + def validate_extension(self) -> None: + """Validates that all required fields for the OSC extension are set.""" + required_fields = ["osc:type", "osc:project", "osc:status"] + missing_fields = [ + field + for field in required_fields + if self._get_property(field, None) is None + ] + if missing_fields: + raise ValueError(f"Missing required fields: {', '.join(missing_fields)}") diff --git a/deep_code/version.py b/deep_code/version.py index edabde0..711ec8b 100644 --- a/deep_code/version.py +++ b/deep_code/version.py @@ -19,4 +19,4 @@ # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER # DEALINGS IN THE SOFTWARE. 
-version = "0.1.0.dev0" \ No newline at end of file +version = "0.1.0.dev0" diff --git a/environment.yml b/environment.yml index 8e231c4..6166ef7 100644 --- a/environment.yml +++ b/environment.yml @@ -3,8 +3,10 @@ channels: - conda-forge dependencies: # Required - - pystac + - python >=3.9 +# - click - jsonschema - - click + - pystac - pyyaml - - requests \ No newline at end of file + - requests + - xcube \ No newline at end of file From 42367da3bf4522575cd3b14853d74d6262847fe8 Mon Sep 17 00:00:00 2001 From: tejas Date: Mon, 23 Dec 2024 11:44:48 +0100 Subject: [PATCH 02/63] implemented dataset stac generator class and unit tests --- .../utils/test_dataset_stac_generator.py | 106 ++++++++++ deep_code/utils/dataset_stac_generator.py | 186 ++++++++++++------ 2 files changed, 234 insertions(+), 58 deletions(-) create mode 100644 deep_code/tests/utils/test_dataset_stac_generator.py diff --git a/deep_code/tests/utils/test_dataset_stac_generator.py b/deep_code/tests/utils/test_dataset_stac_generator.py new file mode 100644 index 0000000..2d6aade --- /dev/null +++ b/deep_code/tests/utils/test_dataset_stac_generator.py @@ -0,0 +1,106 @@ +import unittest +from unittest.mock import patch, MagicMock +from datetime import datetime +from deep_code.utils.dataset_stac_generator import OSCProductSTACGenerator +from pystac import Collection +from xarray import Dataset +import numpy as np + + +class TestOSCProductSTACGenerator(unittest.TestCase): + + @patch("deep_code.utils.dataset_stac_generator.new_data_store") + def setUp(self, mock_data_store): + """Set up a mock dataset and generator.""" + self.mock_dataset = Dataset( + coords={ + "lon": ("lon", np.linspace(-180, 180, 10)), + "lat": ("lat", np.linspace(-90, 90, 5)), + "time": ( + "time", + [np.datetime64("2023-01-01T00:00:00Z", "ns"), np.datetime64("2023-01-02T00:00:00Z", "ns")] + ), + }, + attrs={ + "description": "Mock dataset for testing.", + "title": "Mock Dataset", + }, + data_vars={ + "var1": (("time", "lat", "lon"), 
np.random.rand(2, 5, 10)), + "var2": (("time", "lat", "lon"), np.random.rand(2, 5, 10)), + } + ) + mock_store = MagicMock() + mock_store.open_data.return_value = self.mock_dataset + mock_data_store.return_value = mock_store + + self.generator = OSCProductSTACGenerator("mock-dataset-id") + + def test_open_dataset(self): + """Test if the dataset is opened correctly.""" + self.assertIsInstance(self.generator.dataset, Dataset) + self.assertIn("lon", self.generator.dataset.coords) + self.assertIn("lat", self.generator.dataset.coords) + self.assertIn("time", self.generator.dataset.coords) + + def test_get_spatial_extent(self): + """Test spatial extent extraction.""" + extent = self.generator._get_spatial_extent() + self.assertEqual(extent.bboxes[0], [-180.0, -90.0, 180.0, 90.0]) + + def test_get_temporal_extent(self): + """Test temporal extent extraction.""" + extent = self.generator._get_temporal_extent() + expected_intervals = [ + datetime(2023, 1, 1, 0, 0), + datetime(2023, 1, 2, 0, 0), + ] + self.assertEqual(extent.intervals[0], expected_intervals) + + def test_get_variables(self): + """Test variable extraction.""" + variables = self.generator._get_variables() + self.assertEqual(variables, ["var1", "var2"]) + + def test_get_general_metadata(self): + """Test general metadata extraction.""" + metadata = self.generator._get_general_metadata() + self.assertEqual(metadata["description"], "Mock dataset for testing.") + self.assertEqual(metadata["title"], "Mock Dataset") + + @patch("pystac.Collection.add_link") + @patch("pystac.Collection.set_self_href") + def test_build_stac_collection(self, mock_set_self_href, mock_add_link): + """Test STAC collection creation.""" + collection = self.generator.build_stac_collection( + collection_id="mock-collection-id", + access_link="s3://mock-bucket/mock-dataset", + documentation_link="https://example.com/docs", + osc_status="ongoing", + osc_region="Global", + osc_themes=["climate", "environment"], + ) + 
self.assertIsInstance(collection, Collection) + self.assertEqual(collection.id, "mock-collection-id") + self.assertEqual(collection.description, "Mock dataset for testing.") + self.assertEqual(collection.title, "Mock Dataset") + self.assertEqual(collection.extent.spatial.bboxes[0], [-180.0, -90.0, 180.0, 90.0]) + self.assertEqual( + collection.extent.temporal.intervals[0], + [datetime(2023, 1, 1, 0, 0), datetime(2023, 1, 2, 0, 0)], + ) + mock_set_self_href.assert_called_once() + mock_add_link.assert_called() + + def test_invalid_spatial_extent(self): + """Test spatial extent extraction with missing coordinates.""" + self.generator.dataset = Dataset(coords={"x": [], "y": []}) + with self.assertRaises(ValueError): + self.generator._get_spatial_extent() + + def test_invalid_temporal_extent(self): + """Test temporal extent extraction with missing time.""" + self.generator.dataset = Dataset(coords={}) + with self.assertRaises(ValueError): + self.generator._get_temporal_extent() + diff --git a/deep_code/utils/dataset_stac_generator.py b/deep_code/utils/dataset_stac_generator.py index 977e44a..d0559bc 100644 --- a/deep_code/utils/dataset_stac_generator.py +++ b/deep_code/utils/dataset_stac_generator.py @@ -1,10 +1,8 @@ import os - -import xarray as xr -from pystac import Collection, Extent, SpatialExtent, TemporalExtent -from pystac.extensions.base import PropertiesExtension -from datetime import datetime -from typing import List +import pandas as pd +from pystac import Collection, Extent, Link, SpatialExtent, TemporalExtent +from datetime import datetime, timezone +from typing import List, Optional from xcube.core.store import new_data_store from deep_code.utils.osc_extension import OscExtension @@ -23,92 +21,159 @@ def __init__(self, dataset_id: str): self.dataset_id = dataset_id self.dataset = self._open_dataset() - def _open_dataset(self): - """Open the Zarr dataset as an xarray Dataset.""" + """Open the dataset using a s3 store as a xarray Dataset.""" try: - store = 
new_data_store('s3', root=os.environ["S3_USER_STORAGE_BUCKET"], - storage_options=dict(anon=False, - key=os.environ["S3_USER_STORAGE_KEY"], - secret=os.environ[ - "S3_USER_STORAGE_SECRET"])) + store = new_data_store('s3', + root="deep-esdl-public", + storage_options=dict(anon=True)) return store.open_data(self.dataset_id) except Exception as e: - raise ValueError(f"Failed to open Zarr dataset at " - f"{self.dataset_id}: {e}") + try: + store = new_data_store( + "s3", + root=os.environ["S3_USER_STORAGE_BUCKET"], + storage_options=dict( + anon=False, + key=os.environ.get("S3_USER_STORAGE_KEY"), + secret=os.environ.get("S3_USER_STORAGE_SECRET"), + ) + ) + return store.open_data(self.dataset_id) + except Exception as inner_e: + raise ValueError(f"Failed to open Zarr dataset with ID " + f"{self.dataset_id}: {inner_e}") from e + + except Exception as e: + raise ValueError( + f"Failed to open Zarr dataset with ID " f"{self.dataset_id}: {e}" + ) def _get_spatial_extent(self) -> SpatialExtent: """Extract spatial extent from the dataset.""" - if "longitude" in self.dataset.coords and "latitude" in self.dataset.coords: + if "lon" in self.dataset.coords and "lat" in self.dataset.coords: + # For regular gridding lon_min, lon_max = ( - float(self.dataset.longitude.min()), - float(self.dataset.longitude.max()), + float(self.dataset.lon.min()), + float(self.dataset.lon.max()), ) lat_min, lat_max = ( - float(self.dataset.latitude.min()), - float(self.dataset.latitude.max()), + float(self.dataset.lat.min()), + float(self.dataset.lat.max()), ) return SpatialExtent([[lon_min, lat_min, lon_max, lat_max]]) + elif "x" in self.dataset.coords and "y" in self.dataset.coords: + # For irregular gridding + x_min, x_max = ( + float(self.dataset.x.min()), + float(self.dataset.x.max()), + ) + y_min, y_max = ( + float(self.dataset.y.min()), + float(self.dataset.y.max()), + ) + return SpatialExtent([[x_min, y_min, x_max, y_max]]) else: raise ValueError( - "Dataset does not have 'longitude' and 
'latitude' coordinates." + "Dataset does not have recognized spatial coordinates ('lon', 'lat' or 'x', 'y')." ) def _get_temporal_extent(self) -> TemporalExtent: """Extract temporal extent from the dataset.""" if "time" in self.dataset.coords: - time_min = str(self.dataset.time.min().values) - time_max = str(self.dataset.time.max().values) - return TemporalExtent([[time_min, time_max]]) + try: + # Convert the time bounds to datetime objects + time_min = pd.to_datetime( + self.dataset.time.min().values).to_pydatetime() + time_max = pd.to_datetime( + self.dataset.time.max().values).to_pydatetime() + return TemporalExtent([[time_min, time_max]]) + except Exception as e: + raise ValueError(f"Failed to parse temporal extent: {e}") else: raise ValueError("Dataset does not have a 'time' coordinate.") def _get_variables(self) -> List[str]: - """Extract the variable names from the dataset.""" - return list(self.dataset.data_vars.keys()) + """ + Extract variable names from the dataset. + + Prioritize fetching `long_name` or `standard_name` from each variable's attributes. + If neither is available, return the variable's name from `dataset.data_vars.keys()`. + + :return: A list of variable names or descriptions. + """ + variables = [] + for var_name, variable in self.dataset.data_vars.items(): + # Fetch 'long_name' or 'standard_name' if they exist + long_name = variable.attrs.get("long_name") + standard_name = variable.attrs.get("standard_name") + # Prioritize 'long_name', fallback to 'standard_name', then use variable key + variables.append(long_name or standard_name or var_name) + return variables def _get_general_metadata(self) -> dict: - return {'description': self.dataset.attrs['description'], - 'title': self.dataset.attrs['title']} + """ + Extract general metadata from the dataset attributes. + Fallback to default values if the keys are missing. + + :return: A dictionary containing metadata such as 'description' and 'title'. 
+ """ + return { + "description": self.dataset.attrs.get("description", + "No description available."), + "title": self.dataset.attrs.get("title", "No title available."), + } def build_stac_collection( self, collection_id: str, + access_link: Optional[str] = None, + documentation_link: Optional[str] = None, osc_status: str = "ongoing", osc_region: str = "Global", - osc_themes: List[str] = None, + osc_themes: Optional[List[str]] = None, ) -> Collection: """ Build an OSC STAC Collection for the product. + :param access_link: Public access link to the dataset. :param collection_id: Unique ID for the collection. - :param description: Description of the collection. - :param title: Title of the collection. - :param osc_project: Project name for OSC metadata. + :param documentation_link: (Optional) Link to documentation related to the dataset. :param osc_status: Status of the dataset (e.g., "ongoing"). :param osc_region: Geographical region of the dataset. - :param osc_themes: Themes of the dataset (e.g., ["climate", "environment"]). + :param osc_themes: (Optional) Themes of the dataset (e.g., ["climate", "environment"]). :return: A pystac.Collection object. 
""" - # if osc_themes is None: - # osc_themes = [] + + # Set default access link if not provided, assume dataset_id is + # already in deepesdl public s3 + if access_link is None: + access_link = f"s3://deep-esdl-public/{self.dataset_id}" + + # Ensure osc_themes has a default value + if osc_themes is None: + osc_themes = [] # Extract metadata - spatial_extent = self._get_spatial_extent() - temporal_extent = self._get_temporal_extent() - variables = self._get_variables() - description, title = self._get_general_metadata() + try: + spatial_extent = self._get_spatial_extent() + temporal_extent = self._get_temporal_extent() + variables = self._get_variables() + general_metadata = self._get_general_metadata() + except ValueError as e: + raise ValueError(f"Metadata extraction failed: {e}") # Build base STAC Collection collection = Collection( id=collection_id, - description=description, + description=general_metadata.get('description', "No description provided."), extent=Extent(spatial=spatial_extent, temporal=temporal_extent), - title=title, + title=general_metadata.get('title', "Unnamed Collection"), ) # Add OSC extension metadata osc_extension = OscExtension.add_to(collection) + # osc_project and osc_type are fixed constant values osc_extension.osc_project = "deep-earth-system-data-lab" osc_extension.osc_type = "product" osc_extension.osc_status = osc_status @@ -116,24 +181,29 @@ def build_stac_collection( osc_extension.osc_themes = osc_themes osc_extension.osc_variables = variables osc_extension.osc_missions = [] - collection.extra_fields["created"] = datetime.utcnow().isoformat() + "Z" - collection.extra_fields["updated"] = datetime.utcnow().isoformat() + "Z" - return collection + # Add creation and update timestamps for the collection + now_iso = datetime.now(timezone.utc).isoformat() + collection.extra_fields["created"] = now_iso + collection.extra_fields["updated"] = now_iso + collection_name = f"{general_metadata.get('title', collection_id).replace(' ', 
'-').lower()}.json" + collection.set_self_href(collection_name) + + + collection.add_link(Link(rel="self", target=access_link, + title="Access")) + if documentation_link: + collection.add_link( + Link(rel="via", target=documentation_link, + title="Documentation")) + + # Validate OSC extension fields + try: + osc_extension.validate_extension() + except ValueError as e: + raise ValueError(f"OSC Extension validation failed: {e}") + + return collection -# Example Usage -if __name__ == "__main__": - zarr_dataset_path = "path/to/zarr/dataset" - collection_id = "example-collection-id" - description = "An example OSC collection built from a Zarr dataset." - title = "Example Collection" - try: - generator = OSCProductSTACGenerator(zarr_dataset_path) - collection = generator.build_stac_collection( - collection_id=collection_id - ) - print(collection.to_dict()) - except Exception as e: - print(f"Error: {e}") From 76c7e8216fee561893e4f83d7d524c22de7643fb Mon Sep 17 00:00:00 2001 From: tejas Date: Mon, 23 Dec 2024 11:45:17 +0100 Subject: [PATCH 03/63] refactor --- .../utils/test_dataset_stac_generator.py | 23 ++++----- .../tests/utils/test_github_automation.py | 4 -- deep_code/tests/utils/test_osc_extension.py | 4 -- deep_code/utils/dataset_stac_generator.py | 49 +++++++++---------- 4 files changed, 32 insertions(+), 48 deletions(-) diff --git a/deep_code/tests/utils/test_dataset_stac_generator.py b/deep_code/tests/utils/test_dataset_stac_generator.py index 2d6aade..74d7662 100644 --- a/deep_code/tests/utils/test_dataset_stac_generator.py +++ b/deep_code/tests/utils/test_dataset_stac_generator.py @@ -8,7 +8,6 @@ class TestOSCProductSTACGenerator(unittest.TestCase): - @patch("deep_code.utils.dataset_stac_generator.new_data_store") def setUp(self, mock_data_store): """Set up a mock dataset and generator.""" @@ -18,17 +17,17 @@ def setUp(self, mock_data_store): "lat": ("lat", np.linspace(-90, 90, 5)), "time": ( "time", - [np.datetime64("2023-01-01T00:00:00Z", "ns"), 
np.datetime64("2023-01-02T00:00:00Z", "ns")] + [ + np.datetime64("2023-01-01T00:00:00Z", "ns"), + np.datetime64("2023-01-02T00:00:00Z", "ns"), + ], ), }, - attrs={ - "description": "Mock dataset for testing.", - "title": "Mock Dataset", - }, + attrs={"description": "Mock dataset for testing.", "title": "Mock Dataset"}, data_vars={ "var1": (("time", "lat", "lon"), np.random.rand(2, 5, 10)), "var2": (("time", "lat", "lon"), np.random.rand(2, 5, 10)), - } + }, ) mock_store = MagicMock() mock_store.open_data.return_value = self.mock_dataset @@ -51,10 +50,7 @@ def test_get_spatial_extent(self): def test_get_temporal_extent(self): """Test temporal extent extraction.""" extent = self.generator._get_temporal_extent() - expected_intervals = [ - datetime(2023, 1, 1, 0, 0), - datetime(2023, 1, 2, 0, 0), - ] + expected_intervals = [datetime(2023, 1, 1, 0, 0), datetime(2023, 1, 2, 0, 0)] self.assertEqual(extent.intervals[0], expected_intervals) def test_get_variables(self): @@ -84,7 +80,9 @@ def test_build_stac_collection(self, mock_set_self_href, mock_add_link): self.assertEqual(collection.id, "mock-collection-id") self.assertEqual(collection.description, "Mock dataset for testing.") self.assertEqual(collection.title, "Mock Dataset") - self.assertEqual(collection.extent.spatial.bboxes[0], [-180.0, -90.0, 180.0, 90.0]) + self.assertEqual( + collection.extent.spatial.bboxes[0], [-180.0, -90.0, 180.0, 90.0] + ) self.assertEqual( collection.extent.temporal.intervals[0], [datetime(2023, 1, 1, 0, 0), datetime(2023, 1, 2, 0, 0)], @@ -103,4 +101,3 @@ def test_invalid_temporal_extent(self): self.generator.dataset = Dataset(coords={}) with self.assertRaises(ValueError): self.generator._get_temporal_extent() - diff --git a/deep_code/tests/utils/test_github_automation.py b/deep_code/tests/utils/test_github_automation.py index b858b35..58acc09 100644 --- a/deep_code/tests/utils/test_github_automation.py +++ b/deep_code/tests/utils/test_github_automation.py @@ -117,7 +117,3 @@ def 
test_clean_up(self, mock_chdir, mock_run): mock_chdir.assert_called_once_with("..") mock_run.assert_called_once_with(["rm", "-rf", self.github.local_clone_dir]) - - -if __name__ == "__main__": - unittest.main() diff --git a/deep_code/tests/utils/test_osc_extension.py b/deep_code/tests/utils/test_osc_extension.py index 6670f5c..07866fb 100644 --- a/deep_code/tests/utils/test_osc_extension.py +++ b/deep_code/tests/utils/test_osc_extension.py @@ -113,7 +113,3 @@ def test_validation_missing_fields(self): extension.validate_extension() self.assertIn("Missing required fields", str(context.exception)) self.assertIn("osc:type", str(context.exception)) - - -if __name__ == "__main__": - unittest.main() diff --git a/deep_code/utils/dataset_stac_generator.py b/deep_code/utils/dataset_stac_generator.py index d0559bc..cb4a9b3 100644 --- a/deep_code/utils/dataset_stac_generator.py +++ b/deep_code/utils/dataset_stac_generator.py @@ -24,9 +24,9 @@ def __init__(self, dataset_id: str): def _open_dataset(self): """Open the dataset using a s3 store as a xarray Dataset.""" try: - store = new_data_store('s3', - root="deep-esdl-public", - storage_options=dict(anon=True)) + store = new_data_store( + "s3", root="deep-esdl-public", storage_options=dict(anon=True) + ) return store.open_data(self.dataset_id) except Exception as e: try: @@ -37,12 +37,14 @@ def _open_dataset(self): anon=False, key=os.environ.get("S3_USER_STORAGE_KEY"), secret=os.environ.get("S3_USER_STORAGE_SECRET"), - ) + ), ) return store.open_data(self.dataset_id) except Exception as inner_e: - raise ValueError(f"Failed to open Zarr dataset with ID " - f"{self.dataset_id}: {inner_e}") from e + raise ValueError( + f"Failed to open Zarr dataset with ID " + f"{self.dataset_id}: {inner_e}" + ) from e except Exception as e: raise ValueError( @@ -64,14 +66,8 @@ def _get_spatial_extent(self) -> SpatialExtent: return SpatialExtent([[lon_min, lat_min, lon_max, lat_max]]) elif "x" in self.dataset.coords and "y" in self.dataset.coords: 
# For irregular gridding - x_min, x_max = ( - float(self.dataset.x.min()), - float(self.dataset.x.max()), - ) - y_min, y_max = ( - float(self.dataset.y.min()), - float(self.dataset.y.max()), - ) + x_min, x_max = (float(self.dataset.x.min()), float(self.dataset.x.max())) + y_min, y_max = (float(self.dataset.y.min()), float(self.dataset.y.max())) return SpatialExtent([[x_min, y_min, x_max, y_max]]) else: raise ValueError( @@ -84,9 +80,11 @@ def _get_temporal_extent(self) -> TemporalExtent: try: # Convert the time bounds to datetime objects time_min = pd.to_datetime( - self.dataset.time.min().values).to_pydatetime() + self.dataset.time.min().values + ).to_pydatetime() time_max = pd.to_datetime( - self.dataset.time.max().values).to_pydatetime() + self.dataset.time.max().values + ).to_pydatetime() return TemporalExtent([[time_min, time_max]]) except Exception as e: raise ValueError(f"Failed to parse temporal extent: {e}") @@ -119,8 +117,9 @@ def _get_general_metadata(self) -> dict: :return: A dictionary containing metadata such as 'description' and 'title'. """ return { - "description": self.dataset.attrs.get("description", - "No description available."), + "description": self.dataset.attrs.get( + "description", "No description available." 
+ ), "title": self.dataset.attrs.get("title", "No title available."), } @@ -166,9 +165,9 @@ def build_stac_collection( # Build base STAC Collection collection = Collection( id=collection_id, - description=general_metadata.get('description', "No description provided."), + description=general_metadata.get("description", "No description provided."), extent=Extent(spatial=spatial_extent, temporal=temporal_extent), - title=general_metadata.get('title', "Unnamed Collection"), + title=general_metadata.get("title", "Unnamed Collection"), ) # Add OSC extension metadata @@ -190,13 +189,11 @@ def build_stac_collection( collection_name = f"{general_metadata.get('title', collection_id).replace(' ', '-').lower()}.json" collection.set_self_href(collection_name) - - collection.add_link(Link(rel="self", target=access_link, - title="Access")) + collection.add_link(Link(rel="self", target=access_link, title="Access")) if documentation_link: collection.add_link( - Link(rel="via", target=documentation_link, - title="Documentation")) + Link(rel="via", target=documentation_link, title="Documentation") + ) # Validate OSC extension fields try: @@ -205,5 +202,3 @@ def build_stac_collection( raise ValueError(f"OSC Extension validation failed: {e}") return collection - - From 79321d9db5a20bc7d53a8be438a0375764a399d1 Mon Sep 17 00:00:00 2001 From: tejas Date: Mon, 23 Dec 2024 12:08:09 +0100 Subject: [PATCH 04/63] The parameters of build_stac_collection have been moved to the class initializer --- .../utils/test_dataset_stac_generator.py | 26 +++--- deep_code/utils/dataset_stac_generator.py | 79 ++++++++----------- 2 files changed, 49 insertions(+), 56 deletions(-) diff --git a/deep_code/tests/utils/test_dataset_stac_generator.py b/deep_code/tests/utils/test_dataset_stac_generator.py index 74d7662..5c3c602 100644 --- a/deep_code/tests/utils/test_dataset_stac_generator.py +++ b/deep_code/tests/utils/test_dataset_stac_generator.py @@ -1,8 +1,9 @@ import unittest from unittest.mock import patch, 
MagicMock -from datetime import datetime +from datetime import datetime, timezone from deep_code.utils.dataset_stac_generator import OSCProductSTACGenerator from pystac import Collection +from pystac.extensions.base import PropertiesExtension from xarray import Dataset import numpy as np @@ -18,8 +19,8 @@ def setUp(self, mock_data_store): "time": ( "time", [ - np.datetime64("2023-01-01T00:00:00Z", "ns"), - np.datetime64("2023-01-02T00:00:00Z", "ns"), + np.datetime64(datetime(2023, 1, 1), "ns"), + np.datetime64(datetime(2023, 1, 2), "ns"), ], ), }, @@ -33,7 +34,15 @@ def setUp(self, mock_data_store): mock_store.open_data.return_value = self.mock_dataset mock_data_store.return_value = mock_store - self.generator = OSCProductSTACGenerator("mock-dataset-id") + self.generator = OSCProductSTACGenerator( + dataset_id="mock-dataset-id", + collection_id="mock-collection-id", + access_link="s3://mock-bucket/mock-dataset", + documentation_link="https://example.com/docs", + osc_status="ongoing", + osc_region="Global", + osc_themes=["climate", "environment"], + ) def test_open_dataset(self): """Test if the dataset is opened correctly.""" @@ -68,14 +77,7 @@ def test_get_general_metadata(self): @patch("pystac.Collection.set_self_href") def test_build_stac_collection(self, mock_set_self_href, mock_add_link): """Test STAC collection creation.""" - collection = self.generator.build_stac_collection( - collection_id="mock-collection-id", - access_link="s3://mock-bucket/mock-dataset", - documentation_link="https://example.com/docs", - osc_status="ongoing", - osc_region="Global", - osc_themes=["climate", "environment"], - ) + collection = self.generator.build_stac_collection() self.assertIsInstance(collection, Collection) self.assertEqual(collection.id, "mock-collection-id") self.assertEqual(collection.description, "Mock dataset for testing.") diff --git a/deep_code/utils/dataset_stac_generator.py b/deep_code/utils/dataset_stac_generator.py index cb4a9b3..abcfd78 100644 --- 
a/deep_code/utils/dataset_stac_generator.py +++ b/deep_code/utils/dataset_stac_generator.py @@ -12,17 +12,38 @@ class OSCProductSTACGenerator: A class to generate OSC STAC Collections for a product from Zarr datasets. """ - def __init__(self, dataset_id: str): + def __init__( + self, + dataset_id: str, + collection_id: str, + access_link: Optional[str] = None, + documentation_link: Optional[str] = None, + osc_status: str = "ongoing", + osc_region: str = "Global", + osc_themes: Optional[List[str]] = None, + ): """ - Initialize the generator with the path to the Zarr dataset. + Initialize the generator with the path to the Zarr dataset and metadata. - :param dataset_path: Path to the Zarr dataset. + :param dataset_id: Path to the Zarr dataset. + :param collection_id: Unique ID for the collection. + :param access_link: Public access link to the dataset. + :param documentation_link: Link to documentation related to the dataset. + :param osc_status: Status of the dataset (e.g., "ongoing"). + :param osc_region: Geographical region of the dataset. + :param osc_themes: Themes of the dataset (e.g., ["climate", "environment"]). 
""" self.dataset_id = dataset_id + self.collection_id = collection_id + self.access_link = access_link or f"s3://deep-esdl-public/{dataset_id}" + self.documentation_link = documentation_link + self.osc_status = osc_status + self.osc_region = osc_region + self.osc_themes = osc_themes or [] self.dataset = self._open_dataset() def _open_dataset(self): - """Open the dataset using a s3 store as a xarray Dataset.""" + """Open the dataset using a S3 store as an xarray Dataset.""" try: store = new_data_store( "s3", root="deep-esdl-public", storage_options=dict(anon=True) @@ -42,15 +63,9 @@ def _open_dataset(self): return store.open_data(self.dataset_id) except Exception as inner_e: raise ValueError( - f"Failed to open Zarr dataset with ID " - f"{self.dataset_id}: {inner_e}" + f"Failed to open Zarr dataset with ID {self.dataset_id}: {inner_e}" ) from e - except Exception as e: - raise ValueError( - f"Failed to open Zarr dataset with ID " f"{self.dataset_id}: {e}" - ) - def _get_spatial_extent(self) -> SpatialExtent: """Extract spatial extent from the dataset.""" if "lon" in self.dataset.coords and "lat" in self.dataset.coords: @@ -123,36 +138,12 @@ def _get_general_metadata(self) -> dict: "title": self.dataset.attrs.get("title", "No title available."), } - def build_stac_collection( - self, - collection_id: str, - access_link: Optional[str] = None, - documentation_link: Optional[str] = None, - osc_status: str = "ongoing", - osc_region: str = "Global", - osc_themes: Optional[List[str]] = None, - ) -> Collection: + def build_stac_collection(self) -> Collection: """ Build an OSC STAC Collection for the product. - :param access_link: Public access link to the dataset. - :param collection_id: Unique ID for the collection. - :param documentation_link: (Optional) Link to documentation related to the dataset. - :param osc_status: Status of the dataset (e.g., "ongoing"). - :param osc_region: Geographical region of the dataset. 
- :param osc_themes: (Optional) Themes of the dataset (e.g., ["climate", "environment"]). :return: A pystac.Collection object. """ - - # Set default access link if not provided, assume dataset_id is - # already in deepesdl public s3 - if access_link is None: - access_link = f"s3://deep-esdl-public/{self.dataset_id}" - - # Ensure osc_themes has a default value - if osc_themes is None: - osc_themes = [] - # Extract metadata try: spatial_extent = self._get_spatial_extent() @@ -164,7 +155,7 @@ def build_stac_collection( # Build base STAC Collection collection = Collection( - id=collection_id, + id=self.collection_id, description=general_metadata.get("description", "No description provided."), extent=Extent(spatial=spatial_extent, temporal=temporal_extent), title=general_metadata.get("title", "Unnamed Collection"), @@ -175,9 +166,9 @@ def build_stac_collection( # osc_project and osc_type are fixed constant values osc_extension.osc_project = "deep-earth-system-data-lab" osc_extension.osc_type = "product" - osc_extension.osc_status = osc_status - osc_extension.osc_region = osc_region - osc_extension.osc_themes = osc_themes + osc_extension.osc_status = self.osc_status + osc_extension.osc_region = self.osc_region + osc_extension.osc_themes = self.osc_themes osc_extension.osc_variables = variables osc_extension.osc_missions = [] @@ -186,13 +177,13 @@ def build_stac_collection( collection.extra_fields["created"] = now_iso collection.extra_fields["updated"] = now_iso - collection_name = f"{general_metadata.get('title', collection_id).replace(' ', '-').lower()}.json" + collection_name = f"{general_metadata.get('title', self.collection_id).replace(' ', '-').lower()}.json" collection.set_self_href(collection_name) - collection.add_link(Link(rel="self", target=access_link, title="Access")) - if documentation_link: + collection.add_link(Link(rel="self", target=self.access_link, title="Access")) + if self.documentation_link: collection.add_link( - Link(rel="via", 
target=documentation_link, title="Documentation") + Link(rel="via", target=self.documentation_link, title="Documentation") ) # Validate OSC extension fields From b8523159244974dcbda8d3c09372cf29e6c4d23e Mon Sep 17 00:00:00 2001 From: tejas Date: Mon, 23 Dec 2024 13:09:33 +0100 Subject: [PATCH 05/63] modified logic for open_data --- .../utils/test_dataset_stac_generator.py | 103 +++++++++++++++++- deep_code/utils/dataset_stac_generator.py | 75 ++++++++++--- 2 files changed, 159 insertions(+), 19 deletions(-) diff --git a/deep_code/tests/utils/test_dataset_stac_generator.py b/deep_code/tests/utils/test_dataset_stac_generator.py index 5c3c602..ed8bda5 100644 --- a/deep_code/tests/utils/test_dataset_stac_generator.py +++ b/deep_code/tests/utils/test_dataset_stac_generator.py @@ -3,9 +3,9 @@ from datetime import datetime, timezone from deep_code.utils.dataset_stac_generator import OSCProductSTACGenerator from pystac import Collection -from pystac.extensions.base import PropertiesExtension from xarray import Dataset import numpy as np +import os class TestOSCProductSTACGenerator(unittest.TestCase): @@ -103,3 +103,104 @@ def test_invalid_temporal_extent(self): self.generator.dataset = Dataset(coords={}) with self.assertRaises(ValueError): self.generator._get_temporal_extent() + + +class TestOpenDataset(unittest.TestCase): + @patch("deep_code.utils.dataset_stac_generator.new_data_store") + @patch("deep_code.utils.dataset_stac_generator.logging.getLogger") + def test_open_dataset_success_public_store(self, mock_logger, mock_new_data_store): + """Test dataset opening with the public store configuration.""" + # Create a mock store and mock its `open_data` method + mock_store = MagicMock() + mock_new_data_store.return_value = mock_store + mock_store.open_data.return_value = "mock_dataset" + + # Instantiate the generator (this will implicitly call _open_dataset) + generator = OSCProductSTACGenerator("mock-dataset-id", "mock-collection-id") + + # Validate that the dataset is 
assigned correctly + self.assertEqual(generator.dataset, "mock_dataset") + + # Validate that `new_data_store` was called once with the correct parameters + mock_new_data_store.assert_called_once_with( + "s3", root="deep-esdl-public", storage_options={"anon": True} + ) + + # Ensure `open_data` was called once on the returned store + mock_store.open_data.assert_called_once_with("mock-dataset-id") + + # Validate logging behavior + mock_logger().info.assert_any_call( + "Attempting to open dataset with configuration: Public store" + ) + mock_logger().info.assert_any_call( + "Successfully opened dataset with configuration: Public store" + ) + + @patch("deep_code.utils.dataset_stac_generator.new_data_store") + @patch("deep_code.utils.dataset_stac_generator.logging.getLogger") + def test_open_dataset_success_authenticated_store( + self, mock_logger, mock_new_data_store + ): + """Test dataset opening with the authenticated store configuration.""" + # Simulate public store failure + mock_store = MagicMock() + mock_new_data_store.side_effect = [ + Exception("Public store failure"), + # First call (public store) raises an exception + mock_store, + # Second call (authenticated store) returns a mock store + ] + mock_store.open_data.return_value = "mock_dataset" + + os.environ["S3_USER_STORAGE_BUCKET"] = "mock-bucket" + os.environ["S3_USER_STORAGE_KEY"] = "mock-key" + os.environ["S3_USER_STORAGE_SECRET"] = "mock-secret" + + generator = OSCProductSTACGenerator("mock-dataset-id", "mock-collection-id") + + # Validate that the dataset was successfully opened with the authenticated store + self.assertEqual(generator.dataset, "mock_dataset") + self.assertEqual(mock_new_data_store.call_count, 2) + + # Validate calls to `new_data_store` + mock_new_data_store.assert_any_call( + "s3", root="deep-esdl-public", storage_options={"anon": True} + ) + mock_new_data_store.assert_any_call( + "s3", + root="mock-bucket", + storage_options={"anon": False, "key": "mock-key", "secret": "mock-secret"}, 
+ ) + + # Validate logging calls + mock_logger().info.assert_any_call( + "Attempting to open dataset with configuration: Public store" + ) + mock_logger().info.assert_any_call( + "Attempting to open dataset with configuration: Authenticated store" + ) + mock_logger().info.assert_any_call( + "Successfully opened dataset with configuration: Authenticated store" + ) + + @patch("deep_code.utils.dataset_stac_generator.new_data_store") + @patch("deep_code.utils.dataset_stac_generator.logging.getLogger") + def test_open_dataset_failure(self, mock_logger, mock_new_data_store): + """Test dataset opening failure with all configurations.""" + # Simulate all store failures + mock_new_data_store.side_effect = Exception("Store failure") + os.environ["S3_USER_STORAGE_BUCKET"] = "mock-bucket" + os.environ["S3_USER_STORAGE_KEY"] = "mock-key" + os.environ["S3_USER_STORAGE_SECRET"] = "mock-secret" + + with self.assertRaises(ValueError) as context: + OSCProductSTACGenerator("mock-dataset-id", "mock-collection-id") + + self.assertIn( + "Failed to open Zarr dataset with ID mock-dataset-id", + str(context.exception), + ) + self.assertIn("Public store, Authenticated store", str(context.exception)) + self.assertEqual(mock_new_data_store.call_count, 2) + mock_logger().critical.assert_called_once() diff --git a/deep_code/utils/dataset_stac_generator.py b/deep_code/utils/dataset_stac_generator.py index abcfd78..7692b6e 100644 --- a/deep_code/utils/dataset_stac_generator.py +++ b/deep_code/utils/dataset_stac_generator.py @@ -5,6 +5,7 @@ from typing import List, Optional from xcube.core.store import new_data_store from deep_code.utils.osc_extension import OscExtension +import logging class OSCProductSTACGenerator: @@ -44,27 +45,65 @@ def __init__( def _open_dataset(self): """Open the dataset using a S3 store as an xarray Dataset.""" - try: - store = new_data_store( - "s3", root="deep-esdl-public", storage_options=dict(anon=True) - ) - return store.open_data(self.dataset_id) - except Exception 
as e: + # Configure logging + logger = logging.getLogger(__name__) + + store_configs = [ + { + "description": "Public store", + "params": { + "storage_type": "s3", + "root": "deep-esdl-public", + "storage_options": {"anon": True}, + }, + }, + { + "description": "Authenticated store", + "params": { + "storage_type": "s3", + "root": os.environ.get("S3_USER_STORAGE_BUCKET"), + "storage_options": { + "anon": False, + "key": os.environ.get("S3_USER_STORAGE_KEY"), + "secret": os.environ.get("S3_USER_STORAGE_SECRET"), + }, + }, + }, + ] + + # Iterate through configurations and attempt to open the dataset + last_exception = None + tried_configurations = [] + for config in store_configs: + tried_configurations.append(config["description"]) try: + logger.info( + f"Attempting to open dataset with configuration: {config['description']}" + ) store = new_data_store( - "s3", - root=os.environ["S3_USER_STORAGE_BUCKET"], - storage_options=dict( - anon=False, - key=os.environ.get("S3_USER_STORAGE_KEY"), - secret=os.environ.get("S3_USER_STORAGE_SECRET"), - ), + config["params"]["storage_type"], + root=config["params"]["root"], + storage_options=config["params"]["storage_options"], ) - return store.open_data(self.dataset_id) - except Exception as inner_e: - raise ValueError( - f"Failed to open Zarr dataset with ID {self.dataset_id}: {inner_e}" - ) from e + # Try to open the dataset; return immediately if successful + dataset = store.open_data(self.dataset_id) + logger.info( + f"Successfully opened dataset with configuration: {config['description']}" + ) + return dataset + except Exception as e: + logger.error( + f"Failed to open dataset with configuration: {config['description']}. Error: {e}" + ) + last_exception = e + + # If all attempts fail, raise an error + logger.critical( + f"Failed to open Zarr dataset with ID {self.dataset_id}. Tried configurations: {', '.join(tried_configurations)}. 
Last error: {last_exception}" + ) + raise ValueError( + f"Failed to open Zarr dataset with ID {self.dataset_id}. Tried configurations: {', '.join(tried_configurations)}. Last error: {last_exception}" + ) def _get_spatial_extent(self) -> SpatialExtent: """Extract spatial extent from the dataset.""" From 6830d271ab8821c18ef890fdf9bd3f787019be71 Mon Sep 17 00:00:00 2001 From: tejas Date: Fri, 27 Dec 2024 11:04:18 +0100 Subject: [PATCH 06/63] reordered imports and made logger part of the class --- deep_code/utils/dataset_stac_generator.py | 55 +++++++++++----- deep_code/utils/github_automation.py | 76 ++++++++++++++--------- deep_code/utils/osc_extension.py | 2 + 3 files changed, 90 insertions(+), 43 deletions(-) diff --git a/deep_code/utils/dataset_stac_generator.py b/deep_code/utils/dataset_stac_generator.py index 7692b6e..15cd5de 100644 --- a/deep_code/utils/dataset_stac_generator.py +++ b/deep_code/utils/dataset_stac_generator.py @@ -1,11 +1,13 @@ import os -import pandas as pd -from pystac import Collection, Extent, Link, SpatialExtent, TemporalExtent +import logging from datetime import datetime, timezone from typing import List, Optional + +import pandas as pd +from pystac import Collection, Extent, Link, SpatialExtent, TemporalExtent from xcube.core.store import new_data_store + from deep_code.utils.osc_extension import OscExtension -import logging class OSCProductSTACGenerator: @@ -41,12 +43,11 @@ def __init__( self.osc_status = osc_status self.osc_region = osc_region self.osc_themes = osc_themes or [] + self.logger = logging.getLogger(__name__) self.dataset = self._open_dataset() def _open_dataset(self): """Open the dataset using a S3 store as an xarray Dataset.""" - # Configure logging - logger = logging.getLogger(__name__) store_configs = [ { @@ -77,7 +78,7 @@ def _open_dataset(self): for config in store_configs: tried_configurations.append(config["description"]) try: - logger.info( + self.logger.info( f"Attempting to open dataset with configuration: 
{config['description']}" ) store = new_data_store( @@ -87,18 +88,18 @@ def _open_dataset(self): ) # Try to open the dataset; return immediately if successful dataset = store.open_data(self.dataset_id) - logger.info( + self.logger.info( f"Successfully opened dataset with configuration: {config['description']}" ) return dataset except Exception as e: - logger.error( + self.logger.error( f"Failed to open dataset with configuration: {config['description']}. Error: {e}" ) last_exception = e # If all attempts fail, raise an error - logger.critical( + self.logger.critical( f"Failed to open Zarr dataset with ID {self.dataset_id}. Tried configurations: {', '.join(tried_configurations)}. Last error: {last_exception}" ) raise ValueError( @@ -156,9 +157,17 @@ def _get_variables(self) -> List[str]: """ variables = [] for var_name, variable in self.dataset.data_vars.items(): - # Fetch 'long_name' or 'standard_name' if they exist long_name = variable.attrs.get("long_name") standard_name = variable.attrs.get("standard_name") + # Replace spaces with hyphens and convert to lowercase if attributes exist + long_name = long_name.replace(" ", "-").lower() if long_name else None + standard_name = ( + standard_name.replace(" ", "-").lower() if standard_name else None + ) + if not long_name and not standard_name: + self.logger.error( + f"Metadata missing for variable '{var_name}': 'long_name' and 'standard_name' attributes are not available." 
+ ) # Prioritize 'long_name', fallback to 'standard_name', then use variable key variables.append(long_name or standard_name or var_name) return variables @@ -216,14 +225,32 @@ def build_stac_collection(self) -> Collection: collection.extra_fields["created"] = now_iso collection.extra_fields["updated"] = now_iso - collection_name = f"{general_metadata.get('title', self.collection_id).replace(' ', '-').lower()}.json" - collection.set_self_href(collection_name) - - collection.add_link(Link(rel="self", target=self.access_link, title="Access")) + # Remove any existing root link and re-add it properly + collection.remove_links("root") + collection.add_link( + Link( + rel="root", + target="../../catalog.json", + media_type="application/json", + title="Open Science Catalog", + ) + ) + collection.add_link(Link(rel="via", target=self.access_link, title="Access")) if self.documentation_link: collection.add_link( Link(rel="via", target=self.documentation_link, title="Documentation") ) + collection.add_link( + Link( + rel="parent", + target="../catalog.json", + media_type="application/json", + title="Products", + ) + ) + + self_href = "https://esa-earthcode.github.io/open-science-catalog-metadata/products/deepesdl/collection.json" + collection.set_self_href(self_href) # Validate OSC extension fields try: diff --git a/deep_code/utils/github_automation.py b/deep_code/utils/github_automation.py index 2b6aee9..94e57e2 100644 --- a/deep_code/utils/github_automation.py +++ b/deep_code/utils/github_automation.py @@ -4,6 +4,8 @@ import requests from pathlib import Path +from deep_code.utils.dataset_stac_generator import OSCProductSTACGenerator + class GitHubAutomation: def __init__(self, username: str, token: str, repo_owner: str, repo_name: str): @@ -20,8 +22,10 @@ def __init__(self, username: str, token: str, repo_owner: str, repo_name: str): self.repo_owner = repo_owner self.repo_name = repo_name self.base_repo_url = f"https://github.com/{repo_owner}/{repo_name}.git" - 
self.fork_repo_url = f"https://github.com/{username}/{repo_name}.git" - self.local_clone_dir = "./temp_repo" + self.fork_repo_url = ( + f"https://{username}:{token}@github.com/{username}/{repo_name}.git" + ) + self.local_clone_dir = os.path.join(os.path.expanduser("~"), "temp_repo") def fork_repository(self): """Fork the repository to the user's GitHub account.""" @@ -45,12 +49,15 @@ def create_branch(self, branch_name: str): print(f"Creating new branch: {branch_name}...") subprocess.run(["git", "checkout", "-b", branch_name], check=True) - def add_file(self, file_path: str, content: dict): + def add_file(self, file_path: str, content): """Add a new file to the local repository.""" print(f"Adding new file: {file_path}...") full_path = Path(self.local_clone_dir) / file_path full_path.parent.mkdir(parents=True, exist_ok=True) with open(full_path, "w") as f: + # Convert content to dictionary if it's a PySTAC object + if hasattr(content, "to_dict"): + content = content.to_dict() f.write(json.dumps(content, indent=2)) subprocess.run(["git", "add", str(full_path)], check=True) @@ -85,29 +92,40 @@ def clean_up(self): subprocess.run(["rm", "-rf", self.local_clone_dir]) -# if __name__ == "__main__": -# # Configuration -# GITHUB_USERNAME = "your-username" # Replace with your GitHub username -# GITHUB_TOKEN = "your-personal-access-token" # Replace with your GitHub PAT -# REPO_OWNER = "ESA-EarthCODE" -# REPO_NAME = "open-science-catalog-metadata-testing" -# NEW_BRANCH_NAME = "add-new-collection" -# NEW_FILE_PATH = "products/new-collection-folder/collection.json" -# COLLECTION_CONTENT = { -# "id": "example-collection", -# "description": "An example collection", -# "items": [] -# } -# -# try: -# github_automation = GitHubAutomation(GITHUB_USERNAME, GITHUB_TOKEN, REPO_OWNER, REPO_NAME) -# -# # Automate the process -# github_automation.fork_repository() -# github_automation.clone_repository() -# github_automation.create_branch(NEW_BRANCH_NAME) -# 
github_automation.add_file(NEW_FILE_PATH, COLLECTION_CONTENT) -# github_automation.commit_and_push(NEW_BRANCH_NAME, "Add new collection") -# github_automation.create_pull_request(NEW_BRANCH_NAME, "Add new collection", "This PR adds a new collection folder to the products directory.") -# finally: -# github_automation.clean_up() +if __name__ == "__main__": + # Configuration + GITHUB_USERNAME = "TejasMorbagal" # Replace with your GitHub username + GITHUB_TOKEN = ( + "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" + ) # Replace with your GitHub PAT + REPO_OWNER = "ESA-EarthCODE" + REPO_NAME = "open-science-catalog-metadata-testing" + NEW_BRANCH_NAME = "add-new-collection" + NEW_FILE_PATH = "products/hydrology/collection.json" + collection_id = "example-collection-id" + title = "Example Collection" + dataset_id = "hydrology-1D-0.009deg-100x60x60-3.0.2.zarr" + doc_link = "https://deepesdl.readthedocs.io/en/latest/datasets/hydrology-1D-0-009deg-100x60x60-3-0-2-zarr/" + generator = OSCProductSTACGenerator( + dataset_id, collection_id="example-collection-id", documentation_link=doc_link + ) + collection = generator.build_stac_collection() + + # try: + github_automation = GitHubAutomation( + GITHUB_USERNAME, GITHUB_TOKEN, REPO_OWNER, REPO_NAME + ) + + # Automate the process + github_automation.fork_repository() + github_automation.clone_repository() + github_automation.create_branch(NEW_BRANCH_NAME) + github_automation.add_file(NEW_FILE_PATH, collection) + github_automation.commit_and_push(NEW_BRANCH_NAME, "Add new collection") + github_automation.create_pull_request( + NEW_BRANCH_NAME, + "Add new collection", + "This PR adds a new collection folder to the products directory.", + ) + # finally: + # github_automation.clean_up() diff --git a/deep_code/utils/osc_extension.py b/deep_code/utils/osc_extension.py index 1007cb8..419900f 100644 --- a/deep_code/utils/osc_extension.py +++ b/deep_code/utils/osc_extension.py @@ -1,7 +1,9 @@ from typing import Optional, Union, Literal, List + 
import pystac from pystac import SpatialExtent, TemporalExtent, Extent from pystac.extensions.base import PropertiesExtension, ExtensionManagementMixin + from deep_code.constants import OSC_SCHEMA_URI From e1eda9f6a562e03b0c27a501211b7c36e3f365fe Mon Sep 17 00:00:00 2001 From: tejas Date: Fri, 27 Dec 2024 11:05:34 +0100 Subject: [PATCH 07/63] refactor --- deep_code/utils/github_automation.py | 38 ---------------------------- 1 file changed, 38 deletions(-) diff --git a/deep_code/utils/github_automation.py b/deep_code/utils/github_automation.py index 94e57e2..6f92ecb 100644 --- a/deep_code/utils/github_automation.py +++ b/deep_code/utils/github_automation.py @@ -91,41 +91,3 @@ def clean_up(self): os.chdir("..") subprocess.run(["rm", "-rf", self.local_clone_dir]) - -if __name__ == "__main__": - # Configuration - GITHUB_USERNAME = "TejasMorbagal" # Replace with your GitHub username - GITHUB_TOKEN = ( - "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" - ) # Replace with your GitHub PAT - REPO_OWNER = "ESA-EarthCODE" - REPO_NAME = "open-science-catalog-metadata-testing" - NEW_BRANCH_NAME = "add-new-collection" - NEW_FILE_PATH = "products/hydrology/collection.json" - collection_id = "example-collection-id" - title = "Example Collection" - dataset_id = "hydrology-1D-0.009deg-100x60x60-3.0.2.zarr" - doc_link = "https://deepesdl.readthedocs.io/en/latest/datasets/hydrology-1D-0-009deg-100x60x60-3-0-2-zarr/" - generator = OSCProductSTACGenerator( - dataset_id, collection_id="example-collection-id", documentation_link=doc_link - ) - collection = generator.build_stac_collection() - - # try: - github_automation = GitHubAutomation( - GITHUB_USERNAME, GITHUB_TOKEN, REPO_OWNER, REPO_NAME - ) - - # Automate the process - github_automation.fork_repository() - github_automation.clone_repository() - github_automation.create_branch(NEW_BRANCH_NAME) - github_automation.add_file(NEW_FILE_PATH, collection) - github_automation.commit_and_push(NEW_BRANCH_NAME, "Add new collection") - 
github_automation.create_pull_request( - NEW_BRANCH_NAME, - "Add new collection", - "This PR adds a new collection folder to the products directory.", - ) - # finally: - # github_automation.clean_up() From ef97cc0abd9d19fa0b592d17286348054a2602f1 Mon Sep 17 00:00:00 2001 From: tejas Date: Fri, 27 Dec 2024 16:15:17 +0100 Subject: [PATCH 08/63] updated git ignore to Stop tracking git.yaml and dataset-config.yaml --- .gitignore | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.gitignore b/.gitignore index 82f9275..4cbe696 100644 --- a/.gitignore +++ b/.gitignore @@ -160,3 +160,7 @@ cython_debug/ # and can be added to the global gitignore or merged into this file. For a more nuclear # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ + +# Ignore git.yaml and dataset-config.yaml +git.yaml +dataset-config.yaml \ No newline at end of file From 6d5d9af06cd96a6a34a1b6e5c209c70f61dbdb62 Mon Sep 17 00:00:00 2001 From: tejas Date: Fri, 27 Dec 2024 16:16:26 +0100 Subject: [PATCH 09/63] updated constants --- deep_code/constants.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/deep_code/constants.py b/deep_code/constants.py index 4362520..1ef32bb 100644 --- a/deep_code/constants.py +++ b/deep_code/constants.py @@ -1 +1,4 @@ OSC_SCHEMA_URI = "https://stac-extensions.github.io/osc/v1.0.0-rc.3/schema.json" +OSC_REPO_OWNER = "ESA-EarthCODE" +OSC_REPO_NAME = "open-science-catalog-metadata-testing" +OSC_NEW_BRANCH_NAME = "add-new-collection" From f8b139f64958f297273fcdac729cc9a85a418632 Mon Sep 17 00:00:00 2001 From: tejas Date: Fri, 27 Dec 2024 16:17:01 +0100 Subject: [PATCH 10/63] refactor --- deep_code/utils/github_automation.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/deep_code/utils/github_automation.py b/deep_code/utils/github_automation.py index 6f92ecb..14355a5 100644 --- a/deep_code/utils/github_automation.py +++ b/deep_code/utils/github_automation.py @@ -4,8 +4,6 @@ import requests from pathlib import 
Path -from deep_code.utils.dataset_stac_generator import OSCProductSTACGenerator - class GitHubAutomation: def __init__(self, username: str, token: str, repo_owner: str, repo_name: str): @@ -90,4 +88,3 @@ def clean_up(self): print("Cleaning up local repository...") os.chdir("..") subprocess.run(["rm", "-rf", self.local_clone_dir]) - From e4fe7f2b0737d11d8d0785171ddc61d4424f1ea2 Mon Sep 17 00:00:00 2001 From: tejas Date: Fri, 27 Dec 2024 16:17:43 +0100 Subject: [PATCH 11/63] add main func for cli --- deep_code/cli/main.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 deep_code/cli/main.py diff --git a/deep_code/cli/main.py b/deep_code/cli/main.py new file mode 100644 index 0000000..d3c98ba --- /dev/null +++ b/deep_code/cli/main.py @@ -0,0 +1,12 @@ +import click + +from deep_code.cli.publish import publish_product + +@click.group() +def main(): + """Deep Code CLI.""" + pass + +main.add_command(publish_product) +if __name__ == "__main__": + main() From 3d35cd0af0266ae7f678b3fb721a17fea84b52e7 Mon Sep 17 00:00:00 2001 From: tejas Date: Fri, 27 Dec 2024 16:17:58 +0100 Subject: [PATCH 12/63] publish as api and cli --- deep_code/api/publish.py | 89 ++++++++++++++++++++++++++++++++++++++++ deep_code/cli/publish.py | 18 ++++++++ 2 files changed, 107 insertions(+) create mode 100644 deep_code/api/publish.py create mode 100644 deep_code/cli/publish.py diff --git a/deep_code/api/publish.py b/deep_code/api/publish.py new file mode 100644 index 0000000..eea87f2 --- /dev/null +++ b/deep_code/api/publish.py @@ -0,0 +1,89 @@ +import logging +import yaml +import fsspec + +from deep_code.utils.github_automation import GitHubAutomation +from deep_code.utils.dataset_stac_generator import OSCProductSTACGenerator +from deep_code.constants import OSC_REPO_OWNER, OSC_REPO_NAME, OSC_NEW_BRANCH_NAME + +logger = logging.getLogger(__name__) +logging.basicConfig(level=logging.INFO) + +class ProductPublisher: + def __init__(self, git_config_path: str): + """ + Initialize 
the ProductPublisher class. + :param git_config_path: Path to the YAML file containing GitHub credentials + """ + with fsspec.open(git_config_path, "r") as file: + git_config = yaml.safe_load(file) or {} + + self.github_username = git_config.get("github-username") + self.github_token = git_config.get("github-token") + + if not self.github_username or not self.github_token: + raise ValueError("GitHub credentials are missing in the git.yaml file.") + + self.github_automation = GitHubAutomation( + self.github_username, self.github_token, OSC_REPO_OWNER, OSC_REPO_NAME + ) + + def publish_product( + self, + dataset_config_path: str, + ): + """ + Publish a product collection to the specified GitHub repository. + + :param dataset_config_path: Path to the YAML file containing dataset configuration + """ + with fsspec.open(dataset_config_path, "r") as file: + dataset_config = yaml.safe_load(file) + + dataset_id = dataset_config.get("dataset-id") + collection_id = dataset_config.get("collection-id") + documentation_link = dataset_config.get("documentation-link") + access_link = dataset_config.get("access-link") + dataset_status = dataset_config.get("dataset-status") + osc_region = dataset_config.get("dataset-region") + dataset_theme = dataset_config.get("dataset-theme") + + if not dataset_id or not collection_id: + raise ValueError("Dataset ID or Collection ID is missing in the dataset-config.yaml file.") + + try: + logger.info("Generating STAC collection...") + generator = OSCProductSTACGenerator( + dataset_id=dataset_id, + collection_id=collection_id, + documentation_link=documentation_link, + access_link=access_link, + osc_status=dataset_status, + osc_region=osc_region, + osc_themes=dataset_theme + ) + collection = generator.build_stac_collection() + collection.extra_fields["documentation_link"] = documentation_link + + file_path = f"products/{collection_id}/collection.json" + logger.info("Automating GitHub tasks...") + self.github_automation.fork_repository() + 
self.github_automation.clone_repository() + self.github_automation.create_branch(OSC_NEW_BRANCH_NAME) + self.github_automation.add_file(file_path, collection.to_dict()) + self.github_automation.commit_and_push( + OSC_NEW_BRANCH_NAME, f"Add new collection: {collection_id}" + ) + pr_url = self.github_automation.create_pull_request( + OSC_NEW_BRANCH_NAME, + f"Add new collection", + "This PR adds a new collection to the repository.", + ) + + logger.info(f"Pull request created: {pr_url}") + + finally: + self.github_automation.clean_up() + + + diff --git a/deep_code/cli/publish.py b/deep_code/cli/publish.py new file mode 100644 index 0000000..2fecb27 --- /dev/null +++ b/deep_code/cli/publish.py @@ -0,0 +1,18 @@ +import click + +from deep_code.api.publish import ProductPublisher + + +@click.command(name="publish-product") +@click.option("--git-config", required=True, type=click.Path(exists=True), + help="Path to the git.yaml file with GitHub credentials.") +@click.option("--dataset-config", required=True, type=click.Path(exists=True), + help="Path to the dataset-config.yaml file with dataset information.") + +def publish_product(git_config, dataset_config): + """ + Command-line interface for the ProductPublisher API. 
+ """ + publisher = ProductPublisher(git_config_path=git_config) + + publisher.publish_product(dataset_config_path= dataset_config) \ No newline at end of file From 48a01de6d42e3d11649d484ab2297c956a9dfcf6 Mon Sep 17 00:00:00 2001 From: tejas Date: Fri, 27 Dec 2024 16:18:14 +0100 Subject: [PATCH 13/63] refactor --- deep_code/api/publish_experiments.py | 1 - deep_code/api/publish_products.py | 1 - 2 files changed, 2 deletions(-) delete mode 100644 deep_code/api/publish_experiments.py delete mode 100644 deep_code/api/publish_products.py diff --git a/deep_code/api/publish_experiments.py b/deep_code/api/publish_experiments.py deleted file mode 100644 index 710bf53..0000000 --- a/deep_code/api/publish_experiments.py +++ /dev/null @@ -1 +0,0 @@ -# Logic for publishing experiments on EarthCODE catalog diff --git a/deep_code/api/publish_products.py b/deep_code/api/publish_products.py deleted file mode 100644 index e9f436f..0000000 --- a/deep_code/api/publish_products.py +++ /dev/null @@ -1 +0,0 @@ -# Logic for publishing products on EarthCODE catalog From e8a2457baed90e5eefb1c003908745cabb953164 Mon Sep 17 00:00:00 2001 From: tejas Date: Fri, 27 Dec 2024 16:18:40 +0100 Subject: [PATCH 14/63] updated env and pyproject.toml --- environment.yml | 2 +- pyproject.toml | 13 +++++++++---- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/environment.yml b/environment.yml index 6166ef7..67bafdb 100644 --- a/environment.yml +++ b/environment.yml @@ -4,7 +4,7 @@ channels: dependencies: # Required - python >=3.9 -# - click + - click - jsonschema - pystac - pyyaml diff --git a/pyproject.toml b/pyproject.toml index b5e1746..17da38b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,5 @@ [build-system] -requires = ["setuptools >= 61.2.0"] +requires = ["setuptools >= 61.2.0", "wheel", "build"] build-backend = "setuptools.build_meta" [project] @@ -8,7 +8,7 @@ dynamic = ["version"] authors = [ {name = "Tejas Morbagal Harish", email = 
"tejas.morbagalharish@brockmann-consult.de"} ] -description = """\ +description = """ deepesdl earthcode integration utility tool """ keywords = [ @@ -21,7 +21,8 @@ requires-python = ">=3.10" dependencies = [ "pystac", "jsonschema", - "click" + "click", + "xcube" ] [tool.setuptools.dynamic] @@ -42,7 +43,11 @@ dev = [ "pytest-recording" ] +# entry point CLI +[project.scripts] +deep-code = "deep_code.cli.main:main" + [project.urls] Repository = "https://github.com/deepesdl/deep-code" Issues = "https://github.com/deepesdl/deep-code/issues" -Changelog = "https://github.com/deepesdl/deep-code/blob/main/CHANGES.md" \ No newline at end of file +Changelog = "https://github.com/deepesdl/deep-code/blob/main/CHANGES.md" From 16c245adfa6ddb82d5178d3b8495a45f756545be Mon Sep 17 00:00:00 2001 From: tejas Date: Fri, 27 Dec 2024 16:18:57 +0100 Subject: [PATCH 15/63] cli as module --- deep_code/cli/__init__.py | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 deep_code/cli/__init__.py diff --git a/deep_code/cli/__init__.py b/deep_code/cli/__init__.py new file mode 100644 index 0000000..399b802 --- /dev/null +++ b/deep_code/cli/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) 2025 by xcube team and contributors +# Permissions are hereby granted under the terms of the MIT License: +# https://opensource.org/licenses/MIT. 
\ No newline at end of file From 15c63da3e537d1dce3efa3a1087704b1af4e1989 Mon Sep 17 00:00:00 2001 From: tejas Date: Fri, 27 Dec 2024 17:20:11 +0100 Subject: [PATCH 16/63] updated constants --- deep_code/constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deep_code/constants.py b/deep_code/constants.py index 1ef32bb..cf10a7a 100644 --- a/deep_code/constants.py +++ b/deep_code/constants.py @@ -1,4 +1,4 @@ OSC_SCHEMA_URI = "https://stac-extensions.github.io/osc/v1.0.0-rc.3/schema.json" OSC_REPO_OWNER = "ESA-EarthCODE" OSC_REPO_NAME = "open-science-catalog-metadata-testing" -OSC_NEW_BRANCH_NAME = "add-new-collection" +OSC_BRANCH_NAME = "add-new-collection" From de4c5c6ed6f53d6a44a18b96aed67e7e7d467d2e Mon Sep 17 00:00:00 2001 From: tejas Date: Fri, 27 Dec 2024 17:26:28 +0100 Subject: [PATCH 17/63] latest state --- deep_code/api/publish.py | 7 +++++-- deep_code/utils/dataset_stac_generator.py | 8 ++++---- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/deep_code/api/publish.py b/deep_code/api/publish.py index eea87f2..736ab5e 100644 --- a/deep_code/api/publish.py +++ b/deep_code/api/publish.py @@ -4,7 +4,7 @@ from deep_code.utils.github_automation import GitHubAutomation from deep_code.utils.dataset_stac_generator import OSCProductSTACGenerator -from deep_code.constants import OSC_REPO_OWNER, OSC_REPO_NAME, OSC_NEW_BRANCH_NAME +from deep_code.constants import OSC_REPO_OWNER, OSC_REPO_NAME, OSC_BRANCH_NAME logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO) @@ -69,10 +69,13 @@ def publish_product( logger.info("Automating GitHub tasks...") self.github_automation.fork_repository() self.github_automation.clone_repository() + OSC_NEW_BRANCH_NAME = OSC_BRANCH_NAME + "-" + collection_id self.github_automation.create_branch(OSC_NEW_BRANCH_NAME) self.github_automation.add_file(file_path, collection.to_dict()) self.github_automation.commit_and_push( - OSC_NEW_BRANCH_NAME, f"Add new collection: {collection_id}" + 
OSC_NEW_BRANCH_NAME, f"Add new " + f"collection:" + f" {collection_id}" ) pr_url = self.github_automation.create_pull_request( OSC_NEW_BRANCH_NAME, diff --git a/deep_code/utils/dataset_stac_generator.py b/deep_code/utils/dataset_stac_generator.py index 15cd5de..1f2831d 100644 --- a/deep_code/utils/dataset_stac_generator.py +++ b/deep_code/utils/dataset_stac_generator.py @@ -24,6 +24,7 @@ def __init__( osc_status: str = "ongoing", osc_region: str = "Global", osc_themes: Optional[List[str]] = None, + osc_missions: Optional[List[str]] = None, ): """ Initialize the generator with the path to the Zarr dataset and metadata. @@ -43,6 +44,7 @@ def __init__( self.osc_status = osc_status self.osc_region = osc_region self.osc_themes = osc_themes or [] + self.osc_missions = osc_missions or [] self.logger = logging.getLogger(__name__) self.dataset = self._open_dataset() @@ -182,8 +184,7 @@ def _get_general_metadata(self) -> dict: return { "description": self.dataset.attrs.get( "description", "No description available." 
- ), - "title": self.dataset.attrs.get("title", "No title available."), + ) } def build_stac_collection(self) -> Collection: @@ -206,7 +207,6 @@ def build_stac_collection(self) -> Collection: id=self.collection_id, description=general_metadata.get("description", "No description provided."), extent=Extent(spatial=spatial_extent, temporal=temporal_extent), - title=general_metadata.get("title", "Unnamed Collection"), ) # Add OSC extension metadata @@ -218,7 +218,7 @@ def build_stac_collection(self) -> Collection: osc_extension.osc_region = self.osc_region osc_extension.osc_themes = self.osc_themes osc_extension.osc_variables = variables - osc_extension.osc_missions = [] + osc_extension.osc_missions = self.osc_missions # Add creation and update timestamps for the collection now_iso = datetime.now(timezone.utc).isoformat() From 58630b82b39d33261b6a70dbfda5c60d88a4f84a Mon Sep 17 00:00:00 2001 From: tejas Date: Mon, 30 Dec 2024 11:48:16 +0100 Subject: [PATCH 18/63] latest state 30.12 --- deep_code/api/check.py | 2 + deep_code/api/check_repository.py | 0 deep_code/api/new.py | 4 ++ deep_code/api/publish.py | 19 ++--- deep_code/api/test.py | 2 + deep_code/cli/__init__.py | 2 +- deep_code/cli/main.py | 2 + deep_code/cli/publish.py | 19 +++-- deep_code/tests/api/test_publish.py | 103 ++++++++++++++++++++++++++++ 9 files changed, 134 insertions(+), 19 deletions(-) create mode 100644 deep_code/api/check.py delete mode 100644 deep_code/api/check_repository.py create mode 100644 deep_code/tests/api/test_publish.py diff --git a/deep_code/api/check.py b/deep_code/api/check.py new file mode 100644 index 0000000..16810c3 --- /dev/null +++ b/deep_code/api/check.py @@ -0,0 +1,2 @@ +# Verify the readiness of an existing workflow repository for experiment publication by +# identifying any issues or missing components diff --git a/deep_code/api/check_repository.py b/deep_code/api/check_repository.py deleted file mode 100644 index e69de29..0000000 diff --git a/deep_code/api/new.py 
b/deep_code/api/new.py index c5c4099..599bf1f 100644 --- a/deep_code/api/new.py +++ b/deep_code/api/new.py @@ -1 +1,5 @@ # Logic for initializing repositories +# Initialize a GitHub repository with the proposed configurations files, an initial workflow +# notebook template (e.g. workflow.ipynb), a template Python package (code and +# pyproject.toml), and a template setup for documentation (e.g., using mkdocs), setup of the +# build pipeline diff --git a/deep_code/api/publish.py b/deep_code/api/publish.py index 736ab5e..7c4a025 100644 --- a/deep_code/api/publish.py +++ b/deep_code/api/publish.py @@ -9,6 +9,7 @@ logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO) + class ProductPublisher: def __init__(self, git_config_path: str): """ @@ -28,10 +29,7 @@ def __init__(self, git_config_path: str): self.github_username, self.github_token, OSC_REPO_OWNER, OSC_REPO_NAME ) - def publish_product( - self, - dataset_config_path: str, - ): + def publish_product(self, dataset_config_path: str): """ Publish a product collection to the specified GitHub repository. @@ -49,7 +47,9 @@ def publish_product( dataset_theme = dataset_config.get("dataset-theme") if not dataset_id or not collection_id: - raise ValueError("Dataset ID or Collection ID is missing in the dataset-config.yaml file.") + raise ValueError( + "Dataset ID or Collection ID is missing in the dataset-config.yaml file." 
+ ) try: logger.info("Generating STAC collection...") @@ -60,7 +60,7 @@ def publish_product( access_link=access_link, osc_status=dataset_status, osc_region=osc_region, - osc_themes=dataset_theme + osc_themes=dataset_theme, ) collection = generator.build_stac_collection() collection.extra_fields["documentation_link"] = documentation_link @@ -73,9 +73,7 @@ def publish_product( self.github_automation.create_branch(OSC_NEW_BRANCH_NAME) self.github_automation.add_file(file_path, collection.to_dict()) self.github_automation.commit_and_push( - OSC_NEW_BRANCH_NAME, f"Add new " - f"collection:" - f" {collection_id}" + OSC_NEW_BRANCH_NAME, f"Add new " f"collection:" f" {collection_id}" ) pr_url = self.github_automation.create_pull_request( OSC_NEW_BRANCH_NAME, @@ -87,6 +85,3 @@ def publish_product( finally: self.github_automation.clean_up() - - - diff --git a/deep_code/api/test.py b/deep_code/api/test.py index e69de29..0e682bc 100644 --- a/deep_code/api/test.py +++ b/deep_code/api/test.py @@ -0,0 +1,2 @@ +# Execute the application package of a published experiment on a subset of input data to +# verify the reproducibility is achieved diff --git a/deep_code/cli/__init__.py b/deep_code/cli/__init__.py index 399b802..dd9064b 100644 --- a/deep_code/cli/__init__.py +++ b/deep_code/cli/__init__.py @@ -1,3 +1,3 @@ # Copyright (c) 2025 by xcube team and contributors # Permissions are hereby granted under the terms of the MIT License: -# https://opensource.org/licenses/MIT. \ No newline at end of file +# https://opensource.org/licenses/MIT. 
diff --git a/deep_code/cli/main.py b/deep_code/cli/main.py index d3c98ba..46c817a 100644 --- a/deep_code/cli/main.py +++ b/deep_code/cli/main.py @@ -2,11 +2,13 @@ from deep_code.cli.publish import publish_product + @click.group() def main(): """Deep Code CLI.""" pass + main.add_command(publish_product) if __name__ == "__main__": main() diff --git a/deep_code/cli/publish.py b/deep_code/cli/publish.py index 2fecb27..66c2dfd 100644 --- a/deep_code/cli/publish.py +++ b/deep_code/cli/publish.py @@ -4,15 +4,22 @@ @click.command(name="publish-product") -@click.option("--git-config", required=True, type=click.Path(exists=True), - help="Path to the git.yaml file with GitHub credentials.") -@click.option("--dataset-config", required=True, type=click.Path(exists=True), - help="Path to the dataset-config.yaml file with dataset information.") - +@click.option( + "--git-config", + required=True, + type=click.Path(exists=True), + help="Path to the git.yaml file with GitHub credentials.", +) +@click.option( + "--dataset-config", + required=True, + type=click.Path(exists=True), + help="Path to the dataset-config.yaml file with dataset information.", +) def publish_product(git_config, dataset_config): """ Command-line interface for the ProductPublisher API. 
""" publisher = ProductPublisher(git_config_path=git_config) - publisher.publish_product(dataset_config_path= dataset_config) \ No newline at end of file + publisher.publish_product(dataset_config_path=dataset_config) diff --git a/deep_code/tests/api/test_publish.py b/deep_code/tests/api/test_publish.py new file mode 100644 index 0000000..5b3a5da --- /dev/null +++ b/deep_code/tests/api/test_publish.py @@ -0,0 +1,103 @@ +import unittest +from unittest.mock import patch, MagicMock, mock_open +import yaml + +from deep_code.api.publish import ProductPublisher + + +class TestProductPublisher(unittest.TestCase): + @patch("fsspec.open") + @patch("deep_code.utils.dataset_stac_generator.OSCProductSTACGenerator") + @patch("deep_code.utils.github_automation.GitHubAutomation") + def test_publish_product_success( + self, mock_github_automation, mock_stac_generator, mock_fsspec_open + ): + # Mock the git.yaml configuration + git_config = {"github-username": "test-user", "github-token": "test-token"} + mock_fsspec_open.side_effect = [ + mock_open(read_data=yaml.dump(git_config)).return_value, + mock_open( + read_data=yaml.dump( + { + "dataset-id": "test-dataset", + "collection-id": "test-collection", + "documentation-link": "http://example.com/doc", + "access-link": "http://example.com/access", + "dataset-status": "active", + "dataset-region": "region-1", + "dataset-theme": ["theme-1", "theme-2"], + } + ) + ).return_value, + ] + + # Mock the STAC generator + mock_generator_instance = mock_stac_generator.return_value + mock_collection = MagicMock() + mock_generator_instance.build_stac_collection.return_value = mock_collection + + # Mock the GitHub automation + mock_github_instance = mock_github_automation.return_value + + # Create an instance of ProductPublisher + publisher = ProductPublisher("path/to/git_config.yaml") + publisher.github_automation = mock_github_instance + + # Call the publish_product method + publisher.publish_product("path/to/dataset_config.yaml") + + # 
Assertions for GitHub automation + mock_github_instance.fork_repository.assert_called_once() + mock_github_instance.clone_repository.assert_called_once() + mock_github_instance.create_branch.assert_called_once_with( + "new-branch-name" + ) # Replace with actual branch name + mock_github_instance.add_file.assert_called_once_with( + "products/test-collection/collection.json", mock_collection.to_dict() + ) + mock_github_instance.commit_and_push.assert_called_once_with( + "new-branch-name", "Add new collection: test-collection" + ) + mock_github_instance.create_pull_request.assert_called_once_with( + "new-branch-name", + "Add new collection", + "This PR adds a new collection to the repository.", + ) + mock_github_instance.clean_up.assert_called_once() + + # Assertions for STAC generator + mock_stac_generator.assert_called_once_with( + dataset_id="test-dataset", + collection_id="test-collection", + documentation_link="http://example.com/doc", + access_link="http://example.com/access", + osc_status="active", + osc_region="region-1", + osc_themes=["theme-1", "theme-2"], + ) + mock_generator_instance.build_stac_collection.assert_called_once() + + @patch("fsspec.open", mock_open(read_data="{}")) + def test_publish_product_missing_config(self): + # Test for missing dataset-id or collection-id + with self.assertRaises(ValueError) as context: + publisher = ProductPublisher("path/to/git_config.yaml") + publisher.publish_product("path/to/dataset_config.yaml") + self.assertEqual( + str(context.exception), + "Dataset ID or Collection ID is missing in the dataset-config.yaml file.", + ) + + def test_missing_git_credentials(self): + # Test for missing GitHub credentials + with patch("fsspec.open", mock_open(read_data="{}")): + with self.assertRaises(ValueError) as context: + ProductPublisher("path/to/git_config.yaml") + self.assertEqual( + str(context.exception), + "GitHub credentials are missing in the git.yaml file.", + ) + + +if __name__ == "__main__": + unittest.main() From 
aadcb076d45cedb6e8b2b9b3704addc7ace19f6d Mon Sep 17 00:00:00 2001 From: tejas Date: Mon, 30 Dec 2024 12:12:48 +0100 Subject: [PATCH 19/63] refactor --- deep_code/api/publish.py | 7 ++++++- deep_code/utils/osc_extension.py | 6 +++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/deep_code/api/publish.py b/deep_code/api/publish.py index 7c4a025..3d77aba 100644 --- a/deep_code/api/publish.py +++ b/deep_code/api/publish.py @@ -63,7 +63,6 @@ def publish_product(self, dataset_config_path: str): osc_themes=dataset_theme, ) collection = generator.build_stac_collection() - collection.extra_fields["documentation_link"] = documentation_link file_path = f"products/{collection_id}/collection.json" logger.info("Automating GitHub tasks...") @@ -85,3 +84,9 @@ def publish_product(self, dataset_config_path: str): finally: self.github_automation.clean_up() + +if __name__ == '__main__': + p = ProductPublisher(git_config_path="/home/tejas/bc/projects/deepesdl" + "/deep-code/git.yaml") + p.publish_product(dataset_config_path="/home/tejas/bc/projects/deepesdl" + "/deep-code/dataset-config.yaml") \ No newline at end of file diff --git a/deep_code/utils/osc_extension.py b/deep_code/utils/osc_extension.py index 419900f..b21ca02 100644 --- a/deep_code/utils/osc_extension.py +++ b/deep_code/utils/osc_extension.py @@ -53,11 +53,11 @@ def osc_project(self, v: str) -> None: self._set_property("osc:project", v, pop_if_none=False) @property - def osc_theme(self) -> Optional[List[str]]: + def osc_themes(self) -> Optional[List[str]]: return self._get_property("osc:themes", list) - @osc_theme.setter - def osc_theme(self, value: List[str]) -> None: + @osc_themes.setter + def osc_themes(self, value: List[str]) -> None: if not isinstance(value, list) or not all( isinstance(item, str) for item in value ): From 5a40d767657aab5756f3ba2bf88912c581d2217d Mon Sep 17 00:00:00 2001 From: tejas Date: Mon, 30 Dec 2024 12:53:22 +0100 Subject: [PATCH 20/63] support cf_parameter in stac generator --- 
deep_code/api/publish.py | 8 ++------ deep_code/tests/utils/test_osc_extension.py | 4 ++-- deep_code/utils/dataset_stac_generator.py | 10 ++++++++++ deep_code/utils/osc_extension.py | 6 +++--- 4 files changed, 17 insertions(+), 11 deletions(-) diff --git a/deep_code/api/publish.py b/deep_code/api/publish.py index 3d77aba..f3e0c4e 100644 --- a/deep_code/api/publish.py +++ b/deep_code/api/publish.py @@ -45,6 +45,7 @@ def publish_product(self, dataset_config_path: str): dataset_status = dataset_config.get("dataset-status") osc_region = dataset_config.get("dataset-region") dataset_theme = dataset_config.get("dataset-theme") + cf_params = dataset_config.get("cf-parameter") if not dataset_id or not collection_id: raise ValueError( @@ -61,6 +62,7 @@ def publish_product(self, dataset_config_path: str): osc_status=dataset_status, osc_region=osc_region, osc_themes=dataset_theme, + cf_params=cf_params, ) collection = generator.build_stac_collection() @@ -84,9 +86,3 @@ def publish_product(self, dataset_config_path: str): finally: self.github_automation.clean_up() - -if __name__ == '__main__': - p = ProductPublisher(git_config_path="/home/tejas/bc/projects/deepesdl" - "/deep-code/git.yaml") - p.publish_product(dataset_config_path="/home/tejas/bc/projects/deepesdl" - "/deep-code/dataset-config.yaml") \ No newline at end of file diff --git a/deep_code/tests/utils/test_osc_extension.py b/deep_code/tests/utils/test_osc_extension.py index 07866fb..6e61a38 100644 --- a/deep_code/tests/utils/test_osc_extension.py +++ b/deep_code/tests/utils/test_osc_extension.py @@ -52,8 +52,8 @@ def test_keywords(self): def test_cf_parameters(self): """Test the cf:parameter property.""" extension = OscExtension.ext(self.collection) - extension.cf_parameters = [{"name": "hydrology-4D"}] - self.assertEqual(extension.cf_parameters, [{"name": "hydrology-4D"}]) + extension.cf_parameter = [{"name": "hydrology-4D"}] + self.assertEqual(extension.cf_parameter, [{"name": "hydrology-4D"}]) def 
test_created_updated(self): """Test the created and updated properties.""" diff --git a/deep_code/utils/dataset_stac_generator.py b/deep_code/utils/dataset_stac_generator.py index 1f2831d..763c57f 100644 --- a/deep_code/utils/dataset_stac_generator.py +++ b/deep_code/utils/dataset_stac_generator.py @@ -25,6 +25,7 @@ def __init__( osc_region: str = "Global", osc_themes: Optional[List[str]] = None, osc_missions: Optional[List[str]] = None, + cf_params: Optional[List[dict[str]]] = None, ): """ Initialize the generator with the path to the Zarr dataset and metadata. @@ -45,6 +46,7 @@ def __init__( self.osc_region = osc_region self.osc_themes = osc_themes or [] self.osc_missions = osc_missions or [] + self.cf_params = cf_params or {} self.logger = logging.getLogger(__name__) self.dataset = self._open_dataset() @@ -219,6 +221,14 @@ def build_stac_collection(self) -> Collection: osc_extension.osc_themes = self.osc_themes osc_extension.osc_variables = variables osc_extension.osc_missions = self.osc_missions + if self.cf_params: + osc_extension.cf_parameter = self.cf_params + else: + osc_extension.cf_parameter = [ + { + "Name": self.collection_id + } + ] # Add creation and update timestamps for the collection now_iso = datetime.now(timezone.utc).isoformat() diff --git a/deep_code/utils/osc_extension.py b/deep_code/utils/osc_extension.py index b21ca02..829de44 100644 --- a/deep_code/utils/osc_extension.py +++ b/deep_code/utils/osc_extension.py @@ -113,11 +113,11 @@ def keywords(self, value: List[str]) -> None: # CF Parameters @property - def cf_parameters(self) -> Optional[List[dict]]: + def cf_parameter(self) -> Optional[List[dict]]: return self._get_property("cf:parameter", list) - @cf_parameters.setter - def cf_parameters(self, value: List[dict]) -> None: + @cf_parameter.setter + def cf_parameter(self, value: List[dict]) -> None: if not isinstance(value, list) or not all( isinstance(item, dict) for item in value ): From 4f95d7799a91e2902c4ce43fba01fc5c77224c05 Mon Sep 17 
00:00:00 2001 From: tejas Date: Mon, 30 Dec 2024 13:09:05 +0100 Subject: [PATCH 21/63] modify the get_schema_uri method to handle osc and cf ext uris --- deep_code/constants.py | 1 + deep_code/utils/osc_extension.py | 26 ++++++++++++++++++-------- 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/deep_code/constants.py b/deep_code/constants.py index cf10a7a..f221abd 100644 --- a/deep_code/constants.py +++ b/deep_code/constants.py @@ -1,4 +1,5 @@ OSC_SCHEMA_URI = "https://stac-extensions.github.io/osc/v1.0.0-rc.3/schema.json" +CF_SCHEMA_URI = "https://stac-extensions.github.io/cf/v0.2.0/schema.json" OSC_REPO_OWNER = "ESA-EarthCODE" OSC_REPO_NAME = "open-science-catalog-metadata-testing" OSC_BRANCH_NAME = "add-new-collection" diff --git a/deep_code/utils/osc_extension.py b/deep_code/utils/osc_extension.py index 829de44..f74d228 100644 --- a/deep_code/utils/osc_extension.py +++ b/deep_code/utils/osc_extension.py @@ -4,7 +4,7 @@ from pystac import SpatialExtent, TemporalExtent, Extent from pystac.extensions.base import PropertiesExtension, ExtensionManagementMixin -from deep_code.constants import OSC_SCHEMA_URI +from deep_code.constants import OSC_SCHEMA_URI, CF_SCHEMA_URI class OscExtension( @@ -142,8 +142,8 @@ def updated(self, value: str) -> None: self._set_property("updated", value, pop_if_none=False) @classmethod - def get_schema_uri(cls) -> str: - return OSC_SCHEMA_URI + def get_schema_uri(cls) -> List[str]: + return [OSC_SCHEMA_URI, CF_SCHEMA_URI] @classmethod def ext( @@ -161,14 +161,24 @@ def ext( @classmethod def has_extension(cls, obj: Union[pystac.Item, pystac.Collection]) -> bool: - """Checks if the OSC extension is present in the object's extensions.""" - return cls.get_schema_uri() in obj.stac_extensions + """Checks if all required extensions are present.""" + schema_uris = cls.get_schema_uri() + if isinstance(schema_uris, list): + return all(uri in obj.stac_extensions for uri in schema_uris) + elif isinstance(schema_uris, str): + return 
schema_uris in obj.stac_extensions @classmethod def add_to(cls, obj: Union[pystac.Item, pystac.Collection]) -> "OscExtension": - """Adds the OSC extension to the object's extensions.""" - if cls.get_schema_uri() not in obj.stac_extensions: - obj.stac_extensions.append(cls.get_schema_uri()) + """Adds the OSC and CF extensions to the object's extensions.""" + schema_uris = cls.get_schema_uri() + if isinstance(schema_uris, list): # Handle list of URIs + for uri in schema_uris: + if uri not in obj.stac_extensions: + obj.stac_extensions.append(uri) + elif isinstance(schema_uris, str): # Handle single URI + if schema_uris not in obj.stac_extensions: + obj.stac_extensions.append(schema_uris) return OscExtension(obj) def validate_extension(self) -> None: From bfe04c85abb93d127c78c2423f4dbc8427a0e19e Mon Sep 17 00:00:00 2001 From: tejas Date: Mon, 30 Dec 2024 15:43:07 +0100 Subject: [PATCH 22/63] refactor line wrap at 88 chars --- deep_code/api/publish.py | 9 +++-- deep_code/utils/dataset_stac_generator.py | 40 ++++++++++++++--------- deep_code/utils/osc_extension.py | 3 +- 3 files changed, 33 insertions(+), 19 deletions(-) diff --git a/deep_code/api/publish.py b/deep_code/api/publish.py index f3e0c4e..2280bd5 100644 --- a/deep_code/api/publish.py +++ b/deep_code/api/publish.py @@ -33,7 +33,8 @@ def publish_product(self, dataset_config_path: str): """ Publish a product collection to the specified GitHub repository. - :param dataset_config_path: Path to the YAML file containing dataset configuration + :param dataset_config_path: Path to the YAML file containing dataset + configuration """ with fsspec.open(dataset_config_path, "r") as file: dataset_config = yaml.safe_load(file) @@ -49,7 +50,8 @@ def publish_product(self, dataset_config_path: str): if not dataset_id or not collection_id: raise ValueError( - "Dataset ID or Collection ID is missing in the dataset-config.yaml file." + "Dataset ID or Collection ID is missing in the dataset-config.yaml " + "file." 
) try: @@ -74,7 +76,8 @@ def publish_product(self, dataset_config_path: str): self.github_automation.create_branch(OSC_NEW_BRANCH_NAME) self.github_automation.add_file(file_path, collection.to_dict()) self.github_automation.commit_and_push( - OSC_NEW_BRANCH_NAME, f"Add new " f"collection:" f" {collection_id}" + OSC_NEW_BRANCH_NAME, f"Add new " f"collection:" + f" {collection_id}" ) pr_url = self.github_automation.create_pull_request( OSC_NEW_BRANCH_NAME, diff --git a/deep_code/utils/dataset_stac_generator.py b/deep_code/utils/dataset_stac_generator.py index 763c57f..b2eff16 100644 --- a/deep_code/utils/dataset_stac_generator.py +++ b/deep_code/utils/dataset_stac_generator.py @@ -83,7 +83,8 @@ def _open_dataset(self): tried_configurations.append(config["description"]) try: self.logger.info( - f"Attempting to open dataset with configuration: {config['description']}" + f"Attempting to open dataset with configuration: " + f"{config['description']}" ) store = new_data_store( config["params"]["storage_type"], @@ -93,21 +94,27 @@ def _open_dataset(self): # Try to open the dataset; return immediately if successful dataset = store.open_data(self.dataset_id) self.logger.info( - f"Successfully opened dataset with configuration: {config['description']}" + f"Successfully opened dataset with configuration: " + f"{config['description']}" ) return dataset except Exception as e: self.logger.error( - f"Failed to open dataset with configuration: {config['description']}. Error: {e}" + f"Failed to open dataset with configuration: " + f"{config['description']}. Error: {e}" ) last_exception = e # If all attempts fail, raise an error self.logger.critical( - f"Failed to open Zarr dataset with ID {self.dataset_id}. Tried configurations: {', '.join(tried_configurations)}. Last error: {last_exception}" + f"Failed to open Zarr dataset with ID {self.dataset_id}. " + f"Tried configurations: {', '.join(tried_configurations)}. 
" + f"Last error: {last_exception}" ) raise ValueError( - f"Failed to open Zarr dataset with ID {self.dataset_id}. Tried configurations: {', '.join(tried_configurations)}. Last error: {last_exception}" + f"Failed to open Zarr dataset with ID {self.dataset_id}. " + f"Tried configurations: {', '.join(tried_configurations)}. " + f"Last error: {last_exception}" ) def _get_spatial_extent(self) -> SpatialExtent: @@ -130,7 +137,8 @@ def _get_spatial_extent(self) -> SpatialExtent: return SpatialExtent([[x_min, y_min, x_max, y_max]]) else: raise ValueError( - "Dataset does not have recognized spatial coordinates ('lon', 'lat' or 'x', 'y')." + "Dataset does not have recognized spatial coordinates " + "('lon', 'lat' or 'x', 'y')." ) def _get_temporal_extent(self) -> TemporalExtent: @@ -154,8 +162,10 @@ def _get_variables(self) -> List[str]: """ Extract variable names from the dataset. - Prioritize fetching `long_name` or `standard_name` from each variable's attributes. - If neither is available, return the variable's name from `dataset.data_vars.keys()`. + Prioritize fetching `long_name` or `standard_name` from each variable's + attributes. + If neither is available, return the variable's name from + `dataset.data_vars.keys()`. :return: A list of variable names or descriptions. """ @@ -170,7 +180,8 @@ def _get_variables(self) -> List[str]: ) if not long_name and not standard_name: self.logger.error( - f"Metadata missing for variable '{var_name}': 'long_name' and 'standard_name' attributes are not available." + f"Metadata missing for variable '{var_name}': 'long_name' and " + f"'standard_name' attributes are not available." 
) # Prioritize 'long_name', fallback to 'standard_name', then use variable key variables.append(long_name or standard_name or var_name) @@ -224,11 +235,7 @@ def build_stac_collection(self) -> Collection: if self.cf_params: osc_extension.cf_parameter = self.cf_params else: - osc_extension.cf_parameter = [ - { - "Name": self.collection_id - } - ] + osc_extension.cf_parameter = [{"Name": self.collection_id}] # Add creation and update timestamps for the collection now_iso = datetime.now(timezone.utc).isoformat() @@ -259,7 +266,10 @@ def build_stac_collection(self) -> Collection: ) ) - self_href = "https://esa-earthcode.github.io/open-science-catalog-metadata/products/deepesdl/collection.json" + self_href = ( + "https://esa-earthcode.github.io/" + "open-science-catalog-metadata/products/deepesdl/collection.json" + ) collection.set_self_href(self_href) # Validate OSC extension fields diff --git a/deep_code/utils/osc_extension.py b/deep_code/utils/osc_extension.py index f74d228..bc3dc15 100644 --- a/deep_code/utils/osc_extension.py +++ b/deep_code/utils/osc_extension.py @@ -149,7 +149,8 @@ def get_schema_uri(cls) -> List[str]: def ext( cls, obj: Union[pystac.Item, pystac.Collection], add_if_missing: bool = False ) -> "OscExtension": - """Returns the OscExtension instance for the given object, adding the extension if missing.""" + """Returns the OscExtension instance for the given object, adding the extension + if missing.""" if cls.has_extension(obj): return OscExtension(obj) elif add_if_missing: From f5ad7f1d3b6673386b01346fa93a51bef7fed273 Mon Sep 17 00:00:00 2001 From: tejas Date: Mon, 30 Dec 2024 16:25:08 +0100 Subject: [PATCH 23/63] python 3.10 typing updates and black code formatting --- deep_code/api/publish.py | 3 +- deep_code/cli/publish.py | 1 - deep_code/utils/dataset_stac_generator.py | 28 ++++++------- deep_code/utils/osc_extension.py | 50 +++++++++++------------ 4 files changed, 40 insertions(+), 42 deletions(-) diff --git a/deep_code/api/publish.py 
b/deep_code/api/publish.py index 2280bd5..ac737a1 100644 --- a/deep_code/api/publish.py +++ b/deep_code/api/publish.py @@ -76,8 +76,7 @@ def publish_product(self, dataset_config_path: str): self.github_automation.create_branch(OSC_NEW_BRANCH_NAME) self.github_automation.add_file(file_path, collection.to_dict()) self.github_automation.commit_and_push( - OSC_NEW_BRANCH_NAME, f"Add new " f"collection:" - f" {collection_id}" + OSC_NEW_BRANCH_NAME, f"Add new collection:{collection_id}" ) pr_url = self.github_automation.create_pull_request( OSC_NEW_BRANCH_NAME, diff --git a/deep_code/cli/publish.py b/deep_code/cli/publish.py index 66c2dfd..8c16335 100644 --- a/deep_code/cli/publish.py +++ b/deep_code/cli/publish.py @@ -21,5 +21,4 @@ def publish_product(git_config, dataset_config): Command-line interface for the ProductPublisher API. """ publisher = ProductPublisher(git_config_path=git_config) - publisher.publish_product(dataset_config_path=dataset_config) diff --git a/deep_code/utils/dataset_stac_generator.py b/deep_code/utils/dataset_stac_generator.py index b2eff16..e5cbde8 100644 --- a/deep_code/utils/dataset_stac_generator.py +++ b/deep_code/utils/dataset_stac_generator.py @@ -1,7 +1,6 @@ import os import logging from datetime import datetime, timezone -from typing import List, Optional import pandas as pd from pystac import Collection, Extent, Link, SpatialExtent, TemporalExtent @@ -19,13 +18,13 @@ def __init__( self, dataset_id: str, collection_id: str, - access_link: Optional[str] = None, - documentation_link: Optional[str] = None, + access_link: str | None = None, + documentation_link: str | None = None, osc_status: str = "ongoing", osc_region: str = "Global", - osc_themes: Optional[List[str]] = None, - osc_missions: Optional[List[str]] = None, - cf_params: Optional[List[dict[str]]] = None, + osc_themes: list[str] | None = None, + osc_missions: list[str] | None = None, + cf_params: list[dict[str]] | None = None, ): """ Initialize the generator with the path to the 
Zarr dataset and metadata. @@ -37,6 +36,8 @@ def __init__( :param osc_status: Status of the dataset (e.g., "ongoing"). :param osc_region: Geographical region of the dataset. :param osc_themes: Themes of the dataset (e.g., ["climate", "environment"]). + :param osc_missions: Satellite mission to which dataset belongs. + :param cf_params: params related to CF metadata convention. """ self.dataset_id = dataset_id self.collection_id = collection_id @@ -158,7 +159,11 @@ def _get_temporal_extent(self) -> TemporalExtent: else: raise ValueError("Dataset does not have a 'time' coordinate.") - def _get_variables(self) -> List[str]: + @staticmethod + def _normalize_name(name: str | None) -> str | None: + return name.replace(" ", "-").lower() if name else None + + def _get_variables(self) -> list[str]: """ Extract variable names from the dataset. @@ -171,13 +176,9 @@ def _get_variables(self) -> List[str]: """ variables = [] for var_name, variable in self.dataset.data_vars.items(): - long_name = variable.attrs.get("long_name") - standard_name = variable.attrs.get("standard_name") # Replace spaces with hyphens and convert to lowercase if attributes exist - long_name = long_name.replace(" ", "-").lower() if long_name else None - standard_name = ( - standard_name.replace(" ", "-").lower() if standard_name else None - ) + long_name = self._normalize_name(variable.attrs.get("long_name")) + standard_name = self._normalize_name(variable.attrs.get("standard_name")) if not long_name and not standard_name: self.logger.error( f"Metadata missing for variable '{var_name}': 'long_name' and " @@ -206,7 +207,6 @@ def build_stac_collection(self) -> Collection: :return: A pystac.Collection object. 
""" - # Extract metadata try: spatial_extent = self._get_spatial_extent() temporal_extent = self._get_temporal_extent() diff --git a/deep_code/utils/osc_extension.py b/deep_code/utils/osc_extension.py index bc3dc15..6bfa1ab 100644 --- a/deep_code/utils/osc_extension.py +++ b/deep_code/utils/osc_extension.py @@ -1,4 +1,4 @@ -from typing import Optional, Union, Literal, List +from typing import Literal import pystac from pystac import SpatialExtent, TemporalExtent, Extent @@ -8,11 +8,11 @@ class OscExtension( - PropertiesExtension, ExtensionManagementMixin[Union[pystac.Item, pystac.Collection]] + PropertiesExtension, ExtensionManagementMixin[pystac.Item | pystac.Collection] ): name: Literal["osc"] = "osc" - def __init__(self, obj: Union[pystac.Item, pystac.Collection]): + def __init__(self, obj: pystac.Item | pystac.Collection): if isinstance(obj, pystac.Collection): self.properties = obj.extra_fields else: @@ -21,7 +21,7 @@ def __init__(self, obj: Union[pystac.Item, pystac.Collection]): # Existing properties... 
@property - def osc_type(self) -> Optional[str]: + def osc_type(self) -> str | None: return self._get_property("osc:type", str) @osc_type.setter @@ -29,7 +29,7 @@ def osc_type(self, v: str) -> None: self._set_property("osc:type", v, pop_if_none=False) @property - def osc_name(self) -> Optional[str]: + def osc_name(self) -> str | None: return self._get_property("osc:name", str) @osc_name.setter @@ -37,7 +37,7 @@ def osc_name(self, v: str) -> None: self._set_property("osc:name", v, pop_if_none=False) @property - def osc_status(self) -> Optional[str]: + def osc_status(self) -> str | None: return self._get_property("osc:status", str) @osc_status.setter @@ -45,7 +45,7 @@ def osc_status(self, value: str) -> None: self._set_property("osc:status", value, pop_if_none=False) @property - def osc_project(self) -> Optional[str]: + def osc_project(self) -> str | None: return self._get_property("osc:project", str) @osc_project.setter @@ -53,11 +53,11 @@ def osc_project(self, v: str) -> None: self._set_property("osc:project", v, pop_if_none=False) @property - def osc_themes(self) -> Optional[List[str]]: + def osc_themes(self) -> list[str] | None: return self._get_property("osc:themes", list) @osc_themes.setter - def osc_themes(self, value: List[str]) -> None: + def osc_themes(self, value: list[str]) -> None: if not isinstance(value, list) or not all( isinstance(item, str) for item in value ): @@ -65,7 +65,7 @@ def osc_themes(self, value: List[str]) -> None: self._set_property("osc:themes", value, pop_if_none=False) @property - def osc_region(self) -> Optional[str]: + def osc_region(self) -> str | None: return self._get_property("osc:region", str) @osc_region.setter @@ -73,11 +73,11 @@ def osc_region(self, value: str) -> None: self._set_property("osc:region", value, pop_if_none=False) @property - def osc_missions(self) -> Optional[List[str]]: + def osc_missions(self) -> list[str] | None: return self._get_property("osc:missions", list) @osc_missions.setter - def osc_missions(self, 
value: List[str]) -> None: + def osc_missions(self, value: list[str]) -> None: if not isinstance(value, list) or not all( isinstance(item, str) for item in value ): @@ -85,26 +85,26 @@ def osc_missions(self, value: List[str]) -> None: self._set_property("osc:missions", value, pop_if_none=False) # Utility methods for handling temporal and spatial extent - def set_extent(self, spatial: List[List[float]], temporal: List[List[str]]) -> None: + def set_extent(self, spatial: list[list[float]], temporal: list[list[str]]) -> None: self.obj.extent = Extent(SpatialExtent(spatial), TemporalExtent(temporal)) @property - def osc_variables(self) -> Optional[List[str]]: + def osc_variables(self) -> list[str] | None: return self._get_property("osc:variables", list) @osc_variables.setter - def osc_variables(self, v: List[str]) -> None: + def osc_variables(self, v: list[str]) -> None: if not isinstance(v, list) or not all(isinstance(item, str) for item in v): raise ValueError("osc:variables must be a list of strings") self._set_property("osc:variables", v, pop_if_none=False) # Keywords property @property - def keywords(self) -> Optional[List[str]]: + def keywords(self) -> list[str] | None: return self._get_property("keywords", list) @keywords.setter - def keywords(self, value: List[str]) -> None: + def keywords(self, value: list[str]) -> None: if not isinstance(value, list) or not all( isinstance(item, str) for item in value ): @@ -113,11 +113,11 @@ def keywords(self, value: List[str]) -> None: # CF Parameters @property - def cf_parameter(self) -> Optional[List[dict]]: + def cf_parameter(self) -> list[dict] | None: return self._get_property("cf:parameter", list) @cf_parameter.setter - def cf_parameter(self, value: List[dict]) -> None: + def cf_parameter(self, value: list[dict]) -> None: if not isinstance(value, list) or not all( isinstance(item, dict) for item in value ): @@ -126,7 +126,7 @@ def cf_parameter(self, value: List[dict]) -> None: # Created and Updated timestamps 
@property - def created(self) -> Optional[str]: + def created(self) -> str | None: return self._get_property("created", str) @created.setter @@ -134,7 +134,7 @@ def created(self, value: str) -> None: self._set_property("created", value, pop_if_none=False) @property - def updated(self) -> Optional[str]: + def updated(self) -> str | None: return self._get_property("updated", str) @updated.setter @@ -142,12 +142,12 @@ def updated(self, value: str) -> None: self._set_property("updated", value, pop_if_none=False) @classmethod - def get_schema_uri(cls) -> List[str]: + def get_schema_uri(cls) -> list[str]: return [OSC_SCHEMA_URI, CF_SCHEMA_URI] @classmethod def ext( - cls, obj: Union[pystac.Item, pystac.Collection], add_if_missing: bool = False + cls, obj: pystac.Item | pystac.Collection, add_if_missing: bool = False ) -> "OscExtension": """Returns the OscExtension instance for the given object, adding the extension if missing.""" @@ -161,7 +161,7 @@ def ext( ) @classmethod - def has_extension(cls, obj: Union[pystac.Item, pystac.Collection]) -> bool: + def has_extension(cls, obj: pystac.Item | pystac.Collection) -> bool: """Checks if all required extensions are present.""" schema_uris = cls.get_schema_uri() if isinstance(schema_uris, list): @@ -170,7 +170,7 @@ def has_extension(cls, obj: Union[pystac.Item, pystac.Collection]) -> bool: return schema_uris in obj.stac_extensions @classmethod - def add_to(cls, obj: Union[pystac.Item, pystac.Collection]) -> "OscExtension": + def add_to(cls, obj: pystac.Item | pystac.Collection) -> "OscExtension": """Adds the OSC and CF extensions to the object's extensions.""" schema_uris = cls.get_schema_uri() if isinstance(schema_uris, list): # Handle list of URIs From 87bb0bdfa551acb9b3a2d757904d1d1bb82940e9 Mon Sep 17 00:00:00 2001 From: tejas Date: Mon, 30 Dec 2024 16:40:16 +0100 Subject: [PATCH 24/63] updated copyright notices --- deep_code/__init__.py | 2 +- deep_code/api/publish.py | 6 ++++++ deep_code/cli/__init__.py | 2 +- 
deep_code/cli/main.py | 6 ++++++ deep_code/cli/publish.py | 6 ++++++ deep_code/constants.py | 6 ++++++ deep_code/utils/dataset_stac_generator.py | 6 ++++++ deep_code/utils/github_automation.py | 6 ++++++ deep_code/utils/osc_extension.py | 6 ++++++ deep_code/version.py | 4 ++-- 10 files changed, 46 insertions(+), 4 deletions(-) diff --git a/deep_code/__init__.py b/deep_code/__init__.py index ac5e73a..ff01dbd 100644 --- a/deep_code/__init__.py +++ b/deep_code/__init__.py @@ -1,5 +1,5 @@ # The MIT License (MIT) -# Copyright (c) 2024 by the xcube development team and contributors +# Copyright (c) 2024 by DeepESDL and Brockmann Consult GmbH # # Permission is hereby granted, free of charge, to any person obtaining a # copy of this software and associated documentation files (the "Software"), diff --git a/deep_code/api/publish.py b/deep_code/api/publish.py index ac737a1..a723013 100644 --- a/deep_code/api/publish.py +++ b/deep_code/api/publish.py @@ -1,3 +1,9 @@ +#!/usr/bin/env python3 + +# Copyright (c) 2024 by Brockmann Consult GmbH +# Permissions are hereby granted under the terms of the MIT License: +# https://opensource.org/licenses/MIT. + import logging import yaml import fsspec diff --git a/deep_code/cli/__init__.py b/deep_code/cli/__init__.py index dd9064b..6f1b5dc 100644 --- a/deep_code/cli/__init__.py +++ b/deep_code/cli/__init__.py @@ -1,3 +1,3 @@ -# Copyright (c) 2025 by xcube team and contributors +# Copyright (c) 2024 by Brockmann Consult GmbH # Permissions are hereby granted under the terms of the MIT License: # https://opensource.org/licenses/MIT. diff --git a/deep_code/cli/main.py b/deep_code/cli/main.py index 46c817a..4f11c44 100644 --- a/deep_code/cli/main.py +++ b/deep_code/cli/main.py @@ -1,3 +1,9 @@ +#!/usr/bin/env python3 + +# Copyright (c) 2024 by Brockmann Consult GmbH +# Permissions are hereby granted under the terms of the MIT License: +# https://opensource.org/licenses/MIT. 
+ import click from deep_code.cli.publish import publish_product diff --git a/deep_code/cli/publish.py b/deep_code/cli/publish.py index 8c16335..5c127c6 100644 --- a/deep_code/cli/publish.py +++ b/deep_code/cli/publish.py @@ -1,3 +1,9 @@ +#!/usr/bin/env python3 + +# Copyright (c) 2024 by Brockmann Consult GmbH +# Permissions are hereby granted under the terms of the MIT License: +# https://opensource.org/licenses/MIT. + import click from deep_code.api.publish import ProductPublisher diff --git a/deep_code/constants.py b/deep_code/constants.py index f221abd..68982bc 100644 --- a/deep_code/constants.py +++ b/deep_code/constants.py @@ -1,3 +1,9 @@ +#!/usr/bin/env python3 + +# Copyright (c) 2024 by Brockmann Consult GmbH +# Permissions are hereby granted under the terms of the MIT License: +# https://opensource.org/licenses/MIT. + OSC_SCHEMA_URI = "https://stac-extensions.github.io/osc/v1.0.0-rc.3/schema.json" CF_SCHEMA_URI = "https://stac-extensions.github.io/cf/v0.2.0/schema.json" OSC_REPO_OWNER = "ESA-EarthCODE" diff --git a/deep_code/utils/dataset_stac_generator.py b/deep_code/utils/dataset_stac_generator.py index e5cbde8..d470c9d 100644 --- a/deep_code/utils/dataset_stac_generator.py +++ b/deep_code/utils/dataset_stac_generator.py @@ -1,3 +1,9 @@ +#!/usr/bin/env python3 + +# Copyright (c) 2024 by Brockmann Consult GmbH +# Permissions are hereby granted under the terms of the MIT License: +# https://opensource.org/licenses/MIT. + import os import logging from datetime import datetime, timezone diff --git a/deep_code/utils/github_automation.py b/deep_code/utils/github_automation.py index 14355a5..fb474c7 100644 --- a/deep_code/utils/github_automation.py +++ b/deep_code/utils/github_automation.py @@ -1,3 +1,9 @@ +#!/usr/bin/env python3 + +# Copyright (c) 2024 by Brockmann Consult GmbH +# Permissions are hereby granted under the terms of the MIT License: +# https://opensource.org/licenses/MIT. 
+ import os import json import subprocess diff --git a/deep_code/utils/osc_extension.py b/deep_code/utils/osc_extension.py index 6bfa1ab..9d0e5e3 100644 --- a/deep_code/utils/osc_extension.py +++ b/deep_code/utils/osc_extension.py @@ -1,3 +1,9 @@ +#!/usr/bin/env python3 + +# Copyright (c) 2024 by Brockmann Consult GmbH +# Permissions are hereby granted under the terms of the MIT License: +# https://opensource.org/licenses/MIT. + from typing import Literal import pystac diff --git a/deep_code/version.py b/deep_code/version.py index 711ec8b..e2b6e0b 100644 --- a/deep_code/version.py +++ b/deep_code/version.py @@ -1,5 +1,5 @@ # The MIT License (MIT) -# Copyright (c) 2024 by the xcube development team and contributors +# Copyright (c) 2024 by DeepESDL and Brockmann Consult GmbH # # Permission is hereby granted, free of charge, to any person obtaining a # copy of this software and associated documentation files (the "Software"), @@ -19,4 +19,4 @@ # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER # DEALINGS IN THE SOFTWARE. 
-version = "0.1.0.dev0" +version = "0.0.1.dev0" From bbd474653742dec37e9cbccdc7c4cd4027052890 Mon Sep 17 00:00:00 2001 From: tejas Date: Mon, 30 Dec 2024 16:56:45 +0100 Subject: [PATCH 25/63] updated environment.yml and pyproject.toml --- environment.yml | 6 ++++-- pyproject.toml | 8 +++++--- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/environment.yml b/environment.yml index 67bafdb..64e1db1 100644 --- a/environment.yml +++ b/environment.yml @@ -6,7 +6,9 @@ dependencies: - python >=3.9 - click - jsonschema + - requests - pystac - pyyaml - - requests - - xcube \ No newline at end of file + - xcube + # test dependencies + - pytest \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 17da38b..5daaf0e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,10 +19,12 @@ readme = {file = "README.md", content-type = "text/markdown"} license = {text = "MIT"} requires-python = ">=3.10" dependencies = [ - "pystac", - "jsonschema", "click", - "xcube" + "jsonschema", + "requests", + "pystac", + "pyyaml", + "xcube-core" ] [tool.setuptools.dynamic] From a56798174af2a5772c8f65e3055d087cabdba824 Mon Sep 17 00:00:00 2001 From: tejas Date: Mon, 30 Dec 2024 17:31:30 +0100 Subject: [PATCH 26/63] updated environment.yml and pyproject.toml --- environment.yml | 1 + pyproject.toml | 1 + 2 files changed, 2 insertions(+) diff --git a/environment.yml b/environment.yml index 64e1db1..0051009 100644 --- a/environment.yml +++ b/environment.yml @@ -5,6 +5,7 @@ dependencies: # Required - python >=3.9 - click + - fsspec - jsonschema - requests - pystac diff --git a/pyproject.toml b/pyproject.toml index 5daaf0e..c44ba3f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,6 +20,7 @@ license = {text = "MIT"} requires-python = ">=3.10" dependencies = [ "click", + "fsspec", "jsonschema", "requests", "pystac", From 76da5aa0dcdb3b603ae06563843b018d3e8ed014 Mon Sep 17 00:00:00 2001 From: tejas Date: Thu, 2 Jan 2025 10:42:18 +0100 Subject: [PATCH 27/63] 
refactor --- .../tests/utils/test_dataset_stac_generator.py | 14 +++++++------- deep_code/tests/utils/test_osc_extension.py | 2 +- deep_code/utils/dataset_stac_generator.py | 2 -- environment.yml | 4 +++- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/deep_code/tests/utils/test_dataset_stac_generator.py b/deep_code/tests/utils/test_dataset_stac_generator.py index ed8bda5..fb2c0d1 100644 --- a/deep_code/tests/utils/test_dataset_stac_generator.py +++ b/deep_code/tests/utils/test_dataset_stac_generator.py @@ -1,11 +1,13 @@ +import os +from datetime import datetime + +import numpy as np +from pystac import Collection import unittest from unittest.mock import patch, MagicMock -from datetime import datetime, timezone -from deep_code.utils.dataset_stac_generator import OSCProductSTACGenerator -from pystac import Collection from xarray import Dataset -import numpy as np -import os + +from deep_code.utils.dataset_stac_generator import OSCProductSTACGenerator class TestOSCProductSTACGenerator(unittest.TestCase): @@ -71,7 +73,6 @@ def test_get_general_metadata(self): """Test general metadata extraction.""" metadata = self.generator._get_general_metadata() self.assertEqual(metadata["description"], "Mock dataset for testing.") - self.assertEqual(metadata["title"], "Mock Dataset") @patch("pystac.Collection.add_link") @patch("pystac.Collection.set_self_href") @@ -81,7 +82,6 @@ def test_build_stac_collection(self, mock_set_self_href, mock_add_link): self.assertIsInstance(collection, Collection) self.assertEqual(collection.id, "mock-collection-id") self.assertEqual(collection.description, "Mock dataset for testing.") - self.assertEqual(collection.title, "Mock Dataset") self.assertEqual( collection.extent.spatial.bboxes[0], [-180.0, -90.0, 180.0, 90.0] ) diff --git a/deep_code/tests/utils/test_osc_extension.py b/deep_code/tests/utils/test_osc_extension.py index 6e61a38..66300cc 100644 --- a/deep_code/tests/utils/test_osc_extension.py +++ 
b/deep_code/tests/utils/test_osc_extension.py @@ -83,7 +83,7 @@ def test_validation_success(self): def test_add_osc_extension(self): osc_ext = OscExtension.add_to(self.collection) - self.assertIn(OscExtension.get_schema_uri(), self.collection.stac_extensions) + self.assertEqual(OscExtension.get_schema_uri(), self.collection.stac_extensions) self.assertIsInstance(osc_ext, OscExtension) def test_has_extension(self): diff --git a/deep_code/utils/dataset_stac_generator.py b/deep_code/utils/dataset_stac_generator.py index d470c9d..d02d795 100644 --- a/deep_code/utils/dataset_stac_generator.py +++ b/deep_code/utils/dataset_stac_generator.py @@ -98,7 +98,6 @@ def _open_dataset(self): root=config["params"]["root"], storage_options=config["params"]["storage_options"], ) - # Try to open the dataset; return immediately if successful dataset = store.open_data(self.dataset_id) self.logger.info( f"Successfully opened dataset with configuration: " @@ -112,7 +111,6 @@ def _open_dataset(self): ) last_exception = e - # If all attempts fail, raise an error self.logger.critical( f"Failed to open Zarr dataset with ID {self.dataset_id}. " f"Tried configurations: {', '.join(tried_configurations)}. 
" diff --git a/environment.yml b/environment.yml index 0051009..25cc383 100644 --- a/environment.yml +++ b/environment.yml @@ -8,8 +8,10 @@ dependencies: - fsspec - jsonschema - requests + - pandas - pystac - pyyaml - xcube # test dependencies - - pytest \ No newline at end of file + - pytest + - numpy \ No newline at end of file From c72044ef83fc3b3f86d32f61dc0631cd722b0bca Mon Sep 17 00:00:00 2001 From: tejas Date: Thu, 2 Jan 2025 11:16:53 +0100 Subject: [PATCH 28/63] refactor imports --- deep_code/api/publish.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deep_code/api/publish.py b/deep_code/api/publish.py index a723013..617d095 100644 --- a/deep_code/api/publish.py +++ b/deep_code/api/publish.py @@ -4,9 +4,9 @@ # Permissions are hereby granted under the terms of the MIT License: # https://opensource.org/licenses/MIT. +import fsspec import logging import yaml -import fsspec from deep_code.utils.github_automation import GitHubAutomation from deep_code.utils.dataset_stac_generator import OSCProductSTACGenerator From 5e4b2ed4b53392ecc814ba0410b0f6ef238149ab Mon Sep 17 00:00:00 2001 From: tejas Date: Thu, 2 Jan 2025 11:17:37 +0100 Subject: [PATCH 29/63] update environment.yml --- environment.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/environment.yml b/environment.yml index 25cc383..d192aae 100644 --- a/environment.yml +++ b/environment.yml @@ -14,4 +14,5 @@ dependencies: - xcube # test dependencies - pytest + - pytest-cov - numpy \ No newline at end of file From 23568fbe67323b078cf2c9e14cc772b6842f077f Mon Sep 17 00:00:00 2001 From: tejas Date: Thu, 2 Jan 2025 14:53:56 +0100 Subject: [PATCH 30/63] make directories module --- deep_code/tests/api/__init__.py | 3 +++ deep_code/tests/utils/__init__.py | 3 +++ deep_code/utils/__init__.py | 3 +++ 3 files changed, 9 insertions(+) create mode 100644 deep_code/tests/api/__init__.py create mode 100644 deep_code/tests/utils/__init__.py create mode 100644 deep_code/utils/__init__.py diff --git 
a/deep_code/tests/api/__init__.py b/deep_code/tests/api/__init__.py new file mode 100644 index 0000000..8b75a76 --- /dev/null +++ b/deep_code/tests/api/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) 2024 by Brockmann Consult GmbH +# Permissions are hereby granted under the terms of the MIT License: +# https://opensource.org/licenses/MIT. \ No newline at end of file diff --git a/deep_code/tests/utils/__init__.py b/deep_code/tests/utils/__init__.py new file mode 100644 index 0000000..8b75a76 --- /dev/null +++ b/deep_code/tests/utils/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) 2024 by Brockmann Consult GmbH +# Permissions are hereby granted under the terms of the MIT License: +# https://opensource.org/licenses/MIT. \ No newline at end of file diff --git a/deep_code/utils/__init__.py b/deep_code/utils/__init__.py new file mode 100644 index 0000000..8b75a76 --- /dev/null +++ b/deep_code/utils/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) 2024 by Brockmann Consult GmbH +# Permissions are hereby granted under the terms of the MIT License: +# https://opensource.org/licenses/MIT. \ No newline at end of file From b9ddbec5302d795914d999e9477bc7c6422af296 Mon Sep 17 00:00:00 2001 From: tejas Date: Thu, 2 Jan 2025 14:54:30 +0100 Subject: [PATCH 31/63] introduced logging --- deep_code/utils/github_automation.py | 62 ++++++++++++++++++---------- 1 file changed, 40 insertions(+), 22 deletions(-) diff --git a/deep_code/utils/github_automation.py b/deep_code/utils/github_automation.py index fb474c7..7f1662f 100644 --- a/deep_code/utils/github_automation.py +++ b/deep_code/utils/github_automation.py @@ -4,10 +4,11 @@ # Permissions are hereby granted under the terms of the MIT License: # https://opensource.org/licenses/MIT. 
-import os import json -import subprocess +import logging +import os import requests +import subprocess from pathlib import Path @@ -33,29 +34,36 @@ def __init__(self, username: str, token: str, repo_owner: str, repo_name: str): def fork_repository(self): """Fork the repository to the user's GitHub account.""" - print("Forking repository...") + logging.info("Forking repository...") url = f"https://api.github.com/repos/{self.repo_owner}/{self.repo_name}/forks" headers = {"Authorization": f"token {self.token}"} response = requests.post(url, headers=headers) response.raise_for_status() - print(f"Repository forked to {self.username}/{self.repo_name}") + logging.info(f"Repository forked to {self.username}/{self.repo_name}") def clone_repository(self): """Clone the forked repository locally.""" - print("Cloning forked repository...") - subprocess.run( - ["git", "clone", self.fork_repo_url, self.local_clone_dir], check=True - ) - os.chdir(self.local_clone_dir) + logging.info("Cloning forked repository...") + try: + subprocess.run( + ["git", "clone", self.fork_repo_url, self.local_clone_dir], check=True + ) + os.chdir(self.local_clone_dir) + except subprocess.CalledProcessError as e: + raise RuntimeError(f"Failed to clone repository: {e}") - def create_branch(self, branch_name: str): + @staticmethod + def create_branch(branch_name: str): """Create a new branch in the local repository.""" - print(f"Creating new branch: {branch_name}...") - subprocess.run(["git", "checkout", "-b", branch_name], check=True) + logging.info(f"Creating new branch: {branch_name}...") + try: + subprocess.run(["git", "checkout", "-b", branch_name], check=True) + except subprocess.CalledProcessError as e: + raise RuntimeError(f"Failed Creating branch: '{branch_name}': {e}") def add_file(self, file_path: str, content): """Add a new file to the local repository.""" - print(f"Adding new file: {file_path}...") + logging.info(f"Adding new file: {file_path}...") full_path = Path(self.local_clone_dir) / 
file_path full_path.parent.mkdir(parents=True, exist_ok=True) with open(full_path, "w") as f: @@ -63,19 +71,26 @@ def add_file(self, file_path: str, content): if hasattr(content, "to_dict"): content = content.to_dict() f.write(json.dumps(content, indent=2)) - subprocess.run(["git", "add", str(full_path)], check=True) + try: + subprocess.run(["git", "add", str(full_path)], check=True) + except subprocess.CalledProcessError as e: + raise RuntimeError(f"Failed to add file '{file_path}': {e}") - def commit_and_push(self, branch_name: str, commit_message: str): + @staticmethod + def commit_and_push(branch_name: str, commit_message: str): """Commit changes and push to the forked repository.""" - print("Committing and pushing changes...") - subprocess.run(["git", "commit", "-m", commit_message], check=True) - subprocess.run(["git", "push", "-u", "origin", branch_name], check=True) + logging.info("Committing and pushing changes...") + try: + subprocess.run(["git", "commit", "-m", commit_message], check=True) + subprocess.run(["git", "push", "-u", "origin", branch_name], check=True) + except subprocess.CalledProcessError as e: + raise RuntimeError(f"Failed to commit and push: {e}") def create_pull_request( self, branch_name: str, pr_title: str, pr_body: str, base_branch: str = "main" ): """Create a pull request from the forked repository to the base repository.""" - print("Creating a pull request...") + logging.info("Creating a pull request...") url = f"https://api.github.com/repos/{self.repo_owner}/{self.repo_name}/pulls" headers = {"Authorization": f"token {self.token}"} data = { @@ -87,10 +102,13 @@ def create_pull_request( response = requests.post(url, headers=headers, json=data) response.raise_for_status() pr_url = response.json()["html_url"] - print(f"Pull request created: {pr_url}") + logging.info(f"Pull request created: {pr_url}") def clean_up(self): """Clean up the local cloned repository.""" - print("Cleaning up local repository...") + logging.info("Cleaning up 
local repository...") os.chdir("..") - subprocess.run(["rm", "-rf", self.local_clone_dir]) + try: + subprocess.run(["rm", "-rf", self.local_clone_dir]) + except subprocess.CalledProcessError as e: + raise RuntimeError(f"Failed to clean-up local repository: {e}") From 43cddde7b1d9e44f3d5b03a3d5b36df7e24ef665 Mon Sep 17 00:00:00 2001 From: tejas Date: Thu, 2 Jan 2025 14:55:48 +0100 Subject: [PATCH 32/63] unit test for publish api --- deep_code/tests/api/test_publish.py | 206 ++++++++++++++++------------ 1 file changed, 115 insertions(+), 91 deletions(-) diff --git a/deep_code/tests/api/test_publish.py b/deep_code/tests/api/test_publish.py index 5b3a5da..9c746c7 100644 --- a/deep_code/tests/api/test_publish.py +++ b/deep_code/tests/api/test_publish.py @@ -1,103 +1,127 @@ -import unittest +import os + +import pytest from unittest.mock import patch, MagicMock, mock_open -import yaml from deep_code.api.publish import ProductPublisher +class TestProductPublisher: + + @patch("deep_code.api.publish.fsspec.open") + def test_init_missing_credentials(self, mock_fsspec_open): + mock_fsspec_open.return_value.__enter__.return_value = mock_open(read_data="{}")() + + with pytest.raises(ValueError, match="GitHub credentials are missing in the git.yaml file."): + ProductPublisher("/path/to/git.yaml") -class TestProductPublisher(unittest.TestCase): - @patch("fsspec.open") - @patch("deep_code.utils.dataset_stac_generator.OSCProductSTACGenerator") @patch("deep_code.utils.github_automation.GitHubAutomation") - def test_publish_product_success( - self, mock_github_automation, mock_stac_generator, mock_fsspec_open - ): - # Mock the git.yaml configuration - git_config = {"github-username": "test-user", "github-token": "test-token"} + @patch("deep_code.api.publish.fsspec.open") + def test_init_with_credentials(self, mock_fsspec_open): + git_yaml_content = """ + github-username: test-user + github-token: test-token + """ + mock_fsspec_open.return_value.__enter__.return_value = mock_open( + 
read_data=git_yaml_content)() + + publisher = ProductPublisher("/path/to/git.yaml") + + assert publisher.github_username == "test-user" + assert publisher.github_token == "test-token" + + @patch("deep_code.api.publish.fsspec.open") + def test_publish_product_missing_ids(self, mock_fsspec_open): + git_yaml_content = """ + github-username: test-user + github-token: test-token + """ + dataset_yaml_content = """ + collection-id: test-collection + """ mock_fsspec_open.side_effect = [ - mock_open(read_data=yaml.dump(git_config)).return_value, - mock_open( - read_data=yaml.dump( - { - "dataset-id": "test-dataset", - "collection-id": "test-collection", - "documentation-link": "http://example.com/doc", - "access-link": "http://example.com/access", - "dataset-status": "active", - "dataset-region": "region-1", - "dataset-theme": ["theme-1", "theme-2"], - } - ) - ).return_value, + mock_open(read_data=git_yaml_content)(), + mock_open(read_data=dataset_yaml_content)() ] - # Mock the STAC generator - mock_generator_instance = mock_stac_generator.return_value + publisher = ProductPublisher("/path/to/git.yaml") + + with pytest.raises(ValueError, + match="Dataset ID or Collection ID is missing in the " + "dataset-config.yaml file."): + publisher.publish_product("/path/to/dataset-config.yaml") + + @patch("os.makedirs") + @patch("subprocess.run") + @patch("requests.post") + @patch("deep_code.utils.github_automation.GitHubAutomation") + @patch("deep_code.api.publish.fsspec.open") + def test_publish_product_success(self, mock_fsspec_open, mock_github_automation, + mock_requests_post, mock_subprocess_run, + mock_makedirs): + git_yaml_content = """ + github-username: test-user + github-token: test-token + """ + dataset_yaml_content = """ + dataset-id: test-dataset + collection-id: test-collection + documentation-link: http://example.com/doc + access-link: http://example.com/access + dataset-status: ongoing + dataset-region: Global + dataset-theme: ["climate"] + cf-parameter: [] + """ + 
mock_fsspec_open.side_effect = [ + mock_open(read_data=git_yaml_content)(), + mock_open(read_data=dataset_yaml_content)() + ] + + # Mock GitHubAutomation methods + mock_git = mock_github_automation.return_value + mock_git.fork_repository.return_value = None + mock_git.clone_repository.return_value = None + mock_git.create_branch.return_value = None + mock_git.add_file.return_value = None + mock_git.commit_and_push.return_value = None + mock_git.create_pull_request.return_value = "http://example.com/pr" + + # Mock requests.post for GitHub API calls + mock_response = MagicMock() + mock_response.raise_for_status.return_value = None + mock_requests_post.return_value = mock_response + + # Mock subprocess.run for git commands + mock_subprocess_run.return_value = None + + home_dir = os.path.expanduser("~") + local_clone_dir = os.path.join(home_dir, "temp_repo") + + # Mock os.makedirs to avoid real directory creation + mock_makedirs.return_value = None + + # Mock OSCProductSTACGenerator mock_collection = MagicMock() - mock_generator_instance.build_stac_collection.return_value = mock_collection - - # Mock the GitHub automation - mock_github_instance = mock_github_automation.return_value - - # Create an instance of ProductPublisher - publisher = ProductPublisher("path/to/git_config.yaml") - publisher.github_automation = mock_github_instance - - # Call the publish_product method - publisher.publish_product("path/to/dataset_config.yaml") - - # Assertions for GitHub automation - mock_github_instance.fork_repository.assert_called_once() - mock_github_instance.clone_repository.assert_called_once() - mock_github_instance.create_branch.assert_called_once_with( - "new-branch-name" - ) # Replace with actual branch name - mock_github_instance.add_file.assert_called_once_with( - "products/test-collection/collection.json", mock_collection.to_dict() - ) - mock_github_instance.commit_and_push.assert_called_once_with( - "new-branch-name", "Add new collection: test-collection" - ) - 
mock_github_instance.create_pull_request.assert_called_once_with( - "new-branch-name", - "Add new collection", - "This PR adds a new collection to the repository.", - ) - mock_github_instance.clean_up.assert_called_once() - - # Assertions for STAC generator - mock_stac_generator.assert_called_once_with( - dataset_id="test-dataset", - collection_id="test-collection", - documentation_link="http://example.com/doc", - access_link="http://example.com/access", - osc_status="active", - osc_region="region-1", - osc_themes=["theme-1", "theme-2"], - ) - mock_generator_instance.build_stac_collection.assert_called_once() - - @patch("fsspec.open", mock_open(read_data="{}")) - def test_publish_product_missing_config(self): - # Test for missing dataset-id or collection-id - with self.assertRaises(ValueError) as context: - publisher = ProductPublisher("path/to/git_config.yaml") - publisher.publish_product("path/to/dataset_config.yaml") - self.assertEqual( - str(context.exception), - "Dataset ID or Collection ID is missing in the dataset-config.yaml file.", - ) - - def test_missing_git_credentials(self): - # Test for missing GitHub credentials - with patch("fsspec.open", mock_open(read_data="{}")): - with self.assertRaises(ValueError) as context: - ProductPublisher("path/to/git_config.yaml") - self.assertEqual( - str(context.exception), - "GitHub credentials are missing in the git.yaml file.", - ) + mock_collection.to_dict.return_value = { + "type": "Collection", + "id": "test-collection", + "description": "A test STAC collection", + "extent": { + "spatial": {"bbox": [[-180.0, -90.0, 180.0, 90.0]]}, + "temporal": {"interval": [["2023-01-01T00:00:00Z", None]]}, + }, + "links": [], + "stac_version": "1.0.0", + } + + with patch("deep_code.api.publish.OSCProductSTACGenerator") as mock_generator: + mock_generator.return_value.build_stac_collection.return_value = mock_collection + publisher = ProductPublisher("/path/to/git.yaml") + 
publisher.publish_product("/path/to/dataset-config.yaml") + + auth_url = "https://test-user:test-token@github.com/test-user/open-science-catalog-metadata-testing.git" + mock_subprocess_run.assert_any_call( + ["git", "clone", auth_url, local_clone_dir], check=True + ) -if __name__ == "__main__": - unittest.main() From be055af2f87cac9ab4170ca05e2de62edb21a8ab Mon Sep 17 00:00:00 2001 From: tejas Date: Thu, 2 Jan 2025 14:56:17 +0100 Subject: [PATCH 33/63] unit test workflow --- .github/workflows/unittest-workflow.yaml | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) create mode 100644 .github/workflows/unittest-workflow.yaml diff --git a/.github/workflows/unittest-workflow.yaml b/.github/workflows/unittest-workflow.yaml new file mode 100644 index 0000000..c54e45d --- /dev/null +++ b/.github/workflows/unittest-workflow.yaml @@ -0,0 +1,23 @@ +name: Unittest deep-code + +on: + push: + release: + types: [published] + +jobs: + unittest: + runs-on: ubuntu-latest + steps: + - name: checkout deep-code + uses: actions/checkout@v4 + + - name: Set up MicroMamba + uses: mamba-org/setup-micromamba@v1 + with: + environment-file: environment.yml + + - name: Run unit tests + shell: bash -l {0} + run: | + pytest --cov=deep_code --cov-report=xml From bdc8bf1a44dd10621bc85c138127431fec24a09a Mon Sep 17 00:00:00 2001 From: tejas Date: Thu, 2 Jan 2025 15:28:54 +0100 Subject: [PATCH 34/63] updated unit tests and workflow --- .github/workflows/unittest-workflow.yaml | 1 + deep_code/tests/api/test_publish.py | 15 --------------- 2 files changed, 1 insertion(+), 15 deletions(-) diff --git a/.github/workflows/unittest-workflow.yaml b/.github/workflows/unittest-workflow.yaml index c54e45d..5ffb4b4 100644 --- a/.github/workflows/unittest-workflow.yaml +++ b/.github/workflows/unittest-workflow.yaml @@ -20,4 +20,5 @@ jobs: - name: Run unit tests shell: bash -l {0} run: | + cd /home/runner/work/deep-code/deep-code pytest --cov=deep_code --cov-report=xml diff --git 
a/deep_code/tests/api/test_publish.py b/deep_code/tests/api/test_publish.py index 9c746c7..eca0ae6 100644 --- a/deep_code/tests/api/test_publish.py +++ b/deep_code/tests/api/test_publish.py @@ -14,21 +14,6 @@ def test_init_missing_credentials(self, mock_fsspec_open): with pytest.raises(ValueError, match="GitHub credentials are missing in the git.yaml file."): ProductPublisher("/path/to/git.yaml") - @patch("deep_code.utils.github_automation.GitHubAutomation") - @patch("deep_code.api.publish.fsspec.open") - def test_init_with_credentials(self, mock_fsspec_open): - git_yaml_content = """ - github-username: test-user - github-token: test-token - """ - mock_fsspec_open.return_value.__enter__.return_value = mock_open( - read_data=git_yaml_content)() - - publisher = ProductPublisher("/path/to/git.yaml") - - assert publisher.github_username == "test-user" - assert publisher.github_token == "test-token" - @patch("deep_code.api.publish.fsspec.open") def test_publish_product_missing_ids(self, mock_fsspec_open): git_yaml_content = """ From b5863a69b23cc52d0dcbdc81cf35a4cfcbc36a9f Mon Sep 17 00:00:00 2001 From: tejas Date: Thu, 2 Jan 2025 15:35:03 +0100 Subject: [PATCH 35/63] updated workflow --- .github/workflows/unittest-workflow.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/unittest-workflow.yaml b/.github/workflows/unittest-workflow.yaml index 5ffb4b4..9e915dd 100644 --- a/.github/workflows/unittest-workflow.yaml +++ b/.github/workflows/unittest-workflow.yaml @@ -17,6 +17,11 @@ jobs: with: environment-file: environment.yml + - name: Install deep-code in editable mode + run: | + cd /home/runner/work/deep-code/deep-code + pip install -e . 
+ - name: Run unit tests shell: bash -l {0} run: | From 94bad29c3f2ec6b5cbef73349907be3ed258d9b9 Mon Sep 17 00:00:00 2001 From: tejas Date: Thu, 2 Jan 2025 15:43:44 +0100 Subject: [PATCH 36/63] updated workflow --- .github/workflows/unittest-workflow.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/unittest-workflow.yaml b/.github/workflows/unittest-workflow.yaml index 9e915dd..0bc3338 100644 --- a/.github/workflows/unittest-workflow.yaml +++ b/.github/workflows/unittest-workflow.yaml @@ -18,6 +18,7 @@ jobs: environment-file: environment.yml - name: Install deep-code in editable mode + shell: bash -l {0} run: | cd /home/runner/work/deep-code/deep-code pip install -e . From 1ef4c9d92ed90a0013002a112b77fcb878a7dd1b Mon Sep 17 00:00:00 2001 From: tejas Date: Thu, 2 Jan 2025 16:54:24 +0100 Subject: [PATCH 37/63] updated ENV --- environment.yml | 6 +++--- pyproject.toml | 2 ++ 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/environment.yml b/environment.yml index d192aae..b52434a 100644 --- a/environment.yml +++ b/environment.yml @@ -3,7 +3,7 @@ channels: - conda-forge dependencies: # Required - - python >=3.9 + - python >=3.10 - click - fsspec - jsonschema @@ -13,6 +13,6 @@ dependencies: - pyyaml - xcube # test dependencies + - numpy - pytest - - pytest-cov - - numpy \ No newline at end of file + - pytest-cov \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index c44ba3f..057f7b8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,6 +23,7 @@ dependencies = [ "fsspec", "jsonschema", "requests", + "pandas", "pystac", "pyyaml", "xcube-core" @@ -41,6 +42,7 @@ exclude = [ dev = [ "black", "flake8", + "numpy", "pytest", "pytest-cov", "pytest-recording" From d03479c5d58a7d255e4c8df73f1fcd2b61905091 Mon Sep 17 00:00:00 2001 From: tejas Date: Thu, 2 Jan 2025 22:35:18 +0100 Subject: [PATCH 38/63] updated unit test --- deep_code/tests/api/test_publish.py | 88 ++++++++++++++++------------- 1 file changed, 48 
insertions(+), 40 deletions(-) diff --git a/deep_code/tests/api/test_publish.py b/deep_code/tests/api/test_publish.py index eca0ae6..4fc3c8c 100644 --- a/deep_code/tests/api/test_publish.py +++ b/deep_code/tests/api/test_publish.py @@ -1,5 +1,3 @@ -import os - import pytest from unittest.mock import patch, MagicMock, mock_open @@ -35,31 +33,41 @@ def test_publish_product_missing_ids(self, mock_fsspec_open): "dataset-config.yaml file."): publisher.publish_product("/path/to/dataset-config.yaml") - @patch("os.makedirs") - @patch("subprocess.run") + @patch("deep_code.utils.github_automation.os.chdir") + @patch("deep_code.utils.github_automation.subprocess.run") + @patch("deep_code.utils.github_automation.os.path.expanduser", + return_value="/tmp") @patch("requests.post") @patch("deep_code.utils.github_automation.GitHubAutomation") @patch("deep_code.api.publish.fsspec.open") - def test_publish_product_success(self, mock_fsspec_open, mock_github_automation, - mock_requests_post, mock_subprocess_run, - mock_makedirs): + def test_publish_product_success( + self, + mock_fsspec_open, + mock_github_automation, + mock_requests_post, + mock_expanduser, + mock_subprocess_run, + mock_chdir + ): + + # Mock the YAML reads git_yaml_content = """ - github-username: test-user - github-token: test-token - """ + github-username: test-user + github-token: test-token + """ dataset_yaml_content = """ - dataset-id: test-dataset - collection-id: test-collection - documentation-link: http://example.com/doc - access-link: http://example.com/access - dataset-status: ongoing - dataset-region: Global - dataset-theme: ["climate"] - cf-parameter: [] - """ + dataset-id: test-dataset + collection-id: test-collection + documentation-link: http://example.com/doc + access-link: http://example.com/access + dataset-status: ongoing + dataset-region: Global + dataset-theme: ["climate"] + cf-parameter: [] + """ mock_fsspec_open.side_effect = [ mock_open(read_data=git_yaml_content)(), - 
mock_open(read_data=dataset_yaml_content)() + mock_open(read_data=dataset_yaml_content)(), ] # Mock GitHubAutomation methods @@ -70,22 +78,13 @@ def test_publish_product_success(self, mock_fsspec_open, mock_github_automation, mock_git.add_file.return_value = None mock_git.commit_and_push.return_value = None mock_git.create_pull_request.return_value = "http://example.com/pr" + mock_git.clean_up.return_value = None - # Mock requests.post for GitHub API calls - mock_response = MagicMock() - mock_response.raise_for_status.return_value = None - mock_requests_post.return_value = mock_response - - # Mock subprocess.run for git commands + # Mock subprocess.run & os.chdir mock_subprocess_run.return_value = None + mock_chdir.return_value = None - home_dir = os.path.expanduser("~") - local_clone_dir = os.path.join(home_dir, "temp_repo") - - # Mock os.makedirs to avoid real directory creation - mock_makedirs.return_value = None - - # Mock OSCProductSTACGenerator + # Mock STAC generator mock_collection = MagicMock() mock_collection.to_dict.return_value = { "type": "Collection", @@ -98,15 +97,24 @@ def test_publish_product_success(self, mock_fsspec_open, mock_github_automation, "links": [], "stac_version": "1.0.0", } - with patch("deep_code.api.publish.OSCProductSTACGenerator") as mock_generator: mock_generator.return_value.build_stac_collection.return_value = mock_collection - publisher = ProductPublisher("/path/to/git.yaml") - publisher.publish_product("/path/to/dataset-config.yaml") + # Instantiate & publish + publisher = ProductPublisher("/fake/path/to/git.yaml") + publisher.publish_product("/fake/path/to/dataset-config.yaml") + + # 6Assert that we called git clone with /tmp/temp_repo + # Because expanduser("~") is now patched to /tmp, the actual path is /tmp/temp_repo + auth_url = "https://test-user:test-token@github.com/test-user/open-science-catalog-metadata-testing.git" + mock_subprocess_run.assert_any_call( + ["git", "clone", auth_url, "/tmp/temp_repo"], + check=True + 
) + + # Also confirm we changed directories to /tmp/temp_repo + mock_chdir.assert_any_call("/tmp/temp_repo") + + - auth_url = "https://test-user:test-token@github.com/test-user/open-science-catalog-metadata-testing.git" - mock_subprocess_run.assert_any_call( - ["git", "clone", auth_url, local_clone_dir], check=True - ) From 1749ae81336bb37afe5f05bce00d84d6647a96dd Mon Sep 17 00:00:00 2001 From: tejas Date: Thu, 2 Jan 2025 23:49:28 +0100 Subject: [PATCH 39/63] upload codecov --- .github/workflows/unittest-workflow.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/workflows/unittest-workflow.yaml b/.github/workflows/unittest-workflow.yaml index 0bc3338..10c9cb0 100644 --- a/.github/workflows/unittest-workflow.yaml +++ b/.github/workflows/unittest-workflow.yaml @@ -28,3 +28,9 @@ jobs: run: | cd /home/runner/work/deep-code/deep-code pytest --cov=deep_code --cov-report=xml + + - name: Upload coverage reports to Codecov + uses: codecov/codecov-action@v5 + with: + token: ${{ secrets.CODECOV_TOKEN }} + slug: deepesdl/deep-code \ No newline at end of file From ecc795145f1806369120fb3c66a800c567b385aa Mon Sep 17 00:00:00 2001 From: tejas Date: Thu, 2 Jan 2025 23:53:54 +0100 Subject: [PATCH 40/63] badges --- README.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 405e0f2..7a04464 100644 --- a/README.md +++ b/README.md @@ -1 +1,6 @@ -# deep-code \ No newline at end of file +# deep-code + +[![Build Status](https://github.com/deepesdl/deep-code/actions/workflows/unittest-workflow.yml/badge.svg?branch=main)](https://github.com/deepesdl/deep-code/actions/workflows/unittest-workflow.yaml) +[![codecov](https://codecov.io/gh/deepesdl/deep-code/graph/badge.svg?token=ktcp1maEgz)](https://codecov.io/gh/deepesdl/deep-code) +[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) 
+[![License](https://img.shields.io/github/license/dcs4cop/xcube-smos)](https://github.com/deepesdl/deep-code/blob/main/LICENSE) \ No newline at end of file From 92537dfe72a36de7e79dd0013052c8cd1bdaa704 Mon Sep 17 00:00:00 2001 From: tejas Date: Thu, 2 Jan 2025 23:57:41 +0100 Subject: [PATCH 41/63] badges --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 7a04464..c8f24c9 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,7 @@ # deep-code -[![Build Status](https://github.com/deepesdl/deep-code/actions/workflows/unittest-workflow.yml/badge.svg?branch=main)](https://github.com/deepesdl/deep-code/actions/workflows/unittest-workflow.yaml) +[![Build Status](https://github.com/deepesdl/deep-code/actions/workflows/unittest +-workflow.yaml/badge.svg)](https://github.com/deepesdl/deep-code/actions/workflows/unittest-workflow.yaml) [![codecov](https://codecov.io/gh/deepesdl/deep-code/graph/badge.svg?token=ktcp1maEgz)](https://codecov.io/gh/deepesdl/deep-code) [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) [![License](https://img.shields.io/github/license/dcs4cop/xcube-smos)](https://github.com/deepesdl/deep-code/blob/main/LICENSE) \ No newline at end of file From d301f8298638dbb67adbc9658b4772cf2c20154c Mon Sep 17 00:00:00 2001 From: tejas Date: Fri, 3 Jan 2025 00:01:36 +0100 Subject: [PATCH 42/63] code formatting --- deep_code/api/publish.py | 4 +-- deep_code/tests/api/__init__.py | 2 +- deep_code/tests/api/test_publish.py | 48 +++++++++++++++-------------- deep_code/tests/utils/__init__.py | 2 +- deep_code/utils/__init__.py | 2 +- 5 files changed, 30 insertions(+), 28 deletions(-) diff --git a/deep_code/api/publish.py b/deep_code/api/publish.py index 617d095..8714bed 100644 --- a/deep_code/api/publish.py +++ b/deep_code/api/publish.py @@ -8,9 +8,9 @@ import logging import yaml -from deep_code.utils.github_automation import GitHubAutomation -from 
deep_code.utils.dataset_stac_generator import OSCProductSTACGenerator from deep_code.constants import OSC_REPO_OWNER, OSC_REPO_NAME, OSC_BRANCH_NAME +from deep_code.utils.dataset_stac_generator import OSCProductSTACGenerator +from deep_code.utils.github_automation import GitHubAutomation logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO) diff --git a/deep_code/tests/api/__init__.py b/deep_code/tests/api/__init__.py index 8b75a76..6f1b5dc 100644 --- a/deep_code/tests/api/__init__.py +++ b/deep_code/tests/api/__init__.py @@ -1,3 +1,3 @@ # Copyright (c) 2024 by Brockmann Consult GmbH # Permissions are hereby granted under the terms of the MIT License: -# https://opensource.org/licenses/MIT. \ No newline at end of file +# https://opensource.org/licenses/MIT. diff --git a/deep_code/tests/api/test_publish.py b/deep_code/tests/api/test_publish.py index 4fc3c8c..e042aa6 100644 --- a/deep_code/tests/api/test_publish.py +++ b/deep_code/tests/api/test_publish.py @@ -3,13 +3,17 @@ from deep_code.api.publish import ProductPublisher -class TestProductPublisher: +class TestProductPublisher: @patch("deep_code.api.publish.fsspec.open") def test_init_missing_credentials(self, mock_fsspec_open): - mock_fsspec_open.return_value.__enter__.return_value = mock_open(read_data="{}")() + mock_fsspec_open.return_value.__enter__.return_value = mock_open( + read_data="{}" + )() - with pytest.raises(ValueError, match="GitHub credentials are missing in the git.yaml file."): + with pytest.raises( + ValueError, match="GitHub credentials are missing in the git.yaml file." 
+ ): ProductPublisher("/path/to/git.yaml") @patch("deep_code.api.publish.fsspec.open") @@ -23,31 +27,32 @@ def test_publish_product_missing_ids(self, mock_fsspec_open): """ mock_fsspec_open.side_effect = [ mock_open(read_data=git_yaml_content)(), - mock_open(read_data=dataset_yaml_content)() + mock_open(read_data=dataset_yaml_content)(), ] publisher = ProductPublisher("/path/to/git.yaml") - with pytest.raises(ValueError, - match="Dataset ID or Collection ID is missing in the " - "dataset-config.yaml file."): + with pytest.raises( + ValueError, + match="Dataset ID or Collection ID is missing in the " + "dataset-config.yaml file.", + ): publisher.publish_product("/path/to/dataset-config.yaml") @patch("deep_code.utils.github_automation.os.chdir") @patch("deep_code.utils.github_automation.subprocess.run") - @patch("deep_code.utils.github_automation.os.path.expanduser", - return_value="/tmp") + @patch("deep_code.utils.github_automation.os.path.expanduser", return_value="/tmp") @patch("requests.post") @patch("deep_code.utils.github_automation.GitHubAutomation") @patch("deep_code.api.publish.fsspec.open") def test_publish_product_success( - self, - mock_fsspec_open, - mock_github_automation, - mock_requests_post, - mock_expanduser, - mock_subprocess_run, - mock_chdir + self, + mock_fsspec_open, + mock_github_automation, + mock_requests_post, + mock_expanduser, + mock_subprocess_run, + mock_chdir, ): # Mock the YAML reads @@ -98,7 +103,9 @@ def test_publish_product_success( "stac_version": "1.0.0", } with patch("deep_code.api.publish.OSCProductSTACGenerator") as mock_generator: - mock_generator.return_value.build_stac_collection.return_value = mock_collection + mock_generator.return_value.build_stac_collection.return_value = ( + mock_collection + ) # Instantiate & publish publisher = ProductPublisher("/fake/path/to/git.yaml") @@ -108,13 +115,8 @@ def test_publish_product_success( # Because expanduser("~") is now patched to /tmp, the actual path is /tmp/temp_repo auth_url = 
"https://test-user:test-token@github.com/test-user/open-science-catalog-metadata-testing.git" mock_subprocess_run.assert_any_call( - ["git", "clone", auth_url, "/tmp/temp_repo"], - check=True + ["git", "clone", auth_url, "/tmp/temp_repo"], check=True ) # Also confirm we changed directories to /tmp/temp_repo mock_chdir.assert_any_call("/tmp/temp_repo") - - - - diff --git a/deep_code/tests/utils/__init__.py b/deep_code/tests/utils/__init__.py index 8b75a76..6f1b5dc 100644 --- a/deep_code/tests/utils/__init__.py +++ b/deep_code/tests/utils/__init__.py @@ -1,3 +1,3 @@ # Copyright (c) 2024 by Brockmann Consult GmbH # Permissions are hereby granted under the terms of the MIT License: -# https://opensource.org/licenses/MIT. \ No newline at end of file +# https://opensource.org/licenses/MIT. diff --git a/deep_code/utils/__init__.py b/deep_code/utils/__init__.py index 8b75a76..6f1b5dc 100644 --- a/deep_code/utils/__init__.py +++ b/deep_code/utils/__init__.py @@ -1,3 +1,3 @@ # Copyright (c) 2024 by Brockmann Consult GmbH # Permissions are hereby granted under the terms of the MIT License: -# https://opensource.org/licenses/MIT. \ No newline at end of file +# https://opensource.org/licenses/MIT. 
From 40e94e079f3eac06ba6ade13eabb10fd590eb0df Mon Sep 17 00:00:00 2001 From: tejas Date: Mon, 6 Jan 2025 15:50:57 +0100 Subject: [PATCH 43/63] updated README.md --- README.md | 99 ++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 95 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index c8f24c9..756fd2b 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,98 @@ # deep-code -[![Build Status](https://github.com/deepesdl/deep-code/actions/workflows/unittest --workflow.yaml/badge.svg)](https://github.com/deepesdl/deep-code/actions/workflows/unittest-workflow.yaml) -[![codecov](https://codecov.io/gh/deepesdl/deep-code/graph/badge.svg?token=ktcp1maEgz)](https://codecov.io/gh/deepesdl/deep-code) +[![Build Status](https://github.com/deepesdl/deep-code/actions/workflows/unittest-workflow.yaml/badge.svg)](https://github.com/deepesdl/deep-code/actions/workflows/unittest-workflow.yaml) +[![codecov](https://codecov.io/gh/deepesdl/deep-code/graph/badge.svg?token=47MQXOXWOK)](https://codecov.io/gh/deepesdl/deep-code) [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) -[![License](https://img.shields.io/github/license/dcs4cop/xcube-smos)](https://github.com/deepesdl/deep-code/blob/main/LICENSE) \ No newline at end of file +[![License](https://img.shields.io/github/license/dcs4cop/xcube-smos)](https://github.com/deepesdl/deep-code/blob/main/LICENSE) + +`deep-code` is a lightweight python tool that comprises a command line interface(CLI) +and Python API that implements utilities that aid integration of DeepESDL datasets, +experiments with EarthCODE. + +## Setup + +## Installing from the repository + +To install deep-code directly from the git repository, clone the repository, and +execute the steps below: + +```commandline +conda env create -f environment.yml +conda activate deep-code +pip install -e . 
+``` + +This installs all the dependencies of `deep-code` into a fresh conda environment, +and installs deep-code from the repository into the same environment. + +## Testing + +To run the unit test suite: + +```commandline +pytest +``` + +To analyze test coverage +```shell +pytest --cov=deep-code +``` + +To produce an HTML coverage report + +```commandline +pytest --cov-report html --cov=deep-code +``` + +## deep_code usage + +`deep_code` provides a command-line tool called deep-code, which has several subcommands +providing different utility functions. +Use the --help option with these subcommands to get more details on usage. + +### deep-code publish-product + +Publish a dataset which is a result of a experiment to the EarthCODE +open-science catalog. + +```commandline + deep-code publish-product --help + ``` + +``` +Usage: deep-code publish-product [OPTIONS] + + Command-line interface for the ProductPublisher API. + +Options: + --git-config PATH Path to the git.yaml file with GitHub credentials. + [required] + --dataset-config PATH Path to the dataset-config.yaml file with dataset + information. [required] + --help Show this message and exit. + +``` + +#### git.yaml example + +``` +github-username: your-git-user +github-token: personal access token +``` + +#### dataset-config.yaml example + +``` +dataset-id: hydrology-1D-0.009deg-100x60x60-3.0.2.zarr +collection-id: hydrology + +#non-mandatory +documentation-link: https://deepesdl.readthedocs.io/en/latest/datasets/hydrology-1D-0-009deg-100x60x60-3-0-2-zarr/ +access-link: s3://test +dataset-status: completed +dataset-region: global +dataset-theme: ["ocean", "environment"] +cf-parameter: [{"Name" : "hydrology"}] +``` + +dataset-id has to be a valid dataset-id from `deep-esdl-public` s3 or your team bucket. 
\ No newline at end of file From 3fda30d209b72d2ff00265f666915270d9d0b691 Mon Sep 17 00:00:00 2001 From: tejas Date: Thu, 9 Jan 2025 10:10:23 +0100 Subject: [PATCH 44/63] extended get_spatial_extent to handle ds with latitude and longitude coords --- deep_code/utils/dataset_stac_generator.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/deep_code/utils/dataset_stac_generator.py b/deep_code/utils/dataset_stac_generator.py index d02d795..1b5a077 100644 --- a/deep_code/utils/dataset_stac_generator.py +++ b/deep_code/utils/dataset_stac_generator.py @@ -124,7 +124,7 @@ def _open_dataset(self): def _get_spatial_extent(self) -> SpatialExtent: """Extract spatial extent from the dataset.""" - if "lon" in self.dataset.coords and "lat" in self.dataset.coords: + if {"lon", "lat"}.issubset(self.dataset.coords): # For regular gridding lon_min, lon_max = ( float(self.dataset.lon.min()), @@ -135,7 +135,18 @@ def _get_spatial_extent(self) -> SpatialExtent: float(self.dataset.lat.max()), ) return SpatialExtent([[lon_min, lat_min, lon_max, lat_max]]) - elif "x" in self.dataset.coords and "y" in self.dataset.coords: + elif {"longitude", "latitude"}.issubset(self.dataset.coords): + # For regular gridding with 'longitude' and 'latitude' + lon_min, lon_max = ( + float(self.dataset.longitude.min()), + float(self.dataset.longitude.max()), + ) + lat_min, lat_max = ( + float(self.dataset.latitude.min()), + float(self.dataset.latitude.max()), + ) + return SpatialExtent([[lon_min, lat_min, lon_max, lat_max]]) + elif {"x", "y"}.issubset(self.dataset.coords): # For irregular gridding x_min, x_max = (float(self.dataset.x.min()), float(self.dataset.x.max())) y_min, y_max = (float(self.dataset.y.min()), float(self.dataset.y.max())) From c39b4be0053865e01935afadecd84c1ae9fec9a0 Mon Sep 17 00:00:00 2001 From: tejas Date: Thu, 9 Jan 2025 14:24:27 +0100 Subject: [PATCH 45/63] adapted doc strings to follow google style --- deep_code/api/publish.py | 20 ++++----- 
deep_code/cli/main.py | 4 +- deep_code/cli/publish.py | 10 ++--- deep_code/tests/api/test_publish.py | 18 ++++---- deep_code/utils/dataset_stac_generator.py | 51 +++++++++-------------- deep_code/utils/github_automation.py | 14 +++---- deep_code/utils/osc_extension.py | 10 ++--- 7 files changed, 58 insertions(+), 69 deletions(-) diff --git a/deep_code/api/publish.py b/deep_code/api/publish.py index 8714bed..e20a7a1 100644 --- a/deep_code/api/publish.py +++ b/deep_code/api/publish.py @@ -16,12 +16,13 @@ logging.basicConfig(level=logging.INFO) -class ProductPublisher: +class DatasetPublisher: + """Publishes products to a GitHub repository. + + Args: + git_config_path: Path to the YAML file containing GitHub credentials. + """ def __init__(self, git_config_path: str): - """ - Initialize the ProductPublisher class. - :param git_config_path: Path to the YAML file containing GitHub credentials - """ with fsspec.open(git_config_path, "r") as file: git_config = yaml.safe_load(file) or {} @@ -35,12 +36,11 @@ def __init__(self, git_config_path: str): self.github_username, self.github_token, OSC_REPO_OWNER, OSC_REPO_NAME ) - def publish_product(self, dataset_config_path: str): - """ - Publish a product collection to the specified GitHub repository. + def publish_dataset(self, dataset_config_path: str): + """Publish a product collection to the specified GitHub repository. 
- :param dataset_config_path: Path to the YAML file containing dataset - configuration + Args: + dataset_config_path: Path to the YAML file containing dataset config """ with fsspec.open(dataset_config_path, "r") as file: dataset_config = yaml.safe_load(file) diff --git a/deep_code/cli/main.py b/deep_code/cli/main.py index 4f11c44..3a781c0 100644 --- a/deep_code/cli/main.py +++ b/deep_code/cli/main.py @@ -6,7 +6,7 @@ import click -from deep_code.cli.publish import publish_product +from deep_code.cli.publish import publish_dataset @click.group() @@ -15,6 +15,6 @@ def main(): pass -main.add_command(publish_product) +main.add_command(publish_dataset) if __name__ == "__main__": main() diff --git a/deep_code/cli/publish.py b/deep_code/cli/publish.py index 5c127c6..5f2e609 100644 --- a/deep_code/cli/publish.py +++ b/deep_code/cli/publish.py @@ -6,10 +6,10 @@ import click -from deep_code.api.publish import ProductPublisher +from deep_code.api.publish import DatasetPublisher -@click.command(name="publish-product") +@click.command(name="publish-dataset") @click.option( "--git-config", required=True, @@ -22,9 +22,9 @@ type=click.Path(exists=True), help="Path to the dataset-config.yaml file with dataset information.", ) -def publish_product(git_config, dataset_config): +def publish_dataset(git_config, dataset_config): """ Command-line interface for the ProductPublisher API. 
""" - publisher = ProductPublisher(git_config_path=git_config) - publisher.publish_product(dataset_config_path=dataset_config) + publisher = DatasetPublisher(git_config_path=git_config) + publisher.publish_dataset(dataset_config_path=dataset_config) diff --git a/deep_code/tests/api/test_publish.py b/deep_code/tests/api/test_publish.py index e042aa6..be45a20 100644 --- a/deep_code/tests/api/test_publish.py +++ b/deep_code/tests/api/test_publish.py @@ -1,10 +1,10 @@ import pytest from unittest.mock import patch, MagicMock, mock_open -from deep_code.api.publish import ProductPublisher +from deep_code.api.publish import DatasetPublisher -class TestProductPublisher: +class TestDatasetPublisher: @patch("deep_code.api.publish.fsspec.open") def test_init_missing_credentials(self, mock_fsspec_open): mock_fsspec_open.return_value.__enter__.return_value = mock_open( @@ -14,10 +14,10 @@ def test_init_missing_credentials(self, mock_fsspec_open): with pytest.raises( ValueError, match="GitHub credentials are missing in the git.yaml file." 
): - ProductPublisher("/path/to/git.yaml") + DatasetPublisher("/path/to/git.yaml") @patch("deep_code.api.publish.fsspec.open") - def test_publish_product_missing_ids(self, mock_fsspec_open): + def test_publish_dataset_missing_ids(self, mock_fsspec_open): git_yaml_content = """ github-username: test-user github-token: test-token @@ -30,14 +30,14 @@ def test_publish_product_missing_ids(self, mock_fsspec_open): mock_open(read_data=dataset_yaml_content)(), ] - publisher = ProductPublisher("/path/to/git.yaml") + publisher = DatasetPublisher("/path/to/git.yaml") with pytest.raises( ValueError, match="Dataset ID or Collection ID is missing in the " "dataset-config.yaml file.", ): - publisher.publish_product("/path/to/dataset-config.yaml") + publisher.publish_dataset("/path/to/dataset-config.yaml") @patch("deep_code.utils.github_automation.os.chdir") @patch("deep_code.utils.github_automation.subprocess.run") @@ -45,7 +45,7 @@ def test_publish_product_missing_ids(self, mock_fsspec_open): @patch("requests.post") @patch("deep_code.utils.github_automation.GitHubAutomation") @patch("deep_code.api.publish.fsspec.open") - def test_publish_product_success( + def test_publish_dataset_success( self, mock_fsspec_open, mock_github_automation, @@ -108,8 +108,8 @@ def test_publish_product_success( ) # Instantiate & publish - publisher = ProductPublisher("/fake/path/to/git.yaml") - publisher.publish_product("/fake/path/to/dataset-config.yaml") + publisher = DatasetPublisher("/fake/path/to/git.yaml") + publisher.publish_dataset("/fake/path/to/dataset-config.yaml") # 6Assert that we called git clone with /tmp/temp_repo # Because expanduser("~") is now patched to /tmp, the actual path is /tmp/temp_repo diff --git a/deep_code/utils/dataset_stac_generator.py b/deep_code/utils/dataset_stac_generator.py index 1b5a077..ebcbf31 100644 --- a/deep_code/utils/dataset_stac_generator.py +++ b/deep_code/utils/dataset_stac_generator.py @@ -16,8 +16,18 @@ class OSCProductSTACGenerator: - """ - A class to 
generate OSC STAC Collections for a product from Zarr datasets. + """Generates OSC STAC Collections for a product from Zarr datasets. + + Args: + dataset_id: ID of the Zarr dataset. + collection_id: Unique identifier for the STAC collection. + access_link: Public access link to the dataset. + documentation_link: Link to dataset documentation. + osc_status: Status of the dataset (e.g., "ongoing"). + osc_region: Geographical region associated with the dataset. + osc_themes: List of themes related to the dataset (e.g., ["climate"]). + osc_missions: List of satellite missions associated with the dataset. + cf_params: CF metadata parameters for the dataset. """ def __init__( @@ -32,19 +42,6 @@ def __init__( osc_missions: list[str] | None = None, cf_params: list[dict[str]] | None = None, ): - """ - Initialize the generator with the path to the Zarr dataset and metadata. - - :param dataset_id: Path to the Zarr dataset. - :param collection_id: Unique ID for the collection. - :param access_link: Public access link to the dataset. - :param documentation_link: Link to documentation related to the dataset. - :param osc_status: Status of the dataset (e.g., "ongoing"). - :param osc_region: Geographical region of the dataset. - :param osc_themes: Themes of the dataset (e.g., ["climate", "environment"]). - :param osc_missions: Satellite mission to which dataset belongs. - :param cf_params: params related to CF metadata convention. 
- """ self.dataset_id = dataset_id self.collection_id = collection_id self.access_link = access_link or f"s3://deep-esdl-public/{dataset_id}" @@ -58,7 +55,7 @@ def __init__( self.dataset = self._open_dataset() def _open_dataset(self): - """Open the dataset using a S3 store as an xarray Dataset.""" + """Open the dataset using a S3 store as a xarray Dataset.""" store_configs = [ { @@ -179,19 +176,17 @@ def _normalize_name(name: str | None) -> str | None: return name.replace(" ", "-").lower() if name else None def _get_variables(self) -> list[str]: - """ - Extract variable names from the dataset. + """Extracts variable names or descriptions from the dataset. - Prioritize fetching `long_name` or `standard_name` from each variable's - attributes. - If neither is available, return the variable's name from - `dataset.data_vars.keys()`. + Variables are prioritized based on their `long_name` or `standard_name` + attributes. If neither is available, the variable's key from + `dataset.data_vars.keys()` is used. - :return: A list of variable names or descriptions. + Returns: + A list of variable names or descriptions. """ variables = [] for var_name, variable in self.dataset.data_vars.items(): - # Replace spaces with hyphens and convert to lowercase if attributes exist long_name = self._normalize_name(variable.attrs.get("long_name")) standard_name = self._normalize_name(variable.attrs.get("standard_name")) if not long_name and not standard_name: @@ -204,12 +199,6 @@ def _get_variables(self) -> list[str]: return variables def _get_general_metadata(self) -> dict: - """ - Extract general metadata from the dataset attributes. - Fallback to default values if the keys are missing. - - :return: A dictionary containing metadata such as 'description' and 'title'. - """ return { "description": self.dataset.attrs.get( "description", "No description available." 
@@ -218,7 +207,7 @@ def _get_general_metadata(self) -> dict: def build_stac_collection(self) -> Collection: """ - Build an OSC STAC Collection for the product. + Build an OSC STAC Collection for the dataset. :return: A pystac.Collection object. """ diff --git a/deep_code/utils/github_automation.py b/deep_code/utils/github_automation.py index 7f1662f..699636c 100644 --- a/deep_code/utils/github_automation.py +++ b/deep_code/utils/github_automation.py @@ -13,15 +13,15 @@ class GitHubAutomation: - def __init__(self, username: str, token: str, repo_owner: str, repo_name: str): - """ - Initialize the GitHubAutomation class. + """Automates GitHub operations needed to create a Pull Request. - :param username: Your GitHub username - :param token: Your GitHub personal access token - :param repo_owner: Owner of the repository to fork - :param repo_name: Name of the repository to fork + Args: + username: GitHub username. + token: Personal access token for GitHub. + repo_owner: Owner of the repository to fork. + repo_name: Name of the repository to fork. """ + def __init__(self, username: str, token: str, repo_owner: str, repo_name: str): self.username = username self.token = token self.repo_owner = repo_owner diff --git a/deep_code/utils/osc_extension.py b/deep_code/utils/osc_extension.py index 9d0e5e3..f02454d 100644 --- a/deep_code/utils/osc_extension.py +++ b/deep_code/utils/osc_extension.py @@ -16,6 +16,11 @@ class OscExtension( PropertiesExtension, ExtensionManagementMixin[pystac.Item | pystac.Collection] ): + """Handles the OSC extension for STAC Items and Collections. + + Args: + obj: The STAC Item or Collection to which the OSC extension is applied. + """ name: Literal["osc"] = "osc" def __init__(self, obj: pystac.Item | pystac.Collection): @@ -25,7 +30,6 @@ def __init__(self, obj: pystac.Item | pystac.Collection): self.properties = obj.properties self.obj = obj - # Existing properties... 
@property def osc_type(self) -> str | None: return self._get_property("osc:type", str) @@ -90,7 +94,6 @@ def osc_missions(self, value: list[str]) -> None: raise ValueError("osc:missions must be a list of strings") self._set_property("osc:missions", value, pop_if_none=False) - # Utility methods for handling temporal and spatial extent def set_extent(self, spatial: list[list[float]], temporal: list[list[str]]) -> None: self.obj.extent = Extent(SpatialExtent(spatial), TemporalExtent(temporal)) @@ -104,7 +107,6 @@ def osc_variables(self, v: list[str]) -> None: raise ValueError("osc:variables must be a list of strings") self._set_property("osc:variables", v, pop_if_none=False) - # Keywords property @property def keywords(self) -> list[str] | None: return self._get_property("keywords", list) @@ -117,7 +119,6 @@ def keywords(self, value: list[str]) -> None: raise ValueError("keywords must be a list of strings") self._set_property("keywords", value, pop_if_none=False) - # CF Parameters @property def cf_parameter(self) -> list[dict] | None: return self._get_property("cf:parameter", list) @@ -130,7 +131,6 @@ def cf_parameter(self, value: list[dict]) -> None: raise ValueError("cf:parameter must be a list of dictionaries") self._set_property("cf:parameter", value, pop_if_none=False) - # Created and Updated timestamps @property def created(self) -> str | None: return self._get_property("created", str) From 03060016eb699671e47d0d5e6710c9cb05230d57 Mon Sep 17 00:00:00 2001 From: tejas Date: Thu, 9 Jan 2025 14:26:03 +0100 Subject: [PATCH 46/63] adapted doc strings to follow google style --- deep_code/cli/publish.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/deep_code/cli/publish.py b/deep_code/cli/publish.py index 5f2e609..69d82d5 100644 --- a/deep_code/cli/publish.py +++ b/deep_code/cli/publish.py @@ -23,8 +23,7 @@ help="Path to the dataset-config.yaml file with dataset information.", ) def publish_dataset(git_config, dataset_config): - """ - Command-line 
interface for the ProductPublisher API. + """Command-line interface for the ProductPublisher API. """ publisher = DatasetPublisher(git_config_path=git_config) publisher.publish_dataset(dataset_config_path=dataset_config) From ead03d9db463206fb398db95173f7aba7c269afc Mon Sep 17 00:00:00 2001 From: tejas Date: Thu, 9 Jan 2025 14:43:34 +0100 Subject: [PATCH 47/63] renamed api module to tools --- deep_code/cli/publish.py | 2 +- deep_code/tests/{api => tools}/__init__.py | 0 deep_code/tests/{api => tools}/test_publish.py | 10 +++++----- deep_code/{api => tools}/__init__.py | 0 deep_code/{api => tools}/check.py | 0 deep_code/{api => tools}/new.py | 0 deep_code/{api => tools}/publish.py | 0 deep_code/{api => tools}/register.py | 0 deep_code/{api => tools}/setup_ci.py | 0 deep_code/{api => tools}/test.py | 0 10 files changed, 6 insertions(+), 6 deletions(-) rename deep_code/tests/{api => tools}/__init__.py (100%) rename deep_code/tests/{api => tools}/test_publish.py (93%) rename deep_code/{api => tools}/__init__.py (100%) rename deep_code/{api => tools}/check.py (100%) rename deep_code/{api => tools}/new.py (100%) rename deep_code/{api => tools}/publish.py (100%) rename deep_code/{api => tools}/register.py (100%) rename deep_code/{api => tools}/setup_ci.py (100%) rename deep_code/{api => tools}/test.py (100%) diff --git a/deep_code/cli/publish.py b/deep_code/cli/publish.py index 69d82d5..5af5c2d 100644 --- a/deep_code/cli/publish.py +++ b/deep_code/cli/publish.py @@ -6,7 +6,7 @@ import click -from deep_code.api.publish import DatasetPublisher +from deep_code.tools.publish import DatasetPublisher @click.command(name="publish-dataset") diff --git a/deep_code/tests/api/__init__.py b/deep_code/tests/tools/__init__.py similarity index 100% rename from deep_code/tests/api/__init__.py rename to deep_code/tests/tools/__init__.py diff --git a/deep_code/tests/api/test_publish.py b/deep_code/tests/tools/test_publish.py similarity index 93% rename from 
deep_code/tests/api/test_publish.py rename to deep_code/tests/tools/test_publish.py index be45a20..a1c3935 100644 --- a/deep_code/tests/api/test_publish.py +++ b/deep_code/tests/tools/test_publish.py @@ -1,11 +1,11 @@ import pytest from unittest.mock import patch, MagicMock, mock_open -from deep_code.api.publish import DatasetPublisher +from deep_code.tools.publish import DatasetPublisher class TestDatasetPublisher: - @patch("deep_code.api.publish.fsspec.open") + @patch("deep_code.tools.publish.fsspec.open") def test_init_missing_credentials(self, mock_fsspec_open): mock_fsspec_open.return_value.__enter__.return_value = mock_open( read_data="{}" @@ -16,7 +16,7 @@ def test_init_missing_credentials(self, mock_fsspec_open): ): DatasetPublisher("/path/to/git.yaml") - @patch("deep_code.api.publish.fsspec.open") + @patch("deep_code.tools.publish.fsspec.open") def test_publish_dataset_missing_ids(self, mock_fsspec_open): git_yaml_content = """ github-username: test-user @@ -44,7 +44,7 @@ def test_publish_dataset_missing_ids(self, mock_fsspec_open): @patch("deep_code.utils.github_automation.os.path.expanduser", return_value="/tmp") @patch("requests.post") @patch("deep_code.utils.github_automation.GitHubAutomation") - @patch("deep_code.api.publish.fsspec.open") + @patch("deep_code.tools.publish.fsspec.open") def test_publish_dataset_success( self, mock_fsspec_open, @@ -102,7 +102,7 @@ def test_publish_dataset_success( "links": [], "stac_version": "1.0.0", } - with patch("deep_code.api.publish.OSCProductSTACGenerator") as mock_generator: + with patch("deep_code.tools.publish.OSCProductSTACGenerator") as mock_generator: mock_generator.return_value.build_stac_collection.return_value = ( mock_collection ) diff --git a/deep_code/api/__init__.py b/deep_code/tools/__init__.py similarity index 100% rename from deep_code/api/__init__.py rename to deep_code/tools/__init__.py diff --git a/deep_code/api/check.py b/deep_code/tools/check.py similarity index 100% rename from 
deep_code/api/check.py rename to deep_code/tools/check.py diff --git a/deep_code/api/new.py b/deep_code/tools/new.py similarity index 100% rename from deep_code/api/new.py rename to deep_code/tools/new.py diff --git a/deep_code/api/publish.py b/deep_code/tools/publish.py similarity index 100% rename from deep_code/api/publish.py rename to deep_code/tools/publish.py diff --git a/deep_code/api/register.py b/deep_code/tools/register.py similarity index 100% rename from deep_code/api/register.py rename to deep_code/tools/register.py diff --git a/deep_code/api/setup_ci.py b/deep_code/tools/setup_ci.py similarity index 100% rename from deep_code/api/setup_ci.py rename to deep_code/tools/setup_ci.py diff --git a/deep_code/api/test.py b/deep_code/tools/test.py similarity index 100% rename from deep_code/api/test.py rename to deep_code/tools/test.py From 0aab936af444e77000bed537891a81ea1769840b Mon Sep 17 00:00:00 2001 From: tejas Date: Thu, 9 Jan 2025 14:53:33 +0100 Subject: [PATCH 48/63] refactor doc string --- deep_code/utils/github_automation.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/deep_code/utils/github_automation.py b/deep_code/utils/github_automation.py index 699636c..b25310e 100644 --- a/deep_code/utils/github_automation.py +++ b/deep_code/utils/github_automation.py @@ -15,12 +15,12 @@ class GitHubAutomation: """Automates GitHub operations needed to create a Pull Request. - Args: - username: GitHub username. - token: Personal access token for GitHub. - repo_owner: Owner of the repository to fork. - repo_name: Name of the repository to fork. - """ + Args: + username: GitHub username. + token: Personal access token for GitHub. + repo_owner: Owner of the repository to fork. + repo_name: Name of the repository to fork. 
+ """ def __init__(self, username: str, token: str, repo_owner: str, repo_name: str): self.username = username self.token = token From 021647dc321f3e9041255388eb678742243b0d25 Mon Sep 17 00:00:00 2001 From: tejas Date: Thu, 9 Jan 2025 16:34:39 +0100 Subject: [PATCH 49/63] update copyright notices --- deep_code/cli/__init__.py | 2 +- deep_code/cli/main.py | 2 +- deep_code/cli/publish.py | 2 +- deep_code/tests/tools/__init__.py | 2 +- deep_code/tests/utils/__init__.py | 2 +- deep_code/tools/__init__.py | 2 +- deep_code/tools/publish.py | 2 +- deep_code/utils/__init__.py | 2 +- deep_code/utils/dataset_stac_generator.py | 26 ++++++++++++++++++++++- deep_code/utils/github_automation.py | 2 +- deep_code/utils/osc_extension.py | 2 +- 11 files changed, 35 insertions(+), 11 deletions(-) diff --git a/deep_code/cli/__init__.py b/deep_code/cli/__init__.py index 6f1b5dc..073ddd0 100644 --- a/deep_code/cli/__init__.py +++ b/deep_code/cli/__init__.py @@ -1,3 +1,3 @@ -# Copyright (c) 2024 by Brockmann Consult GmbH +# Copyright (c) 2025 by Brockmann Consult GmbH # Permissions are hereby granted under the terms of the MIT License: # https://opensource.org/licenses/MIT. diff --git a/deep_code/cli/main.py b/deep_code/cli/main.py index 3a781c0..be88985 100644 --- a/deep_code/cli/main.py +++ b/deep_code/cli/main.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 -# Copyright (c) 2024 by Brockmann Consult GmbH +# Copyright (c) 2025 by Brockmann Consult GmbH # Permissions are hereby granted under the terms of the MIT License: # https://opensource.org/licenses/MIT. diff --git a/deep_code/cli/publish.py b/deep_code/cli/publish.py index 5af5c2d..1c12947 100644 --- a/deep_code/cli/publish.py +++ b/deep_code/cli/publish.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 -# Copyright (c) 2024 by Brockmann Consult GmbH +# Copyright (c) 2025 by Brockmann Consult GmbH # Permissions are hereby granted under the terms of the MIT License: # https://opensource.org/licenses/MIT. 
diff --git a/deep_code/tests/tools/__init__.py b/deep_code/tests/tools/__init__.py index 6f1b5dc..073ddd0 100644 --- a/deep_code/tests/tools/__init__.py +++ b/deep_code/tests/tools/__init__.py @@ -1,3 +1,3 @@ -# Copyright (c) 2024 by Brockmann Consult GmbH +# Copyright (c) 2025 by Brockmann Consult GmbH # Permissions are hereby granted under the terms of the MIT License: # https://opensource.org/licenses/MIT. diff --git a/deep_code/tests/utils/__init__.py b/deep_code/tests/utils/__init__.py index 6f1b5dc..073ddd0 100644 --- a/deep_code/tests/utils/__init__.py +++ b/deep_code/tests/utils/__init__.py @@ -1,3 +1,3 @@ -# Copyright (c) 2024 by Brockmann Consult GmbH +# Copyright (c) 2025 by Brockmann Consult GmbH # Permissions are hereby granted under the terms of the MIT License: # https://opensource.org/licenses/MIT. diff --git a/deep_code/tools/__init__.py b/deep_code/tools/__init__.py index e4323e4..073ddd0 100644 --- a/deep_code/tools/__init__.py +++ b/deep_code/tools/__init__.py @@ -1,3 +1,3 @@ -# Copyright (c) 2024 by xcube team and contributors +# Copyright (c) 2025 by Brockmann Consult GmbH # Permissions are hereby granted under the terms of the MIT License: # https://opensource.org/licenses/MIT. diff --git a/deep_code/tools/publish.py b/deep_code/tools/publish.py index e20a7a1..108c751 100644 --- a/deep_code/tools/publish.py +++ b/deep_code/tools/publish.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 -# Copyright (c) 2024 by Brockmann Consult GmbH +# Copyright (c) 2025 by Brockmann Consult GmbH # Permissions are hereby granted under the terms of the MIT License: # https://opensource.org/licenses/MIT. 
diff --git a/deep_code/utils/__init__.py b/deep_code/utils/__init__.py index 6f1b5dc..073ddd0 100644 --- a/deep_code/utils/__init__.py +++ b/deep_code/utils/__init__.py @@ -1,3 +1,3 @@ -# Copyright (c) 2024 by Brockmann Consult GmbH +# Copyright (c) 2025 by Brockmann Consult GmbH # Permissions are hereby granted under the terms of the MIT License: # https://opensource.org/licenses/MIT. diff --git a/deep_code/utils/dataset_stac_generator.py b/deep_code/utils/dataset_stac_generator.py index ebcbf31..45a28c9 100644 --- a/deep_code/utils/dataset_stac_generator.py +++ b/deep_code/utils/dataset_stac_generator.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 -# Copyright (c) 2024 by Brockmann Consult GmbH +# Copyright (c) 2025 by Brockmann Consult GmbH # Permissions are hereby granted under the terms of the MIT License: # https://opensource.org/licenses/MIT. @@ -205,6 +205,30 @@ def _get_general_metadata(self) -> dict: ) } + def _get_variable_metadata(self, var_name, var_data) -> dict: + """Extract metadata from a single variable's attributes. + + Args: + var_name: The raw variable name in the dataset. + var_data: An xarray DataArray containing variable data and attrs. + + Returns: + A dict with 'id', 'title', and 'description'. + """ + long_name = var_data.attrs.get("long_name") + standard_name = var_data.attrs.get("standard_name") + title = long_name or standard_name or var_name + + normalized_title = self._normalize_name(title) + + description = var_data.attrs.get("description", "No variable description") + + return { + "id": var_name, + "title": normalized_title, + "description": description + } + def build_stac_collection(self) -> Collection: """ Build an OSC STAC Collection for the dataset. 
diff --git a/deep_code/utils/github_automation.py b/deep_code/utils/github_automation.py index b25310e..eb8e05e 100644 --- a/deep_code/utils/github_automation.py +++ b/deep_code/utils/github_automation.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 -# Copyright (c) 2024 by Brockmann Consult GmbH +# Copyright (c) 2025 by Brockmann Consult GmbH # Permissions are hereby granted under the terms of the MIT License: # https://opensource.org/licenses/MIT. diff --git a/deep_code/utils/osc_extension.py b/deep_code/utils/osc_extension.py index f02454d..21372a3 100644 --- a/deep_code/utils/osc_extension.py +++ b/deep_code/utils/osc_extension.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 -# Copyright (c) 2024 by Brockmann Consult GmbH +# Copyright (c) 2025 by Brockmann Consult GmbH # Permissions are hereby granted under the terms of the MIT License: # https://opensource.org/licenses/MIT. From 4b2987eab83f333868f0cb97387399e625b6bbb5 Mon Sep 17 00:00:00 2001 From: tejas Date: Thu, 9 Jan 2025 17:07:58 +0100 Subject: [PATCH 50/63] refactor --- .gitignore | 3 ++- deep_code/tools/check.py | 6 ++++-- deep_code/tools/new.py | 10 +++++----- deep_code/tools/setup_ci.py | 2 +- deep_code/tools/test.py | 4 ++-- 5 files changed, 14 insertions(+), 11 deletions(-) diff --git a/.gitignore b/.gitignore index 4cbe696..116b25d 100644 --- a/.gitignore +++ b/.gitignore @@ -161,6 +161,7 @@ cython_debug/ # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
#.idea/ -# Ignore git.yaml and dataset-config.yaml +# Exclude sensitive configuration files from version control +.gitaccess git.yaml dataset-config.yaml \ No newline at end of file diff --git a/deep_code/tools/check.py b/deep_code/tools/check.py index 16810c3..8072cc3 100644 --- a/deep_code/tools/check.py +++ b/deep_code/tools/check.py @@ -1,2 +1,4 @@ -# Verify the readiness of an existing workflow repository for experiment publication by -# identifying any issues or missing components +""" +Verify the readiness of a dataset or an existing workflow repository for experiment +publication by identifying any issues or missing components +""" \ No newline at end of file diff --git a/deep_code/tools/new.py b/deep_code/tools/new.py index 599bf1f..3d1ed1e 100644 --- a/deep_code/tools/new.py +++ b/deep_code/tools/new.py @@ -1,5 +1,5 @@ -# Logic for initializing repositories -# Initialize a GitHub repository with the proposed configurations files, an initial workflow -# notebook template (e.g. workflow.ipynb), a template Python package (code and -# pyproject.toml), and a template setup for documentation (e.g., using mkdocs), setup of the -# build pipeline +"""Logic for initializing repositories + Initialize a GitHub repository with the proposed configurations files, an initial + workflow notebook template (e.g. 
workflow.ipynb), a template Python package (code and +pyproject.toml), and a template setup for documentation (e.g., using mkdocs), +setup of the build pipeline""" diff --git a/deep_code/tools/setup_ci.py b/deep_code/tools/setup_ci.py index 65b61b9..889d59b 100644 --- a/deep_code/tools/setup_ci.py +++ b/deep_code/tools/setup_ci.py @@ -1 +1 @@ -# Logic for setting up build pipelines +"""Logic for setting up build pipelines""" diff --git a/deep_code/tools/test.py b/deep_code/tools/test.py index 0e682bc..5bdf092 100644 --- a/deep_code/tools/test.py +++ b/deep_code/tools/test.py @@ -1,2 +1,2 @@ -# Execute the application package of a published experiment on a subset of input data to -# verify the reproducibility is achieved +""" Execute the application package of a published experiment on a subset of input data +to verify the reproducibility is achieved""" From 099354093495477ca780fa43e2f10d9bdd7866dd Mon Sep 17 00:00:00 2001 From: tejas Date: Thu, 9 Jan 2025 17:23:10 +0100 Subject: [PATCH 51/63] update README.md --- README.md | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 756fd2b..6c9c654 100644 --- a/README.md +++ b/README.md @@ -6,15 +6,18 @@ [![License](https://img.shields.io/github/license/dcs4cop/xcube-smos)](https://github.com/deepesdl/deep-code/blob/main/LICENSE) `deep-code` is a lightweight python tool that comprises a command line interface(CLI) -and Python API that implements utilities that aid integration of DeepESDL datasets, +and Python API providing utilities that aid integration of DeepESDL datasets, experiments with EarthCODE. ## Setup -## Installing from the repository +## Install +`deep-code` will be available in PyPI and conda-forge. Till the stable release, +developers/contributors can follow the below steps to install deep-code.
-To install deep-code directly from the git repository, clone the repository, and -execute the steps below: +## Installing from the repository for Developers + +To install deep-code directly from the git repository, clone the repository, and execute the steps below: ```commandline conda env create -f environment.yml From 823838060abaf2b015a1809699f8a8300e86cc06 Mon Sep 17 00:00:00 2001 From: tejas Date: Thu, 9 Jan 2025 17:26:41 +0100 Subject: [PATCH 52/63] refactor --- deep_code/utils/dataset_stac_generator.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/deep_code/utils/dataset_stac_generator.py b/deep_code/utils/dataset_stac_generator.py index 45a28c9..7598fb6 100644 --- a/deep_code/utils/dataset_stac_generator.py +++ b/deep_code/utils/dataset_stac_generator.py @@ -113,11 +113,6 @@ def _open_dataset(self): f"Tried configurations: {', '.join(tried_configurations)}. " f"Last error: {last_exception}" ) - raise ValueError( - f"Failed to open Zarr dataset with ID {self.dataset_id}. " - f"Tried configurations: {', '.join(tried_configurations)}.
" - f"Last error: {last_exception}" - ) def _get_spatial_extent(self) -> SpatialExtent: """Extract spatial extent from the dataset.""" @@ -263,7 +258,7 @@ def build_stac_collection(self) -> Collection: if self.cf_params: osc_extension.cf_parameter = self.cf_params else: - osc_extension.cf_parameter = [{"Name": self.collection_id}] + osc_extension.cf_parameter = [{"name": self.collection_id}] # Add creation and update timestamps for the collection now_iso = datetime.now(timezone.utc).isoformat() From 04428d8993d3d3a5cb1fc5dfdb11f1249cf6a887 Mon Sep 17 00:00:00 2001 From: tejas Date: Thu, 9 Jan 2025 17:42:10 +0100 Subject: [PATCH 53/63] refactor test case --- deep_code/tests/utils/test_dataset_stac_generator.py | 1 - deep_code/utils/dataset_stac_generator.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/deep_code/tests/utils/test_dataset_stac_generator.py b/deep_code/tests/utils/test_dataset_stac_generator.py index fb2c0d1..12321b2 100644 --- a/deep_code/tests/utils/test_dataset_stac_generator.py +++ b/deep_code/tests/utils/test_dataset_stac_generator.py @@ -203,4 +203,3 @@ def test_open_dataset_failure(self, mock_logger, mock_new_data_store): ) self.assertIn("Public store, Authenticated store", str(context.exception)) self.assertEqual(mock_new_data_store.call_count, 2) - mock_logger().critical.assert_called_once() diff --git a/deep_code/utils/dataset_stac_generator.py b/deep_code/utils/dataset_stac_generator.py index 7598fb6..f915d8d 100644 --- a/deep_code/utils/dataset_stac_generator.py +++ b/deep_code/utils/dataset_stac_generator.py @@ -108,7 +108,7 @@ def _open_dataset(self): ) last_exception = e - self.logger.critical( + raise ValueError( f"Failed to open Zarr dataset with ID {self.dataset_id}. " f"Tried configurations: {', '.join(tried_configurations)}. 
" f"Last error: {last_exception}" From abe73f97b3ee5c65ba2e3e18acddf85fb60c1d8c Mon Sep 17 00:00:00 2001 From: tejas Date: Thu, 9 Jan 2025 17:43:47 +0100 Subject: [PATCH 54/63] black code formatting --- deep_code/tools/check.py | 2 +- deep_code/tools/publish.py | 1 + deep_code/utils/dataset_stac_generator.py | 6 +----- deep_code/utils/github_automation.py | 1 + deep_code/utils/osc_extension.py | 1 + 5 files changed, 5 insertions(+), 6 deletions(-) diff --git a/deep_code/tools/check.py b/deep_code/tools/check.py index 8072cc3..3b54c65 100644 --- a/deep_code/tools/check.py +++ b/deep_code/tools/check.py @@ -1,4 +1,4 @@ """ Verify the readiness of a dataset or an existing workflow repository for experiment publication by identifying any issues or missing components -""" \ No newline at end of file +""" diff --git a/deep_code/tools/publish.py b/deep_code/tools/publish.py index 108c751..a24bb1c 100644 --- a/deep_code/tools/publish.py +++ b/deep_code/tools/publish.py @@ -22,6 +22,7 @@ class DatasetPublisher: Args: git_config_path: Path to the YAML file containing GitHub credentials. 
""" + def __init__(self, git_config_path: str): with fsspec.open(git_config_path, "r") as file: git_config = yaml.safe_load(file) or {} diff --git a/deep_code/utils/dataset_stac_generator.py b/deep_code/utils/dataset_stac_generator.py index f915d8d..21f4cf8 100644 --- a/deep_code/utils/dataset_stac_generator.py +++ b/deep_code/utils/dataset_stac_generator.py @@ -218,11 +218,7 @@ def _get_variable_metadata(self, var_name, var_data) -> dict: description = var_data.attrs.get("description", "No variable description") - return { - "id": var_name, - "title": normalized_title, - "description": description - } + return {"id": var_name, "title": normalized_title, "description": description} def build_stac_collection(self) -> Collection: """ diff --git a/deep_code/utils/github_automation.py b/deep_code/utils/github_automation.py index eb8e05e..d934d2a 100644 --- a/deep_code/utils/github_automation.py +++ b/deep_code/utils/github_automation.py @@ -21,6 +21,7 @@ class GitHubAutomation: repo_owner: Owner of the repository to fork. repo_name: Name of the repository to fork. """ + def __init__(self, username: str, token: str, repo_owner: str, repo_name: str): self.username = username self.token = token diff --git a/deep_code/utils/osc_extension.py b/deep_code/utils/osc_extension.py index 21372a3..6aa7519 100644 --- a/deep_code/utils/osc_extension.py +++ b/deep_code/utils/osc_extension.py @@ -21,6 +21,7 @@ class OscExtension( Args: obj: The STAC Item or Collection to which the OSC extension is applied. 
""" + name: Literal["osc"] = "osc" def __init__(self, obj: pystac.Item | pystac.Collection): From 737e3f3798c08dd92b2e597f95ec67f20b75ba64 Mon Sep 17 00:00:00 2001 From: Tejas Morbagal Harish Date: Thu, 9 Jan 2025 17:45:07 +0100 Subject: [PATCH 55/63] Update README.md Co-authored-by: Norman Fomferra --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 6c9c654..2dba566 100644 --- a/README.md +++ b/README.md @@ -63,7 +63,7 @@ open-science catalog. ``` ``` -Usage: deep-code publish-product [OPTIONS] +Usage: deep-code publish-dataset [OPTIONS] Command-line interface for the ProductPublisher API. From f456821fa117ea1df1066584ce9e81b1f3b171d5 Mon Sep 17 00:00:00 2001 From: Tejas Morbagal Harish Date: Thu, 9 Jan 2025 17:45:19 +0100 Subject: [PATCH 56/63] Update README.md Co-authored-by: Norman Fomferra --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2dba566..c2f4793 100644 --- a/README.md +++ b/README.md @@ -70,7 +70,7 @@ Usage: deep-code publish-dataset [OPTIONS] Options: --git-config PATH Path to the git.yaml file with GitHub credentials. [required] - --dataset-config PATH Path to the dataset-config.yaml file with dataset + --product-config PATH Path to the dataset-config.yaml file with dataset information. [required] --help Show this message and exit. From b12cb84384ec99f4588a39f9d7fe252fd33087de Mon Sep 17 00:00:00 2001 From: tejas Date: Thu, 9 Jan 2025 17:57:28 +0100 Subject: [PATCH 57/63] refactor --- README.md | 4 ++-- deep_code/cli/publish.py | 2 +- deep_code/tools/publish.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index c2f4793..a69c26f 100644 --- a/README.md +++ b/README.md @@ -59,13 +59,13 @@ Publish a dataset which is a result of a experiment to the EarthCODE open-science catalog. 
```commandline - deep-code publish-product --help + deep-code publish-dataset --help ``` ``` Usage: deep-code publish-dataset [OPTIONS] - Command-line interface for the ProductPublisher API. + Request publishing a dataset to the open science catalogue. Options: --git-config PATH Path to the git.yaml file with GitHub credentials. diff --git a/deep_code/cli/publish.py b/deep_code/cli/publish.py index 1c12947..fac4535 100644 --- a/deep_code/cli/publish.py +++ b/deep_code/cli/publish.py @@ -23,7 +23,7 @@ help="Path to the dataset-config.yaml file with dataset information.", ) def publish_dataset(git_config, dataset_config): - """Command-line interface for the ProductPublisher API. + """Request publishing a dataset to the open science catalogue. """ publisher = DatasetPublisher(git_config_path=git_config) publisher.publish_dataset(dataset_config_path=dataset_config) diff --git a/deep_code/tools/publish.py b/deep_code/tools/publish.py index a24bb1c..a3b6648 100644 --- a/deep_code/tools/publish.py +++ b/deep_code/tools/publish.py @@ -17,7 +17,7 @@ class DatasetPublisher: - """Publishes products to a GitHub repository. + """Publishes products to the OSC GitHub repository. Args: git_config_path: Path to the YAML file containing GitHub credentials. 
From 1b7202eaa1f33f68335e986723897109beffb42a Mon Sep 17 00:00:00 2001 From: tejas Date: Thu, 9 Jan 2025 18:12:13 +0100 Subject: [PATCH 58/63] removed .gitaccess as a option in the cli cmd --- .gitignore | 1 - deep_code/cli/publish.py | 10 ++-------- deep_code/tests/tools/test_publish.py | 8 ++++---- deep_code/tools/publish.py | 16 ++++++++++------ 4 files changed, 16 insertions(+), 19 deletions(-) diff --git a/.gitignore b/.gitignore index 116b25d..4df4d63 100644 --- a/.gitignore +++ b/.gitignore @@ -163,5 +163,4 @@ cython_debug/ # Exclude sensitive configuration files from version control .gitaccess -git.yaml dataset-config.yaml \ No newline at end of file diff --git a/deep_code/cli/publish.py b/deep_code/cli/publish.py index fac4535..4152e46 100644 --- a/deep_code/cli/publish.py +++ b/deep_code/cli/publish.py @@ -10,20 +10,14 @@ @click.command(name="publish-dataset") -@click.option( - "--git-config", - required=True, - type=click.Path(exists=True), - help="Path to the git.yaml file with GitHub credentials.", -) @click.option( "--dataset-config", required=True, type=click.Path(exists=True), help="Path to the dataset-config.yaml file with dataset information.", ) -def publish_dataset(git_config, dataset_config): +def publish_dataset(dataset_config): """Request publishing a dataset to the open science catalogue. """ - publisher = DatasetPublisher(git_config_path=git_config) + publisher = DatasetPublisher() publisher.publish_dataset(dataset_config_path=dataset_config) diff --git a/deep_code/tests/tools/test_publish.py b/deep_code/tests/tools/test_publish.py index a1c3935..47c9961 100644 --- a/deep_code/tests/tools/test_publish.py +++ b/deep_code/tests/tools/test_publish.py @@ -12,9 +12,9 @@ def test_init_missing_credentials(self, mock_fsspec_open): )() with pytest.raises( - ValueError, match="GitHub credentials are missing in the git.yaml file." + ValueError, match="GitHub credentials are missing in the `.gitaccess` file." 
): - DatasetPublisher("/path/to/git.yaml") + DatasetPublisher() @patch("deep_code.tools.publish.fsspec.open") def test_publish_dataset_missing_ids(self, mock_fsspec_open): @@ -30,7 +30,7 @@ def test_publish_dataset_missing_ids(self, mock_fsspec_open): mock_open(read_data=dataset_yaml_content)(), ] - publisher = DatasetPublisher("/path/to/git.yaml") + publisher = DatasetPublisher() with pytest.raises( ValueError, @@ -108,7 +108,7 @@ def test_publish_dataset_success( ) # Instantiate & publish - publisher = DatasetPublisher("/fake/path/to/git.yaml") + publisher = DatasetPublisher() publisher.publish_dataset("/fake/path/to/dataset-config.yaml") # 6Assert that we called git clone with /tmp/temp_repo diff --git a/deep_code/tools/publish.py b/deep_code/tools/publish.py index a3b6648..26b49f3 100644 --- a/deep_code/tools/publish.py +++ b/deep_code/tools/publish.py @@ -17,21 +17,25 @@ class DatasetPublisher: - """Publishes products to the OSC GitHub repository. + """ + Publishes products to the OSC GitHub repository. + + Credentials must be provided via a hidden file named `.gitaccess`, located in + the root of the repository. This file is expected to contain YAML of the form: - Args: - git_config_path: Path to the YAML file containing GitHub credentials. 
+ github-username: "YOUR_GITHUB_USERNAME" + github-token: "YOUR_GITHUB_PERSONAL_ACCESS_TOKEN" """ - def __init__(self, git_config_path: str): - with fsspec.open(git_config_path, "r") as file: + def __init__(self): + with fsspec.open(".gitaccess", "r") as file: git_config = yaml.safe_load(file) or {} self.github_username = git_config.get("github-username") self.github_token = git_config.get("github-token") if not self.github_username or not self.github_token: - raise ValueError("GitHub credentials are missing in the git.yaml file.") + raise ValueError("GitHub credentials are missing in the `.gitaccess` file.") self.github_automation = GitHubAutomation( self.github_username, self.github_token, OSC_REPO_OWNER, OSC_REPO_NAME From 452c104b3d927ac3188eca51acd4ed83dafe5e46 Mon Sep 17 00:00:00 2001 From: tejas Date: Thu, 9 Jan 2025 18:20:23 +0100 Subject: [PATCH 59/63] dataset_config is not an option but an argument --- README.md | 20 +++----------------- deep_code/cli/publish.py | 8 +++----- 2 files changed, 6 insertions(+), 22 deletions(-) diff --git a/README.md b/README.md index a69c26f..f42edc9 100644 --- a/README.md +++ b/README.md @@ -55,28 +55,14 @@ Use the --help option with these subcommands to get more details on usage. ### deep-code publish-product -Publish a dataset which is a result of a experiment to the EarthCODE +Publish a dataset which is a result of an experiment to the EarthCODE open-science catalog. ```commandline - deep-code publish-dataset --help + deep-code publish-dataset /path/to/dataset-config.yaml ``` -``` -Usage: deep-code publish-dataset [OPTIONS] - - Request publishing a dataset to the open science catalogue. - -Options: - --git-config PATH Path to the git.yaml file with GitHub credentials. - [required] - --product-config PATH Path to the dataset-config.yaml file with dataset - information. [required] - --help Show this message and exit. 
- -``` - -#### git.yaml example +#### .gitaccess example ``` github-username: your-git-user diff --git a/deep_code/cli/publish.py b/deep_code/cli/publish.py index 4152e46..48b1e63 100644 --- a/deep_code/cli/publish.py +++ b/deep_code/cli/publish.py @@ -10,11 +10,9 @@ @click.command(name="publish-dataset") -@click.option( - "--dataset-config", - required=True, - type=click.Path(exists=True), - help="Path to the dataset-config.yaml file with dataset information.", +@click.argument( + "dataset_config", + type=click.Path(exists=True) ) def publish_dataset(dataset_config): """Request publishing a dataset to the open science catalogue. From 965674bc3876836218c3dd2552504707c4f16eaf Mon Sep 17 00:00:00 2001 From: tejas Date: Fri, 10 Jan 2025 09:48:12 +0100 Subject: [PATCH 60/63] pin zarr version to fix failing ci --- environment.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/environment.yml b/environment.yml index b52434a..de92fff 100644 --- a/environment.yml +++ b/environment.yml @@ -12,6 +12,7 @@ dependencies: - pystac - pyyaml - xcube + - zarr <=3 # test dependencies - numpy - pytest From dc7d7262eb2b1ed8c66813d0ba9e32589424fa3f Mon Sep 17 00:00:00 2001 From: tejas Date: Fri, 10 Jan 2025 09:51:39 +0100 Subject: [PATCH 61/63] pin zarr version to fix failing ci --- environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index de92fff..a89b71b 100644 --- a/environment.yml +++ b/environment.yml @@ -12,7 +12,7 @@ dependencies: - pystac - pyyaml - xcube - - zarr <=3 + - zarr <3 # test dependencies - numpy - pytest From a4efe5aeb52c24e2bdf2aff8f1cfd62d4970d5f2 Mon Sep 17 00:00:00 2001 From: tejas Date: Fri, 10 Jan 2025 09:53:34 +0100 Subject: [PATCH 62/63] pin zarr version to fix failing ci --- environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index a89b71b..9570a8d 100644 --- a/environment.yml +++ b/environment.yml @@ -12,7 +12,7 @@ 
dependencies: - pystac - pyyaml - xcube - - zarr <3 + - zarr >=2.11,<3 # test dependencies - numpy - pytest From e69805e67c524b23ad178aa7799676bde55f5691 Mon Sep 17 00:00:00 2001 From: tejas Date: Fri, 10 Jan 2025 10:02:01 +0100 Subject: [PATCH 63/63] updated read me --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f42edc9..1d4b381 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ developers/contributors can follow the below steps to install deep-code. To install deep-code directly from the git repository, clone the repository, and execute the steps below: ```commandline -conda env create -f environment.yml +conda env create conda activate deep-code pip install -e . ```