diff --git a/README.md b/README.md index 1d4b381..e7737e6 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,9 @@ and Python API providing utilities that aid integration of DeepESDL datasets, experiments with EarthCODE. +The first release will focus on implementing the publish feature of DeepESDL +experiments/workflow as OGC API record and Datasets as an OSC stac collection. + ## Setup ## Install @@ -72,16 +75,18 @@ github-token: personal access token #### dataset-config.yaml example ``` -dataset-id: hydrology-1D-0.009deg-100x60x60-3.0.2.zarr -collection-id: hydrology - -#non-mandatory -documentation-link: https://deepesdl.readthedocs.io/en/latest/datasets/hydrology-1D-0-009deg-100x60x60-3-0-2-zarr/ -access-link: s3://test -dataset-status: completed -dataset-region: global -dataset-theme: ["ocean", "environment"] -cf-parameter: [{"Name" : "hydrology"}] +dataset_id: hydrology-1D-0.009deg-100x60x60-3.0.2.zarr +collection_id: hydrology +osc_themes: + - Land + - Oceans +# non-mandatory +documentation_link: https://deepesdl.readthedocs.io/en/latest/datasets/hydrology-1D-0.009deg-100x60x60-3.0.2.zarr/ +access_link: s3://test +dataset_status: completed +osc_region: global +cf_parameter: + - name: hydrology ``` dataset-id has to be a valid dataset-id from `deep-esdl-public` s3 or your team bucket. \ No newline at end of file diff --git a/deep_code/cli/publish.py b/deep_code/cli/publish.py index 48b1e63..a9da0f3 100644 --- a/deep_code/cli/publish.py +++ b/deep_code/cli/publish.py @@ -10,10 +10,7 @@ @click.command(name="publish-dataset") -@click.argument( - "dataset_config", - type=click.Path(exists=True) -) +@click.argument("dataset_config", type=click.Path(exists=True)) def publish_dataset(dataset_config): """Request publishing a dataset to the open science catalogue. """ diff --git a/deep_code/tests/tools/test_publish.py b/deep_code/tests/tools/test_publish.py index 47c9961..6be4601 100644 --- a/deep_code/tests/tools/test_publish.py +++ b/deep_code/tests/tools/test_publish.py @@ -61,14 +61,14 @@ def test_publish_dataset_success( github-token: test-token """ dataset_yaml_content = """ - dataset-id: test-dataset - collection-id: test-collection - documentation-link: http://example.com/doc - access-link: http://example.com/access - dataset-status: ongoing - dataset-region: Global - dataset-theme: ["climate"] - cf-parameter: [] + dataset_id: test-dataset + collection_id: test-collection + documentation_link: http://example.com/doc + access_link: http://example.com/access + dataset_status: ongoing + dataset_region: Global + osc_theme: ["climate"] + cf_parameter: [] """ mock_fsspec_open.side_effect = [ mock_open(read_data=git_yaml_content)(), @@ -102,8 +102,8 @@ def test_publish_dataset_success( "links": [], "stac_version": "1.0.0", } - with patch("deep_code.tools.publish.OSCProductSTACGenerator") as mock_generator: - mock_generator.return_value.build_stac_collection.return_value = ( + with patch("deep_code.tools.publish.OSCDatasetSTACGenerator") as mock_generator: + mock_generator.return_value.build_dataset_stac_collection.return_value = ( mock_collection ) @@ -111,7 +111,7 @@ def test_publish_dataset_success( publisher = DatasetPublisher() publisher.publish_dataset("/fake/path/to/dataset-config.yaml") - # 6Assert that we called git clone with /tmp/temp_repo + # Assert that we called git clone with /tmp/temp_repo # Because expanduser("~") is now patched to /tmp, the actual path is /tmp/temp_repo auth_url = "https://test-user:test-token@github.com/test-user/open-science-catalog-metadata-testing.git" mock_subprocess_run.assert_any_call( diff --git a/deep_code/tests/utils/test_dataset_stac_generator.py b/deep_code/tests/utils/test_dataset_stac_generator.py index 12321b2..95ecb85 100644 --- a/deep_code/tests/utils/test_dataset_stac_generator.py +++ b/deep_code/tests/utils/test_dataset_stac_generator.py @@ -7,7 +7,7 @@ from unittest.mock import patch, MagicMock from xarray import Dataset -from deep_code.utils.dataset_stac_generator import OSCProductSTACGenerator +from deep_code.utils.dataset_stac_generator import OSCDatasetSTACGenerator class TestOSCProductSTACGenerator(unittest.TestCase): @@ -28,15 +28,31 @@ def setUp(self, mock_data_store): }, attrs={"description": "Mock dataset for testing.", "title": "Mock Dataset"}, data_vars={ - "var1": (("time", "lat", "lon"), np.random.rand(2, 5, 10)), - "var2": (("time", "lat", "lon"), np.random.rand(2, 5, 10)), + "var1": ( + ("time", "lat", "lon"), + np.random.rand(2, 5, 10), + { + "description": "dummy", + "standard_name": "var1", + "gcmd_keyword_url": "https://dummy", + }, + ), + "var2": ( + ("time", "lat", "lon"), + np.random.rand(2, 5, 10), + { + "description": "dummy", + "standard_name": "var2", + "gcmd_keyword_url": "https://dummy", + }, + ), }, ) mock_store = MagicMock() mock_store.open_data.return_value = self.mock_dataset mock_data_store.return_value = mock_store - self.generator = OSCProductSTACGenerator( + self.generator = OSCDatasetSTACGenerator( dataset_id="mock-dataset-id", collection_id="mock-collection-id", access_link="s3://mock-bucket/mock-dataset", @@ -66,7 +82,7 @@ def test_get_temporal_extent(self): def test_get_variables(self): """Test variable extraction.""" - variables = self.generator._get_variables() + variables = self.generator.get_variable_ids() self.assertEqual(variables, ["var1", "var2"]) def test_get_general_metadata(self): @@ -78,7 +94,7 @@ def test_get_general_metadata(self): @patch("pystac.Collection.set_self_href") def test_build_stac_collection(self, mock_set_self_href, mock_add_link): """Test STAC collection creation.""" - collection = self.generator.build_stac_collection() + collection = self.generator.build_dataset_stac_collection() self.assertIsInstance(collection, Collection) self.assertEqual(collection.id, "mock-collection-id") self.assertEqual(collection.description, "Mock dataset for testing.") @@ -104,8 +120,6 @@ def test_invalid_temporal_extent(self): with self.assertRaises(ValueError): self.generator._get_temporal_extent() - -class TestOpenDataset(unittest.TestCase): @patch("deep_code.utils.dataset_stac_generator.new_data_store") @patch("deep_code.utils.dataset_stac_generator.logging.getLogger") def test_open_dataset_success_public_store(self, mock_logger, mock_new_data_store): @@ -113,10 +127,10 @@ def test_open_dataset_success_public_store(self, mock_logger, mock_new_data_stor # Create a mock store and mock its `open_data` method mock_store = MagicMock() mock_new_data_store.return_value = mock_store - mock_store.open_data.return_value = "mock_dataset" + mock_store.open_data.return_value = self.mock_dataset # Instantiate the generator (this will implicitly call _open_dataset) - generator = OSCProductSTACGenerator("mock-dataset-id", "mock-collection-id") + generator = OSCDatasetSTACGenerator("mock-dataset-id", "mock-collection-id") # Validate that the dataset is assigned correctly self.assertEqual(generator.dataset, "mock_dataset") @@ -151,13 +165,13 @@ def test_open_dataset_success_authenticated_store( mock_store, # Second call (authenticated store) returns a mock store ] - mock_store.open_data.return_value = "mock_dataset" + mock_store.open_data.return_value = self.mock_dataset os.environ["S3_USER_STORAGE_BUCKET"] = "mock-bucket" os.environ["S3_USER_STORAGE_KEY"] = "mock-key" os.environ["S3_USER_STORAGE_SECRET"] = "mock-secret" - generator = OSCProductSTACGenerator("mock-dataset-id", "mock-collection-id") + generator = OSCDatasetSTACGenerator("mock-dataset-id", "mock-collection-id") # Validate that the dataset was successfully opened with the authenticated store self.assertEqual(generator.dataset, "mock_dataset") @@ -195,7 +209,7 @@ def test_open_dataset_failure(self, mock_logger, mock_new_data_store): os.environ["S3_USER_STORAGE_SECRET"] = "mock-secret" with self.assertRaises(ValueError) as context: - OSCProductSTACGenerator("mock-dataset-id", "mock-collection-id") + OSCDatasetSTACGenerator("mock-dataset-id", "mock-collection-id") self.assertIn( "Failed to open Zarr dataset with ID mock-dataset-id", diff --git a/deep_code/tools/publish.py b/deep_code/tools/publish.py index 26b49f3..dccb336 100644 --- a/deep_code/tools/publish.py +++ b/deep_code/tools/publish.py @@ -7,9 +7,10 @@ import fsspec import logging import yaml +from pathlib import Path from deep_code.constants import OSC_REPO_OWNER, OSC_REPO_NAME, OSC_BRANCH_NAME -from deep_code.utils.dataset_stac_generator import OSCProductSTACGenerator +from deep_code.utils.dataset_stac_generator import OSCDatasetSTACGenerator from deep_code.utils.github_automation import GitHubAutomation logger = logging.getLogger(__name__) @@ -50,14 +51,14 @@ def publish_dataset(self, dataset_config_path: str): with fsspec.open(dataset_config_path, "r") as file: dataset_config = yaml.safe_load(file) - dataset_id = dataset_config.get("dataset-id") - collection_id = dataset_config.get("collection-id") - documentation_link = dataset_config.get("documentation-link") - access_link = dataset_config.get("access-link") - dataset_status = dataset_config.get("dataset-status") - osc_region = dataset_config.get("dataset-region") - dataset_theme = dataset_config.get("dataset-theme") - cf_params = dataset_config.get("cf-parameter") + dataset_id = dataset_config.get("dataset_id") + collection_id = dataset_config.get("collection_id") + documentation_link = dataset_config.get("documentation_link") + access_link = dataset_config.get("access_link") + dataset_status = dataset_config.get("dataset_status") + osc_region = dataset_config.get("osc_region") + osc_themes = dataset_config.get("osc_themes") + cf_params = dataset_config.get("cf_parameter") if not dataset_id or not collection_id: raise ValueError( @@ -67,17 +68,20 @@ def publish_dataset(self, dataset_config_path: str): try: logger.info("Generating STAC collection...") - generator = OSCProductSTACGenerator( + generator = OSCDatasetSTACGenerator( dataset_id=dataset_id, collection_id=collection_id, documentation_link=documentation_link, access_link=access_link, osc_status=dataset_status, osc_region=osc_region, - osc_themes=dataset_theme, + osc_themes=osc_themes, cf_params=cf_params, ) - collection = generator.build_stac_collection() + # get variables from the datasets + variable_ids = generator.get_variable_ids() + # build STAC collection for the dataset + ds_collection = generator.build_dataset_stac_collection() file_path = f"products/{collection_id}/collection.json" logger.info("Automating GitHub tasks...") @@ -85,13 +89,41 @@ def publish_dataset(self, dataset_config_path: str): self.github_automation.clone_repository() OSC_NEW_BRANCH_NAME = OSC_BRANCH_NAME + "-" + collection_id self.github_automation.create_branch(OSC_NEW_BRANCH_NAME) - self.github_automation.add_file(file_path, collection.to_dict()) + + for var_id in variable_ids: + var_file_path = f"variables/{var_id}/catalog.json" + if not self.github_automation.file_exists(var_file_path): + logger.info( + f"Variable catalog for {var_id} does not exist. Creating..." + ) + var_metadata = generator.variables_metadata.get(var_id) + var_catalog = generator.build_variable_catalog(var_metadata) + self.github_automation.add_file( + var_file_path, var_catalog.to_dict() + ) + else: + logger.info( + f"Variable catalog already exists for {var_id}. so add the " + f"product as child link..." + ) + full_path = ( + Path(self.github_automation.local_clone_dir) / var_file_path + ) + self.github_automation.add_file( + var_file_path, + generator.update_existing_variable_catalog( + full_path, var_id + ).to_dict(), + ) + + self.github_automation.add_file(file_path, ds_collection.to_dict()) + self.github_automation.commit_and_push( OSC_NEW_BRANCH_NAME, f"Add new collection:{collection_id}" ) pr_url = self.github_automation.create_pull_request( OSC_NEW_BRANCH_NAME, - f"Add new collection", + f"Add new dataset collection", "This PR adds a new collection to the repository.", ) diff --git a/deep_code/utils/dataset_stac_generator.py b/deep_code/utils/dataset_stac_generator.py index 21f4cf8..8889387 100644 --- a/deep_code/utils/dataset_stac_generator.py +++ b/deep_code/utils/dataset_stac_generator.py @@ -9,13 +9,15 @@ from datetime import datetime, timezone import pandas as pd -from pystac import Collection, Extent, Link, SpatialExtent, TemporalExtent +import requests +from pystac import Collection, Extent, Link, SpatialExtent, TemporalExtent, Catalog +from urllib.parse import quote_plus from xcube.core.store import new_data_store from deep_code.utils.osc_extension import OscExtension -class OSCProductSTACGenerator: +class OSCDatasetSTACGenerator: """Generates OSC STAC Collections for a product from Zarr datasets. Args: @@ -53,6 +55,7 @@ def __init__( self.cf_params = cf_params or {} self.logger = logging.getLogger(__name__) self.dataset = self._open_dataset() + self.variables_metadata = self.get_variables_metadata() def _open_dataset(self): """Open the dataset using a S3 store as a xarray Dataset.""" @@ -170,29 +173,6 @@ def _get_temporal_extent(self) -> TemporalExtent: def _normalize_name(name: str | None) -> str | None: return name.replace(" ", "-").lower() if name else None - def _get_variables(self) -> list[str]: - """Extracts variable names or descriptions from the dataset. - - Variables are prioritized based on their `long_name` or `standard_name` - attributes. If neither is available, the variable's key from - `dataset.data_vars.keys()` is used. - - Returns: - A list of variable names or descriptions. - """ - variables = [] - for var_name, variable in self.dataset.data_vars.items(): - long_name = self._normalize_name(variable.attrs.get("long_name")) - standard_name = self._normalize_name(variable.attrs.get("standard_name")) - if not long_name and not standard_name: - self.logger.error( - f"Metadata missing for variable '{var_name}': 'long_name' and " - f"'standard_name' attributes are not available." - ) - # Prioritize 'long_name', fallback to 'standard_name', then use variable key - variables.append(long_name or standard_name or var_name) - return variables - def _get_general_metadata(self) -> dict: return { "description": self.dataset.attrs.get( @@ -200,36 +180,173 @@ def _get_general_metadata(self) -> dict: ) } - def _get_variable_metadata(self, var_name, var_data) -> dict: - """Extract metadata from a single variable's attributes. + def extract_metadata_for_variable(self, variable_data) -> dict: + """Extract metadata for a single variable.""" + long_name = variable_data.attrs.get("long_name") + standard_name = variable_data.attrs.get("standard_name") + title = long_name or standard_name or variable_data.name + description = variable_data.attrs.get("description", "No variable description") + gcmd_keyword_url = variable_data.attrs.get("gcmd_keyword_url") + return { + "variable_id": self._normalize_name(title), + "description": description, + "gcmd_keyword_url": gcmd_keyword_url, + } + + def get_variable_ids(self) -> list[str]: + """Get variable IDs for all variables in the dataset.""" + return list(self.variables_metadata.keys()) + + def get_variables_metadata(self) -> dict[str, dict]: + """Extract metadata for all variables in the dataset.""" + variables_metadata = {} + for var_name, variable in self.dataset.data_vars.items(): + var_metadata = self.extract_metadata_for_variable(variable) + variables_metadata[var_metadata.get("variable_id")] = var_metadata + return variables_metadata + + def _add_gcmd_link_to_var_catalog( + self, var_catalog: Catalog, var_metadata: dict + ) -> None: + """ + Checks for a GCMD keyword URL in var_metadata, adds a 'via' link to the catalog + pointing to the GCMD Keyword Viewer. Args: - var_name: The raw variable name in the dataset. - var_data: An xarray DataArray containing variable data and attrs. + var_catalog: The PySTAC Catalog to which we want to add the link. + var_metadata: Dictionary containing metadata about the variable, + including 'gcmd_keyword_url'. + """ + gcmd_keyword_url = var_metadata.get("gcmd_keyword_url") + if not gcmd_keyword_url: + self.logger.debug( + f"No gcmd_keyword_url in var_metadata. Skipping adding GCMD link in " + f'the {var_metadata.get("variable_id")} catalog' + ) + return + var_catalog.add_link( + Link( + rel="via", + target=gcmd_keyword_url, + title="Description", + media_type="text/html", + ) + ) + self.logger.info( + f'Added GCMD link for {var_metadata.get("variable_id")} ' + f"catalog {gcmd_keyword_url}." + ) + + def build_variable_catalog(self, var_metadata) -> Catalog: + """Build an OSC STAC Catalog for the variables in the dataset. Returns: - A dict with 'id', 'title', and 'description'. + A pystac.Catalog object. """ - long_name = var_data.attrs.get("long_name") - standard_name = var_data.attrs.get("standard_name") - title = long_name or standard_name or var_name + var_id = var_metadata.get("variable_id") + concepts = [{"id": theme} for theme in self.osc_themes] - normalized_title = self._normalize_name(title) + themes = [ + { + "scheme": "https://github.com/stac-extensions/osc#theme", + "concepts": concepts, + } + ] - description = var_data.attrs.get("description", "No variable description") + now_iso = datetime.now(timezone.utc).isoformat() - return {"id": var_name, "title": normalized_title, "description": description} + # Create a PySTAC Catalog object + var_catalog = Catalog( + id=var_id, + description=var_metadata.get("description"), + title=var_id, + stac_extensions=[ + "https://stac-extensions.github.io/themes/v1.0.0/schema.json" + ], + ) - def build_stac_collection(self) -> Collection: - """ - Build an OSC STAC Collection for the dataset. + var_catalog.stac_version = "1.0.0" + var_catalog.extra_fields["updated"] = now_iso + var_catalog.keywords = [] + + # Add the 'themes' block (from your example JSON) + var_catalog.extra_fields["themes"] = themes - :return: A pystac.Collection object. + var_catalog.remove_links("root") + # Add relevant links + var_catalog.add_link( + Link( + rel="root", + target="../../catalog.json", + media_type="application/json", + title="Open Science Catalog", + ) + ) + + # 'child' link: points to the product (or one of its collections) using this variable + var_catalog.add_link( + Link( + rel="child", + target=f"../../products/{self.collection_id}/collection.json", + media_type="application/json", + title=self.collection_id, + ) + ) + + # 'parent' link: back up to the variables overview + var_catalog.add_link( + Link( + rel="parent", + target="../catalog.json", + media_type="application/json", + title="Variables", + ) + ) + # Add gcmd link for the variable definition + self._add_gcmd_link_to_var_catalog(var_catalog, var_metadata) + + self_href = ( + f"https://esa-earthcode.github.io/open-science-catalog-metadata/variables" + f"/{var_id}/catalog.json" + ) + # 'self' link: the direct URL where this JSON is hosted + var_catalog.set_self_href(self_href) + + return var_catalog + + def update_existing_variable_catalog(self, var_file_path, var_id) -> Catalog: + existing_catalog = Catalog.from_file(var_file_path) + now_iso = datetime.now(timezone.utc).isoformat() + existing_catalog.extra_fields["updated"] = now_iso + + # add 'child' link as the product + existing_catalog.add_link( + Link( + rel="child", + target=f"../../products/{self.collection_id}/collection.json", + media_type="application/json", + title=self.collection_id, + ) + ) + self_href = ( + f"https://esa-earthcode.github.io/open-science-catalog-metadata/variables" + f"/{var_id}/catalog.json" + ) + # 'self' link: the direct URL where this JSON is hosted + existing_catalog.set_self_href(self_href) + + return existing_catalog + + def build_dataset_stac_collection(self) -> Collection: + """Build an OSC STAC Collection for the dataset. + + Returns: + A pystac.Collection object. """ try: spatial_extent = self._get_spatial_extent() temporal_extent = self._get_temporal_extent() - variables = self._get_variables() + variables = self.get_variable_ids() general_metadata = self._get_general_metadata() except ValueError as e: raise ValueError(f"Metadata extraction failed: {e}") @@ -260,6 +377,7 @@ def build_stac_collection(self) -> Collection: now_iso = datetime.now(timezone.utc).isoformat() collection.extra_fields["created"] = now_iso collection.extra_fields["updated"] = now_iso + collection.title = self.collection_id # Remove any existing root link and re-add it properly collection.remove_links("root") @@ -284,6 +402,16 @@ def build_stac_collection(self) -> Collection: title="Products", ) ) + # Add variables ref + for var in variables: + collection.add_link( + Link( + rel="related", + target=f"../../varibales/{var}/catalog.json", + media_type="application/json", + title="Variable: " + var, + ) + ) self_href = ( "https://esa-earthcode.github.io/" diff --git a/deep_code/utils/github_automation.py b/deep_code/utils/github_automation.py index d934d2a..0218c13 100644 --- a/deep_code/utils/github_automation.py +++ b/deep_code/utils/github_automation.py @@ -113,3 +113,9 @@ def clean_up(self): subprocess.run(["rm", "-rf", self.local_clone_dir]) except subprocess.CalledProcessError as e: raise RuntimeError(f"Failed to clean-up local repository: {e}") + + def file_exists(self, file_path) -> bool: + full_path = Path(self.local_clone_dir) / file_path + exists = os.path.isfile(full_path) + logging.debug(f"Checking existence of {full_path}: {exists}") + return exists