From 2b6891a70c26fbaef37a22fcbe17ed6b036ffd9d Mon Sep 17 00:00:00 2001
From: tejas
Date: Fri, 10 Jan 2025 18:07:22 +0100
Subject: [PATCH 1/7] current state 10.1

---
 deep_code/cli/publish.py                  |   5 +-
 .../utils/test_dataset_stac_generator.py  |   2 +-
 deep_code/tools/publish.py                |  11 +-
 deep_code/utils/dataset_stac_generator.py | 129 ++++++++++++------
 4 files changed, 99 insertions(+), 48 deletions(-)

diff --git a/deep_code/cli/publish.py b/deep_code/cli/publish.py
index 48b1e63..a9da0f3 100644
--- a/deep_code/cli/publish.py
+++ b/deep_code/cli/publish.py
@@ -10,10 +10,7 @@
 
 
 @click.command(name="publish-dataset")
-@click.argument(
-    "dataset_config",
-    type=click.Path(exists=True)
-)
+@click.argument("dataset_config", type=click.Path(exists=True))
 def publish_dataset(dataset_config):
     """Request publishing a dataset to the open science catalogue.
     """
diff --git a/deep_code/tests/utils/test_dataset_stac_generator.py b/deep_code/tests/utils/test_dataset_stac_generator.py
index 12321b2..8fc5e39 100644
--- a/deep_code/tests/utils/test_dataset_stac_generator.py
+++ b/deep_code/tests/utils/test_dataset_stac_generator.py
@@ -66,7 +66,7 @@ def test_get_temporal_extent(self):
 
     def test_get_variables(self):
         """Test variable extraction."""
-        variables = self.generator._get_variables()
+        variables = self.generator._get_variable_ids()
         self.assertEqual(variables, ["var1", "var2"])
 
     def test_get_general_metadata(self):
diff --git a/deep_code/tools/publish.py b/deep_code/tools/publish.py
index 26b49f3..1d36fc9 100644
--- a/deep_code/tools/publish.py
+++ b/deep_code/tools/publish.py
@@ -77,21 +77,26 @@ def publish_dataset(self, dataset_config_path: str):
                 osc_themes=dataset_theme,
                 cf_params=cf_params,
             )
-            collection = generator.build_stac_collection()
+            var_catalogs = generator.get_variables_and_build_catalog()
+            ds_collection = generator.build_stac_collection()
             file_path = f"products/{collection_id}/collection.json"
 
             logger.info("Automating GitHub tasks...")
+            self.github_automation.fork_repository()
             self.github_automation.clone_repository()
             OSC_NEW_BRANCH_NAME = OSC_BRANCH_NAME + "-" + collection_id
             self.github_automation.create_branch(OSC_NEW_BRANCH_NAME)
-            self.github_automation.add_file(file_path, collection.to_dict())
+            self.github_automation.add_file(file_path, ds_collection.to_dict())
+            for var_id, var_catalog in var_catalogs.items():
+                var_file_path = f"variables/{var_id}/catalog.json"
+                self.github_automation.add_file(var_file_path, var_catalog.to_dict())
             self.github_automation.commit_and_push(
                 OSC_NEW_BRANCH_NAME, f"Add new collection:{collection_id}"
             )
 
             pr_url = self.github_automation.create_pull_request(
                 OSC_NEW_BRANCH_NAME,
-                f"Add new collection",
+                f"Add new dataset collection",
                 "This PR adds a new collection to the repository.",
             )
diff --git a/deep_code/utils/dataset_stac_generator.py b/deep_code/utils/dataset_stac_generator.py
index 21f4cf8..8ccd49e 100644
--- a/deep_code/utils/dataset_stac_generator.py
+++ b/deep_code/utils/dataset_stac_generator.py
@@ -9,7 +9,7 @@
 from datetime import datetime, timezone
 
 import pandas as pd
-from pystac import Collection, Extent, Link, SpatialExtent, TemporalExtent
+from pystac import Collection, Extent, Link, SpatialExtent, TemporalExtent, Catalog
 from xcube.core.store import new_data_store
 
 from deep_code.utils.osc_extension import OscExtension
@@ -170,29 +170,6 @@ def _get_temporal_extent(self) -> TemporalExtent:
     def _normalize_name(name: str | None) -> str | None:
         return name.replace(" ", "-").lower() if name else None
 
-    def _get_variables(self) -> list[str]:
"""Extracts variable names or descriptions from the dataset. - - Variables are prioritized based on their `long_name` or `standard_name` - attributes. If neither is available, the variable's key from - `dataset.data_vars.keys()` is used. - - Returns: - A list of variable names or descriptions. - """ - variables = [] - for var_name, variable in self.dataset.data_vars.items(): - long_name = self._normalize_name(variable.attrs.get("long_name")) - standard_name = self._normalize_name(variable.attrs.get("standard_name")) - if not long_name and not standard_name: - self.logger.error( - f"Metadata missing for variable '{var_name}': 'long_name' and " - f"'standard_name' attributes are not available." - ) - # Prioritize 'long_name', fallback to 'standard_name', then use variable key - variables.append(long_name or standard_name or var_name) - return variables - def _get_general_metadata(self) -> dict: return { "description": self.dataset.attrs.get( @@ -200,36 +177,108 @@ def _get_general_metadata(self) -> dict: ) } - def _get_variable_metadata(self, var_name, var_data) -> dict: - """Extract metadata from a single variable's attributes. + def _extract_variable_metadata(self, variable_data) -> dict: + """Extract metadata for a single variable.""" + long_name = variable_data.attrs.get("long_name") + standard_name = variable_data.attrs.get("standard_name") + title = long_name or standard_name or variable_data.name + description = variable_data.attrs.get("description", "No variable description") + return {"variable_id": self._normalize_name(title), "description": description} + + def _get_variable_ids(self) -> list[str]: + """Extract variable IDs for each variable in the dataset.""" + return [ + self._extract_variable_metadata(variable)["variable_id"] + for variable in self.dataset.data_vars.values() + ] + + def get_variables_and_build_catalog(self) -> dict[str, Catalog]: + """Extract metadata and STAC catalog for each variable in the dataset.""" + var_catalogs = {} + for var_name, variable in self.dataset.data_vars.items(): + var_metadata = self._extract_variable_metadata(variable) + var_catalog = self.build_variable_catalog(var_metadata) + var_catalogs[var_name] = var_catalog + return var_catalogs - Args: - var_name: The raw variable name in the dataset. - var_data: An xarray DataArray containing variable data and attrs. + def build_variable_catalog(self, var_metadata) -> Catalog: + """Build an OSC STAC Catalog for the variables in the dataset. Returns: - A dict with 'id', 'title', and 'description'. + A pystac.Catalog object. 
""" - long_name = var_data.attrs.get("long_name") - standard_name = var_data.attrs.get("standard_name") - title = long_name or standard_name or var_name + var_id = var_metadata.get("variable_id") + # Set 'themes' to an empty list if none given + themes = self.osc_themes or [] + + now_iso = datetime.now(timezone.utc).isoformat() + + # Create a PySTAC Catalog object + var_catalog = Catalog( + id=var_id, + description=var_metadata.get("description"), + stac_extensions=[ + "https://stac-extensions.github.io/themes/v1.0.0/schema.json" + ], + ) + + var_catalog.stac_version = "1.0.0" + var_catalog.extra_fields["updated"] = now_iso + var_catalog.keywords = [] + + # Add the 'themes' block (from your example JSON) + var_catalog.extra_fields["themes"] = themes + + var_catalog.remove_links("root") + # Add relevant links + var_catalog.add_link( + Link( + rel="root", + target="../../catalog.json", + media_type="application/json", + title="Open Science Catalog", + ) + ) + + # 'child' link: points to the product (or one of its collections) using this variable + var_catalog.add_link( + Link( + rel="child", + target=f"../../products/{self.collection_id}/collection.json", + media_type="application/json", + title=self.collection_id, + ) + ) - normalized_title = self._normalize_name(title) + # 'parent' link: back up to the variables overview + var_catalog.add_link( + Link( + rel="parent", + target="../catalog.json", + media_type="application/json", + title="Variables", + ) + ) - description = var_data.attrs.get("description", "No variable description") + self_href = ( + f"https://esa-earthcode.github.io/open-science-catalog-metadata/variables" + f"/{var_id}/catalog.json" + ) + # 'self' link: the direct URL where this JSON is hosted + var_catalog.set_self_href(self_href) - return {"id": var_name, "title": normalized_title, "description": description} + return var_catalog def build_stac_collection(self) -> Collection: - """ - Build an OSC STAC Collection for the dataset. + """Build an OSC STAC Collection for the dataset. - :return: A pystac.Collection object. + Returns: + A pystac.Collection object. 
""" try: spatial_extent = self._get_spatial_extent() temporal_extent = self._get_temporal_extent() - variables = self._get_variables() + variables = self._get_variable_ids() general_metadata = self._get_general_metadata() except ValueError as e: raise ValueError(f"Metadata extraction failed: {e}") From b0aabdad788291154f2ef2b5e2386ae6812fa279 Mon Sep 17 00:00:00 2001 From: tejas Date: Tue, 14 Jan 2025 16:45:04 +0100 Subject: [PATCH 2/7] implemented build variable reference feature --- deep_code/tests/tools/test_publish.py | 22 +-- .../utils/test_dataset_stac_generator.py | 12 +- deep_code/tools/publish.py | 52 ++++-- deep_code/utils/dataset_stac_generator.py | 153 +++++++++++++++++- deep_code/utils/github_automation.py | 6 + environment.yml | 1 + pyproject.toml | 1 + 7 files changed, 210 insertions(+), 37 deletions(-) diff --git a/deep_code/tests/tools/test_publish.py b/deep_code/tests/tools/test_publish.py index 47c9961..6be4601 100644 --- a/deep_code/tests/tools/test_publish.py +++ b/deep_code/tests/tools/test_publish.py @@ -61,14 +61,14 @@ def test_publish_dataset_success( github-token: test-token """ dataset_yaml_content = """ - dataset-id: test-dataset - collection-id: test-collection - documentation-link: http://example.com/doc - access-link: http://example.com/access - dataset-status: ongoing - dataset-region: Global - dataset-theme: ["climate"] - cf-parameter: [] + dataset_id: test-dataset + collection_id: test-collection + documentation_link: http://example.com/doc + access_link: http://example.com/access + dataset_status: ongoing + dataset_region: Global + osc_theme: ["climate"] + cf_parameter: [] """ mock_fsspec_open.side_effect = [ mock_open(read_data=git_yaml_content)(), @@ -102,8 +102,8 @@ def test_publish_dataset_success( "links": [], "stac_version": "1.0.0", } - with patch("deep_code.tools.publish.OSCProductSTACGenerator") as mock_generator: - mock_generator.return_value.build_stac_collection.return_value = ( + with patch("deep_code.tools.publish.OSCDatasetSTACGenerator") as mock_generator: + mock_generator.return_value.build_dataset_stac_collection.return_value = ( mock_collection ) @@ -111,7 +111,7 @@ def test_publish_dataset_success( publisher = DatasetPublisher() publisher.publish_dataset("/fake/path/to/dataset-config.yaml") - # 6Assert that we called git clone with /tmp/temp_repo + # Assert that we called git clone with /tmp/temp_repo # Because expanduser("~") is now patched to /tmp, the actual path is /tmp/temp_repo auth_url = "https://test-user:test-token@github.com/test-user/open-science-catalog-metadata-testing.git" mock_subprocess_run.assert_any_call( diff --git a/deep_code/tests/utils/test_dataset_stac_generator.py b/deep_code/tests/utils/test_dataset_stac_generator.py index 8fc5e39..ebceb60 100644 --- a/deep_code/tests/utils/test_dataset_stac_generator.py +++ b/deep_code/tests/utils/test_dataset_stac_generator.py @@ -7,7 +7,7 @@ from unittest.mock import patch, MagicMock from xarray import Dataset -from deep_code.utils.dataset_stac_generator import OSCProductSTACGenerator +from deep_code.utils.dataset_stac_generator import OSCDatasetSTACGenerator class TestOSCProductSTACGenerator(unittest.TestCase): @@ -36,7 +36,7 @@ def setUp(self, mock_data_store): mock_store.open_data.return_value = self.mock_dataset mock_data_store.return_value = mock_store - self.generator = OSCProductSTACGenerator( + self.generator = OSCDatasetSTACGenerator( dataset_id="mock-dataset-id", collection_id="mock-collection-id", access_link="s3://mock-bucket/mock-dataset", @@ -78,7 +78,7 @@ def 
From b0aabdad788291154f2ef2b5e2386ae6812fa279 Mon Sep 17 00:00:00 2001
From: tejas
Date: Tue, 14 Jan 2025 16:45:04 +0100
Subject: [PATCH 2/7] implemented build variable reference feature

---
 deep_code/tests/tools/test_publish.py     |  22 +--
 .../utils/test_dataset_stac_generator.py  |  12 +-
 deep_code/tools/publish.py                |  52 ++++--
 deep_code/utils/dataset_stac_generator.py | 153 +++++++++++++++++-
 deep_code/utils/github_automation.py      |   6 +
 environment.yml                           |   1 +
 pyproject.toml                            |   1 +
 7 files changed, 210 insertions(+), 37 deletions(-)

diff --git a/deep_code/tests/tools/test_publish.py b/deep_code/tests/tools/test_publish.py
index 47c9961..6be4601 100644
--- a/deep_code/tests/tools/test_publish.py
+++ b/deep_code/tests/tools/test_publish.py
@@ -61,14 +61,14 @@ def test_publish_dataset_success(
         github-token: test-token
     """
     dataset_yaml_content = """
-    dataset-id: test-dataset
-    collection-id: test-collection
-    documentation-link: http://example.com/doc
-    access-link: http://example.com/access
-    dataset-status: ongoing
-    dataset-region: Global
-    dataset-theme: ["climate"]
-    cf-parameter: []
+    dataset_id: test-dataset
+    collection_id: test-collection
+    documentation_link: http://example.com/doc
+    access_link: http://example.com/access
+    dataset_status: ongoing
+    osc_region: Global
+    osc_themes: ["climate"]
+    cf_parameter: []
     """
     mock_fsspec_open.side_effect = [
         mock_open(read_data=git_yaml_content)(),
@@ -102,8 +102,8 @@ def test_publish_dataset_success(
         "links": [],
         "stac_version": "1.0.0",
     }
-    with patch("deep_code.tools.publish.OSCProductSTACGenerator") as mock_generator:
-        mock_generator.return_value.build_stac_collection.return_value = (
+    with patch("deep_code.tools.publish.OSCDatasetSTACGenerator") as mock_generator:
+        mock_generator.return_value.build_dataset_stac_collection.return_value = (
             mock_collection
         )
@@ -111,7 +111,7 @@ def test_publish_dataset_success(
         publisher = DatasetPublisher()
         publisher.publish_dataset("/fake/path/to/dataset-config.yaml")
 
-        # 6Assert that we called git clone with /tmp/temp_repo
+        # Assert that we called git clone with /tmp/temp_repo
         # Because expanduser("~") is now patched to /tmp, the actual path is /tmp/temp_repo
         auth_url = "https://test-user:test-token@github.com/test-user/open-science-catalog-metadata-testing.git"
         mock_subprocess_run.assert_any_call(
diff --git a/deep_code/tests/utils/test_dataset_stac_generator.py b/deep_code/tests/utils/test_dataset_stac_generator.py
index 8fc5e39..ebceb60 100644
--- a/deep_code/tests/utils/test_dataset_stac_generator.py
+++ b/deep_code/tests/utils/test_dataset_stac_generator.py
@@ -7,7 +7,7 @@
 from unittest.mock import patch, MagicMock
 from xarray import Dataset
 
-from deep_code.utils.dataset_stac_generator import OSCProductSTACGenerator
+from deep_code.utils.dataset_stac_generator import OSCDatasetSTACGenerator
 
 
 class TestOSCProductSTACGenerator(unittest.TestCase):
@@ -36,7 +36,7 @@ def setUp(self, mock_data_store):
         mock_store.open_data.return_value = self.mock_dataset
         mock_data_store.return_value = mock_store
 
-        self.generator = OSCProductSTACGenerator(
+        self.generator = OSCDatasetSTACGenerator(
             dataset_id="mock-dataset-id",
             collection_id="mock-collection-id",
             access_link="s3://mock-bucket/mock-dataset",
@@ -78,7 +78,7 @@ def test_get_general_metadata(self):
     @patch("pystac.Collection.set_self_href")
     def test_build_stac_collection(self, mock_set_self_href, mock_add_link):
         """Test STAC collection creation."""
-        collection = self.generator.build_stac_collection()
+        collection = self.generator.build_dataset_stac_collection()
         self.assertIsInstance(collection, Collection)
         self.assertEqual(collection.id, "mock-collection-id")
         self.assertEqual(collection.description, "Mock dataset for testing.")
@@ -116,7 +116,7 @@ def test_open_dataset_success_public_store(self, mock_logger, mock_new_data_stor
         mock_store.open_data.return_value = "mock_dataset"
 
         # Instantiate the generator (this will implicitly call _open_dataset)
-        generator = OSCProductSTACGenerator("mock-dataset-id", "mock-collection-id")
+        generator = OSCDatasetSTACGenerator("mock-dataset-id", "mock-collection-id")
 
         # Validate that the dataset is assigned correctly
         self.assertEqual(generator.dataset, "mock_dataset")
@@ -157,7 +157,7 @@ def test_open_dataset_success_authenticated_store(
         os.environ["S3_USER_STORAGE_KEY"] = "mock-key"
         os.environ["S3_USER_STORAGE_SECRET"] = "mock-secret"
 
-        generator = OSCProductSTACGenerator("mock-dataset-id", "mock-collection-id")
+        generator = OSCDatasetSTACGenerator("mock-dataset-id", "mock-collection-id")
 
         # Validate that the dataset was successfully opened with the authenticated store
         self.assertEqual(generator.dataset, "mock_dataset")
@@ -195,7 +195,7 @@ def test_open_dataset_failure(self, mock_logger, mock_new_data_store):
         os.environ["S3_USER_STORAGE_SECRET"] = "mock-secret"
 
         with self.assertRaises(ValueError) as context:
-            OSCProductSTACGenerator("mock-dataset-id", "mock-collection-id")
+            OSCDatasetSTACGenerator("mock-dataset-id", "mock-collection-id")
 
         self.assertIn(
             "Failed to open Zarr dataset with ID mock-dataset-id",
diff --git a/deep_code/tools/publish.py b/deep_code/tools/publish.py
index 1d36fc9..8982ba1 100644
--- a/deep_code/tools/publish.py
+++ b/deep_code/tools/publish.py
@@ -7,9 +7,10 @@
 import fsspec
 import logging
 import yaml
+from pathlib import Path
 
 from deep_code.constants import OSC_REPO_OWNER, OSC_REPO_NAME, OSC_BRANCH_NAME
-from deep_code.utils.dataset_stac_generator import OSCProductSTACGenerator
+from deep_code.utils.dataset_stac_generator import OSCDatasetSTACGenerator
 from deep_code.utils.github_automation import GitHubAutomation
 
 logger = logging.getLogger(__name__)
@@ -50,14 +51,14 @@ def publish_dataset(self, dataset_config_path: str):
         with fsspec.open(dataset_config_path, "r") as file:
             dataset_config = yaml.safe_load(file)
 
-        dataset_id = dataset_config.get("dataset-id")
-        collection_id = dataset_config.get("collection-id")
-        documentation_link = dataset_config.get("documentation-link")
-        access_link = dataset_config.get("access-link")
-        dataset_status = dataset_config.get("dataset-status")
-        osc_region = dataset_config.get("dataset-region")
-        dataset_theme = dataset_config.get("dataset-theme")
-        cf_params = dataset_config.get("cf-parameter")
+        dataset_id = dataset_config.get("dataset_id")
+        collection_id = dataset_config.get("collection_id")
+        documentation_link = dataset_config.get("documentation_link")
+        access_link = dataset_config.get("access_link")
+        dataset_status = dataset_config.get("dataset_status")
+        osc_region = dataset_config.get("osc_region")
+        osc_themes = dataset_config.get("osc_themes")
+        cf_params = dataset_config.get("cf_parameter")
 
         if not dataset_id or not collection_id:
             raise ValueError(
@@ -67,18 +68,18 @@ def publish_dataset(self, dataset_config_path: str):
 
         try:
             logger.info("Generating STAC collection...")
-            generator = OSCProductSTACGenerator(
+            generator = OSCDatasetSTACGenerator(
                 dataset_id=dataset_id,
                 collection_id=collection_id,
                 documentation_link=documentation_link,
                 access_link=access_link,
                 osc_status=dataset_status,
                 osc_region=osc_region,
-                osc_themes=dataset_theme,
+                osc_themes=osc_themes,
                 cf_params=cf_params,
             )
             var_catalogs = generator.get_variables_and_build_catalog()
-            ds_collection = generator.build_stac_collection()
+            ds_collection = generator.build_dataset_stac_collection()
             file_path = f"products/{collection_id}/collection.json"
 
             logger.info("Automating GitHub tasks...")
@@ -87,10 +88,33 @@ def publish_dataset(self, dataset_config_path: str):
             self.github_automation.clone_repository()
             OSC_NEW_BRANCH_NAME = OSC_BRANCH_NAME + "-" + collection_id
             self.github_automation.create_branch(OSC_NEW_BRANCH_NAME)
-            self.github_automation.add_file(file_path, ds_collection.to_dict())
+
             for var_id, var_catalog in var_catalogs.items():
                 var_file_path = f"variables/{var_id}/catalog.json"
-                self.github_automation.add_file(var_file_path, var_catalog.to_dict())
+                if not self.github_automation.file_exists(var_file_path):
+                    logger.info(
+                        f"Variable catalog for {var_id} does not exist. Creating..."
+                    )
+                    self.github_automation.add_file(
+                        var_file_path, var_catalog.to_dict()
+                    )
+                else:
+                    logger.info(
+                        f"Variable catalog already exists for {var_id}; adding the "
+                        f"product as a child link..."
+                    )
+                    full_path = (
+                        Path(self.github_automation.local_clone_dir) / var_file_path
+                    )
+                    self.github_automation.add_file(
+                        var_file_path,
+                        generator.update_existing_variable_catalog(
+                            full_path, var_id
+                        ).to_dict(),
+                    )
+
+            self.github_automation.add_file(file_path, ds_collection.to_dict())
+
             self.github_automation.commit_and_push(
                 OSC_NEW_BRANCH_NAME, f"Add new collection:{collection_id}"
             )
diff --git a/deep_code/utils/dataset_stac_generator.py b/deep_code/utils/dataset_stac_generator.py
index 8ccd49e..2dd2d31 100644
--- a/deep_code/utils/dataset_stac_generator.py
+++ b/deep_code/utils/dataset_stac_generator.py
@@ -9,13 +9,15 @@
 from datetime import datetime, timezone
 
 import pandas as pd
+import requests
 from pystac import Collection, Extent, Link, SpatialExtent, TemporalExtent, Catalog
+from urllib.parse import quote_plus
 from xcube.core.store import new_data_store
 
 from deep_code.utils.osc_extension import OscExtension
 
 
-class OSCProductSTACGenerator:
+class OSCDatasetSTACGenerator:
     """Generates OSC STAC Collections for a product from Zarr datasets.
 
     Args:
@@ -183,7 +185,12 @@ def _extract_variable_metadata(self, variable_data) -> dict:
         standard_name = variable_data.attrs.get("standard_name")
         title = long_name or standard_name or variable_data.name
         description = variable_data.attrs.get("description", "No variable description")
-        return {"variable_id": self._normalize_name(title), "description": description}
+        gcmd_keyword = variable_data.attrs.get("gcmd_keyword")
+        return {
+            "variable_id": self._normalize_name(title),
+            "description": description,
+            "gcmd_keyword": gcmd_keyword,
+        }
 
     def _get_variable_ids(self) -> list[str]:
         """Extract variable IDs for each variable in the dataset."""
@@ -198,9 +205,100 @@ def get_variables_and_build_catalog(self) -> dict[str, Catalog]:
         for var_name, variable in self.dataset.data_vars.items():
             var_metadata = self._extract_variable_metadata(variable)
             var_catalog = self.build_variable_catalog(var_metadata)
-            var_catalogs[var_name] = var_catalog
+            var_catalogs[var_metadata.get("variable_id")] = var_catalog
         return var_catalogs
 
+    @staticmethod
+    def _get_gcmd_scheme_uuid(keyword: str) -> str | None:
+        """Query NASA's GCMD KMS concepts for a given keyword, and return the first matching UUID.
+
+        Args:
+            keyword: The GCMD keyword to look up (e.g., "EVAPORATION").
+
+        Returns:
+            The UUID string if found, otherwise None.
+        """
+        url = "https://api.gcmd.earthdata.nasa.gov/kms/concepts/concepts"
+        params = {"keyword": keyword, "format": "json"}
+
+        resp = requests.get(url, params=params)
+        if resp.status_code != 200:
+            # Request failed
+            return None
+
+        data = resp.json()
+        concepts = data.get("concepts", [])
+        # Loop through concepts and find the one that matches our keyword in short_name (case-insensitive).
+        for concept in concepts:
+            if concept.get("short_name", "").upper() == keyword.upper():
+                return concept.get("uuid")
+
+        return None
+
+    @staticmethod
+    def _build_gcmd_viewer_url(
+        keyword: str, scheme_uuid: str, scheme: str = "Earth Science"
+    ) -> str:
+        """Builds the GCMD Keyword Viewer URL for a given keyword and UUID.
+
+        Args:
+            keyword: GCMD keyword (e.g., "EVAPORATION").
+            scheme_uuid: The UUID for this keyword (e.g., "b68ab978-6db6-49ee-84e2-5f37b461a998").
+            scheme: The GCMD scheme, default is "Earth Science".
+
+        Returns:
+            The fully qualified GCMD viewer URL, e.g.:
+            https://gcmd.earthdata.nasa.gov/KeywordViewer/scheme/Earth%20Science/...
+        """
+        # URL-encode the scheme and keyword
+        url_scheme = quote_plus(scheme)
+        url_keyword = quote_plus(keyword)
+
+        # Construct the GCMD viewer URL
+        gcmd_url = (
+            f"https://gcmd.earthdata.nasa.gov/KeywordViewer/scheme/{url_scheme}/{scheme_uuid}"
+            f"?gtm_keyword={url_keyword}&gtm_scheme={url_scheme}"
+        )
+
+        return gcmd_url
+
+    def _add_gcmd_link_to_var_catalog(
+        self, var_catalog: Catalog, var_metadata: dict
+    ) -> None:
+        """
+        Checks for a GCMD keyword in var_metadata, retrieves its scheme UUID,
+        and if found, adds a 'via' link to the catalog pointing to the GCMD Keyword Viewer.
+
+        Args:
+            var_catalog: The PySTAC Catalog to which we want to add the link.
+            var_metadata: Dictionary containing metadata about the variable,
+                including 'gcmd_keyword'.
+        """
+        gcmd_keyword = var_metadata.get("gcmd_keyword")
+        if not gcmd_keyword:
+            self.logger.debug("No `gcmd_keyword` in var_metadata. Skipping GCMD link.")
+            return
+
+        # Retrieve scheme UUID from the NASA KMS API
+        scheme_uuid = self._get_gcmd_scheme_uuid(gcmd_keyword)
+        if not scheme_uuid:
+            self.logger.debug(
+                f"No GCMD UUID found for keyword '{gcmd_keyword}'. Skipping GCMD link."
+            )
+            return
+
+        gcmd_url = self._build_gcmd_viewer_url(gcmd_keyword, scheme_uuid)
+
+        # Add `rel="via"` link for the GCMD viewer
+        var_catalog.add_link(
+            Link(
+                rel="via", target=gcmd_url, title="Description", media_type="text/html"
+            )
+        )
+        self.logger.info(
+            f"Added GCMD link for keyword '{gcmd_keyword}' (UUID: {scheme_uuid})."
+        )
+
     def build_variable_catalog(self, var_metadata) -> Catalog:
         """Build an OSC STAC Catalog for the variables in the dataset.
 
@@ -208,8 +306,14 @@ def build_variable_catalog(self, var_metadata) -> Catalog:
             A pystac.Catalog object.
         """
         var_id = var_metadata.get("variable_id")
-        # Set 'themes' to an empty list if none given
-        themes = self.osc_themes or []
+        concepts = [{"id": theme} for theme in self.osc_themes]
+
+        themes = [
+            {
+                "scheme": "https://github.com/stac-extensions/osc#theme",
+                "concepts": concepts,
+            }
+        ]
 
         now_iso = datetime.now(timezone.utc).isoformat()
 
@@ -217,6 +321,7 @@ def build_variable_catalog(self, var_metadata) -> Catalog:
         var_catalog = Catalog(
             id=var_id,
             description=var_metadata.get("description"),
+            title=var_id,
             stac_extensions=[
                 "https://stac-extensions.github.io/themes/v1.0.0/schema.json"
             ],
@@ -259,6 +364,8 @@ def build_variable_catalog(self, var_metadata) -> Catalog:
                 title="Variables",
             )
         )
+        # Add gcmd link for the variable definition
+        self._add_gcmd_link_to_var_catalog(var_catalog, var_metadata)
 
         self_href = (
             f"https://esa-earthcode.github.io/open-science-catalog-metadata/variables"
@@ -269,7 +376,30 @@ def build_variable_catalog(self, var_metadata) -> Catalog:
 
         return var_catalog
 
-    def build_stac_collection(self) -> Collection:
+    def update_existing_variable_catalog(self, var_file_path, var_id) -> Catalog:
+        existing_catalog = Catalog.from_file(var_file_path)
+        now_iso = datetime.now(timezone.utc).isoformat()
+        existing_catalog.extra_fields["updated"] = now_iso
+
+        # add 'child' link as the product
+        existing_catalog.add_link(
+            Link(
+                rel="child",
+                target=f"../../products/{self.collection_id}/collection.json",
+                media_type="application/json",
+                title=self.collection_id,
+            )
+        )
+        self_href = (
+            f"https://esa-earthcode.github.io/open-science-catalog-metadata/variables"
+            f"/{var_id}/catalog.json"
+        )
+        # 'self' link: the direct URL where this JSON is hosted
+        existing_catalog.set_self_href(self_href)
+
+        return existing_catalog
+
+    def build_dataset_stac_collection(self) -> Collection:
         """Build an OSC STAC Collection for the dataset.
 
         Returns:
             A pystac.Collection object.
         """
@@ -309,6 +439,7 @@ def build_dataset_stac_collection(self) -> Collection:
         now_iso = datetime.now(timezone.utc).isoformat()
         collection.extra_fields["created"] = now_iso
         collection.extra_fields["updated"] = now_iso
+        collection.title = self.collection_id
 
         # Remove any existing root link and re-add it properly
         collection.remove_links("root")
@@ -333,6 +464,16 @@ def build_dataset_stac_collection(self) -> Collection:
                 title="Products",
             )
         )
+        # Add variables ref
+        for var in variables:
+            collection.add_link(
+                Link(
+                    rel="related",
+                    target=f"../../variables/{var}/catalog.json",
+                    media_type="application/json",
+                    title="Variable: " + var,
+                )
+            )
 
         self_href = (
             "https://esa-earthcode.github.io/"
diff --git a/deep_code/utils/github_automation.py b/deep_code/utils/github_automation.py
index d934d2a..0218c13 100644
--- a/deep_code/utils/github_automation.py
+++ b/deep_code/utils/github_automation.py
@@ -113,3 +113,9 @@ def clean_up(self):
             subprocess.run(["rm", "-rf", self.local_clone_dir])
         except subprocess.CalledProcessError as e:
             raise RuntimeError(f"Failed to clean-up local repository: {e}")
+
+    def file_exists(self, file_path) -> bool:
+        full_path = Path(self.local_clone_dir) / file_path
+        exists = os.path.isfile(full_path)
+        logging.debug(f"Checking existence of {full_path}: {exists}")
+        return exists
diff --git a/environment.yml b/environment.yml
index 9570a8d..92ba6f2 100644
--- a/environment.yml
+++ b/environment.yml
@@ -11,6 +11,7 @@ dependencies:
   - pandas
   - pystac
   - pyyaml
+  - urllib
   - xcube
   - zarr >=2.11,<3
   # test dependencies
diff --git a/pyproject.toml b/pyproject.toml
index 057f7b8..023efb8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -26,6 +26,7 @@ dependencies = [
     "pandas",
     "pystac",
     "pyyaml",
+    "urllib",
     "xcube-core"
 ]

From b6e3c4102bd907f4028721601932baab35a46a11 Mon Sep 17 00:00:00 2001
From: tejas
Date: Tue, 14 Jan 2025 16:49:02 +0100
Subject: [PATCH 3/7] update env with right conda package name

---
 environment.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/environment.yml b/environment.yml
index 92ba6f2..27256c6 100644
--- a/environment.yml
+++ b/environment.yml
@@ -11,7 +11,7 @@ dependencies:
   - pandas
   - pystac
   - pyyaml
-  - urllib
+  - urllib3
   - xcube
   - zarr >=2.11,<3
   # test dependencies

From 431633218c53fd79dfde09e1851962568e9d560a Mon Sep 17 00:00:00 2001
From: tejas
Date: Tue, 14 Jan 2025 17:21:23 +0100
Subject: [PATCH 4/7] remove urllib3

---
 environment.yml | 1 -
 pyproject.toml  | 1 -
 2 files changed, 2 deletions(-)

diff --git a/environment.yml b/environment.yml
index 27256c6..9570a8d 100644
--- a/environment.yml
+++ b/environment.yml
@@ -11,7 +11,6 @@ dependencies:
   - pandas
   - pystac
   - pyyaml
-  - urllib3
   - xcube
   - zarr >=2.11,<3
   # test dependencies
diff --git a/pyproject.toml b/pyproject.toml
index 023efb8..057f7b8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -26,7 +26,6 @@ dependencies = [
     "pandas",
     "pystac",
     "pyyaml",
-    "urllib",
     "xcube-core"
 ]

From acc54078cd54d5cdae080a6213b4b53503951971 Mon Sep 17 00:00:00 2001
From: tejas
Date: Tue, 14 Jan 2025 17:25:11 +0100
Subject: [PATCH 5/7] update README.md

---
 README.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/README.md b/README.md
index 1d4b381..ce4ed34 100644
--- a/README.md
+++ b/README.md
@@ -9,6 +9,9 @@
 and Python API providing utilities that aid integration of DeepESDL datasets,
 experiments with EarthCODE.
 
+The first release will focus on the publish feature: DeepESDL experiments/workflows
+are published as OGC API Records, and datasets as OSC STAC collections.
+
 ## Setup
 
 ## Install

From cdd908f431c8c7e0c4fdb46756a0b0205ca2203 Mon Sep 17 00:00:00 2001
From: tejas
Date: Wed, 15 Jan 2025 15:57:16 +0100
Subject: [PATCH 6/7] updated logic for variable catalog generation

---
 .../utils/test_dataset_stac_generator.py  |  28 +++--
 deep_code/tools/publish.py                |   9 +-
 deep_code/utils/dataset_stac_generator.py | 116 ++++--------------
 3 files changed, 54 insertions(+), 99 deletions(-)

diff --git a/deep_code/tests/utils/test_dataset_stac_generator.py b/deep_code/tests/utils/test_dataset_stac_generator.py
index ebceb60..95ecb85 100644
--- a/deep_code/tests/utils/test_dataset_stac_generator.py
+++ b/deep_code/tests/utils/test_dataset_stac_generator.py
@@ -28,8 +28,24 @@ def setUp(self, mock_data_store):
         },
         attrs={"description": "Mock dataset for testing.", "title": "Mock Dataset"},
         data_vars={
-            "var1": (("time", "lat", "lon"), np.random.rand(2, 5, 10)),
-            "var2": (("time", "lat", "lon"), np.random.rand(2, 5, 10)),
+            "var1": (
+                ("time", "lat", "lon"),
+                np.random.rand(2, 5, 10),
+                {
+                    "description": "dummy",
+                    "standard_name": "var1",
+                    "gcmd_keyword_url": "https://dummy",
+                },
+            ),
+            "var2": (
+                ("time", "lat", "lon"),
+                np.random.rand(2, 5, 10),
+                {
+                    "description": "dummy",
+                    "standard_name": "var2",
+                    "gcmd_keyword_url": "https://dummy",
+                },
+            ),
         },
     )
     mock_store = MagicMock()
@@ -66,7 +82,7 @@ def test_get_temporal_extent(self):
 
     def test_get_variables(self):
         """Test variable extraction."""
-        variables = self.generator._get_variable_ids()
+        variables = self.generator.get_variable_ids()
         self.assertEqual(variables, ["var1", "var2"])
 
     def test_get_general_metadata(self):
@@ -104,8 +120,6 @@ def test_invalid_temporal_extent(self):
         with self.assertRaises(ValueError):
             self.generator._get_temporal_extent()
 
-
-class TestOpenDataset(unittest.TestCase):
     @patch("deep_code.utils.dataset_stac_generator.new_data_store")
     @patch("deep_code.utils.dataset_stac_generator.logging.getLogger")
     def test_open_dataset_success_public_store(self, mock_logger, mock_new_data_store):
@@ -113,7 +127,7 @@ def test_open_dataset_success_public_store(self, mock_logger, mock_new_data_stor
         # Create a mock store and mock its `open_data` method
         mock_store = MagicMock()
         mock_new_data_store.return_value = mock_store
-        mock_store.open_data.return_value = "mock_dataset"
+        mock_store.open_data.return_value = self.mock_dataset
 
         # Instantiate the generator (this will implicitly call _open_dataset)
         generator = OSCDatasetSTACGenerator("mock-dataset-id", "mock-collection-id")
@@ -151,7 +165,7 @@ def test_open_dataset_success_authenticated_store(
             mock_store,  # Second call (authenticated store) returns a mock store
         ]
 
-        mock_store.open_data.return_value = "mock_dataset"
+        mock_store.open_data.return_value = self.mock_dataset
 
         os.environ["S3_USER_STORAGE_BUCKET"] = "mock-bucket"
         os.environ["S3_USER_STORAGE_KEY"] = "mock-key"
         os.environ["S3_USER_STORAGE_SECRET"] = "mock-secret"
diff --git a/deep_code/tools/publish.py b/deep_code/tools/publish.py
index 8982ba1..dccb336 100644
--- a/deep_code/tools/publish.py
+++ b/deep_code/tools/publish.py
@@ -78,23 +78,26 @@ def publish_dataset(self, dataset_config_path: str):
                 osc_themes=osc_themes,
                 cf_params=cf_params,
             )
-            var_catalogs = generator.get_variables_and_build_catalog()
+            # get variables from the dataset
+            variable_ids = generator.get_variable_ids()
+            # build STAC collection for the dataset
             ds_collection = generator.build_dataset_stac_collection()
             file_path = f"products/{collection_id}/collection.json"
 
             logger.info("Automating GitHub tasks...")
-            self.github_automation.fork_repository()
             self.github_automation.clone_repository()
             OSC_NEW_BRANCH_NAME = OSC_BRANCH_NAME + "-" + collection_id
             self.github_automation.create_branch(OSC_NEW_BRANCH_NAME)
 
-            for var_id, var_catalog in var_catalogs.items():
+            for var_id in variable_ids:
                 var_file_path = f"variables/{var_id}/catalog.json"
                 if not self.github_automation.file_exists(var_file_path):
                     logger.info(
                         f"Variable catalog for {var_id} does not exist. Creating..."
                     )
+                    var_metadata = generator.variables_metadata.get(var_id)
+                    var_catalog = generator.build_variable_catalog(var_metadata)
                     self.github_automation.add_file(
                         var_file_path, var_catalog.to_dict()
                     )
diff --git a/deep_code/utils/dataset_stac_generator.py b/deep_code/utils/dataset_stac_generator.py
index 2dd2d31..8889387 100644
--- a/deep_code/utils/dataset_stac_generator.py
+++ b/deep_code/utils/dataset_stac_generator.py
@@ -55,6 +55,7 @@ def __init__(
         self.cf_params = cf_params or {}
         self.logger = logging.getLogger(__name__)
         self.dataset = self._open_dataset()
+        self.variables_metadata = self.get_variables_metadata()
 
     def _open_dataset(self):
         """Open the dataset using a S3 store as a xarray Dataset."""
@@ -179,124 +180,61 @@ def _get_general_metadata(self) -> dict:
             )
         }
 
-    def _extract_variable_metadata(self, variable_data) -> dict:
+    def extract_metadata_for_variable(self, variable_data) -> dict:
         """Extract metadata for a single variable."""
         long_name = variable_data.attrs.get("long_name")
         standard_name = variable_data.attrs.get("standard_name")
         title = long_name or standard_name or variable_data.name
         description = variable_data.attrs.get("description", "No variable description")
-        gcmd_keyword = variable_data.attrs.get("gcmd_keyword")
+        gcmd_keyword_url = variable_data.attrs.get("gcmd_keyword_url")
         return {
             "variable_id": self._normalize_name(title),
             "description": description,
-            "gcmd_keyword": gcmd_keyword,
+            "gcmd_keyword_url": gcmd_keyword_url,
         }
 
-    def _get_variable_ids(self) -> list[str]:
-        """Extract variable IDs for each variable in the dataset."""
-        return [
-            self._extract_variable_metadata(variable)["variable_id"]
-            for variable in self.dataset.data_vars.values()
-        ]
+    def get_variable_ids(self) -> list[str]:
+        """Get variable IDs for all variables in the dataset."""
+        return list(self.variables_metadata.keys())
 
-    def get_variables_and_build_catalog(self) -> dict[str, Catalog]:
-        """Extract metadata and STAC catalog for each variable in the dataset."""
-        var_catalogs = {}
+    def get_variables_metadata(self) -> dict[str, dict]:
+        """Extract metadata for all variables in the dataset."""
+        variables_metadata = {}
         for var_name, variable in self.dataset.data_vars.items():
-            var_metadata = self._extract_variable_metadata(variable)
-            var_catalog = self.build_variable_catalog(var_metadata)
-            var_catalogs[var_metadata.get("variable_id")] = var_catalog
-        return var_catalogs
-
-    @staticmethod
-    def _get_gcmd_scheme_uuid(keyword: str) -> str | None:
-        """Query NASA's GCMD KMS concepts for a given keyword, and return the first matching UUID.
-
-        Args:
-            keyword: The GCMD keyword to look up (e.g., "EVAPORATION").
-
-        Returns:
-            The UUID string if found, otherwise None.
-        """
-        url = "https://api.gcmd.earthdata.nasa.gov/kms/concepts/concepts"
-        params = {"keyword": keyword, "format": "json"}
-
-        resp = requests.get(url, params=params)
-        if resp.status_code != 200:
-            # Request failed
-            return None
-
-        data = resp.json()
-        concepts = data.get("concepts", [])
-        # Loop through concepts and find the one that matches our keyword in short_name (case-insensitive).
-        for concept in concepts:
-            if concept.get("short_name", "").upper() == keyword.upper():
-                return concept.get("uuid")
-
-        return None
-
-    @staticmethod
-    def _build_gcmd_viewer_url(
-        keyword: str, scheme_uuid: str, scheme: str = "Earth Science"
-    ) -> str:
-        """Builds the GCMD Keyword Viewer URL for a given keyword and UUID.
-
-        Args:
-            keyword: GCMD keyword (e.g., "EVAPORATION").
-            scheme_uuid: The UUID for this keyword (e.g., "b68ab978-6db6-49ee-84e2-5f37b461a998").
-            scheme: The GCMD scheme, default is "Earth Science".
-
-        Returns:
-            The fully qualified GCMD viewer URL, e.g.:
-            https://gcmd.earthdata.nasa.gov/KeywordViewer/scheme/Earth%20Science/...
-        """
-        # URL-encode the scheme and keyword
-        url_scheme = quote_plus(scheme)
-        url_keyword = quote_plus(keyword)
-
-        # Construct the GCMD viewer URL
-        gcmd_url = (
-            f"https://gcmd.earthdata.nasa.gov/KeywordViewer/scheme/{url_scheme}/{scheme_uuid}"
-            f"?gtm_keyword={url_keyword}&gtm_scheme={url_scheme}"
-        )
-
-        return gcmd_url
+            var_metadata = self.extract_metadata_for_variable(variable)
+            variables_metadata[var_metadata.get("variable_id")] = var_metadata
+        return variables_metadata
 
     def _add_gcmd_link_to_var_catalog(
         self, var_catalog: Catalog, var_metadata: dict
     ) -> None:
         """
-        Checks for a GCMD keyword in var_metadata, retrieves its scheme UUID,
-        and if found, adds a 'via' link to the catalog pointing to the GCMD Keyword Viewer.
+        Checks for a GCMD keyword URL in var_metadata and, if present, adds a
+        'via' link to the catalog pointing to the GCMD Keyword Viewer.
 
         Args:
             var_catalog: The PySTAC Catalog to which we want to add the link.
             var_metadata: Dictionary containing metadata about the variable,
-                including 'gcmd_keyword'.
+                including 'gcmd_keyword_url'.
         """
-        gcmd_keyword = var_metadata.get("gcmd_keyword")
-        if not gcmd_keyword:
-            self.logger.debug("No `gcmd_keyword` in var_metadata. Skipping GCMD link.")
-            return
-
-        # Retrieve scheme UUID from the NASA KMS API
-        scheme_uuid = self._get_gcmd_scheme_uuid(gcmd_keyword)
-        if not scheme_uuid:
+        gcmd_keyword_url = var_metadata.get("gcmd_keyword_url")
+        if not gcmd_keyword_url:
             self.logger.debug(
-                f"No GCMD UUID found for keyword '{gcmd_keyword}'. Skipping GCMD link."
+                f"No gcmd_keyword_url in var_metadata; skipping GCMD link for "
+                f'the {var_metadata.get("variable_id")} catalog.'
             )
             return
-
-        gcmd_url = self._build_gcmd_viewer_url(gcmd_keyword, scheme_uuid)
-
-        # Add `rel="via"` link for the GCMD viewer
         var_catalog.add_link(
             Link(
-                rel="via", target=gcmd_url, title="Description", media_type="text/html"
+                rel="via",
+                target=gcmd_keyword_url,
+                title="Description",
+                media_type="text/html",
            )
        )
         self.logger.info(
-            f"Added GCMD link for keyword '{gcmd_keyword}' (UUID: {scheme_uuid})."
+            f'Added GCMD link for the {var_metadata.get("variable_id")} '
+            f"catalog: {gcmd_keyword_url}."
         )
 
     def build_variable_catalog(self, var_metadata) -> Catalog:
@@ -408,7 +346,7 @@ def build_dataset_stac_collection(self) -> Collection:
         try:
             spatial_extent = self._get_spatial_extent()
             temporal_extent = self._get_temporal_extent()
-            variables = self._get_variable_ids()
+            variables = self.get_variable_ids()
             general_metadata = self._get_general_metadata()
         except ValueError as e:
             raise ValueError(f"Metadata extraction failed: {e}")
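For orientation, after this patch the per-variable flow reduces to the sketch below. It assumes, as in the updated test fixtures, that each data variable carries `standard_name`, `description`, and optionally `gcmd_keyword_url` attributes, and that the dataset and collection IDs (illustrative here) resolve against a reachable store.

```python
# Sketch of the patch-6 flow; dataset/collection IDs are illustrative.
from deep_code.utils.dataset_stac_generator import OSCDatasetSTACGenerator

generator = OSCDatasetSTACGenerator("my-dataset.zarr", "my-collection")
for var_id in generator.get_variable_ids():
    var_metadata = generator.variables_metadata[var_id]
    var_catalog = generator.build_variable_catalog(var_metadata)
    # The 'via' link to the GCMD Keyword Viewer is present only when the
    # variable declared a gcmd_keyword_url attribute.
    print(var_id, [link.href for link in var_catalog.get_links("via")])
```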
From c9ee5addf44ccd6f0ee2b84990853cecc7fcf2f1 Mon Sep 17 00:00:00 2001
From: tejas
Date: Wed, 15 Jan 2025 15:58:21 +0100
Subject: [PATCH 7/7] updated README.md

---
 README.md | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index ce4ed34..e7737e6 100644
--- a/README.md
+++ b/README.md
@@ -75,16 +75,18 @@ github-token: personal access token
 
 #### dataset-config.yaml example
 ```
-dataset-id: hydrology-1D-0.009deg-100x60x60-3.0.2.zarr
-collection-id: hydrology
-
-#non-mandatory
-documentation-link: https://deepesdl.readthedocs.io/en/latest/datasets/hydrology-1D-0-009deg-100x60x60-3-0-2-zarr/
-access-link: s3://test
-dataset-status: completed
-dataset-region: global
-dataset-theme: ["ocean", "environment"]
-cf-parameter: [{"Name" : "hydrology"}]
+dataset_id: hydrology-1D-0.009deg-100x60x60-3.0.2.zarr
+collection_id: hydrology
+osc_themes:
+  - Land
+  - Oceans
+# non-mandatory
+documentation_link: https://deepesdl.readthedocs.io/en/latest/datasets/hydrology-1D-0.009deg-100x60x60-3.0.2.zarr/
+access_link: s3://test
+dataset_status: completed
+osc_region: global
+cf_parameter:
+  - name: hydrology
 ```
 `dataset_id` must be a valid dataset ID from the `deep-esdl-public` S3 bucket or your team bucket.
\ No newline at end of file
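With the final patch applied, publishing can be driven through the `publish-dataset` CLI command or directly from Python. A minimal sketch, assuming `~/.gitaccess` provides `github-username`/`github-token` as described above and that `dataset-config.yaml` matches the example:

```python
# Sketch: end-to-end publishing with the example configuration above.
from deep_code.tools.publish import DatasetPublisher

publisher = DatasetPublisher()
# Builds the product collection plus variables/<variable-id>/catalog.json
# entries, then opens a pull request against the OSC metadata repository.
publisher.publish_dataset("dataset-config.yaml")
```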