From 2b6891a70c26fbaef37a22fcbe17ed6b036ffd9d Mon Sep 17 00:00:00 2001
From: tejas
Date: Fri, 10 Jan 2025 18:07:22 +0100
Subject: [PATCH 1/7] current state 10.1

---
 deep_code/cli/publish.py                  |   5 +-
 .../utils/test_dataset_stac_generator.py  |   2 +-
 deep_code/tools/publish.py                |  11 +-
 deep_code/utils/dataset_stac_generator.py | 129 ++++++++++++------
 4 files changed, 99 insertions(+), 48 deletions(-)

diff --git a/deep_code/cli/publish.py b/deep_code/cli/publish.py
index 48b1e63..a9da0f3 100644
--- a/deep_code/cli/publish.py
+++ b/deep_code/cli/publish.py
@@ -10,10 +10,7 @@
 
 
 @click.command(name="publish-dataset")
-@click.argument(
-    "dataset_config",
-    type=click.Path(exists=True)
-)
+@click.argument("dataset_config", type=click.Path(exists=True))
 def publish_dataset(dataset_config):
     """Request publishing a dataset to the open science catalogue.
     """
diff --git a/deep_code/tests/utils/test_dataset_stac_generator.py b/deep_code/tests/utils/test_dataset_stac_generator.py
index 12321b2..8fc5e39 100644
--- a/deep_code/tests/utils/test_dataset_stac_generator.py
+++ b/deep_code/tests/utils/test_dataset_stac_generator.py
@@ -66,7 +66,7 @@ def test_get_temporal_extent(self):
 
     def test_get_variables(self):
         """Test variable extraction."""
-        variables = self.generator._get_variables()
+        variables = self.generator._get_variable_ids()
         self.assertEqual(variables, ["var1", "var2"])
 
     def test_get_general_metadata(self):
diff --git a/deep_code/tools/publish.py b/deep_code/tools/publish.py
index 26b49f3..1d36fc9 100644
--- a/deep_code/tools/publish.py
+++ b/deep_code/tools/publish.py
@@ -77,21 +77,26 @@ def publish_dataset(self, dataset_config_path: str):
                 osc_themes=dataset_theme,
                 cf_params=cf_params,
             )
-            collection = generator.build_stac_collection()
+            var_catalogs = generator.get_variables_and_build_catalog()
+            ds_collection = generator.build_stac_collection()
             file_path = f"products/{collection_id}/collection.json"
 
             logger.info("Automating GitHub tasks...")
+            self.github_automation.fork_repository()
             self.github_automation.clone_repository()
             OSC_NEW_BRANCH_NAME = OSC_BRANCH_NAME + "-" + collection_id
             self.github_automation.create_branch(OSC_NEW_BRANCH_NAME)
-            self.github_automation.add_file(file_path, collection.to_dict())
+            self.github_automation.add_file(file_path, ds_collection.to_dict())
+            for var_id, var_catalog in var_catalogs.items():
+                var_file_path = f"variables/{var_id}/catalog.json"
+                self.github_automation.add_file(var_file_path, var_catalog.to_dict())
             self.github_automation.commit_and_push(
                 OSC_NEW_BRANCH_NAME, f"Add new collection:{collection_id}"
             )
 
             pr_url = self.github_automation.create_pull_request(
                 OSC_NEW_BRANCH_NAME,
-                f"Add new collection",
+                f"Add new dataset collection",
                 "This PR adds a new collection to the repository.",
             )
diff --git a/deep_code/utils/dataset_stac_generator.py b/deep_code/utils/dataset_stac_generator.py
index 21f4cf8..8ccd49e 100644
--- a/deep_code/utils/dataset_stac_generator.py
+++ b/deep_code/utils/dataset_stac_generator.py
@@ -9,7 +9,7 @@
 from datetime import datetime, timezone
 
 import pandas as pd
-from pystac import Collection, Extent, Link, SpatialExtent, TemporalExtent
+from pystac import Collection, Extent, Link, SpatialExtent, TemporalExtent, Catalog
 from xcube.core.store import new_data_store
 
 from deep_code.utils.osc_extension import OscExtension
@@ -170,29 +170,6 @@ def _get_temporal_extent(self) -> TemporalExtent:
     def _normalize_name(name: str | None) -> str | None:
         return name.replace(" ", "-").lower() if name else None
 
-    def _get_variables(self) -> list[str]:
"""Extracts variable names or descriptions from the dataset. - - Variables are prioritized based on their `long_name` or `standard_name` - attributes. If neither is available, the variable's key from - `dataset.data_vars.keys()` is used. - - Returns: - A list of variable names or descriptions. - """ - variables = [] - for var_name, variable in self.dataset.data_vars.items(): - long_name = self._normalize_name(variable.attrs.get("long_name")) - standard_name = self._normalize_name(variable.attrs.get("standard_name")) - if not long_name and not standard_name: - self.logger.error( - f"Metadata missing for variable '{var_name}': 'long_name' and " - f"'standard_name' attributes are not available." - ) - # Prioritize 'long_name', fallback to 'standard_name', then use variable key - variables.append(long_name or standard_name or var_name) - return variables - def _get_general_metadata(self) -> dict: return { "description": self.dataset.attrs.get( @@ -200,36 +177,108 @@ def _get_general_metadata(self) -> dict: ) } - def _get_variable_metadata(self, var_name, var_data) -> dict: - """Extract metadata from a single variable's attributes. + def _extract_variable_metadata(self, variable_data) -> dict: + """Extract metadata for a single variable.""" + long_name = variable_data.attrs.get("long_name") + standard_name = variable_data.attrs.get("standard_name") + title = long_name or standard_name or variable_data.name + description = variable_data.attrs.get("description", "No variable description") + return {"variable_id": self._normalize_name(title), "description": description} + + def _get_variable_ids(self) -> list[str]: + """Extract variable IDs for each variable in the dataset.""" + return [ + self._extract_variable_metadata(variable)["variable_id"] + for variable in self.dataset.data_vars.values() + ] + + def get_variables_and_build_catalog(self) -> dict[str, Catalog]: + """Extract metadata and STAC catalog for each variable in the dataset.""" + var_catalogs = {} + for var_name, variable in self.dataset.data_vars.items(): + var_metadata = self._extract_variable_metadata(variable) + var_catalog = self.build_variable_catalog(var_metadata) + var_catalogs[var_name] = var_catalog + return var_catalogs - Args: - var_name: The raw variable name in the dataset. - var_data: An xarray DataArray containing variable data and attrs. + def build_variable_catalog(self, var_metadata) -> Catalog: + """Build an OSC STAC Catalog for the variables in the dataset. Returns: - A dict with 'id', 'title', and 'description'. + A pystac.Catalog object. 
""" - long_name = var_data.attrs.get("long_name") - standard_name = var_data.attrs.get("standard_name") - title = long_name or standard_name or var_name + var_id = var_metadata.get("variable_id") + # Set 'themes' to an empty list if none given + themes = self.osc_themes or [] + + now_iso = datetime.now(timezone.utc).isoformat() + + # Create a PySTAC Catalog object + var_catalog = Catalog( + id=var_id, + description=var_metadata.get("description"), + stac_extensions=[ + "https://stac-extensions.github.io/themes/v1.0.0/schema.json" + ], + ) + + var_catalog.stac_version = "1.0.0" + var_catalog.extra_fields["updated"] = now_iso + var_catalog.keywords = [] + + # Add the 'themes' block (from your example JSON) + var_catalog.extra_fields["themes"] = themes + + var_catalog.remove_links("root") + # Add relevant links + var_catalog.add_link( + Link( + rel="root", + target="../../catalog.json", + media_type="application/json", + title="Open Science Catalog", + ) + ) + + # 'child' link: points to the product (or one of its collections) using this variable + var_catalog.add_link( + Link( + rel="child", + target=f"../../products/{self.collection_id}/collection.json", + media_type="application/json", + title=self.collection_id, + ) + ) - normalized_title = self._normalize_name(title) + # 'parent' link: back up to the variables overview + var_catalog.add_link( + Link( + rel="parent", + target="../catalog.json", + media_type="application/json", + title="Variables", + ) + ) - description = var_data.attrs.get("description", "No variable description") + self_href = ( + f"https://esa-earthcode.github.io/open-science-catalog-metadata/variables" + f"/{var_id}/catalog.json" + ) + # 'self' link: the direct URL where this JSON is hosted + var_catalog.set_self_href(self_href) - return {"id": var_name, "title": normalized_title, "description": description} + return var_catalog def build_stac_collection(self) -> Collection: - """ - Build an OSC STAC Collection for the dataset. + """Build an OSC STAC Collection for the dataset. - :return: A pystac.Collection object. + Returns: + A pystac.Collection object. 
""" try: spatial_extent = self._get_spatial_extent() temporal_extent = self._get_temporal_extent() - variables = self._get_variables() + variables = self._get_variable_ids() general_metadata = self._get_general_metadata() except ValueError as e: raise ValueError(f"Metadata extraction failed: {e}") From b0aabdad788291154f2ef2b5e2386ae6812fa279 Mon Sep 17 00:00:00 2001 From: tejas Date: Tue, 14 Jan 2025 16:45:04 +0100 Subject: [PATCH 2/7] implemented build variable reference feature --- deep_code/tests/tools/test_publish.py | 22 +-- .../utils/test_dataset_stac_generator.py | 12 +- deep_code/tools/publish.py | 52 ++++-- deep_code/utils/dataset_stac_generator.py | 153 +++++++++++++++++- deep_code/utils/github_automation.py | 6 + environment.yml | 1 + pyproject.toml | 1 + 7 files changed, 210 insertions(+), 37 deletions(-) diff --git a/deep_code/tests/tools/test_publish.py b/deep_code/tests/tools/test_publish.py index 47c9961..6be4601 100644 --- a/deep_code/tests/tools/test_publish.py +++ b/deep_code/tests/tools/test_publish.py @@ -61,14 +61,14 @@ def test_publish_dataset_success( github-token: test-token """ dataset_yaml_content = """ - dataset-id: test-dataset - collection-id: test-collection - documentation-link: http://example.com/doc - access-link: http://example.com/access - dataset-status: ongoing - dataset-region: Global - dataset-theme: ["climate"] - cf-parameter: [] + dataset_id: test-dataset + collection_id: test-collection + documentation_link: http://example.com/doc + access_link: http://example.com/access + dataset_status: ongoing + dataset_region: Global + osc_theme: ["climate"] + cf_parameter: [] """ mock_fsspec_open.side_effect = [ mock_open(read_data=git_yaml_content)(), @@ -102,8 +102,8 @@ def test_publish_dataset_success( "links": [], "stac_version": "1.0.0", } - with patch("deep_code.tools.publish.OSCProductSTACGenerator") as mock_generator: - mock_generator.return_value.build_stac_collection.return_value = ( + with patch("deep_code.tools.publish.OSCDatasetSTACGenerator") as mock_generator: + mock_generator.return_value.build_dataset_stac_collection.return_value = ( mock_collection ) @@ -111,7 +111,7 @@ def test_publish_dataset_success( publisher = DatasetPublisher() publisher.publish_dataset("/fake/path/to/dataset-config.yaml") - # 6Assert that we called git clone with /tmp/temp_repo + # Assert that we called git clone with /tmp/temp_repo # Because expanduser("~") is now patched to /tmp, the actual path is /tmp/temp_repo auth_url = "https://test-user:test-token@github.com/test-user/open-science-catalog-metadata-testing.git" mock_subprocess_run.assert_any_call( diff --git a/deep_code/tests/utils/test_dataset_stac_generator.py b/deep_code/tests/utils/test_dataset_stac_generator.py index 8fc5e39..ebceb60 100644 --- a/deep_code/tests/utils/test_dataset_stac_generator.py +++ b/deep_code/tests/utils/test_dataset_stac_generator.py @@ -7,7 +7,7 @@ from unittest.mock import patch, MagicMock from xarray import Dataset -from deep_code.utils.dataset_stac_generator import OSCProductSTACGenerator +from deep_code.utils.dataset_stac_generator import OSCDatasetSTACGenerator class TestOSCProductSTACGenerator(unittest.TestCase): @@ -36,7 +36,7 @@ def setUp(self, mock_data_store): mock_store.open_data.return_value = self.mock_dataset mock_data_store.return_value = mock_store - self.generator = OSCProductSTACGenerator( + self.generator = OSCDatasetSTACGenerator( dataset_id="mock-dataset-id", collection_id="mock-collection-id", access_link="s3://mock-bucket/mock-dataset", @@ -78,7 +78,7 @@ def 
From b0aabdad788291154f2ef2b5e2386ae6812fa279 Mon Sep 17 00:00:00 2001
From: tejas
Date: Tue, 14 Jan 2025 16:45:04 +0100
Subject: [PATCH 2/7] implemented build variable reference feature

---
 deep_code/tests/tools/test_publish.py     |  22 +--
 .../utils/test_dataset_stac_generator.py  |  12 +-
 deep_code/tools/publish.py                |  52 ++++--
 deep_code/utils/dataset_stac_generator.py | 153 +++++++++++++++++-
 deep_code/utils/github_automation.py      |   6 +
 environment.yml                           |   1 +
 pyproject.toml                            |   1 +
 7 files changed, 210 insertions(+), 37 deletions(-)

diff --git a/deep_code/tests/tools/test_publish.py b/deep_code/tests/tools/test_publish.py
index 47c9961..6be4601 100644
--- a/deep_code/tests/tools/test_publish.py
+++ b/deep_code/tests/tools/test_publish.py
@@ -61,14 +61,14 @@ def test_publish_dataset_success(
         github-token: test-token
     """
     dataset_yaml_content = """
-    dataset-id: test-dataset
-    collection-id: test-collection
-    documentation-link: http://example.com/doc
-    access-link: http://example.com/access
-    dataset-status: ongoing
-    dataset-region: Global
-    dataset-theme: ["climate"]
-    cf-parameter: []
+    dataset_id: test-dataset
+    collection_id: test-collection
+    documentation_link: http://example.com/doc
+    access_link: http://example.com/access
+    dataset_status: ongoing
+    osc_region: Global
+    osc_themes: ["climate"]
+    cf_parameter: []
     """
     mock_fsspec_open.side_effect = [
         mock_open(read_data=git_yaml_content)(),
@@ -102,8 +102,8 @@ def test_publish_dataset_success(
         "links": [],
         "stac_version": "1.0.0",
     }
-    with patch("deep_code.tools.publish.OSCProductSTACGenerator") as mock_generator:
-        mock_generator.return_value.build_stac_collection.return_value = (
+    with patch("deep_code.tools.publish.OSCDatasetSTACGenerator") as mock_generator:
+        mock_generator.return_value.build_dataset_stac_collection.return_value = (
             mock_collection
         )
@@ -111,7 +111,7 @@ def test_publish_dataset_success(
         publisher = DatasetPublisher()
         publisher.publish_dataset("/fake/path/to/dataset-config.yaml")
 
-        # 6Assert that we called git clone with /tmp/temp_repo
+        # Assert that we called git clone with /tmp/temp_repo
         # Because expanduser("~") is now patched to /tmp, the actual path is /tmp/temp_repo
         auth_url = "https://test-user:test-token@github.com/test-user/open-science-catalog-metadata-testing.git"
         mock_subprocess_run.assert_any_call(
diff --git a/deep_code/tests/utils/test_dataset_stac_generator.py b/deep_code/tests/utils/test_dataset_stac_generator.py
index 8fc5e39..ebceb60 100644
--- a/deep_code/tests/utils/test_dataset_stac_generator.py
+++ b/deep_code/tests/utils/test_dataset_stac_generator.py
@@ -7,7 +7,7 @@
 from unittest.mock import patch, MagicMock
 from xarray import Dataset
 
-from deep_code.utils.dataset_stac_generator import OSCProductSTACGenerator
+from deep_code.utils.dataset_stac_generator import OSCDatasetSTACGenerator
 
 
 class TestOSCProductSTACGenerator(unittest.TestCase):
@@ -36,7 +36,7 @@ def setUp(self, mock_data_store):
         mock_store.open_data.return_value = self.mock_dataset
         mock_data_store.return_value = mock_store
 
-        self.generator = OSCProductSTACGenerator(
+        self.generator = OSCDatasetSTACGenerator(
             dataset_id="mock-dataset-id",
             collection_id="mock-collection-id",
             access_link="s3://mock-bucket/mock-dataset",
@@ -78,7 +78,7 @@ def test_get_general_metadata(self):
     @patch("pystac.Collection.set_self_href")
     def test_build_stac_collection(self, mock_set_self_href, mock_add_link):
         """Test STAC collection creation."""
-        collection = self.generator.build_stac_collection()
+        collection = self.generator.build_dataset_stac_collection()
         self.assertIsInstance(collection, Collection)
         self.assertEqual(collection.id, "mock-collection-id")
         self.assertEqual(collection.description, "Mock dataset for testing.")
@@ -116,7 +116,7 @@ def test_open_dataset_success_public_store(self, mock_logger, mock_new_data_stor
         mock_store.open_data.return_value = "mock_dataset"
 
         # Instantiate the generator (this will implicitly call _open_dataset)
-        generator = OSCProductSTACGenerator("mock-dataset-id", "mock-collection-id")
+        generator = OSCDatasetSTACGenerator("mock-dataset-id", "mock-collection-id")
 
         # Validate that the dataset is assigned correctly
         self.assertEqual(generator.dataset, "mock_dataset")
@@ -157,7 +157,7 @@ def test_open_dataset_success_authenticated_store(
         os.environ["S3_USER_STORAGE_KEY"] = "mock-key"
         os.environ["S3_USER_STORAGE_SECRET"] = "mock-secret"
 
-        generator = OSCProductSTACGenerator("mock-dataset-id", "mock-collection-id")
+        generator = OSCDatasetSTACGenerator("mock-dataset-id", "mock-collection-id")
 
         # Validate that the dataset was successfully opened with the authenticated store
         self.assertEqual(generator.dataset, "mock_dataset")
@@ -195,7 +195,7 @@ def test_open_dataset_failure(self, mock_logger, mock_new_data_store):
         os.environ["S3_USER_STORAGE_SECRET"] = "mock-secret"
 
         with self.assertRaises(ValueError) as context:
-            OSCProductSTACGenerator("mock-dataset-id", "mock-collection-id")
+            OSCDatasetSTACGenerator("mock-dataset-id", "mock-collection-id")
 
         self.assertIn(
             "Failed to open Zarr dataset with ID mock-dataset-id",
diff --git a/deep_code/tools/publish.py b/deep_code/tools/publish.py
index 1d36fc9..8982ba1 100644
--- a/deep_code/tools/publish.py
+++ b/deep_code/tools/publish.py
@@ -7,9 +7,10 @@
 import fsspec
 import logging
 import yaml
+from pathlib import Path
 
 from deep_code.constants import OSC_REPO_OWNER, OSC_REPO_NAME, OSC_BRANCH_NAME
-from deep_code.utils.dataset_stac_generator import OSCProductSTACGenerator
+from deep_code.utils.dataset_stac_generator import OSCDatasetSTACGenerator
 from deep_code.utils.github_automation import GitHubAutomation
 
 logger = logging.getLogger(__name__)
@@ -50,14 +51,14 @@ def publish_dataset(self, dataset_config_path: str):
         with fsspec.open(dataset_config_path, "r") as file:
             dataset_config = yaml.safe_load(file)
 
-        dataset_id = dataset_config.get("dataset-id")
-        collection_id = dataset_config.get("collection-id")
-        documentation_link = dataset_config.get("documentation-link")
-        access_link = dataset_config.get("access-link")
-        dataset_status = dataset_config.get("dataset-status")
-        osc_region = dataset_config.get("dataset-region")
-        dataset_theme = dataset_config.get("dataset-theme")
-        cf_params = dataset_config.get("cf-parameter")
+        dataset_id = dataset_config.get("dataset_id")
+        collection_id = dataset_config.get("collection_id")
+        documentation_link = dataset_config.get("documentation_link")
+        access_link = dataset_config.get("access_link")
+        dataset_status = dataset_config.get("dataset_status")
+        osc_region = dataset_config.get("osc_region")
+        osc_themes = dataset_config.get("osc_themes")
+        cf_params = dataset_config.get("cf_parameter")
 
         if not dataset_id or not collection_id:
             raise ValueError(
@@ -67,18 +68,18 @@ def publish_dataset(self, dataset_config_path: str):
 
         try:
             logger.info("Generating STAC collection...")
-            generator = OSCProductSTACGenerator(
+            generator = OSCDatasetSTACGenerator(
                 dataset_id=dataset_id,
                 collection_id=collection_id,
                 documentation_link=documentation_link,
                 access_link=access_link,
                 osc_status=dataset_status,
                 osc_region=osc_region,
-                osc_themes=dataset_theme,
+                osc_themes=osc_themes,
                 cf_params=cf_params,
             )
             var_catalogs = generator.get_variables_and_build_catalog()
-            ds_collection = generator.build_stac_collection()
+            ds_collection = generator.build_dataset_stac_collection()
             file_path = f"products/{collection_id}/collection.json"
 
             logger.info("Automating GitHub tasks...")
@@ -87,10 +88,33 @@ def publish_dataset(self, dataset_config_path: str):
             self.github_automation.clone_repository()
             OSC_NEW_BRANCH_NAME = OSC_BRANCH_NAME + "-" + collection_id
             self.github_automation.create_branch(OSC_NEW_BRANCH_NAME)
-            self.github_automation.add_file(file_path, ds_collection.to_dict())
+
             for var_id, var_catalog in var_catalogs.items():
                 var_file_path = f"variables/{var_id}/catalog.json"
-                self.github_automation.add_file(var_file_path, var_catalog.to_dict())
+                if not self.github_automation.file_exists(var_file_path):
+                    logger.info(
+                        f"Variable catalog for {var_id} does not exist. Creating..."
+                    )
+                    self.github_automation.add_file(
+                        var_file_path, var_catalog.to_dict()
+                    )
+                else:
+                    logger.info(
+                        f"Variable catalog already exists for {var_id}; adding the "
+                        f"product as a child link..."
+                    )
+                    full_path = (
+                        Path(self.github_automation.local_clone_dir) / var_file_path
+                    )
+                    self.github_automation.add_file(
+                        var_file_path,
+                        generator.update_existing_variable_catalog(
+                            full_path, var_id
+                        ).to_dict(),
+                    )
+
+            self.github_automation.add_file(file_path, ds_collection.to_dict())
+
             self.github_automation.commit_and_push(
                 OSC_NEW_BRANCH_NAME, f"Add new collection:{collection_id}"
             )
diff --git a/deep_code/utils/dataset_stac_generator.py b/deep_code/utils/dataset_stac_generator.py
index 8ccd49e..2dd2d31 100644
--- a/deep_code/utils/dataset_stac_generator.py
+++ b/deep_code/utils/dataset_stac_generator.py
@@ -9,13 +9,15 @@
 from datetime import datetime, timezone
 
 import pandas as pd
+import requests
 from pystac import Collection, Extent, Link, SpatialExtent, TemporalExtent, Catalog
+from urllib.parse import quote_plus
 from xcube.core.store import new_data_store
 
 from deep_code.utils.osc_extension import OscExtension
 
 
-class OSCProductSTACGenerator:
+class OSCDatasetSTACGenerator:
     """Generates OSC STAC Collections for a product from Zarr datasets.
 
     Args:
@@ -183,7 +185,12 @@ def _extract_variable_metadata(self, variable_data) -> dict:
         standard_name = variable_data.attrs.get("standard_name")
         title = long_name or standard_name or variable_data.name
         description = variable_data.attrs.get("description", "No variable description")
-        return {"variable_id": self._normalize_name(title), "description": description}
+        gcmd_keyword = variable_data.attrs.get("gcmd_keyword")
+        return {
+            "variable_id": self._normalize_name(title),
+            "description": description,
+            "gcmd_keyword": gcmd_keyword,
+        }
 
     def _get_variable_ids(self) -> list[str]:
         """Extract variable IDs for each variable in the dataset."""
@@ -198,9 +205,100 @@ def get_variables_and_build_catalog(self) -> dict[str, Catalog]:
         for var_name, variable in self.dataset.data_vars.items():
             var_metadata = self._extract_variable_metadata(variable)
             var_catalog = self.build_variable_catalog(var_metadata)
-            var_catalogs[var_name] = var_catalog
+            var_catalogs[var_metadata.get("variable_id")] = var_catalog
         return var_catalogs
 
+    @staticmethod
+    def _get_gcmd_scheme_uuid(keyword: str) -> str | None:
+        """Query NASA's GCMD KMS concepts for a given keyword, and return the first matching UUID.
+
+        Args:
+            keyword: The GCMD keyword to look up (e.g., "EVAPORATION").
+
+        Returns:
+            The UUID string if found, otherwise None.
+        """
+        url = "https://api.gcmd.earthdata.nasa.gov/kms/concepts/concepts"
+        params = {"keyword": keyword, "format": "json"}
+
+        resp = requests.get(url, params=params)
+        if resp.status_code != 200:
+            # Request failed
+            return None
+
+        data = resp.json()
+        concepts = data.get("concepts", [])
+        # Loop through concepts and find the one that matches our keyword in short_name (case-insensitive).
+        for concept in concepts:
+            if concept.get("short_name", "").upper() == keyword.upper():
+                return concept.get("uuid")
+
+        return None
+
+    @staticmethod
+    def _build_gcmd_viewer_url(
+        keyword: str, scheme_uuid: str, scheme: str = "Earth Science"
+    ) -> str:
+        """Builds the GCMD Keyword Viewer URL for a given keyword and UUID.
+
+        Args:
+            keyword: GCMD keyword (e.g., "EVAPORATION").
+            scheme_uuid: The UUID for this keyword (e.g., "b68ab978-6db6-49ee-84e2-5f37b461a998").
+            scheme: The GCMD scheme, default is "Earth Science".
+
+        Returns:
+            The fully qualified GCMD viewer URL, e.g.:
+            https://gcmd.earthdata.nasa.gov/KeywordViewer/scheme/Earth%20Science/...
+        """
+        # URL-encode the scheme and keyword
+        url_scheme = quote_plus(scheme)
+        url_keyword = quote_plus(keyword)
+
+        # Construct the GCMD viewer URL
+        gcmd_url = (
+            f"https://gcmd.earthdata.nasa.gov/KeywordViewer/scheme/{url_scheme}/{scheme_uuid}"
+            f"?gtm_keyword={url_keyword}&gtm_scheme={url_scheme}"
+        )
+
+        return gcmd_url
+
+    def _add_gcmd_link_to_var_catalog(
+        self, var_catalog: Catalog, var_metadata: dict
+    ) -> None:
+        """
+        Checks for a GCMD keyword in var_metadata, retrieves its scheme UUID,
+        and if found, adds a 'via' link to the catalog pointing to the GCMD Keyword Viewer.
+
+        Args:
+            var_catalog: The PySTAC Catalog to which we want to add the link.
+            var_metadata: Dictionary containing metadata about the variable,
+                including 'gcmd_keyword'.
+        """
+        gcmd_keyword = var_metadata.get("gcmd_keyword")
+        if not gcmd_keyword:
+            self.logger.debug("No `gcmd_keyword` in var_metadata. Skipping GCMD link.")
+            return
+
+        # Retrieve scheme UUID from the NASA KMS API
+        scheme_uuid = self._get_gcmd_scheme_uuid(gcmd_keyword)
+        if not scheme_uuid:
+            self.logger.debug(
+                f"No GCMD UUID found for keyword '{gcmd_keyword}'. Skipping GCMD link."
+            )
+            return
+
+        gcmd_url = self._build_gcmd_viewer_url(gcmd_keyword, scheme_uuid)
+
+        # Add `rel="via"` link for the GCMD viewer
+        var_catalog.add_link(
+            Link(
+                rel="via", target=gcmd_url, title="Description", media_type="text/html"
+            )
+        )
+        self.logger.info(
+            f"Added GCMD link for keyword '{gcmd_keyword}' (UUID: {scheme_uuid})."
+        )
+
     def build_variable_catalog(self, var_metadata) -> Catalog:
         """Build an OSC STAC Catalog for the variables in the dataset.
 
@@ -208,8 +306,14 @@ def build_variable_catalog(self, var_metadata) -> Catalog:
             A pystac.Catalog object.
         """
         var_id = var_metadata.get("variable_id")
-        # Set 'themes' to an empty list if none given
-        themes = self.osc_themes or []
+        concepts = [{"id": theme} for theme in self.osc_themes]
+
+        themes = [
+            {
+                "scheme": "https://github.com/stac-extensions/osc#theme",
+                "concepts": concepts,
+            }
+        ]
 
         now_iso = datetime.now(timezone.utc).isoformat()
 
@@ -217,6 +321,7 @@ def build_variable_catalog(self, var_metadata) -> Catalog:
         var_catalog = Catalog(
             id=var_id,
             description=var_metadata.get("description"),
+            title=var_id,
             stac_extensions=[
                 "https://stac-extensions.github.io/themes/v1.0.0/schema.json"
             ],
@@ -259,6 +364,8 @@ def build_variable_catalog(self, var_metadata) -> Catalog:
                 title="Variables",
             )
         )
+        # Add gcmd link for the variable definition
+        self._add_gcmd_link_to_var_catalog(var_catalog, var_metadata)
 
         self_href = (
             f"https://esa-earthcode.github.io/open-science-catalog-metadata/variables"
@@ -269,7 +376,30 @@ def build_variable_catalog(self, var_metadata) -> Catalog:
 
         return var_catalog
 
-    def build_stac_collection(self) -> Collection:
+    def update_existing_variable_catalog(self, var_file_path, var_id) -> Catalog:
+        existing_catalog = Catalog.from_file(var_file_path)
+        now_iso = datetime.now(timezone.utc).isoformat()
+        existing_catalog.extra_fields["updated"] = now_iso
+
+        # add 'child' link as the product
+        existing_catalog.add_link(
+            Link(
+                rel="child",
+                target=f"../../products/{self.collection_id}/collection.json",
+                media_type="application/json",
+                title=self.collection_id,
+            )
+        )
+        self_href = (
+            f"https://esa-earthcode.github.io/open-science-catalog-metadata/variables"
+            f"/{var_id}/catalog.json"
+        )
+        # 'self' link: the direct URL where this JSON is hosted
+        existing_catalog.set_self_href(self_href)
+
+        return existing_catalog
+
+    def build_dataset_stac_collection(self) -> Collection:
         """Build an OSC STAC Collection for the dataset.
 
         Returns:
             A pystac.Collection object.
         """
@@ -309,6 +439,7 @@ def build_dataset_stac_collection(self) -> Collection:
         now_iso = datetime.now(timezone.utc).isoformat()
         collection.extra_fields["created"] = now_iso
         collection.extra_fields["updated"] = now_iso
+        collection.title = self.collection_id
 
         # Remove any existing root link and re-add it properly
         collection.remove_links("root")
@@ -333,6 +464,16 @@ def build_dataset_stac_collection(self) -> Collection:
                 title="Products",
             )
         )
+        # Add variables ref
+        for var in variables:
+            collection.add_link(
+                Link(
+                    rel="related",
+                    target=f"../../variables/{var}/catalog.json",
+                    media_type="application/json",
+                    title="Variable: " + var,
+                )
+            )
 
         self_href = (
             "https://esa-earthcode.github.io/"
diff --git a/deep_code/utils/github_automation.py b/deep_code/utils/github_automation.py
index d934d2a..0218c13 100644
--- a/deep_code/utils/github_automation.py
+++ b/deep_code/utils/github_automation.py
@@ -113,3 +113,9 @@ def clean_up(self):
             subprocess.run(["rm", "-rf", self.local_clone_dir])
         except subprocess.CalledProcessError as e:
             raise RuntimeError(f"Failed to clean-up local repository: {e}")
+
+    def file_exists(self, file_path) -> bool:
+        full_path = Path(self.local_clone_dir) / file_path
+        exists = os.path.isfile(full_path)
+        logging.debug(f"Checking existence of {full_path}: {exists}")
+        return exists
diff --git a/environment.yml b/environment.yml
index 9570a8d..92ba6f2 100644
--- a/environment.yml
+++ b/environment.yml
@@ -11,6 +11,7 @@ dependencies:
   - pandas
   - pystac
   - pyyaml
+  - urllib
   - xcube
   - zarr >=2.11,<3
   # test dependencies
diff --git a/pyproject.toml b/pyproject.toml
index 057f7b8..023efb8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -26,6 +26,7 @@ dependencies = [
     "pandas",
     "pystac",
     "pyyaml",
+    "urllib",
     "xcube-core"
 ]

From b6e3c4102bd907f4028721601932baab35a46a11 Mon Sep 17 00:00:00 2001
From: tejas
Date: Tue, 14 Jan 2025 16:49:02 +0100
Subject: [PATCH 3/7] update env with right conda package name

---
 environment.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/environment.yml b/environment.yml
index 92ba6f2..27256c6 100644
--- a/environment.yml
+++ b/environment.yml
@@ -11,7 +11,7 @@ dependencies:
   - pandas
   - pystac
   - pyyaml
-  - urllib
+  - urllib3
   - xcube
   - zarr >=2.11,<3
   # test dependencies

From 431633218c53fd79dfde09e1851962568e9d560a Mon Sep 17 00:00:00 2001
From: tejas
Date: Tue, 14 Jan 2025 17:21:23 +0100
Subject: [PATCH 4/7] remove urllib3

---
 environment.yml | 1 -
 pyproject.toml  | 1 -
 2 files changed, 2 deletions(-)

diff --git a/environment.yml b/environment.yml
index 27256c6..9570a8d 100644
--- a/environment.yml
+++ b/environment.yml
@@ -11,7 +11,6 @@ dependencies:
   - pandas
   - pystac
   - pyyaml
-  - urllib3
   - xcube
   - zarr >=2.11,<3
   # test dependencies
diff --git a/pyproject.toml b/pyproject.toml
index 023efb8..057f7b8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -26,7 +26,6 @@ dependencies = [
     "pandas",
     "pystac",
     "pyyaml",
-    "urllib",
     "xcube-core"
 ]

From acc54078cd54d5cdae080a6213b4b53503951971 Mon Sep 17 00:00:00 2001
From: tejas
Date: Tue, 14 Jan 2025 17:25:11 +0100
Subject: [PATCH 5/7] update README.md

---
 README.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/README.md b/README.md
index 1d4b381..ce4ed34 100644
--- a/README.md
+++ b/README.md
@@ -9,6 +9,9 @@
 and Python API providing utilities that aid integration of DeepESDL datasets,
 experiments with EarthCODE.
 
+The first release will focus on the publish feature: DeepESDL experiments/workflows
+are published as OGC API Records, and datasets as OSC STAC collections.
+
 ## Setup
 
 ## Install

From cdd908f431c8c7e0c4fdb46756a0b0205ca2203 Mon Sep 17 00:00:00 2001
From: tejas
Date: Wed, 15 Jan 2025 15:57:16 +0100
Subject: [PATCH 6/7] updated logic for variable catalog generation

---
 .../utils/test_dataset_stac_generator.py  |  28 +++--
 deep_code/tools/publish.py                |   9 +-
 deep_code/utils/dataset_stac_generator.py | 116 ++++--------------
 3 files changed, 54 insertions(+), 99 deletions(-)

diff --git a/deep_code/tests/utils/test_dataset_stac_generator.py b/deep_code/tests/utils/test_dataset_stac_generator.py
index ebceb60..95ecb85 100644
--- a/deep_code/tests/utils/test_dataset_stac_generator.py
+++ b/deep_code/tests/utils/test_dataset_stac_generator.py
@@ -28,8 +28,24 @@ def setUp(self, mock_data_store):
         },
         attrs={"description": "Mock dataset for testing.", "title": "Mock Dataset"},
         data_vars={
-            "var1": (("time", "lat", "lon"), np.random.rand(2, 5, 10)),
-            "var2": (("time", "lat", "lon"), np.random.rand(2, 5, 10)),
+            "var1": (
+                ("time", "lat", "lon"),
+                np.random.rand(2, 5, 10),
+                {
+                    "description": "dummy",
+                    "standard_name": "var1",
+                    "gcmd_keyword_url": "https://dummy",
+                },
+            ),
+            "var2": (
+                ("time", "lat", "lon"),
+                np.random.rand(2, 5, 10),
+                {
+                    "description": "dummy",
+                    "standard_name": "var2",
+                    "gcmd_keyword_url": "https://dummy",
+                },
+            ),
         },
     )
     mock_store = MagicMock()
@@ -66,7 +82,7 @@ def test_get_temporal_extent(self):
 
     def test_get_variables(self):
         """Test variable extraction."""
-        variables = self.generator._get_variable_ids()
+        variables = self.generator.get_variable_ids()
         self.assertEqual(variables, ["var1", "var2"])
 
     def test_get_general_metadata(self):
@@ -104,8 +120,6 @@ def test_invalid_temporal_extent(self):
         with self.assertRaises(ValueError):
             self.generator._get_temporal_extent()
 
-
-class TestOpenDataset(unittest.TestCase):
     @patch("deep_code.utils.dataset_stac_generator.new_data_store")
     @patch("deep_code.utils.dataset_stac_generator.logging.getLogger")
     def test_open_dataset_success_public_store(self, mock_logger, mock_new_data_store):
@@ -113,7 +127,7 @@ def test_open_dataset_success_public_store(self, mock_logger, mock_new_data_stor
         # Create a mock store and mock its `open_data` method
         mock_store = MagicMock()
         mock_new_data_store.return_value = mock_store
-        mock_store.open_data.return_value = "mock_dataset"
+        mock_store.open_data.return_value = self.mock_dataset
 
         # Instantiate the generator (this will implicitly call _open_dataset)
         generator = OSCDatasetSTACGenerator("mock-dataset-id", "mock-collection-id")
@@ -151,7 +165,7 @@ def test_open_dataset_success_authenticated_store(
             mock_store,  # Second call (authenticated store) returns a mock store
         ]
 
-        mock_store.open_data.return_value = "mock_dataset"
+        mock_store.open_data.return_value = self.mock_dataset
 
         os.environ["S3_USER_STORAGE_BUCKET"] = "mock-bucket"
         os.environ["S3_USER_STORAGE_KEY"] = "mock-key"
         os.environ["S3_USER_STORAGE_SECRET"] = "mock-secret"
diff --git a/deep_code/tools/publish.py b/deep_code/tools/publish.py
index 8982ba1..dccb336 100644
--- a/deep_code/tools/publish.py
+++ b/deep_code/tools/publish.py
@@ -78,23 +78,26 @@ def publish_dataset(self, dataset_config_path: str):
                 osc_themes=osc_themes,
                 cf_params=cf_params,
             )
-            var_catalogs = generator.get_variables_and_build_catalog()
+            # get variables from the dataset
+            variable_ids = generator.get_variable_ids()
+            # build STAC collection for the dataset
             ds_collection = generator.build_dataset_stac_collection()
             file_path = f"products/{collection_id}/collection.json"
 
             logger.info("Automating GitHub tasks...")
-            self.github_automation.fork_repository()
             self.github_automation.clone_repository()
             OSC_NEW_BRANCH_NAME = OSC_BRANCH_NAME + "-" + collection_id
             self.github_automation.create_branch(OSC_NEW_BRANCH_NAME)
 
-            for var_id, var_catalog in var_catalogs.items():
+            for var_id in variable_ids:
                 var_file_path = f"variables/{var_id}/catalog.json"
                 if not self.github_automation.file_exists(var_file_path):
                     logger.info(
                         f"Variable catalog for {var_id} does not exist. Creating..."
                     )
+                    var_metadata = generator.variables_metadata.get(var_id)
+                    var_catalog = generator.build_variable_catalog(var_metadata)
                     self.github_automation.add_file(
                         var_file_path, var_catalog.to_dict()
                     )
diff --git a/deep_code/utils/dataset_stac_generator.py b/deep_code/utils/dataset_stac_generator.py
index 2dd2d31..8889387 100644
--- a/deep_code/utils/dataset_stac_generator.py
+++ b/deep_code/utils/dataset_stac_generator.py
@@ -55,6 +55,7 @@ def __init__(
         self.cf_params = cf_params or {}
         self.logger = logging.getLogger(__name__)
         self.dataset = self._open_dataset()
+        self.variables_metadata = self.get_variables_metadata()
 
     def _open_dataset(self):
         """Open the dataset using a S3 store as a xarray Dataset."""
@@ -179,124 +180,61 @@ def _get_general_metadata(self) -> dict:
             )
         }
 
-    def _extract_variable_metadata(self, variable_data) -> dict:
+    def extract_metadata_for_variable(self, variable_data) -> dict:
         """Extract metadata for a single variable."""
         long_name = variable_data.attrs.get("long_name")
         standard_name = variable_data.attrs.get("standard_name")
         title = long_name or standard_name or variable_data.name
         description = variable_data.attrs.get("description", "No variable description")
-        gcmd_keyword = variable_data.attrs.get("gcmd_keyword")
+        gcmd_keyword_url = variable_data.attrs.get("gcmd_keyword_url")
         return {
             "variable_id": self._normalize_name(title),
             "description": description,
-            "gcmd_keyword": gcmd_keyword,
+            "gcmd_keyword_url": gcmd_keyword_url,
         }
 
-    def _get_variable_ids(self) -> list[str]:
-        """Extract variable IDs for each variable in the dataset."""
-        return [
-            self._extract_variable_metadata(variable)["variable_id"]
-            for variable in self.dataset.data_vars.values()
-        ]
+    def get_variable_ids(self) -> list[str]:
+        """Get variable IDs for all variables in the dataset."""
+        return list(self.variables_metadata.keys())
 
-    def get_variables_and_build_catalog(self) -> dict[str, Catalog]:
-        """Extract metadata and STAC catalog for each variable in the dataset."""
-        var_catalogs = {}
+    def get_variables_metadata(self) -> dict[str, dict]:
+        """Extract metadata for all variables in the dataset."""
+        variables_metadata = {}
         for var_name, variable in self.dataset.data_vars.items():
-            var_metadata = self._extract_variable_metadata(variable)
-            var_catalog = self.build_variable_catalog(var_metadata)
-            var_catalogs[var_metadata.get("variable_id")] = var_catalog
-        return var_catalogs
-
-    @staticmethod
-    def _get_gcmd_scheme_uuid(keyword: str) -> str | None:
-        """Query NASA's GCMD KMS concepts for a given keyword, and return the first matching UUID.
-
-        Args:
-            keyword: The GCMD keyword to look up (e.g., "EVAPORATION").
-
-        Returns:
-            The UUID string if found, otherwise None.
-        """
-        url = "https://api.gcmd.earthdata.nasa.gov/kms/concepts/concepts"
-        params = {"keyword": keyword, "format": "json"}
-
-        resp = requests.get(url, params=params)
-        if resp.status_code != 200:
-            # Request failed
-            return None
-
-        data = resp.json()
-        concepts = data.get("concepts", [])
-        # Loop through concepts and find the one that matches our keyword in short_name (case-insensitive).
-        for concept in concepts:
-            if concept.get("short_name", "").upper() == keyword.upper():
-                return concept.get("uuid")
-
-        return None
-
-    @staticmethod
-    def _build_gcmd_viewer_url(
-        keyword: str, scheme_uuid: str, scheme: str = "Earth Science"
-    ) -> str:
-        """Builds the GCMD Keyword Viewer URL for a given keyword and UUID.
-
-        Args:
-            keyword: GCMD keyword (e.g., "EVAPORATION").
-            scheme_uuid: The UUID for this keyword (e.g., "b68ab978-6db6-49ee-84e2-5f37b461a998").
-            scheme: The GCMD scheme, default is "Earth Science".
-
-        Returns:
-            The fully qualified GCMD viewer URL, e.g.:
-            https://gcmd.earthdata.nasa.gov/KeywordViewer/scheme/Earth%20Science/...
-        """
-        # URL-encode the scheme and keyword
-        url_scheme = quote_plus(scheme)
-        url_keyword = quote_plus(keyword)
-
-        # Construct the GCMD viewer URL
-        gcmd_url = (
-            f"https://gcmd.earthdata.nasa.gov/KeywordViewer/scheme/{url_scheme}/{scheme_uuid}"
-            f"?gtm_keyword={url_keyword}&gtm_scheme={url_scheme}"
-        )
-
-        return gcmd_url
+            var_metadata = self.extract_metadata_for_variable(variable)
+            variables_metadata[var_metadata.get("variable_id")] = var_metadata
+        return variables_metadata
 
     def _add_gcmd_link_to_var_catalog(
         self, var_catalog: Catalog, var_metadata: dict
     ) -> None:
         """
-        Checks for a GCMD keyword in var_metadata, retrieves its scheme UUID,
-        and if found, adds a 'via' link to the catalog pointing to the GCMD Keyword Viewer.
+        Checks for a GCMD keyword URL in var_metadata and, if present, adds a
+        'via' link to the catalog pointing to the GCMD Keyword Viewer.
 
         Args:
             var_catalog: The PySTAC Catalog to which we want to add the link.
             var_metadata: Dictionary containing metadata about the variable,
-                including 'gcmd_keyword'.
+                including 'gcmd_keyword_url'.
         """
-        gcmd_keyword = var_metadata.get("gcmd_keyword")
-        if not gcmd_keyword:
-            self.logger.debug("No `gcmd_keyword` in var_metadata. Skipping GCMD link.")
-            return
-
-        # Retrieve scheme UUID from the NASA KMS API
-        scheme_uuid = self._get_gcmd_scheme_uuid(gcmd_keyword)
-        if not scheme_uuid:
+        gcmd_keyword_url = var_metadata.get("gcmd_keyword_url")
+        if not gcmd_keyword_url:
             self.logger.debug(
-                f"No GCMD UUID found for keyword '{gcmd_keyword}'. Skipping GCMD link."
+                f"No gcmd_keyword_url in var_metadata; skipping GCMD link for "
+                f'the {var_metadata.get("variable_id")} catalog.'
             )
             return
-
-        gcmd_url = self._build_gcmd_viewer_url(gcmd_keyword, scheme_uuid)
-
-        # Add `rel="via"` link for the GCMD viewer
         var_catalog.add_link(
             Link(
-                rel="via", target=gcmd_url, title="Description", media_type="text/html"
+                rel="via",
+                target=gcmd_keyword_url,
+                title="Description",
+                media_type="text/html",
            )
        )
         self.logger.info(
-            f"Added GCMD link for keyword '{gcmd_keyword}' (UUID: {scheme_uuid})."
+            f'Added GCMD link for the {var_metadata.get("variable_id")} '
+            f"catalog: {gcmd_keyword_url}."
         )
 
     def build_variable_catalog(self, var_metadata) -> Catalog:
@@ -408,7 +346,7 @@ def build_dataset_stac_collection(self) -> Collection:
         try:
             spatial_extent = self._get_spatial_extent()
             temporal_extent = self._get_temporal_extent()
-            variables = self._get_variable_ids()
+            variables = self.get_variable_ids()
             general_metadata = self._get_general_metadata()
         except ValueError as e:
             raise ValueError(f"Metadata extraction failed: {e}")
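For orientation, after this patch the per-variable flow reduces to the sketch below. It assumes, as in the updated test fixtures, that each data variable carries `standard_name`, `description`, and optionally `gcmd_keyword_url` attributes, and that the dataset and collection IDs (illustrative here) resolve against a reachable store.

```python
# Sketch of the patch-6 flow; dataset/collection IDs are illustrative.
from deep_code.utils.dataset_stac_generator import OSCDatasetSTACGenerator

generator = OSCDatasetSTACGenerator("my-dataset.zarr", "my-collection")
for var_id in generator.get_variable_ids():
    var_metadata = generator.variables_metadata[var_id]
    var_catalog = generator.build_variable_catalog(var_metadata)
    # The 'via' link to the GCMD Keyword Viewer is present only when the
    # variable declared a gcmd_keyword_url attribute.
    print(var_id, [link.href for link in var_catalog.get_links("via")])
```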
From c9ee5addf44ccd6f0ee2b84990853cecc7fcf2f1 Mon Sep 17 00:00:00 2001
From: tejas
Date: Wed, 15 Jan 2025 15:58:21 +0100
Subject: [PATCH 7/7] updated README.md

---
 README.md | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index ce4ed34..e7737e6 100644
--- a/README.md
+++ b/README.md
@@ -75,16 +75,18 @@ github-token: personal access token
 
 #### dataset-config.yaml example
 ```
-dataset-id: hydrology-1D-0.009deg-100x60x60-3.0.2.zarr
-collection-id: hydrology
-
-#non-mandatory
-documentation-link: https://deepesdl.readthedocs.io/en/latest/datasets/hydrology-1D-0-009deg-100x60x60-3-0-2-zarr/
-access-link: s3://test
-dataset-status: completed
-dataset-region: global
-dataset-theme: ["ocean", "environment"]
-cf-parameter: [{"Name" : "hydrology"}]
+dataset_id: hydrology-1D-0.009deg-100x60x60-3.0.2.zarr
+collection_id: hydrology
+osc_themes:
+  - Land
+  - Oceans
+# non-mandatory
+documentation_link: https://deepesdl.readthedocs.io/en/latest/datasets/hydrology-1D-0.009deg-100x60x60-3.0.2.zarr/
+access_link: s3://test
+dataset_status: completed
+osc_region: global
+cf_parameter:
+  - name: hydrology
 ```
 `dataset_id` must be a valid dataset ID from the `deep-esdl-public` S3 bucket or your team bucket.
\ No newline at end of file
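With the final patch applied, publishing can be driven through the `publish-dataset` CLI command or directly from Python. A minimal sketch, assuming `~/.gitaccess` provides `github-username`/`github-token` as described above and that `dataset-config.yaml` matches the example:

```python
# Sketch: end-to-end publishing with the example configuration above.
from deep_code.tools.publish import DatasetPublisher

publisher = DatasetPublisher()
# Builds the product collection plus variables/<variable-id>/catalog.json
# entries, then opens a pull request against the OSC metadata repository.
publisher.publish_dataset("dataset-config.yaml")
```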