From a2261233541bdbf0725ddb3ecdea3b43aa35fbdf Mon Sep 17 00:00:00 2001 From: tejas Date: Tue, 22 Jul 2025 15:18:10 +0200 Subject: [PATCH 01/11] added rel link from product to experiment --- deep_code/tools/publish.py | 3 +++ deep_code/utils/dataset_stac_generator.py | 17 +++++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/deep_code/tools/publish.py b/deep_code/tools/publish.py index 9918847..6550cad 100644 --- a/deep_code/tools/publish.py +++ b/deep_code/tools/publish.py @@ -130,6 +130,7 @@ def __init__( self._read_config_files() self.collection_id = self.dataset_config.get("collection_id") self.workflow_title = self.workflow_config.get("properties", {}).get("title") + self.workflow_id = self.workflow_config.get("workflow_id") if not self.collection_id: raise ValueError("collection_id is missing in dataset config.") @@ -226,6 +227,8 @@ def publish_dataset(self, write_to_file: bool = False): generator = OscDatasetStacGenerator( dataset_id=dataset_id, collection_id=self.collection_id, + workflow_id= self.workflow_id, + workflow_title = self.workflow_title, documentation_link=documentation_link, access_link=access_link, osc_status=dataset_status, diff --git a/deep_code/utils/dataset_stac_generator.py b/deep_code/utils/dataset_stac_generator.py index 3e96a6e..b852e8a 100644 --- a/deep_code/utils/dataset_stac_generator.py +++ b/deep_code/utils/dataset_stac_generator.py @@ -39,6 +39,9 @@ def __init__( self, dataset_id: str, collection_id: str, + workflow_id: str, + workflow_title: str, + license_type: str, access_link: str | None = None, documentation_link: str | None = None, osc_status: str = "ongoing", @@ -49,6 +52,9 @@ def __init__( ): self.dataset_id = dataset_id self.collection_id = collection_id + self.workflow_id = workflow_id + self.workflow_title = workflow_title + self.license_type = license_type self.access_link = access_link or f"s3://deep-esdl-public/{dataset_id}" self.documentation_link = documentation_link self.osc_status = osc_status @@ 
-478,6 +484,17 @@ def build_dataset_stac_collection(self) -> Collection: ) ) + collection.add_link( + Link( + rel="related", + target=f"../../experiments/{self.workflow_id}/record.json", + media_type="application/json", + title=f"Experiment: {self.workflow_title}" + ) + ) + + collection.license = self.license_type + # Validate OSC extension fields try: osc_extension.validate_extension() From 9dde5af243e19afb75cb9b00ac7e5dd6fac2986c Mon Sep 17 00:00:00 2001 From: tejas Date: Tue, 19 Aug 2025 12:13:56 +0200 Subject: [PATCH 02/11] refactor --- deep_code/tools/new.py | 13 ++++++++++--- deep_code/utils/custom_xrlint_rules.py | 2 +- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/deep_code/tools/new.py b/deep_code/tools/new.py index d696175..3e1bd08 100644 --- a/deep_code/tools/new.py +++ b/deep_code/tools/new.py @@ -20,7 +20,11 @@ def generate_workflow_template(output_path: Optional[str] = None) -> str: "title": "[Human-readable title of the workflow]", "description": "[A concise summary of what the workflow does]", "keywords": ["[KEYWORD1]", "[KEYWORD2]"], - "themes": ["[Thematic area(s) of focus (e.g. land, ocean, atmosphere)]","[THEME1]", "[THEME2]"], + "themes": [ + "[Thematic area(s) of focus (e.g. land, ocean, atmosphere)]", + "[THEME1]", + "[THEME2]", + ], "license": "[License type (e.g. 
MIT, Apache-2.0, CC-BY-4.0, proprietary)]", "jupyter_kernel_info": { "name": "[Name of the execution environment or notebook kernel]", @@ -61,8 +65,11 @@ def generate_dataset_template(output_path: Optional[str] = None) -> str: template = { "dataset_id": "[The name of the dataset object within your S3 bucket].zarr", "collection_id": "[A unique identifier for the dataset collection]", - "osc_themes": ["[Oceans]", "[Open Science theme (choose from " - "https://opensciencedata.esa.int/themes/catalog)"], + "osc_themes": [ + "[Oceans]", + "[Open Science theme (choose from " + "https://opensciencedata.esa.int/themes/catalog)", + ], "osc_region": "[Geographical coverage, e.g. 'global']", "dataset_status": "[Status of the dataset: 'ongoing', 'completed', or 'planned']", "documentation_link": "[Link to relevant documentation, publication, or handbook]", diff --git a/deep_code/utils/custom_xrlint_rules.py b/deep_code/utils/custom_xrlint_rules.py index d003840..c3b9506 100644 --- a/deep_code/utils/custom_xrlint_rules.py +++ b/deep_code/utils/custom_xrlint_rules.py @@ -71,7 +71,7 @@ def export_config() -> list: "content-desc": "off", "no-empty-attrs": "off", "conventions": "off", - "time-coordinate": "off" + "time-coordinate": "off", } }, "deepcode/recommended", From 07882106d802acb54eea8b6a00be61c3400c6129 Mon Sep 17 00:00:00 2001 From: tejas Date: Tue, 19 Aug 2025 12:14:12 +0200 Subject: [PATCH 03/11] refactor --- deep_code/utils/dataset_stac_generator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deep_code/utils/dataset_stac_generator.py b/deep_code/utils/dataset_stac_generator.py index b852e8a..a30a817 100644 --- a/deep_code/utils/dataset_stac_generator.py +++ b/deep_code/utils/dataset_stac_generator.py @@ -489,7 +489,7 @@ def build_dataset_stac_collection(self) -> Collection: rel="related", target=f"../../experiments/{self.workflow_id}/record.json", media_type="application/json", - title=f"Experiment: {self.workflow_title}" + title=f"Experiment: 
{self.workflow_title}", ) ) From a6b8eec44c2382c0477c61447890dfc88412e254 Mon Sep 17 00:00:00 2001 From: tejas Date: Tue, 19 Aug 2025 12:15:20 +0200 Subject: [PATCH 04/11] adding more links to workflow records --- deep_code/constants.py | 3 ++ deep_code/tools/publish.py | 26 +++++++----- deep_code/utils/ogc_api_record.py | 69 +++++++++++++++++++++++++++---- 3 files changed, 81 insertions(+), 17 deletions(-) diff --git a/deep_code/constants.py b/deep_code/constants.py index ea3b762..e139ebf 100644 --- a/deep_code/constants.py +++ b/deep_code/constants.py @@ -29,3 +29,6 @@ ".json" ) PROJECT_COLLECTION_NAME = "deep-earth-system-data-lab" +DEEPESDL_GIT_PULL_BASE = ( + "https://deep.earthsystemdatalab.net/hub/user-redirect/git-pull" +) diff --git a/deep_code/tools/publish.py b/deep_code/tools/publish.py index 6550cad..c9efd0a 100644 --- a/deep_code/tools/publish.py +++ b/deep_code/tools/publish.py @@ -4,12 +4,12 @@ # https://opensource.org/licenses/MIT. import copy -import json import logging from datetime import datetime from pathlib import Path import fsspec +import jsonpickle import yaml from pystac import Catalog, Link @@ -22,7 +22,6 @@ ) from deep_code.utils.dataset_stac_generator import OscDatasetStacGenerator from deep_code.utils.github_automation import GitHubAutomation -from deep_code.utils.helper import serialize from deep_code.utils.ogc_api_record import ( ExperimentAsOgcRecord, LinksBuilder, @@ -152,11 +151,13 @@ def _write_to_file(file_path: str, data: dict): # Create the directory if it doesn't exist Path(file_path).parent.mkdir(parents=True, exist_ok=True) try: - json_content = json.dumps(data, indent=2, default=serialize) + # unpicklable=False -> plain JSON (drops type metadata); cycles are resolved. 
+ json_content = jsonpickle.encode(data, unpicklable=False, indent=2) + # json_content = json.dumps(data, indent=2, default=serialize) except TypeError as e: raise RuntimeError(f"JSON serialization failed: {e}") - with open(file_path, "w") as f: + with open(file_path, "w", encoding="utf-8") as f: f.write(json_content) def _update_and_add_to_file_dict( @@ -227,8 +228,8 @@ def publish_dataset(self, write_to_file: bool = False): generator = OscDatasetStacGenerator( dataset_id=dataset_id, collection_id=self.collection_id, - workflow_id= self.workflow_id, - workflow_title = self.workflow_title, + workflow_id=self.workflow_id, + workflow_title=self.workflow_title, documentation_link=documentation_link, access_link=access_link, osc_status=dataset_status, @@ -331,16 +332,19 @@ def publish_workflow_experiment(self, write_to_file: bool = False): wf_record_properties = rg.build_record_properties(properties_list, contacts) # make a copy for experiment record exp_record_properties = copy.deepcopy(wf_record_properties) + jupyter_kernel_info = wf_record_properties.jupyter_kernel_info.to_dict() - link_builder = LinksBuilder(osc_themes) + link_builder = LinksBuilder(osc_themes, jupyter_kernel_info) theme_links = link_builder.build_theme_links_for_records() - + application_link = link_builder.build_link_to_jnb( + self.workflow_title, jupyter_notebook_url + ) workflow_record = WorkflowAsOgcRecord( id=workflow_id, type="Feature", title=self.workflow_title, properties=wf_record_properties, - links=links + theme_links, + links=links + theme_links + application_link, jupyter_notebook_url=jupyter_notebook_url, themes=osc_themes, ) @@ -357,6 +361,8 @@ def publish_workflow_experiment(self, write_to_file: bool = False): exp_record_properties.type = "experiment" exp_record_properties.osc_workflow = workflow_id + dataset_link = link_builder.build_link_to_dataset(self.collection_id) + experiment_record = ExperimentAsOgcRecord( id=workflow_id, title=self.workflow_title, @@ -364,7 +370,7 @@ def 
publish_workflow_experiment(self, write_to_file: bool = False): jupyter_notebook_url=jupyter_notebook_url, collection_id=self.collection_id, properties=exp_record_properties, - links=links + theme_links, + links=links + theme_links + dataset_link, ) # Convert to dictionary and cleanup experiment_dict = experiment_record.to_dict() diff --git a/deep_code/utils/ogc_api_record.py b/deep_code/utils/ogc_api_record.py index fd417ca..99cb329 100644 --- a/deep_code/utils/ogc_api_record.py +++ b/deep_code/utils/ogc_api_record.py @@ -1,10 +1,12 @@ from typing import Any, Optional +from urllib.parse import quote, urlencode from xrlint.util.constructible import MappingConstructible from xrlint.util.serializable import JsonSerializable, JsonValue from deep_code.constants import ( BASE_URL_OSC, + DEEPESDL_GIT_PULL_BASE, OGC_API_RECORD_SPEC, PROJECT_COLLECTION_NAME, ) @@ -86,12 +88,16 @@ def to_dict(self, value_name: str | None = None) -> dict[str, JsonValue]: if self.osc_project is not None: data["osc:project"] = self.osc_project del data["osc_project"] + data["application:type"] = "jupyter-notebook" + data["application:container"] = ("true",) + data["application:language"] = ("Python",) return data class LinksBuilder: - def __init__(self, themes: list[str]): + def __init__(self, themes: list[str], jupyter_kernel_info: dict[str]): self.themes = themes + self.jupyter_kernel_info = jupyter_kernel_info self.theme_links = [] def build_theme_links_for_records(self): @@ -117,6 +123,55 @@ def build_link_to_dataset(collection_id): } ] + def build_link_to_jnb(self, workflow_title, jupyter_nb_url): + return [ + { + "rel": "application", + "title": f"Jupyter Notebook: {workflow_title}", + "href": jupyter_nb_url, + "type": "application/x-ipynb+json", + "application:type": "jupyter-notebook", + "application:container": "true", + "application:language": "Python", + "jupyter:kernel": { + "name": self.jupyter_kernel_info["name"], + "pythonVersion": self.jupyter_kernel_info["python_version"], + 
"envFile": self.jupyter_kernel_info["env_file"], + }, + } + ] + + def build_deepesdl_notebook_href( + repo_url: str, + notebook_path: str, + branch: str = "main", + base_redirect: str = DEEPESDL_GIT_PULL_BASE, + ) -> str: + """ + Build a DeepESDL git-pull redirect URL: + {base}?repo=&urlpath=&branch= + """ + params = { + "repo": repo_url, + "urlpath": f"lab/tree/{notebook_path.lstrip('/')}", + "branch": branch, + } + return f"{base_redirect}?{urlencode(params, quote_via=quote)}" + + def make_related_link_for_opening_jnb( + self, + repo_url: str, + notebook_path: str, + branch: str = "main", + title: str = "Open notebook on the DeepESDL platform", + ) -> dict[str, str]: + return { + "rel": "related", + "href": self.build_deepesdl_notebook_href(repo_url, notebook_path, branch), + "type": "text/html", + "title": title, + } + class WorkflowAsOgcRecord(MappingConstructible["OgcRecord"], JsonSerializable): def __init__( @@ -236,12 +291,12 @@ def _generate_static_links(self): "type": "application/json", "title": f"Workflow: {self.title}", }, - { - "rel": "child", - "href": f"../../products/{self.collection_id}/collection.json", - "type": "application/json", - "title": f"{self.collection_id}", - }, + # { + # "rel": "child", + # "href": f"../../products/{self.collection_id}/collection.json", + # "type": "application/json", + # "title": f"{self.collection_id}", + # }, { "rel": "related", "href": f"../../projects/{PROJECT_COLLECTION_NAME}/collection.json", From 502cb7aaa2ca6e625766a01fc7304b9cee780c61 Mon Sep 17 00:00:00 2001 From: tejas Date: Tue, 19 Aug 2025 16:36:19 +0200 Subject: [PATCH 05/11] jnb_open_link to workflow ogc api record --- deep_code/constants.py | 7 ++ deep_code/tools/publish.py | 16 +++- deep_code/utils/ogc_api_record.py | 118 +++++++++++++++++++++++------- 3 files changed, 111 insertions(+), 30 deletions(-) diff --git a/deep_code/constants.py b/deep_code/constants.py index e139ebf..9d01cae 100644 --- a/deep_code/constants.py +++ b/deep_code/constants.py 
@@ -32,3 +32,10 @@ DEEPESDL_GIT_PULL_BASE = ( "https://deep.earthsystemdatalab.net/hub/user-redirect/git-pull" ) +APPLICATION_TYPE_JUPYTER_SPEC = ( + "https://raw.githubusercontent.com/EOEPCA/metadata" + "-profile/refs/heads/1.0/schemas/application-type-jupyter-notebook" +) +APPLICATION_STAC_EXTENSION_SPEC = ( + "https://stac-extensions.github.io/application/v0.1.0/schema.json" +) diff --git a/deep_code/tools/publish.py b/deep_code/tools/publish.py index c9efd0a..6436384 100644 --- a/deep_code/tools/publish.py +++ b/deep_code/tools/publish.py @@ -219,6 +219,7 @@ def publish_dataset(self, write_to_file: bool = False): osc_region = self.dataset_config.get("osc_region") osc_themes = self.dataset_config.get("osc_themes") cf_params = self.dataset_config.get("cf_parameter") + license_type = self.dataset_config.get("license_type") if not dataset_id or not self.collection_id: raise ValueError("Dataset ID or Collection ID missing in the config.") @@ -230,6 +231,7 @@ def publish_dataset(self, write_to_file: bool = False): collection_id=self.collection_id, workflow_id=self.workflow_id, workflow_title=self.workflow_title, + license_type=license_type, documentation_link=documentation_link, access_link=access_link, osc_status=dataset_status, @@ -314,7 +316,7 @@ def _update_base_catalog( return base_catalog - def publish_workflow_experiment(self, write_to_file: bool = False): + def generate_workflow_experiment_records(self, write_to_file: bool = False): """prepare workflow and experiment as ogc api record to publish it to the specified GitHub repository.""" workflow_id = self._normalize_name(self.workflow_config.get("workflow_id")) @@ -339,12 +341,16 @@ def publish_workflow_experiment(self, write_to_file: bool = False): application_link = link_builder.build_link_to_jnb( self.workflow_title, jupyter_notebook_url ) + jnb_open_link = link_builder.make_related_link_for_opening_jnb_from_github( + jupyter_notebook_url=jupyter_notebook_url + ) + workflow_record = WorkflowAsOgcRecord( 
id=workflow_id, type="Feature", title=self.workflow_title, properties=wf_record_properties, - links=links + theme_links + application_link, + links=links + theme_links + application_link + jnb_open_link, jupyter_notebook_url=jupyter_notebook_url, themes=osc_themes, ) @@ -354,6 +360,7 @@ def publish_workflow_experiment(self, write_to_file: bool = False): del workflow_dict["jupyter_notebook_url"] if "osc_workflow" in workflow_dict["properties"]: del workflow_dict["properties"]["osc_workflow"] + # add workflow record to file_dict wf_file_path = f"workflows/{workflow_id}/record.json" file_dict = {wf_file_path: workflow_dict} @@ -380,6 +387,7 @@ def publish_workflow_experiment(self, write_to_file: bool = False): del experiment_dict["collection_id"] if "osc:project" in experiment_dict["properties"]: del experiment_dict["properties"]["osc:project"] + # add experiment record to file_dict exp_file_path = f"experiments/{workflow_id}/record.json" file_dict[exp_file_path] = experiment_dict @@ -406,7 +414,9 @@ def publish_all(self, write_to_file: bool = False): """Publish both dataset and workflow/experiment in a single PR.""" # Get file dictionaries from both methods dataset_files = self.publish_dataset(write_to_file=write_to_file) - workflow_files = self.publish_workflow_experiment(write_to_file=write_to_file) + workflow_files = self.generate_workflow_experiment_records( + write_to_file=write_to_file + ) # Combine the file dictionaries combined_files = {**dataset_files, **workflow_files} diff --git a/deep_code/utils/ogc_api_record.py b/deep_code/utils/ogc_api_record.py index 99cb329..617f044 100644 --- a/deep_code/utils/ogc_api_record.py +++ b/deep_code/utils/ogc_api_record.py @@ -1,10 +1,12 @@ -from typing import Any, Optional -from urllib.parse import quote, urlencode +from typing import Any, Optional, Tuple +from urllib.parse import quote, urlencode, urlparse from xrlint.util.constructible import MappingConstructible from xrlint.util.serializable import JsonSerializable, 
JsonValue from deep_code.constants import ( + APPLICATION_STAC_EXTENSION_SPEC, + APPLICATION_TYPE_JUPYTER_SPEC, BASE_URL_OSC, DEEPESDL_GIT_PULL_BASE, OGC_API_RECORD_SPEC, @@ -141,36 +143,80 @@ def build_link_to_jnb(self, workflow_title, jupyter_nb_url): } ] - def build_deepesdl_notebook_href( - repo_url: str, - notebook_path: str, - branch: str = "main", + @staticmethod + def _parse_github_notebook_url(url: str) -> Tuple[str, str, str, str]: + """ + Returns (repo_url, repo_name, branch, file_path_in_repo) from a GitHub URL. + + Supports: + - https://github.com/{owner}/{repo}/blob/{branch}/{path} + - https://raw.githubusercontent.com/{owner}/{repo}/{branch}/{path} + """ + p = urlparse(url) + parts = p.path.strip("/").split("/") + + if p.netloc == "github.com": + if len(parts) >= 5 and parts[2] in ("blob", "tree"): + owner, repo, _blob_or_tree, branch = parts[:4] + file_path = "/".join(parts[4:]) + else: + raise ValueError(f"Unexpected GitHub URL format: {url}") + repo_url = f"https://github.com/{owner}/{repo}" + repo_name = repo + + elif p.netloc == "raw.githubusercontent.com": + if len(parts) >= 4: + owner, repo, branch = parts[:3] + file_path = "/".join(parts[3:]) + else: + raise ValueError(f"Unexpected raw.githubusercontent URL format: {url}") + repo_url = f"https://github.com/{owner}/{repo}" + repo_name = repo + + else: + raise ValueError(f"Only GitHub URLs are supported: {url}") + + return repo_url, repo_name, branch, file_path + + def build_deepesdl_notebook_href_from_github( + self, + jupyter_notebook_url: str, base_redirect: str = DEEPESDL_GIT_PULL_BASE, + branch_override: str | None = None, ) -> str: """ - Build a DeepESDL git-pull redirect URL: - {base}?repo=&urlpath=&branch= + Build DeepESDL git-pull redirect from a full GitHub notebook URL. 
+ {base}?repo={repo_url}&urlpath=lab/tree/{repo_name}/{file_path}&branch={branch} + """ + repo_url, repo_name, branch, file_path = self._parse_github_notebook_url( + jupyter_notebook_url + ) + if branch_override: + branch = branch_override + params = { "repo": repo_url, - "urlpath": f"lab/tree/{notebook_path.lstrip('/')}", + "urlpath": f"lab/tree/{repo_name}/{file_path}", "branch": branch, } return f"{base_redirect}?{urlencode(params, quote_via=quote)}" - def make_related_link_for_opening_jnb( + def make_related_link_for_opening_jnb_from_github( self, - repo_url: str, - notebook_path: str, - branch: str = "main", + jupyter_notebook_url: str, title: str = "Open notebook on the DeepESDL platform", + branch_override: str | None = None, ) -> dict[str, str]: - return { - "rel": "related", - "href": self.build_deepesdl_notebook_href(repo_url, notebook_path, branch), - "type": "text/html", - "title": title, - } + return [ + { + "rel": "related", + "href": self.build_deepesdl_notebook_href_from_github( + jupyter_notebook_url, branch_override=branch_override + ), + "type": "text/html", + "title": title, + } + ] class WorkflowAsOgcRecord(MappingConstructible["OgcRecord"], JsonSerializable): @@ -188,7 +234,11 @@ def __init__( themes: Optional[Any] = None, ): if conformsTo is None: - conformsTo = [OGC_API_RECORD_SPEC] + conformsTo = [ + OGC_API_RECORD_SPEC, + APPLICATION_TYPE_JUPYTER_SPEC, + APPLICATION_STAC_EXTENSION_SPEC, + ] self.id = id self.type = type self.title = title @@ -227,6 +277,14 @@ def _generate_static_links(self): "title": "Jupyter Notebook", "href": f"{self.jupyter_notebook_url}", }, + { + "rel": "application-originating-platform", + "title": "DeepESDL platform", + "href": "https://deep.earthsystemdatalab.net/", + "type": "text/html", + "application:platform_supports": ["jupyter-notebook"], + "application:preferred_app": "JupyterLab", + }, { "rel": "related", "href": f"../../projects/{PROJECT_COLLECTION_NAME}/collection.json", @@ -258,7 +316,11 @@ def __init__( if linkTemplates is None: linkTemplates 
= [] if conformsTo is None: - conformsTo = [OGC_API_RECORD_SPEC] + conformsTo = [ + OGC_API_RECORD_SPEC, + APPLICATION_TYPE_JUPYTER_SPEC, + APPLICATION_STAC_EXTENSION_SPEC, + ] self.id = id self.title = title self.type = type @@ -291,18 +353,20 @@ def _generate_static_links(self): "type": "application/json", "title": f"Workflow: {self.title}", }, - # { - # "rel": "child", - # "href": f"../../products/{self.collection_id}/collection.json", - # "type": "application/json", - # "title": f"{self.collection_id}", - # }, { "rel": "related", "href": f"../../projects/{PROJECT_COLLECTION_NAME}/collection.json", "type": "application/json", "title": "Project: DeepESDL", }, + { + "rel": "application-originating-platform", + "title": "DeepESDL platform", + "href": "https://deep.earthsystemdatalab.net/", + "type": "text/html", + "application:platform_supports": ["jupyter-notebook"], + "application:preferred_app": "JupyterLab", + }, { "rel": "input", "href": "./input.yaml", From 191c8bc443fc0599d750a4e31fcdd42572d7f3f7 Mon Sep 17 00:00:00 2001 From: tejas Date: Tue, 19 Aug 2025 16:51:26 +0200 Subject: [PATCH 06/11] fix unit tests and remove APPLICATION STAC and JUPYTER SPEC schema conformation from experiment ogc record --- .../utils/test_dataset_stac_generator.py | 3 +++ deep_code/tests/utils/test_ogc_api_record.py | 19 ++++++++++++++++--- deep_code/utils/ogc_api_record.py | 6 +----- 3 files changed, 20 insertions(+), 8 deletions(-) diff --git a/deep_code/tests/utils/test_dataset_stac_generator.py b/deep_code/tests/utils/test_dataset_stac_generator.py index 464c538..1e47b58 100644 --- a/deep_code/tests/utils/test_dataset_stac_generator.py +++ b/deep_code/tests/utils/test_dataset_stac_generator.py @@ -65,8 +65,11 @@ def setUp(self, mock_data_store): self.generator = OscDatasetStacGenerator( dataset_id="mock-dataset-id", collection_id="mock-collection-id", + workflow_id="dummy", + workflow_title="test", access_link="s3://mock-bucket/mock-dataset", 
documentation_link="https://example.com/docs", + license_type="proprietary", osc_status="ongoing", osc_region="Global", osc_themes=["climate", "environment"], diff --git a/deep_code/tests/utils/test_ogc_api_record.py b/deep_code/tests/utils/test_ogc_api_record.py index 236ed4c..6cc3f3f 100644 --- a/deep_code/tests/utils/test_ogc_api_record.py +++ b/deep_code/tests/utils/test_ogc_api_record.py @@ -1,6 +1,10 @@ import unittest -from deep_code.constants import OGC_API_RECORD_SPEC +from deep_code.constants import ( + APPLICATION_STAC_EXTENSION_SPEC, + APPLICATION_TYPE_JUPYTER_SPEC, + OGC_API_RECORD_SPEC, +) from deep_code.utils.ogc_api_record import ( Contact, ExperimentAsOgcRecord, @@ -136,7 +140,9 @@ def test_record_properties_to_dict(self): class TestLinksBuilder(unittest.TestCase): def test_build_theme_links_for_records(self): - links_builder = LinksBuilder(themes=["climate", "ocean"]) + links_builder = LinksBuilder( + themes=["climate", "ocean"], jupyter_kernel_info={} + ) theme_links = links_builder.build_theme_links_for_records() expected_links = [ @@ -201,7 +207,14 @@ def test_workflow_as_ogc_record_initialization(self): workflow_record.jupyter_notebook_url, "https://example.com/notebook.ipynb" ) self.assertEqual(workflow_record.properties, record_properties) - self.assertEqual(workflow_record.conformsTo, [OGC_API_RECORD_SPEC]) + self.assertEqual( + workflow_record.conformsTo, + [ + OGC_API_RECORD_SPEC, + APPLICATION_TYPE_JUPYTER_SPEC, + APPLICATION_STAC_EXTENSION_SPEC, + ], + ) self.assertEqual(workflow_record.links[0]["rel"], "root") self.assertEqual(workflow_record.links[-1]["rel"], "self") diff --git a/deep_code/utils/ogc_api_record.py b/deep_code/utils/ogc_api_record.py index 617f044..831e383 100644 --- a/deep_code/utils/ogc_api_record.py +++ b/deep_code/utils/ogc_api_record.py @@ -316,11 +316,7 @@ def __init__( if linkTemplates is None: linkTemplates = [] if conformsTo is None: - conformsTo = [ - OGC_API_RECORD_SPEC, - APPLICATION_TYPE_JUPYTER_SPEC, - 
APPLICATION_STAC_EXTENSION_SPEC, - ] + conformsTo = [OGC_API_RECORD_SPEC] self.id = id self.title = title self.type = type From 6c082da9847fedf7a7816b8ccc312b855d3b6294 Mon Sep 17 00:00:00 2001 From: tejas Date: Tue, 19 Aug 2025 16:57:09 +0200 Subject: [PATCH 07/11] adding jsonpickle as dependency --- environment.yml | 1 + pyproject.toml | 1 + 2 files changed, 2 insertions(+) diff --git a/environment.yml b/environment.yml index c0f4b28..b901853 100644 --- a/environment.yml +++ b/environment.yml @@ -7,6 +7,7 @@ dependencies: - click - fsspec - jsonschema + - jsonpickle - requests - pandas - pystac diff --git a/pyproject.toml b/pyproject.toml index 2a4b696..042038a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,6 +22,7 @@ dependencies = [ "click", "fsspec", "jsonschema", + "jsonpickle", "requests", "pandas", "pystac", From 721985ceeea1e2c41a8648c5a0df589b158c5825 Mon Sep 17 00:00:00 2001 From: tejas Date: Tue, 19 Aug 2025 17:01:30 +0200 Subject: [PATCH 08/11] refactor --- deep_code/tools/publish.py | 1 - 1 file changed, 1 deletion(-) diff --git a/deep_code/tools/publish.py b/deep_code/tools/publish.py index 6436384..ecf425a 100644 --- a/deep_code/tools/publish.py +++ b/deep_code/tools/publish.py @@ -153,7 +153,6 @@ def _write_to_file(file_path: str, data: dict): try: # unpicklable=False -> plain JSON (drops type metadata); cycles are resolved. 
json_content = jsonpickle.encode(data, unpicklable=False, indent=2) - # json_content = json.dumps(data, indent=2, default=serialize) except TypeError as e: raise RuntimeError(f"JSON serialization failed: {e}") From 484845152fa5d9fff110ee35ce3dc4e695047b8d Mon Sep 17 00:00:00 2001 From: tejas Date: Tue, 19 Aug 2025 18:02:05 +0200 Subject: [PATCH 09/11] add more unit tests --- deep_code/tests/tools/test_publish.py | 58 +++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/deep_code/tests/tools/test_publish.py b/deep_code/tests/tools/test_publish.py index f62db2c..d5cace9 100644 --- a/deep_code/tests/tools/test_publish.py +++ b/deep_code/tests/tools/test_publish.py @@ -4,10 +4,12 @@ from pathlib import Path from unittest.mock import MagicMock, mock_open, patch +import pytest import yaml from pystac import Catalog from deep_code.tools.publish import Publisher +from deep_code.utils.ogc_api_record import LinksBuilder class TestPublisher(unittest.TestCase): @@ -107,3 +109,59 @@ def test_read_config_files(self): # Assertions self.assertEqual(self.publisher.dataset_config, dataset_config) self.assertEqual(self.publisher.workflow_config, workflow_config) + + +class TestParseGithubNotebookUrl: + @pytest.mark.parametrize( + "url,repo_url,repo_name,branch,file_path", + [ + ( + "https://github.com/deepesdl/cube-gen/blob/main/Permafrost/Create-CCI-Permafrost-cube-EarthCODE.ipynb", + "https://github.com/deepesdl/cube-gen", + "cube-gen", + "main", + "Permafrost/Create-CCI-Permafrost-cube-EarthCODE.ipynb", + ), + ( + "https://github.com/deepesdl/cube-gen/tree/release-1.0/Permafrost/Create-CCI-Permafrost-cube-EarthCODE.ipynb", + "https://github.com/deepesdl/cube-gen", + "cube-gen", + "release-1.0", + "Permafrost/Create-CCI-Permafrost-cube-EarthCODE.ipynb", + ), + ( + "https://raw.githubusercontent.com/deepesdl/cube-gen/main/Permafrost/Create-CCI-Permafrost-cube-EarthCODE.ipynb", + "https://github.com/deepesdl/cube-gen", + "cube-gen", + "main", + 
"Permafrost/Create-CCI-Permafrost-cube-EarthCODE.ipynb", + ), + ], + ) + def test_valid_urls(self, url, repo_url, repo_name, branch, file_path): + got_repo_url, got_repo_name, got_branch, got_file_path = LinksBuilder._parse_github_notebook_url( + url + ) + assert got_repo_url == repo_url + assert got_repo_name == repo_name + assert got_branch == branch + assert got_file_path == file_path + + def test_invalid_domain(self): + url = "https://gitlab.com/deepesdl/cube-gen/-/blob/main/Permafrost/Create-CCI-Permafrost-cube-EarthCODE.ipynb" + with pytest.raises(ValueError) as e: + LinksBuilder._parse_github_notebook_url(url) + assert "Only GitHub URLs are supported" in str(e.value) + + def test_unexpected_github_format_missing_blob_or_tree(self): + # Missing the "blob" or "tree" segment + url = "https://github.com/deepesdl/cube-gen/main/Permafrost/Create-CCI-Permafrost-cube-EarthCODE.ipynb" + with pytest.raises(ValueError) as e: + LinksBuilder._parse_github_notebook_url(url) + assert "Unexpected GitHub URL format" in str(e.value) + + def test_unexpected_raw_format_too_short(self): + url = "https://raw.githubusercontent.com/deepesdl/cube-gen/main" + with pytest.raises(ValueError) as e: + LinksBuilder._parse_github_notebook_url(url) + assert "Unexpected raw.githubusercontent URL format" in str(e.value) From c3e00929f2c5be1eb588a52169b56ac3685324d2 Mon Sep 17 00:00:00 2001 From: Tejas Morbagal Harish Date: Wed, 20 Aug 2025 08:56:46 +0200 Subject: [PATCH 10/11] Update deep_code/tools/publish.py Co-authored-by: Thomas Storm --- deep_code/tools/publish.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deep_code/tools/publish.py b/deep_code/tools/publish.py index ecf425a..15b3699 100644 --- a/deep_code/tools/publish.py +++ b/deep_code/tools/publish.py @@ -315,7 +315,7 @@ def _update_base_catalog( return base_catalog - def generate_workflow_experiment_records(self, write_to_file: bool = False): + def generate_workflow_experiment_records(self, write_to_file: bool 
= False) -> None: """prepare workflow and experiment as ogc api record to publish it to the specified GitHub repository.""" workflow_id = self._normalize_name(self.workflow_config.get("workflow_id")) From febcc45f840fd96a2c672b8151733eb49ed14b51 Mon Sep 17 00:00:00 2001 From: tejas Date: Wed, 20 Aug 2025 09:07:42 +0200 Subject: [PATCH 11/11] add type hint for return type --- deep_code/utils/ogc_api_record.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/deep_code/utils/ogc_api_record.py b/deep_code/utils/ogc_api_record.py index 831e383..22bc5eb 100644 --- a/deep_code/utils/ogc_api_record.py +++ b/deep_code/utils/ogc_api_record.py @@ -1,4 +1,4 @@ -from typing import Any, Optional, Tuple +from typing import Any, Optional, Tuple, List, Dict from urllib.parse import quote, urlencode, urlparse from xrlint.util.constructible import MappingConstructible @@ -125,7 +125,7 @@ def build_link_to_dataset(collection_id): } ] - def build_link_to_jnb(self, workflow_title, jupyter_nb_url): + def build_link_to_jnb(self, workflow_title, jupyter_nb_url) -> List[Dict[str, Any]]: return [ { "rel": "application",