diff --git a/deep_code/constants.py b/deep_code/constants.py
index ea3b762..9d01cae 100644
--- a/deep_code/constants.py
+++ b/deep_code/constants.py
@@ -29,3 +29,13 @@
     ".json"
 )
 PROJECT_COLLECTION_NAME = "deep-earth-system-data-lab"
+DEEPESDL_GIT_PULL_BASE = (
+    "https://deep.earthsystemdatalab.net/hub/user-redirect/git-pull"
+)
+APPLICATION_TYPE_JUPYTER_SPEC = (
+    "https://raw.githubusercontent.com/EOEPCA/metadata"
+    "-profile/refs/heads/1.0/schemas/application-type-jupyter-notebook"
+)
+APPLICATION_STAC_EXTENSION_SPEC = (
+    "https://stac-extensions.github.io/application/v0.1.0/schema.json"
+)
diff --git a/deep_code/tests/tools/test_publish.py b/deep_code/tests/tools/test_publish.py
index f62db2c..d5cace9 100644
--- a/deep_code/tests/tools/test_publish.py
+++ b/deep_code/tests/tools/test_publish.py
@@ -4,10 +4,12 @@
 from pathlib import Path
 from unittest.mock import MagicMock, mock_open, patch
 
+import pytest
 import yaml
 from pystac import Catalog
 
 from deep_code.tools.publish import Publisher
+from deep_code.utils.ogc_api_record import LinksBuilder
 
 
 class TestPublisher(unittest.TestCase):
@@ -107,3 +109,59 @@ def test_read_config_files(self):
         # Assertions
         self.assertEqual(self.publisher.dataset_config, dataset_config)
         self.assertEqual(self.publisher.workflow_config, workflow_config)
+
+
+class TestParseGithubNotebookUrl:
+    @pytest.mark.parametrize(
+        "url,repo_url,repo_name,branch,file_path",
+        [
+            (
+                "https://github.com/deepesdl/cube-gen/blob/main/Permafrost/Create-CCI-Permafrost-cube-EarthCODE.ipynb",
+                "https://github.com/deepesdl/cube-gen",
+                "cube-gen",
+                "main",
+                "Permafrost/Create-CCI-Permafrost-cube-EarthCODE.ipynb",
+            ),
+            (
+                "https://github.com/deepesdl/cube-gen/tree/release-1.0/Permafrost/Create-CCI-Permafrost-cube-EarthCODE.ipynb",
+                "https://github.com/deepesdl/cube-gen",
+                "cube-gen",
+                "release-1.0",
+                "Permafrost/Create-CCI-Permafrost-cube-EarthCODE.ipynb",
+            ),
+            (
+                "https://raw.githubusercontent.com/deepesdl/cube-gen/main/Permafrost/Create-CCI-Permafrost-cube-EarthCODE.ipynb",
+                "https://github.com/deepesdl/cube-gen",
+                "cube-gen",
+                "main",
+                "Permafrost/Create-CCI-Permafrost-cube-EarthCODE.ipynb",
+            ),
+        ],
+    )
+    def test_valid_urls(self, url, repo_url, repo_name, branch, file_path):
+        got_repo_url, got_repo_name, got_branch, got_file_path = LinksBuilder._parse_github_notebook_url(
+            url
+        )
+        assert got_repo_url == repo_url
+        assert got_repo_name == repo_name
+        assert got_branch == branch
+        assert got_file_path == file_path
+
+    def test_invalid_domain(self):
+        url = "https://gitlab.com/deepesdl/cube-gen/-/blob/main/Permafrost/Create-CCI-Permafrost-cube-EarthCODE.ipynb"
+        with pytest.raises(ValueError) as e:
+            LinksBuilder._parse_github_notebook_url(url)
+        assert "Only GitHub URLs are supported" in str(e.value)
+
+    def test_unexpected_github_format_missing_blob_or_tree(self):
+        # Missing the "blob" or "tree" segment
+        url = "https://github.com/deepesdl/cube-gen/main/Permafrost/Create-CCI-Permafrost-cube-EarthCODE.ipynb"
+        with pytest.raises(ValueError) as e:
+            LinksBuilder._parse_github_notebook_url(url)
+        assert "Unexpected GitHub URL format" in str(e.value)
+
+    def test_unexpected_raw_format_too_short(self):
+        url = "https://raw.githubusercontent.com/deepesdl/cube-gen/main"
+        with pytest.raises(ValueError) as e:
+            LinksBuilder._parse_github_notebook_url(url)
+        assert "Unexpected raw.githubusercontent URL format" in str(e.value)
diff --git a/deep_code/tests/utils/test_dataset_stac_generator.py b/deep_code/tests/utils/test_dataset_stac_generator.py
index 464c538..1e47b58 100644
--- a/deep_code/tests/utils/test_dataset_stac_generator.py
+++ b/deep_code/tests/utils/test_dataset_stac_generator.py
@@ -65,8 +65,11 @@ def setUp(self, mock_data_store):
         self.generator = OscDatasetStacGenerator(
             dataset_id="mock-dataset-id",
             collection_id="mock-collection-id",
+            workflow_id="dummy",
+            workflow_title="test",
             access_link="s3://mock-bucket/mock-dataset",
             documentation_link="https://example.com/docs",
+            license_type="proprietary",
             osc_status="ongoing",
             osc_region="Global",
             osc_themes=["climate", "environment"],
diff --git a/deep_code/tests/utils/test_ogc_api_record.py b/deep_code/tests/utils/test_ogc_api_record.py
index 236ed4c..6cc3f3f 100644
--- a/deep_code/tests/utils/test_ogc_api_record.py
+++ b/deep_code/tests/utils/test_ogc_api_record.py
@@ -1,6 +1,10 @@
 import unittest
 
-from deep_code.constants import OGC_API_RECORD_SPEC
+from deep_code.constants import (
+    APPLICATION_STAC_EXTENSION_SPEC,
+    APPLICATION_TYPE_JUPYTER_SPEC,
+    OGC_API_RECORD_SPEC,
+)
 from deep_code.utils.ogc_api_record import (
     Contact,
     ExperimentAsOgcRecord,
@@ -136,7 +140,9 @@ def test_record_properties_to_dict(self):
 
 class TestLinksBuilder(unittest.TestCase):
     def test_build_theme_links_for_records(self):
-        links_builder = LinksBuilder(themes=["climate", "ocean"])
+        links_builder = LinksBuilder(
+            themes=["climate", "ocean"], jupyter_kernel_info={}
+        )
         theme_links = links_builder.build_theme_links_for_records()
 
         expected_links = [
@@ -201,7 +207,14 @@ def test_workflow_as_ogc_record_initialization(self):
             workflow_record.jupyter_notebook_url, "https://example.com/notebook.ipynb"
         )
         self.assertEqual(workflow_record.properties, record_properties)
-        self.assertEqual(workflow_record.conformsTo, [OGC_API_RECORD_SPEC])
+        self.assertEqual(
+            workflow_record.conformsTo,
+            [
+                OGC_API_RECORD_SPEC,
+                APPLICATION_TYPE_JUPYTER_SPEC,
+                APPLICATION_STAC_EXTENSION_SPEC,
+            ],
+        )
         self.assertEqual(workflow_record.links[0]["rel"], "root")
         self.assertEqual(workflow_record.links[-1]["rel"], "self")
 
diff --git a/deep_code/tools/new.py b/deep_code/tools/new.py
index d696175..3e1bd08 100644
--- a/deep_code/tools/new.py
+++ b/deep_code/tools/new.py
@@ -20,7 +20,11 @@ def generate_workflow_template(output_path: Optional[str] = None) -> str:
             "title": "[Human-readable title of the workflow]",
             "description": "[A concise summary of what the workflow does]",
             "keywords": ["[KEYWORD1]", "[KEYWORD2]"],
-            "themes": ["[Thematic area(s) of focus (e.g. land, ocean, atmosphere)]","[THEME1]", "[THEME2]"],
+            "themes": [
+                "[Thematic area(s) of focus (e.g. land, ocean, atmosphere)]",
+                "[THEME1]",
+                "[THEME2]",
+            ],
             "license": "[License type (e.g. MIT, Apache-2.0, CC-BY-4.0, proprietary)]",
             "jupyter_kernel_info": {
                 "name": "[Name of the execution environment or notebook kernel]",
@@ -61,8 +65,11 @@ def generate_dataset_template(output_path: Optional[str] = None) -> str:
     template = {
         "dataset_id": "[The name of the dataset object within your S3 bucket].zarr",
         "collection_id": "[A unique identifier for the dataset collection]",
-        "osc_themes": ["[Oceans]", "[Open Science theme (choose from "
-                       "https://opensciencedata.esa.int/themes/catalog)"],
+        "osc_themes": [
+            "[Oceans]",
+            "[Open Science theme (choose from "
+            "https://opensciencedata.esa.int/themes/catalog)]",
+        ],
         "osc_region": "[Geographical coverage, e.g. 'global']",
'global']", "dataset_status": "[Status of the dataset: 'ongoing', 'completed', or 'planned']", "documentation_link": "[Link to relevant documentation, publication, or handbook]", diff --git a/deep_code/tools/publish.py b/deep_code/tools/publish.py index 9918847..15b3699 100644 --- a/deep_code/tools/publish.py +++ b/deep_code/tools/publish.py @@ -4,12 +4,12 @@ # https://opensource.org/licenses/MIT. import copy -import json import logging from datetime import datetime from pathlib import Path import fsspec +import jsonpickle import yaml from pystac import Catalog, Link @@ -22,7 +22,6 @@ ) from deep_code.utils.dataset_stac_generator import OscDatasetStacGenerator from deep_code.utils.github_automation import GitHubAutomation -from deep_code.utils.helper import serialize from deep_code.utils.ogc_api_record import ( ExperimentAsOgcRecord, LinksBuilder, @@ -130,6 +129,7 @@ def __init__( self._read_config_files() self.collection_id = self.dataset_config.get("collection_id") self.workflow_title = self.workflow_config.get("properties", {}).get("title") + self.workflow_id = self.workflow_config.get("workflow_id") if not self.collection_id: raise ValueError("collection_id is missing in dataset config.") @@ -151,11 +151,12 @@ def _write_to_file(file_path: str, data: dict): # Create the directory if it doesn't exist Path(file_path).parent.mkdir(parents=True, exist_ok=True) try: - json_content = json.dumps(data, indent=2, default=serialize) + # unpicklable=False -> plain JSON (drops type metadata); cycles are resolved. + json_content = jsonpickle.encode(data, unpicklable=False, indent=2) except TypeError as e: raise RuntimeError(f"JSON serialization failed: {e}") - with open(file_path, "w") as f: + with open(file_path, "w", encoding="utf-8") as f: f.write(json_content) def _update_and_add_to_file_dict( @@ -217,6 +218,7 @@ def publish_dataset(self, write_to_file: bool = False): osc_region = self.dataset_config.get("osc_region") osc_themes = self.dataset_config.get("osc_themes") cf_params = self.dataset_config.get("cf_parameter") + license_type = self.dataset_config.get("license_type") if not dataset_id or not self.collection_id: raise ValueError("Dataset ID or Collection ID missing in the config.") @@ -226,6 +228,9 @@ def publish_dataset(self, write_to_file: bool = False): generator = OscDatasetStacGenerator( dataset_id=dataset_id, collection_id=self.collection_id, + workflow_id=self.workflow_id, + workflow_title=self.workflow_title, + license_type=license_type, documentation_link=documentation_link, access_link=access_link, osc_status=dataset_status, @@ -310,7 +315,7 @@ def _update_base_catalog( return base_catalog - def publish_workflow_experiment(self, write_to_file: bool = False): + def generate_workflow_experiment_records(self, write_to_file: bool = False) -> None: """prepare workflow and experiment as ogc api record to publish it to the specified GitHub repository.""" workflow_id = self._normalize_name(self.workflow_config.get("workflow_id")) @@ -328,16 +333,23 @@ def publish_workflow_experiment(self, write_to_file: bool = False): wf_record_properties = rg.build_record_properties(properties_list, contacts) # make a copy for experiment record exp_record_properties = copy.deepcopy(wf_record_properties) + jupyter_kernel_info = wf_record_properties.jupyter_kernel_info.to_dict() - link_builder = LinksBuilder(osc_themes) + link_builder = LinksBuilder(osc_themes, jupyter_kernel_info) theme_links = link_builder.build_theme_links_for_records() + application_link = link_builder.build_link_to_jnb( + 
@@ -347,6 +359,7 @@ def publish_workflow_experiment(self, write_to_file: bool = False):
             del workflow_dict["jupyter_notebook_url"]
         if "osc_workflow" in workflow_dict["properties"]:
             del workflow_dict["properties"]["osc_workflow"]
+
         # add workflow record to file_dict
         wf_file_path = f"workflows/{workflow_id}/record.json"
         file_dict = {wf_file_path: workflow_dict}
@@ -354,6 +367,8 @@ def publish_workflow_experiment(self, write_to_file: bool = False):
 
         exp_record_properties.type = "experiment"
         exp_record_properties.osc_workflow = workflow_id
+        dataset_link = link_builder.build_link_to_dataset(self.collection_id)
+
         experiment_record = ExperimentAsOgcRecord(
             id=workflow_id,
             title=self.workflow_title,
@@ -361,7 +376,7 @@ def publish_workflow_experiment(self, write_to_file: bool = False):
             jupyter_notebook_url=jupyter_notebook_url,
             collection_id=self.collection_id,
             properties=exp_record_properties,
-            links=links + theme_links,
+            links=links + theme_links + dataset_link,
         )
         # Convert to dictionary and cleanup
         experiment_dict = experiment_record.to_dict()
@@ -371,6 +386,7 @@ def publish_workflow_experiment(self, write_to_file: bool = False):
             del experiment_dict["collection_id"]
         if "osc:project" in experiment_dict["properties"]:
             del experiment_dict["properties"]["osc:project"]
+
         # add experiment record to file_dict
         exp_file_path = f"experiments/{workflow_id}/record.json"
         file_dict[exp_file_path] = experiment_dict
@@ -397,7 +413,9 @@ def publish_all(self, write_to_file: bool = False):
         """Publish both dataset and workflow/experiment in a single PR."""
         # Get file dictionaries from both methods
         dataset_files = self.publish_dataset(write_to_file=write_to_file)
-        workflow_files = self.publish_workflow_experiment(write_to_file=write_to_file)
+        workflow_files = self.generate_workflow_experiment_records(
+            write_to_file=write_to_file
+        )
 
         # Combine the file dictionaries
         combined_files = {**dataset_files, **workflow_files}
diff --git a/deep_code/utils/custom_xrlint_rules.py b/deep_code/utils/custom_xrlint_rules.py
index d003840..c3b9506 100644
--- a/deep_code/utils/custom_xrlint_rules.py
+++ b/deep_code/utils/custom_xrlint_rules.py
@@ -71,7 +71,7 @@ def export_config() -> list:
                 "content-desc": "off",
                 "no-empty-attrs": "off",
                 "conventions": "off",
-                "time-coordinate": "off"
+                "time-coordinate": "off",
             }
         },
         "deepcode/recommended",
diff --git a/deep_code/utils/dataset_stac_generator.py b/deep_code/utils/dataset_stac_generator.py
index 3e96a6e..a30a817 100644
--- a/deep_code/utils/dataset_stac_generator.py
+++ b/deep_code/utils/dataset_stac_generator.py
@@ -39,6 +39,9 @@ def __init__(
         self,
         dataset_id: str,
         collection_id: str,
+        workflow_id: str,
+        workflow_title: str,
+        license_type: str,
         access_link: str | None = None,
         documentation_link: str | None = None,
         osc_status: str = "ongoing",
@@ -49,6 +52,9 @@
     ):
         self.dataset_id = dataset_id
         self.collection_id = collection_id
+        self.workflow_id = workflow_id
+        self.workflow_title = workflow_title
+        self.license_type = license_type
         self.access_link = access_link or f"s3://deep-esdl-public/{dataset_id}"
         self.documentation_link = documentation_link
         self.osc_status = osc_status
@@ -478,6 +484,17 @@
             )
         )
 
+        collection.add_link(
+            Link(
+                rel="related",
+                target=f"../../experiments/{self.workflow_id}/record.json",
+                media_type="application/json",
+                title=f"Experiment: {self.workflow_title}",
+            )
+        )
+
+        collection.license = self.license_type
+
         # Validate OSC extension fields
         try:
             osc_extension.validate_extension()
diff --git a/deep_code/utils/ogc_api_record.py b/deep_code/utils/ogc_api_record.py
index fd417ca..22bc5eb 100644
--- a/deep_code/utils/ogc_api_record.py
+++ b/deep_code/utils/ogc_api_record.py
@@ -1,10 +1,14 @@
-from typing import Any, Optional
+from typing import Any, Optional, Tuple, List, Dict
+from urllib.parse import quote, urlencode, urlparse
 
 from xrlint.util.constructible import MappingConstructible
 from xrlint.util.serializable import JsonSerializable, JsonValue
 
 from deep_code.constants import (
+    APPLICATION_STAC_EXTENSION_SPEC,
+    APPLICATION_TYPE_JUPYTER_SPEC,
     BASE_URL_OSC,
+    DEEPESDL_GIT_PULL_BASE,
     OGC_API_RECORD_SPEC,
     PROJECT_COLLECTION_NAME,
 )
@@ -86,12 +90,16 @@ def to_dict(self, value_name: str | None = None) -> dict[str, JsonValue]:
         if self.osc_project is not None:
             data["osc:project"] = self.osc_project
             del data["osc_project"]
+        data["application:type"] = "jupyter-notebook"
+        data["application:container"] = "true"
+        data["application:language"] = "Python"
         return data
 
 
 class LinksBuilder:
-    def __init__(self, themes: list[str]):
+    def __init__(self, themes: list[str], jupyter_kernel_info: dict[str, Any]):
         self.themes = themes
+        self.jupyter_kernel_info = jupyter_kernel_info
         self.theme_links = []
 
     def build_theme_links_for_records(self):
@@ -117,6 +125,99 @@ def build_link_to_dataset(collection_id):
             }
         ]
 
+    def build_link_to_jnb(self, workflow_title, jupyter_nb_url) -> List[Dict[str, Any]]:
+        return [
+            {
+                "rel": "application",
+                "title": f"Jupyter Notebook: {workflow_title}",
+                "href": jupyter_nb_url,
+                "type": "application/x-ipynb+json",
+                "application:type": "jupyter-notebook",
+                "application:container": "true",
+                "application:language": "Python",
+                "jupyter:kernel": {
+                    "name": self.jupyter_kernel_info["name"],
+                    "pythonVersion": self.jupyter_kernel_info["python_version"],
+                    "envFile": self.jupyter_kernel_info["env_file"],
+                },
+            }
+        ]
+
+    @staticmethod
+    def _parse_github_notebook_url(url: str) -> Tuple[str, str, str, str]:
+        """
+        Returns (repo_url, repo_name, branch, file_path_in_repo) from a GitHub URL.
+
+        Supports:
+        - https://github.com/<owner>/<repo>/blob/<branch>/<path>
+        - https://raw.githubusercontent.com/<owner>/<repo>/<branch>/<path>
+        """
+        p = urlparse(url)
+        parts = p.path.strip("/").split("/")
+
+        if p.netloc == "github.com":
+            if len(parts) >= 5 and parts[2] in ("blob", "tree"):
+                owner, repo, _blob_or_tree, branch = parts[:4]
+                file_path = "/".join(parts[4:])
+            else:
+                raise ValueError(f"Unexpected GitHub URL format: {url}")
+            repo_url = f"https://github.com/{owner}/{repo}"
+            repo_name = repo
+
+        elif p.netloc == "raw.githubusercontent.com":
+            if len(parts) >= 4:
+                owner, repo, branch = parts[:3]
+                file_path = "/".join(parts[3:])
+            else:
+                raise ValueError(f"Unexpected raw.githubusercontent URL format: {url}")
+            repo_url = f"https://github.com/{owner}/{repo}"
+            repo_name = repo
+
+        else:
+            raise ValueError(f"Only GitHub URLs are supported: {url}")
+
+        return repo_url, repo_name, branch, file_path
+
+    def build_deepesdl_notebook_href_from_github(
+        self,
+        jupyter_notebook_url: str,
+        base_redirect: str = DEEPESDL_GIT_PULL_BASE,
+        branch_override: str | None = None,
+    ) -> str:
+        """
+        Build DeepESDL git-pull redirect from a full GitHub notebook URL.
+        {base}?repo=<repo_url>&urlpath=lab/tree/<repo_name>/<file_path>&branch=<branch>
+        """
+        repo_url, repo_name, branch, file_path = self._parse_github_notebook_url(
+            jupyter_notebook_url
+        )
+        if branch_override:
+            branch = branch_override
+
+        params = {
+            "repo": repo_url,
+            "urlpath": f"lab/tree/{repo_name}/{file_path}",
+            "branch": branch,
+        }
+        return f"{base_redirect}?{urlencode(params, quote_via=quote)}"
+
+    def make_related_link_for_opening_jnb_from_github(
+        self,
+        jupyter_notebook_url: str,
+        title: str = "Open notebook on the DeepESDL platform",
+        branch_override: str | None = None,
+    ) -> list[dict[str, str]]:
+        return [
+            {
+                "rel": "related",
+                "href": self.build_deepesdl_notebook_href_from_github(
+                    jupyter_notebook_url, branch_override=branch_override
+                ),
+                "type": "text/html",
+                "title": title,
+            }
+        ]
+
 
 class WorkflowAsOgcRecord(MappingConstructible["OgcRecord"], JsonSerializable):
     def __init__(
@@ -133,7 +234,11 @@ def __init__(
         themes: Optional[Any] = None,
     ):
         if conformsTo is None:
-            conformsTo = [OGC_API_RECORD_SPEC]
+            conformsTo = [
+                OGC_API_RECORD_SPEC,
+                APPLICATION_TYPE_JUPYTER_SPEC,
+                APPLICATION_STAC_EXTENSION_SPEC,
+            ]
         self.id = id
         self.type = type
         self.title = title
@@ -172,6 +277,14 @@ def _generate_static_links(self):
                 "title": "Jupyter Notebook",
                 "href": f"{self.jupyter_notebook_url}",
             },
+            {
+                "rel": "application-originating-platform",
+                "title": "DeepESDL platform",
+                "href": "https://deep.earthsystemdatalab.net/",
+                "type": "text/html",
+                "application:platform_supports": ["jupyter-notebook"],
+                "application:preferred_app": "JupyterLab",
+            },
             {
                 "rel": "related",
                 "href": f"../../projects/{PROJECT_COLLECTION_NAME}/collection.json",
@@ -236,18 +349,20 @@ def _generate_static_links(self):
                 "type": "application/json",
                 "title": f"Workflow: {self.title}",
             },
-            {
-                "rel": "child",
-                "href": f"../../products/{self.collection_id}/collection.json",
-                "type": "application/json",
-                "title": f"{self.collection_id}",
-            },
             {
                 "rel": "related",
                 "href": f"../../projects/{PROJECT_COLLECTION_NAME}/collection.json",
                 "type": "application/json",
                 "title": "Project: DeepESDL",
             },
+            {
+                "rel": "application-originating-platform",
+                "title": "DeepESDL platform",
+                "href": "https://deep.earthsystemdatalab.net/",
+                "type": "text/html",
+                "application:platform_supports": ["jupyter-notebook"],
+                "application:preferred_app": "JupyterLab",
+            },
             {
                 "rel": "input",
                 "href": "./input.yaml",
diff --git a/environment.yml b/environment.yml
index c0f4b28..b901853 100644
--- a/environment.yml
+++ b/environment.yml
@@ -7,6 +7,7 @@ dependencies:
   - click
   - fsspec
   - jsonschema
+  - jsonpickle
   - requests
   - pandas
   - pystac
diff --git a/pyproject.toml b/pyproject.toml
index 2a4b696..042038a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -22,6 +22,7 @@ dependencies = [
     "click",
     "fsspec",
     "jsonschema",
+    "jsonpickle",
    "requests",
     "pandas",
     "pystac",
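
Example (reviewer sanity check, not part of the patch): a minimal sketch exercising the LinksBuilder helpers added in deep_code/utils/ogc_api_record.py above. It assumes this branch is importable; the notebook URL is taken from the new tests, and the jupyter_kernel_info values are placeholders.

from deep_code.utils.ogc_api_record import LinksBuilder

notebook_url = (
    "https://github.com/deepesdl/cube-gen/blob/main/"
    "Permafrost/Create-CCI-Permafrost-cube-EarthCODE.ipynb"
)

builder = LinksBuilder(
    themes=["climate"],
    jupyter_kernel_info={
        "name": "deepesdl-kernel",  # placeholder kernel metadata, only used by build_link_to_jnb
        "python_version": "3.11",
        "env_file": "environment.yml",
    },
)

# Tuple (repo_url, repo_name, branch, file_path) parsed from the GitHub "blob" URL.
print(LinksBuilder._parse_github_notebook_url(notebook_url))

# DeepESDL git-pull redirect built on DEEPESDL_GIT_PULL_BASE, roughly
# ...?repo=<encoded repo URL>&urlpath=lab/tree/cube-gen/Permafrost/...&branch=main
print(builder.build_deepesdl_notebook_href_from_github(notebook_url))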