Skip to content
10 changes: 10 additions & 0 deletions deep_code/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,3 +29,13 @@
".json"
)
PROJECT_COLLECTION_NAME = "deep-earth-system-data-lab"
# Base URL of the DeepESDL JupyterHub "git-pull" redirect endpoint
# (nbgitpuller-style). Presumably used to build "open this notebook in the
# hub" links — confirm against LinksBuilder usage.
DEEPESDL_GIT_PULL_BASE = (
    "https://deep.earthsystemdatalab.net/hub/user-redirect/git-pull"
)
# EOEPCA metadata-profile schema URI marking a record as a Jupyter-notebook
# application type; appended to a workflow record's "conformsTo" list.
APPLICATION_TYPE_JUPYTER_SPEC = (
    "https://raw.githubusercontent.com/EOEPCA/metadata"
    "-profile/refs/heads/1.0/schemas/application-type-jupyter-notebook"
)
# JSON schema URI of the STAC "application" extension (v0.1.0); likewise
# appended to "conformsTo" for workflow records.
APPLICATION_STAC_EXTENSION_SPEC = (
    "https://stac-extensions.github.io/application/v0.1.0/schema.json"
)
58 changes: 58 additions & 0 deletions deep_code/tests/tools/test_publish.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,12 @@
from pathlib import Path
from unittest.mock import MagicMock, mock_open, patch

import pytest
import yaml
from pystac import Catalog

from deep_code.tools.publish import Publisher
from deep_code.utils.ogc_api_record import LinksBuilder


class TestPublisher(unittest.TestCase):
Expand Down Expand Up @@ -107,3 +109,59 @@ def test_read_config_files(self):
# Assertions
self.assertEqual(self.publisher.dataset_config, dataset_config)
self.assertEqual(self.publisher.workflow_config, workflow_config)


class TestParseGithubNotebookUrl:
    """Tests for ``LinksBuilder._parse_github_notebook_url``.

    Covers the two accepted URL shapes — ``github.com`` links containing a
    ``blob``/``tree`` segment and ``raw.githubusercontent.com`` links — as
    well as the rejection paths for foreign hosts and malformed URLs.
    """

    @pytest.mark.parametrize(
        "url,repo_url,repo_name,branch,file_path",
        [
            # github.com URL using the "blob" segment
            (
                "https://github.com/deepesdl/cube-gen/blob/main/Permafrost/Create-CCI-Permafrost-cube-EarthCODE.ipynb",
                "https://github.com/deepesdl/cube-gen",
                "cube-gen",
                "main",
                "Permafrost/Create-CCI-Permafrost-cube-EarthCODE.ipynb",
            ),
            # github.com URL using the "tree" segment and a release branch
            (
                "https://github.com/deepesdl/cube-gen/tree/release-1.0/Permafrost/Create-CCI-Permafrost-cube-EarthCODE.ipynb",
                "https://github.com/deepesdl/cube-gen",
                "cube-gen",
                "release-1.0",
                "Permafrost/Create-CCI-Permafrost-cube-EarthCODE.ipynb",
            ),
            # raw.githubusercontent.com URL (no blob/tree segment at all)
            (
                "https://raw.githubusercontent.com/deepesdl/cube-gen/main/Permafrost/Create-CCI-Permafrost-cube-EarthCODE.ipynb",
                "https://github.com/deepesdl/cube-gen",
                "cube-gen",
                "main",
                "Permafrost/Create-CCI-Permafrost-cube-EarthCODE.ipynb",
            ),
        ],
    )
    def test_valid_urls(self, url, repo_url, repo_name, branch, file_path):
        """A well-formed URL parses into (repo_url, repo_name, branch, path)."""
        parsed = LinksBuilder._parse_github_notebook_url(url)
        assert parsed == (repo_url, repo_name, branch, file_path)

    def test_invalid_domain(self):
        """Hosts other than GitHub are rejected with a ValueError."""
        gitlab_url = (
            "https://gitlab.com/deepesdl/cube-gen/-/blob/main/"
            "Permafrost/Create-CCI-Permafrost-cube-EarthCODE.ipynb"
        )
        with pytest.raises(ValueError) as exc_info:
            LinksBuilder._parse_github_notebook_url(gitlab_url)
        assert "Only GitHub URLs are supported" in str(exc_info.value)

    def test_unexpected_github_format_missing_blob_or_tree(self):
        """A github.com URL lacking the blob/tree segment is rejected."""
        bad_url = (
            "https://github.com/deepesdl/cube-gen/main/"
            "Permafrost/Create-CCI-Permafrost-cube-EarthCODE.ipynb"
        )
        with pytest.raises(ValueError) as exc_info:
            LinksBuilder._parse_github_notebook_url(bad_url)
        assert "Unexpected GitHub URL format" in str(exc_info.value)

    def test_unexpected_raw_format_too_short(self):
        """A raw.githubusercontent URL with too few path parts is rejected."""
        short_url = "https://raw.githubusercontent.com/deepesdl/cube-gen/main"
        with pytest.raises(ValueError) as exc_info:
            LinksBuilder._parse_github_notebook_url(short_url)
        assert "Unexpected raw.githubusercontent URL format" in str(exc_info.value)
3 changes: 3 additions & 0 deletions deep_code/tests/utils/test_dataset_stac_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,8 +65,11 @@ def setUp(self, mock_data_store):
self.generator = OscDatasetStacGenerator(
dataset_id="mock-dataset-id",
collection_id="mock-collection-id",
workflow_id="dummy",
workflow_title="test",
access_link="s3://mock-bucket/mock-dataset",
documentation_link="https://example.com/docs",
license_type="proprietary",
osc_status="ongoing",
osc_region="Global",
osc_themes=["climate", "environment"],
Expand Down
19 changes: 16 additions & 3 deletions deep_code/tests/utils/test_ogc_api_record.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
import unittest

from deep_code.constants import OGC_API_RECORD_SPEC
from deep_code.constants import (
APPLICATION_STAC_EXTENSION_SPEC,
APPLICATION_TYPE_JUPYTER_SPEC,
OGC_API_RECORD_SPEC,
)
from deep_code.utils.ogc_api_record import (
Contact,
ExperimentAsOgcRecord,
Expand Down Expand Up @@ -136,7 +140,9 @@ def test_record_properties_to_dict(self):

class TestLinksBuilder(unittest.TestCase):
def test_build_theme_links_for_records(self):
links_builder = LinksBuilder(themes=["climate", "ocean"])
links_builder = LinksBuilder(
themes=["climate", "ocean"], jupyter_kernel_info={}
)
theme_links = links_builder.build_theme_links_for_records()

expected_links = [
Expand Down Expand Up @@ -201,7 +207,14 @@ def test_workflow_as_ogc_record_initialization(self):
workflow_record.jupyter_notebook_url, "https://example.com/notebook.ipynb"
)
self.assertEqual(workflow_record.properties, record_properties)
self.assertEqual(workflow_record.conformsTo, [OGC_API_RECORD_SPEC])
self.assertEqual(
workflow_record.conformsTo,
[
OGC_API_RECORD_SPEC,
APPLICATION_TYPE_JUPYTER_SPEC,
APPLICATION_STAC_EXTENSION_SPEC,
],
)
self.assertEqual(workflow_record.links[0]["rel"], "root")
self.assertEqual(workflow_record.links[-1]["rel"], "self")

Expand Down
13 changes: 10 additions & 3 deletions deep_code/tools/new.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,11 @@ def generate_workflow_template(output_path: Optional[str] = None) -> str:
"title": "[Human-readable title of the workflow]",
"description": "[A concise summary of what the workflow does]",
"keywords": ["[KEYWORD1]", "[KEYWORD2]"],
"themes": ["[Thematic area(s) of focus (e.g. land, ocean, atmosphere)]","[THEME1]", "[THEME2]"],
"themes": [
"[Thematic area(s) of focus (e.g. land, ocean, atmosphere)]",
"[THEME1]",
"[THEME2]",
],
"license": "[License type (e.g. MIT, Apache-2.0, CC-BY-4.0, proprietary)]",
"jupyter_kernel_info": {
"name": "[Name of the execution environment or notebook kernel]",
Expand Down Expand Up @@ -61,8 +65,11 @@ def generate_dataset_template(output_path: Optional[str] = None) -> str:
template = {
"dataset_id": "[The name of the dataset object within your S3 bucket].zarr",
"collection_id": "[A unique identifier for the dataset collection]",
"osc_themes": ["[Oceans]", "[Open Science theme (choose from "
"https://opensciencedata.esa.int/themes/catalog)"],
"osc_themes": [
"[Oceans]",
"[Open Science theme (choose from "
"https://opensciencedata.esa.int/themes/catalog)",
],
"osc_region": "[Geographical coverage, e.g. 'global']",
"dataset_status": "[Status of the dataset: 'ongoing', 'completed', or 'planned']",
"documentation_link": "[Link to relevant documentation, publication, or handbook]",
Expand Down
36 changes: 27 additions & 9 deletions deep_code/tools/publish.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,12 @@
# https://opensource.org/licenses/MIT.

import copy
import json
import logging
from datetime import datetime
from pathlib import Path

import fsspec
import jsonpickle
import yaml
from pystac import Catalog, Link

Expand All @@ -22,7 +22,6 @@
)
from deep_code.utils.dataset_stac_generator import OscDatasetStacGenerator
from deep_code.utils.github_automation import GitHubAutomation
from deep_code.utils.helper import serialize
from deep_code.utils.ogc_api_record import (
ExperimentAsOgcRecord,
LinksBuilder,
Expand Down Expand Up @@ -130,6 +129,7 @@ def __init__(
self._read_config_files()
self.collection_id = self.dataset_config.get("collection_id")
self.workflow_title = self.workflow_config.get("properties", {}).get("title")
self.workflow_id = self.workflow_config.get("workflow_id")

if not self.collection_id:
raise ValueError("collection_id is missing in dataset config.")
Expand All @@ -151,11 +151,12 @@ def _write_to_file(file_path: str, data: dict):
# Create the directory if it doesn't exist
Path(file_path).parent.mkdir(parents=True, exist_ok=True)
try:
json_content = json.dumps(data, indent=2, default=serialize)
# unpicklable=False -> plain JSON (drops type metadata); cycles are resolved.
json_content = jsonpickle.encode(data, unpicklable=False, indent=2)
except TypeError as e:
raise RuntimeError(f"JSON serialization failed: {e}")

with open(file_path, "w") as f:
with open(file_path, "w", encoding="utf-8") as f:
f.write(json_content)

def _update_and_add_to_file_dict(
Expand Down Expand Up @@ -217,6 +218,7 @@ def publish_dataset(self, write_to_file: bool = False):
osc_region = self.dataset_config.get("osc_region")
osc_themes = self.dataset_config.get("osc_themes")
cf_params = self.dataset_config.get("cf_parameter")
license_type = self.dataset_config.get("license_type")

if not dataset_id or not self.collection_id:
raise ValueError("Dataset ID or Collection ID missing in the config.")
Expand All @@ -226,6 +228,9 @@ def publish_dataset(self, write_to_file: bool = False):
generator = OscDatasetStacGenerator(
dataset_id=dataset_id,
collection_id=self.collection_id,
workflow_id=self.workflow_id,
workflow_title=self.workflow_title,
license_type=license_type,
documentation_link=documentation_link,
access_link=access_link,
osc_status=dataset_status,
Expand Down Expand Up @@ -310,7 +315,7 @@ def _update_base_catalog(

return base_catalog

def publish_workflow_experiment(self, write_to_file: bool = False):
def generate_workflow_experiment_records(self, write_to_file: bool = False) -> None:
"""prepare workflow and experiment as ogc api record to publish it to the
specified GitHub repository."""
workflow_id = self._normalize_name(self.workflow_config.get("workflow_id"))
Expand All @@ -328,16 +333,23 @@ def publish_workflow_experiment(self, write_to_file: bool = False):
wf_record_properties = rg.build_record_properties(properties_list, contacts)
# make a copy for experiment record
exp_record_properties = copy.deepcopy(wf_record_properties)
jupyter_kernel_info = wf_record_properties.jupyter_kernel_info.to_dict()

link_builder = LinksBuilder(osc_themes)
link_builder = LinksBuilder(osc_themes, jupyter_kernel_info)
theme_links = link_builder.build_theme_links_for_records()
application_link = link_builder.build_link_to_jnb(
self.workflow_title, jupyter_notebook_url
)
jnb_open_link = link_builder.make_related_link_for_opening_jnb_from_github(
jupyter_notebook_url=jupyter_notebook_url
)

workflow_record = WorkflowAsOgcRecord(
id=workflow_id,
type="Feature",
title=self.workflow_title,
properties=wf_record_properties,
links=links + theme_links,
links=links + theme_links + application_link + jnb_open_link,
jupyter_notebook_url=jupyter_notebook_url,
themes=osc_themes,
)
Expand All @@ -347,21 +359,24 @@ def publish_workflow_experiment(self, write_to_file: bool = False):
del workflow_dict["jupyter_notebook_url"]
if "osc_workflow" in workflow_dict["properties"]:
del workflow_dict["properties"]["osc_workflow"]
# add workflow record to file_dict
wf_file_path = f"workflows/{workflow_id}/record.json"
file_dict = {wf_file_path: workflow_dict}

# Build properties for the experiment record
exp_record_properties.type = "experiment"
exp_record_properties.osc_workflow = workflow_id

dataset_link = link_builder.build_link_to_dataset(self.collection_id)

experiment_record = ExperimentAsOgcRecord(
id=workflow_id,
title=self.workflow_title,
type="Feature",
jupyter_notebook_url=jupyter_notebook_url,
collection_id=self.collection_id,
properties=exp_record_properties,
links=links + theme_links,
links=links + theme_links + dataset_link,
)
# Convert to dictionary and cleanup
experiment_dict = experiment_record.to_dict()
Expand All @@ -371,6 +386,7 @@ def publish_workflow_experiment(self, write_to_file: bool = False):
del experiment_dict["collection_id"]
if "osc:project" in experiment_dict["properties"]:
del experiment_dict["properties"]["osc:project"]
# add experiment record to file_dict
exp_file_path = f"experiments/{workflow_id}/record.json"
file_dict[exp_file_path] = experiment_dict

Expand All @@ -397,7 +413,9 @@ def publish_all(self, write_to_file: bool = False):
"""Publish both dataset and workflow/experiment in a single PR."""
# Get file dictionaries from both methods
dataset_files = self.publish_dataset(write_to_file=write_to_file)
workflow_files = self.publish_workflow_experiment(write_to_file=write_to_file)
workflow_files = self.generate_workflow_experiment_records(
write_to_file=write_to_file
)

# Combine the file dictionaries
combined_files = {**dataset_files, **workflow_files}
Expand Down
2 changes: 1 addition & 1 deletion deep_code/utils/custom_xrlint_rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ def export_config() -> list:
"content-desc": "off",
"no-empty-attrs": "off",
"conventions": "off",
"time-coordinate": "off"
"time-coordinate": "off",
}
},
"deepcode/recommended",
Expand Down
17 changes: 17 additions & 0 deletions deep_code/utils/dataset_stac_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@ def __init__(
self,
dataset_id: str,
collection_id: str,
workflow_id: str,
workflow_title: str,
license_type: str,
access_link: str | None = None,
documentation_link: str | None = None,
osc_status: str = "ongoing",
Expand All @@ -49,6 +52,9 @@ def __init__(
):
self.dataset_id = dataset_id
self.collection_id = collection_id
self.workflow_id = workflow_id
self.workflow_title = workflow_title
self.license_type = license_type
self.access_link = access_link or f"s3://deep-esdl-public/{dataset_id}"
self.documentation_link = documentation_link
self.osc_status = osc_status
Expand Down Expand Up @@ -478,6 +484,17 @@ def build_dataset_stac_collection(self) -> Collection:
)
)

collection.add_link(
Link(
rel="related",
target=f"../../experiments/{self.workflow_id}/record.json",
media_type="application/json",
title=f"Experiment: {self.workflow_title}",
)
)

collection.license = self.license_type

# Validate OSC extension fields
try:
osc_extension.validate_extension()
Expand Down
Loading