From 1bb3858d6177437f2beadd0649fdcefcd2c158cd Mon Sep 17 00:00:00 2001 From: tejas Date: Mon, 20 Jan 2025 13:47:20 +0100 Subject: [PATCH 01/21] Implementation of publish experiment feature --- deep_code/constants.py | 4 ++ deep_code/tools/publish.py | 76 +++++++++++++++++++- deep_code/utils/ogc_api_record.py | 93 +++++++++++++++++++++++++ deep_code/utils/ogc_record_generator.py | 53 ++++++++++++++ pyproject.toml | 3 +- 5 files changed, 226 insertions(+), 3 deletions(-) create mode 100644 deep_code/utils/ogc_api_record.py create mode 100644 deep_code/utils/ogc_record_generator.py diff --git a/deep_code/constants.py b/deep_code/constants.py index 68982bc..d40d728 100644 --- a/deep_code/constants.py +++ b/deep_code/constants.py @@ -9,3 +9,7 @@ OSC_REPO_OWNER = "ESA-EarthCODE" OSC_REPO_NAME = "open-science-catalog-metadata-testing" OSC_BRANCH_NAME = "add-new-collection" +DEFAULT_THEME_SCHEME = ( + "https://gcmd.earthdata.nasa.gov/kms/concepts/concept_scheme/sciencekeywords" +) +OGC_API_RECORD_SPEC = "http://www.opengis.net/spec/ogcapi-records-1/1.0/req/record-core" diff --git a/deep_code/tools/publish.py b/deep_code/tools/publish.py index 26b49f3..27eff98 100644 --- a/deep_code/tools/publish.py +++ b/deep_code/tools/publish.py @@ -4,13 +4,16 @@ # Permissions are hereby granted under the terms of the MIT License: # https://opensource.org/licenses/MIT. 
-import fsspec import logging + +import fsspec import yaml -from deep_code.constants import OSC_REPO_OWNER, OSC_REPO_NAME, OSC_BRANCH_NAME +from deep_code.constants import OSC_BRANCH_NAME, OSC_REPO_NAME, OSC_REPO_OWNER from deep_code.utils.dataset_stac_generator import OSCProductSTACGenerator from deep_code.utils.github_automation import GitHubAutomation +from deep_code.utils.ogc_record_generator import OSCWorkflowOGCApiRecordGenerator +from utils.ogc_api_record import OgcRecord logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO) @@ -99,3 +102,72 @@ def publish_dataset(self, dataset_config_path: str): finally: self.github_automation.clean_up() + + +class WorkflowPublisher: + """Publishes workflow to the OSC GitHub repository. + + Credentials must be provided via a hidden file named `.gitaccess`, located in + the root of the repository. This file is expected to contain YAML of the form: + + github-username: "YOUR_GITHUB_USERNAME" + github-token: "YOUR_GITHUB_PERSONAL_ACCESS_TOKEN" + """ + + def __init__(self): + with fsspec.open(".gitaccess", "r") as file: + git_config = yaml.safe_load(file) or {} + + self.github_username = git_config.get("github-username") + self.github_token = git_config.get("github-token") + + if not self.github_username or not self.github_token: + raise ValueError("GitHub credentials are missing in the `.gitaccess` file.") + + self.github_automation = GitHubAutomation( + self.github_username, self.github_token, OSC_REPO_OWNER, OSC_REPO_NAME + ) + + def publish_workflow(self, workflow_config_path: str): + + with fsspec.open(workflow_config_path, "r") as file: + workflow_config = yaml.safe_load(file) + + try: + logger.info("Generating OGC API Record for the workflow...") + workflow_id = workflow_config.get("workflow_id") + properties_list = workflow_config.get("properties", []) + + + contacts = workflow_config.get("contact", []) + rg = OSCWorkflowOGCApiRecordGenerator() + wf_record_properties = 
rg.build_record_properties(properties_list, contacts) + + links = workflow_config.get("links") + ogc_record = OgcRecord( + id=workflow_id, + type="Feature", + time={}, + properties=wf_record_properties, + links=links, + ) + + file_path = f"workflow/{workflow_id}/collection.json" + logger.info("Automating GitHub tasks...") + self.github_automation.fork_repository() + self.github_automation.clone_repository() + OSC_NEW_BRANCH_NAME = OSC_BRANCH_NAME + "-" + workflow_id + self.github_automation.create_branch(OSC_NEW_BRANCH_NAME) + self.github_automation.add_file(file_path, ogc_record.to_dict()) + self.github_automation.commit_and_push( + OSC_NEW_BRANCH_NAME, f"Add new collection:{workflow_id}" + ) + pr_url = self.github_automation.create_pull_request( + OSC_NEW_BRANCH_NAME, + f"Add new collection", + "This PR adds a new workflow to the OSC repository.", + ) + logger.info(f"Pull request created: {pr_url}") + + finally: + self.github_automation.clean_up() diff --git a/deep_code/utils/ogc_api_record.py b/deep_code/utils/ogc_api_record.py new file mode 100644 index 0000000..da783e0 --- /dev/null +++ b/deep_code/utils/ogc_api_record.py @@ -0,0 +1,93 @@ +from typing import Any, Optional + +from xrlint.util.codec import JsonSerializable, MappingConstructible + +from deep_code.constants import OGC_API_RECORD_SPEC + + +class Contact(MappingConstructible["Contact"], JsonSerializable): + def __init__( + self, + name: str, + organization: str, + position: str | None = "", + links: list[dict[str, Any]] | None = None, + contactInstructions: str | None = "", + roles: list[str] = None, + ): + self.name = name + self.organization = organization + self.position = position + self.links = links or [] + self.contactInstructions = contactInstructions + self.roles = roles or ["principal investigator"] + + +class ThemeConcept(MappingConstructible["ThemeConcept"], JsonSerializable): + def __init__(self, id: str): + self.id = id + + +class Theme(MappingConstructible["Theme"], JsonSerializable): 
+ def __init__(self, concepts: list[ThemeConcept], scheme: str): + self.concepts = concepts + self.scheme = scheme + + +class JupyterKernelInfo(MappingConstructible["RecordProperties"], JsonSerializable): + def __init__(self, name: str, python_version: float, env_file: str): + self.name = name + self.python_version = python_version + self.env_file = env_file + + +class RecordProperties(MappingConstructible["RecordProperties"], JsonSerializable): + def __init__( + self, + created: str, + type: str, + title: str, + description: str, + jupyter_kernel_info: JupyterKernelInfo, + updated: str = None, + contacts: list[Contact] = None, + themes: list[Theme] = None, + keywords: list[str] | None = None, + formats: list[dict] | None = None, + license: str = None, + ): + self.created = created + self.updated = updated + self.type = type + self.title = title + self.description = description + self.jupyter_kernel_info = jupyter_kernel_info + self.keywords = keywords or [] + self.contacts = contacts + self.themes = themes + self.formats = formats or [] + self.license = license + + +class OgcRecord(MappingConstructible["OgcRecord"], JsonSerializable): + def __init__( + self, + id: str, + type: str, + time: dict, + properties: RecordProperties, + links: list[dict], + linkTemplates: list = [], + conformsTo: list[str] = None, + geometry: Optional[Any] = None, + ): + if conformsTo is None: + conformsTo = [OGC_API_RECORD_SPEC] + self.id = id + self.type = type + self.conformsTo = conformsTo + self.time = time + self.geometry = geometry + self.properties = properties + self.linkTemplates = linkTemplates + self.links = links diff --git a/deep_code/utils/ogc_record_generator.py b/deep_code/utils/ogc_record_generator.py new file mode 100644 index 0000000..d323a93 --- /dev/null +++ b/deep_code/utils/ogc_record_generator.py @@ -0,0 +1,53 @@ +#!/usr/bin/env python3 + +# Copyright (c) 2025 by Brockmann Consult GmbH +# Permissions are hereby granted under the terms of the MIT License: +# 
https://opensource.org/licenses/MIT. + +from datetime import datetime, timezone + +from constants import DEFAULT_THEME_SCHEME +from deep_code.utils.ogc_api_record import ( + Contact, + RecordProperties, + Theme, + ThemeConcept, +) + + +class OSCWorkflowOGCApiRecordGenerator: + """Generates OGC API record for a workflow + """ + @staticmethod + def build_contact_objects(contacts_list: list[dict]) -> list[Contact]: + """Build a list of Contact objects from a list of contact dictionaries. + Uses the inherited MappingConstructible logic to parse each dict. + + Args: + contacts_list: A list of dictionaries, each containing contact information. + + Returns: + A list of Contact instances. + """ + return [Contact.from_value(cdict) for cdict in contacts_list] + + @staticmethod + def build_theme(osc_themes: list[str]) -> Theme: + """Convert each string into a ThemeConcept + """ + concepts = [ThemeConcept(id=theme_str) for theme_str in osc_themes] + return Theme(concepts=concepts, scheme=DEFAULT_THEME_SCHEME) + + def build_record_properties(self, properties, contacts) -> RecordProperties: + """Build a RecordProperties object from a list of single-key property dicts + """ + now_iso = datetime.now(timezone.utc).isoformat() + properties.update({"created": now_iso}) + properties.update({"updated": now_iso}) + themes_list = properties.get("themes", []) + properties.update({"contacts": self.build_contact_objects(contacts)}) + if themes_list: + theme_obj = self.build_theme(themes_list) + properties.update({"themes": [theme_obj]}) + properties.setdefault("type", "workflow") + return RecordProperties.from_value(properties) diff --git a/pyproject.toml b/pyproject.toml index 057f7b8..efac413 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,8 @@ dependencies = [ "pandas", "pystac", "pyyaml", - "xcube-core" + "xcube-core", + "xrlint", ] [tool.setuptools.dynamic] From de3c504341a0eec28f24766ef83987a79cd0f441 Mon Sep 17 00:00:00 2001 From: tejas Date: Mon, 20 Jan 2025 13:47:51 
+0100 Subject: [PATCH 02/21] code formatting --- deep_code/cli/publish.py | 5 +---- deep_code/tests/tools/test_publish.py | 3 ++- deep_code/tests/utils/test_dataset_stac_generator.py | 4 ++-- deep_code/tests/utils/test_github_automation.py | 5 +++-- deep_code/tests/utils/test_osc_extension.py | 2 ++ deep_code/utils/dataset_stac_generator.py | 2 +- deep_code/utils/github_automation.py | 3 ++- deep_code/utils/osc_extension.py | 6 +++--- 8 files changed, 16 insertions(+), 14 deletions(-) diff --git a/deep_code/cli/publish.py b/deep_code/cli/publish.py index 48b1e63..a9da0f3 100644 --- a/deep_code/cli/publish.py +++ b/deep_code/cli/publish.py @@ -10,10 +10,7 @@ @click.command(name="publish-dataset") -@click.argument( - "dataset_config", - type=click.Path(exists=True) -) +@click.argument("dataset_config", type=click.Path(exists=True)) def publish_dataset(dataset_config): """Request publishing a dataset to the open science catalogue. """ diff --git a/deep_code/tests/tools/test_publish.py b/deep_code/tests/tools/test_publish.py index 47c9961..ba306b8 100644 --- a/deep_code/tests/tools/test_publish.py +++ b/deep_code/tests/tools/test_publish.py @@ -1,5 +1,6 @@ +from unittest.mock import MagicMock, mock_open, patch + import pytest -from unittest.mock import patch, MagicMock, mock_open from deep_code.tools.publish import DatasetPublisher diff --git a/deep_code/tests/utils/test_dataset_stac_generator.py b/deep_code/tests/utils/test_dataset_stac_generator.py index 12321b2..f2ef71d 100644 --- a/deep_code/tests/utils/test_dataset_stac_generator.py +++ b/deep_code/tests/utils/test_dataset_stac_generator.py @@ -1,10 +1,10 @@ import os +import unittest from datetime import datetime +from unittest.mock import MagicMock, patch import numpy as np from pystac import Collection -import unittest -from unittest.mock import patch, MagicMock from xarray import Dataset from deep_code.utils.dataset_stac_generator import OSCProductSTACGenerator diff --git 
a/deep_code/tests/utils/test_github_automation.py b/deep_code/tests/utils/test_github_automation.py index 58acc09..6a66868 100644 --- a/deep_code/tests/utils/test_github_automation.py +++ b/deep_code/tests/utils/test_github_automation.py @@ -1,7 +1,8 @@ +import json import unittest -from unittest.mock import patch, MagicMock from pathlib import Path -import json +from unittest.mock import MagicMock, patch + from deep_code.utils.github_automation import GitHubAutomation diff --git a/deep_code/tests/utils/test_osc_extension.py b/deep_code/tests/utils/test_osc_extension.py index 66300cc..c11e3f8 100644 --- a/deep_code/tests/utils/test_osc_extension.py +++ b/deep_code/tests/utils/test_osc_extension.py @@ -1,5 +1,7 @@ import unittest + from pystac import Collection, Extent, SpatialExtent, TemporalExtent + from deep_code.utils.osc_extension import OscExtension diff --git a/deep_code/utils/dataset_stac_generator.py b/deep_code/utils/dataset_stac_generator.py index 21f4cf8..1bd0064 100644 --- a/deep_code/utils/dataset_stac_generator.py +++ b/deep_code/utils/dataset_stac_generator.py @@ -4,8 +4,8 @@ # Permissions are hereby granted under the terms of the MIT License: # https://opensource.org/licenses/MIT. -import os import logging +import os from datetime import datetime, timezone import pandas as pd diff --git a/deep_code/utils/github_automation.py b/deep_code/utils/github_automation.py index d934d2a..4ad6e38 100644 --- a/deep_code/utils/github_automation.py +++ b/deep_code/utils/github_automation.py @@ -7,10 +7,11 @@ import json import logging import os -import requests import subprocess from pathlib import Path +import requests + class GitHubAutomation: """Automates GitHub operations needed to create a Pull Request. 
diff --git a/deep_code/utils/osc_extension.py b/deep_code/utils/osc_extension.py index 6aa7519..8a777de 100644 --- a/deep_code/utils/osc_extension.py +++ b/deep_code/utils/osc_extension.py @@ -7,10 +7,10 @@ from typing import Literal import pystac -from pystac import SpatialExtent, TemporalExtent, Extent -from pystac.extensions.base import PropertiesExtension, ExtensionManagementMixin +from pystac import Extent, SpatialExtent, TemporalExtent +from pystac.extensions.base import ExtensionManagementMixin, PropertiesExtension -from deep_code.constants import OSC_SCHEMA_URI, CF_SCHEMA_URI +from deep_code.constants import CF_SCHEMA_URI, OSC_SCHEMA_URI class OscExtension( From e1c5f472db328d701c5967dcd33cbf145ab78969 Mon Sep 17 00:00:00 2001 From: tejas Date: Mon, 20 Jan 2025 14:12:39 +0100 Subject: [PATCH 03/21] new constant for wf branch --- deep_code/constants.py | 1 + deep_code/tools/publish.py | 17 ++++++++++++----- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/deep_code/constants.py b/deep_code/constants.py index d40d728..8a17bbc 100644 --- a/deep_code/constants.py +++ b/deep_code/constants.py @@ -13,3 +13,4 @@ "https://gcmd.earthdata.nasa.gov/kms/concepts/concept_scheme/sciencekeywords" ) OGC_API_RECORD_SPEC = "http://www.opengis.net/spec/ogcapi-records-1/1.0/req/record-core" +WF_BRANCH_NAME = "add-new-workflow" diff --git a/deep_code/tools/publish.py b/deep_code/tools/publish.py index 27eff98..84ea29d 100644 --- a/deep_code/tools/publish.py +++ b/deep_code/tools/publish.py @@ -9,7 +9,8 @@ import fsspec import yaml -from deep_code.constants import OSC_BRANCH_NAME, OSC_REPO_NAME, OSC_REPO_OWNER +from deep_code.constants import OSC_BRANCH_NAME, OSC_REPO_NAME, OSC_REPO_OWNER, \ + WF_BRANCH_NAME from deep_code.utils.dataset_stac_generator import OSCProductSTACGenerator from deep_code.utils.github_automation import GitHubAutomation from deep_code.utils.ogc_record_generator import OSCWorkflowOGCApiRecordGenerator @@ -156,14 +157,15 @@ def 
publish_workflow(self, workflow_config_path: str): logger.info("Automating GitHub tasks...") self.github_automation.fork_repository() self.github_automation.clone_repository() - OSC_NEW_BRANCH_NAME = OSC_BRANCH_NAME + "-" + workflow_id - self.github_automation.create_branch(OSC_NEW_BRANCH_NAME) + # WF_NEW_BRANCH_NAME = WF_BRANCH_NAME + "-" + workflow_id + WF_NEW_BRANCH_NAME = WF_BRANCH_NAME + self.github_automation.create_branch(WF_NEW_BRANCH_NAME) self.github_automation.add_file(file_path, ogc_record.to_dict()) self.github_automation.commit_and_push( - OSC_NEW_BRANCH_NAME, f"Add new collection:{workflow_id}" + WF_NEW_BRANCH_NAME, f"Add new workflow:{workflow_id}" ) pr_url = self.github_automation.create_pull_request( - OSC_NEW_BRANCH_NAME, + WF_NEW_BRANCH_NAME, f"Add new collection", "This PR adds a new workflow to the OSC repository.", ) @@ -171,3 +173,8 @@ def publish_workflow(self, workflow_config_path: str): finally: self.github_automation.clean_up() + +if __name__ == '__main__': + wp = WorkflowPublisher() + wp.publish_workflow("/home/tejas/bc/projects/deepesdl/deep-code/workflow-config" + ".yaml") From cb7c20616d5df1d10a9620136debe8354e933071 Mon Sep 17 00:00:00 2001 From: tejas Date: Mon, 20 Jan 2025 15:32:34 +0100 Subject: [PATCH 04/21] ordered imports --- deep_code/tools/publish.py | 13 +++++-------- deep_code/utils/ogc_record_generator.py | 9 +++------ deep_code/utils/osc_extension.py | 3 ++- 3 files changed, 10 insertions(+), 15 deletions(-) diff --git a/deep_code/tools/publish.py b/deep_code/tools/publish.py index 84ea29d..a5f9c87 100644 --- a/deep_code/tools/publish.py +++ b/deep_code/tools/publish.py @@ -8,13 +8,14 @@ import fsspec import yaml +from utils.ogc_api_record import OgcRecord -from deep_code.constants import OSC_BRANCH_NAME, OSC_REPO_NAME, OSC_REPO_OWNER, \ - WF_BRANCH_NAME +from deep_code.constants import (OSC_BRANCH_NAME, OSC_REPO_NAME, + OSC_REPO_OWNER, WF_BRANCH_NAME) from deep_code.utils.dataset_stac_generator import 
OSCProductSTACGenerator from deep_code.utils.github_automation import GitHubAutomation -from deep_code.utils.ogc_record_generator import OSCWorkflowOGCApiRecordGenerator -from utils.ogc_api_record import OgcRecord +from deep_code.utils.ogc_record_generator import \ + OSCWorkflowOGCApiRecordGenerator logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO) @@ -174,7 +175,3 @@ def publish_workflow(self, workflow_config_path: str): finally: self.github_automation.clean_up() -if __name__ == '__main__': - wp = WorkflowPublisher() - wp.publish_workflow("/home/tejas/bc/projects/deepesdl/deep-code/workflow-config" - ".yaml") diff --git a/deep_code/utils/ogc_record_generator.py b/deep_code/utils/ogc_record_generator.py index d323a93..25e8b7f 100644 --- a/deep_code/utils/ogc_record_generator.py +++ b/deep_code/utils/ogc_record_generator.py @@ -7,12 +7,9 @@ from datetime import datetime, timezone from constants import DEFAULT_THEME_SCHEME -from deep_code.utils.ogc_api_record import ( - Contact, - RecordProperties, - Theme, - ThemeConcept, -) + +from deep_code.utils.ogc_api_record import (Contact, RecordProperties, Theme, + ThemeConcept) class OSCWorkflowOGCApiRecordGenerator: diff --git a/deep_code/utils/osc_extension.py b/deep_code/utils/osc_extension.py index 8a777de..04bbb7f 100644 --- a/deep_code/utils/osc_extension.py +++ b/deep_code/utils/osc_extension.py @@ -8,7 +8,8 @@ import pystac from pystac import Extent, SpatialExtent, TemporalExtent -from pystac.extensions.base import ExtensionManagementMixin, PropertiesExtension +from pystac.extensions.base import (ExtensionManagementMixin, + PropertiesExtension) from deep_code.constants import CF_SCHEMA_URI, OSC_SCHEMA_URI From 2b7288d58511a6f06843a784cfae55eaa18d1335 Mon Sep 17 00:00:00 2001 From: tejas Date: Mon, 20 Jan 2025 15:32:52 +0100 Subject: [PATCH 05/21] unit tests --- deep_code/tests/utils/test_ogc_api_record.py | 102 ++++++++++++++++++ .../tests/utils/test_ogc_record_generator.py | 65 
+++++++++++ 2 files changed, 167 insertions(+) create mode 100644 deep_code/tests/utils/test_ogc_api_record.py create mode 100644 deep_code/tests/utils/test_ogc_record_generator.py diff --git a/deep_code/tests/utils/test_ogc_api_record.py b/deep_code/tests/utils/test_ogc_api_record.py new file mode 100644 index 0000000..a48aea4 --- /dev/null +++ b/deep_code/tests/utils/test_ogc_api_record.py @@ -0,0 +1,102 @@ +import unittest + +from deep_code.constants import OGC_API_RECORD_SPEC +from deep_code.utils.ogc_api_record import (Contact, JupyterKernelInfo, + OgcRecord, RecordProperties, Theme, + ThemeConcept) + + +class TestClasses(unittest.TestCase): + + def test_contact_initialization(self): + contact = Contact( + name="Person-X", + organization="Organization X", + position="Researcher", + links=[{"url": "http://example.com", "type": "website"}], + contactInstructions="Contact via email", + roles=["developer", "reviewer"] + ) + + self.assertEqual(contact.name, "Person-X") + self.assertEqual(contact.organization, "Organization X") + self.assertEqual(contact.position, "Researcher") + self.assertEqual(len(contact.links), 1) + self.assertEqual(contact.contactInstructions, "Contact via email") + self.assertIn("developer", contact.roles) + + def test_theme_concept_initialization(self): + theme_concept = ThemeConcept(id="concept1") + self.assertEqual(theme_concept.id, "concept1") + + def test_theme_initialization(self): + theme_concepts = [ThemeConcept(id="concept1"), ThemeConcept(id="concept2")] + theme = Theme(concepts=theme_concepts, scheme="http://example.com/scheme") + + self.assertEqual(len(theme.concepts), 2) + self.assertEqual(theme.scheme, "http://example.com/scheme") + + def test_jupyter_kernel_info_initialization(self): + kernel_info = JupyterKernelInfo(name="Python", python_version=3.9, env_file="env.yml") + + self.assertEqual(kernel_info.name, "Python") + self.assertEqual(kernel_info.python_version, 3.9) + self.assertEqual(kernel_info.env_file, "env.yml") + + 
def test_record_properties_initialization(self): + kernel_info = JupyterKernelInfo(name="Python", python_version=3.9, env_file="env.yml") + contacts = [Contact(name="Jane Doe", organization="Org Y")] + themes = [Theme(concepts=[ThemeConcept(id="concept1")], scheme="scheme1")] + + record_properties = RecordProperties( + created="2025-01-01", + type="dataset", + title="Sample Dataset", + description="A sample dataset", + jupyter_kernel_info=kernel_info, + updated="2025-01-02", + contacts=contacts, + themes=themes, + keywords=["sample", "test"], + formats=[{"format": "JSON"}], + license="CC-BY" + ) + + self.assertEqual(record_properties.created, "2025-01-01") + self.assertEqual(record_properties.updated, "2025-01-02") + self.assertEqual(record_properties.type, "dataset") + self.assertEqual(record_properties.title, "Sample Dataset") + self.assertEqual(record_properties.description, "A sample dataset") + self.assertEqual(record_properties.jupyter_kernel_info.name, "Python") + self.assertEqual(len(record_properties.contacts), 1) + self.assertEqual(len(record_properties.themes), 1) + self.assertIn("sample", record_properties.keywords) + self.assertEqual(record_properties.license, "CC-BY") + + def test_ogc_record_initialization(self): + kernel_info = JupyterKernelInfo(name="Python", python_version=3.9, env_file="env.yml") + properties = RecordProperties( + created="2025-01-01", + type="dataset", + title="Sample Dataset", + description="A sample dataset", + jupyter_kernel_info=kernel_info + ) + + ogc_record = OgcRecord( + id="record1", + type="Feature", + time={"start": "2025-01-01T00:00:00Z", "end": "2025-01-02T00:00:00Z"}, + properties=properties, + links=[{"href": "http://example.com", "rel": "self"}], + linkTemplates=[{"template": "http://example.com/{id}"}] + ) + + self.assertEqual(ogc_record.id, "record1") + self.assertEqual(ogc_record.type, "Feature") + self.assertEqual(ogc_record.time["start"], "2025-01-01T00:00:00Z") + self.assertEqual(ogc_record.properties.title, 
"Sample Dataset") + self.assertEqual(len(ogc_record.links), 1) + self.assertEqual(ogc_record.linkTemplates[0]["template"], "http://example.com/{id}") + self.assertEqual(ogc_record.conformsTo[0], OGC_API_RECORD_SPEC) + diff --git a/deep_code/tests/utils/test_ogc_record_generator.py b/deep_code/tests/utils/test_ogc_record_generator.py new file mode 100644 index 0000000..dc8c816 --- /dev/null +++ b/deep_code/tests/utils/test_ogc_record_generator.py @@ -0,0 +1,65 @@ +import unittest +from datetime import datetime, timezone + +from deep_code.constants import DEFAULT_THEME_SCHEME +from deep_code.utils.ogc_record_generator import OSCWorkflowOGCApiRecordGenerator + + +class TestOSCWorkflowOGCApiRecordGenerator(unittest.TestCase): + + def test_build_contact_objects(self): + contacts_list = [ + {"name": "Alice", "organization": "Org A", "position": "Researcher"}, + {"name": "Bob", "organization": "Org B", "position": "Developer"} + ] + + result = OSCWorkflowOGCApiRecordGenerator.build_contact_objects(contacts_list) + + self.assertEqual(len(result), 2) + self.assertEqual(result[0].name, "Alice") + self.assertEqual(result[0].organization, "Org A") + self.assertEqual(result[0].position, "Researcher") + self.assertEqual(result[1].name, "Bob") + self.assertEqual(result[1].organization, "Org B") + self.assertEqual(result[1].position, "Developer") + + def test_build_theme(self): + osc_themes = ["theme1", "theme2"] + + theme = OSCWorkflowOGCApiRecordGenerator.build_theme(osc_themes) + + self.assertEqual(len(theme.concepts), 2) + self.assertEqual(theme.concepts[0].id, "theme1") + self.assertEqual(theme.concepts[1].id, "theme2") + self.assertEqual(theme.scheme, DEFAULT_THEME_SCHEME) + + def test_build_record_properties(self): + generator = OSCWorkflowOGCApiRecordGenerator() + properties = { + "title": "Test Workflow", + "description": "A test description", + "themes": ["theme1"], + "jupyter_kernel_info": { + "name": "deepesdl-xcube-1.7.1", + "python_version": 3.11, + "env_file": 
"https://git/env.yml" + } + } + contacts = [ + {"name": "Alice", "organization": "Org A", "position": "Researcher"} + ] + + record_properties = generator.build_record_properties(properties, contacts) + + now_iso = datetime.now(timezone.utc).isoformat() + + self.assertEqual(record_properties.title, "Test Workflow") + self.assertEqual(record_properties.description, "A test description") + self.assertEqual(len(record_properties.contacts), 1) + self.assertEqual(record_properties.contacts[0].name, "Alice") + self.assertEqual(len(record_properties.themes), 1) + self.assertEqual(record_properties.themes[0].concepts[0].id, "theme1") + self.assertEqual(record_properties.type, "workflow") + self.assertTrue("created" in record_properties.__dict__) + self.assertTrue("updated" in record_properties.__dict__) + From 2f3cd3fb719a63d1567a6d90b612eb72672e31d8 Mon Sep 17 00:00:00 2001 From: tejas Date: Mon, 20 Jan 2025 15:42:47 +0100 Subject: [PATCH 06/21] refactor --- deep_code/tests/utils/test_ogc_api_record.py | 37 ++++++++++++------- .../tests/utils/test_ogc_record_generator.py | 8 ++-- deep_code/tools/publish.py | 15 ++++---- deep_code/utils/ogc_record_generator.py | 12 ++++-- deep_code/utils/osc_extension.py | 3 +- 5 files changed, 44 insertions(+), 31 deletions(-) diff --git a/deep_code/tests/utils/test_ogc_api_record.py b/deep_code/tests/utils/test_ogc_api_record.py index a48aea4..52640fe 100644 --- a/deep_code/tests/utils/test_ogc_api_record.py +++ b/deep_code/tests/utils/test_ogc_api_record.py @@ -1,13 +1,17 @@ import unittest from deep_code.constants import OGC_API_RECORD_SPEC -from deep_code.utils.ogc_api_record import (Contact, JupyterKernelInfo, - OgcRecord, RecordProperties, Theme, - ThemeConcept) +from deep_code.utils.ogc_api_record import ( + Contact, + JupyterKernelInfo, + OgcRecord, + RecordProperties, + Theme, + ThemeConcept, +) class TestClasses(unittest.TestCase): - def test_contact_initialization(self): contact = Contact( name="Person-X", @@ -15,7 +19,7 @@ def 
test_contact_initialization(self): position="Researcher", links=[{"url": "http://example.com", "type": "website"}], contactInstructions="Contact via email", - roles=["developer", "reviewer"] + roles=["developer", "reviewer"], ) self.assertEqual(contact.name, "Person-X") @@ -37,14 +41,18 @@ def test_theme_initialization(self): self.assertEqual(theme.scheme, "http://example.com/scheme") def test_jupyter_kernel_info_initialization(self): - kernel_info = JupyterKernelInfo(name="Python", python_version=3.9, env_file="env.yml") + kernel_info = JupyterKernelInfo( + name="Python", python_version=3.9, env_file="env.yml" + ) self.assertEqual(kernel_info.name, "Python") self.assertEqual(kernel_info.python_version, 3.9) self.assertEqual(kernel_info.env_file, "env.yml") def test_record_properties_initialization(self): - kernel_info = JupyterKernelInfo(name="Python", python_version=3.9, env_file="env.yml") + kernel_info = JupyterKernelInfo( + name="Python", python_version=3.9, env_file="env.yml" + ) contacts = [Contact(name="Jane Doe", organization="Org Y")] themes = [Theme(concepts=[ThemeConcept(id="concept1")], scheme="scheme1")] @@ -59,7 +67,7 @@ def test_record_properties_initialization(self): themes=themes, keywords=["sample", "test"], formats=[{"format": "JSON"}], - license="CC-BY" + license="CC-BY", ) self.assertEqual(record_properties.created, "2025-01-01") @@ -74,13 +82,15 @@ def test_record_properties_initialization(self): self.assertEqual(record_properties.license, "CC-BY") def test_ogc_record_initialization(self): - kernel_info = JupyterKernelInfo(name="Python", python_version=3.9, env_file="env.yml") + kernel_info = JupyterKernelInfo( + name="Python", python_version=3.9, env_file="env.yml" + ) properties = RecordProperties( created="2025-01-01", type="dataset", title="Sample Dataset", description="A sample dataset", - jupyter_kernel_info=kernel_info + jupyter_kernel_info=kernel_info, ) ogc_record = OgcRecord( @@ -89,7 +99,7 @@ def 
test_ogc_record_initialization(self): time={"start": "2025-01-01T00:00:00Z", "end": "2025-01-02T00:00:00Z"}, properties=properties, links=[{"href": "http://example.com", "rel": "self"}], - linkTemplates=[{"template": "http://example.com/{id}"}] + linkTemplates=[{"template": "http://example.com/{id}"}], ) self.assertEqual(ogc_record.id, "record1") @@ -97,6 +107,7 @@ def test_ogc_record_initialization(self): self.assertEqual(ogc_record.time["start"], "2025-01-01T00:00:00Z") self.assertEqual(ogc_record.properties.title, "Sample Dataset") self.assertEqual(len(ogc_record.links), 1) - self.assertEqual(ogc_record.linkTemplates[0]["template"], "http://example.com/{id}") + self.assertEqual( + ogc_record.linkTemplates[0]["template"], "http://example.com/{id}" + ) self.assertEqual(ogc_record.conformsTo[0], OGC_API_RECORD_SPEC) - diff --git a/deep_code/tests/utils/test_ogc_record_generator.py b/deep_code/tests/utils/test_ogc_record_generator.py index dc8c816..f4fe372 100644 --- a/deep_code/tests/utils/test_ogc_record_generator.py +++ b/deep_code/tests/utils/test_ogc_record_generator.py @@ -6,11 +6,10 @@ class TestOSCWorkflowOGCApiRecordGenerator(unittest.TestCase): - def test_build_contact_objects(self): contacts_list = [ {"name": "Alice", "organization": "Org A", "position": "Researcher"}, - {"name": "Bob", "organization": "Org B", "position": "Developer"} + {"name": "Bob", "organization": "Org B", "position": "Developer"}, ] result = OSCWorkflowOGCApiRecordGenerator.build_contact_objects(contacts_list) @@ -42,8 +41,8 @@ def test_build_record_properties(self): "jupyter_kernel_info": { "name": "deepesdl-xcube-1.7.1", "python_version": 3.11, - "env_file": "https://git/env.yml" - } + "env_file": "https://git/env.yml", + }, } contacts = [ {"name": "Alice", "organization": "Org A", "position": "Researcher"} @@ -62,4 +61,3 @@ def test_build_record_properties(self): self.assertEqual(record_properties.type, "workflow") self.assertTrue("created" in record_properties.__dict__) 
self.assertTrue("updated" in record_properties.__dict__) - diff --git a/deep_code/tools/publish.py b/deep_code/tools/publish.py index a5f9c87..4c83bb9 100644 --- a/deep_code/tools/publish.py +++ b/deep_code/tools/publish.py @@ -8,14 +8,17 @@ import fsspec import yaml -from utils.ogc_api_record import OgcRecord -from deep_code.constants import (OSC_BRANCH_NAME, OSC_REPO_NAME, - OSC_REPO_OWNER, WF_BRANCH_NAME) +from deep_code.constants import ( + OSC_BRANCH_NAME, + OSC_REPO_NAME, + OSC_REPO_OWNER, + WF_BRANCH_NAME, +) from deep_code.utils.dataset_stac_generator import OSCProductSTACGenerator from deep_code.utils.github_automation import GitHubAutomation -from deep_code.utils.ogc_record_generator import \ - OSCWorkflowOGCApiRecordGenerator +from deep_code.utils.ogc_api_record import OgcRecord +from deep_code.utils.ogc_record_generator import OSCWorkflowOGCApiRecordGenerator logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO) @@ -140,7 +143,6 @@ def publish_workflow(self, workflow_config_path: str): workflow_id = workflow_config.get("workflow_id") properties_list = workflow_config.get("properties", []) - contacts = workflow_config.get("contact", []) rg = OSCWorkflowOGCApiRecordGenerator() wf_record_properties = rg.build_record_properties(properties_list, contacts) @@ -174,4 +176,3 @@ def publish_workflow(self, workflow_config_path: str): finally: self.github_automation.clean_up() - diff --git a/deep_code/utils/ogc_record_generator.py b/deep_code/utils/ogc_record_generator.py index 25e8b7f..481663f 100644 --- a/deep_code/utils/ogc_record_generator.py +++ b/deep_code/utils/ogc_record_generator.py @@ -6,15 +6,19 @@ from datetime import datetime, timezone -from constants import DEFAULT_THEME_SCHEME - -from deep_code.utils.ogc_api_record import (Contact, RecordProperties, Theme, - ThemeConcept) +from deep_code.constants import DEFAULT_THEME_SCHEME +from deep_code.utils.ogc_api_record import ( + Contact, + RecordProperties, + Theme, + ThemeConcept, 
+) class OSCWorkflowOGCApiRecordGenerator: """Generates OGC API record for a workflow """ + @staticmethod def build_contact_objects(contacts_list: list[dict]) -> list[Contact]: """Build a list of Contact objects from a list of contact dictionaries. diff --git a/deep_code/utils/osc_extension.py b/deep_code/utils/osc_extension.py index 04bbb7f..8a777de 100644 --- a/deep_code/utils/osc_extension.py +++ b/deep_code/utils/osc_extension.py @@ -8,8 +8,7 @@ import pystac from pystac import Extent, SpatialExtent, TemporalExtent -from pystac.extensions.base import (ExtensionManagementMixin, - PropertiesExtension) +from pystac.extensions.base import ExtensionManagementMixin, PropertiesExtension from deep_code.constants import CF_SCHEMA_URI, OSC_SCHEMA_URI From 81256e09791723d3b4f47f0cb6f0d8b203232003 Mon Sep 17 00:00:00 2001 From: tejas Date: Mon, 20 Jan 2025 16:21:21 +0100 Subject: [PATCH 07/21] adjusted xrlint imports --- deep_code/utils/ogc_api_record.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/deep_code/utils/ogc_api_record.py b/deep_code/utils/ogc_api_record.py index da783e0..437c2c8 100644 --- a/deep_code/utils/ogc_api_record.py +++ b/deep_code/utils/ogc_api_record.py @@ -1,6 +1,7 @@ from typing import Any, Optional -from xrlint.util.codec import JsonSerializable, MappingConstructible +from xrlint.util.constructible import MappingConstructible +from xrlint.util.serializable import JsonSerializable from deep_code.constants import OGC_API_RECORD_SPEC From c32a09fdf324c80269efa7f3d1ef559dec22990a Mon Sep 17 00:00:00 2001 From: tejas Date: Mon, 20 Jan 2025 16:57:10 +0100 Subject: [PATCH 08/21] added cli command --- deep_code/cli/main.py | 4 +++- deep_code/cli/publish.py | 10 +++++++++- deep_code/tools/publish.py | 9 ++++++--- 3 files changed, 18 insertions(+), 5 deletions(-) diff --git a/deep_code/cli/main.py b/deep_code/cli/main.py index be88985..af140a4 100644 --- a/deep_code/cli/main.py +++ b/deep_code/cli/main.py @@ -6,7 +6,7 @@ import click 
-from deep_code.cli.publish import publish_dataset +from deep_code.cli.publish import publish_dataset, publish_workflow @click.group() @@ -16,5 +16,7 @@ def main(): main.add_command(publish_dataset) +main.add_command(publish_workflow) + if __name__ == "__main__": main() diff --git a/deep_code/cli/publish.py b/deep_code/cli/publish.py index a9da0f3..a3d81d0 100644 --- a/deep_code/cli/publish.py +++ b/deep_code/cli/publish.py @@ -6,7 +6,7 @@ import click -from deep_code.tools.publish import DatasetPublisher +from deep_code.tools.publish import DatasetPublisher, WorkflowPublisher @click.command(name="publish-dataset") @@ -16,3 +16,11 @@ def publish_dataset(dataset_config): """ publisher = DatasetPublisher() publisher.publish_dataset(dataset_config_path=dataset_config) + + +@click.command(name="publish-workflow") +@click.argument("workflow_metadata", type=click.Path(exists=True)) +def publish_workflow(workflow_metadata): + + workflow_publisher = WorkflowPublisher() + workflow_publisher.publish_workflow(workflow_config_path=workflow_metadata) diff --git a/deep_code/tools/publish.py b/deep_code/tools/publish.py index 4c83bb9..89a041e 100644 --- a/deep_code/tools/publish.py +++ b/deep_code/tools/publish.py @@ -133,6 +133,10 @@ def __init__(self): self.github_username, self.github_token, OSC_REPO_OWNER, OSC_REPO_NAME ) + @staticmethod + def _normalize_name(name: str | None) -> str | None: + return name.replace(" ", "-").lower() if name else None + def publish_workflow(self, workflow_config_path: str): with fsspec.open(workflow_config_path, "r") as file: @@ -140,7 +144,7 @@ def publish_workflow(self, workflow_config_path: str): try: logger.info("Generating OGC API Record for the workflow...") - workflow_id = workflow_config.get("workflow_id") + workflow_id = self._normalize_name(workflow_config.get("workflow_id")) properties_list = workflow_config.get("properties", []) contacts = workflow_config.get("contact", []) @@ -160,8 +164,7 @@ def publish_workflow(self, 
workflow_config_path: str): logger.info("Automating GitHub tasks...") self.github_automation.fork_repository() self.github_automation.clone_repository() - # WF_NEW_BRANCH_NAME = WF_BRANCH_NAME + "-" + workflow_id - WF_NEW_BRANCH_NAME = WF_BRANCH_NAME + WF_NEW_BRANCH_NAME = WF_BRANCH_NAME + "-" + workflow_id self.github_automation.create_branch(WF_NEW_BRANCH_NAME) self.github_automation.add_file(file_path, ogc_record.to_dict()) self.github_automation.commit_and_push( From bd72aa83e5222fc07ac6690fa982573052c94fcf Mon Sep 17 00:00:00 2001 From: tejas Date: Tue, 21 Jan 2025 10:12:12 +0100 Subject: [PATCH 09/21] git merge tejas-xxx-build-varibale-reference-osc --- deep_code/tests/tools/test_publish.py | 31 ++- .../utils/test_dataset_stac_generator.py | 40 ++-- deep_code/tools/publish.py | 130 +++++++++-- deep_code/utils/dataset_stac_generator.py | 208 ++++++++++++++---- 4 files changed, 325 insertions(+), 84 deletions(-) diff --git a/deep_code/tests/tools/test_publish.py b/deep_code/tests/tools/test_publish.py index ba306b8..4a13581 100644 --- a/deep_code/tests/tools/test_publish.py +++ b/deep_code/tests/tools/test_publish.py @@ -55,22 +55,21 @@ def test_publish_dataset_success( mock_subprocess_run, mock_chdir, ): - # Mock the YAML reads git_yaml_content = """ - github-username: test-user - github-token: test-token - """ + github-username: test-user + github-token: test-token + """ dataset_yaml_content = """ - dataset-id: test-dataset - collection-id: test-collection - documentation-link: http://example.com/doc - access-link: http://example.com/access - dataset-status: ongoing - dataset-region: Global - dataset-theme: ["climate"] - cf-parameter: [] - """ + dataset_id: test-dataset + collection_id: test-collection + documentation_link: http://example.com/doc + access_link: http://example.com/access + dataset_status: ongoing + dataset_region: Global + osc_theme: ["climate"] + cf_parameter: [] + """ mock_fsspec_open.side_effect = [ mock_open(read_data=git_yaml_content)(), 
mock_open(read_data=dataset_yaml_content)(), @@ -103,8 +102,8 @@ def test_publish_dataset_success( "links": [], "stac_version": "1.0.0", } - with patch("deep_code.tools.publish.OSCProductSTACGenerator") as mock_generator: - mock_generator.return_value.build_stac_collection.return_value = ( + with patch("deep_code.tools.publish.OSCDatasetSTACGenerator") as mock_generator: + mock_generator.return_value.build_dataset_stac_collection.return_value = ( mock_collection ) @@ -112,7 +111,7 @@ def test_publish_dataset_success( publisher = DatasetPublisher() publisher.publish_dataset("/fake/path/to/dataset-config.yaml") - # 6Assert that we called git clone with /tmp/temp_repo + # Assert that we called git clone with /tmp/temp_repo # Because expanduser("~") is now patched to /tmp, the actual path is /tmp/temp_repo auth_url = "https://test-user:test-token@github.com/test-user/open-science-catalog-metadata-testing.git" mock_subprocess_run.assert_any_call( diff --git a/deep_code/tests/utils/test_dataset_stac_generator.py b/deep_code/tests/utils/test_dataset_stac_generator.py index f2ef71d..d4444c1 100644 --- a/deep_code/tests/utils/test_dataset_stac_generator.py +++ b/deep_code/tests/utils/test_dataset_stac_generator.py @@ -7,7 +7,7 @@ from pystac import Collection from xarray import Dataset -from deep_code.utils.dataset_stac_generator import OSCProductSTACGenerator +from deep_code.utils.dataset_stac_generator import OSCDatasetSTACGenerator class TestOSCProductSTACGenerator(unittest.TestCase): @@ -28,15 +28,31 @@ def setUp(self, mock_data_store): }, attrs={"description": "Mock dataset for testing.", "title": "Mock Dataset"}, data_vars={ - "var1": (("time", "lat", "lon"), np.random.rand(2, 5, 10)), - "var2": (("time", "lat", "lon"), np.random.rand(2, 5, 10)), + "var1": ( + ("time", "lat", "lon"), + np.random.rand(2, 5, 10), + { + "description": "dummy", + "standard_name": "var1", + "gcmd_keyword_url": "https://dummy", + }, + ), + "var2": ( + ("time", "lat", "lon"), + 
np.random.rand(2, 5, 10), + { + "description": "dummy", + "standard_name": "var2", + "gcmd_keyword_url": "https://dummy", + }, + ), }, ) mock_store = MagicMock() mock_store.open_data.return_value = self.mock_dataset mock_data_store.return_value = mock_store - self.generator = OSCProductSTACGenerator( + self.generator = OSCDatasetSTACGenerator( dataset_id="mock-dataset-id", collection_id="mock-collection-id", access_link="s3://mock-bucket/mock-dataset", @@ -66,7 +82,7 @@ def test_get_temporal_extent(self): def test_get_variables(self): """Test variable extraction.""" - variables = self.generator._get_variables() + variables = self.generator.get_variable_ids() self.assertEqual(variables, ["var1", "var2"]) def test_get_general_metadata(self): @@ -78,7 +94,7 @@ def test_get_general_metadata(self): @patch("pystac.Collection.set_self_href") def test_build_stac_collection(self, mock_set_self_href, mock_add_link): """Test STAC collection creation.""" - collection = self.generator.build_stac_collection() + collection = self.generator.build_dataset_stac_collection() self.assertIsInstance(collection, Collection) self.assertEqual(collection.id, "mock-collection-id") self.assertEqual(collection.description, "Mock dataset for testing.") @@ -104,8 +120,6 @@ def test_invalid_temporal_extent(self): with self.assertRaises(ValueError): self.generator._get_temporal_extent() - -class TestOpenDataset(unittest.TestCase): @patch("deep_code.utils.dataset_stac_generator.new_data_store") @patch("deep_code.utils.dataset_stac_generator.logging.getLogger") def test_open_dataset_success_public_store(self, mock_logger, mock_new_data_store): @@ -113,10 +127,10 @@ def test_open_dataset_success_public_store(self, mock_logger, mock_new_data_stor # Create a mock store and mock its `open_data` method mock_store = MagicMock() mock_new_data_store.return_value = mock_store - mock_store.open_data.return_value = "mock_dataset" + mock_store.open_data.return_value = self.mock_dataset # Instantiate the 
generator (this will implicitly call _open_dataset) - generator = OSCProductSTACGenerator("mock-dataset-id", "mock-collection-id") + generator = OSCDatasetSTACGenerator("mock-dataset-id", "mock-collection-id") # Validate that the dataset is assigned correctly self.assertEqual(generator.dataset, "mock_dataset") @@ -151,13 +165,13 @@ def test_open_dataset_success_authenticated_store( mock_store, # Second call (authenticated store) returns a mock store ] - mock_store.open_data.return_value = "mock_dataset" + mock_store.open_data.return_value = self.mock_dataset os.environ["S3_USER_STORAGE_BUCKET"] = "mock-bucket" os.environ["S3_USER_STORAGE_KEY"] = "mock-key" os.environ["S3_USER_STORAGE_SECRET"] = "mock-secret" - generator = OSCProductSTACGenerator("mock-dataset-id", "mock-collection-id") + generator = OSCDatasetSTACGenerator("mock-dataset-id", "mock-collection-id") # Validate that the dataset was successfully opened with the authenticated store self.assertEqual(generator.dataset, "mock_dataset") @@ -195,7 +209,7 @@ def test_open_dataset_failure(self, mock_logger, mock_new_data_store): os.environ["S3_USER_STORAGE_SECRET"] = "mock-secret" with self.assertRaises(ValueError) as context: - OSCProductSTACGenerator("mock-dataset-id", "mock-collection-id") + OSCDatasetSTACGenerator("mock-dataset-id", "mock-collection-id") self.assertIn( "Failed to open Zarr dataset with ID mock-dataset-id", diff --git a/deep_code/tools/publish.py b/deep_code/tools/publish.py index 89a041e..cc13353 100644 --- a/deep_code/tools/publish.py +++ b/deep_code/tools/publish.py @@ -5,6 +5,7 @@ # https://opensource.org/licenses/MIT. 
import logging +from pathlib import Path import fsspec import yaml @@ -15,7 +16,7 @@ OSC_REPO_OWNER, WF_BRANCH_NAME, ) -from deep_code.utils.dataset_stac_generator import OSCProductSTACGenerator +from deep_code.utils.dataset_stac_generator import OSCDatasetSTACGenerator from deep_code.utils.github_automation import GitHubAutomation from deep_code.utils.ogc_api_record import OgcRecord from deep_code.utils.ogc_record_generator import OSCWorkflowOGCApiRecordGenerator @@ -24,6 +25,76 @@ logging.basicConfig(level=logging.INFO) +class BasePublisher: + """ + Base class providing common GitHub automation steps: + - Reading credentials from `.gitaccess` + - Setting up GitHubAutomation + - Forking, cloning, creating a branch + - Adding files, committing, pushing + - Creating a pull request + """ + + def __init__(self): + with fsspec.open(".gitaccess", "r") as file: + git_config = yaml.safe_load(file) or {} + + self.github_username = git_config.get("github-username") + self.github_token = git_config.get("github-token") + if not self.github_username or not self.github_token: + raise ValueError("GitHub credentials are missing in `.gitaccess` file.") + + self.github_automation = GitHubAutomation( + self.github_username, self.github_token, OSC_REPO_OWNER, OSC_REPO_NAME + ) + + def publish_file( + self, + branch_name: str, + file_path: str, + file_content: dict, + commit_message: str, + pr_title: str, + pr_body: str, + ) -> str: + """ + Publish a single file to GitHub in a new branch and open a PR. + + Args: + branch_name: Name of the branch to create (e.g. 'osc-branch-collectionid'). + file_path: File path in the repo (e.g. 'products/.../collection.json'). + file_content: The JSON/dict content to commit. + commit_message: Commit message. + pr_title: Title for the pull request. + pr_body: Body of the pull request. + + Returns: + str: The URL of the created pull request. 
+ """ + try: + logger.info("Starting GitHub automation...") + self.github_automation.fork_repository() + self.github_automation.clone_repository() + self.github_automation.create_branch(branch_name) + + # Add the file + self.github_automation.add_file(file_path, file_content) + + # Commit and push + self.github_automation.commit_and_push(branch_name, commit_message) + + # Create pull request + pr_url = self.github_automation.create_pull_request( + branch_name, pr_title, pr_body + ) + logger.info(f"Pull request created at: {pr_url}") + return pr_url + + finally: + # Always clean up local clone + self.github_automation.clean_up() + + class DatasetPublisher: """ Publishes products to the OSC GitHub repository. @@ -58,14 +129,14 @@ def publish_dataset(self, dataset_config_path: str): with fsspec.open(dataset_config_path, "r") as file: dataset_config = yaml.safe_load(file) - dataset_id = dataset_config.get("dataset-id") - collection_id = dataset_config.get("collection-id") - documentation_link = dataset_config.get("documentation-link") - access_link = dataset_config.get("access-link") - dataset_status = dataset_config.get("dataset-status") - osc_region = dataset_config.get("dataset-region") - dataset_theme = dataset_config.get("dataset-theme") - cf_params = dataset_config.get("cf-parameter") + dataset_id = dataset_config.get("dataset_id") + collection_id = dataset_config.get("collection_id") + documentation_link = dataset_config.get("documentation_link") + access_link = dataset_config.get("access_link") + dataset_status = dataset_config.get("dataset_status") + osc_region = dataset_config.get("osc_region") + osc_themes = dataset_config.get("osc_themes") + cf_params = dataset_config.get("cf_parameter") if not dataset_id or not collection_id: raise ValueError( @@ -75,17 +146,20 @@ def publish_dataset(self, dataset_config_path: str): try: logger.info("Generating STAC collection...") - generator = OSCProductSTACGenerator( + generator = OSCDatasetSTACGenerator( 
dataset_id=dataset_id, collection_id=collection_id, documentation_link=documentation_link, access_link=access_link, osc_status=dataset_status, osc_region=osc_region, - osc_themes=dataset_theme, + osc_themes=osc_themes, cf_params=cf_params, ) - collection = generator.build_stac_collection() + # get variables from the datasets + variable_ids = generator.get_variable_ids() + # build STAC collection for the dataset + ds_collection = generator.build_dataset_stac_collection() file_path = f"products/{collection_id}/collection.json" logger.info("Automating GitHub tasks...") @@ -93,13 +167,41 @@ def publish_dataset(self, dataset_config_path: str): self.github_automation.clone_repository() OSC_NEW_BRANCH_NAME = OSC_BRANCH_NAME + "-" + collection_id self.github_automation.create_branch(OSC_NEW_BRANCH_NAME) - self.github_automation.add_file(file_path, collection.to_dict()) + + for var_id in variable_ids: + var_file_path = f"variables/{var_id}/catalog.json" + if not self.github_automation.file_exists(var_file_path): + logger.info( + f"Variable catalog for {var_id} does not exist. Creating..." + ) + var_metadata = generator.variables_metadata.get(var_id) + var_catalog = generator.build_variable_catalog(var_metadata) + self.github_automation.add_file( + var_file_path, var_catalog.to_dict() + ) + else: + logger.info( + f"Variable catalog already exists for {var_id}. so add the " + f"product as child link..." 
+ ) + full_path = ( + Path(self.github_automation.local_clone_dir) / var_file_path + ) + self.github_automation.add_file( + var_file_path, + generator.update_existing_variable_catalog( + full_path, var_id + ).to_dict(), + ) + + self.github_automation.add_file(file_path, ds_collection.to_dict()) + self.github_automation.commit_and_push( OSC_NEW_BRANCH_NAME, f"Add new collection:{collection_id}" ) pr_url = self.github_automation.create_pull_request( OSC_NEW_BRANCH_NAME, - f"Add new collection", + f"Add new dataset collection", "This PR adds a new collection to the repository.", ) diff --git a/deep_code/utils/dataset_stac_generator.py b/deep_code/utils/dataset_stac_generator.py index 1bd0064..5b04c86 100644 --- a/deep_code/utils/dataset_stac_generator.py +++ b/deep_code/utils/dataset_stac_generator.py @@ -9,13 +9,13 @@ from datetime import datetime, timezone import pandas as pd -from pystac import Collection, Extent, Link, SpatialExtent, TemporalExtent +from pystac import Catalog, Collection, Extent, Link, SpatialExtent, TemporalExtent from xcube.core.store import new_data_store from deep_code.utils.osc_extension import OscExtension -class OSCProductSTACGenerator: +class OSCDatasetSTACGenerator: """Generates OSC STAC Collections for a product from Zarr datasets. Args: @@ -53,6 +53,7 @@ def __init__( self.cf_params = cf_params or {} self.logger = logging.getLogger(__name__) self.dataset = self._open_dataset() + self.variables_metadata = self.get_variables_metadata() def _open_dataset(self): """Open the dataset using a S3 store as a xarray Dataset.""" @@ -170,29 +171,6 @@ def _get_temporal_extent(self) -> TemporalExtent: def _normalize_name(name: str | None) -> str | None: return name.replace(" ", "-").lower() if name else None - def _get_variables(self) -> list[str]: - """Extracts variable names or descriptions from the dataset. - - Variables are prioritized based on their `long_name` or `standard_name` - attributes. 
If neither is available, the variable's key from - `dataset.data_vars.keys()` is used. - - Returns: - A list of variable names or descriptions. - """ - variables = [] - for var_name, variable in self.dataset.data_vars.items(): - long_name = self._normalize_name(variable.attrs.get("long_name")) - standard_name = self._normalize_name(variable.attrs.get("standard_name")) - if not long_name and not standard_name: - self.logger.error( - f"Metadata missing for variable '{var_name}': 'long_name' and " - f"'standard_name' attributes are not available." - ) - # Prioritize 'long_name', fallback to 'standard_name', then use variable key - variables.append(long_name or standard_name or var_name) - return variables - def _get_general_metadata(self) -> dict: return { "description": self.dataset.attrs.get( @@ -200,36 +178,173 @@ def _get_general_metadata(self) -> dict: ) } - def _get_variable_metadata(self, var_name, var_data) -> dict: - """Extract metadata from a single variable's attributes. + def extract_metadata_for_variable(self, variable_data) -> dict: + """Extract metadata for a single variable.""" + long_name = variable_data.attrs.get("long_name") + standard_name = variable_data.attrs.get("standard_name") + title = long_name or standard_name or variable_data.name + description = variable_data.attrs.get("description", "No variable description") + gcmd_keyword_url = variable_data.attrs.get("gcmd_keyword_url") + return { + "variable_id": self._normalize_name(title), + "description": description, + "gcmd_keyword_url": gcmd_keyword_url, + } + + def get_variable_ids(self) -> list[str]: + """Get variable IDs for all variables in the dataset.""" + return list(self.variables_metadata.keys()) + + def get_variables_metadata(self) -> dict[str, dict]: + """Extract metadata for all variables in the dataset.""" + variables_metadata = {} + for var_name, variable in self.dataset.data_vars.items(): + var_metadata = self.extract_metadata_for_variable(variable) + 
variables_metadata[var_metadata.get("variable_id")] = var_metadata + return variables_metadata + + def _add_gcmd_link_to_var_catalog( + self, var_catalog: Catalog, var_metadata: dict + ) -> None: + """ + Checks for a GCMD keyword URL in var_metadata, adds a 'via' link to the catalog + pointing to the GCMD Keyword Viewer. Args: - var_name: The raw variable name in the dataset. - var_data: An xarray DataArray containing variable data and attrs. + var_catalog: The PySTAC Catalog to which we want to add the link. + var_metadata: Dictionary containing metadata about the variable, + including 'gcmd_keyword_url'. + """ + gcmd_keyword_url = var_metadata.get("gcmd_keyword_url") + if not gcmd_keyword_url: + self.logger.debug( + f"No gcmd_keyword_url in var_metadata. Skipping adding GCMD link in " + f'the {var_metadata.get("variable_id")} catalog' + ) + return + var_catalog.add_link( + Link( + rel="via", + target=gcmd_keyword_url, + title="Description", + media_type="text/html", + ) + ) + self.logger.info( + f'Added GCMD link for {var_metadata.get("variable_id")} ' + f"catalog {gcmd_keyword_url}." + ) + + def build_variable_catalog(self, var_metadata) -> Catalog: + """Build an OSC STAC Catalog for the variables in the dataset. Returns: - A dict with 'id', 'title', and 'description'. + A pystac.Catalog object. 
""" - long_name = var_data.attrs.get("long_name") - standard_name = var_data.attrs.get("standard_name") - title = long_name or standard_name or var_name + var_id = var_metadata.get("variable_id") + concepts = [{"id": theme} for theme in self.osc_themes] - normalized_title = self._normalize_name(title) + themes = [ + { + "scheme": "https://github.com/stac-extensions/osc#theme", + "concepts": concepts, + } + ] - description = var_data.attrs.get("description", "No variable description") + now_iso = datetime.now(timezone.utc).isoformat() - return {"id": var_name, "title": normalized_title, "description": description} + # Create a PySTAC Catalog object + var_catalog = Catalog( + id=var_id, + description=var_metadata.get("description"), + title=var_id, + stac_extensions=[ + "https://stac-extensions.github.io/themes/v1.0.0/schema.json" + ], + ) - def build_stac_collection(self) -> Collection: - """ - Build an OSC STAC Collection for the dataset. + var_catalog.stac_version = "1.0.0" + var_catalog.extra_fields["updated"] = now_iso + var_catalog.keywords = [] + + # Add the 'themes' block (from your example JSON) + var_catalog.extra_fields["themes"] = themes - :return: A pystac.Collection object. 
+ var_catalog.remove_links("root") + # Add relevant links + var_catalog.add_link( + Link( + rel="root", + target="../../catalog.json", + media_type="application/json", + title="Open Science Catalog", + ) + ) + + # 'child' link: points to the product (or one of its collections) using this variable + var_catalog.add_link( + Link( + rel="child", + target=f"../../products/{self.collection_id}/collection.json", + media_type="application/json", + title=self.collection_id, + ) + ) + + # 'parent' link: back up to the variables overview + var_catalog.add_link( + Link( + rel="parent", + target="../catalog.json", + media_type="application/json", + title="Variables", + ) + ) + # Add gcmd link for the variable definition + self._add_gcmd_link_to_var_catalog(var_catalog, var_metadata) + + self_href = ( + f"https://esa-earthcode.github.io/open-science-catalog-metadata/variables" + f"/{var_id}/catalog.json" + ) + # 'self' link: the direct URL where this JSON is hosted + var_catalog.set_self_href(self_href) + + return var_catalog + + def update_existing_variable_catalog(self, var_file_path, var_id) -> Catalog: + existing_catalog = Catalog.from_file(var_file_path) + now_iso = datetime.now(timezone.utc).isoformat() + existing_catalog.extra_fields["updated"] = now_iso + + # add 'child' link as the product + existing_catalog.add_link( + Link( + rel="child", + target=f"../../products/{self.collection_id}/collection.json", + media_type="application/json", + title=self.collection_id, + ) + ) + self_href = ( + f"https://esa-earthcode.github.io/open-science-catalog-metadata/variables" + f"/{var_id}/catalog.json" + ) + # 'self' link: the direct URL where this JSON is hosted + existing_catalog.set_self_href(self_href) + + return existing_catalog + + def build_dataset_stac_collection(self) -> Collection: + """Build an OSC STAC Collection for the dataset. + + Returns: + A pystac.Collection object. 
"""
        try:
            spatial_extent = self._get_spatial_extent()
            temporal_extent = self._get_temporal_extent()
-            variables = self._get_variables()
+            variables = self.get_variable_ids()
             general_metadata = self._get_general_metadata()
         except ValueError as e:
             raise ValueError(f"Metadata extraction failed: {e}")
@@ -260,6 +375,7 @@ def build_stac_collection(self) -> Collection:
         now_iso = datetime.now(timezone.utc).isoformat()
         collection.extra_fields["created"] = now_iso
         collection.extra_fields["updated"] = now_iso
+        collection.title = self.collection_id
 
         # Remove any existing root link and re-add it properly
         collection.remove_links("root")
@@ -284,6 +400,16 @@ def build_stac_collection(self) -> Collection:
                 title="Products",
             )
         )
+        # Add variables ref
+        for var in variables:
+            collection.add_link(
+                Link(
+                    rel="related",
+                    target=f"../../variables/{var}/catalog.json",
+                    media_type="application/json",
+                    title="Variable: " + var,
+                )
+            )
 
         self_href = (
             "https://esa-earthcode.github.io/"

From c96cd1d58a6f50b650001313c25204098918d9cd Mon Sep 17 00:00:00 2001
From: tejas
Date: Tue, 21 Jan 2025 10:14:52 +0100
Subject: [PATCH 10/21] update README.md

---
 README.md | 27 ++++++++++++++++-----------
 1 file changed, 16 insertions(+), 11 deletions(-)

diff --git a/README.md b/README.md
index 1d4b381..74ffd87 100644
--- a/README.md
+++ b/README.md
@@ -9,13 +9,16 @@
 and Python API providing utilities that aid integration of DeepESDL
 datasets, experiments with EarthCODE.
 
+The first release will focus on implementing the publish feature of DeepESDL
+experiments/workflows as OGC API Records and datasets as OSC STAC collections.
+
 ## Setup
 
 ## Install
 
 `deep-code` will be available in PyPI and conda-forge. Till the stable release,
 developers/contributors can follow the below steps to install deep-code.
-## Installing from the repository for Developer +## Installing from the repository for Developers/Contributors To install deep-code directly from the git repository, clone the repository, and execute the steps below: @@ -72,16 +75,18 @@ github-token: personal access token #### dataset-config.yaml example ``` -dataset-id: hydrology-1D-0.009deg-100x60x60-3.0.2.zarr -collection-id: hydrology - -#non-mandatory -documentation-link: https://deepesdl.readthedocs.io/en/latest/datasets/hydrology-1D-0-009deg-100x60x60-3-0-2-zarr/ -access-link: s3://test -dataset-status: completed -dataset-region: global -dataset-theme: ["ocean", "environment"] -cf-parameter: [{"Name" : "hydrology"}] +dataset_id: hydrology-1D-0.009deg-100x60x60-3.0.2.zarr +collection_id: hydrology +osc_themes: + - Land + - Oceans +# non-mandatory +documentation_link: https://deepesdl.readthedocs.io/en/latest/datasets/hydrology-1D-0.009deg-100x60x60-3.0.2.zarr/ +access_link: s3://test +dataset_status: completed +osc_region: global +cf_parameter: + - name: hydrology ``` dataset-id has to be a valid dataset-id from `deep-esdl-public` s3 or your team bucket. 
\ No newline at end of file From 2491433ed1e0cfd2b114270fe1fe4d746e8cc47f Mon Sep 17 00:00:00 2001 From: tejas Date: Tue, 21 Jan 2025 10:17:28 +0100 Subject: [PATCH 11/21] updated github utility class --- deep_code/utils/github_automation.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/deep_code/utils/github_automation.py b/deep_code/utils/github_automation.py index 4ad6e38..4f3425e 100644 --- a/deep_code/utils/github_automation.py +++ b/deep_code/utils/github_automation.py @@ -114,3 +114,9 @@ def clean_up(self): subprocess.run(["rm", "-rf", self.local_clone_dir]) except subprocess.CalledProcessError as e: raise RuntimeError(f"Failed to clean-up local repository: {e}") + + def file_exists(self, file_path) -> bool: + full_path = Path(self.local_clone_dir) / file_path + exists = os.path.isfile(full_path) + logging.debug(f"Checking existence of {full_path}: {exists}") + return exists \ No newline at end of file From ba7d25b047badda2341cdb5cc829223556dbd0fd Mon Sep 17 00:00:00 2001 From: tejas Date: Tue, 21 Jan 2025 10:45:04 +0100 Subject: [PATCH 12/21] introduced BasePublisher class to keep code DRY --- deep_code/tests/tools/test_publish.py | 6 +- deep_code/tools/publish.py | 302 +++++++++++--------------- 2 files changed, 123 insertions(+), 185 deletions(-) diff --git a/deep_code/tests/tools/test_publish.py b/deep_code/tests/tools/test_publish.py index 4a13581..02c6bde 100644 --- a/deep_code/tests/tools/test_publish.py +++ b/deep_code/tests/tools/test_publish.py @@ -13,7 +13,7 @@ def test_init_missing_credentials(self, mock_fsspec_open): )() with pytest.raises( - ValueError, match="GitHub credentials are missing in the `.gitaccess` file." + ValueError, match="GitHub credentials are missing in `.gitaccess` file." 
): DatasetPublisher() @@ -34,9 +34,7 @@ def test_publish_dataset_missing_ids(self, mock_fsspec_open): publisher = DatasetPublisher() with pytest.raises( - ValueError, - match="Dataset ID or Collection ID is missing in the " - "dataset-config.yaml file.", + ValueError, match="Dataset ID or Collection ID missing in config." ): publisher.publish_dataset("/path/to/dataset-config.yaml") diff --git a/deep_code/tools/publish.py b/deep_code/tools/publish.py index cc13353..e8bec94 100644 --- a/deep_code/tools/publish.py +++ b/deep_code/tools/publish.py @@ -27,18 +27,14 @@ class BasePublisher: """ - Base class providing common GitHub automation steps: - - Reading credentials from `.gitaccess` - - Setting up GitHubAutomation - - Forking, cloning, creating a branch - - Adding files, committing, pushing - - Creating a pull request + Base class providing: + - Reading .gitaccess for credentials + - Common GitHub automation steps (fork, clone, branch, file commit, pull request) """ def __init__(self): with fsspec.open(".gitaccess", "r") as file: git_config = yaml.safe_load(file) or {} - self.github_username = git_config.get("github-username") self.github_token = git_config.get("github-token") if not self.github_username or not self.github_token: @@ -48,37 +44,36 @@ def __init__(self): self.github_username, self.github_token, OSC_REPO_OWNER, OSC_REPO_NAME ) - def publish_file( + def publish_files( self, branch_name: str, - file_path: str, - file_content: dict, + file_dict: dict[str, dict], commit_message: str, pr_title: str, pr_body: str, ) -> str: - """ - Publish a single file to GitHub in a new branch and open a PR. + """Publish multiple files to a new branch and open a PR. Args: - branch_name: Name of the branch to create (e.g. 'osc-branch-collectionid'). - file_path: File path in the repo (e.g. 'products/.../collection.json'). - file_content: The JSON/dict content to commit. - commit_message: Commit message. - pr_title: Title for the pull request. 
- pr_body: Body of the pull request. + branch_name: Branch name to create (e.g. "osc-branch-collectionid"). + file_dict: { file_path: file_content_dict } for each file to commit. + commit_message: Commit message for all changes. + pr_title: Title of the pull request. + pr_body: Description/body of the pull request. Returns: - str: The URL of the created pull request. + URL of the created pull request. """ try: - logger.info("Starting GitHub automation...") + logger.info("Forking and cloning repository...") self.github_automation.fork_repository() self.github_automation.clone_repository() self.github_automation.create_branch(branch_name) - # Add the file - self.github_automation.add_file(file_path, file_content) + # Add each file to the branch + for file_path, content in file_dict.items(): + logger.info(f"Adding {file_path} to {branch_name}") + self.github_automation.add_file(file_path, content) # Commit and push self.github_automation.commit_and_push(branch_name, commit_message) @@ -87,47 +82,23 @@ def publish_file( pr_url = self.github_automation.create_pull_request( branch_name, pr_title, pr_body ) - logger.info(f"Pull request created at: {pr_url}") + logger.info(f"Pull request created: {pr_url}") return pr_url finally: - # Always clean up local clone + # Cleanup local clone self.github_automation.clean_up() -class DatasetPublisher: +class DatasetPublisher(BasePublisher): + """Publishes products (datasets) to the OSC GitHub repository. + Inherits from BasePublisher for GitHub publishing logic. """ - Publishes products to the OSC GitHub repository. - - Credentials must be provided via a hidden file named `.gitaccess`, located in - the root of the repository. 
This file is expected to contain YAML of the form: - - github-username: "YOUR_GITHUB_USERNAME" - github-token: "YOUR_GITHUB_PERSONAL_ACCESS_TOKEN" - """ - - def __init__(self): - with fsspec.open(".gitaccess", "r") as file: - git_config = yaml.safe_load(file) or {} - - self.github_username = git_config.get("github-username") - self.github_token = git_config.get("github-token") - - if not self.github_username or not self.github_token: - raise ValueError("GitHub credentials are missing in the `.gitaccess` file.") - - self.github_automation = GitHubAutomation( - self.github_username, self.github_token, OSC_REPO_OWNER, OSC_REPO_NAME - ) def publish_dataset(self, dataset_config_path: str): - """Publish a product collection to the specified GitHub repository. - - Args: - dataset_config_path: Path to the YAML file containing dataset config - """ + """Publish a product collection to the specified GitHub repository.""" with fsspec.open(dataset_config_path, "r") as file: - dataset_config = yaml.safe_load(file) + dataset_config = yaml.safe_load(file) or {} dataset_id = dataset_config.get("dataset_id") collection_id = dataset_config.get("collection_id") @@ -139,145 +110,114 @@ def publish_dataset(self, dataset_config_path: str): cf_params = dataset_config.get("cf_parameter") if not dataset_id or not collection_id: - raise ValueError( - "Dataset ID or Collection ID is missing in the dataset-config.yaml " - "file." 
- ) - - try: - logger.info("Generating STAC collection...") - generator = OSCDatasetSTACGenerator( - dataset_id=dataset_id, - collection_id=collection_id, - documentation_link=documentation_link, - access_link=access_link, - osc_status=dataset_status, - osc_region=osc_region, - osc_themes=osc_themes, - cf_params=cf_params, - ) - # get variables from the datasets - variable_ids = generator.get_variable_ids() - # build STAC collection for the dataset - ds_collection = generator.build_dataset_stac_collection() - - file_path = f"products/{collection_id}/collection.json" - logger.info("Automating GitHub tasks...") - self.github_automation.fork_repository() - self.github_automation.clone_repository() - OSC_NEW_BRANCH_NAME = OSC_BRANCH_NAME + "-" + collection_id - self.github_automation.create_branch(OSC_NEW_BRANCH_NAME) - - for var_id in variable_ids: - var_file_path = f"variables/{var_id}/catalog.json" - if not self.github_automation.file_exists(var_file_path): - logger.info( - f"Variable catalog for {var_id} does not exist. Creating..." - ) - var_metadata = generator.variables_metadata.get(var_id) - var_catalog = generator.build_variable_catalog(var_metadata) - self.github_automation.add_file( - var_file_path, var_catalog.to_dict() - ) - else: - logger.info( - f"Variable catalog already exists for {var_id}. so add the " - f"product as child link..." 
- ) - full_path = ( - Path(self.github_automation.local_clone_dir) / var_file_path - ) - self.github_automation.add_file( - var_file_path, - generator.update_existing_variable_catalog( - full_path, var_id - ).to_dict(), - ) - - self.github_automation.add_file(file_path, ds_collection.to_dict()) - - self.github_automation.commit_and_push( - OSC_NEW_BRANCH_NAME, f"Add new collection:{collection_id}" - ) - pr_url = self.github_automation.create_pull_request( - OSC_NEW_BRANCH_NAME, - f"Add new dataset collection", - "This PR adds a new collection to the repository.", - ) - - logger.info(f"Pull request created: {pr_url}") - - finally: - self.github_automation.clean_up() - - -class WorkflowPublisher: - """Publishes workflow to the OSC GitHub repository. - - Credentials must be provided via a hidden file named `.gitaccess`, located in - the root of the repository. This file is expected to contain YAML of the form: - - github-username: "YOUR_GITHUB_USERNAME" - github-token: "YOUR_GITHUB_PERSONAL_ACCESS_TOKEN" - """ + raise ValueError("Dataset ID or Collection ID missing in config.") + + logger.info("Generating STAC collection...") + + generator = OSCDatasetSTACGenerator( + dataset_id=dataset_id, + collection_id=collection_id, + documentation_link=documentation_link, + access_link=access_link, + osc_status=dataset_status, + osc_region=osc_region, + osc_themes=osc_themes, + cf_params=cf_params, + ) - def __init__(self): - with fsspec.open(".gitaccess", "r") as file: - git_config = yaml.safe_load(file) or {} + variable_ids = generator.get_variable_ids() + ds_collection = generator.build_dataset_stac_collection() + + # Prepare a dictionary of file paths and content + file_dict = {} + product_path = f"products/{collection_id}/collection.json" + file_dict[product_path] = ds_collection.to_dict() + + # Add or update variable files + for var_id in variable_ids: + var_file_path = f"variables/{var_id}/catalog.json" + if not self.github_automation.file_exists(var_file_path): + 
logger.info( + f"Variable catalog for {var_id} does not exist. Creating..." + ) + var_metadata = generator.variables_metadata.get(var_id) + var_catalog = generator.build_variable_catalog(var_metadata) + file_dict[var_file_path] = var_catalog.to_dict() + else: + logger.info( + f"Variable catalog already exists for {var_id}, adding product link." + ) + full_path = Path(self.github_automation.local_clone_dir) / var_file_path + updated_catalog = generator.update_existing_variable_catalog( + full_path, var_id + ) + file_dict[var_file_path] = updated_catalog.to_dict() + + # Create branch name, commit message, PR info + branch_name = f"{OSC_BRANCH_NAME}-{collection_id}" + commit_message = f"Add new dataset collection: {collection_id}" + pr_title = "Add new dataset collection" + pr_body = "This PR adds a new dataset collection to the repository." + + # Publish all files in one go + pr_url = self.publish_files( + branch_name=branch_name, + file_dict=file_dict, + commit_message=commit_message, + pr_title=pr_title, + pr_body=pr_body, + ) - self.github_username = git_config.get("github-username") - self.github_token = git_config.get("github-token") + logger.info(f"Pull request created: {pr_url}") - if not self.github_username or not self.github_token: - raise ValueError("GitHub credentials are missing in the `.gitaccess` file.") - self.github_automation = GitHubAutomation( - self.github_username, self.github_token, OSC_REPO_OWNER, OSC_REPO_NAME - ) +class WorkflowPublisher(BasePublisher): + """Publishes workflows to the OSC GitHub repository.""" @staticmethod def _normalize_name(name: str | None) -> str | None: return name.replace(" ", "-").lower() if name else None def publish_workflow(self, workflow_config_path: str): - with fsspec.open(workflow_config_path, "r") as file: - workflow_config = yaml.safe_load(file) + workflow_config = yaml.safe_load(file) or {} + + workflow_id = self._normalize_name(workflow_config.get("workflow_id")) + if not workflow_id: + raise 
ValueError("workflow_id is missing in workflow config.") + + properties_list = workflow_config.get("properties", []) + contacts = workflow_config.get("contact", []) + links = workflow_config.get("links", []) + + logger.info("Generating OGC API Record for the workflow...") + rg = OSCWorkflowOGCApiRecordGenerator() + wf_record_properties = rg.build_record_properties(properties_list, contacts) + + ogc_record = OgcRecord( + id=workflow_id, + type="Feature", + time={}, + properties=wf_record_properties, + links=links, + ) - try: - logger.info("Generating OGC API Record for the workflow...") - workflow_id = self._normalize_name(workflow_config.get("workflow_id")) - properties_list = workflow_config.get("properties", []) - - contacts = workflow_config.get("contact", []) - rg = OSCWorkflowOGCApiRecordGenerator() - wf_record_properties = rg.build_record_properties(properties_list, contacts) - - links = workflow_config.get("links") - ogc_record = OgcRecord( - id=workflow_id, - type="Feature", - time={}, - properties=wf_record_properties, - links=links, - ) + file_path = f"workflow/{workflow_id}/collection.json" - file_path = f"workflow/{workflow_id}/collection.json" - logger.info("Automating GitHub tasks...") - self.github_automation.fork_repository() - self.github_automation.clone_repository() - WF_NEW_BRANCH_NAME = WF_BRANCH_NAME + "-" + workflow_id - self.github_automation.create_branch(WF_NEW_BRANCH_NAME) - self.github_automation.add_file(file_path, ogc_record.to_dict()) - self.github_automation.commit_and_push( - WF_NEW_BRANCH_NAME, f"Add new workflow:{workflow_id}" - ) - pr_url = self.github_automation.create_pull_request( - WF_NEW_BRANCH_NAME, - f"Add new collection", - "This PR adds a new workflow to the OSC repository.", - ) - logger.info(f"Pull request created: {pr_url}") + # Prepare the single file dict + file_dict = {file_path: ogc_record.to_dict()} - finally: - self.github_automation.clean_up() + branch_name = f"{WF_BRANCH_NAME}-{workflow_id}" + commit_message = 
f"Add new workflow: {workflow_id}" + pr_title = "Add new workflow" + pr_body = "This PR adds a new workflow to the OSC repository." + + pr_url = self.publish_files( + branch_name=branch_name, + file_dict=file_dict, + commit_message=commit_message, + pr_title=pr_title, + pr_body=pr_body, + ) + + logger.info(f"Pull request created: {pr_url}") From b2e14c83cb8ef05567377035e68244c2949f1589 Mon Sep 17 00:00:00 2001 From: tejas Date: Tue, 21 Jan 2025 10:45:14 +0100 Subject: [PATCH 13/21] refactor --- deep_code/utils/github_automation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deep_code/utils/github_automation.py b/deep_code/utils/github_automation.py index 4f3425e..dbecfe4 100644 --- a/deep_code/utils/github_automation.py +++ b/deep_code/utils/github_automation.py @@ -119,4 +119,4 @@ def file_exists(self, file_path) -> bool: full_path = Path(self.local_clone_dir) / file_path exists = os.path.isfile(full_path) logging.debug(f"Checking existence of {full_path}: {exists}") - return exists \ No newline at end of file + return exists From 30e320a62df2de4a1e07fa1ef0cb1ae58fece6df Mon Sep 17 00:00:00 2001 From: tejas Date: Tue, 21 Jan 2025 10:51:57 +0100 Subject: [PATCH 14/21] updated README.md --- README.md | 45 ++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 44 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 74ffd87..364b703 100644 --- a/README.md +++ b/README.md @@ -89,4 +89,47 @@ cf_parameter: - name: hydrology ``` -dataset-id has to be a valid dataset-id from `deep-esdl-public` s3 or your team bucket. \ No newline at end of file +dataset-id has to be a valid dataset-id from `deep-esdl-public` s3 or your team bucket. + +### deep-code publish-workflow + +Publish a workflow/experiment to the EarthCODE open-science catalog. 
+ +```commandline +deep-code publish-workflow /path/to/workflow-config.yaml + ``` +#### workflow-config.yaml example + +``` +workflow_id: "4D Med hydrology cube generation" +properties: + title: "Hydrology cube generation recipe" + description: "4D Med cube generation" + keywords: + - Earth Science + themes: + - Atmosphere + - Ocean + - Evaporation + license: proprietary + jupyter_kernel_info: + name: deepesdl-xcube-1.7.1 + python_version: 3.11 + env_file: https://git/env.yml +links: + - rel: "documentation" + type: "application/json" + title: "4DMed Hydrology Cube Generation Recipe" + href: "https://github.com/deepesdl/cube-gen/tree/main/hydrology/README.md" + - rel: "jupyter-notebook" + type: "application/json" + title: "Workflow Jupyter Notebook" + href: "https://github.com/deepesdl/cube-gen/blob/main/hydrology/notebooks/reading_hydrology.ipynb" +contact: + - name: Tejas Morbagal Harish + organization: Brockmann Consult GmbH + links: + - rel: "about" + type: "text/html" + href: "https://www.brockmann-consult.de/" +``` From 1c148d38f806ee06b4b2929f9eee182f21f17f2d Mon Sep 17 00:00:00 2001 From: tejas Date: Tue, 21 Jan 2025 11:02:14 +0100 Subject: [PATCH 15/21] refactor --- deep_code/tests/tools/test_publish.py | 4 ++-- deep_code/tools/publish.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/deep_code/tests/tools/test_publish.py b/deep_code/tests/tools/test_publish.py index 02c6bde..00ac637 100644 --- a/deep_code/tests/tools/test_publish.py +++ b/deep_code/tests/tools/test_publish.py @@ -13,7 +13,7 @@ def test_init_missing_credentials(self, mock_fsspec_open): )() with pytest.raises( - ValueError, match="GitHub credentials are missing in `.gitaccess` file." + ValueError, match="GitHub credentials are missing in the `.gitaccess` file." 
): DatasetPublisher() @@ -34,7 +34,7 @@ def test_publish_dataset_missing_ids(self, mock_fsspec_open): publisher = DatasetPublisher() with pytest.raises( - ValueError, match="Dataset ID or Collection ID missing in config." + ValueError, match="Dataset ID or Collection ID missing in the config." ): publisher.publish_dataset("/path/to/dataset-config.yaml") diff --git a/deep_code/tools/publish.py b/deep_code/tools/publish.py index e8bec94..e91dd90 100644 --- a/deep_code/tools/publish.py +++ b/deep_code/tools/publish.py @@ -38,7 +38,7 @@ def __init__(self): self.github_username = git_config.get("github-username") self.github_token = git_config.get("github-token") if not self.github_username or not self.github_token: - raise ValueError("GitHub credentials are missing in `.gitaccess` file.") + raise ValueError("GitHub credentials are missing in the `.gitaccess` file.") self.github_automation = GitHubAutomation( self.github_username, self.github_token, OSC_REPO_OWNER, OSC_REPO_NAME @@ -110,7 +110,7 @@ def publish_dataset(self, dataset_config_path: str): cf_params = dataset_config.get("cf_parameter") if not dataset_id or not collection_id: - raise ValueError("Dataset ID or Collection ID missing in config.") + raise ValueError("Dataset ID or Collection ID missing in the config.") logger.info("Generating STAC collection...") From e21c64f5e4e89afceaaa18313aaa2fea97acee43 Mon Sep 17 00:00:00 2001 From: Tejas Morbagal Harish Date: Mon, 27 Jan 2025 11:51:06 +0100 Subject: [PATCH 16/21] Update deep_code/utils/dataset_stac_generator.py Co-authored-by: Norman Fomferra --- deep_code/utils/dataset_stac_generator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deep_code/utils/dataset_stac_generator.py b/deep_code/utils/dataset_stac_generator.py index 5b04c86..e3c4eb4 100644 --- a/deep_code/utils/dataset_stac_generator.py +++ b/deep_code/utils/dataset_stac_generator.py @@ -15,7 +15,7 @@ from deep_code.utils.osc_extension import OscExtension -class 
OSCDatasetSTACGenerator: +class OscDatasetStacGenerator: """Generates OSC STAC Collections for a product from Zarr datasets. Args: From 560bcde22cda4ceb9c006d5e560b2881f8cdb592 Mon Sep 17 00:00:00 2001 From: Tejas Morbagal Harish Date: Mon, 27 Jan 2025 11:51:19 +0100 Subject: [PATCH 17/21] Update deep_code/tests/utils/test_dataset_stac_generator.py Co-authored-by: Norman Fomferra --- deep_code/tests/utils/test_dataset_stac_generator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deep_code/tests/utils/test_dataset_stac_generator.py b/deep_code/tests/utils/test_dataset_stac_generator.py index d4444c1..55b0d7e 100644 --- a/deep_code/tests/utils/test_dataset_stac_generator.py +++ b/deep_code/tests/utils/test_dataset_stac_generator.py @@ -7,7 +7,7 @@ from pystac import Collection from xarray import Dataset -from deep_code.utils.dataset_stac_generator import OSCDatasetSTACGenerator +from deep_code.utils.dataset_stac_generator import OscDatasetStacGenerator class TestOSCProductSTACGenerator(unittest.TestCase): From 06dcf3bc351ef6cf68e68c15b38b9f4c7c4fef03 Mon Sep 17 00:00:00 2001 From: tejas Date: Mon, 27 Jan 2025 11:57:40 +0100 Subject: [PATCH 18/21] refactor classname using camelcase --- deep_code/tests/tools/test_publish.py | 2 +- deep_code/tests/utils/test_dataset_stac_generator.py | 8 ++++---- deep_code/tools/publish.py | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/deep_code/tests/tools/test_publish.py b/deep_code/tests/tools/test_publish.py index 00ac637..3e0f5e8 100644 --- a/deep_code/tests/tools/test_publish.py +++ b/deep_code/tests/tools/test_publish.py @@ -100,7 +100,7 @@ def test_publish_dataset_success( "links": [], "stac_version": "1.0.0", } - with patch("deep_code.tools.publish.OSCDatasetSTACGenerator") as mock_generator: + with patch("deep_code.tools.publish.OscDatasetStacGenerator") as mock_generator: mock_generator.return_value.build_dataset_stac_collection.return_value = ( mock_collection ) diff --git 
a/deep_code/tests/utils/test_dataset_stac_generator.py b/deep_code/tests/utils/test_dataset_stac_generator.py index 55b0d7e..64285a7 100644 --- a/deep_code/tests/utils/test_dataset_stac_generator.py +++ b/deep_code/tests/utils/test_dataset_stac_generator.py @@ -52,7 +52,7 @@ def setUp(self, mock_data_store): mock_store.open_data.return_value = self.mock_dataset mock_data_store.return_value = mock_store - self.generator = OSCDatasetSTACGenerator( + self.generator = OscDatasetStacGenerator( dataset_id="mock-dataset-id", collection_id="mock-collection-id", access_link="s3://mock-bucket/mock-dataset", @@ -130,7 +130,7 @@ def test_open_dataset_success_public_store(self, mock_logger, mock_new_data_stor mock_store.open_data.return_value = self.mock_dataset # Instantiate the generator (this will implicitly call _open_dataset) - generator = OSCDatasetSTACGenerator("mock-dataset-id", "mock-collection-id") + generator = OscDatasetStacGenerator("mock-dataset-id", "mock-collection-id") # Validate that the dataset is assigned correctly self.assertEqual(generator.dataset, "mock_dataset") @@ -171,7 +171,7 @@ def test_open_dataset_success_authenticated_store( os.environ["S3_USER_STORAGE_KEY"] = "mock-key" os.environ["S3_USER_STORAGE_SECRET"] = "mock-secret" - generator = OSCDatasetSTACGenerator("mock-dataset-id", "mock-collection-id") + generator = OscDatasetStacGenerator("mock-dataset-id", "mock-collection-id") # Validate that the dataset was successfully opened with the authenticated store self.assertEqual(generator.dataset, "mock_dataset") @@ -209,7 +209,7 @@ def test_open_dataset_failure(self, mock_logger, mock_new_data_store): os.environ["S3_USER_STORAGE_SECRET"] = "mock-secret" with self.assertRaises(ValueError) as context: - OSCDatasetSTACGenerator("mock-dataset-id", "mock-collection-id") + OscDatasetStacGenerator("mock-dataset-id", "mock-collection-id") self.assertIn( "Failed to open Zarr dataset with ID mock-dataset-id", diff --git a/deep_code/tools/publish.py 
b/deep_code/tools/publish.py index e91dd90..8e5f8c9 100644 --- a/deep_code/tools/publish.py +++ b/deep_code/tools/publish.py @@ -16,7 +16,7 @@ OSC_REPO_OWNER, WF_BRANCH_NAME, ) -from deep_code.utils.dataset_stac_generator import OSCDatasetSTACGenerator +from deep_code.utils.dataset_stac_generator import OscDatasetStacGenerator from deep_code.utils.github_automation import GitHubAutomation from deep_code.utils.ogc_api_record import OgcRecord from deep_code.utils.ogc_record_generator import OSCWorkflowOGCApiRecordGenerator @@ -114,7 +114,7 @@ def publish_dataset(self, dataset_config_path: str): logger.info("Generating STAC collection...") - generator = OSCDatasetSTACGenerator( + generator = OscDatasetStacGenerator( dataset_id=dataset_id, collection_id=collection_id, documentation_link=documentation_link, From 949a669844b5820537fb5de8e95d2afb0c7bfb7e Mon Sep 17 00:00:00 2001 From: tejas Date: Mon, 27 Jan 2025 12:03:07 +0100 Subject: [PATCH 19/21] refactor naming of variable attrs --- deep_code/utils/dataset_stac_generator.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/deep_code/utils/dataset_stac_generator.py b/deep_code/utils/dataset_stac_generator.py index e3c4eb4..3d4da00 100644 --- a/deep_code/utils/dataset_stac_generator.py +++ b/deep_code/utils/dataset_stac_generator.py @@ -182,11 +182,11 @@ def extract_metadata_for_variable(self, variable_data) -> dict: """Extract metadata for a single variable.""" long_name = variable_data.attrs.get("long_name") standard_name = variable_data.attrs.get("standard_name") - title = long_name or standard_name or variable_data.name - description = variable_data.attrs.get("description", "No variable description") + variable_id = standard_name or variable_data.name + description = variable_data.attrs.get("description", long_name) gcmd_keyword_url = variable_data.attrs.get("gcmd_keyword_url") return { - "variable_id": self._normalize_name(title), + "variable_id": self._normalize_name(variable_id), 
"description": description, "gcmd_keyword_url": gcmd_keyword_url, } From 901d7f81af491a9413d28be4701e480faab57683 Mon Sep 17 00:00:00 2001 From: tejas Date: Mon, 27 Jan 2025 12:29:15 +0100 Subject: [PATCH 20/21] use composition over inheritance --- deep_code/tools/publish.py | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/deep_code/tools/publish.py b/deep_code/tools/publish.py index 8e5f8c9..ba762e1 100644 --- a/deep_code/tools/publish.py +++ b/deep_code/tools/publish.py @@ -25,7 +25,7 @@ logging.basicConfig(level=logging.INFO) -class BasePublisher: +class GitHubPublisher: """ Base class providing: - Reading .gitaccess for credentials @@ -90,11 +90,15 @@ def publish_files( self.github_automation.clean_up() -class DatasetPublisher(BasePublisher): +class DatasetPublisher: """Publishes products (datasets) to the OSC GitHub repository. Inherits from BasePublisher for GitHub publishing logic. """ + def __init__(self): + # Composition + self.gh_publisher = GitHubPublisher() + def publish_dataset(self, dataset_config_path: str): """Publish a product collection to the specified GitHub repository.""" with fsspec.open(dataset_config_path, "r") as file: @@ -136,7 +140,7 @@ def publish_dataset(self, dataset_config_path: str): # Add or update variable files for var_id in variable_ids: var_file_path = f"variables/{var_id}/catalog.json" - if not self.github_automation.file_exists(var_file_path): + if not self.gh_publisher.github_automation.file_exists(var_file_path): logger.info( f"Variable catalog for {var_id} does not exist. Creating..." ) @@ -147,7 +151,10 @@ def publish_dataset(self, dataset_config_path: str): logger.info( f"Variable catalog already exists for {var_id}, adding product link." 
) - full_path = Path(self.github_automation.local_clone_dir) / var_file_path + full_path = ( + Path(self.gh_publisher.github_automation.local_clone_dir) + / var_file_path + ) updated_catalog = generator.update_existing_variable_catalog( full_path, var_id ) @@ -160,7 +167,7 @@ def publish_dataset(self, dataset_config_path: str): pr_body = "This PR adds a new dataset collection to the repository." # Publish all files in one go - pr_url = self.publish_files( + pr_url = self.gh_publisher.publish_files( branch_name=branch_name, file_dict=file_dict, commit_message=commit_message, @@ -171,9 +178,12 @@ def publish_dataset(self, dataset_config_path: str): logger.info(f"Pull request created: {pr_url}") -class WorkflowPublisher(BasePublisher): +class WorkflowPublisher: """Publishes workflows to the OSC GitHub repository.""" + def __init__(self): + self.gh_publisher = GitHubPublisher() + @staticmethod def _normalize_name(name: str | None) -> str | None: return name.replace(" ", "-").lower() if name else None @@ -212,7 +222,7 @@ def publish_workflow(self, workflow_config_path: str): pr_title = "Add new workflow" pr_body = "This PR adds a new workflow to the OSC repository." 
- pr_url = self.publish_files( + pr_url = self.gh_publisher.publish_files( branch_name=branch_name, file_dict=file_dict, commit_message=commit_message, From 5503cd85c7cf13cd3e6ff6833576b95a73bbe569 Mon Sep 17 00:00:00 2001 From: tejas Date: Mon, 27 Jan 2025 12:37:59 +0100 Subject: [PATCH 21/21] refactor WF_BRANCH_NAME to add-new-workflow-from-deepesdl --- deep_code/constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deep_code/constants.py b/deep_code/constants.py index 8a17bbc..992ddf4 100644 --- a/deep_code/constants.py +++ b/deep_code/constants.py @@ -13,4 +13,4 @@ "https://gcmd.earthdata.nasa.gov/kms/concepts/concept_scheme/sciencekeywords" ) OGC_API_RECORD_SPEC = "http://www.opengis.net/spec/ogcapi-records-1/1.0/req/record-core" -WF_BRANCH_NAME = "add-new-workflow" +WF_BRANCH_NAME = "add-new-workflow-from-deepesdl"