diff --git a/README.md b/README.md
index 1d4b381..364b703 100644
--- a/README.md
+++ b/README.md
@@ -9,13 +9,16 @@
 and Python API providing utilities that aid integration of DeepESDL datasets,
 experiments with EarthCODE.
 
+The first release will focus on publishing DeepESDL experiments/workflows as
+OGC API Records and datasets as OSC STAC collections.
+
 ## Setup
 
 ## Install
 
 `deep-code` will be available in PyPI and conda-forge. Till the stable release,
 developers/contributors can follow the below steps to install deep-code.
 
-## Installing from the repository for Developer
+## Installing from the repository for Developers/Contributors
 
 To install deep-code directly from the git repository, clone the repository, and execute the steps below:
 
@@ -72,16 +75,61 @@
 github-token: personal access token
 
 #### dataset-config.yaml example
 
 ```
-dataset-id: hydrology-1D-0.009deg-100x60x60-3.0.2.zarr
-collection-id: hydrology
-
-#non-mandatory
-documentation-link: https://deepesdl.readthedocs.io/en/latest/datasets/hydrology-1D-0-009deg-100x60x60-3-0-2-zarr/
-access-link: s3://test
-dataset-status: completed
-dataset-region: global
-dataset-theme: ["ocean", "environment"]
-cf-parameter: [{"Name" : "hydrology"}]
+dataset_id: hydrology-1D-0.009deg-100x60x60-3.0.2.zarr
+collection_id: hydrology
+osc_themes:
+  - Land
+  - Oceans
+# non-mandatory
+documentation_link: https://deepesdl.readthedocs.io/en/latest/datasets/hydrology-1D-0.009deg-100x60x60-3.0.2.zarr/
+access_link: s3://test
+dataset_status: completed
+osc_region: global
+cf_parameter:
+  - name: hydrology
 ```
 
-dataset-id has to be a valid dataset-id from `deep-esdl-public` s3 or your team bucket.
\ No newline at end of file
+`dataset_id` has to be a valid dataset ID from the `deep-esdl-public` S3 bucket or your team bucket.
+
+### deep-code publish-workflow
+
+Publish a workflow/experiment to the EarthCODE open-science catalog.
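+
+Besides the CLI shown below, the publish can also be triggered from Python. A
+minimal sketch (module paths as introduced in this PR; a `.gitaccess`
+credentials file is expected in the working directory):
+
+```python
+from deep_code.tools.publish import WorkflowPublisher
+
+# Reads the workflow config, builds an OGC API Record for it, and opens a
+# pull request against the EarthCODE open-science catalog repository.
+publisher = WorkflowPublisher()
+publisher.publish_workflow(workflow_config_path="workflow-config.yaml")
+```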
+
+```commandline
+deep-code publish-workflow /path/to/workflow-config.yaml
+```
+#### workflow-config.yaml example
+
+```
+workflow_id: "4D Med hydrology cube generation"
+properties:
+  title: "Hydrology cube generation recipe"
+  description: "4D Med cube generation"
+  keywords:
+    - Earth Science
+  themes:
+    - Atmosphere
+    - Ocean
+    - Evaporation
+  license: proprietary
+  jupyter_kernel_info:
+    name: deepesdl-xcube-1.7.1
+    python_version: 3.11
+    env_file: https://git/env.yml
+links:
+  - rel: "documentation"
+    type: "application/json"
+    title: "4DMed Hydrology Cube Generation Recipe"
+    href: "https://github.com/deepesdl/cube-gen/tree/main/hydrology/README.md"
+  - rel: "jupyter-notebook"
+    type: "application/json"
+    title: "Workflow Jupyter Notebook"
+    href: "https://github.com/deepesdl/cube-gen/blob/main/hydrology/notebooks/reading_hydrology.ipynb"
+contact:
+  - name: Tejas Morbagal Harish
+    organization: Brockmann Consult GmbH
+    links:
+      - rel: "about"
+        type: "text/html"
+        href: "https://www.brockmann-consult.de/"
+```
diff --git a/deep_code/cli/main.py b/deep_code/cli/main.py
index be88985..af140a4 100644
--- a/deep_code/cli/main.py
+++ b/deep_code/cli/main.py
@@ -6,7 +6,7 @@
 
 import click
 
-from deep_code.cli.publish import publish_dataset
+from deep_code.cli.publish import publish_dataset, publish_workflow
 
 
 @click.group()
@@ -16,5 +16,7 @@ def main():
 
 main.add_command(publish_dataset)
+main.add_command(publish_workflow)
+
 
 if __name__ == "__main__":
     main()
diff --git a/deep_code/cli/publish.py b/deep_code/cli/publish.py
index 48b1e63..a3d81d0 100644
--- a/deep_code/cli/publish.py
+++ b/deep_code/cli/publish.py
@@ -6,16 +6,21 @@
 
 import click
 
-from deep_code.tools.publish import DatasetPublisher
+from deep_code.tools.publish import DatasetPublisher, WorkflowPublisher
 
 
 @click.command(name="publish-dataset")
-@click.argument(
-    "dataset_config",
-    type=click.Path(exists=True)
-)
+@click.argument("dataset_config", type=click.Path(exists=True))
 def publish_dataset(dataset_config):
     """Request publishing a dataset to the open science catalogue.
     """
     publisher = DatasetPublisher()
     publisher.publish_dataset(dataset_config_path=dataset_config)
+
+
+@click.command(name="publish-workflow")
+@click.argument("workflow_metadata", type=click.Path(exists=True))
+def publish_workflow(workflow_metadata):
+    """Request publishing a workflow to the open science catalogue."""
+    workflow_publisher = WorkflowPublisher()
+    workflow_publisher.publish_workflow(workflow_config_path=workflow_metadata)
diff --git a/deep_code/constants.py b/deep_code/constants.py
index 68982bc..992ddf4 100644
--- a/deep_code/constants.py
+++ b/deep_code/constants.py
@@ -9,3 +9,8 @@
 OSC_REPO_OWNER = "ESA-EarthCODE"
 OSC_REPO_NAME = "open-science-catalog-metadata-testing"
 OSC_BRANCH_NAME = "add-new-collection"
+DEFAULT_THEME_SCHEME = (
+    "https://gcmd.earthdata.nasa.gov/kms/concepts/concept_scheme/sciencekeywords"
+)
+OGC_API_RECORD_SPEC = "http://www.opengis.net/spec/ogcapi-records-1/1.0/req/record-core"
+WF_BRANCH_NAME = "add-new-workflow-from-deepesdl"
diff --git a/deep_code/tests/tools/test_publish.py b/deep_code/tests/tools/test_publish.py
index 47c9961..3e0f5e8 100644
--- a/deep_code/tests/tools/test_publish.py
+++ b/deep_code/tests/tools/test_publish.py
@@ -1,5 +1,6 @@
+from unittest.mock import MagicMock, mock_open, patch
+
 import pytest
-from unittest.mock import patch, MagicMock, mock_open
 
 from deep_code.tools.publish import DatasetPublisher
 
@@ -33,9 +34,7 @@ def test_publish_dataset_missing_ids(self, mock_fsspec_open):
         publisher = DatasetPublisher()
 
         with pytest.raises(
-            ValueError,
-            match="Dataset ID or Collection ID is missing in the "
-            "dataset-config.yaml file.",
+            ValueError, match="Dataset ID or Collection ID missing in the config."
         ):
             publisher.publish_dataset("/path/to/dataset-config.yaml")
 
@@ -54,22 +53,21 @@ def test_publish_dataset_success(
         mock_subprocess_run,
         mock_chdir,
     ):
-        # Mock the YAML reads
         git_yaml_content = """
-        github-username: test-user
-        github-token: test-token
-        """
+            github-username: test-user
+            github-token: test-token
+            """
         dataset_yaml_content = """
-        dataset-id: test-dataset
-        collection-id: test-collection
-        documentation-link: http://example.com/doc
-        access-link: http://example.com/access
-        dataset-status: ongoing
-        dataset-region: Global
-        dataset-theme: ["climate"]
-        cf-parameter: []
-        """
+            dataset_id: test-dataset
+            collection_id: test-collection
+            documentation_link: http://example.com/doc
+            access_link: http://example.com/access
+            dataset_status: ongoing
+            osc_region: Global
+            osc_themes: ["climate"]
+            cf_parameter: []
+            """
         mock_fsspec_open.side_effect = [
             mock_open(read_data=git_yaml_content)(),
             mock_open(read_data=dataset_yaml_content)(),
         ]
@@ -102,8 +100,8 @@
             "links": [],
             "stac_version": "1.0.0",
         }
-        with patch("deep_code.tools.publish.OSCProductSTACGenerator") as mock_generator:
-            mock_generator.return_value.build_stac_collection.return_value = (
+        with patch("deep_code.tools.publish.OscDatasetStacGenerator") as mock_generator:
+            mock_generator.return_value.build_dataset_stac_collection.return_value = (
                 mock_collection
             )
 
            # Start the test
            publisher = DatasetPublisher()
            publisher.publish_dataset("/fake/path/to/dataset-config.yaml")
 
-        # 6Assert that we called git clone with /tmp/temp_repo
+        # Assert that we called git clone with /tmp/temp_repo
         # Because expanduser("~") is now patched to /tmp, the actual path is /tmp/temp_repo
         auth_url = "https://test-user:test-token@github.com/test-user/open-science-catalog-metadata-testing.git"
         mock_subprocess_run.assert_any_call(
diff --git a/deep_code/tests/utils/test_dataset_stac_generator.py b/deep_code/tests/utils/test_dataset_stac_generator.py
index 12321b2..64285a7 100644
--- a/deep_code/tests/utils/test_dataset_stac_generator.py
+++ b/deep_code/tests/utils/test_dataset_stac_generator.py
@@ -1,13 +1,13 @@
 import os
+import unittest
 from datetime import datetime
+from unittest.mock import MagicMock, patch
 
 import numpy as np
 from pystac import Collection
-import unittest
-from unittest.mock import patch, MagicMock
 from xarray import Dataset
 
-from deep_code.utils.dataset_stac_generator import OSCProductSTACGenerator
+from deep_code.utils.dataset_stac_generator import OscDatasetStacGenerator
 
 
 class TestOSCProductSTACGenerator(unittest.TestCase):
@@ -28,15 +28,31 @@ def setUp(self, mock_data_store):
             },
             attrs={"description": "Mock dataset for testing.", "title": "Mock Dataset"},
             data_vars={
-                "var1": (("time", "lat", "lon"), np.random.rand(2, 5, 10)),
-                "var2": (("time", "lat", "lon"), np.random.rand(2, 5, 10)),
+                "var1": (
+                    ("time", "lat", "lon"),
+                    np.random.rand(2, 5, 10),
+                    {
+                        "description": "dummy",
+                        "standard_name": "var1",
+                        "gcmd_keyword_url": "https://dummy",
+                    },
+                ),
+                "var2": (
+                    ("time", "lat", "lon"),
+                    np.random.rand(2, 5, 10),
+                    {
+                        "description": "dummy",
+                        "standard_name": "var2",
+                        "gcmd_keyword_url": "https://dummy",
+                    },
+                ),
             },
         )
         mock_store = MagicMock()
         mock_store.open_data.return_value = self.mock_dataset
         mock_data_store.return_value = mock_store
-        self.generator = OSCProductSTACGenerator(
+        self.generator = OscDatasetStacGenerator(
             dataset_id="mock-dataset-id",
             collection_id="mock-collection-id",
             access_link="s3://mock-bucket/mock-dataset",
@@ -66,7 +82,7 @@ def test_get_temporal_extent(self):
 
     def test_get_variables(self):
         """Test variable extraction."""
-        variables = self.generator._get_variables()
+        variables = self.generator.get_variable_ids()
         self.assertEqual(variables, ["var1", "var2"])
 
     def test_get_general_metadata(self):
@@ -78,7 +94,7 @@ def test_get_general_metadata(self):
     @patch("pystac.Collection.set_self_href")
     def test_build_stac_collection(self, mock_set_self_href, mock_add_link):
         """Test STAC collection creation."""
-        collection = self.generator.build_stac_collection()
+        collection = self.generator.build_dataset_stac_collection()
         self.assertIsInstance(collection, Collection)
         self.assertEqual(collection.id, "mock-collection-id")
         self.assertEqual(collection.description, "Mock dataset for testing.")
@@ -104,8 +120,6 @@ def test_invalid_temporal_extent(self):
         with self.assertRaises(ValueError):
             self.generator._get_temporal_extent()
 
-
-class TestOpenDataset(unittest.TestCase):
     @patch("deep_code.utils.dataset_stac_generator.new_data_store")
     @patch("deep_code.utils.dataset_stac_generator.logging.getLogger")
     def test_open_dataset_success_public_store(self, mock_logger, mock_new_data_store):
@@ -113,10 +127,10 @@ def test_open_dataset_success_public_store(self, mock_logger, mock_new_data_stor
         # Create a mock store and mock its `open_data` method
         mock_store = MagicMock()
         mock_new_data_store.return_value = mock_store
-        mock_store.open_data.return_value = "mock_dataset"
+        mock_store.open_data.return_value = self.mock_dataset
 
         # Instantiate the generator (this will implicitly call _open_dataset)
-        generator = OSCProductSTACGenerator("mock-dataset-id", "mock-collection-id")
+        generator = OscDatasetStacGenerator("mock-dataset-id", "mock-collection-id")
 
         # Validate that the dataset is assigned correctly
-        self.assertEqual(generator.dataset, "mock_dataset")
+        self.assertEqual(generator.dataset, self.mock_dataset)
@@ -151,13 +165,13 @@ def test_open_dataset_success_authenticated_store(
             mock_store,  # Second call (authenticated store) returns a mock store
         ]
-        mock_store.open_data.return_value = "mock_dataset"
+        mock_store.open_data.return_value = self.mock_dataset
 
         os.environ["S3_USER_STORAGE_BUCKET"] = "mock-bucket"
         os.environ["S3_USER_STORAGE_KEY"] = "mock-key"
         os.environ["S3_USER_STORAGE_SECRET"] = "mock-secret"
 
-        generator = OSCProductSTACGenerator("mock-dataset-id", "mock-collection-id")
+        generator = OscDatasetStacGenerator("mock-dataset-id", "mock-collection-id")
 
         # Validate that the dataset was successfully opened with the authenticated store
-        self.assertEqual(generator.dataset, "mock_dataset")
+        self.assertEqual(generator.dataset, self.mock_dataset)
@@ -195,7 +209,7 @@ def test_open_dataset_failure(self, mock_logger, mock_new_data_store):
         os.environ["S3_USER_STORAGE_SECRET"] = "mock-secret"
 
         with self.assertRaises(ValueError) as context:
-            OSCProductSTACGenerator("mock-dataset-id", "mock-collection-id")
+            OscDatasetStacGenerator("mock-dataset-id", "mock-collection-id")
 
         self.assertIn(
             "Failed to open Zarr dataset with ID mock-dataset-id",
diff --git a/deep_code/tests/utils/test_github_automation.py b/deep_code/tests/utils/test_github_automation.py
index 58acc09..6a66868 100644
--- a/deep_code/tests/utils/test_github_automation.py
+++ b/deep_code/tests/utils/test_github_automation.py
@@ -1,7 +1,8 @@
+import json
 import unittest
-from unittest.mock import patch, MagicMock
 from pathlib import Path
-import json
+from unittest.mock import MagicMock, patch
+
 from deep_code.utils.github_automation import GitHubAutomation
diff --git a/deep_code/tests/utils/test_ogc_api_record.py b/deep_code/tests/utils/test_ogc_api_record.py
new file mode 100644
index 0000000..52640fe
--- /dev/null
+++ b/deep_code/tests/utils/test_ogc_api_record.py
@@ -0,0 +1,113 @@
+import unittest
+
+from deep_code.constants import OGC_API_RECORD_SPEC
+from deep_code.utils.ogc_api_record import (
+    Contact,
+    JupyterKernelInfo,
+    OgcRecord,
+    RecordProperties,
+    Theme,
+    ThemeConcept,
+)
+
+
+class TestClasses(unittest.TestCase):
+    def test_contact_initialization(self):
+        contact = Contact(
+            name="Person-X",
+            organization="Organization X",
+            position="Researcher",
+            links=[{"url": "http://example.com", "type": "website"}],
+            contactInstructions="Contact via email",
+            roles=["developer", "reviewer"],
+        )
+
+        self.assertEqual(contact.name, "Person-X")
+        self.assertEqual(contact.organization, "Organization X")
+        self.assertEqual(contact.position, "Researcher")
+        self.assertEqual(len(contact.links), 1)
+        self.assertEqual(contact.contactInstructions, "Contact via email")
+        self.assertIn("developer", contact.roles)
+
+    def test_theme_concept_initialization(self):
+        theme_concept = ThemeConcept(id="concept1")
+        self.assertEqual(theme_concept.id, "concept1")
+
+    def test_theme_initialization(self):
+        theme_concepts = [ThemeConcept(id="concept1"), ThemeConcept(id="concept2")]
+        theme = Theme(concepts=theme_concepts, scheme="http://example.com/scheme")
+
+        self.assertEqual(len(theme.concepts), 2)
+        self.assertEqual(theme.scheme, "http://example.com/scheme")
+
+    def test_jupyter_kernel_info_initialization(self):
+        kernel_info = JupyterKernelInfo(
+            name="Python", python_version=3.9, env_file="env.yml"
+        )
+
+        self.assertEqual(kernel_info.name, "Python")
+        self.assertEqual(kernel_info.python_version, 3.9)
+        self.assertEqual(kernel_info.env_file, "env.yml")
+
+    def test_record_properties_initialization(self):
+        kernel_info = JupyterKernelInfo(
+            name="Python", python_version=3.9, env_file="env.yml"
+        )
+        contacts = [Contact(name="Jane Doe", organization="Org Y")]
+        themes = [Theme(concepts=[ThemeConcept(id="concept1")], scheme="scheme1")]
+
+        record_properties = RecordProperties(
+            created="2025-01-01",
+            type="dataset",
+            title="Sample Dataset",
+            description="A sample dataset",
+            jupyter_kernel_info=kernel_info,
+            updated="2025-01-02",
+            contacts=contacts,
+            themes=themes,
+            keywords=["sample", "test"],
+            formats=[{"format": "JSON"}],
+            license="CC-BY",
+        )
+
+        self.assertEqual(record_properties.created, "2025-01-01")
+        self.assertEqual(record_properties.updated, "2025-01-02")
+        self.assertEqual(record_properties.type, "dataset")
+        self.assertEqual(record_properties.title, "Sample Dataset")
+        self.assertEqual(record_properties.description, "A sample dataset")
+        self.assertEqual(record_properties.jupyter_kernel_info.name, "Python")
+        self.assertEqual(len(record_properties.contacts), 1)
+        self.assertEqual(len(record_properties.themes), 1)
+        self.assertIn("sample", record_properties.keywords)
+        self.assertEqual(record_properties.license, "CC-BY")
+
+    def test_ogc_record_initialization(self):
+        kernel_info = JupyterKernelInfo(
+            name="Python", python_version=3.9, env_file="env.yml"
+        )
+        properties = RecordProperties(
+            created="2025-01-01",
+            type="dataset",
+            title="Sample Dataset",
+            description="A sample dataset",
+            jupyter_kernel_info=kernel_info,
+        )
+
+        ogc_record = OgcRecord(
+            id="record1",
+            type="Feature",
+            time={"start": "2025-01-01T00:00:00Z", "end": "2025-01-02T00:00:00Z"},
+            properties=properties,
+            links=[{"href": "http://example.com", "rel": "self"}],
+            linkTemplates=[{"template": "http://example.com/{id}"}],
+        )
+
+        self.assertEqual(ogc_record.id, "record1")
+        self.assertEqual(ogc_record.type, "Feature")
+        self.assertEqual(ogc_record.time["start"], "2025-01-01T00:00:00Z")
+        self.assertEqual(ogc_record.properties.title, "Sample Dataset")
+        self.assertEqual(len(ogc_record.links), 1)
+        self.assertEqual(
+            ogc_record.linkTemplates[0]["template"], "http://example.com/{id}"
+        )
+        self.assertEqual(ogc_record.conformsTo[0], OGC_API_RECORD_SPEC)
diff --git a/deep_code/tests/utils/test_ogc_record_generator.py b/deep_code/tests/utils/test_ogc_record_generator.py
new file mode 100644
index 0000000..f4fe372
--- /dev/null
+++ b/deep_code/tests/utils/test_ogc_record_generator.py
@@ -0,0 +1,63 @@
+import unittest
+from datetime import datetime, timezone
+
+from deep_code.constants import DEFAULT_THEME_SCHEME
+from deep_code.utils.ogc_record_generator import OSCWorkflowOGCApiRecordGenerator
+
+
+class TestOSCWorkflowOGCApiRecordGenerator(unittest.TestCase):
+    def test_build_contact_objects(self):
+        contacts_list = [
+            {"name": "Alice", "organization": "Org A", "position": "Researcher"},
+            {"name": "Bob", "organization": "Org B", "position": "Developer"},
+        ]
+
+        result = OSCWorkflowOGCApiRecordGenerator.build_contact_objects(contacts_list)
+
+        self.assertEqual(len(result), 2)
+        self.assertEqual(result[0].name, "Alice")
+        self.assertEqual(result[0].organization, "Org A")
+        self.assertEqual(result[0].position, "Researcher")
+        self.assertEqual(result[1].name, "Bob")
+        self.assertEqual(result[1].organization, "Org B")
+        self.assertEqual(result[1].position, "Developer")
+
+    def test_build_theme(self):
+        osc_themes = ["theme1", "theme2"]
+
+        theme = OSCWorkflowOGCApiRecordGenerator.build_theme(osc_themes)
+
+        self.assertEqual(len(theme.concepts), 2)
+        self.assertEqual(theme.concepts[0].id, "theme1")
+        self.assertEqual(theme.concepts[1].id, "theme2")
+        self.assertEqual(theme.scheme, DEFAULT_THEME_SCHEME)
+
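+    # A minimal extra check (not part of the original test set): per the
+    # implementation of build_theme, an empty theme list still yields a
+    # Theme carrying the default GCMD scheme.
+    def test_build_theme_empty_list(self):
+        theme = OSCWorkflowOGCApiRecordGenerator.build_theme([])
+
+        self.assertEqual(theme.concepts, [])
+        self.assertEqual(theme.scheme, DEFAULT_THEME_SCHEME)
+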
+    def test_build_record_properties(self):
+        generator = OSCWorkflowOGCApiRecordGenerator()
+        properties = {
+            "title": "Test Workflow",
+            "description": "A test description",
+            "themes": ["theme1"],
+            "jupyter_kernel_info": {
+                "name": "deepesdl-xcube-1.7.1",
+                "python_version": 3.11,
+                "env_file": "https://git/env.yml",
+            },
+        }
+        contacts = [
+            {"name": "Alice", "organization": "Org A", "position": "Researcher"}
+        ]
+
+        record_properties = generator.build_record_properties(properties, contacts)
+
+        now_iso = datetime.now(timezone.utc).isoformat()
+
+        self.assertEqual(record_properties.title, "Test Workflow")
+        self.assertEqual(record_properties.description, "A test description")
+        self.assertEqual(len(record_properties.contacts), 1)
+        self.assertEqual(record_properties.contacts[0].name, "Alice")
+        self.assertEqual(len(record_properties.themes), 1)
+        self.assertEqual(record_properties.themes[0].concepts[0].id, "theme1")
+        self.assertEqual(record_properties.type, "workflow")
+        self.assertTrue("created" in record_properties.__dict__)
+        self.assertTrue("updated" in record_properties.__dict__)
diff --git a/deep_code/tests/utils/test_osc_extension.py b/deep_code/tests/utils/test_osc_extension.py
index 66300cc..c11e3f8 100644
--- a/deep_code/tests/utils/test_osc_extension.py
+++ b/deep_code/tests/utils/test_osc_extension.py
@@ -1,5 +1,7 @@
 import unittest
+
 from pystac import Collection, Extent, SpatialExtent, TemporalExtent
+
 from deep_code.utils.osc_extension import OscExtension
diff --git a/deep_code/tools/publish.py b/deep_code/tools/publish.py
index 26b49f3..ba762e1 100644
--- a/deep_code/tools/publish.py
+++ b/deep_code/tools/publish.py
@@ -4,36 +4,39 @@
 # Permissions are hereby granted under the terms of the MIT License:
 # https://opensource.org/licenses/MIT.
 
-import fsspec
 import logging
+from pathlib import Path
+
+import fsspec
 import yaml
 
-from deep_code.constants import OSC_REPO_OWNER, OSC_REPO_NAME, OSC_BRANCH_NAME
-from deep_code.utils.dataset_stac_generator import OSCProductSTACGenerator
+from deep_code.constants import (
+    OSC_BRANCH_NAME,
+    OSC_REPO_NAME,
+    OSC_REPO_OWNER,
+    WF_BRANCH_NAME,
+)
+from deep_code.utils.dataset_stac_generator import OscDatasetStacGenerator
 from deep_code.utils.github_automation import GitHubAutomation
+from deep_code.utils.ogc_api_record import OgcRecord
+from deep_code.utils.ogc_record_generator import OSCWorkflowOGCApiRecordGenerator
 
 logger = logging.getLogger(__name__)
 logging.basicConfig(level=logging.INFO)
 
 
-class DatasetPublisher:
+class GitHubPublisher:
     """
-    Publishes products to the OSC GitHub repository.
-
-    Credentials must be provided via a hidden file named `.gitaccess`, located in
-    the root of the repository. This file is expected to contain YAML of the form:
-
-    github-username: "YOUR_GITHUB_USERNAME"
-    github-token: "YOUR_GITHUB_PERSONAL_ACCESS_TOKEN"
+    Helper class providing:
+      - Reading .gitaccess for credentials
+      - Common GitHub automation steps (fork, clone, branch, file commit, pull request)
     """
 
     def __init__(self):
         with fsspec.open(".gitaccess", "r") as file:
             git_config = yaml.safe_load(file) or {}
-
         self.github_username = git_config.get("github-username")
         self.github_token = git_config.get("github-token")
-
         if not self.github_username or not self.github_token:
             raise ValueError("GitHub credentials are missing in the `.gitaccess` file.")
 
@@ -41,61 +44,190 @@ def __init__(self):
             self.github_username, self.github_token, OSC_REPO_OWNER, OSC_REPO_NAME
         )
 
-    def publish_dataset(self, dataset_config_path: str):
-        """Publish a product collection to the specified GitHub repository.
+    def publish_files(
+        self,
+        branch_name: str,
+        file_dict: dict[str, dict],
+        commit_message: str,
+        pr_title: str,
+        pr_body: str,
+    ) -> str:
+        """Publish multiple files to a new branch and open a PR.
 
         Args:
-            dataset_config_path: Path to the YAML file containing dataset config
+            branch_name: Branch name to create (e.g. "osc-branch-collectionid").
+            file_dict: { file_path: file_content_dict } for each file to commit.
+            commit_message: Commit message for all changes.
+            pr_title: Title of the pull request.
+            pr_body: Description/body of the pull request.
+
+        Returns:
+            URL of the created pull request.
         """
-        with fsspec.open(dataset_config_path, "r") as file:
-            dataset_config = yaml.safe_load(file)
-
-        dataset_id = dataset_config.get("dataset-id")
-        collection_id = dataset_config.get("collection-id")
-        documentation_link = dataset_config.get("documentation-link")
-        access_link = dataset_config.get("access-link")
-        dataset_status = dataset_config.get("dataset-status")
-        osc_region = dataset_config.get("dataset-region")
-        dataset_theme = dataset_config.get("dataset-theme")
-        cf_params = dataset_config.get("cf-parameter")
-
-        if not dataset_id or not collection_id:
-            raise ValueError(
-                "Dataset ID or Collection ID is missing in the dataset-config.yaml "
-                "file."
-            )
 
         try:
-            logger.info("Generating STAC collection...")
-            generator = OSCProductSTACGenerator(
-                dataset_id=dataset_id,
-                collection_id=collection_id,
-                documentation_link=documentation_link,
-                access_link=access_link,
-                osc_status=dataset_status,
-                osc_region=osc_region,
-                osc_themes=dataset_theme,
-                cf_params=cf_params,
-            )
-            collection = generator.build_stac_collection()
-
-            file_path = f"products/{collection_id}/collection.json"
-
-            logger.info("Automating GitHub tasks...")
+            logger.info("Forking and cloning repository...")
             self.github_automation.fork_repository()
             self.github_automation.clone_repository()
-            OSC_NEW_BRANCH_NAME = OSC_BRANCH_NAME + "-" + collection_id
-            self.github_automation.create_branch(OSC_NEW_BRANCH_NAME)
-            self.github_automation.add_file(file_path, collection.to_dict())
-            self.github_automation.commit_and_push(
-                OSC_NEW_BRANCH_NAME, f"Add new collection:{collection_id}"
-            )
+            self.github_automation.create_branch(branch_name)
+
+            # Add each file to the branch
+            for file_path, content in file_dict.items():
+                logger.info(f"Adding {file_path} to {branch_name}")
+                self.github_automation.add_file(file_path, content)
+
+            # Commit and push
+            self.github_automation.commit_and_push(branch_name, commit_message)
+
+            # Create pull request
             pr_url = self.github_automation.create_pull_request(
-                OSC_NEW_BRANCH_NAME,
-                f"Add new collection",
-                "This PR adds a new collection to the repository.",
+                branch_name, pr_title, pr_body
             )
-            logger.info(f"Pull request created: {pr_url}")
+            return pr_url
         finally:
+            # Cleanup local clone
             self.github_automation.clean_up()
+
+
+class DatasetPublisher:
+    """Publishes products (datasets) to the OSC GitHub repository.
+
+    Delegates the Git/GitHub steps to ``GitHubPublisher`` via composition.
+    """
+
+    def __init__(self):
+        # Composition: reuse the shared GitHub publishing helper
+        self.gh_publisher = GitHubPublisher()
+
+    def publish_dataset(self, dataset_config_path: str):
+        """Publish a product collection to the specified GitHub repository."""
+        with fsspec.open(dataset_config_path, "r") as file:
+            dataset_config = yaml.safe_load(file) or {}
+
+        dataset_id = dataset_config.get("dataset_id")
+        collection_id = dataset_config.get("collection_id")
+        documentation_link = dataset_config.get("documentation_link")
+        access_link = dataset_config.get("access_link")
+        dataset_status = dataset_config.get("dataset_status")
+        osc_region = dataset_config.get("osc_region")
+        osc_themes = dataset_config.get("osc_themes")
+        cf_params = dataset_config.get("cf_parameter")
+
+        if not dataset_id or not collection_id:
+            raise ValueError("Dataset ID or Collection ID missing in the config.")
+
+        logger.info("Generating STAC collection...")
+
+        generator = OscDatasetStacGenerator(
+            dataset_id=dataset_id,
+            collection_id=collection_id,
+            documentation_link=documentation_link,
+            access_link=access_link,
+            osc_status=dataset_status,
+            osc_region=osc_region,
+            osc_themes=osc_themes,
+            cf_params=cf_params,
+        )
+
+        variable_ids = generator.get_variable_ids()
+        ds_collection = generator.build_dataset_stac_collection()
+
+        # Prepare a dictionary of file paths and content
+        file_dict = {}
+        product_path = f"products/{collection_id}/collection.json"
+        file_dict[product_path] = ds_collection.to_dict()
+
+        # Add or update variable files
+        for var_id in variable_ids:
+            var_file_path = f"variables/{var_id}/catalog.json"
+            if not self.gh_publisher.github_automation.file_exists(var_file_path):
+                logger.info(
+                    f"Variable catalog for {var_id} does not exist. Creating..."
+                )
+                var_metadata = generator.variables_metadata.get(var_id)
+                var_catalog = generator.build_variable_catalog(var_metadata)
+                file_dict[var_file_path] = var_catalog.to_dict()
+            else:
+                logger.info(
+                    f"Variable catalog already exists for {var_id}, adding product link."
+                )
+                full_path = (
+                    Path(self.gh_publisher.github_automation.local_clone_dir)
+                    / var_file_path
+                )
+                updated_catalog = generator.update_existing_variable_catalog(
+                    full_path, var_id
+                )
+                file_dict[var_file_path] = updated_catalog.to_dict()
+
+        # Create branch name, commit message, PR info
+        branch_name = f"{OSC_BRANCH_NAME}-{collection_id}"
+        commit_message = f"Add new dataset collection: {collection_id}"
+        pr_title = "Add new dataset collection"
+        pr_body = "This PR adds a new dataset collection to the repository."
+
+        # Publish all files in one go
+        pr_url = self.gh_publisher.publish_files(
+            branch_name=branch_name,
+            file_dict=file_dict,
+            commit_message=commit_message,
+            pr_title=pr_title,
+            pr_body=pr_body,
+        )
+
+        logger.info(f"Pull request created: {pr_url}")
+
+
+class WorkflowPublisher:
+    """Publishes workflows to the OSC GitHub repository."""
+
+    def __init__(self):
+        self.gh_publisher = GitHubPublisher()
+
+    @staticmethod
+    def _normalize_name(name: str | None) -> str | None:
+        return name.replace(" ", "-").lower() if name else None
+
+    def publish_workflow(self, workflow_config_path: str):
+        """Publish a workflow as an OGC API Record to the OSC GitHub repository."""
+        with fsspec.open(workflow_config_path, "r") as file:
+            workflow_config = yaml.safe_load(file) or {}
+
+        workflow_id = self._normalize_name(workflow_config.get("workflow_id"))
+        if not workflow_id:
+            raise ValueError("workflow_id is missing in workflow config.")
+
+        properties_list = workflow_config.get("properties", [])
+        contacts = workflow_config.get("contact", [])
+        links = workflow_config.get("links", [])
+
+        logger.info("Generating OGC API Record for the workflow...")
+        rg = OSCWorkflowOGCApiRecordGenerator()
+        wf_record_properties = rg.build_record_properties(properties_list, contacts)
+
+        ogc_record = OgcRecord(
+            id=workflow_id,
+            type="Feature",
+            time={},
+            properties=wf_record_properties,
+            links=links,
+        )
+
+        file_path = f"workflow/{workflow_id}/collection.json"
+
+        # Prepare the single file dict
+        file_dict = {file_path: ogc_record.to_dict()}
+
+        branch_name = f"{WF_BRANCH_NAME}-{workflow_id}"
+        commit_message = f"Add new workflow: {workflow_id}"
+        pr_title = "Add new workflow"
+        pr_body = "This PR adds a new workflow to the OSC repository."
+
+        pr_url = self.gh_publisher.publish_files(
+            branch_name=branch_name,
+            file_dict=file_dict,
+            commit_message=commit_message,
+            pr_title=pr_title,
+            pr_body=pr_body,
+        )
+
+        logger.info(f"Pull request created: {pr_url}")
diff --git a/deep_code/utils/dataset_stac_generator.py b/deep_code/utils/dataset_stac_generator.py
index 21f4cf8..3d4da00 100644
--- a/deep_code/utils/dataset_stac_generator.py
+++ b/deep_code/utils/dataset_stac_generator.py
@@ -4,18 +4,18 @@
 # Permissions are hereby granted under the terms of the MIT License:
 # https://opensource.org/licenses/MIT.
 
-import os
 import logging
+import os
 from datetime import datetime, timezone
 
 import pandas as pd
-from pystac import Collection, Extent, Link, SpatialExtent, TemporalExtent
+from pystac import Catalog, Collection, Extent, Link, SpatialExtent, TemporalExtent
 from xcube.core.store import new_data_store
 
 from deep_code.utils.osc_extension import OscExtension
 
 
-class OSCProductSTACGenerator:
+class OscDatasetStacGenerator:
     """Generates OSC STAC Collections for a product from Zarr datasets.
 
     Args:
@@ -53,6 +53,7 @@ def __init__(
         self.cf_params = cf_params or {}
         self.logger = logging.getLogger(__name__)
         self.dataset = self._open_dataset()
+        self.variables_metadata = self.get_variables_metadata()
 
     def _open_dataset(self):
         """Open the dataset using a S3 store as a xarray Dataset."""
@@ -170,29 +171,6 @@ def _normalize_name(name: str | None) -> str | None:
         return name.replace(" ", "-").lower() if name else None
 
-    def _get_variables(self) -> list[str]:
-        """Extracts variable names or descriptions from the dataset.
-
-        Variables are prioritized based on their `long_name` or `standard_name`
-        attributes. If neither is available, the variable's key from
-        `dataset.data_vars.keys()` is used.
-
-        Returns:
-            A list of variable names or descriptions.
-        """
-        variables = []
-        for var_name, variable in self.dataset.data_vars.items():
-            long_name = self._normalize_name(variable.attrs.get("long_name"))
-            standard_name = self._normalize_name(variable.attrs.get("standard_name"))
-            if not long_name and not standard_name:
-                self.logger.error(
-                    f"Metadata missing for variable '{var_name}': 'long_name' and "
-                    f"'standard_name' attributes are not available."
-                )
-            # Prioritize 'long_name', fallback to 'standard_name', then use variable key
-            variables.append(long_name or standard_name or var_name)
-        return variables
-
     def _get_general_metadata(self) -> dict:
         return {
             "description": self.dataset.attrs.get(
@@ -200,36 +178,173 @@ def _get_general_metadata(self) -> dict:
             )
         }
 
-    def _get_variable_metadata(self, var_name, var_data) -> dict:
-        """Extract metadata from a single variable's attributes.
+    def extract_metadata_for_variable(self, variable_data) -> dict:
+        """Extract metadata for a single variable."""
+        long_name = variable_data.attrs.get("long_name")
+        standard_name = variable_data.attrs.get("standard_name")
+        variable_id = standard_name or variable_data.name
+        description = variable_data.attrs.get("description", long_name)
+        gcmd_keyword_url = variable_data.attrs.get("gcmd_keyword_url")
+        return {
+            "variable_id": self._normalize_name(variable_id),
+            "description": description,
+            "gcmd_keyword_url": gcmd_keyword_url,
+        }
+
+    def get_variable_ids(self) -> list[str]:
+        """Get variable IDs for all variables in the dataset."""
+        return list(self.variables_metadata.keys())
+
+    def get_variables_metadata(self) -> dict[str, dict]:
+        """Extract metadata for all variables in the dataset."""
+        variables_metadata = {}
+        for var_name, variable in self.dataset.data_vars.items():
+            var_metadata = self.extract_metadata_for_variable(variable)
+            variables_metadata[var_metadata.get("variable_id")] = var_metadata
+        return variables_metadata
+
+    def _add_gcmd_link_to_var_catalog(
+        self, var_catalog: Catalog, var_metadata: dict
+    ) -> None:
+        """
+        Checks for a GCMD keyword URL in var_metadata, adds a 'via' link to the catalog
+        pointing to the GCMD Keyword Viewer.
 
         Args:
-            var_name: The raw variable name in the dataset.
-            var_data: An xarray DataArray containing variable data and attrs.
+            var_catalog: The PySTAC Catalog to which we want to add the link.
+            var_metadata: Dictionary containing metadata about the variable,
+                including 'gcmd_keyword_url'.
+        """
+        gcmd_keyword_url = var_metadata.get("gcmd_keyword_url")
+        if not gcmd_keyword_url:
+            self.logger.debug(
+                f"No gcmd_keyword_url in var_metadata. Skipping adding GCMD link in "
+                f'the {var_metadata.get("variable_id")} catalog'
+            )
+            return
+        var_catalog.add_link(
+            Link(
+                rel="via",
+                target=gcmd_keyword_url,
+                title="Description",
+                media_type="text/html",
+            )
+        )
+        self.logger.info(
+            f'Added GCMD link for {var_metadata.get("variable_id")} '
+            f"catalog {gcmd_keyword_url}."
+        )
+
+    def build_variable_catalog(self, var_metadata) -> Catalog:
+        """Build an OSC STAC Catalog for a single variable of the dataset.
 
         Returns:
-            A dict with 'id', 'title', and 'description'.
+            A pystac.Catalog object.
         """
-        long_name = var_data.attrs.get("long_name")
-        standard_name = var_data.attrs.get("standard_name")
-        title = long_name or standard_name or var_name
+        var_id = var_metadata.get("variable_id")
+        concepts = [{"id": theme} for theme in self.osc_themes]
 
-        normalized_title = self._normalize_name(title)
+        themes = [
+            {
+                "scheme": "https://github.com/stac-extensions/osc#theme",
+                "concepts": concepts,
+            }
+        ]
 
-        description = var_data.attrs.get("description", "No variable description")
+        now_iso = datetime.now(timezone.utc).isoformat()
 
-        return {"id": var_name, "title": normalized_title, "description": description}
+        # Create a PySTAC Catalog object
+        var_catalog = Catalog(
+            id=var_id,
+            description=var_metadata.get("description"),
+            title=var_id,
+            stac_extensions=[
+                "https://stac-extensions.github.io/themes/v1.0.0/schema.json"
+            ],
+        )
 
-    def build_stac_collection(self) -> Collection:
-        """
-        Build an OSC STAC Collection for the dataset.
+        var_catalog.stac_version = "1.0.0"
+        var_catalog.extra_fields["updated"] = now_iso
+        var_catalog.keywords = []
 
-        :return: A pystac.Collection object.
+        # Add the 'themes' block
+        var_catalog.extra_fields["themes"] = themes
+
+        var_catalog.remove_links("root")
+        # Add relevant links
+        var_catalog.add_link(
+            Link(
+                rel="root",
+                target="../../catalog.json",
+                media_type="application/json",
+                title="Open Science Catalog",
+            )
+        )
+
+        # 'child' link: points to the product (or one of its collections) using this variable
+        var_catalog.add_link(
+            Link(
+                rel="child",
+                target=f"../../products/{self.collection_id}/collection.json",
+                media_type="application/json",
+                title=self.collection_id,
+            )
+        )
+
+        # 'parent' link: back up to the variables overview
+        var_catalog.add_link(
+            Link(
+                rel="parent",
+                target="../catalog.json",
+                media_type="application/json",
+                title="Variables",
+            )
+        )
+        # Add gcmd link for the variable definition
+        self._add_gcmd_link_to_var_catalog(var_catalog, var_metadata)
+
+        self_href = (
+            f"https://esa-earthcode.github.io/open-science-catalog-metadata/variables"
+            f"/{var_id}/catalog.json"
+        )
+        # 'self' link: the direct URL where this JSON is hosted
+        var_catalog.set_self_href(self_href)
+
+        return var_catalog
+
+    def update_existing_variable_catalog(self, var_file_path, var_id) -> Catalog:
+        """Update an existing variable catalog with a link to this product."""
+        existing_catalog = Catalog.from_file(var_file_path)
+        now_iso = datetime.now(timezone.utc).isoformat()
+        existing_catalog.extra_fields["updated"] = now_iso
+
+        # add 'child' link as the product
+        existing_catalog.add_link(
+            Link(
+                rel="child",
+                target=f"../../products/{self.collection_id}/collection.json",
+                media_type="application/json",
+                title=self.collection_id,
+            )
+        )
+        self_href = (
+            f"https://esa-earthcode.github.io/open-science-catalog-metadata/variables"
+            f"/{var_id}/catalog.json"
+        )
+        # 'self' link: the direct URL where this JSON is hosted
+        existing_catalog.set_self_href(self_href)
+
+        return existing_catalog
+
+    def build_dataset_stac_collection(self) -> Collection:
+        """Build an OSC STAC Collection for the dataset.
+
+        Returns:
+            A pystac.Collection object.
         """
         try:
             spatial_extent = self._get_spatial_extent()
             temporal_extent = self._get_temporal_extent()
-            variables = self._get_variables()
+            variables = self.get_variable_ids()
             general_metadata = self._get_general_metadata()
         except ValueError as e:
             raise ValueError(f"Metadata extraction failed: {e}")
@@ -260,6 +375,7 @@ def build_stac_collection(self) -> Collection:
         now_iso = datetime.now(timezone.utc).isoformat()
         collection.extra_fields["created"] = now_iso
         collection.extra_fields["updated"] = now_iso
+        collection.title = self.collection_id
 
         # Remove any existing root link and re-add it properly
         collection.remove_links("root")
@@ -284,6 +400,16 @@ def build_stac_collection(self) -> Collection:
                 title="Products",
             )
         )
+        # Add links to the variable catalogs used by this dataset
+        for var in variables:
+            collection.add_link(
+                Link(
+                    rel="related",
+                    target=f"../../variables/{var}/catalog.json",
+                    media_type="application/json",
+                    title="Variable: " + var,
+                )
+            )
 
         self_href = (
             "https://esa-earthcode.github.io/"
diff --git a/deep_code/utils/github_automation.py b/deep_code/utils/github_automation.py
index d934d2a..dbecfe4 100644
--- a/deep_code/utils/github_automation.py
+++ b/deep_code/utils/github_automation.py
@@ -7,10 +7,11 @@
 import json
 import logging
 import os
-import requests
 import subprocess
 from pathlib import Path
 
+import requests
+
 
 class GitHubAutomation:
     """Automates GitHub operations needed to create a Pull Request.
@@ -113,3 +114,9 @@ def clean_up(self):
             subprocess.run(["rm", "-rf", self.local_clone_dir])
         except subprocess.CalledProcessError as e:
             raise RuntimeError(f"Failed to clean-up local repository: {e}")
+
+    def file_exists(self, file_path) -> bool:
+        full_path = Path(self.local_clone_dir) / file_path
+        exists = os.path.isfile(full_path)
+        logging.debug(f"Checking existence of {full_path}: {exists}")
+        return exists
diff --git a/deep_code/utils/ogc_api_record.py b/deep_code/utils/ogc_api_record.py
new file mode 100644
index 0000000..437c2c8
--- /dev/null
+++ b/deep_code/utils/ogc_api_record.py
@@ -0,0 +1,94 @@
+from typing import Any, Optional
+
+from xrlint.util.constructible import MappingConstructible
+from xrlint.util.serializable import JsonSerializable
+
+from deep_code.constants import OGC_API_RECORD_SPEC
+
+
+class Contact(MappingConstructible["Contact"], JsonSerializable):
+    def __init__(
+        self,
+        name: str,
+        organization: str,
+        position: str | None = "",
+        links: list[dict[str, Any]] | None = None,
+        contactInstructions: str | None = "",
+        roles: list[str] | None = None,
+    ):
+        self.name = name
+        self.organization = organization
+        self.position = position
+        self.links = links or []
+        self.contactInstructions = contactInstructions
+        self.roles = roles or ["principal investigator"]
+
+
+class ThemeConcept(MappingConstructible["ThemeConcept"], JsonSerializable):
+    def __init__(self, id: str):
+        self.id = id
+
+
+class Theme(MappingConstructible["Theme"], JsonSerializable):
+    def __init__(self, concepts: list[ThemeConcept], scheme: str):
+        self.concepts = concepts
+        self.scheme = scheme
+
+
+class JupyterKernelInfo(MappingConstructible["JupyterKernelInfo"], JsonSerializable):
+    def __init__(self, name: str, python_version: float, env_file: str):
+        self.name = name
+        self.python_version = python_version
+        self.env_file = env_file
+
+
+class RecordProperties(MappingConstructible["RecordProperties"], JsonSerializable):
+    def __init__(
+        self,
+        created: str,
+        type: str,
+        title: str,
+        description: str,
+        jupyter_kernel_info: JupyterKernelInfo,
+        updated: str | None = None,
+        contacts: list[Contact] | None = None,
+        themes: list[Theme] | None = None,
+        keywords: list[str] | None = None,
+        formats: list[dict] | None = None,
+        license: str | None = None,
+    ):
+        self.created = created
+        self.updated = updated
+        self.type = type
+        self.title = title
+        self.description = description
+        self.jupyter_kernel_info = jupyter_kernel_info
+        self.keywords = keywords or []
+        self.contacts = contacts
+        self.themes = themes
+        self.formats = formats or []
+        self.license = license
+
+
+class OgcRecord(MappingConstructible["OgcRecord"], JsonSerializable):
+    def __init__(
+        self,
+        id: str,
+        type: str,
+        time: dict,
+        properties: RecordProperties,
+        links: list[dict],
+        linkTemplates: list | None = None,
+        conformsTo: list[str] | None = None,
+        geometry: Optional[Any] = None,
+    ):
+        if conformsTo is None:
+            conformsTo = [OGC_API_RECORD_SPEC]
+        self.id = id
+        self.type = type
+        self.conformsTo = conformsTo
+        self.time = time
+        self.geometry = geometry
+        self.properties = properties
+        # Avoid a shared mutable default for linkTemplates
+        self.linkTemplates = linkTemplates or []
+        self.links = links
diff --git a/deep_code/utils/ogc_record_generator.py b/deep_code/utils/ogc_record_generator.py
new file mode 100644
index 0000000..481663f
--- /dev/null
+++ b/deep_code/utils/ogc_record_generator.py
@@ -0,0 +1,54 @@
+#!/usr/bin/env python3
+
+# Copyright (c) 2025 by Brockmann Consult GmbH
+# Permissions are hereby granted under the terms of the MIT License:
+# https://opensource.org/licenses/MIT.
+
+from datetime import datetime, timezone
+
+from deep_code.constants import DEFAULT_THEME_SCHEME
+from deep_code.utils.ogc_api_record import (
+    Contact,
+    RecordProperties,
+    Theme,
+    ThemeConcept,
+)
+
+
+class OSCWorkflowOGCApiRecordGenerator:
+    """Generates an OGC API Record for a workflow."""
+
+    @staticmethod
+    def build_contact_objects(contacts_list: list[dict]) -> list[Contact]:
+        """Build a list of Contact objects from a list of contact dictionaries.
+
+        Uses the inherited MappingConstructible logic to parse each dict.
+
+        Args:
+            contacts_list: A list of dictionaries, each containing contact information.
+
+        Returns:
+            A list of Contact instances.
+        """
+        return [Contact.from_value(cdict) for cdict in contacts_list]
+
+    @staticmethod
+    def build_theme(osc_themes: list[str]) -> Theme:
+        """Convert each theme string into a ThemeConcept and wrap them in a
+        Theme that uses the default GCMD scheme."""
+        concepts = [ThemeConcept(id=theme_str) for theme_str in osc_themes]
+        return Theme(concepts=concepts, scheme=DEFAULT_THEME_SCHEME)
+
+    def build_record_properties(self, properties, contacts) -> RecordProperties:
+        """Build a RecordProperties object from the workflow config's
+        properties mapping and its list of contact dicts."""
+        now_iso = datetime.now(timezone.utc).isoformat()
+        properties.update({"created": now_iso})
+        properties.update({"updated": now_iso})
+        themes_list = properties.get("themes", [])
+        properties.update({"contacts": self.build_contact_objects(contacts)})
+        if themes_list:
+            theme_obj = self.build_theme(themes_list)
+            properties.update({"themes": [theme_obj]})
+        properties.setdefault("type", "workflow")
+        return RecordProperties.from_value(properties)
diff --git a/deep_code/utils/osc_extension.py b/deep_code/utils/osc_extension.py
index 6aa7519..8a777de 100644
--- a/deep_code/utils/osc_extension.py
+++ b/deep_code/utils/osc_extension.py
@@ -7,10 +7,10 @@
 from typing import Literal
 
 import pystac
-from pystac import SpatialExtent, TemporalExtent, Extent
-from pystac.extensions.base import PropertiesExtension, ExtensionManagementMixin
+from pystac import Extent, SpatialExtent, TemporalExtent
+from pystac.extensions.base import ExtensionManagementMixin, PropertiesExtension
 
-from deep_code.constants import OSC_SCHEMA_URI, CF_SCHEMA_URI
+from deep_code.constants import CF_SCHEMA_URI, OSC_SCHEMA_URI
 
 
 class OscExtension(
diff --git a/pyproject.toml b/pyproject.toml
index 057f7b8..efac413 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -26,7 +26,8 @@ dependencies = [
     "pandas",
     "pystac",
     "pyyaml",
-    "xcube-core"
+    "xcube-core",
+    "xrlint",
 ]
 
 [tool.setuptools.dynamic]