Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 15 additions & 10 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@
and Python API providing utilities that aid integration of DeepESDL datasets
and experiments with EarthCODE.

The first release will focus on implementing the publish feature: DeepESDL
experiments/workflows are published as OGC API records, and datasets as OSC STAC collections.

## Setup

## Install
Expand Down Expand Up @@ -72,16 +75,18 @@ github-token: personal access token
#### dataset-config.yaml example

```
dataset-id: hydrology-1D-0.009deg-100x60x60-3.0.2.zarr
collection-id: hydrology

#non-mandatory
documentation-link: https://deepesdl.readthedocs.io/en/latest/datasets/hydrology-1D-0-009deg-100x60x60-3-0-2-zarr/
access-link: s3://test
dataset-status: completed
dataset-region: global
dataset-theme: ["ocean", "environment"]
cf-parameter: [{"Name" : "hydrology"}]
dataset_id: hydrology-1D-0.009deg-100x60x60-3.0.2.zarr
collection_id: hydrology
osc_themes:
- Land
- Oceans
# non-mandatory
documentation_link: https://deepesdl.readthedocs.io/en/latest/datasets/hydrology-1D-0.009deg-100x60x60-3.0.2.zarr/
access_link: s3://test
dataset_status: completed
osc_region: global
cf_parameter:
- name: hydrology
```

`dataset_id` has to be a valid dataset ID from the `deep-esdl-public` S3 bucket or your team bucket.
5 changes: 1 addition & 4 deletions deep_code/cli/publish.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,7 @@


@click.command(name="publish-dataset")
@click.argument(
"dataset_config",
type=click.Path(exists=True)
)
@click.argument("dataset_config", type=click.Path(exists=True))

Check warning on line 13 in deep_code/cli/publish.py

View check run for this annotation

Codecov / codecov/patch

deep_code/cli/publish.py#L13

Added line #L13 was not covered by tests
def publish_dataset(dataset_config):
"""Request publishing a dataset to the open science catalogue.
"""
Expand Down
22 changes: 11 additions & 11 deletions deep_code/tests/tools/test_publish.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,14 +61,14 @@ def test_publish_dataset_success(
github-token: test-token
"""
dataset_yaml_content = """
dataset-id: test-dataset
collection-id: test-collection
documentation-link: http://example.com/doc
access-link: http://example.com/access
dataset-status: ongoing
dataset-region: Global
dataset-theme: ["climate"]
cf-parameter: []
dataset_id: test-dataset
collection_id: test-collection
documentation_link: http://example.com/doc
access_link: http://example.com/access
dataset_status: ongoing
dataset_region: Global
osc_theme: ["climate"]
cf_parameter: []
"""
mock_fsspec_open.side_effect = [
mock_open(read_data=git_yaml_content)(),
Expand Down Expand Up @@ -102,16 +102,16 @@ def test_publish_dataset_success(
"links": [],
"stac_version": "1.0.0",
}
with patch("deep_code.tools.publish.OSCProductSTACGenerator") as mock_generator:
mock_generator.return_value.build_stac_collection.return_value = (
with patch("deep_code.tools.publish.OSCDatasetSTACGenerator") as mock_generator:
mock_generator.return_value.build_dataset_stac_collection.return_value = (
mock_collection
)

# Instantiate & publish
publisher = DatasetPublisher()
publisher.publish_dataset("/fake/path/to/dataset-config.yaml")

# 6Assert that we called git clone with /tmp/temp_repo
# Assert that we called git clone with /tmp/temp_repo
# Because expanduser("~") is now patched to /tmp, the actual path is /tmp/temp_repo
auth_url = "https://test-user:test-token@github.com/test-user/open-science-catalog-metadata-testing.git"
mock_subprocess_run.assert_any_call(
Expand Down
40 changes: 27 additions & 13 deletions deep_code/tests/utils/test_dataset_stac_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from unittest.mock import patch, MagicMock
from xarray import Dataset

from deep_code.utils.dataset_stac_generator import OSCProductSTACGenerator
from deep_code.utils.dataset_stac_generator import OSCDatasetSTACGenerator


class TestOSCProductSTACGenerator(unittest.TestCase):
Expand All @@ -28,15 +28,31 @@ def setUp(self, mock_data_store):
},
attrs={"description": "Mock dataset for testing.", "title": "Mock Dataset"},
data_vars={
"var1": (("time", "lat", "lon"), np.random.rand(2, 5, 10)),
"var2": (("time", "lat", "lon"), np.random.rand(2, 5, 10)),
"var1": (
("time", "lat", "lon"),
np.random.rand(2, 5, 10),
{
"description": "dummy",
"standard_name": "var1",
"gcmd_keyword_url": "https://dummy",
},
),
"var2": (
("time", "lat", "lon"),
np.random.rand(2, 5, 10),
{
"description": "dummy",
"standard_name": "var2",
"gcmd_keyword_url": "https://dummy",
},
),
},
)
mock_store = MagicMock()
mock_store.open_data.return_value = self.mock_dataset
mock_data_store.return_value = mock_store

self.generator = OSCProductSTACGenerator(
self.generator = OSCDatasetSTACGenerator(
dataset_id="mock-dataset-id",
collection_id="mock-collection-id",
access_link="s3://mock-bucket/mock-dataset",
Expand Down Expand Up @@ -66,7 +82,7 @@ def test_get_temporal_extent(self):

def test_get_variables(self):
"""Test variable extraction."""
variables = self.generator._get_variables()
variables = self.generator.get_variable_ids()
self.assertEqual(variables, ["var1", "var2"])

def test_get_general_metadata(self):
Expand All @@ -78,7 +94,7 @@ def test_get_general_metadata(self):
@patch("pystac.Collection.set_self_href")
def test_build_stac_collection(self, mock_set_self_href, mock_add_link):
"""Test STAC collection creation."""
collection = self.generator.build_stac_collection()
collection = self.generator.build_dataset_stac_collection()
self.assertIsInstance(collection, Collection)
self.assertEqual(collection.id, "mock-collection-id")
self.assertEqual(collection.description, "Mock dataset for testing.")
Expand All @@ -104,19 +120,17 @@ def test_invalid_temporal_extent(self):
with self.assertRaises(ValueError):
self.generator._get_temporal_extent()


class TestOpenDataset(unittest.TestCase):
@patch("deep_code.utils.dataset_stac_generator.new_data_store")
@patch("deep_code.utils.dataset_stac_generator.logging.getLogger")
def test_open_dataset_success_public_store(self, mock_logger, mock_new_data_store):
"""Test dataset opening with the public store configuration."""
# Create a mock store and mock its `open_data` method
mock_store = MagicMock()
mock_new_data_store.return_value = mock_store
mock_store.open_data.return_value = "mock_dataset"
mock_store.open_data.return_value = self.mock_dataset

# Instantiate the generator (this will implicitly call _open_dataset)
generator = OSCProductSTACGenerator("mock-dataset-id", "mock-collection-id")
generator = OSCDatasetSTACGenerator("mock-dataset-id", "mock-collection-id")

# Validate that the dataset is assigned correctly
self.assertEqual(generator.dataset, "mock_dataset")
Expand Down Expand Up @@ -151,13 +165,13 @@ def test_open_dataset_success_authenticated_store(
mock_store,
# Second call (authenticated store) returns a mock store
]
mock_store.open_data.return_value = "mock_dataset"
mock_store.open_data.return_value = self.mock_dataset

os.environ["S3_USER_STORAGE_BUCKET"] = "mock-bucket"
os.environ["S3_USER_STORAGE_KEY"] = "mock-key"
os.environ["S3_USER_STORAGE_SECRET"] = "mock-secret"

generator = OSCProductSTACGenerator("mock-dataset-id", "mock-collection-id")
generator = OSCDatasetSTACGenerator("mock-dataset-id", "mock-collection-id")

# Validate that the dataset was successfully opened with the authenticated store
self.assertEqual(generator.dataset, "mock_dataset")
Expand Down Expand Up @@ -195,7 +209,7 @@ def test_open_dataset_failure(self, mock_logger, mock_new_data_store):
os.environ["S3_USER_STORAGE_SECRET"] = "mock-secret"

with self.assertRaises(ValueError) as context:
OSCProductSTACGenerator("mock-dataset-id", "mock-collection-id")
OSCDatasetSTACGenerator("mock-dataset-id", "mock-collection-id")

self.assertIn(
"Failed to open Zarr dataset with ID mock-dataset-id",
Expand Down
60 changes: 46 additions & 14 deletions deep_code/tools/publish.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,10 @@
import fsspec
import logging
import yaml
from pathlib import Path

from deep_code.constants import OSC_REPO_OWNER, OSC_REPO_NAME, OSC_BRANCH_NAME
from deep_code.utils.dataset_stac_generator import OSCProductSTACGenerator
from deep_code.utils.dataset_stac_generator import OSCDatasetSTACGenerator
from deep_code.utils.github_automation import GitHubAutomation

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -50,14 +51,14 @@
with fsspec.open(dataset_config_path, "r") as file:
dataset_config = yaml.safe_load(file)

dataset_id = dataset_config.get("dataset-id")
collection_id = dataset_config.get("collection-id")
documentation_link = dataset_config.get("documentation-link")
access_link = dataset_config.get("access-link")
dataset_status = dataset_config.get("dataset-status")
osc_region = dataset_config.get("dataset-region")
dataset_theme = dataset_config.get("dataset-theme")
cf_params = dataset_config.get("cf-parameter")
dataset_id = dataset_config.get("dataset_id")
collection_id = dataset_config.get("collection_id")
documentation_link = dataset_config.get("documentation_link")
access_link = dataset_config.get("access_link")
dataset_status = dataset_config.get("dataset_status")
osc_region = dataset_config.get("osc_region")
osc_themes = dataset_config.get("osc_themes")
cf_params = dataset_config.get("cf_parameter")

if not dataset_id or not collection_id:
raise ValueError(
Expand All @@ -67,31 +68,62 @@

try:
logger.info("Generating STAC collection...")
generator = OSCProductSTACGenerator(
generator = OSCDatasetSTACGenerator(
dataset_id=dataset_id,
collection_id=collection_id,
documentation_link=documentation_link,
access_link=access_link,
osc_status=dataset_status,
osc_region=osc_region,
osc_themes=dataset_theme,
osc_themes=osc_themes,
cf_params=cf_params,
)
collection = generator.build_stac_collection()
# get variables from the datasets
variable_ids = generator.get_variable_ids()
# build STAC collection for the dataset
ds_collection = generator.build_dataset_stac_collection()

file_path = f"products/{collection_id}/collection.json"
logger.info("Automating GitHub tasks...")
self.github_automation.fork_repository()
self.github_automation.clone_repository()
OSC_NEW_BRANCH_NAME = OSC_BRANCH_NAME + "-" + collection_id
self.github_automation.create_branch(OSC_NEW_BRANCH_NAME)
self.github_automation.add_file(file_path, collection.to_dict())

for var_id in variable_ids:
var_file_path = f"variables/{var_id}/catalog.json"
if not self.github_automation.file_exists(var_file_path):
logger.info(

Check warning on line 96 in deep_code/tools/publish.py

View check run for this annotation

Codecov / codecov/patch

deep_code/tools/publish.py#L94-L96

Added lines #L94 - L96 were not covered by tests
f"Variable catalog for {var_id} does not exist. Creating..."
)
var_metadata = generator.variables_metadata.get(var_id)
var_catalog = generator.build_variable_catalog(var_metadata)
self.github_automation.add_file(

Check warning on line 101 in deep_code/tools/publish.py

View check run for this annotation

Codecov / codecov/patch

deep_code/tools/publish.py#L99-L101

Added lines #L99 - L101 were not covered by tests
var_file_path, var_catalog.to_dict()
)
else:
logger.info(

Check warning on line 105 in deep_code/tools/publish.py

View check run for this annotation

Codecov / codecov/patch

deep_code/tools/publish.py#L105

Added line #L105 was not covered by tests
f"Variable catalog already exists for {var_id}. so add the "
f"product as child link..."
)
full_path = (

Check warning on line 109 in deep_code/tools/publish.py

View check run for this annotation

Codecov / codecov/patch

deep_code/tools/publish.py#L109

Added line #L109 was not covered by tests
Path(self.github_automation.local_clone_dir) / var_file_path
)
self.github_automation.add_file(

Check warning on line 112 in deep_code/tools/publish.py

View check run for this annotation

Codecov / codecov/patch

deep_code/tools/publish.py#L112

Added line #L112 was not covered by tests
var_file_path,
generator.update_existing_variable_catalog(
full_path, var_id
).to_dict(),
)

self.github_automation.add_file(file_path, ds_collection.to_dict())

self.github_automation.commit_and_push(
OSC_NEW_BRANCH_NAME, f"Add new collection:{collection_id}"
)
pr_url = self.github_automation.create_pull_request(
OSC_NEW_BRANCH_NAME,
f"Add new collection",
f"Add new dataset collection",
"This PR adds a new collection to the repository.",
)

Expand Down
Loading
Loading