diff --git a/CHANGES.md b/CHANGES.md
index 1e0b957..e638dad 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -13,4 +13,14 @@
 - Support publishing to testing,staging and production repositories of
   open-science-metadata.
 - Implemented new cli command `generate-config` to generate starter templates for
-  config files.
\ No newline at end of file
+  config files.
+
+## Changes in 0.1.3
+
+- _Version bump only_; no code or functionality changes. This release was
+  republished to update the package on PyPI.
+
+## Changes in 0.1.4
+
+- Implemented custom xrlint rules to validate dataset metadata, as required to
+  generate a STAC collection that is valid for the ESA Open Science Catalog.
\ No newline at end of file
diff --git a/deep_code/tests/utils/test_custom_xrlint_rules.py b/deep_code/tests/utils/test_custom_xrlint_rules.py
new file mode 100644
index 0000000..6a36338
--- /dev/null
+++ b/deep_code/tests/utils/test_custom_xrlint_rules.py
@@ -0,0 +1,73 @@
+# Copyright © 2025 Brockmann Consult GmbH.
+# This software is distributed under the terms and conditions of the
+# MIT license (https://mit-license.org/).
+
+import unittest
+
+import xarray as xr
+from xrlint.testing import RuleTest, RuleTester
+
+from deep_code.utils.custom_xrlint_rules import (
+    DatasetDescriptionRule,
+    VariableGcmdKeywordUrlRule,
+)
+
+
+class TestDeepCodePlugin(unittest.TestCase):
+    def setUp(self):
+        """Set up test datasets."""
+        # Valid dataset with all required metadata
+        self.valid_dataset = xr.Dataset(
+            data_vars={
+                "temperature": (("time", "lat", "lon"), [[[300, 301], [302, 303]]]),
+                "precipitation": (("time", "lat", "lon"), [[[10, 20], [30, 40]]]),
+            },
+            coords={"time": [1], "lat": [0, 1], "lon": [0, 1]},
+            attrs={
+                "description": "Test climate dataset",
+                "title": "Climate Dataset 2025",
+            },
+        )
+        self.valid_dataset["temperature"].attrs[
+            "gcmd_keyword_url"
+        ] = "https://gcmd.nasa.gov/KeywordViewer/temperature"
+        self.valid_dataset["temperature"].attrs["units"] = "K"
+        self.valid_dataset["precipitation"].attrs[
+            "gcmd_keyword_url"
+        ] = "https://gcmd.nasa.gov/KeywordViewer/precipitation"
+        self.valid_dataset["precipitation"].attrs["units"] = "mm"
+
+        # Invalid dataset missing required metadata
+        self.invalid_dataset = xr.Dataset(
+            data_vars={
+                "temperature": (("time", "lat", "lon"), [[[300, 301], [302, 303]]]),
+                "precipitation": (("time", "lat", "lon"), [[[10, 20], [30, 40]]]),
+            },
+            coords={"time": [1], "lat": [0, 1], "lon": [0, 1]},
+            attrs={},
+        )
+        self.invalid_dataset["temperature"].attrs[
+            "gcmd_keyword_url"
+        ] = "https://gcmd.nasa.gov/KeywordViewer/temperature"
+        self.invalid_dataset["temperature"].attrs["units"] = "K"
+        # Intentionally omit gcmd_keyword_url and units for precipitation
+
+        self.tester = RuleTester()
+
+    def test_dataset_description(self):
+        """Test DatasetDescriptionRule with valid and invalid datasets."""
+        self.tester.run(
+            "dataset-description",
+            DatasetDescriptionRule,
+            valid=[RuleTest(dataset=self.valid_dataset)],
+            invalid=[RuleTest(dataset=self.invalid_dataset, expected=1)],
+        )
+
+    def test_variable_gcmd_keyword_url(self):
+        """Test VariableGcmdKeywordUrlRule with valid and invalid datasets."""
+        self.tester.run(
+            "variable-gcmd-keyword-url",
+            VariableGcmdKeywordUrlRule,
+            valid=[RuleTest(dataset=self.valid_dataset)],
+            invalid=[RuleTest(dataset=self.invalid_dataset, expected=1)],
+        )
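For context, both rules under test inspect attributes only: `dataset-description` requires a `description` key in `dataset.attrs`, and `variable-gcmd-keyword-url` requires a `gcmd_keyword_url` attribute on every data variable. A minimal sketch of a dataset that should pass both checks (the attribute values are placeholders):

```python
import xarray as xr

# Minimal dataset that should satisfy both deepcode rules;
# the attribute values below are placeholders.
ds = xr.Dataset(
    data_vars={"temperature": (("time",), [300.0])},
    coords={"time": [0]},
    attrs={"description": "Example dataset"},  # checked by dataset-description
)
# variable-gcmd-keyword-url checks the attrs of each data variable
ds["temperature"].attrs["gcmd_keyword_url"] = (
    "https://gcmd.nasa.gov/KeywordViewer/temperature"
)
```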
diff --git a/deep_code/tests/utils/test_dataset_stac_generator.py b/deep_code/tests/utils/test_dataset_stac_generator.py
index e8864b1..464c538 100644
--- a/deep_code/tests/utils/test_dataset_stac_generator.py
+++ b/deep_code/tests/utils/test_dataset_stac_generator.py
@@ -1,17 +1,27 @@
-import os
+#!/usr/bin/env python3
+# Copyright (c) 2025 by Brockmann Consult GmbH
+# Permissions are hereby granted under the terms of the MIT License:
+# https://opensource.org/licenses/MIT.
+
 import unittest
 from datetime import datetime
 from unittest.mock import MagicMock, patch
 
 import numpy as np
-from pystac import Collection
-from xarray import Dataset
+from pystac import Catalog, Collection
+from xarray import DataArray, Dataset
 
-from deep_code.utils.dataset_stac_generator import OscDatasetStacGenerator
+from deep_code.constants import (
+    DEEPESDL_COLLECTION_SELF_HREF,
+    OSC_THEME_SCHEME,
+    PRODUCT_BASE_CATALOG_SELF_HREF,
+    VARIABLE_BASE_CATALOG_SELF_HREF,
+)
+from deep_code.utils.dataset_stac_generator import OscDatasetStacGenerator, Theme
 
 
 class TestOSCProductSTACGenerator(unittest.TestCase):
-    @patch("deep_code.utils.dataset_stac_generator.new_data_store")
+    @patch("deep_code.utils.dataset_stac_generator.open_dataset")
     def setUp(self, mock_data_store):
         """Set up a mock dataset and generator."""
         self.mock_dataset = Dataset(
@@ -50,7 +60,7 @@ def setUp(self, mock_data_store):
         )
         mock_store = MagicMock()
         mock_store.open_data.return_value = self.mock_dataset
-        mock_data_store.return_value = mock_store
+        mock_data_store.return_value = self.mock_dataset
 
         self.generator = OscDatasetStacGenerator(
             dataset_id="mock-dataset-id",
@@ -65,9 +75,8 @@ def setUp(self, mock_data_store):
     def test_open_dataset(self):
         """Test if the dataset is opened correctly."""
         self.assertIsInstance(self.generator.dataset, Dataset)
-        self.assertIn("lon", self.generator.dataset.coords)
-        self.assertIn("lat", self.generator.dataset.coords)
-        self.assertIn("time", self.generator.dataset.coords)
+        for coord in ("lon", "lat", "time"):
+            self.assertIn(coord, self.generator.dataset.coords)
 
     def test_get_spatial_extent(self):
         """Test spatial extent extraction."""
@@ -77,146 +86,93 @@ def test_get_temporal_extent(self):
         """Test temporal extent extraction."""
         extent = self.generator._get_temporal_extent()
-        expected_intervals = [datetime(2023, 1, 1, 0, 0), datetime(2023, 1, 2, 0, 0)]
-        self.assertEqual(extent.intervals[0], expected_intervals)
+        # TemporalExtent.intervals is a list of [start, end]
+        interval = extent.intervals[0]
+        self.assertEqual(interval[0], datetime(2023, 1, 1, 0, 0))
+        self.assertEqual(interval[1], datetime(2023, 1, 2, 0, 0))
 
     def test_get_variables(self):
-        """Test variable extraction."""
-        variables = self.generator.get_variable_ids()
-        self.assertEqual(variables, ["var1", "var2"])
+        """Test variable ID extraction."""
+        vars_ = self.generator.get_variable_ids()
+        self.assertCountEqual(vars_, ["var1", "var2"])
 
     def test_get_general_metadata(self):
         """Test general metadata extraction."""
-        metadata = self.generator._get_general_metadata()
-        self.assertEqual(metadata["description"], "Mock dataset for testing.")
-
-    @patch("pystac.Collection.add_link")
-    @patch("pystac.Collection.set_self_href")
-    def test_build_stac_collection(self, mock_set_self_href, mock_add_link):
-        """Test STAC collection creation."""
-        collection = self.generator.build_dataset_stac_collection()
-        self.assertIsInstance(collection, Collection)
-        self.assertEqual(collection.id, "mock-collection-id")
-        self.assertEqual(collection.description, "Mock dataset for testing.")
-        self.assertEqual(
-            collection.extent.spatial.bboxes[0], [-180.0, -90.0, 180.0, 90.0]
-        )
-        self.assertEqual(
-            collection.extent.temporal.intervals[0],
-            [datetime(2023, 1, 1, 0, 0), datetime(2023, 1, 2, 0, 0)],
-        )
-        mock_set_self_href.assert_called_once()
-        mock_add_link.assert_called()
-
-    def test_invalid_spatial_extent(self):
-        """Test spatial extent extraction with missing coordinates."""
-        self.generator.dataset = Dataset(coords={"x": [], "y": []})
-        with self.assertRaises(ValueError):
-            self.generator._get_spatial_extent()
-
-    def test_invalid_temporal_extent(self):
-        """Test temporal extent extraction with missing time."""
-        self.generator.dataset = Dataset(coords={})
-        with self.assertRaises(ValueError):
-            self.generator._get_temporal_extent()
-
-    @patch("deep_code.utils.dataset_stac_generator.new_data_store")
-    @patch("deep_code.utils.dataset_stac_generator.logging.getLogger")
-    def test_open_dataset_success_public_store(self, mock_logger, mock_new_data_store):
-        """Test dataset opening with the public store configuration."""
-        # Create a mock store and mock its `open_data` method
-        mock_store = MagicMock()
-        mock_new_data_store.return_value = mock_store
-        mock_store.open_data.return_value = self.mock_dataset
-
-        # Instantiate the generator (this will implicitly call _open_dataset)
-        generator = OscDatasetStacGenerator("mock-dataset-id", "mock-collection-id")
-
-        # Validate that the dataset is assigned correctly
-        self.assertEqual(generator.dataset, "mock_dataset")
-
-        # Validate that `new_data_store` was called once with the correct parameters
-        mock_new_data_store.assert_called_once_with(
-            "s3", root="deep-esdl-public", storage_options={"anon": True}
-        )
-
-        # Ensure `open_data` was called once on the returned store
-        mock_store.open_data.assert_called_once_with("mock-dataset-id")
-
-        # Validate logging behavior
-        mock_logger().info.assert_any_call(
-            "Attempting to open dataset with configuration: Public store"
-        )
-        mock_logger().info.assert_any_call(
-            "Successfully opened dataset with configuration: Public store"
-        )
-
-    @patch("deep_code.utils.dataset_stac_generator.new_data_store")
-    @patch("deep_code.utils.dataset_stac_generator.logging.getLogger")
-    def test_open_dataset_success_authenticated_store(
-        self, mock_logger, mock_new_data_store
-    ):
-        """Test dataset opening with the authenticated store configuration."""
-        # Simulate public store failure
-        mock_store = MagicMock()
-        mock_new_data_store.side_effect = [
-            Exception("Public store failure"),
-            # First call (public store) raises an exception
-            mock_store,
-            # Second call (authenticated store) returns a mock store
-        ]
-        mock_store.open_data.return_value = self.mock_dataset
-
-        os.environ["S3_USER_STORAGE_BUCKET"] = "mock-bucket"
-        os.environ["S3_USER_STORAGE_KEY"] = "mock-key"
-        os.environ["S3_USER_STORAGE_SECRET"] = "mock-secret"
-
-        generator = OscDatasetStacGenerator("mock-dataset-id", "mock-collection-id")
-
-        # Validate that the dataset was successfully opened with the authenticated store
-        self.assertEqual(generator.dataset, "mock_dataset")
-        self.assertEqual(mock_new_data_store.call_count, 2)
-
-        # Validate calls to `new_data_store`
-        mock_new_data_store.assert_any_call(
-            "s3", root="deep-esdl-public", storage_options={"anon": True}
-        )
-        mock_new_data_store.assert_any_call(
-            "s3",
-            root="mock-bucket",
-            storage_options={"anon": False, "key": "mock-key", "secret": "mock-secret"},
-        )
-
-        # Validate logging calls
-        mock_logger().info.assert_any_call(
-            "Attempting to open dataset with configuration: Public store"
-        )
-        mock_logger().info.assert_any_call(
-            "Attempting to open dataset with configuration: Authenticated store"
-        )
-        mock_logger().info.assert_any_call(
-            "Successfully opened dataset with configuration: Authenticated store"
-        )
-
-    @patch("deep_code.utils.dataset_stac_generator.new_data_store")
-    @patch("deep_code.utils.dataset_stac_generator.logging.getLogger")
-    def test_open_dataset_failure(self, mock_logger, mock_new_data_store):
-        """Test dataset opening failure with all configurations."""
-        # Simulate all store failures
-        mock_new_data_store.side_effect = Exception("Store failure")
-        os.environ["S3_USER_STORAGE_BUCKET"] = "mock-bucket"
-        os.environ["S3_USER_STORAGE_KEY"] = "mock-key"
-        os.environ["S3_USER_STORAGE_SECRET"] = "mock-secret"
-
-        with self.assertRaises(ValueError) as context:
-            OscDatasetStacGenerator("mock-dataset-id", "mock-collection-id")
-
-        self.assertIn(
-            "Failed to open Zarr dataset with ID mock-dataset-id",
-            str(context.exception),
-        )
-        self.assertIn("Public store, Authenticated store", str(context.exception))
-        self.assertEqual(mock_new_data_store.call_count, 2)
+        meta = self.generator._get_general_metadata()
+        self.assertEqual(meta.get("description"), "Mock dataset for testing.")
+
+    def test_extract_metadata_for_variable(self):
+        """Test single variable metadata extraction."""
+        da: DataArray = self.mock_dataset.data_vars["var1"]
+        var_meta = self.generator.extract_metadata_for_variable(da)
+        self.assertEqual(var_meta["variable_id"], "var1")
+        self.assertEqual(var_meta["description"], "dummy")
+        self.assertEqual(var_meta["gcmd_keyword_url"], "https://dummy")
+
+    def test_get_variables_metadata(self):
+        """Test metadata dict for all variables."""
+        meta_dict = self.generator.get_variables_metadata()
+        self.assertIn("var1", meta_dict)
+        self.assertIn("var2", meta_dict)
+        self.assertIsInstance(meta_dict["var1"], dict)
+
+    def test_build_theme(self):
+        """Test Theme builder static method."""
+        themes = ["a", "b"]
+        theme_obj: Theme = OscDatasetStacGenerator.build_theme(themes)
+        self.assertEqual(theme_obj.scheme, OSC_THEME_SCHEME)
+        ids = [tc.id for tc in theme_obj.concepts]
+        self.assertListEqual(ids, ["a", "b"])
+
+    @patch.object(OscDatasetStacGenerator, "_add_gcmd_link_to_var_catalog")
+    @patch.object(OscDatasetStacGenerator, "add_themes_as_related_links_var_catalog")
+    def test_build_variable_catalog(self, mock_add_themes, mock_add_gcmd):
+        """Test building of variable-level STAC catalog."""
+        var_meta = self.generator.variables_metadata["var1"]
+        catalog = self.generator.build_variable_catalog(var_meta)
+        self.assertIsInstance(catalog, Catalog)
+        self.assertEqual(catalog.id, "var1")
+        # Title should be capitalized
+        self.assertEqual(catalog.title, "Var1")
+        # Self href ends with var1/catalog.json
+        self.assertTrue(catalog.self_href.endswith("/var1/catalog.json"))
+
+    @patch("pystac.Catalog.from_file")
+    def test_update_product_base_catalog(self, mock_from_file):
+        """Test linking product catalog."""
+        mock_cat = MagicMock(spec=Catalog)
+        mock_from_file.return_value = mock_cat
+
+        result = self.generator.update_product_base_catalog("path.json")
+        self.assertIs(result, mock_cat)
+        mock_cat.add_link.assert_called_once()
+        mock_cat.set_self_href.assert_called_once_with(PRODUCT_BASE_CATALOG_SELF_HREF)
+
+    @patch("pystac.Catalog.from_file")
+    def test_update_variable_base_catalog(self, mock_from_file):
+        """Test linking variable base catalog."""
+        mock_cat = MagicMock(spec=Catalog)
+        mock_from_file.return_value = mock_cat
+
+        vars_ = ["v1", "v2"]
+        result = self.generator.update_variable_base_catalog("vars.json", vars_)
+        self.assertIs(result, mock_cat)
+        # Expect one add_link per variable
+        self.assertEqual(mock_cat.add_link.call_count, len(vars_))
+        mock_cat.set_self_href.assert_called_once_with(VARIABLE_BASE_CATALOG_SELF_HREF)
+
+    @patch("pystac.Collection.from_file")
+    def test_update_deepesdl_collection(self, mock_from_file):
+        """Test updating DeepESDL collection."""
+        mock_coll = MagicMock(spec=Collection)
+        mock_from_file.return_value = mock_coll
+
+        result = self.generator.update_deepesdl_collection("deep.json")
+        self.assertIs(result, mock_coll)
+        # Expect child and theme related links for each theme
+        calls = mock_coll.add_link.call_count
+        self.assertGreaterEqual(calls, 1 + len(self.generator.osc_themes))
+        mock_coll.set_self_href.assert_called_once_with(DEEPESDL_COLLECTION_SELF_HREF)
 
 
 class TestFormatString(unittest.TestCase):
diff --git a/deep_code/tests/utils/test_helper.py b/deep_code/tests/utils/test_helper.py
new file mode 100644
index 0000000..a7c8c3f
--- /dev/null
+++ b/deep_code/tests/utils/test_helper.py
@@ -0,0 +1,157 @@
+#!/usr/bin/env python3
+# Copyright (c) 2025 by Brockmann Consult GmbH
+# Permissions are hereby granted under the terms of the MIT License:
+# https://opensource.org/licenses/MIT.
+
+import os
+import unittest
+from unittest.mock import MagicMock, call, patch
+
+import xarray as xr
+
+from deep_code.utils.helper import open_dataset
+
+
+def make_dummy_dataset():
+    """Create a simple xarray.Dataset for testing."""
+    return xr.Dataset(
+        coords={"time": [0, 1, 2]}, data_vars={"x": (("time",), [10, 20, 30])}
+    )
+
+
+class TestOpenDataset(unittest.TestCase):
+    @patch("deep_code.utils.helper.logging.getLogger")
+    @patch("deep_code.utils.helper.new_data_store")
+    def test_success_public_store(self, mock_new_store, mock_get_logger):
+        """Should open dataset with the public store on first try."""
+        dummy = make_dummy_dataset()
+        mock_store = MagicMock()
+        mock_store.open_data.return_value = dummy
+        mock_new_store.return_value = mock_store
+        mock_logger = MagicMock()
+        mock_get_logger.return_value = mock_logger
+
+        result = open_dataset("test-id")
+
+        self.assertIs(result, dummy)
+        mock_new_store.assert_called_once_with(
+            "s3", root="deep-esdl-public", storage_options={"anon": True}
+        )
+        mock_logger.info.assert_any_call(
+            "Attempting to open dataset 'test-id' with configuration: Public store"
+        )
+        mock_logger.info.assert_any_call(
+            "Successfully opened dataset 'test-id' with configuration: Public store"
+        )
+
+    @patch("deep_code.utils.helper.new_data_store")
+    @patch("deep_code.utils.helper.logging.getLogger")
+    def test_open_dataset_success_authenticated_store(
+        self, mock_get_logger, mock_new_store
+    ):
+        """Test fallback to authenticated store when public store fails."""
+        mock_store = MagicMock()
+        mock_new_store.side_effect = [Exception("Public store failure"), mock_store]
+        mock_store.open_data.return_value = make_dummy_dataset()
+
+        os.environ["S3_USER_STORAGE_BUCKET"] = "mock-bucket"
+        os.environ["S3_USER_STORAGE_KEY"] = "mock-key"
+        os.environ["S3_USER_STORAGE_SECRET"] = "mock-secret"
+
+        ds = open_dataset("my-id", logger=mock_get_logger())
+
+        self.assertIsInstance(ds, xr.Dataset)
+
+        # And new_data_store should have been called twice with exactly these params
+        expected_calls = [
+            call("s3", root="deep-esdl-public", storage_options={"anon": True}),
+            call(
+                "s3",
+                root="mock-bucket",
+                storage_options={
+                    "anon": False,
+                    "key": "mock-key",
+                    "secret": "mock-secret",
+                },
+            ),
+        ]
+        mock_new_store.assert_has_calls(expected_calls, any_order=False)
+
+        # And the logger should have info about both attempts
+        logger = mock_get_logger()
+        logger.info.assert_any_call(
+            "Attempting to open dataset 'my-id' with configuration: Public store"
+        )
+        logger.info.assert_any_call(
+            "Attempting to open dataset 'my-id' with configuration: Authenticated store"
+        )
+        logger.info.assert_any_call(
+            "Successfully opened dataset 'my-id' with configuration: Authenticated store"
+        )
+
+    @patch("deep_code.utils.helper.logging.getLogger")
+    @patch("deep_code.utils.helper.new_data_store")
+    def test_all_stores_fail_raises(self, mock_new_store, mock_get_logger):
+        """Should raise ValueError if all stores fail."""
+        mock_new_store.side_effect = Exception("fail")
+        os.environ["S3_USER_STORAGE_BUCKET"] = "user-bucket"
+        os.environ["S3_USER_STORAGE_KEY"] = "key"
+        os.environ["S3_USER_STORAGE_SECRET"] = "secret"
+        mock_logger = MagicMock()
+        mock_get_logger.return_value = mock_logger
+
+        with self.assertRaises(ValueError) as ctx:
+            open_dataset("test-id")
+        msg = str(ctx.exception)
+        self.assertIn("Tried configurations: Public store, Authenticated store", msg)
+        self.assertIn("Last error: fail", msg)
+
+    @patch("deep_code.utils.helper.logging.getLogger")
+    @patch("deep_code.utils.helper.new_data_store")
+    def test_with_custom_configs(self, mock_new_store, mock_get_logger):
+        """Should use provided storage_configs instead of defaults."""
+        dummy = make_dummy_dataset()
+        mock_store = MagicMock()
+        mock_store.open_data.return_value = dummy
+        mock_new_store.return_value = mock_store
+        mock_logger = MagicMock()
+        mock_get_logger.return_value = mock_logger
+
+        custom_cfgs = [
+            {
+                "description": "Local store",
+                "params": {"storage_type": "file", "root": ".", "storage_options": {}},
+            }
+        ]
+
+        result = open_dataset("test-id", storage_configs=custom_cfgs)
+
+        self.assertIs(result, dummy)
+        mock_new_store.assert_called_once_with("file", root=".", storage_options={})
+        mock_logger.info.assert_any_call(
+            "Attempting to open dataset 'test-id' with configuration: Local store"
+        )
+        mock_logger.info.assert_any_call(
+            "Successfully opened dataset 'test-id' with configuration: Local store"
+        )
+
+    @patch("deep_code.utils.helper.logging.getLogger")
+    @patch("deep_code.utils.helper.new_data_store")
+    def test_uses_provided_logger(self, mock_new_store, mock_get_logger):
+        """Should use the logger provided by the caller."""
+        dummy = make_dummy_dataset()
+        mock_store = MagicMock()
+        mock_store.open_data.return_value = dummy
+        mock_new_store.return_value = mock_store
+        custom_logger = MagicMock()
+        mock_get_logger.side_effect = AssertionError("getLogger should not be used")
+
+        result = open_dataset("test-id", logger=custom_logger)
+
+        self.assertIs(result, dummy)
+        custom_logger.info.assert_any_call(
+            "Attempting to open dataset 'test-id' with configuration: Public store"
+        )
+        custom_logger.info.assert_any_call(
+            "Successfully opened dataset 'test-id' with configuration: Public store"
+        )
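The `test_with_custom_configs` case above also pins down the expected shape of a custom configuration entry: a `description` plus the `params` that are passed through to `new_data_store`. A sketch of opening a dataset from a local directory under that assumption (the dataset name and root path are illustrative):

```python
from deep_code.utils.helper import open_dataset

# Each entry needs a "description" and the "params" forwarded to new_data_store();
# the local root path and dataset name here are illustrative.
local_cfg = [
    {
        "description": "Local store",
        "params": {"storage_type": "file", "root": ".", "storage_options": {}},
    }
]
ds = open_dataset("my-dataset.zarr", storage_configs=local_cfg)
```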
diff --git a/deep_code/tools/check.py b/deep_code/tools/check.py
deleted file mode 100644
index 3b54c65..0000000
--- a/deep_code/tools/check.py
+++ /dev/null
@@ -1,4 +0,0 @@
-"""
-Verify the readiness of a dataset or an existing workflow repository for experiment
-publication by identifying any issues or missing components
-"""
diff --git a/deep_code/tools/lint.py b/deep_code/tools/lint.py
new file mode 100644
index 0000000..ec5d4ea
--- /dev/null
+++ b/deep_code/tools/lint.py
@@ -0,0 +1,37 @@
+import xarray as xr
+from xrlint.linter import new_linter
+from xrlint.result import Result
+
+from deep_code.utils.custom_xrlint_rules import export_config
+from deep_code.utils.helper import open_dataset
+
+
+class LintDataset:
+    """Lint an xarray dataset using the xrlint library.
+
+    Args:
+        dataset_id (str | None): ID of a Zarr dataset in the DeepESDL public or team bucket.
+        dataset (xr.Dataset | None): In-memory xarray.Dataset instance.
+
+    Note:
+        One of `dataset_id` or `dataset` must be provided.
+    """
+
+    def __init__(
+        self, dataset_id: str | None = None, dataset: xr.Dataset | None = None
+    ):
+        if not dataset_id and not dataset:
+            raise ValueError("You must provide either `dataset_id` or `dataset`.")
+        self.dataset_id = dataset_id
+        self.dataset = dataset
+
+    def lint_dataset(self) -> Result:
+        if self.dataset is not None:
+            ds = self.dataset
+        elif self.dataset_id is not None:
+            ds = open_dataset(self.dataset_id)
+        else:
+            raise RuntimeError("No dataset to lint.")
+
+        linter = new_linter(*export_config())
+        return linter.validate(ds)
diff --git a/deep_code/utils/custom_xrlint_rules.py b/deep_code/utils/custom_xrlint_rules.py
new file mode 100644
index 0000000..d003840
--- /dev/null
+++ b/deep_code/utils/custom_xrlint_rules.py
@@ -0,0 +1,78 @@
+# Copyright © 2025 Brockmann Consult GmbH.
+# This software is distributed under the terms and conditions of the
+# MIT license (https://mit-license.org/).
+
+"""
+This module defines the deepcode plugin for XRLint, which validates
+metadata required for dataset publication to a catalog. It checks for:
+- A 'description' attribute in dataset.attrs
+- A 'gcmd_keyword_url' attribute in each variable's attrs
+"""
+
+from xrlint.node import DatasetNode, VariableNode
+from xrlint.plugin import new_plugin
+from xrlint.rule import RuleContext, RuleOp
+
+plugin = new_plugin(name="deepcode", version="1.0.0")
+
+
+@plugin.define_rule("dataset-description")
+class DatasetDescriptionRule(RuleOp):
+    """Ensures the dataset has a 'description' attribute."""
+
+    def validate_dataset(self, ctx: RuleContext, node: DatasetNode):
+        if "description" not in node.dataset.attrs:
+            ctx.report(
+                "Dataset missing required 'description' attribute.",
+                suggestions=["Add a 'description' attribute to dataset.attrs."],
+            )
+
+
+@plugin.define_rule("variable-gcmd-keyword-url")
+class VariableGcmdKeywordUrlRule(RuleOp):
+    """Ensures all variables have a 'gcmd_keyword_url' attribute."""
+
+    def validate_variable(self, ctx: RuleContext, node: VariableNode):
+        if node.name not in ctx.dataset.data_vars:
+            return
+
+        if "gcmd_keyword_url" not in node.array.attrs:
+            ctx.report(f"Variable '{node.name}' missing 'gcmd_keyword_url' attribute.")
+
+
+# Define the recommended ruleset for this plugin
+plugin.define_config(
+    "recommended",
+    [
+        {
+            "rules": {
+                "deepcode/variable-gcmd-keyword-url": "error",
+                "deepcode/dataset-description": "error",
+            }
+        }
+    ],
+)
+
+
+def export_config() -> list:
+    """
+    Export the plugin configuration to be consumed by the XRLint Linter.
+
+    Returns
+    -------
+    list
+        A list of plugin config dictionaries and rule presets.
+    """
+    return [
+        {"plugins": {"deepcode": plugin}},
+        "recommended",
+        {
+            "rules": {
+                "content-desc": "off",
+                "no-empty-attrs": "off",
+                "conventions": "off",
+                "time-coordinate": "off",
+            }
+        },
+        "deepcode/recommended",
+    ]
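Taken together, the new `lint.py` tool and the plugin above wire the custom rules into xrlint's linter. A minimal usage sketch (the dataset ID is a placeholder; an in-memory dataset works as well):

```python
import xarray as xr

from deep_code.tools.lint import LintDataset

# Lint an in-memory dataset...
ds = xr.Dataset(attrs={"description": "Example dataset"})
result = LintDataset(dataset=ds).lint_dataset()

# ...or lint by ID; "my-dataset.zarr" is a placeholder for a Zarr dataset in the
# DeepESDL public or team bucket.
result = LintDataset(dataset_id="my-dataset.zarr").lint_dataset()
```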
diff --git a/deep_code/utils/dataset_stac_generator.py b/deep_code/utils/dataset_stac_generator.py
index f444f70..3e96a6e 100644
--- a/deep_code/utils/dataset_stac_generator.py
+++ b/deep_code/utils/dataset_stac_generator.py
@@ -4,12 +4,10 @@
 # https://opensource.org/licenses/MIT.
 
 import logging
-import os
 from datetime import datetime, timezone
 
 import pandas as pd
 from pystac import Catalog, Collection, Extent, Link, SpatialExtent, TemporalExtent
-from xcube.core.store import new_data_store
 
 from deep_code.constants import (
     DEEPESDL_COLLECTION_SELF_HREF,
@@ -17,6 +15,7 @@
     PRODUCT_BASE_CATALOG_SELF_HREF,
     VARIABLE_BASE_CATALOG_SELF_HREF,
 )
+from deep_code.utils.helper import open_dataset
 from deep_code.utils.ogc_api_record import Theme, ThemeConcept
 from deep_code.utils.osc_extension import OscExtension
 
@@ -58,69 +57,9 @@ def __init__(
         self.osc_missions = osc_missions or []
         self.cf_params = cf_params or {}
         self.logger = logging.getLogger(__name__)
-        self.dataset = self._open_dataset()
+        self.dataset = open_dataset(dataset_id=dataset_id, logger=self.logger)
         self.variables_metadata = self.get_variables_metadata()
 
-    def _open_dataset(self):
-        """Open the dataset using a S3 store as a xarray Dataset."""
-
-        store_configs = [
-            {
-                "description": "Public store",
-                "params": {
-                    "storage_type": "s3",
-                    "root": "deep-esdl-public",
-                    "storage_options": {"anon": True},
-                },
-            },
-            {
-                "description": "Authenticated store",
-                "params": {
-                    "storage_type": "s3",
-                    "root": os.environ.get("S3_USER_STORAGE_BUCKET"),
-                    "storage_options": {
-                        "anon": False,
-                        "key": os.environ.get("S3_USER_STORAGE_KEY"),
-                        "secret": os.environ.get("S3_USER_STORAGE_SECRET"),
-                    },
-                },
-            },
-        ]
-
-        # Iterate through configurations and attempt to open the dataset
-        last_exception = None
-        tried_configurations = []
-        for config in store_configs:
-            tried_configurations.append(config["description"])
-            try:
-                self.logger.info(
-                    f"Attempting to open dataset with configuration: "
-                    f"{config['description']}"
-                )
-                store = new_data_store(
-                    config["params"]["storage_type"],
-                    root=config["params"]["root"],
-                    storage_options=config["params"]["storage_options"],
-                )
-                dataset = store.open_data(self.dataset_id)
-                self.logger.info(
-                    f"Successfully opened dataset with configuration: "
-                    f"{config['description']}"
-                )
-                return dataset
-            except Exception as e:
-                self.logger.error(
-                    f"Failed to open dataset with configuration: "
-                    f"{config['description']}. Error: {e}"
-                )
-                last_exception = e
-
-        raise ValueError(
-            f"Failed to open Zarr dataset with ID {self.dataset_id}. "
-            f"Tried configurations: {', '.join(tried_configurations)}. "
-            f"Last error: {last_exception}"
-        )
-
     def _get_spatial_extent(self) -> SpatialExtent:
         """Extract spatial extent from the dataset."""
         if {"lon", "lat"}.issubset(self.dataset.coords):
@@ -176,8 +115,7 @@ def _get_temporal_extent(self) -> TemporalExtent:
     @staticmethod
     def _normalize_name(name: str | None) -> str | None:
         if name:
-            return (name.replace(" ", "-").
-                    replace("_", "-").lower())
+            return name.replace(" ", "-").replace("_", "-").lower()
         return None
 
     def _get_general_metadata(self) -> dict:
@@ -205,8 +143,9 @@ def get_variable_ids(self) -> list[str]:
         variable_ids = list(self.variables_metadata.keys())
         # Remove 'crs' and 'spatial_ref' from the list if they exist, note that
         # spatial_ref will be normalized to spatial-ref in variable_ids and skipped.
-        return [var_id for var_id in variable_ids if var_id not in ["crs",
-                                                                    "spatial-ref"]]
+        return [
+            var_id for var_id in variable_ids if var_id not in ["crs", "spatial-ref"]
+        ]
 
     def get_variables_metadata(self) -> dict[str, dict]:
         """Extract metadata for all variables in the dataset."""
@@ -232,7 +171,8 @@ def _add_gcmd_link_to_var_catalog(
         if not gcmd_keyword_url:
             gcmd_keyword_url = input(
                 f"Enter GCMD keyword URL or a similar url for"
-                f" {var_metadata.get('variable_id')}: ").strip()
+                f" {var_metadata.get('variable_id')}: "
+            ).strip()
         var_catalog.add_link(
             Link(
                 rel="via",
@@ -326,23 +266,23 @@ def build_variable_catalog(self, var_metadata) -> Catalog:
         return var_catalog
 
     def update_product_base_catalog(self, product_catalog_path) -> Catalog:
-            """Link product to base product catalog"""
-            product_base_catalog = Catalog.from_file(product_catalog_path)
-            product_base_catalog.add_link(
-                Link(
-                    rel="child",
-                    target=f"./{self.collection_id}/collection.json",
-                    media_type="application/json",
-                    title=self.collection_id,
-                )
+        """Link product to base product catalog"""
+        product_base_catalog = Catalog.from_file(product_catalog_path)
+        product_base_catalog.add_link(
+            Link(
+                rel="child",
+                target=f"./{self.collection_id}/collection.json",
+                media_type="application/json",
+                title=self.collection_id,
+            )
         )
-            # 'self' link: the direct URL where this JSON is hosted
-            product_base_catalog.set_self_href(PRODUCT_BASE_CATALOG_SELF_HREF)
-            return product_base_catalog
+        # 'self' link: the direct URL where this JSON is hosted
+        product_base_catalog.set_self_href(PRODUCT_BASE_CATALOG_SELF_HREF)
+        return product_base_catalog
 
-    def update_variable_base_catalog(self, variable_base_catalog_path, variable_ids) \
-            -> (
-            Catalog):
+    def update_variable_base_catalog(
+        self, variable_base_catalog_path, variable_ids
+    ) -> Catalog:
         """Link product to base product catalog"""
         variable_base_catalog = Catalog.from_file(variable_base_catalog_path)
         for var_id in variable_ids:
@@ -387,7 +327,7 @@ def update_deepesdl_collection(self, deepesdl_collection_full_path):
                 rel="related",
                 target=f"../../themes/{theme}/catalog.json",
                 media_type="application/json",
-                title=f"Theme: {self.format_string(theme)}"
+                title=f"Theme: {self.format_string(theme)}",
             )
         )
         deepesdl_collection.set_self_href(DEEPESDL_COLLECTION_SELF_HREF)
@@ -534,7 +474,7 @@ def build_dataset_stac_collection(self) -> Collection:
             rel="related",
             target="../../projects/deep-earth-system-data-lab/collection.json",
             media_type="application/json",
-            title="Project: DeepESDL"
+            title="Project: DeepESDL",
         )
     )
 
diff --git a/deep_code/utils/helper.py b/deep_code/utils/helper.py
index cca6b75..0e33585 100644
--- a/deep_code/utils/helper.py
+++ b/deep_code/utils/helper.py
@@ -1,3 +1,11 @@
+import logging
+import os
+from typing import Optional
+
+import xarray as xr
+from xcube.core.store import new_data_store
+
+
 def serialize(obj):
     """Convert non-serializable objects to JSON-compatible formats.
     Args:
@@ -12,3 +20,88 @@ def serialize(obj):
     if hasattr(obj, "__dict__"):
         return obj.__dict__
     raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")
+
+
+def open_dataset(
+    dataset_id: str,
+    root: str = "deep-esdl-public",
+    storage_configs: Optional[list[dict]] = None,
+    logger: Optional[logging.Logger] = None,
+) -> xr.Dataset:
+    """Open an xarray dataset from a specified store.
+
+    Args:
+        dataset_id: ID of the dataset (e.g., path to Zarr or NetCDF file).
+        root: Root path or bucket for the store. Defaults to 'deep-esdl-public'.
+        storage_configs: List of storage configurations. If None, uses default S3 configs.
+        logger: Optional logger for logging messages. If None, uses default logger.
+
+    Returns:
+        xarray.Dataset: The opened dataset.
+
+    Raises:
+        ValueError: If the dataset cannot be opened with any configuration.
+    """
+    if logger is None:
+        logger = logging.getLogger(__name__)
+
+    # Default S3 configurations
+    default_configs = [
+        {
+            "description": "Public store",
+            "params": {
+                "storage_type": "s3",
+                "root": root,
+                "storage_options": {"anon": True},
+            },
+        },
+        {
+            "description": "Authenticated store",
+            "params": {
+                "storage_type": "s3",
+                "root": os.environ.get("S3_USER_STORAGE_BUCKET", root),
+                "storage_options": {
+                    "anon": False,
+                    "key": os.environ.get("S3_USER_STORAGE_KEY"),
+                    "secret": os.environ.get("S3_USER_STORAGE_SECRET"),
+                },
+            },
+        },
+    ]
+
+    # Use provided configs or default
+    configs = storage_configs or default_configs
+
+    # Iterate through configurations and attempt to open the dataset
+    last_exception = None
+    tried_configurations = []
+    for config in configs:
+        tried_configurations.append(config["description"])
+        try:
+            logger.info(
+                f"Attempting to open dataset '{dataset_id}' with configuration: "
+                f"{config['description']}"
+            )
+            store = new_data_store(
+                config["params"]["storage_type"],
+                root=config["params"]["root"],
+                storage_options=config["params"]["storage_options"],
+            )
+            dataset = store.open_data(dataset_id)
+            logger.info(
+                f"Successfully opened dataset '{dataset_id}' with configuration: "
+                f"{config['description']}"
+            )
+            return dataset
+        except Exception as e:
+            logger.error(
+                f"Failed to open dataset '{dataset_id}' with configuration: "
+                f"{config['description']}. Error: {e}"
+            )
+            last_exception = e
+
+    raise ValueError(
+        f"Failed to open dataset with ID '{dataset_id}'. "
+        f"Tried configurations: {', '.join(tried_configurations)}. "
+        f"Last error: {last_exception}"
+    )
diff --git a/deep_code/version.py b/deep_code/version.py
index 6ecc219..2617f2d 100644
--- a/deep_code/version.py
+++ b/deep_code/version.py
@@ -19,4 +19,4 @@
 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 # DEALINGS IN THE SOFTWARE.
 
-version = "0.1.3"
+version = "0.1.4.dev"
diff --git a/environment.yml b/environment.yml
index 9570a8d..c0f4b28 100644
--- a/environment.yml
+++ b/environment.yml
@@ -12,6 +12,7 @@ dependencies:
   - pystac
   - pyyaml
   - xcube
+  - xrlint
   - zarr >=2.11,<3
   # test dependencies
   - numpy
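For reference, the refactored `open_dataset` tries the anonymous public bucket first and then falls back to an authenticated store configured through environment variables, raising `ValueError` when every configuration fails. A usage sketch of the authenticated path (the bucket name, credentials, and dataset ID are placeholders):

```python
import os

from deep_code.utils.helper import open_dataset

# Placeholders; the fallback "Authenticated store" reads these three variables.
os.environ["S3_USER_STORAGE_BUCKET"] = "my-team-bucket"
os.environ["S3_USER_STORAGE_KEY"] = "access-key"
os.environ["S3_USER_STORAGE_SECRET"] = "secret-key"

# Tries "Public store" first; on failure, retries with the authenticated store,
# and raises ValueError if both configurations fail.
ds = open_dataset("my-dataset.zarr")
```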