From 116b696e61516bde4a06f9d98754aa9fc1c99903 Mon Sep 17 00:00:00 2001 From: tejas Date: Thu, 12 Jun 2025 14:21:20 +0200 Subject: [PATCH 1/7] custom xrlint rules for deepcode --- .../tests/utils/test_custom_xrlint_rules.py | 71 +++++ .../utils/test_dataset_stac_generator.py | 247 ++++++++---------- deep_code/tests/utils/test_helper.py | 161 ++++++++++++ deep_code/tools/check.py | 4 - deep_code/tools/lint.py | 35 +++ deep_code/utils/custom_xrlint_rules.py | 83 ++++++ deep_code/utils/dataset_stac_generator.py | 67 +---- deep_code/utils/helper.py | 92 +++++++ deep_code/version.py | 2 +- environment.yml | 1 + 10 files changed, 551 insertions(+), 212 deletions(-) create mode 100644 deep_code/tests/utils/test_custom_xrlint_rules.py create mode 100644 deep_code/tests/utils/test_helper.py delete mode 100644 deep_code/tools/check.py create mode 100644 deep_code/tools/lint.py create mode 100644 deep_code/utils/custom_xrlint_rules.py diff --git a/deep_code/tests/utils/test_custom_xrlint_rules.py b/deep_code/tests/utils/test_custom_xrlint_rules.py new file mode 100644 index 0000000..4636b5e --- /dev/null +++ b/deep_code/tests/utils/test_custom_xrlint_rules.py @@ -0,0 +1,71 @@ +# Copyright © 2025 Brockmann Consult GmbH. +# This software is distributed under the terms and conditions of the +# MIT license (https://mit-license.org/). + +import unittest +import xarray as xr + +from deep_code.utils.custom_xrlint_rules import ( + DatasetDescriptionRule, + VariableGcmdKeywordUrlRule +) +from xrlint.testing import RuleTest, RuleTester + +class TestDeepCodePlugin(unittest.TestCase): + def setUp(self): + """Set up test datasets.""" + # Valid dataset with all required metadata + self.valid_dataset = xr.Dataset( + data_vars={ + "temperature": (("time", "lat", "lon"), [[[300, 301], [302, 303]]]), + "precipitation": (("time", "lat", "lon"), [[[10, 20], [30, 40]]]) + }, + coords={"time": [1], "lat": [0, 1], "lon": [0, 1]}, + attrs={ + "description": "Test climate dataset", + "title": "Climate Dataset 2025" + } + ) + self.valid_dataset["temperature"].attrs["gcmd_keyword_url"] = ( + "https://gcmd.nasa.gov/KeywordViewer/temperature" + ) + self.valid_dataset["temperature"].attrs["units"] = "K" + self.valid_dataset["precipitation"].attrs["gcmd_keyword_url"] = ( + "https://gcmd.nasa.gov/KeywordViewer/precipitation" + ) + self.valid_dataset["precipitation"].attrs["units"] = "mm" + + # Invalid dataset missing required metadata + self.invalid_dataset = xr.Dataset( + data_vars={ + "temperature": (("time", "lat", "lon"), [[[300, 301], [302, 303]]]), + "precipitation": (("time", "lat", "lon"), [[[10, 20], [30, 40]]]) + }, + coords={"time": [1], "lat": [0, 1], "lon": [0, 1]}, + attrs={} + ) + self.invalid_dataset["temperature"].attrs["gcmd_keyword_url"] = ( + "https://gcmd.nasa.gov/KeywordViewer/temperature" + ) + self.invalid_dataset["temperature"].attrs["units"] = "K" + # Intentionally omit gcmd_keyword_url and units for precipitation + + self.tester = RuleTester() + + def test_dataset_description(self): + """Test DatasetDescriptionRule with valid and invalid dataset.""" + self.tester.run( + "dataset-description", + DatasetDescriptionRule, + valid=[RuleTest(dataset=self.valid_dataset)], + invalid=[RuleTest(dataset=self.invalid_dataset, expected=1)] + ) + + def test_variable_gcmd_keyword_url(self): + """Test VariableGcmdKeywordUrlRule with valid dataset.""" + self.tester.run( + "variable-gcmd-keyword-url", + VariableGcmdKeywordUrlRule, + valid=[RuleTest(dataset=self.valid_dataset)], + 
invalid=[RuleTest(dataset=self.invalid_dataset, expected=1)] + ) diff --git a/deep_code/tests/utils/test_dataset_stac_generator.py b/deep_code/tests/utils/test_dataset_stac_generator.py index e8864b1..8a4e471 100644 --- a/deep_code/tests/utils/test_dataset_stac_generator.py +++ b/deep_code/tests/utils/test_dataset_stac_generator.py @@ -1,17 +1,28 @@ -import os +#!/usr/bin/env python3 +# Copyright (c) 2025 by Brockmann Consult GmbH +# Permissions are hereby granted under the terms of the MIT License: +# https://opensource.org/licenses/MIT. + import unittest -from datetime import datetime +from datetime import datetime, timezone from unittest.mock import MagicMock, patch import numpy as np -from pystac import Collection -from xarray import Dataset - +from pystac import Catalog, Collection, Link +from xarray import Dataset, DataArray + +from deep_code.constants import ( + DEEPESDL_COLLECTION_SELF_HREF, + OSC_THEME_SCHEME, + PRODUCT_BASE_CATALOG_SELF_HREF, + VARIABLE_BASE_CATALOG_SELF_HREF, +) from deep_code.utils.dataset_stac_generator import OscDatasetStacGenerator +from deep_code.utils.dataset_stac_generator import Theme, ThemeConcept class TestOSCProductSTACGenerator(unittest.TestCase): - @patch("deep_code.utils.dataset_stac_generator.new_data_store") + @patch("deep_code.utils.dataset_stac_generator.open_dataset") def setUp(self, mock_data_store): """Set up a mock dataset and generator.""" self.mock_dataset = Dataset( @@ -50,7 +61,7 @@ def setUp(self, mock_data_store): ) mock_store = MagicMock() mock_store.open_data.return_value = self.mock_dataset - mock_data_store.return_value = mock_store + mock_data_store.return_value = self.mock_dataset self.generator = OscDatasetStacGenerator( dataset_id="mock-dataset-id", @@ -65,9 +76,8 @@ def setUp(self, mock_data_store): def test_open_dataset(self): """Test if the dataset is opened correctly.""" self.assertIsInstance(self.generator.dataset, Dataset) - self.assertIn("lon", self.generator.dataset.coords) - self.assertIn("lat", self.generator.dataset.coords) - self.assertIn("time", self.generator.dataset.coords) + for coord in ("lon", "lat", "time"): + self.assertIn(coord, self.generator.dataset.coords) def test_get_spatial_extent(self): """Test spatial extent extraction.""" @@ -77,146 +87,93 @@ def test_get_spatial_extent(self): def test_get_temporal_extent(self): """Test temporal extent extraction.""" extent = self.generator._get_temporal_extent() - expected_intervals = [datetime(2023, 1, 1, 0, 0), datetime(2023, 1, 2, 0, 0)] - self.assertEqual(extent.intervals[0], expected_intervals) + # TemporalExtent.intervals is a list of [start, end] + interval = extent.intervals[0] + self.assertEqual(interval[0], datetime(2023, 1, 1, 0, 0)) + self.assertEqual(interval[1], datetime(2023, 1, 2, 0, 0)) def test_get_variables(self): - """Test variable extraction.""" - variables = self.generator.get_variable_ids() - self.assertEqual(variables, ["var1", "var2"]) + """Test variable ID extraction.""" + vars_ = self.generator.get_variable_ids() + self.assertCountEqual(vars_, ["var1", "var2"]) def test_get_general_metadata(self): """Test general metadata extraction.""" - metadata = self.generator._get_general_metadata() - self.assertEqual(metadata["description"], "Mock dataset for testing.") - - @patch("pystac.Collection.add_link") - @patch("pystac.Collection.set_self_href") - def test_build_stac_collection(self, mock_set_self_href, mock_add_link): - """Test STAC collection creation.""" - collection = self.generator.build_dataset_stac_collection() - 
self.assertIsInstance(collection, Collection) - self.assertEqual(collection.id, "mock-collection-id") - self.assertEqual(collection.description, "Mock dataset for testing.") - self.assertEqual( - collection.extent.spatial.bboxes[0], [-180.0, -90.0, 180.0, 90.0] - ) - self.assertEqual( - collection.extent.temporal.intervals[0], - [datetime(2023, 1, 1, 0, 0), datetime(2023, 1, 2, 0, 0)], - ) - mock_set_self_href.assert_called_once() - mock_add_link.assert_called() - - def test_invalid_spatial_extent(self): - """Test spatial extent extraction with missing coordinates.""" - self.generator.dataset = Dataset(coords={"x": [], "y": []}) - with self.assertRaises(ValueError): - self.generator._get_spatial_extent() - - def test_invalid_temporal_extent(self): - """Test temporal extent extraction with missing time.""" - self.generator.dataset = Dataset(coords={}) - with self.assertRaises(ValueError): - self.generator._get_temporal_extent() - - @patch("deep_code.utils.dataset_stac_generator.new_data_store") - @patch("deep_code.utils.dataset_stac_generator.logging.getLogger") - def test_open_dataset_success_public_store(self, mock_logger, mock_new_data_store): - """Test dataset opening with the public store configuration.""" - # Create a mock store and mock its `open_data` method - mock_store = MagicMock() - mock_new_data_store.return_value = mock_store - mock_store.open_data.return_value = self.mock_dataset - - # Instantiate the generator (this will implicitly call _open_dataset) - generator = OscDatasetStacGenerator("mock-dataset-id", "mock-collection-id") - - # Validate that the dataset is assigned correctly - self.assertEqual(generator.dataset, "mock_dataset") - - # Validate that `new_data_store` was called once with the correct parameters - mock_new_data_store.assert_called_once_with( - "s3", root="deep-esdl-public", storage_options={"anon": True} - ) - - # Ensure `open_data` was called once on the returned store - mock_store.open_data.assert_called_once_with("mock-dataset-id") - - # Validate logging behavior - mock_logger().info.assert_any_call( - "Attempting to open dataset with configuration: Public store" - ) - mock_logger().info.assert_any_call( - "Successfully opened dataset with configuration: Public store" - ) - - @patch("deep_code.utils.dataset_stac_generator.new_data_store") - @patch("deep_code.utils.dataset_stac_generator.logging.getLogger") - def test_open_dataset_success_authenticated_store( - self, mock_logger, mock_new_data_store - ): - """Test dataset opening with the authenticated store configuration.""" - # Simulate public store failure - mock_store = MagicMock() - mock_new_data_store.side_effect = [ - Exception("Public store failure"), - # First call (public store) raises an exception - mock_store, - # Second call (authenticated store) returns a mock store - ] - mock_store.open_data.return_value = self.mock_dataset - - os.environ["S3_USER_STORAGE_BUCKET"] = "mock-bucket" - os.environ["S3_USER_STORAGE_KEY"] = "mock-key" - os.environ["S3_USER_STORAGE_SECRET"] = "mock-secret" - - generator = OscDatasetStacGenerator("mock-dataset-id", "mock-collection-id") - - # Validate that the dataset was successfully opened with the authenticated store - self.assertEqual(generator.dataset, "mock_dataset") - self.assertEqual(mock_new_data_store.call_count, 2) - - # Validate calls to `new_data_store` - mock_new_data_store.assert_any_call( - "s3", root="deep-esdl-public", storage_options={"anon": True} - ) - mock_new_data_store.assert_any_call( - "s3", - root="mock-bucket", - storage_options={"anon": 
False, "key": "mock-key", "secret": "mock-secret"}, - ) - - # Validate logging calls - mock_logger().info.assert_any_call( - "Attempting to open dataset with configuration: Public store" - ) - mock_logger().info.assert_any_call( - "Attempting to open dataset with configuration: Authenticated store" - ) - mock_logger().info.assert_any_call( - "Successfully opened dataset with configuration: Authenticated store" - ) - - @patch("deep_code.utils.dataset_stac_generator.new_data_store") - @patch("deep_code.utils.dataset_stac_generator.logging.getLogger") - def test_open_dataset_failure(self, mock_logger, mock_new_data_store): - """Test dataset opening failure with all configurations.""" - # Simulate all store failures - mock_new_data_store.side_effect = Exception("Store failure") - os.environ["S3_USER_STORAGE_BUCKET"] = "mock-bucket" - os.environ["S3_USER_STORAGE_KEY"] = "mock-key" - os.environ["S3_USER_STORAGE_SECRET"] = "mock-secret" - - with self.assertRaises(ValueError) as context: - OscDatasetStacGenerator("mock-dataset-id", "mock-collection-id") - - self.assertIn( - "Failed to open Zarr dataset with ID mock-dataset-id", - str(context.exception), - ) - self.assertIn("Public store, Authenticated store", str(context.exception)) - self.assertEqual(mock_new_data_store.call_count, 2) + meta = self.generator._get_general_metadata() + self.assertEqual(meta.get("description"), "Mock dataset for testing.") + + def test_extract_metadata_for_variable(self): + """Test single variable metadata extraction.""" + da: DataArray = self.mock_dataset.data_vars['var1'] + var_meta = self.generator.extract_metadata_for_variable(da) + self.assertEqual(var_meta['variable_id'], 'var1') + self.assertEqual(var_meta['description'], 'dummy') + self.assertEqual(var_meta['gcmd_keyword_url'], 'https://dummy') + + def test_get_variables_metadata(self): + """Test metadata dict for all variables.""" + meta_dict = self.generator.get_variables_metadata() + self.assertIn('var1', meta_dict) + self.assertIn('var2', meta_dict) + self.assertIsInstance(meta_dict['var1'], dict) + + def test_build_theme(self): + """Test Theme builder static method.""" + themes = ["a", "b"] + theme_obj: Theme = OscDatasetStacGenerator.build_theme(themes) + self.assertEqual(theme_obj.scheme, OSC_THEME_SCHEME) + ids = [tc.id for tc in theme_obj.concepts] + self.assertListEqual(ids, ['a', 'b']) + + @patch.object(OscDatasetStacGenerator, '_add_gcmd_link_to_var_catalog') + @patch.object(OscDatasetStacGenerator, 'add_themes_as_related_links_var_catalog') + def test_build_variable_catalog(self, mock_add_themes, mock_add_gcmd): + """Test building of variable-level STAC catalog.""" + var_meta = self.generator.variables_metadata['var1'] + catalog = self.generator.build_variable_catalog(var_meta) + self.assertIsInstance(catalog, Catalog) + self.assertEqual(catalog.id, 'var1') + # Title should be capitalized + self.assertEqual(catalog.title, 'Var1') + # Self href ends with var1/catalog.json + self.assertTrue(catalog.self_href.endswith('/var1/catalog.json')) + + @patch('pystac.Catalog.from_file') + def test_update_product_base_catalog(self, mock_from_file): + """Test linking product catalog.""" + mock_cat = MagicMock(spec=Catalog) + mock_from_file.return_value = mock_cat + + result = self.generator.update_product_base_catalog('path.json') + self.assertIs(result, mock_cat) + mock_cat.add_link.assert_called_once() + mock_cat.set_self_href.assert_called_once_with(PRODUCT_BASE_CATALOG_SELF_HREF) + + @patch('pystac.Catalog.from_file') + def 
test_update_variable_base_catalog(self, mock_from_file): + """Test linking variable base catalog.""" + mock_cat = MagicMock(spec=Catalog) + mock_from_file.return_value = mock_cat + + vars_ = ['v1', 'v2'] + result = self.generator.update_variable_base_catalog('vars.json', vars_) + self.assertIs(result, mock_cat) + # Expect one add_link per variable + self.assertEqual(mock_cat.add_link.call_count, len(vars_)) + mock_cat.set_self_href.assert_called_once_with(VARIABLE_BASE_CATALOG_SELF_HREF) + + @patch('pystac.Collection.from_file') + def test_update_deepesdl_collection(self, mock_from_file): + """Test updating DeepESDL collection.""" + mock_coll = MagicMock(spec=Collection) + mock_from_file.return_value = mock_coll + + result = self.generator.update_deepesdl_collection('deep.json') + self.assertIs(result, mock_coll) + # Expect child and theme related links for each theme + calls = mock_coll.add_link.call_count + self.assertGreaterEqual(calls, 1 + len(self.generator.osc_themes)) + mock_coll.set_self_href.assert_called_once_with(DEEPESDL_COLLECTION_SELF_HREF) class TestFormatString(unittest.TestCase): diff --git a/deep_code/tests/utils/test_helper.py b/deep_code/tests/utils/test_helper.py new file mode 100644 index 0000000..2267c08 --- /dev/null +++ b/deep_code/tests/utils/test_helper.py @@ -0,0 +1,161 @@ +#!/usr/bin/env python3 +# Copyright (c) 2025 by Brockmann Consult GmbH +# Permissions are hereby granted under the terms of the MIT License: +# https://opensource.org/licenses/MIT. + +import os +import unittest +from unittest.mock import MagicMock, patch, call + +import xarray +import xarray as xr + +from deep_code.utils.helper import open_dataset + + +def make_dummy_dataset(): + """Create a simple xarray.Dataset for testing.""" + return xr.Dataset( + coords={"time": [0, 1, 2]}, + data_vars={"x": (("time",), [10, 20, 30])} + ) + + +class TestOpenDataset(unittest.TestCase): + @patch('deep_code.utils.helper.logging.getLogger') + @patch('deep_code.utils.helper.new_data_store') + def test_success_public_store(self, mock_new_store, mock_get_logger): + """Should open dataset with the public store on first try.""" + dummy = make_dummy_dataset() + mock_store = MagicMock() + mock_store.open_data.return_value = dummy + mock_new_store.return_value = mock_store + mock_logger = MagicMock() + mock_get_logger.return_value = mock_logger + + result = open_dataset('test-id') + + self.assertIs(result, dummy) + mock_new_store.assert_called_once_with( + 's3', + root='deep-esdl-public', + storage_options={'anon': True} + ) + mock_logger.info.assert_any_call( + "Attempting to open dataset 'test-id' with configuration: Public store" + ) + mock_logger.info.assert_any_call( + "Successfully opened dataset 'test-id' with configuration: Public store" + ) + + @patch("deep_code.utils.helper.new_data_store") + @patch("deep_code.utils.helper.logging.getLogger") + def test_open_dataset_success_authenticated_store( + self, mock_get_logger, mock_new_store + ): + """Test fallback to authenticated store when public store fails.""" + mock_store = MagicMock() + mock_new_store.side_effect = [Exception("Public store failure"), mock_store] + mock_store.open_data.return_value = make_dummy_dataset() + + os.environ["S3_USER_STORAGE_BUCKET"] = "mock-bucket" + os.environ["S3_USER_STORAGE_KEY"] = "mock-key" + os.environ["S3_USER_STORAGE_SECRET"] = "mock-secret" + + ds = open_dataset("my-id", logger=mock_get_logger()) + + self.assertIsInstance(ds, xarray.Dataset) + + # And new_data_store should have been called twice with exactly these 
params + expected_calls = [ + call("s3", root="deep-esdl-public", storage_options={"anon": True}), + call( + "s3", + root="mock-bucket", + storage_options={ + "anon": False, + "key": "mock-key", + "secret": "mock-secret", + }, + ), + ] + mock_new_store.assert_has_calls(expected_calls, any_order=False) + + # And the logger should have info about both attempts + logger = mock_get_logger() + logger.info.assert_any_call( + "Attempting to open dataset 'my-id' with configuration: Public store" + ) + logger.info.assert_any_call( + "Attempting to open dataset 'my-id' with configuration: Authenticated store" + ) + logger.info.assert_any_call( + "Successfully opened dataset 'my-id' with configuration: Authenticated store" + ) + + @patch('deep_code.utils.helper.logging.getLogger') + @patch('deep_code.utils.helper.new_data_store') + def test_all_stores_fail_raises(self, mock_new_store, mock_get_logger): + """Should raise ValueError if all stores fail.""" + mock_new_store.side_effect = Exception('fail') + os.environ['S3_USER_STORAGE_BUCKET'] = 'user-bucket' + os.environ['S3_USER_STORAGE_KEY'] = 'key' + os.environ['S3_USER_STORAGE_SECRET'] = 'secret' + mock_logger = MagicMock() + mock_get_logger.return_value = mock_logger + + with self.assertRaises(ValueError) as ctx: + open_dataset('test-id') + msg = str(ctx.exception) + self.assertIn("Tried configurations: Public store, Authenticated store", msg) + self.assertIn("Last error: fail", msg) + + @patch('deep_code.utils.helper.logging.getLogger') + @patch('deep_code.utils.helper.new_data_store') + def test_with_custom_configs(self, mock_new_store, mock_get_logger): + """Should use provided storage_configs instead of defaults.""" + dummy = make_dummy_dataset() + mock_store = MagicMock() + mock_store.open_data.return_value = dummy + mock_new_store.return_value = mock_store + mock_logger = MagicMock() + mock_get_logger.return_value = mock_logger + + custom_cfgs = [ + {"description": "Local store", "params": {"storage_type": "file", "root": ".", "storage_options": {}}} + ] + + result = open_dataset('test-id', storage_configs=custom_cfgs) + + self.assertIs(result, dummy) + mock_new_store.assert_called_once_with( + 'file', root='.', storage_options={} + ) + mock_logger.info.assert_any_call( + "Attempting to open dataset 'test-id' with configuration: Local store" + ) + mock_logger.info.assert_any_call( + "Successfully opened dataset 'test-id' with configuration: Local store" + ) + + @patch('deep_code.utils.helper.logging.getLogger') + @patch('deep_code.utils.helper.new_data_store') + def test_uses_provided_logger(self, mock_new_store, mock_get_logger): + """Should use the logger provided by the caller.""" + dummy = make_dummy_dataset() + mock_store = MagicMock() + mock_store.open_data.return_value = dummy + mock_new_store.return_value = mock_store + custom_logger = MagicMock() + mock_get_logger.side_effect = AssertionError("getLogger should not be used") + + result = open_dataset('test-id', logger=custom_logger) + + self.assertIs(result, dummy) + custom_logger.info.assert_any_call( + "Attempting to open dataset 'test-id' with configuration: Public store" + ) + custom_logger.info.assert_any_call( + "Successfully opened dataset 'test-id' with configuration: Public store" + ) + diff --git a/deep_code/tools/check.py b/deep_code/tools/check.py deleted file mode 100644 index 3b54c65..0000000 --- a/deep_code/tools/check.py +++ /dev/null @@ -1,4 +0,0 @@ -""" -Verify the readiness of a dataset or an existing workflow repository for experiment -publication by identifying 
any issues or missing components
-"""
diff --git a/deep_code/tools/lint.py b/deep_code/tools/lint.py
new file mode 100644
index 0000000..e904a11
--- /dev/null
+++ b/deep_code/tools/lint.py
@@ -0,0 +1,35 @@
+import xarray as xr
+from xrlint.linter import new_linter
+from deep_code.utils.custom_xrlint_rules import export_config
+from deep_code.utils.helper import open_dataset
+
+
+class LintDataset:
+    """Lints an xarray dataset using the xrlint library.
+
+    Args:
+        dataset_id (str | None): ID of a Zarr dataset in the DeepESDL public or team bucket.
+        dataset (xr.Dataset | None): In-memory xarray.Dataset instance.
+
+    Note:
+        One of `dataset_id` or `dataset` must be provided.
+    """
+    def __init__(
+        self,
+        dataset_id: str | None = None,
+        dataset: xr.Dataset | None = None):
+        if not dataset_id and not dataset:
+            raise ValueError("You must provide either `dataset_id` or `dataset`.")
+        self.dataset_id = dataset_id
+        self.dataset = dataset
+
+    def lint_dataset(self):
+        if self.dataset is not None:
+            ds = self.dataset
+        elif self.dataset_id is not None:
+            ds = open_dataset(self.dataset_id)
+        else:
+            raise RuntimeError("No dataset to lint.")
+
+        linter = new_linter(*export_config())
+        return linter.validate(ds)
diff --git a/deep_code/utils/custom_xrlint_rules.py b/deep_code/utils/custom_xrlint_rules.py
new file mode 100644
index 0000000..775d178
--- /dev/null
+++ b/deep_code/utils/custom_xrlint_rules.py
@@ -0,0 +1,83 @@
+# Copyright © 2025 Brockmann Consult GmbH.
+# This software is distributed under the terms and conditions of the
+# MIT license (https://mit-license.org/).
+
+"""
+This module defines the catalog_metadata_plugin for XRLint, which validates
+metadata required for dataset publication to a catalog. It checks for:
+- A 'description' attribute in dataset.attrs
+- A 'gcmd_keyword_url' attribute in each variable's attrs
+"""
+
+from xrlint.node import DatasetNode, VariableNode
+from xrlint.plugin import new_plugin
+from xrlint.rule import RuleContext, RuleOp
+
+plugin = new_plugin(name="deepcode", version="1.0.0")
+
+
+@plugin.define_rule("dataset-description")
+class DatasetDescriptionRule(RuleOp):
+    """Ensures the dataset has a 'description' attribute."""
+
+    def validate_dataset(self, ctx: RuleContext, node: DatasetNode):
+        if "description" not in node.dataset.attrs:
+            ctx.report(
+                "Dataset missing required 'description' attribute.",
+                suggestions=["Add a 'description' attribute to dataset.attrs."]
+            )
+
+
+@plugin.define_rule("variable-gcmd-keyword-url")
+class VariableGcmdKeywordUrlRule(RuleOp):
+    """Ensures all variables have a 'gcmd_keyword_url' attribute."""
+
+    def validate_variable(self, ctx: RuleContext, node: VariableNode):
+        if node.name not in ctx.dataset.data_vars:
+            return
+
+        if "gcmd_keyword_url" not in node.array.attrs:
+            ctx.report(
+                f"Variable '{node.name}' missing 'gcmd_keyword_url' attribute."
+            )
+
+
+# Define the recommended ruleset for this plugin
+plugin.define_config(
+    "recommended",
+    [
+        {
+            "rules": {
+                "deepcode/variable-gcmd-keyword-url": "error",
+                "deepcode/dataset-description": "error",
+            }
+        }
+    ]
+)
+
+
+def export_config() -> list:
+    """
+    Export the plugin configuration to be consumed by the XRLint Linter.
+
+    Returns
+    -------
+    list
+        A list of plugin config dictionaries and rule presets.
+ """ + return [ + { + "plugins": { + "deepcode": plugin, + }, + }, + "recommended", + { + "rules": { + "content-desc": "off", + "no-empty-attrs": "off", + "conventions": "off", + } + }, + "deepcode/recommended", + ] diff --git a/deep_code/utils/dataset_stac_generator.py b/deep_code/utils/dataset_stac_generator.py index f444f70..9873022 100644 --- a/deep_code/utils/dataset_stac_generator.py +++ b/deep_code/utils/dataset_stac_generator.py @@ -4,12 +4,10 @@ # https://opensource.org/licenses/MIT. import logging -import os from datetime import datetime, timezone import pandas as pd from pystac import Catalog, Collection, Extent, Link, SpatialExtent, TemporalExtent -from xcube.core.store import new_data_store from deep_code.constants import ( DEEPESDL_COLLECTION_SELF_HREF, @@ -19,6 +17,7 @@ ) from deep_code.utils.ogc_api_record import Theme, ThemeConcept from deep_code.utils.osc_extension import OscExtension +from deep_code.utils.helper import open_dataset class OscDatasetStacGenerator: @@ -58,68 +57,12 @@ def __init__( self.osc_missions = osc_missions or [] self.cf_params = cf_params or {} self.logger = logging.getLogger(__name__) - self.dataset = self._open_dataset() + self.dataset = open_dataset( + dataset_id=dataset_id, + logger=self.logger + ) self.variables_metadata = self.get_variables_metadata() - def _open_dataset(self): - """Open the dataset using a S3 store as a xarray Dataset.""" - - store_configs = [ - { - "description": "Public store", - "params": { - "storage_type": "s3", - "root": "deep-esdl-public", - "storage_options": {"anon": True}, - }, - }, - { - "description": "Authenticated store", - "params": { - "storage_type": "s3", - "root": os.environ.get("S3_USER_STORAGE_BUCKET"), - "storage_options": { - "anon": False, - "key": os.environ.get("S3_USER_STORAGE_KEY"), - "secret": os.environ.get("S3_USER_STORAGE_SECRET"), - }, - }, - }, - ] - - # Iterate through configurations and attempt to open the dataset - last_exception = None - tried_configurations = [] - for config in store_configs: - tried_configurations.append(config["description"]) - try: - self.logger.info( - f"Attempting to open dataset with configuration: " - f"{config['description']}" - ) - store = new_data_store( - config["params"]["storage_type"], - root=config["params"]["root"], - storage_options=config["params"]["storage_options"], - ) - dataset = store.open_data(self.dataset_id) - self.logger.info( - f"Successfully opened dataset with configuration: " - f"{config['description']}" - ) - return dataset - except Exception as e: - self.logger.error( - f"Failed to open dataset with configuration: " - f"{config['description']}. Error: {e}" - ) - last_exception = e - - raise ValueError( - f"Failed to open Zarr dataset with ID {self.dataset_id}. " - f"Tried configurations: {', '.join(tried_configurations)}. " - f"Last error: {last_exception}" - ) def _get_spatial_extent(self) -> SpatialExtent: """Extract spatial extent from the dataset.""" diff --git a/deep_code/utils/helper.py b/deep_code/utils/helper.py index cca6b75..10ad44c 100644 --- a/deep_code/utils/helper.py +++ b/deep_code/utils/helper.py @@ -1,3 +1,10 @@ +import logging +import os +from typing import Optional + +import xarray as xr +from xcube.core.store import new_data_store + def serialize(obj): """Convert non-serializable objects to JSON-compatible formats. 
     Args:
@@ -12,3 +19,88 @@
     if hasattr(obj, "__dict__"):
         return obj.__dict__
     raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")
+
+def open_dataset(
+    dataset_id: str,
+    root: str = "deep-esdl-public",
+    storage_configs: Optional[list[dict]] = None,
+    logger: Optional[logging.Logger] = None
+) -> xr.Dataset:
+    """Open an xarray dataset from a specified store.
+
+    Args:
+        dataset_id: ID of the dataset (e.g., path to Zarr or NetCDF file).
+        root: Root path or bucket for the public store. Defaults to
+            'deep-esdl-public'.
+        storage_configs: List of storage configurations. If None, uses default S3 configs.
+        logger: Optional logger for logging messages. If None, uses default logger.
+
+    Returns:
+        xarray.Dataset: The opened dataset.
+
+    Raises:
+        ValueError: If the dataset cannot be opened with any configuration.
+    """
+    if logger is None:
+        logger = logging.getLogger(__name__)
+
+    # Default S3 configurations
+    default_configs = [
+        {
+            "description": "Public store",
+            "params": {
+                "storage_type": "s3",
+                "root": root,
+                "storage_options": {"anon": True},
+            },
+        },
+        {
+            "description": "Authenticated store",
+            "params": {
+                "storage_type": "s3",
+                "root": os.environ.get("S3_USER_STORAGE_BUCKET", root),
+                "storage_options": {
+                    "anon": False,
+                    "key": os.environ.get("S3_USER_STORAGE_KEY"),
+                    "secret": os.environ.get("S3_USER_STORAGE_SECRET"),
+                },
+            },
+        },
+    ]
+
+    # Use provided configs or default
+    configs = storage_configs or default_configs
+
+    # Iterate through configurations and attempt to open the dataset
+    last_exception = None
+    tried_configurations = []
+    for config in configs:
+        tried_configurations.append(config["description"])
+        try:
+            logger.info(
+                f"Attempting to open dataset '{dataset_id}' with configuration: "
+                f"{config['description']}"
+            )
+            store = new_data_store(
+                config["params"]["storage_type"],
+                root=config["params"]["root"],
+                storage_options=config["params"]["storage_options"],
+            )
+            dataset = store.open_data(dataset_id)
+            logger.info(
+                f"Successfully opened dataset '{dataset_id}' with configuration: "
+                f"{config['description']}"
+            )
+            return dataset
+        except Exception as e:
+            logger.error(
+                f"Failed to open dataset '{dataset_id}' with configuration: "
+                f"{config['description']}. Error: {e}"
+            )
+            last_exception = e
+
+    raise ValueError(
+        f"Failed to open dataset with ID '{dataset_id}'. "
+        f"Tried configurations: {', '.join(tried_configurations)}. "
+        f"Last error: {last_exception}"
+    )
\ No newline at end of file
diff --git a/deep_code/version.py b/deep_code/version.py
index 6ecc219..2617f2d 100644
--- a/deep_code/version.py
+++ b/deep_code/version.py
@@ -19,4 +19,4 @@
 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 # DEALINGS IN THE SOFTWARE.
 
-version = "0.1.3" +version = "0.1.4.dev" diff --git a/environment.yml b/environment.yml index 9570a8d..c0f4b28 100644 --- a/environment.yml +++ b/environment.yml @@ -12,6 +12,7 @@ dependencies: - pystac - pyyaml - xcube + - xrlint - zarr >=2.11,<3 # test dependencies - numpy From e6683dba8b6f7f6a021c0a2737f8eb0ef3f84cf2 Mon Sep 17 00:00:00 2001 From: tejas Date: Thu, 12 Jun 2025 14:26:53 +0200 Subject: [PATCH 2/7] reformatting with ruff --- .../tests/utils/test_custom_xrlint_rules.py | 40 +++++++------- .../utils/test_dataset_stac_generator.py | 51 +++++++++--------- deep_code/tests/utils/test_helper.py | 51 +++++++++--------- deep_code/tools/lint.py | 7 +-- deep_code/utils/custom_xrlint_rules.py | 14 ++--- deep_code/utils/dataset_stac_generator.py | 53 +++++++++---------- deep_code/utils/helper.py | 6 ++- 7 files changed, 107 insertions(+), 115 deletions(-) diff --git a/deep_code/tests/utils/test_custom_xrlint_rules.py b/deep_code/tests/utils/test_custom_xrlint_rules.py index 4636b5e..6a36338 100644 --- a/deep_code/tests/utils/test_custom_xrlint_rules.py +++ b/deep_code/tests/utils/test_custom_xrlint_rules.py @@ -3,13 +3,15 @@ # MIT license (https://mit-license.org/). import unittest + import xarray as xr +from xrlint.testing import RuleTest, RuleTester from deep_code.utils.custom_xrlint_rules import ( DatasetDescriptionRule, - VariableGcmdKeywordUrlRule + VariableGcmdKeywordUrlRule, ) -from xrlint.testing import RuleTest, RuleTester + class TestDeepCodePlugin(unittest.TestCase): def setUp(self): @@ -18,35 +20,35 @@ def setUp(self): self.valid_dataset = xr.Dataset( data_vars={ "temperature": (("time", "lat", "lon"), [[[300, 301], [302, 303]]]), - "precipitation": (("time", "lat", "lon"), [[[10, 20], [30, 40]]]) + "precipitation": (("time", "lat", "lon"), [[[10, 20], [30, 40]]]), }, coords={"time": [1], "lat": [0, 1], "lon": [0, 1]}, attrs={ "description": "Test climate dataset", - "title": "Climate Dataset 2025" - } - ) - self.valid_dataset["temperature"].attrs["gcmd_keyword_url"] = ( - "https://gcmd.nasa.gov/KeywordViewer/temperature" + "title": "Climate Dataset 2025", + }, ) + self.valid_dataset["temperature"].attrs[ + "gcmd_keyword_url" + ] = "https://gcmd.nasa.gov/KeywordViewer/temperature" self.valid_dataset["temperature"].attrs["units"] = "K" - self.valid_dataset["precipitation"].attrs["gcmd_keyword_url"] = ( - "https://gcmd.nasa.gov/KeywordViewer/precipitation" - ) + self.valid_dataset["precipitation"].attrs[ + "gcmd_keyword_url" + ] = "https://gcmd.nasa.gov/KeywordViewer/precipitation" self.valid_dataset["precipitation"].attrs["units"] = "mm" # Invalid dataset missing required metadata self.invalid_dataset = xr.Dataset( data_vars={ "temperature": (("time", "lat", "lon"), [[[300, 301], [302, 303]]]), - "precipitation": (("time", "lat", "lon"), [[[10, 20], [30, 40]]]) + "precipitation": (("time", "lat", "lon"), [[[10, 20], [30, 40]]]), }, coords={"time": [1], "lat": [0, 1], "lon": [0, 1]}, - attrs={} - ) - self.invalid_dataset["temperature"].attrs["gcmd_keyword_url"] = ( - "https://gcmd.nasa.gov/KeywordViewer/temperature" + attrs={}, ) + self.invalid_dataset["temperature"].attrs[ + "gcmd_keyword_url" + ] = "https://gcmd.nasa.gov/KeywordViewer/temperature" self.invalid_dataset["temperature"].attrs["units"] = "K" # Intentionally omit gcmd_keyword_url and units for precipitation @@ -58,8 +60,8 @@ def test_dataset_description(self): "dataset-description", DatasetDescriptionRule, valid=[RuleTest(dataset=self.valid_dataset)], - invalid=[RuleTest(dataset=self.invalid_dataset, expected=1)] 
- ) + invalid=[RuleTest(dataset=self.invalid_dataset, expected=1)], + ) def test_variable_gcmd_keyword_url(self): """Test VariableGcmdKeywordUrlRule with valid dataset.""" @@ -67,5 +69,5 @@ def test_variable_gcmd_keyword_url(self): "variable-gcmd-keyword-url", VariableGcmdKeywordUrlRule, valid=[RuleTest(dataset=self.valid_dataset)], - invalid=[RuleTest(dataset=self.invalid_dataset, expected=1)] + invalid=[RuleTest(dataset=self.invalid_dataset, expected=1)], ) diff --git a/deep_code/tests/utils/test_dataset_stac_generator.py b/deep_code/tests/utils/test_dataset_stac_generator.py index 8a4e471..464c538 100644 --- a/deep_code/tests/utils/test_dataset_stac_generator.py +++ b/deep_code/tests/utils/test_dataset_stac_generator.py @@ -4,12 +4,12 @@ # https://opensource.org/licenses/MIT. import unittest -from datetime import datetime, timezone +from datetime import datetime from unittest.mock import MagicMock, patch import numpy as np -from pystac import Catalog, Collection, Link -from xarray import Dataset, DataArray +from pystac import Catalog, Collection +from xarray import DataArray, Dataset from deep_code.constants import ( DEEPESDL_COLLECTION_SELF_HREF, @@ -17,8 +17,7 @@ PRODUCT_BASE_CATALOG_SELF_HREF, VARIABLE_BASE_CATALOG_SELF_HREF, ) -from deep_code.utils.dataset_stac_generator import OscDatasetStacGenerator -from deep_code.utils.dataset_stac_generator import Theme, ThemeConcept +from deep_code.utils.dataset_stac_generator import OscDatasetStacGenerator, Theme class TestOSCProductSTACGenerator(unittest.TestCase): @@ -104,18 +103,18 @@ def test_get_general_metadata(self): def test_extract_metadata_for_variable(self): """Test single variable metadata extraction.""" - da: DataArray = self.mock_dataset.data_vars['var1'] + da: DataArray = self.mock_dataset.data_vars["var1"] var_meta = self.generator.extract_metadata_for_variable(da) - self.assertEqual(var_meta['variable_id'], 'var1') - self.assertEqual(var_meta['description'], 'dummy') - self.assertEqual(var_meta['gcmd_keyword_url'], 'https://dummy') + self.assertEqual(var_meta["variable_id"], "var1") + self.assertEqual(var_meta["description"], "dummy") + self.assertEqual(var_meta["gcmd_keyword_url"], "https://dummy") def test_get_variables_metadata(self): """Test metadata dict for all variables.""" meta_dict = self.generator.get_variables_metadata() - self.assertIn('var1', meta_dict) - self.assertIn('var2', meta_dict) - self.assertIsInstance(meta_dict['var1'], dict) + self.assertIn("var1", meta_dict) + self.assertIn("var2", meta_dict) + self.assertIsInstance(meta_dict["var1"], dict) def test_build_theme(self): """Test Theme builder static method.""" @@ -123,52 +122,52 @@ def test_build_theme(self): theme_obj: Theme = OscDatasetStacGenerator.build_theme(themes) self.assertEqual(theme_obj.scheme, OSC_THEME_SCHEME) ids = [tc.id for tc in theme_obj.concepts] - self.assertListEqual(ids, ['a', 'b']) + self.assertListEqual(ids, ["a", "b"]) - @patch.object(OscDatasetStacGenerator, '_add_gcmd_link_to_var_catalog') - @patch.object(OscDatasetStacGenerator, 'add_themes_as_related_links_var_catalog') + @patch.object(OscDatasetStacGenerator, "_add_gcmd_link_to_var_catalog") + @patch.object(OscDatasetStacGenerator, "add_themes_as_related_links_var_catalog") def test_build_variable_catalog(self, mock_add_themes, mock_add_gcmd): """Test building of variable-level STAC catalog.""" - var_meta = self.generator.variables_metadata['var1'] + var_meta = self.generator.variables_metadata["var1"] catalog = self.generator.build_variable_catalog(var_meta) 
self.assertIsInstance(catalog, Catalog) - self.assertEqual(catalog.id, 'var1') + self.assertEqual(catalog.id, "var1") # Title should be capitalized - self.assertEqual(catalog.title, 'Var1') + self.assertEqual(catalog.title, "Var1") # Self href ends with var1/catalog.json - self.assertTrue(catalog.self_href.endswith('/var1/catalog.json')) + self.assertTrue(catalog.self_href.endswith("/var1/catalog.json")) - @patch('pystac.Catalog.from_file') + @patch("pystac.Catalog.from_file") def test_update_product_base_catalog(self, mock_from_file): """Test linking product catalog.""" mock_cat = MagicMock(spec=Catalog) mock_from_file.return_value = mock_cat - result = self.generator.update_product_base_catalog('path.json') + result = self.generator.update_product_base_catalog("path.json") self.assertIs(result, mock_cat) mock_cat.add_link.assert_called_once() mock_cat.set_self_href.assert_called_once_with(PRODUCT_BASE_CATALOG_SELF_HREF) - @patch('pystac.Catalog.from_file') + @patch("pystac.Catalog.from_file") def test_update_variable_base_catalog(self, mock_from_file): """Test linking variable base catalog.""" mock_cat = MagicMock(spec=Catalog) mock_from_file.return_value = mock_cat - vars_ = ['v1', 'v2'] - result = self.generator.update_variable_base_catalog('vars.json', vars_) + vars_ = ["v1", "v2"] + result = self.generator.update_variable_base_catalog("vars.json", vars_) self.assertIs(result, mock_cat) # Expect one add_link per variable self.assertEqual(mock_cat.add_link.call_count, len(vars_)) mock_cat.set_self_href.assert_called_once_with(VARIABLE_BASE_CATALOG_SELF_HREF) - @patch('pystac.Collection.from_file') + @patch("pystac.Collection.from_file") def test_update_deepesdl_collection(self, mock_from_file): """Test updating DeepESDL collection.""" mock_coll = MagicMock(spec=Collection) mock_from_file.return_value = mock_coll - result = self.generator.update_deepesdl_collection('deep.json') + result = self.generator.update_deepesdl_collection("deep.json") self.assertIs(result, mock_coll) # Expect child and theme related links for each theme calls = mock_coll.add_link.call_count diff --git a/deep_code/tests/utils/test_helper.py b/deep_code/tests/utils/test_helper.py index 2267c08..a7c8c3f 100644 --- a/deep_code/tests/utils/test_helper.py +++ b/deep_code/tests/utils/test_helper.py @@ -5,7 +5,7 @@ import os import unittest -from unittest.mock import MagicMock, patch, call +from unittest.mock import MagicMock, call, patch import xarray import xarray as xr @@ -16,14 +16,13 @@ def make_dummy_dataset(): """Create a simple xarray.Dataset for testing.""" return xr.Dataset( - coords={"time": [0, 1, 2]}, - data_vars={"x": (("time",), [10, 20, 30])} + coords={"time": [0, 1, 2]}, data_vars={"x": (("time",), [10, 20, 30])} ) class TestOpenDataset(unittest.TestCase): - @patch('deep_code.utils.helper.logging.getLogger') - @patch('deep_code.utils.helper.new_data_store') + @patch("deep_code.utils.helper.logging.getLogger") + @patch("deep_code.utils.helper.new_data_store") def test_success_public_store(self, mock_new_store, mock_get_logger): """Should open dataset with the public store on first try.""" dummy = make_dummy_dataset() @@ -33,13 +32,11 @@ def test_success_public_store(self, mock_new_store, mock_get_logger): mock_logger = MagicMock() mock_get_logger.return_value = mock_logger - result = open_dataset('test-id') + result = open_dataset("test-id") self.assertIs(result, dummy) mock_new_store.assert_called_once_with( - 's3', - root='deep-esdl-public', - storage_options={'anon': True} + "s3", 
root="deep-esdl-public", storage_options={"anon": True} ) mock_logger.info.assert_any_call( "Attempting to open dataset 'test-id' with configuration: Public store" @@ -93,25 +90,25 @@ def test_open_dataset_success_authenticated_store( "Successfully opened dataset 'my-id' with configuration: Authenticated store" ) - @patch('deep_code.utils.helper.logging.getLogger') - @patch('deep_code.utils.helper.new_data_store') + @patch("deep_code.utils.helper.logging.getLogger") + @patch("deep_code.utils.helper.new_data_store") def test_all_stores_fail_raises(self, mock_new_store, mock_get_logger): """Should raise ValueError if all stores fail.""" - mock_new_store.side_effect = Exception('fail') - os.environ['S3_USER_STORAGE_BUCKET'] = 'user-bucket' - os.environ['S3_USER_STORAGE_KEY'] = 'key' - os.environ['S3_USER_STORAGE_SECRET'] = 'secret' + mock_new_store.side_effect = Exception("fail") + os.environ["S3_USER_STORAGE_BUCKET"] = "user-bucket" + os.environ["S3_USER_STORAGE_KEY"] = "key" + os.environ["S3_USER_STORAGE_SECRET"] = "secret" mock_logger = MagicMock() mock_get_logger.return_value = mock_logger with self.assertRaises(ValueError) as ctx: - open_dataset('test-id') + open_dataset("test-id") msg = str(ctx.exception) self.assertIn("Tried configurations: Public store, Authenticated store", msg) self.assertIn("Last error: fail", msg) - @patch('deep_code.utils.helper.logging.getLogger') - @patch('deep_code.utils.helper.new_data_store') + @patch("deep_code.utils.helper.logging.getLogger") + @patch("deep_code.utils.helper.new_data_store") def test_with_custom_configs(self, mock_new_store, mock_get_logger): """Should use provided storage_configs instead of defaults.""" dummy = make_dummy_dataset() @@ -122,15 +119,16 @@ def test_with_custom_configs(self, mock_new_store, mock_get_logger): mock_get_logger.return_value = mock_logger custom_cfgs = [ - {"description": "Local store", "params": {"storage_type": "file", "root": ".", "storage_options": {}}} + { + "description": "Local store", + "params": {"storage_type": "file", "root": ".", "storage_options": {}}, + } ] - result = open_dataset('test-id', storage_configs=custom_cfgs) + result = open_dataset("test-id", storage_configs=custom_cfgs) self.assertIs(result, dummy) - mock_new_store.assert_called_once_with( - 'file', root='.', storage_options={} - ) + mock_new_store.assert_called_once_with("file", root=".", storage_options={}) mock_logger.info.assert_any_call( "Attempting to open dataset 'test-id' with configuration: Local store" ) @@ -138,8 +136,8 @@ def test_with_custom_configs(self, mock_new_store, mock_get_logger): "Successfully opened dataset 'test-id' with configuration: Local store" ) - @patch('deep_code.utils.helper.logging.getLogger') - @patch('deep_code.utils.helper.new_data_store') + @patch("deep_code.utils.helper.logging.getLogger") + @patch("deep_code.utils.helper.new_data_store") def test_uses_provided_logger(self, mock_new_store, mock_get_logger): """Should use the logger provided by the caller.""" dummy = make_dummy_dataset() @@ -149,7 +147,7 @@ def test_uses_provided_logger(self, mock_new_store, mock_get_logger): custom_logger = MagicMock() mock_get_logger.side_effect = AssertionError("getLogger should not be used") - result = open_dataset('test-id', logger=custom_logger) + result = open_dataset("test-id", logger=custom_logger) self.assertIs(result, dummy) custom_logger.info.assert_any_call( @@ -158,4 +156,3 @@ def test_uses_provided_logger(self, mock_new_store, mock_get_logger): custom_logger.info.assert_any_call( "Successfully opened 
dataset 'test-id' with configuration: Public store" ) - diff --git a/deep_code/tools/lint.py b/deep_code/tools/lint.py index e904a11..c9eab51 100644 --- a/deep_code/tools/lint.py +++ b/deep_code/tools/lint.py @@ -1,5 +1,6 @@ import xarray as xr from xrlint.linter import new_linter + from deep_code.utils.custom_xrlint_rules import export_config from deep_code.utils.helper import open_dataset @@ -14,10 +15,10 @@ class LintDataset: Note: One of `dataset_id` or `dataset` must be provided. """ + def __init__( - self, - dataset_id: str | None = None, - dataset: xr.Dataset | None = None): + self, dataset_id: str | None = None, dataset: xr.Dataset | None = None + ): if not dataset_id and not dataset: raise ValueError("You must provide either `dataset_id` or `dataset`.") self.dataset_id = dataset_id diff --git a/deep_code/utils/custom_xrlint_rules.py b/deep_code/utils/custom_xrlint_rules.py index 775d178..5da179e 100644 --- a/deep_code/utils/custom_xrlint_rules.py +++ b/deep_code/utils/custom_xrlint_rules.py @@ -24,7 +24,7 @@ def validate_dataset(self, ctx: RuleContext, node: DatasetNode): if "description" not in node.dataset.attrs: ctx.report( "Dataset missing required 'description' attribute.", - suggestions=["Add a 'description' attribute to dataset.attrs."] + suggestions=["Add a 'description' attribute to dataset.attrs."], ) @@ -37,9 +37,7 @@ def validate_variable(self, ctx: RuleContext, node: VariableNode): return if "gcmd_keyword_url" not in node.array.attrs: - ctx.report( - f"Variable '{node.name}' missing 'gcmd_keyword_url' attribute." - ) + ctx.report(f"Variable '{node.name}' missing 'gcmd_keyword_url' attribute.") # Define the recommended ruleset for this plugin @@ -52,7 +50,7 @@ def validate_variable(self, ctx: RuleContext, node: VariableNode): "deepcode/dataset-description": "error", } } - ] + ], ) @@ -66,11 +64,7 @@ def export_config() -> list: A list of plugin config dictionaries and rule presets. """ return [ - { - "plugins": { - "deepcode": plugin, - }, - }, + {"plugins": {"deepcode": plugin}}, "recommended", { "rules": { diff --git a/deep_code/utils/dataset_stac_generator.py b/deep_code/utils/dataset_stac_generator.py index 9873022..3e96a6e 100644 --- a/deep_code/utils/dataset_stac_generator.py +++ b/deep_code/utils/dataset_stac_generator.py @@ -15,9 +15,9 @@ PRODUCT_BASE_CATALOG_SELF_HREF, VARIABLE_BASE_CATALOG_SELF_HREF, ) +from deep_code.utils.helper import open_dataset from deep_code.utils.ogc_api_record import Theme, ThemeConcept from deep_code.utils.osc_extension import OscExtension -from deep_code.utils.helper import open_dataset class OscDatasetStacGenerator: @@ -57,13 +57,9 @@ def __init__( self.osc_missions = osc_missions or [] self.cf_params = cf_params or {} self.logger = logging.getLogger(__name__) - self.dataset = open_dataset( - dataset_id=dataset_id, - logger=self.logger - ) + self.dataset = open_dataset(dataset_id=dataset_id, logger=self.logger) self.variables_metadata = self.get_variables_metadata() - def _get_spatial_extent(self) -> SpatialExtent: """Extract spatial extent from the dataset.""" if {"lon", "lat"}.issubset(self.dataset.coords): @@ -119,8 +115,7 @@ def _get_temporal_extent(self) -> TemporalExtent: @staticmethod def _normalize_name(name: str | None) -> str | None: if name: - return (name.replace(" ", "-"). 
- replace("_", "-").lower()) + return name.replace(" ", "-").replace("_", "-").lower() return None def _get_general_metadata(self) -> dict: @@ -148,8 +143,9 @@ def get_variable_ids(self) -> list[str]: variable_ids = list(self.variables_metadata.keys()) # Remove 'crs' and 'spatial_ref' from the list if they exist, note that # spatial_ref will be normalized to spatial-ref in variable_ids and skipped. - return [var_id for var_id in variable_ids if var_id not in ["crs", - "spatial-ref"]] + return [ + var_id for var_id in variable_ids if var_id not in ["crs", "spatial-ref"] + ] def get_variables_metadata(self) -> dict[str, dict]: """Extract metadata for all variables in the dataset.""" @@ -175,7 +171,8 @@ def _add_gcmd_link_to_var_catalog( if not gcmd_keyword_url: gcmd_keyword_url = input( f"Enter GCMD keyword URL or a similar url for" - f" {var_metadata.get('variable_id')}: ").strip() + f" {var_metadata.get('variable_id')}: " + ).strip() var_catalog.add_link( Link( rel="via", @@ -269,23 +266,23 @@ def build_variable_catalog(self, var_metadata) -> Catalog: return var_catalog def update_product_base_catalog(self, product_catalog_path) -> Catalog: - """Link product to base product catalog""" - product_base_catalog = Catalog.from_file(product_catalog_path) - product_base_catalog.add_link( - Link( - rel="child", - target=f"./{self.collection_id}/collection.json", - media_type="application/json", - title=self.collection_id, - ) + """Link product to base product catalog""" + product_base_catalog = Catalog.from_file(product_catalog_path) + product_base_catalog.add_link( + Link( + rel="child", + target=f"./{self.collection_id}/collection.json", + media_type="application/json", + title=self.collection_id, ) - # 'self' link: the direct URL where this JSON is hosted - product_base_catalog.set_self_href(PRODUCT_BASE_CATALOG_SELF_HREF) - return product_base_catalog + ) + # 'self' link: the direct URL where this JSON is hosted + product_base_catalog.set_self_href(PRODUCT_BASE_CATALOG_SELF_HREF) + return product_base_catalog - def update_variable_base_catalog(self, variable_base_catalog_path, variable_ids) \ - -> ( - Catalog): + def update_variable_base_catalog( + self, variable_base_catalog_path, variable_ids + ) -> (Catalog): """Link product to base product catalog""" variable_base_catalog = Catalog.from_file(variable_base_catalog_path) for var_id in variable_ids: @@ -330,7 +327,7 @@ def update_deepesdl_collection(self, deepesdl_collection_full_path): rel="related", target=f"../../themes/{theme}/catalog.json", media_type="application/json", - title=f"Theme: {self.format_string(theme)}" + title=f"Theme: {self.format_string(theme)}", ) ) deepesdl_collection.set_self_href(DEEPESDL_COLLECTION_SELF_HREF) @@ -477,7 +474,7 @@ def build_dataset_stac_collection(self) -> Collection: rel="related", target="../../projects/deep-earth-system-data-lab/collection.json", media_type="application/json", - title="Project: DeepESDL" + title="Project: DeepESDL", ) ) diff --git a/deep_code/utils/helper.py b/deep_code/utils/helper.py index 10ad44c..0e33585 100644 --- a/deep_code/utils/helper.py +++ b/deep_code/utils/helper.py @@ -5,6 +5,7 @@ import xarray as xr from xcube.core.store import new_data_store + def serialize(obj): """Convert non-serializable objects to JSON-compatible formats. 
Args: @@ -20,11 +21,12 @@ def serialize(obj): return obj.__dict__ raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable") + def open_dataset( dataset_id: str, root: str = "deep-esdl-public", storage_configs: Optional[list[dict]] = None, - logger: Optional[logging.Logger] = None + logger: Optional[logging.Logger] = None, ) -> xr.Dataset: """Open an xarray dataset from a specified store. @@ -103,4 +105,4 @@ def open_dataset( f"Failed to open dataset with ID '{dataset_id}'. " f"Tried configurations: {', '.join(tried_configurations)}. " f"Last error: {last_exception}" - ) \ No newline at end of file + ) From f144635f20616130e25405855a6cbdd762d80664 Mon Sep 17 00:00:00 2001 From: tejas Date: Thu, 12 Jun 2025 14:28:20 +0200 Subject: [PATCH 3/7] refactor --- deep_code/utils/custom_xrlint_rules.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deep_code/utils/custom_xrlint_rules.py b/deep_code/utils/custom_xrlint_rules.py index 5da179e..c8ca0f5 100644 --- a/deep_code/utils/custom_xrlint_rules.py +++ b/deep_code/utils/custom_xrlint_rules.py @@ -3,7 +3,7 @@ # MIT license (https://mit-license.org/). """ -This module defines the catalog_metadata_plugin for XRLint, which validates +This module defines the deepcode plugin for XRLint, which validates metadata required for dataset publication to a catalog. It checks for: - A 'description' attribute in dataset.attrs - A 'gcmd_keyword_url' attribute in each variable's attrs From 92b7cde857d8fd096cf348bcc61d8b6b5944a781 Mon Sep 17 00:00:00 2001 From: tejas Date: Thu, 12 Jun 2025 16:31:11 +0200 Subject: [PATCH 4/7] updated change log --- CHANGES.md | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/CHANGES.md b/CHANGES.md index 1e0b957..01f2403 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -13,4 +13,14 @@ - Support publishing to testing,staging and production repositories of open-science-metadata. - Implemented new cli command `generate-config` to generate starter templates for - config files. \ No newline at end of file + config files. + +## Changes in 0.1.3 + +- _Version bump only_; no code or functionality changes. This release was + republished to update the package on PyPI. + +## Changes in 0.1.4 + +- Implemented custom rules using xrlint to check required metadata in dataset to + generate valid open science catalog STAC collection. \ No newline at end of file From 58e6346228c3e62bcf3ccb8873037c678f36d8b0 Mon Sep 17 00:00:00 2001 From: Tejas Morbagal Harish Date: Fri, 13 Jun 2025 13:29:23 +0200 Subject: [PATCH 5/7] Update CHANGES.md Co-authored-by: Thomas Storm --- CHANGES.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 01f2403..e638dad 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -22,5 +22,5 @@ ## Changes in 0.1.4 -- Implemented custom rules using xrlint to check required metadata in dataset to - generate valid open science catalog STAC collection. \ No newline at end of file +- Implemented custom rules using xrlint to validate metadata in dataset, which is necessary to + generate a STAC collection valid for ESA Open Science Catalog. 
\ No newline at end of file From 3f0352e2021f3765164f49bfb65665ef07306b5d Mon Sep 17 00:00:00 2001 From: tejas Date: Fri, 13 Jun 2025 13:32:21 +0200 Subject: [PATCH 6/7] added return type --- deep_code/tools/lint.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/deep_code/tools/lint.py b/deep_code/tools/lint.py index c9eab51..ec5d4ea 100644 --- a/deep_code/tools/lint.py +++ b/deep_code/tools/lint.py @@ -1,5 +1,6 @@ import xarray as xr from xrlint.linter import new_linter +from xrlint.result import Result from deep_code.utils.custom_xrlint_rules import export_config from deep_code.utils.helper import open_dataset @@ -24,7 +25,7 @@ def __init__( self.dataset_id = dataset_id self.dataset = dataset - def lint_dataset(self): + def lint_dataset(self) -> Result: if self.dataset is not None: ds = self.dataset elif self.dataset_id is not None: From 5a4b3fd37d80196138d9ab36cc90711bc146a2c2 Mon Sep 17 00:00:00 2001 From: tejas Date: Fri, 13 Jun 2025 16:19:53 +0200 Subject: [PATCH 7/7] turn off time-coordinate rule --- deep_code/utils/custom_xrlint_rules.py | 1 + 1 file changed, 1 insertion(+) diff --git a/deep_code/utils/custom_xrlint_rules.py b/deep_code/utils/custom_xrlint_rules.py index c8ca0f5..d003840 100644 --- a/deep_code/utils/custom_xrlint_rules.py +++ b/deep_code/utils/custom_xrlint_rules.py @@ -71,6 +71,7 @@ def export_config() -> list: "content-desc": "off", "no-empty-attrs": "off", "conventions": "off", + "time-coordinate": "off" } }, "deepcode/recommended",
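
Usage sketch for the new linting entry point: with the LintDataset class from
deep_code/tools/lint.py and the deepcode rules from
deep_code/utils/custom_xrlint_rules.py, a dataset can be linted either by ID
or in memory. A minimal sketch (the dataset and its attribute values are made
up to satisfy the two custom rules; how the returned xrlint result is rendered
depends on the xrlint version in use):

    import xarray as xr

    from deep_code.tools.lint import LintDataset

    # Minimal dataset that satisfies both custom rules: a dataset-level
    # 'description' and a 'gcmd_keyword_url' on every data variable.
    ds = xr.Dataset(
        data_vars={"temperature": (("time",), [280.0, 281.5])},
        coords={"time": [0, 1]},
        attrs={"description": "Test climate dataset"},
    )
    ds["temperature"].attrs["gcmd_keyword_url"] = (
        "https://gcmd.nasa.gov/KeywordViewer/temperature"
    )

    result = LintDataset(dataset=ds).lint_dataset()
    print(result)  # xrlint result; no error entries if all rules pass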
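Fallback behaviour of deep_code.utils.helper.open_dataset: the public
'deep-esdl-public' bucket is tried first, then the authenticated team bucket
configured via the S3_USER_STORAGE_* environment variables. Custom
configurations can replace the defaults, mirroring test_with_custom_configs;
a sketch (the dataset ID 'my-cube.zarr' is a hypothetical local Zarr path):

    from deep_code.utils.helper import open_dataset

    # Hypothetical local configuration using xcube's 'file' data store.
    local_configs = [
        {
            "description": "Local store",
            "params": {
                "storage_type": "file",
                "root": ".",
                "storage_options": {},
            },
        }
    ]
    ds = open_dataset("my-cube.zarr", storage_configs=local_configs)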