From 8a8382352d27066c427dada5e43ee0869a26a790 Mon Sep 17 00:00:00 2001 From: tejas Date: Wed, 19 Mar 2025 12:01:53 +0100 Subject: [PATCH 01/43] initial commit --- deep_code/tools/new.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/deep_code/tools/new.py b/deep_code/tools/new.py index 3d1ed1e..f54d0b2 100644 --- a/deep_code/tools/new.py +++ b/deep_code/tools/new.py @@ -1,5 +1,13 @@ +#!/usr/bin/env python3 + +# Copyright (c) 2025 by Brockmann Consult GmbH +# Permissions are hereby granted under the terms of the MIT License: +# https://opensource.org/licenses/MIT. + """Logic for initializing repositories Initialize a GitHub repository with the proposed configurations files, an initial workflow notebook template (e.g. workflow.ipynb), a template Python package (code and pyproject.toml), and a template setup for documentation (e.g., using mkdocs), -setup of thebuild pipeline""" +setup of the build pipeline""" + + From 3091bc77e75218d577918131dde3d55ba326fa49 Mon Sep 17 00:00:00 2001 From: tejas Date: Fri, 7 Mar 2025 13:13:33 +0100 Subject: [PATCH 02/43] current state --- deep_code/constants.py | 4 +- deep_code/tools/publish.py | 52 ++++++++++-- deep_code/utils/dataset_stac_generator.py | 99 +++++++++++++++++++++-- deep_code/utils/github_automation.py | 41 +++++++--- deep_code/utils/osc_extension.py | 27 ++++--- deep_code/version.py | 2 +- 6 files changed, 189 insertions(+), 36 deletions(-) diff --git a/deep_code/constants.py b/deep_code/constants.py index 992ddf4..6649a8c 100644 --- a/deep_code/constants.py +++ b/deep_code/constants.py @@ -4,8 +4,10 @@ # Permissions are hereby granted under the terms of the MIT License: # https://opensource.org/licenses/MIT. 
-OSC_SCHEMA_URI = "https://stac-extensions.github.io/osc/v1.0.0-rc.3/schema.json" +OSC_SCHEMA_URI = "https://stac-extensions.github.io/osc/v1.0.0/schema.json" CF_SCHEMA_URI = "https://stac-extensions.github.io/cf/v0.2.0/schema.json" +THEMES_SCHEMA_URI = "https://stac-extensions.github.io/themes/v1.0.0/schema.json" +OSC_THEME_SCHEME = "https://github.com/stac-extensions/osc#theme" OSC_REPO_OWNER = "ESA-EarthCODE" OSC_REPO_NAME = "open-science-catalog-metadata-testing" OSC_BRANCH_NAME = "add-new-collection" diff --git a/deep_code/tools/publish.py b/deep_code/tools/publish.py index ba762e1..b29aeeb 100644 --- a/deep_code/tools/publish.py +++ b/deep_code/tools/publish.py @@ -6,6 +6,7 @@ import logging from pathlib import Path +from datetime import datetime import fsspec import yaml @@ -43,6 +44,8 @@ def __init__(self): self.github_automation = GitHubAutomation( self.github_username, self.github_token, OSC_REPO_OWNER, OSC_REPO_NAME ) + self.github_automation.fork_repository() + self.github_automation.clone_repository() def publish_files( self, @@ -65,9 +68,6 @@ def publish_files( URL of the created pull request. 
""" try: - logger.info("Forking and cloning repository...") - self.github_automation.fork_repository() - self.github_automation.clone_repository() self.github_automation.create_branch(branch_name) # Add each file to the branch @@ -137,8 +137,16 @@ def publish_dataset(self, dataset_config_path: str): product_path = f"products/{collection_id}/collection.json" file_dict[product_path] = ds_collection.to_dict() + variable_base_catalog_path = f"variables/catalog.json" + variable_catalog_full_path = ( + Path(self.gh_publisher.github_automation.local_clone_dir) + / variable_base_catalog_path + ) # Add or update variable files for var_id in variable_ids: + if var_id in ["crs", "spatial_ref"]: + logger.info(f"Skipping CRS variable: {var_id}") + continue var_file_path = f"variables/{var_id}/catalog.json" if not self.gh_publisher.github_automation.file_exists(var_file_path): logger.info( @@ -147,6 +155,13 @@ def publish_dataset(self, dataset_config_path: str): var_metadata = generator.variables_metadata.get(var_id) var_catalog = generator.build_variable_catalog(var_metadata) file_dict[var_file_path] = var_catalog.to_dict() + logger.info( + f"Add {var_id} child link to variable base catalog" + ) + updated_var_base_catalog = generator.update_variable_base_catalog( + variable_catalog_full_path, var_id + ) + file_dict[variable_base_catalog_path] = updated_var_base_catalog.to_dict() else: logger.info( f"Variable catalog already exists for {var_id}, adding product link." 
@@ -160,11 +175,31 @@ def publish_dataset(self, dataset_config_path: str): ) file_dict[var_file_path] = updated_catalog.to_dict() + """Link product to base product catalog""" + product_catalog_path = f"products/catalog.json" + full_path = ( + Path(self.gh_publisher.github_automation.local_clone_dir) + / product_catalog_path + ) + updated_product_base_catalog = generator.update_product_base_catalog(full_path) + file_dict[product_catalog_path] = updated_product_base_catalog.to_dict() + + #Link product to project catalog + deepesdl_collection_path = \ + f"projects/deep-earth-system-data-lab/collection.json" + deepesdl_collection_full_path = ( + Path(self.gh_publisher.github_automation.local_clone_dir) + / deepesdl_collection_path + ) + updated_deepesdl_collection = generator.update_deepesdl_collection(deepesdl_collection_full_path) + file_dict[deepesdl_collection_path] = updated_deepesdl_collection.to_dict() + # Create branch name, commit message, PR info - branch_name = f"{OSC_BRANCH_NAME}-{collection_id}" + branch_name = f"{OSC_BRANCH_NAME}-{collection_id}-{datetime.now().strftime('%Y%m%d%H%M%S')}" commit_message = f"Add new dataset collection: {collection_id}" - pr_title = "Add new dataset collection" - pr_body = "This PR adds a new dataset collection to the repository." 
+ pr_title = f"Add new dataset collection: {collection_id}" + pr_body = (f"This PR adds a new dataset collection: {collection_id} and it's " + f"corresponding variable catalogs to the repository.") # Publish all files in one go pr_url = self.gh_publisher.publish_files( @@ -231,3 +266,8 @@ def publish_workflow(self, workflow_config_path: str): ) logger.info(f"Pull request created: {pr_url}") + +if __name__ == '__main__': + ds_p = DatasetPublisher() + ds_p.publish_dataset("/home/tejas/bc/projects/deepesdl/deep-code/dataset-config" + ".yaml") diff --git a/deep_code/utils/dataset_stac_generator.py b/deep_code/utils/dataset_stac_generator.py index 3d4da00..6db175e 100644 --- a/deep_code/utils/dataset_stac_generator.py +++ b/deep_code/utils/dataset_stac_generator.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 - +import json # Copyright (c) 2025 by Brockmann Consult GmbH # Permissions are hereby granted under the terms of the MIT License: # https://opensource.org/licenses/MIT. @@ -12,7 +12,9 @@ from pystac import Catalog, Collection, Extent, Link, SpatialExtent, TemporalExtent from xcube.core.store import new_data_store +from deep_code.constants import OSC_THEME_SCHEME from deep_code.utils.osc_extension import OscExtension +from deep_code.utils.ogc_api_record import Theme, ThemeConcept class OscDatasetStacGenerator: @@ -218,10 +220,12 @@ def _add_gcmd_link_to_var_catalog( gcmd_keyword_url = var_metadata.get("gcmd_keyword_url") if not gcmd_keyword_url: self.logger.debug( - f"No gcmd_keyword_url in var_metadata. Skipping adding GCMD link in " - f'the {var_metadata.get("variable_id")} catalog' + f"No gcmd_keyword_url in var_metadata. 
Please input GCMD link " + f"for the {var_metadata.get("variable_id")} catalog" ) - return + gcmd_keyword_url = input( + f"Enter GCMD keyword URL or a similar url for" + f" {var_metadata.get("variable_id")}: ").strip() var_catalog.add_link( Link( rel="via", @@ -312,6 +316,45 @@ def build_variable_catalog(self, var_metadata) -> Catalog: return var_catalog + def update_product_base_catalog(self, product_catalog_path) -> Catalog: + """Link product to base product catalog""" + product_base_catalog = Catalog.from_file(product_catalog_path) + product_base_catalog.add_link( + Link( + rel="child", + target=f"./{self.collection_id}/collection.json", + media_type="application/json", + title=self.collection_id, + ) + ) + return product_base_catalog + + def update_variable_base_catalog(self, variable_base_catalog_path, var_id) -> ( + Catalog): + """Link product to base product catalog""" + variable_base_catalog = Catalog.from_file(variable_base_catalog_path) + variable_base_catalog.add_link( + Link( + rel="child", + target=f"./{var_id}/collection.json", + media_type="application/json", + title=self.collection_id, + ) + ) + return variable_base_catalog + + def update_deepesdl_collection(self, deepesdl_collection_full_path): + deepesdl_collection = Collection.from_file(deepesdl_collection_full_path) + deepesdl_collection.add_link( + Link( + rel="child", + target=f"../../products/{self.collection_id}/collection.json", + media_type="application/json", + title=self.collection_id, + ) + ) + return deepesdl_collection + def update_existing_variable_catalog(self, var_file_path, var_id) -> Catalog: existing_catalog = Catalog.from_file(var_file_path) now_iso = datetime.now(timezone.utc).isoformat() @@ -335,6 +378,25 @@ def update_existing_variable_catalog(self, var_file_path, var_id) -> Catalog: return existing_catalog + @staticmethod + def format_string(s): + return s.capitalize() + + @staticmethod + def build_theme(osc_themes: list[str]) -> Theme: + """Convert each string into a 
ThemeConcept + """ + concepts = [ThemeConcept(id=theme_str) for theme_str in osc_themes] + return Theme(concepts=concepts, scheme=OSC_THEME_SCHEME) + + def build_theme_links(self) -> Link: + return Link( + rel="related", + target="../../themes/catalog.json", + media_type="application/json", + title="Theme: Land" + ) + def build_dataset_stac_collection(self) -> Collection: """Build an OSC STAC Collection for the dataset. @@ -363,7 +425,6 @@ def build_dataset_stac_collection(self) -> Collection: osc_extension.osc_type = "product" osc_extension.osc_status = self.osc_status osc_extension.osc_region = self.osc_region - osc_extension.osc_themes = self.osc_themes osc_extension.osc_variables = variables osc_extension.osc_missions = self.osc_missions if self.cf_params: @@ -400,12 +461,13 @@ def build_dataset_stac_collection(self) -> Collection: title="Products", ) ) + # Add variables ref for var in variables: collection.add_link( Link( rel="related", - target=f"../../varibales/{var}/catalog.json", + target=f"../../variables/{var}/catalog.json", media_type="application/json", title="Variable: " + var, ) @@ -417,6 +479,31 @@ def build_dataset_stac_collection(self) -> Collection: ) collection.set_self_href(self_href) + # align with themes instead of osc:themes + if self.osc_themes: + theme_obj = self.build_theme(self.osc_themes) + collection.extra_fields["themes"] = [theme_obj] + + for theme in self.osc_themes: + formated_theme = self.format_string(theme) + collection.add_link( + Link( + rel="related", + target=f"../../themes/{theme}/catalog.json", + media_type="application/json", + title=f"Theme: {formated_theme}", + ) + ) + + collection.add_link( + Link( + rel="related", + target=f"../../projects/deep-earth-system-data-lab/collection.json", + media_type="application/json", + title=f"Project: DeepESDL" + ) + ) + # Validate OSC extension fields try: osc_extension.validate_extension() diff --git a/deep_code/utils/github_automation.py b/deep_code/utils/github_automation.py index 
dbecfe4..0eb91f5 100644 --- a/deep_code/utils/github_automation.py +++ b/deep_code/utils/github_automation.py @@ -50,15 +50,15 @@ def clone_repository(self): subprocess.run( ["git", "clone", self.fork_repo_url, self.local_clone_dir], check=True ) - os.chdir(self.local_clone_dir) + # os.chdir(self.local_clone_dir) except subprocess.CalledProcessError as e: raise RuntimeError(f"Failed to clone repository: {e}") - @staticmethod - def create_branch(branch_name: str): + def create_branch(self, branch_name: str): """Create a new branch in the local repository.""" logging.info(f"Creating new branch: {branch_name}...") try: + os.chdir(self.local_clone_dir) subprocess.run(["git", "checkout", "-b", branch_name], check=True) except subprocess.CalledProcessError as e: raise RuntimeError(f"Failed Creating branch: '{branch_name}': {e}") @@ -66,22 +66,35 @@ def create_branch(branch_name: str): def add_file(self, file_path: str, content): """Add a new file to the local repository.""" logging.info(f"Adding new file: {file_path}...") + os.chdir(self.local_clone_dir) # Ensure we are in the Git repository full_path = Path(self.local_clone_dir) / file_path full_path.parent.mkdir(parents=True, exist_ok=True) + + # Ensure content is serializable + if hasattr(content, "to_dict"): + content = content.to_dict() + if not isinstance(content, (dict, list, str, int, float, bool, type(None))): + raise TypeError(f"Cannot serialize content of type {type(content)}") + + # Serialize to JSON + try: + json_content = json.dumps(content, indent=2, default=self.serialize) + except TypeError as e: + raise RuntimeError(f"JSON serialization failed: {e}") + with open(full_path, "w") as f: - # Convert content to dictionary if it's a PySTAC object - if hasattr(content, "to_dict"): - content = content.to_dict() - f.write(json.dumps(content, indent=2)) + f.write(json_content) + + # Git add the file try: subprocess.run(["git", "add", str(full_path)], check=True) except subprocess.CalledProcessError as e: raise 
RuntimeError(f"Failed to add file '{file_path}': {e}") - @staticmethod - def commit_and_push(branch_name: str, commit_message: str): + def commit_and_push(self, branch_name: str, commit_message: str): """Commit changes and push to the forked repository.""" logging.info("Committing and pushing changes...") + os.chdir(self.local_clone_dir) try: subprocess.run(["git", "commit", "-m", commit_message], check=True) subprocess.run(["git", "push", "-u", "origin", branch_name], check=True) @@ -93,6 +106,7 @@ def create_pull_request( ): """Create a pull request from the forked repository to the base repository.""" logging.info("Creating a pull request...") + os.chdir(self.local_clone_dir) url = f"https://api.github.com/repos/{self.repo_owner}/{self.repo_name}/pulls" headers = {"Authorization": f"token {self.token}"} data = { @@ -120,3 +134,12 @@ def file_exists(self, file_path) -> bool: exists = os.path.isfile(full_path) logging.debug(f"Checking existence of {full_path}: {exists}") return exists + + # Check and convert any non-serializable objects + def serialize(self, obj): + if isinstance(obj, set): + return list(obj) # Convert sets to lists + if hasattr(obj, "__dict__"): + return obj.__dict__ # Convert objects with attributes to dicts + raise TypeError( + f"Object of type {type(obj).__name__} is not JSON serializable") \ No newline at end of file diff --git a/deep_code/utils/osc_extension.py b/deep_code/utils/osc_extension.py index 8a777de..4f269b9 100644 --- a/deep_code/utils/osc_extension.py +++ b/deep_code/utils/osc_extension.py @@ -10,7 +10,8 @@ from pystac import Extent, SpatialExtent, TemporalExtent from pystac.extensions.base import ExtensionManagementMixin, PropertiesExtension -from deep_code.constants import CF_SCHEMA_URI, OSC_SCHEMA_URI +from deep_code.constants import CF_SCHEMA_URI, OSC_SCHEMA_URI, THEMES_SCHEMA_URI + class OscExtension( @@ -63,17 +64,17 @@ def osc_project(self) -> str | None: def osc_project(self, v: str) -> None: 
self._set_property("osc:project", v, pop_if_none=False) - @property - def osc_themes(self) -> list[str] | None: - return self._get_property("osc:themes", list) - - @osc_themes.setter - def osc_themes(self, value: list[str]) -> None: - if not isinstance(value, list) or not all( - isinstance(item, str) for item in value - ): - raise ValueError("osc:themes must be a list of strings") - self._set_property("osc:themes", value, pop_if_none=False) + # @property + # def osc_themes(self) -> list[str] | None: + # return self._get_property("osc:themes", list) + # + # @osc_themes.setter + # def osc_themes(self, value: list[str]) -> None: + # if not isinstance(value, list) or not all( + # isinstance(item, str) for item in value + # ): + # raise ValueError("osc:themes must be a list of strings") + # self._set_property("osc:themes", value, pop_if_none=False) @property def osc_region(self) -> str | None: @@ -150,7 +151,7 @@ def updated(self, value: str) -> None: @classmethod def get_schema_uri(cls) -> list[str]: - return [OSC_SCHEMA_URI, CF_SCHEMA_URI] + return [OSC_SCHEMA_URI, CF_SCHEMA_URI, THEMES_SCHEMA_URI] @classmethod def ext( diff --git a/deep_code/version.py b/deep_code/version.py index e2b6e0b..30dc845 100644 --- a/deep_code/version.py +++ b/deep_code/version.py @@ -19,4 +19,4 @@ # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER # DEALINGS IN THE SOFTWARE. 
-version = "0.0.1.dev0" +version = "0.0.1.dev1" From 6a795fe2cdbef4311758d88406586288f46fcf66 Mon Sep 17 00:00:00 2001 From: tejas Date: Fri, 7 Mar 2025 14:45:03 +0100 Subject: [PATCH 03/43] introduced new constants and re-adding self-links --- deep_code/constants.py | 2 ++ deep_code/utils/dataset_stac_generator.py | 13 ++++++++++--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/deep_code/constants.py b/deep_code/constants.py index 6649a8c..31491ab 100644 --- a/deep_code/constants.py +++ b/deep_code/constants.py @@ -16,3 +16,5 @@ ) OGC_API_RECORD_SPEC = "http://www.opengis.net/spec/ogcapi-records-1/1.0/req/record-core" WF_BRANCH_NAME = "add-new-workflow-from-deepesdl" +VARIABLE_BASE_CATALOG_SELF_HREF = "https://esa-earthcode.github.io/open-science-catalog-metadata/variables/catalog.json" +PRODUCT_BASE_CATALOG_SELF_HREF = ("https://esa-earthcode.github.io/open-science-catalog-metadata/products/catalog.json") \ No newline at end of file diff --git a/deep_code/utils/dataset_stac_generator.py b/deep_code/utils/dataset_stac_generator.py index 6db175e..cec42e3 100644 --- a/deep_code/utils/dataset_stac_generator.py +++ b/deep_code/utils/dataset_stac_generator.py @@ -12,7 +12,8 @@ from pystac import Catalog, Collection, Extent, Link, SpatialExtent, TemporalExtent from xcube.core.store import new_data_store -from deep_code.constants import OSC_THEME_SCHEME +from deep_code.constants import OSC_THEME_SCHEME, VARIABLE_BASE_CATALOG_SELF_HREF, \ + PRODUCT_BASE_CATALOG_SELF_HREF from deep_code.utils.osc_extension import OscExtension from deep_code.utils.ogc_api_record import Theme, ThemeConcept @@ -195,7 +196,9 @@ def extract_metadata_for_variable(self, variable_data) -> dict: def get_variable_ids(self) -> list[str]: """Get variable IDs for all variables in the dataset.""" - return list(self.variables_metadata.keys()) + variable_ids = list(self.variables_metadata.keys()) + # Remove 'crs' and 'spatial_ref' from the list if they exist + return [var_id for var_id in 
variable_ids if var_id not in ["crs", "spatial_ref"]] def get_variables_metadata(self) -> dict[str, dict]: """Extract metadata for all variables in the dataset.""" @@ -327,6 +330,8 @@ def update_product_base_catalog(self, product_catalog_path) -> Catalog: title=self.collection_id, ) ) + # 'self' link: the direct URL where this JSON is hosted + product_base_catalog.set_self_href(PRODUCT_BASE_CATALOG_SELF_HREF) return product_base_catalog def update_variable_base_catalog(self, variable_base_catalog_path, var_id) -> ( @@ -336,11 +341,13 @@ def update_variable_base_catalog(self, variable_base_catalog_path, var_id) -> ( variable_base_catalog.add_link( Link( rel="child", - target=f"./{var_id}/collection.json", + target=f"./{var_id}/catalog.json", media_type="application/json", title=self.collection_id, ) ) + # 'self' link: the direct URL where this JSON is hosted + variable_base_catalog.set_self_href(VARIABLE_BASE_CATALOG_SELF_HREF) return variable_base_catalog def update_deepesdl_collection(self, deepesdl_collection_full_path): From ba6e895efe6e43dac2b97b74643124917a467e50 Mon Sep 17 00:00:00 2001 From: tejas Date: Fri, 7 Mar 2025 15:32:19 +0100 Subject: [PATCH 04/43] fix updating var base catalog --- deep_code/constants.py | 4 ++- deep_code/tools/publish.py | 27 ++++++++-------- deep_code/utils/dataset_stac_generator.py | 38 +++++++++++++++++------ 3 files changed, 44 insertions(+), 25 deletions(-) diff --git a/deep_code/constants.py b/deep_code/constants.py index 31491ab..10afbca 100644 --- a/deep_code/constants.py +++ b/deep_code/constants.py @@ -17,4 +17,6 @@ OGC_API_RECORD_SPEC = "http://www.opengis.net/spec/ogcapi-records-1/1.0/req/record-core" WF_BRANCH_NAME = "add-new-workflow-from-deepesdl" VARIABLE_BASE_CATALOG_SELF_HREF = "https://esa-earthcode.github.io/open-science-catalog-metadata/variables/catalog.json" -PRODUCT_BASE_CATALOG_SELF_HREF = ("https://esa-earthcode.github.io/open-science-catalog-metadata/products/catalog.json") \ No newline at end of file 
+PRODUCT_BASE_CATALOG_SELF_HREF = "https://esa-earthcode.github.io/open-science-catalog-metadata/products/catalog.json" +DEEPESDL_COLLECTION_SELF_HREF = ("https://esa-earthcode.github.io/open-science-catalog" + "-metadatprojects/deepesdl/collection.json") \ No newline at end of file diff --git a/deep_code/tools/publish.py b/deep_code/tools/publish.py index b29aeeb..da45972 100644 --- a/deep_code/tools/publish.py +++ b/deep_code/tools/publish.py @@ -144,9 +144,9 @@ def publish_dataset(self, dataset_config_path: str): ) # Add or update variable files for var_id in variable_ids: - if var_id in ["crs", "spatial_ref"]: - logger.info(f"Skipping CRS variable: {var_id}") - continue + # if var_id in ["crs", "spatial_ref"]: + # logger.info(f"Skipping CRS variable: {var_id}") + # continue var_file_path = f"variables/{var_id}/catalog.json" if not self.gh_publisher.github_automation.file_exists(var_file_path): logger.info( @@ -155,13 +155,6 @@ def publish_dataset(self, dataset_config_path: str): var_metadata = generator.variables_metadata.get(var_id) var_catalog = generator.build_variable_catalog(var_metadata) file_dict[var_file_path] = var_catalog.to_dict() - logger.info( - f"Add {var_id} child link to variable base catalog" - ) - updated_var_base_catalog = generator.update_variable_base_catalog( - variable_catalog_full_path, var_id - ) - file_dict[variable_base_catalog_path] = updated_var_base_catalog.to_dict() else: logger.info( f"Variable catalog already exists for {var_id}, adding product link." 
@@ -174,7 +167,17 @@ def publish_dataset(self, dataset_config_path: str): full_path, var_id ) file_dict[var_file_path] = updated_catalog.to_dict() + # logger.info( + # f"Add {var_id} child link to variable base catalog" + # ) + # file_dict[ + # variable_base_catalog_path] = generator.update_variable_base_catalog( + # variable_catalog_full_path, var_id).to_dict() + + file_dict[variable_base_catalog_path] = generator.update_variable_base_catalog( + variable_catalog_full_path, variable_ids + ).to_dict() """Link product to base product catalog""" product_catalog_path = f"products/catalog.json" full_path = ( @@ -267,7 +270,3 @@ def publish_workflow(self, workflow_config_path: str): logger.info(f"Pull request created: {pr_url}") -if __name__ == '__main__': - ds_p = DatasetPublisher() - ds_p.publish_dataset("/home/tejas/bc/projects/deepesdl/deep-code/dataset-config" - ".yaml") diff --git a/deep_code/utils/dataset_stac_generator.py b/deep_code/utils/dataset_stac_generator.py index cec42e3..a98ff58 100644 --- a/deep_code/utils/dataset_stac_generator.py +++ b/deep_code/utils/dataset_stac_generator.py @@ -13,7 +13,7 @@ from xcube.core.store import new_data_store from deep_code.constants import OSC_THEME_SCHEME, VARIABLE_BASE_CATALOG_SELF_HREF, \ - PRODUCT_BASE_CATALOG_SELF_HREF + PRODUCT_BASE_CATALOG_SELF_HREF, DEEPESDL_COLLECTION_SELF_HREF from deep_code.utils.osc_extension import OscExtension from deep_code.utils.ogc_api_record import Theme, ThemeConcept @@ -310,6 +310,8 @@ def build_variable_catalog(self, var_metadata) -> Catalog: # Add gcmd link for the variable definition self._add_gcmd_link_to_var_catalog(var_catalog, var_metadata) + self.add_themes_as_related_links_var_catalog(var_catalog) + self_href = ( f"https://esa-earthcode.github.io/open-science-catalog-metadata/variables" f"/{var_id}/catalog.json" @@ -334,22 +336,36 @@ def update_product_base_catalog(self, product_catalog_path) -> Catalog: product_base_catalog.set_self_href(PRODUCT_BASE_CATALOG_SELF_HREF) return 
product_base_catalog - def update_variable_base_catalog(self, variable_base_catalog_path, var_id) -> ( + def update_variable_base_catalog(self, variable_base_catalog_path, variable_ids) \ + -> ( Catalog): """Link product to base product catalog""" variable_base_catalog = Catalog.from_file(variable_base_catalog_path) - variable_base_catalog.add_link( - Link( - rel="child", - target=f"./{var_id}/catalog.json", - media_type="application/json", - title=self.collection_id, + for var_id in variable_ids: + variable_base_catalog.add_link( + Link( + rel="child", + target=f"./{var_id}/catalog.json", + media_type="application/json", + title=var_id, + ) ) - ) # 'self' link: the direct URL where this JSON is hosted variable_base_catalog.set_self_href(VARIABLE_BASE_CATALOG_SELF_HREF) return variable_base_catalog + def add_themes_as_related_links_var_catalog(self, var_catalog): + """Add themes as related links to variable catalog""" + for theme in self.osc_themes: + var_catalog.add_link( + Link( + rel="related", + target="../../themes/oceans/catalog.json", + media_type="application/json", + title=f"Theme: {self.format_string(theme)}", + ) + ) + def update_deepesdl_collection(self, deepesdl_collection_full_path): deepesdl_collection = Collection.from_file(deepesdl_collection_full_path) deepesdl_collection.add_link( @@ -360,6 +376,7 @@ def update_deepesdl_collection(self, deepesdl_collection_full_path): title=self.collection_id, ) ) + deepesdl_collection.set_self_href(DEEPESDL_COLLECTION_SELF_HREF) return deepesdl_collection def update_existing_variable_catalog(self, var_file_path, var_id) -> Catalog: @@ -376,6 +393,7 @@ def update_existing_variable_catalog(self, var_file_path, var_id) -> Catalog: title=self.collection_id, ) ) + self.add_themes_as_related_links_var_catalog(existing_catalog) self_href = ( f"https://esa-earthcode.github.io/open-science-catalog-metadata/variables" f"/{var_id}/catalog.json" @@ -482,7 +500,7 @@ def build_dataset_stac_collection(self) -> Collection: 
self_href = ( "https://esa-earthcode.github.io/" - "open-science-catalog-metadata/products/deepesdl/collection.json" + f"open-science-catalog-metadata/products/{self.collection_id}/collection.json" ) collection.set_self_href(self_href) From 8d0a2b47a76a202a2b6ee20df6eab7f981d4b1c6 Mon Sep 17 00:00:00 2001 From: tejas Date: Fri, 7 Mar 2025 16:03:19 +0100 Subject: [PATCH 05/43] refactored add themes as links --- deep_code/constants.py | 5 +++-- deep_code/utils/dataset_stac_generator.py | 12 +++++++++++- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/deep_code/constants.py b/deep_code/constants.py index 10afbca..e346c0c 100644 --- a/deep_code/constants.py +++ b/deep_code/constants.py @@ -18,5 +18,6 @@ WF_BRANCH_NAME = "add-new-workflow-from-deepesdl" VARIABLE_BASE_CATALOG_SELF_HREF = "https://esa-earthcode.github.io/open-science-catalog-metadata/variables/catalog.json" PRODUCT_BASE_CATALOG_SELF_HREF = "https://esa-earthcode.github.io/open-science-catalog-metadata/products/catalog.json" -DEEPESDL_COLLECTION_SELF_HREF = ("https://esa-earthcode.github.io/open-science-catalog" - "-metadatprojects/deepesdl/collection.json") \ No newline at end of file +DEEPESDL_COLLECTION_SELF_HREF = ( + "https://esa-earthcode.github.io/open-science-catalog-metadata/projects/deepesdl" + "/collection.json") \ No newline at end of file diff --git a/deep_code/utils/dataset_stac_generator.py b/deep_code/utils/dataset_stac_generator.py index a98ff58..4b2f10a 100644 --- a/deep_code/utils/dataset_stac_generator.py +++ b/deep_code/utils/dataset_stac_generator.py @@ -360,7 +360,7 @@ def add_themes_as_related_links_var_catalog(self, var_catalog): var_catalog.add_link( Link( rel="related", - target="../../themes/oceans/catalog.json", + target=f"../../themes/{theme}/catalog.json", media_type="application/json", title=f"Theme: {self.format_string(theme)}", ) @@ -376,6 +376,16 @@ def update_deepesdl_collection(self, deepesdl_collection_full_path): title=self.collection_id, ) ) + # add 
themes to deepesdl + for theme in self.osc_themes: + deepesdl_collection.add_link( + Link( + rel="related", + target=f"../../themes/cryosphere/catalog.json", + media_type="application/json", + title=f"Theme: {self.format_string(theme)}" + ) + ) deepesdl_collection.set_self_href(DEEPESDL_COLLECTION_SELF_HREF) return deepesdl_collection From eae7f533226a1037cdf3d86d7cec835832b26553 Mon Sep 17 00:00:00 2001 From: tejas Date: Sun, 9 Mar 2025 10:34:49 +0100 Subject: [PATCH 06/43] refactor --- deep_code/tools/publish.py | 33 +++++++++++++++++++++++ deep_code/utils/dataset_stac_generator.py | 27 ++++++++++--------- 2 files changed, 47 insertions(+), 13 deletions(-) diff --git a/deep_code/tools/publish.py b/deep_code/tools/publish.py index da45972..d9f13da 100644 --- a/deep_code/tools/publish.py +++ b/deep_code/tools/publish.py @@ -10,6 +10,7 @@ import fsspec import yaml +from pystac import Catalog from deep_code.constants import ( OSC_BRANCH_NAME, @@ -99,6 +100,36 @@ def __init__(self): # Composition self.gh_publisher = GitHubPublisher() + @staticmethod + def clean_title(title: str) -> str: + """Clean up titles by replacing Unicode escape sequences with standard characters.""" + title = title.replace('\u00a0', + ' ') # Replace non-breaking space with normal space + title = title.replace('\u00b0', + '°') # Replace unicode degree symbol with actual degree symbol + return title + + def clean_catalog_titles(self, catalog: Catalog): + """Recursively clean all titles in the catalog.""" + # Clean title for the catalog itself + if isinstance(catalog.title, str): + catalog.title = self.clean_title(catalog.title) + + # Clean titles in all links of the catalog + for link in catalog.links: + if isinstance(link.title, str): + link.title = self.clean_title(link.title) + + for link in catalog.links: + if link.rel == 'child': + try: + # If the link points to another catalog or collection, clean it recursively + child_catalog = Catalog.from_file(link.href) + 
self.clean_catalog_titles(child_catalog) + except Exception as e: + # If the link doesn't point to a valid catalog file, skip it + pass + def publish_dataset(self, dataset_config_path: str): """Publish a product collection to the specified GitHub repository.""" with fsspec.open(dataset_config_path, "r") as file: @@ -185,6 +216,8 @@ def publish_dataset(self, dataset_config_path: str): / product_catalog_path ) updated_product_base_catalog = generator.update_product_base_catalog(full_path) + # clean special characters + self.clean_catalog_titles(updated_product_base_catalog) file_dict[product_catalog_path] = updated_product_base_catalog.to_dict() #Link product to project catalog diff --git a/deep_code/utils/dataset_stac_generator.py b/deep_code/utils/dataset_stac_generator.py index 4b2f10a..39e586c 100644 --- a/deep_code/utils/dataset_stac_generator.py +++ b/deep_code/utils/dataset_stac_generator.py @@ -322,21 +322,22 @@ def build_variable_catalog(self, var_metadata) -> Catalog: return var_catalog def update_product_base_catalog(self, product_catalog_path) -> Catalog: - """Link product to base product catalog""" - product_base_catalog = Catalog.from_file(product_catalog_path) - product_base_catalog.add_link( - Link( - rel="child", - target=f"./{self.collection_id}/collection.json", - media_type="application/json", - title=self.collection_id, + """Link product to base product catalog""" + product_base_catalog = Catalog.from_file(product_catalog_path) + product_base_catalog.add_link( + Link( + rel="child", + target=f"./{self.collection_id}/collection.json", + media_type="application/json", + title=self.collection_id, + ) ) - ) - # 'self' link: the direct URL where this JSON is hosted - product_base_catalog.set_self_href(PRODUCT_BASE_CATALOG_SELF_HREF) - return product_base_catalog + # 'self' link: the direct URL where this JSON is hosted + product_base_catalog.set_self_href(PRODUCT_BASE_CATALOG_SELF_HREF) + return product_base_catalog - def 
update_variable_base_catalog(self, variable_base_catalog_path, variable_ids) \ + @staticmethod + def update_variable_base_catalog(variable_base_catalog_path, variable_ids) \ -> ( Catalog): """Link product to base product catalog""" From ffed6702f6263c11d62ba6209837af5cc8a0c870 Mon Sep 17 00:00:00 2001 From: tejas Date: Mon, 10 Mar 2025 19:30:35 +0100 Subject: [PATCH 07/43] implemented ogc api record generation for workflow and experiment --- deep_code/cli/publish.py | 3 +- deep_code/constants.py | 3 +- deep_code/tools/publish.py | 98 ++++++++++++++---- deep_code/utils/dataset_stac_generator.py | 14 +-- deep_code/utils/ogc_api_record.py | 121 ++++++++++++++++++++-- deep_code/utils/ogc_record_generator.py | 8 +- 6 files changed, 203 insertions(+), 44 deletions(-) diff --git a/deep_code/cli/publish.py b/deep_code/cli/publish.py index a3d81d0..bdeaee9 100644 --- a/deep_code/cli/publish.py +++ b/deep_code/cli/publish.py @@ -23,4 +23,5 @@ def publish_dataset(dataset_config): def publish_workflow(workflow_metadata): workflow_publisher = WorkflowPublisher() - workflow_publisher.publish_workflow(workflow_config_path=workflow_metadata) + workflow_publisher.publish_workflow_experiment( + workflow_config_path=workflow_metadata) diff --git a/deep_code/constants.py b/deep_code/constants.py index e346c0c..59a993c 100644 --- a/deep_code/constants.py +++ b/deep_code/constants.py @@ -20,4 +20,5 @@ PRODUCT_BASE_CATALOG_SELF_HREF = "https://esa-earthcode.github.io/open-science-catalog-metadata/products/catalog.json" DEEPESDL_COLLECTION_SELF_HREF = ( "https://esa-earthcode.github.io/open-science-catalog-metadata/projects/deepesdl" - "/collection.json") \ No newline at end of file + "/collection.json") +BASE_URL_OSC = "https://esa-earthcode.github.io/open-science-catalog-metadata" \ No newline at end of file diff --git a/deep_code/tools/publish.py b/deep_code/tools/publish.py index d9f13da..72e7fb7 100644 --- a/deep_code/tools/publish.py +++ b/deep_code/tools/publish.py @@ -1,5 +1,6 @@ 
#!/usr/bin/env python3 - +import copy +import json # Copyright (c) 2025 by Brockmann Consult GmbH # Permissions are hereby granted under the terms of the MIT License: # https://opensource.org/licenses/MIT. @@ -20,7 +21,8 @@ ) from deep_code.utils.dataset_stac_generator import OscDatasetStacGenerator from deep_code.utils.github_automation import GitHubAutomation -from deep_code.utils.ogc_api_record import OgcRecord +from deep_code.utils.ogc_api_record import WorkflowAsOgcRecord, \ + ExperimentAsOgcRecord from deep_code.utils.ogc_record_generator import OSCWorkflowOGCApiRecordGenerator logger = logging.getLogger(__name__) @@ -259,7 +261,23 @@ def __init__(self): def _normalize_name(name: str | None) -> str | None: return name.replace(" ", "-").lower() if name else None - def publish_workflow(self, workflow_config_path: str): + @staticmethod + def _write_to_file(file_path: str, data: dict): + """Write a dictionary to a JSON file. + + Args: + file_path (str): The path to the file. + data (dict): The data to write. 
+ """ + # Create the directory if it doesn't exist + Path(file_path).parent.mkdir(parents=True, exist_ok=True) + + # Write the data to the file + with open(file_path, "w") as file: + json.dump(data, file, indent=4) + logger.info(f"File written to {file_path}") + + def publish_workflow_experiment(self, workflow_config_path: str, write_to_file: bool = False): with fsspec.open(workflow_config_path, "r") as file: workflow_config = yaml.safe_load(file) or {} @@ -270,36 +288,72 @@ def publish_workflow(self, workflow_config_path: str): properties_list = workflow_config.get("properties", []) contacts = workflow_config.get("contact", []) links = workflow_config.get("links", []) + jupyter_notebook_url = workflow_config.get("jupyter_notebook_url") logger.info("Generating OGC API Record for the workflow...") rg = OSCWorkflowOGCApiRecordGenerator() - wf_record_properties = rg.build_record_properties(properties_list, contacts) - - ogc_record = OgcRecord( + wf_record_properties = rg.build_record_properties(properties_list, contacts, + caller="WorkflowAsOgcRecord") + workflow_record = WorkflowAsOgcRecord( id=workflow_id, type="Feature", - time={}, properties=wf_record_properties, links=links, + jupyter_notebook_url=jupyter_notebook_url ) + # Convert to dictionary and remove jupyter_notebook_url + workflow_dict = workflow_record.to_dict() + if "jupyter_notebook_url" in workflow_dict: + del workflow_dict["jupyter_notebook_url"] - file_path = f"workflow/{workflow_id}/collection.json" - - # Prepare the single file dict - file_dict = {file_path: ogc_record.to_dict()} + wf_file_path = f"workflow/{workflow_id}/record.json" + file_dict = {wf_file_path: workflow_dict} - branch_name = f"{WF_BRANCH_NAME}-{workflow_id}" - commit_message = f"Add new workflow: {workflow_id}" - pr_title = "Add new workflow" - pr_body = "This PR adds a new workflow to the OSC repository." 
+ # Build properties for the experiment record + exp_record_properties = copy.deepcopy(wf_record_properties) + exp_record_properties.type = "experiment" - pr_url = self.gh_publisher.publish_files( - branch_name=branch_name, - file_dict=file_dict, - commit_message=commit_message, - pr_title=pr_title, - pr_body=pr_body, + experiment_record = ExperimentAsOgcRecord( + id=workflow_id, + type="Feature", + properties=exp_record_properties, + links=links, + jupyter_notebook_url=jupyter_notebook_url ) + # Convert to dictionary and remove jupyter_notebook_url + experiment_dict = experiment_record.to_dict() + if "jupyter_notebook_url" in experiment_dict: + del experiment_dict["jupyter_notebook_url"] + exp_file_path = f"experiments/{workflow_id}/record.json" + file_dict[exp_file_path] = experiment_dict + + # Write to files if testing + if write_to_file: + self._write_to_file(wf_file_path, workflow_record.to_dict()) + self._write_to_file(exp_file_path, experiment_record.to_dict()) + + # Publish to GitHub if not testing + if not write_to_file: + branch_name = f"{WF_BRANCH_NAME}-{workflow_id}" + commit_message = f"Adding workflow from DeepESDL: {workflow_id}" + pr_title = f"Add workflow and Experiment from DeepESDL: {workflow_id}" + pr_body = "This PR adds a new workflow/experiment to the OSC repository." 
+ + pr_url = self.gh_publisher.publish_files( + branch_name=branch_name, + file_dict=file_dict, + commit_message=commit_message, + pr_title=pr_title, + pr_body=pr_body, + ) - logger.info(f"Pull request created: {pr_url}") + logger.info(f"Pull request created: {pr_url}") +if __name__ == '__main__': + # Example usage for testing + publisher = WorkflowPublisher() + publisher.publish_workflow_experiment( + workflow_config_path="/home/tejas/bc/projects/deepesdl/deep-code/workflow" + "-config.yaml", + write_to_file=True + ) \ No newline at end of file diff --git a/deep_code/utils/dataset_stac_generator.py b/deep_code/utils/dataset_stac_generator.py index 39e586c..d708003 100644 --- a/deep_code/utils/dataset_stac_generator.py +++ b/deep_code/utils/dataset_stac_generator.py @@ -425,13 +425,13 @@ def build_theme(osc_themes: list[str]) -> Theme: concepts = [ThemeConcept(id=theme_str) for theme_str in osc_themes] return Theme(concepts=concepts, scheme=OSC_THEME_SCHEME) - def build_theme_links(self) -> Link: - return Link( - rel="related", - target="../../themes/catalog.json", - media_type="application/json", - title="Theme: Land" - ) + # def build_theme_links(self) -> Link: + # return Link( + # rel="related", + # target="../../themes/catalog.json", + # media_type="application/json", + # title="Theme: Land" + # ) def build_dataset_stac_collection(self) -> Collection: """Build an OSC STAC Collection for the dataset. 
diff --git a/deep_code/utils/ogc_api_record.py b/deep_code/utils/ogc_api_record.py index 437c2c8..bf2fee3 100644 --- a/deep_code/utils/ogc_api_record.py +++ b/deep_code/utils/ogc_api_record.py @@ -1,9 +1,9 @@ from typing import Any, Optional from xrlint.util.constructible import MappingConstructible -from xrlint.util.serializable import JsonSerializable +from xrlint.util.serializable import JsonSerializable, JsonValue -from deep_code.constants import OGC_API_RECORD_SPEC +from deep_code.constants import OGC_API_RECORD_SPEC, BASE_URL_OSC class Contact(MappingConstructible["Contact"], JsonSerializable): @@ -30,11 +30,10 @@ def __init__(self, id: str): class Theme(MappingConstructible["Theme"], JsonSerializable): - def __init__(self, concepts: list[ThemeConcept], scheme: str): + def __init__(self, concepts: list, scheme: str): self.concepts = concepts self.scheme = scheme - class JupyterKernelInfo(MappingConstructible["RecordProperties"], JsonSerializable): def __init__(self, name: str, python_version: float, env_file: str): self.name = name @@ -70,25 +69,129 @@ def __init__( self.license = license -class OgcRecord(MappingConstructible["OgcRecord"], JsonSerializable): +class ExperimentAsOgcRecord(MappingConstructible["OgcRecord"], JsonSerializable): + def __init__( + self, + id: str, + type: str, + jupyter_notebook_url: str, + properties: RecordProperties, + links: list[dict], + linkTemplates: list = [], + conformsTo: list[str] = None, + geometry: Optional[Any] = None + ): + if conformsTo is None: + conformsTo = [OGC_API_RECORD_SPEC] + self.id = id + self.type = type + self.conformsTo = conformsTo + self.jupyter_notebook_url = jupyter_notebook_url + self.geometry = geometry + self.properties = properties + self.linkTemplates = linkTemplates + self.links = self._generate_static_links() + links + + def _generate_static_links(self): + """Generates static links (root and parent) for the record.""" + return [ + { + "rel": "root", + "href": "../../catalog.json", + "type": 
"application/json", + "title": "Open Science Catalog" + }, + { + "rel": "parent", + "href": "../catalog.json", + "type": "application/json", + "title": "Experiments" + }, + { + "rel": "related", + "href": f"../../workflows/{self.id}/record.json", + "type": "application/json", + "title": "Workflow: POLARIS" + }, + { + "rel": "related", + "href": "../../projects/deepesdl/collection.json", + "type": "application/json", + "title": "Project: DeepESDL" + }, + { + "rel": "input", + "href": "./input.yaml", + "type": "application/yaml", + "title": "Input parameters" + }, + { + "rel": "environment", + "href": "./environment.yaml", + "type": "application/yaml", + "title": "Execution environment" + }, + { + "rel": "self", + "href": f"{BASE_URL_OSC}/experiments/{self.id}/record.json", + "type": "application/json" + } + ] + +class WorkflowAsOgcRecord(MappingConstructible["OgcRecord"], JsonSerializable): def __init__( self, id: str, type: str, - time: dict, + jupyter_notebook_url: str, properties: RecordProperties, links: list[dict], linkTemplates: list = [], conformsTo: list[str] = None, - geometry: Optional[Any] = None, + geometry: Optional[Any] = None ): if conformsTo is None: conformsTo = [OGC_API_RECORD_SPEC] self.id = id self.type = type self.conformsTo = conformsTo - self.time = time + self.jupyter_notebook_url = jupyter_notebook_url self.geometry = geometry self.properties = properties self.linkTemplates = linkTemplates - self.links = links + self.links = self._generate_static_links() + links + + def _generate_static_links(self): + """Generates static links (root and parent) for the record.""" + return [ + { + "rel": "root", + "href": "../../catalog.json", + "type": "application/json", + "title": "Open Science Catalog" + }, + { + "rel": "parent", + "href": "../catalog.json", + "type": "application/json", + "title": "Workflows" + }, + { + "rel": "child", + "href": f"../../experiments/{self.id}/record.json", + "type": "application/json", + "title": f"{self.id}" + }, + { + 
"rel": "jupyter-notebook", + "type": "application/json", + "title": "Jupyter Notebook", + "href": f"{self.jupyter_notebook_url}" + }, + + { + "rel": "self", + "href": f"{BASE_URL_OSC}/workflows/{self.id}/record.json", + "type": "application/json" + } + ] \ No newline at end of file diff --git a/deep_code/utils/ogc_record_generator.py b/deep_code/utils/ogc_record_generator.py index 481663f..84f8c81 100644 --- a/deep_code/utils/ogc_record_generator.py +++ b/deep_code/utils/ogc_record_generator.py @@ -6,12 +6,12 @@ from datetime import datetime, timezone -from deep_code.constants import DEFAULT_THEME_SCHEME +from deep_code.constants import DEFAULT_THEME_SCHEME, OSC_THEME_SCHEME from deep_code.utils.ogc_api_record import ( Contact, RecordProperties, Theme, - ThemeConcept, + ThemeConcept ) @@ -37,9 +37,9 @@ def build_theme(osc_themes: list[str]) -> Theme: """Convert each string into a ThemeConcept """ concepts = [ThemeConcept(id=theme_str) for theme_str in osc_themes] - return Theme(concepts=concepts, scheme=DEFAULT_THEME_SCHEME) + return Theme(concepts=concepts, scheme=OSC_THEME_SCHEME) - def build_record_properties(self, properties, contacts) -> RecordProperties: + def build_record_properties(self, properties, contacts, caller: type) -> RecordProperties: """Build a RecordProperties object from a list of single-key property dicts """ now_iso = datetime.now(timezone.utc).isoformat() From 846e47410977a1c0ecad682c7ce970b3b836e823 Mon Sep 17 00:00:00 2001 From: tejas Date: Mon, 10 Mar 2025 23:15:16 +0100 Subject: [PATCH 08/43] implementation works to generate valid ogc api records for experiments and workflows --- deep_code/cli/publish.py | 4 +- deep_code/tests/tools/test_publish.py | 8 +- deep_code/tools/publish.py | 190 +++++++++++++----------- deep_code/utils/ogc_api_record.py | 134 ++++++++++++----- deep_code/utils/ogc_record_generator.py | 27 +++- 5 files changed, 230 insertions(+), 133 deletions(-) diff --git a/deep_code/cli/publish.py b/deep_code/cli/publish.py 
index bdeaee9..771eade 100644 --- a/deep_code/cli/publish.py +++ b/deep_code/cli/publish.py @@ -6,7 +6,7 @@ import click -from deep_code.tools.publish import DatasetPublisher, WorkflowPublisher +from deep_code.tools.publish import Publisher, WorkflowPublisher @click.command(name="publish-dataset") @@ -14,7 +14,7 @@ def publish_dataset(dataset_config): """Request publishing a dataset to the open science catalogue. """ - publisher = DatasetPublisher() + publisher = Publisher() publisher.publish_dataset(dataset_config_path=dataset_config) diff --git a/deep_code/tests/tools/test_publish.py b/deep_code/tests/tools/test_publish.py index 3e0f5e8..ff5d1ef 100644 --- a/deep_code/tests/tools/test_publish.py +++ b/deep_code/tests/tools/test_publish.py @@ -2,7 +2,7 @@ import pytest -from deep_code.tools.publish import DatasetPublisher +from deep_code.tools.publish import Publisher class TestDatasetPublisher: @@ -15,7 +15,7 @@ def test_init_missing_credentials(self, mock_fsspec_open): with pytest.raises( ValueError, match="GitHub credentials are missing in the `.gitaccess` file." ): - DatasetPublisher() + Publisher() @patch("deep_code.tools.publish.fsspec.open") def test_publish_dataset_missing_ids(self, mock_fsspec_open): @@ -31,7 +31,7 @@ def test_publish_dataset_missing_ids(self, mock_fsspec_open): mock_open(read_data=dataset_yaml_content)(), ] - publisher = DatasetPublisher() + publisher = Publisher() with pytest.raises( ValueError, match="Dataset ID or Collection ID missing in the config." 
@@ -106,7 +106,7 @@ def test_publish_dataset_success( ) # Instantiate & publish - publisher = DatasetPublisher() + publisher = Publisher() publisher.publish_dataset("/fake/path/to/dataset-config.yaml") # Assert that we called git clone with /tmp/temp_repo diff --git a/deep_code/tools/publish.py b/deep_code/tools/publish.py index 72e7fb7..537f1f0 100644 --- a/deep_code/tools/publish.py +++ b/deep_code/tools/publish.py @@ -22,7 +22,7 @@ from deep_code.utils.dataset_stac_generator import OscDatasetStacGenerator from deep_code.utils.github_automation import GitHubAutomation from deep_code.utils.ogc_api_record import WorkflowAsOgcRecord, \ - ExperimentAsOgcRecord + ExperimentAsOgcRecord, LinksBuilder from deep_code.utils.ogc_record_generator import OSCWorkflowOGCApiRecordGenerator logger = logging.getLogger(__name__) @@ -93,67 +93,84 @@ def publish_files( self.github_automation.clean_up() -class DatasetPublisher: +class Publisher: """Publishes products (datasets) to the OSC GitHub repository. Inherits from BasePublisher for GitHub publishing logic. 
""" - def __init__(self): + def __init__(self, dataset_config_path: str, workflow_config_path: str): # Composition self.gh_publisher = GitHubPublisher() - - @staticmethod - def clean_title(title: str) -> str: - """Clean up titles by replacing Unicode escape sequences with standard characters.""" - title = title.replace('\u00a0', - ' ') # Replace non-breaking space with normal space - title = title.replace('\u00b0', - '°') # Replace unicode degree symbol with actual degree symbol - return title - - def clean_catalog_titles(self, catalog: Catalog): - """Recursively clean all titles in the catalog.""" - # Clean title for the catalog itself - if isinstance(catalog.title, str): - catalog.title = self.clean_title(catalog.title) - - # Clean titles in all links of the catalog - for link in catalog.links: - if isinstance(link.title, str): - link.title = self.clean_title(link.title) - - for link in catalog.links: - if link.rel == 'child': - try: - # If the link points to another catalog or collection, clean it recursively - child_catalog = Catalog.from_file(link.href) - self.clean_catalog_titles(child_catalog) - except Exception as e: - # If the link doesn't point to a valid catalog file, skip it - pass - - def publish_dataset(self, dataset_config_path: str): + self.collection_id = "" + + # Paths to configuration files + self.dataset_config_path = dataset_config_path + self.workflow_config_path = workflow_config_path + + # Load configuration files + self._read_config_files() + self.collection_id = self.dataset_config.get("collection_id") + + # Ensure collection_id is set + if not self.collection_id: + raise ValueError("collection_id is missing in dataset config.") + + # @staticmethod + # def clean_title(title: str) -> str: + # """Clean up titles by replacing Unicode escape sequences with standard characters.""" + # title = title.replace('\u00a0', + # ' ') # Replace non-breaking space with normal space + # title = title.replace('\u00b0', + # '°') # Replace unicode degree 
symbol with actual degree symbol + # return title + + # def clean_catalog_titles(self, catalog: Catalog): + # """Recursively clean all titles in the catalog.""" + # # Clean title for the catalog itself + # if isinstance(catalog.title, str): + # catalog.title = self.clean_title(catalog.title) + # + # # Clean titles in all links of the catalog + # for link in catalog.links: + # if isinstance(link.title, str): + # link.title = self.clean_title(link.title) + # + # for link in catalog.links: + # if link.rel == 'child': + # try: + # # If the link points to another catalog or collection, clean it recursively + # child_catalog = Catalog.from_file(link.href) + # self.clean_catalog_titles(child_catalog) + # except Exception as e: + # # If the link doesn't point to a valid catalog file, skip it + # pass + + def _read_config_files(self) -> None: + with fsspec.open(self.dataset_config_path, "r") as file: + self.dataset_config = yaml.safe_load(file) or {} + with fsspec.open(self.workflow_config_path, "r") as file: + self.workflow_config = yaml.safe_load(file) or {} + + def publish_dataset(self): """Publish a product collection to the specified GitHub repository.""" - with fsspec.open(dataset_config_path, "r") as file: - dataset_config = yaml.safe_load(file) or {} - - dataset_id = dataset_config.get("dataset_id") - collection_id = dataset_config.get("collection_id") - documentation_link = dataset_config.get("documentation_link") - access_link = dataset_config.get("access_link") - dataset_status = dataset_config.get("dataset_status") - osc_region = dataset_config.get("osc_region") - osc_themes = dataset_config.get("osc_themes") - cf_params = dataset_config.get("cf_parameter") - - if not dataset_id or not collection_id: + + dataset_id = self.dataset_config.get("dataset_id") + self.collection_id = self.dataset_config.get("collection_id") + documentation_link = self.dataset_config.get("documentation_link") + access_link = self.dataset_config.get("access_link") + dataset_status = 
self.dataset_config.get("dataset_status") + osc_region = self.dataset_config.get("osc_region") + osc_themes = self.dataset_config.get("osc_themes") + cf_params = self.dataset_config.get("cf_parameter") + + if not dataset_id or not self.collection_id: raise ValueError("Dataset ID or Collection ID missing in the config.") logger.info("Generating STAC collection...") generator = OscDatasetStacGenerator( dataset_id=dataset_id, - collection_id=collection_id, + collection_id=self.collection_id, documentation_link=documentation_link, access_link=access_link, osc_status=dataset_status, @@ -167,7 +184,7 @@ def publish_dataset(self, dataset_config_path: str): # Prepare a dictionary of file paths and content file_dict = {} - product_path = f"products/{collection_id}/collection.json" + product_path = f"products/{self.collection_id}/collection.json" file_dict[product_path] = ds_collection.to_dict() variable_base_catalog_path = f"variables/catalog.json" @@ -177,9 +194,6 @@ def publish_dataset(self, dataset_config_path: str): ) # Add or update variable files for var_id in variable_ids: - # if var_id in ["crs", "spatial_ref"]: - # logger.info(f"Skipping CRS variable: {var_id}") - # continue var_file_path = f"variables/{var_id}/catalog.json" if not self.gh_publisher.github_automation.file_exists(var_file_path): logger.info( @@ -219,7 +233,7 @@ def publish_dataset(self, dataset_config_path: str): ) updated_product_base_catalog = generator.update_product_base_catalog(full_path) # clean special characters - self.clean_catalog_titles(updated_product_base_catalog) + # self.clean_catalog_titles(updated_product_base_catalog) file_dict[product_catalog_path] = updated_product_base_catalog.to_dict() #Link product to project catalog @@ -233,10 +247,12 @@ def publish_dataset(self, dataset_config_path: str): file_dict[deepesdl_collection_path] = updated_deepesdl_collection.to_dict() # Create branch name, commit message, PR info - branch_name = 
f"{OSC_BRANCH_NAME}-{collection_id}-{datetime.now().strftime('%Y%m%d%H%M%S')}" - commit_message = f"Add new dataset collection: {collection_id}" - pr_title = f"Add new dataset collection: {collection_id}" - pr_body = (f"This PR adds a new dataset collection: {collection_id} and it's " + branch_name = (f"{OSC_BRANCH_NAME}-{self.collection_id}" + f"-{datetime.now().strftime('%Y%m%d%H%M%S')}") + commit_message = f"Add new dataset collection: {self.collection_id}" + pr_title = f"Add new dataset collection: {self.collection_id}" + pr_body = (f"This PR adds a new dataset collection: {self.collection_id} and " + f"it's " f"corresponding variable catalogs to the repository.") # Publish all files in one go @@ -251,12 +267,6 @@ def publish_dataset(self, dataset_config_path: str): logger.info(f"Pull request created: {pr_url}") -class WorkflowPublisher: - """Publishes workflows to the OSC GitHub repository.""" - - def __init__(self): - self.gh_publisher = GitHubPublisher() - @staticmethod def _normalize_name(name: str | None) -> str | None: return name.replace(" ", "-").lower() if name else None @@ -277,60 +287,65 @@ def _write_to_file(file_path: str, data: dict): json.dump(data, file, indent=4) logger.info(f"File written to {file_path}") - def publish_workflow_experiment(self, workflow_config_path: str, write_to_file: bool = False): - with fsspec.open(workflow_config_path, "r") as file: - workflow_config = yaml.safe_load(file) or {} - - workflow_id = self._normalize_name(workflow_config.get("workflow_id")) + def publish_workflow_experiment(self, write_to_file: bool = False): + workflow_id = self._normalize_name(self.workflow_config.get("workflow_id")) if not workflow_id: raise ValueError("workflow_id is missing in workflow config.") - properties_list = workflow_config.get("properties", []) - contacts = workflow_config.get("contact", []) - links = workflow_config.get("links", []) - jupyter_notebook_url = workflow_config.get("jupyter_notebook_url") + properties_list = 
self.workflow_config.get("properties", {}) + osc_themes = properties_list.get("themes") + contacts = self.workflow_config.get("contact", []) + links = self.workflow_config.get("links", []) + jupyter_notebook_url = self.workflow_config.get("jupyter_notebook_url") logger.info("Generating OGC API Record for the workflow...") rg = OSCWorkflowOGCApiRecordGenerator() - wf_record_properties = rg.build_record_properties(properties_list, contacts, - caller="WorkflowAsOgcRecord") + wf_record_properties = rg.build_record_properties(properties_list, contacts) + + link_builder = LinksBuilder(osc_themes) + theme_links = link_builder.build_them_links_for_records() + workflow_record = WorkflowAsOgcRecord( id=workflow_id, type="Feature", properties=wf_record_properties, - links=links, - jupyter_notebook_url=jupyter_notebook_url + links=links + theme_links, + jupyter_notebook_url=jupyter_notebook_url, + themes=osc_themes ) # Convert to dictionary and remove jupyter_notebook_url workflow_dict = workflow_record.to_dict() if "jupyter_notebook_url" in workflow_dict: del workflow_dict["jupyter_notebook_url"] - wf_file_path = f"workflow/{workflow_id}/record.json" file_dict = {wf_file_path: workflow_dict} # Build properties for the experiment record exp_record_properties = copy.deepcopy(wf_record_properties) exp_record_properties.type = "experiment" + exp_record_properties.osc_workflow = workflow_id experiment_record = ExperimentAsOgcRecord( id=workflow_id, type="Feature", + jupyter_notebook_url=jupyter_notebook_url, + collection_id=self.collection_id, properties=exp_record_properties, - links=links, - jupyter_notebook_url=jupyter_notebook_url + links=links + theme_links ) # Convert to dictionary and remove jupyter_notebook_url experiment_dict = experiment_record.to_dict() if "jupyter_notebook_url" in experiment_dict: del experiment_dict["jupyter_notebook_url"] + if "collection_id" in experiment_dict: + del experiment_dict["collection_id"] exp_file_path = 
f"experiments/{workflow_id}/record.json" file_dict[exp_file_path] = experiment_dict # Write to files if testing if write_to_file: - self._write_to_file(wf_file_path, workflow_record.to_dict()) - self._write_to_file(exp_file_path, experiment_record.to_dict()) + self._write_to_file(wf_file_path, workflow_dict) + self._write_to_file(exp_file_path, experiment_dict) # Publish to GitHub if not testing if not write_to_file: @@ -351,9 +366,8 @@ def publish_workflow_experiment(self, workflow_config_path: str, write_to_file: if __name__ == '__main__': # Example usage for testing - publisher = WorkflowPublisher() - publisher.publish_workflow_experiment( - workflow_config_path="/home/tejas/bc/projects/deepesdl/deep-code/workflow" - "-config.yaml", - write_to_file=True - ) \ No newline at end of file + publisher = Publisher(dataset_config_path="/home/tejas/bc/projects/deepesdl/deep" + "-code/dataset-config.yaml", + workflow_config_path="/home/tejas/bc/projects/deepesdl/deep-code/workflow" + "-config.yaml") + publisher.publish_workflow_experiment(write_to_file=True) \ No newline at end of file diff --git a/deep_code/utils/ogc_api_record.py b/deep_code/utils/ogc_api_record.py index bf2fee3..47e039c 100644 --- a/deep_code/utils/ogc_api_record.py +++ b/deep_code/utils/ogc_api_record.py @@ -49,6 +49,7 @@ def __init__( title: str, description: str, jupyter_kernel_info: JupyterKernelInfo, + osc_workflow: str = None, updated: str = None, contacts: list[Contact] = None, themes: list[Theme] = None, @@ -62,14 +63,55 @@ def __init__( self.title = title self.description = description self.jupyter_kernel_info = jupyter_kernel_info + self.osc_workflow = osc_workflow self.keywords = keywords or [] self.contacts = contacts self.themes = themes self.formats = formats or [] self.license = license + def to_dict(self, value_name: str | None = None) -> dict[str, JsonValue]: + """Convert this object into a JSON-serializable dictionary.""" + data = super().to_dict(value_name) + if self.osc_workflow is 
not None: + data["osc:workflow"] = self.osc_workflow + del data["osc_workflow"] # Remove the original key + return data -class ExperimentAsOgcRecord(MappingConstructible["OgcRecord"], JsonSerializable): +class LinksBuilder: + def __init__(self, + themes: list[str], + ): + self.themes = themes + self.theme_links = [] + + @staticmethod + def format_string(s): + return s.capitalize() + + def build_them_links_for_records(self): + for theme in self.themes: + formated_theme = self.format_string(theme) + link = { + "rel": "related", + "href": f"../../themes/{theme}/catalog.json", + "type": "application/json", + "title": f"Theme: {formated_theme}" + } + self.theme_links.append(link) + return self.theme_links + + @staticmethod + def build_link_to_dataset(collection_id): + return [{ + "rel": "child", + "href": f"../../products/{collection_id}/collection.json", + "type": "application/json", + "title": f"{collection_id}" + }] + + +class WorkflowAsOgcRecord(MappingConstructible["OgcRecord"], JsonSerializable): def __init__( self, id: str, @@ -79,17 +121,19 @@ def __init__( links: list[dict], linkTemplates: list = [], conformsTo: list[str] = None, - geometry: Optional[Any] = None + geometry: Optional[Any] = None, + themes: Optional[Any] = None ): if conformsTo is None: conformsTo = [OGC_API_RECORD_SPEC] self.id = id self.type = type - self.conformsTo = conformsTo self.jupyter_notebook_url = jupyter_notebook_url self.geometry = geometry self.properties = properties self.linkTemplates = linkTemplates + self.conformsTo = conformsTo + self.themes = themes self.links = self._generate_static_links() + links def _generate_static_links(self): @@ -105,57 +149,56 @@ def _generate_static_links(self): "rel": "parent", "href": "../catalog.json", "type": "application/json", - "title": "Experiments" + "title": "Workflows" }, { - "rel": "related", - "href": f"../../workflows/{self.id}/record.json", + "rel": "child", + "href": f"../../experiments/{self.id}/record.json", "type": 
"application/json", - "title": "Workflow: POLARIS" + "title": f"{self.id}" }, { - "rel": "related", - "href": "../../projects/deepesdl/collection.json", + "rel": "jupyter-notebook", "type": "application/json", - "title": "Project: DeepESDL" - }, - { - "rel": "input", - "href": "./input.yaml", - "type": "application/yaml", - "title": "Input parameters" - }, - { - "rel": "environment", - "href": "./environment.yaml", - "type": "application/yaml", - "title": "Execution environment" + "title": "Jupyter Notebook", + "href": f"{self.jupyter_notebook_url}" }, + { "rel": "self", - "href": f"{BASE_URL_OSC}/experiments/{self.id}/record.json", + "href": f"{BASE_URL_OSC}/workflows/{self.id}/record.json", "type": "application/json" } ] -class WorkflowAsOgcRecord(MappingConstructible["OgcRecord"], JsonSerializable): + # def _assemble_all_links(self): + # static_links = self._generate_static_links() + # link_builder = LinksBuilder(self.themes) + # theme_links = link_builder.build_them_links_for_records() + # return static_links + theme_links + +class ExperimentAsOgcRecord(MappingConstructible["OgcRecord"], JsonSerializable): def __init__( self, id: str, type: str, jupyter_notebook_url: str, + collection_id: str, properties: RecordProperties, links: list[dict], - linkTemplates: list = [], + linkTemplates=None, conformsTo: list[str] = None, geometry: Optional[Any] = None ): + if linkTemplates is None: + linkTemplates = [] if conformsTo is None: conformsTo = [OGC_API_RECORD_SPEC] self.id = id self.type = type self.conformsTo = conformsTo self.jupyter_notebook_url = jupyter_notebook_url + self.collection_id = collection_id self.geometry = geometry self.properties = properties self.linkTemplates = linkTemplates @@ -174,24 +217,47 @@ def _generate_static_links(self): "rel": "parent", "href": "../catalog.json", "type": "application/json", - "title": "Workflows" + "title": "Experiments" + }, + { + "rel": "related", + "href": f"../../workflows/{self.id}/record.json", + "type": 
"application/json", + "title": "Workflow: POLARIS" }, { "rel": "child", - "href": f"../../experiments/{self.id}/record.json", + "href": f"../../products/{self.collection_id}/collection.json", "type": "application/json", - "title": f"{self.id}" + "title": f"{self.collection_id}" }, { - "rel": "jupyter-notebook", + "rel": "related", + "href": "../../projects/deepesdl/collection.json", "type": "application/json", - "title": "Jupyter Notebook", - "href": f"{self.jupyter_notebook_url}" + "title": "Project: DeepESDL" + }, + { + "rel": "input", + "href": "./input.yaml", + "type": "application/yaml", + "title": "Input parameters" + }, + { + "rel": "environment", + "href": "./environment.yaml", + "type": "application/yaml", + "title": "Execution environment" }, - { "rel": "self", - "href": f"{BASE_URL_OSC}/workflows/{self.id}/record.json", + "href": f"{BASE_URL_OSC}/experiments/{self.id}/record.json", "type": "application/json" } - ] \ No newline at end of file + ] + + # def _assemble_all_links(self): + # static_links = self._generate_static_links() + # link_builder = LinksBuilder(self.themes) + # theme_links = link_builder.build_them_links_for_records() + # return static_links + theme_links \ No newline at end of file diff --git a/deep_code/utils/ogc_record_generator.py b/deep_code/utils/ogc_record_generator.py index 84f8c81..f561ef9 100644 --- a/deep_code/utils/ogc_record_generator.py +++ b/deep_code/utils/ogc_record_generator.py @@ -6,7 +6,7 @@ from datetime import datetime, timezone -from deep_code.constants import DEFAULT_THEME_SCHEME, OSC_THEME_SCHEME +from deep_code.constants import OSC_THEME_SCHEME from deep_code.utils.ogc_api_record import ( Contact, RecordProperties, @@ -39,16 +39,33 @@ def build_theme(osc_themes: list[str]) -> Theme: concepts = [ThemeConcept(id=theme_str) for theme_str in osc_themes] return Theme(concepts=concepts, scheme=OSC_THEME_SCHEME) - def build_record_properties(self, properties, contacts, caller: type) -> RecordProperties: - """Build a 
RecordProperties object from a list of single-key property dicts + def build_record_properties(self, properties: dict, contacts: list) -> RecordProperties: + """Build a RecordProperties object from a properties dictionary. + + Args: + properties: A dictionary containing properties (e.g., title, description, themes). + contacts: A list of contact dictionaries. + caller: The caller type ("WorkflowAsOgcRecord" or "ExperimentAsOgcRecord"). + + Returns: + A RecordProperties object. """ now_iso = datetime.now(timezone.utc).isoformat() properties.update({"created": now_iso}) properties.update({"updated": now_iso}) + + # Extract themes from the properties dictionary themes_list = properties.get("themes", []) + + # Build contact objects properties.update({"contacts": self.build_contact_objects(contacts)}) + + # Build theme object if themes are present if themes_list: theme_obj = self.build_theme(themes_list) - properties.update({"themes": [theme_obj]}) + properties.update( + {"themes": [theme_obj]}) # Wrap the Theme object in a list + properties.setdefault("type", "workflow") - return RecordProperties.from_value(properties) + + return RecordProperties.from_value(properties) \ No newline at end of file From b0fed741472df0eb0fa767f5aa0c5d3b9b72283d Mon Sep 17 00:00:00 2001 From: tejas Date: Tue, 11 Mar 2025 19:08:24 +0100 Subject: [PATCH 09/43] update github_automation class to fork only if local clone dir doesn't exist, added write_to_file option for local testing during development --- deep_code/tools/publish.py | 92 +++++++++++++++------------- deep_code/utils/github_automation.py | 61 ++++++++++++------ deep_code/utils/helper.py | 18 ++++++ 3 files changed, 109 insertions(+), 62 deletions(-) create mode 100644 deep_code/utils/helper.py diff --git a/deep_code/tools/publish.py b/deep_code/tools/publish.py index 537f1f0..a422acc 100644 --- a/deep_code/tools/publish.py +++ b/deep_code/tools/publish.py @@ -24,6 +24,7 @@ from deep_code.utils.ogc_api_record import 
WorkflowAsOgcRecord, \ ExperimentAsOgcRecord, LinksBuilder from deep_code.utils.ogc_record_generator import OSCWorkflowOGCApiRecordGenerator +from deep_code.utils.helper import serialize logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO) @@ -151,7 +152,25 @@ def _read_config_files(self) -> None: with fsspec.open(self.workflow_config_path, "r") as file: self.workflow_config = yaml.safe_load(file) or {} - def publish_dataset(self): + @staticmethod + def _write_to_file(file_path: str, data: dict): + """Write a dictionary to a JSON file. + + Args: + file_path (str): The path to the file. + data (dict): The data to write. + """ + # Create the directory if it doesn't exist + Path(file_path).parent.mkdir(parents=True, exist_ok=True) + try: + json_content = json.dumps(data, indent=2, default=serialize) + except TypeError as e: + raise RuntimeError(f"JSON serialization failed: {e}") + + with open(file_path, "w") as f: + f.write(json_content) + + def publish_dataset(self, write_to_file: bool = False): """Publish a product collection to the specified GitHub repository.""" dataset_id = self.dataset_config.get("dataset_id") @@ -214,18 +233,12 @@ def publish_dataset(self): full_path, var_id ) file_dict[var_file_path] = updated_catalog.to_dict() - # logger.info( - # f"Add {var_id} child link to variable base catalog" - # ) - # file_dict[ - # variable_base_catalog_path] = generator.update_variable_base_catalog( - # variable_catalog_full_path, var_id).to_dict() - file_dict[variable_base_catalog_path] = generator.update_variable_base_catalog( variable_catalog_full_path, variable_ids ).to_dict() - """Link product to base product catalog""" + + # Link product to base product catalog product_catalog_path = f"products/catalog.json" full_path = ( Path(self.gh_publisher.github_automation.local_clone_dir) @@ -246,46 +259,36 @@ def publish_dataset(self): updated_deepesdl_collection = generator.update_deepesdl_collection(deepesdl_collection_full_path) 
file_dict[deepesdl_collection_path] = updated_deepesdl_collection.to_dict() - # Create branch name, commit message, PR info - branch_name = (f"{OSC_BRANCH_NAME}-{self.collection_id}" - f"-{datetime.now().strftime('%Y%m%d%H%M%S')}") - commit_message = f"Add new dataset collection: {self.collection_id}" - pr_title = f"Add new dataset collection: {self.collection_id}" - pr_body = (f"This PR adds a new dataset collection: {self.collection_id} and " - f"it's " - f"corresponding variable catalogs to the repository.") - - # Publish all files in one go - pr_url = self.gh_publisher.publish_files( - branch_name=branch_name, - file_dict=file_dict, - commit_message=commit_message, - pr_title=pr_title, - pr_body=pr_body, - ) + # Write to files if testing + if write_to_file: + for file_path, data in file_dict.items(): + self._write_to_file(file_path, data) # Pass file_path and data + else: + # Create branch name, commit message, PR info + branch_name = (f"{OSC_BRANCH_NAME}-{self.collection_id}" + f"-{datetime.now().strftime('%Y%m%d%H%M%S')}") + commit_message = f"Add new dataset collection: {self.collection_id}" + pr_title = f"Add new dataset collection: {self.collection_id}" + pr_body = (f"This PR adds a new dataset collection: {self.collection_id} and " + f"it's " + f"corresponding variable catalogs to the repository.") + + # Publish all files in one go + pr_url = self.gh_publisher.publish_files( + branch_name=branch_name, + file_dict=file_dict, + commit_message=commit_message, + pr_title=pr_title, + pr_body=pr_body, + ) - logger.info(f"Pull request created: {pr_url}") + logger.info(f"Pull request created: {pr_url}") @staticmethod def _normalize_name(name: str | None) -> str | None: return name.replace(" ", "-").lower() if name else None - @staticmethod - def _write_to_file(file_path: str, data: dict): - """Write a dictionary to a JSON file. - - Args: - file_path (str): The path to the file. - data (dict): The data to write. 
- """ - # Create the directory if it doesn't exist - Path(file_path).parent.mkdir(parents=True, exist_ok=True) - - # Write the data to the file - with open(file_path, "w") as file: - json.dump(data, file, indent=4) - logger.info(f"File written to {file_path}") def publish_workflow_experiment(self, write_to_file: bool = False): workflow_id = self._normalize_name(self.workflow_config.get("workflow_id")) @@ -344,8 +347,8 @@ def publish_workflow_experiment(self, write_to_file: bool = False): # Write to files if testing if write_to_file: - self._write_to_file(wf_file_path, workflow_dict) - self._write_to_file(exp_file_path, experiment_dict) + for file_path, data in file_dict.items(): + self._write_to_file(file_path, data) # Publish to GitHub if not testing if not write_to_file: @@ -370,4 +373,5 @@ def publish_workflow_experiment(self, write_to_file: bool = False): "-code/dataset-config.yaml", workflow_config_path="/home/tejas/bc/projects/deepesdl/deep-code/workflow" "-config.yaml") + publisher.publish_dataset(write_to_file=True) publisher.publish_workflow_experiment(write_to_file=True) \ No newline at end of file diff --git a/deep_code/utils/github_automation.py b/deep_code/utils/github_automation.py index 0eb91f5..f0bca18 100644 --- a/deep_code/utils/github_automation.py +++ b/deep_code/utils/github_automation.py @@ -12,6 +12,7 @@ import requests +from deep_code.utils.helper import serialize class GitHubAutomation: """Automates GitHub operations needed to create a Pull Request. 
@@ -43,16 +44,40 @@ def fork_repository(self): response.raise_for_status() logging.info(f"Repository forked to {self.username}/{self.repo_name}") + # def clone_repository(self): + # """Clone the forked repository locally.""" + # logging.info("Cloning forked repository...") + # try: + # subprocess.run( + # ["git", "clone", self.fork_repo_url, self.local_clone_dir], check=True + # ) + # # os.chdir(self.local_clone_dir) + # except subprocess.CalledProcessError as e: + # raise RuntimeError(f"Failed to clone repository: {e}") + def clone_repository(self): - """Clone the forked repository locally.""" - logging.info("Cloning forked repository...") - try: - subprocess.run( - ["git", "clone", self.fork_repo_url, self.local_clone_dir], check=True - ) - # os.chdir(self.local_clone_dir) - except subprocess.CalledProcessError as e: - raise RuntimeError(f"Failed to clone repository: {e}") + """Clone the forked repository locally if it doesn't exist, or pull updates if it does.""" + logging.info("Checking local repository...") + if not os.path.exists(self.local_clone_dir): + # Directory doesn't exist, clone the repository + logging.info("Cloning forked repository...") + try: + subprocess.run( + ["git", "clone", self.fork_repo_url, self.local_clone_dir], + check=True + ) + logging.info(f"Repository cloned to {self.local_clone_dir}") + except subprocess.CalledProcessError as e: + raise RuntimeError(f"Failed to clone repository: {e}") + else: + # Directory exists, pull the latest changes + logging.info("Local repository already exists. 
Pulling latest changes...") + try: + os.chdir(self.local_clone_dir) + subprocess.run(["git", "pull"], check=True) + logging.info("Repository updated with latest changes.") + except subprocess.CalledProcessError as e: + raise RuntimeError(f"Failed to pull latest changes: {e}") def create_branch(self, branch_name: str): """Create a new branch in the local repository.""" @@ -78,7 +103,7 @@ def add_file(self, file_path: str, content): # Serialize to JSON try: - json_content = json.dumps(content, indent=2, default=self.serialize) + json_content = json.dumps(content, indent=2, default=serialize) except TypeError as e: raise RuntimeError(f"JSON serialization failed: {e}") @@ -135,11 +160,11 @@ def file_exists(self, file_path) -> bool: logging.debug(f"Checking existence of {full_path}: {exists}") return exists - # Check and convert any non-serializable objects - def serialize(self, obj): - if isinstance(obj, set): - return list(obj) # Convert sets to lists - if hasattr(obj, "__dict__"): - return obj.__dict__ # Convert objects with attributes to dicts - raise TypeError( - f"Object of type {type(obj).__name__} is not JSON serializable") \ No newline at end of file + # # Check and convert any non-serializable objects + # def serialize(self, obj): + # if isinstance(obj, set): + # return list(obj) # Convert sets to lists + # if hasattr(obj, "__dict__"): + # return obj.__dict__ # Convert objects with attributes to dicts + # raise TypeError( + # f"Object of type {type(obj).__name__} is not JSON serializable") \ No newline at end of file diff --git a/deep_code/utils/helper.py b/deep_code/utils/helper.py new file mode 100644 index 0000000..27e8d82 --- /dev/null +++ b/deep_code/utils/helper.py @@ -0,0 +1,18 @@ +import json +from pathlib import Path + + +def serialize(obj): + """Convert non-serializable objects to JSON-compatible formats. + Args: + obj: The object to serialize. + Returns: + A JSON-compatible representation of the object. 
+ Raises: + TypeError: If the object cannot be serialized. + """ + if isinstance(obj, set): + return list(obj) # Convert sets to lists + if hasattr(obj, "__dict__"): + return obj.__dict__ # Convert objects with attributes to dicts + raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable") From e143e37ea3557e7b4da52e3a5edba15743c8a0ae Mon Sep 17 00:00:00 2001 From: tejas Date: Tue, 11 Mar 2025 19:40:08 +0100 Subject: [PATCH 10/43] extract repeated logic into helper methods --- deep_code/tools/publish.py | 128 ++++++++++++++++++------------------- 1 file changed, 63 insertions(+), 65 deletions(-) diff --git a/deep_code/tools/publish.py b/deep_code/tools/publish.py index a422acc..266bfef 100644 --- a/deep_code/tools/publish.py +++ b/deep_code/tools/publish.py @@ -11,13 +11,12 @@ import fsspec import yaml -from pystac import Catalog from deep_code.constants import ( OSC_BRANCH_NAME, OSC_REPO_NAME, OSC_REPO_OWNER, - WF_BRANCH_NAME, + WF_BRANCH_NAME ) from deep_code.utils.dataset_stac_generator import OscDatasetStacGenerator from deep_code.utils.github_automation import GitHubAutomation @@ -170,6 +169,46 @@ def _write_to_file(file_path: str, data: dict): with open(file_path, "w") as f: f.write(json_content) + def _update_and_add_to_file_dict(self, file_dict, catalog_path, + update_method, *args): + """Update a catalog using the specified method and add it to file_dict. + + Args: + file_dict: The dictionary to which the updated catalog will be added. + catalog_path: The path to the catalog file. + update_method: The method to call for updating the catalog. + *args: Additional arguments to pass to the update method. 
+ """ + full_path = Path( + self.gh_publisher.github_automation.local_clone_dir) / catalog_path + updated_catalog = update_method(full_path, *args) + file_dict[catalog_path] = updated_catalog.to_dict() + + def _update_variable_catalogs(self, generator, file_dict, variable_ids): + """Update or create variable catalogs and add them to file_dict. + + Args: + generator: The generator object. + file_dict: The dictionary to which the updated catalogs will be added. + variable_ids: A list of variable IDs. + """ + for var_id in variable_ids: + var_file_path = f"variables/{var_id}/catalog.json" + if not self.gh_publisher.github_automation.file_exists(var_file_path): + logger.info( + f"Variable catalog for {var_id} does not exist. Creating...") + var_metadata = generator.variables_metadata.get(var_id) + var_catalog = generator.build_variable_catalog(var_metadata) + file_dict[var_file_path] = var_catalog.to_dict() + else: + logger.info( + f"Variable catalog already exists for {var_id}, adding product link.") + full_path = Path( + self.gh_publisher.github_automation.local_clone_dir) / var_file_path + updated_catalog = generator.update_existing_variable_catalog(full_path, + var_id) + file_dict[var_file_path] = updated_catalog.to_dict() + def publish_dataset(self, write_to_file: bool = False): """Publish a product collection to the specified GitHub repository.""" @@ -206,58 +245,26 @@ def publish_dataset(self, write_to_file: bool = False): product_path = f"products/{self.collection_id}/collection.json" file_dict[product_path] = ds_collection.to_dict() - variable_base_catalog_path = f"variables/catalog.json" - variable_catalog_full_path = ( - Path(self.gh_publisher.github_automation.local_clone_dir) - / variable_base_catalog_path - ) - # Add or update variable files - for var_id in variable_ids: - var_file_path = f"variables/{var_id}/catalog.json" - if not self.gh_publisher.github_automation.file_exists(var_file_path): - logger.info( - f"Variable catalog for {var_id} does not 
exist. Creating..." - ) - var_metadata = generator.variables_metadata.get(var_id) - var_catalog = generator.build_variable_catalog(var_metadata) - file_dict[var_file_path] = var_catalog.to_dict() - else: - logger.info( - f"Variable catalog already exists for {var_id}, adding product link." - ) - full_path = ( - Path(self.gh_publisher.github_automation.local_clone_dir) - / var_file_path - ) - updated_catalog = generator.update_existing_variable_catalog( - full_path, var_id - ) - file_dict[var_file_path] = updated_catalog.to_dict() + # Update or create variable catalogs for each osc:variable + self._update_variable_catalogs(generator, file_dict, variable_ids) - file_dict[variable_base_catalog_path] = generator.update_variable_base_catalog( - variable_catalog_full_path, variable_ids - ).to_dict() + # Update variable base catalog + variable_base_catalog_path = "variables/catalog.json" + self._update_and_add_to_file_dict(file_dict, variable_base_catalog_path, + generator.update_variable_base_catalog, variable_ids + ) - # Link product to base product catalog - product_catalog_path = f"products/catalog.json" - full_path = ( - Path(self.gh_publisher.github_automation.local_clone_dir) - / product_catalog_path + # Update product base catalog + product_catalog_path = "products/catalog.json" + self._update_and_add_to_file_dict(file_dict, product_catalog_path, + generator.update_product_base_catalog ) - updated_product_base_catalog = generator.update_product_base_catalog(full_path) - # clean special characters - # self.clean_catalog_titles(updated_product_base_catalog) - file_dict[product_catalog_path] = updated_product_base_catalog.to_dict() - - #Link product to project catalog - deepesdl_collection_path = \ - f"projects/deep-earth-system-data-lab/collection.json" - deepesdl_collection_full_path = ( - Path(self.gh_publisher.github_automation.local_clone_dir) - / deepesdl_collection_path + + # Update DeepESDL collection + deepesdl_collection_path = 
"projects/deep-earth-system-data-lab/collection.json" + self._update_and_add_to_file_dict(file_dict, deepesdl_collection_path, + generator.update_deepesdl_collection ) - updated_deepesdl_collection = generator.update_deepesdl_collection(deepesdl_collection_full_path) - file_dict[deepesdl_collection_path] = updated_deepesdl_collection.to_dict() # Write to files if testing if write_to_file: @@ -269,9 +276,10 @@ def publish_dataset(self, write_to_file: bool = False): f"-{datetime.now().strftime('%Y%m%d%H%M%S')}") commit_message = f"Add new dataset collection: {self.collection_id}" pr_title = f"Add new dataset collection: {self.collection_id}" - pr_body = (f"This PR adds a new dataset collection: {self.collection_id} and " - f"it's " - f"corresponding variable catalogs to the repository.") + pr_body = ( + f"This PR adds a new dataset collection: {self.collection_id} and " + f"it's " + f"corresponding variable catalogs to the repository.") # Publish all files in one go pr_url = self.gh_publisher.publish_files( @@ -349,9 +357,8 @@ def publish_workflow_experiment(self, write_to_file: bool = False): if write_to_file: for file_path, data in file_dict.items(): self._write_to_file(file_path, data) - - # Publish to GitHub if not testing - if not write_to_file: + else: + # Publish to GitHub if not testing branch_name = f"{WF_BRANCH_NAME}-{workflow_id}" commit_message = f"Adding workflow from DeepESDL: {workflow_id}" pr_title = f"Add workflow and Experiment from DeepESDL: {workflow_id}" @@ -366,12 +373,3 @@ def publish_workflow_experiment(self, write_to_file: bool = False): ) logger.info(f"Pull request created: {pr_url}") - -if __name__ == '__main__': - # Example usage for testing - publisher = Publisher(dataset_config_path="/home/tejas/bc/projects/deepesdl/deep" - "-code/dataset-config.yaml", - workflow_config_path="/home/tejas/bc/projects/deepesdl/deep-code/workflow" - "-config.yaml") - publisher.publish_dataset(write_to_file=True) - 
publisher.publish_workflow_experiment(write_to_file=True) \ No newline at end of file From fca2f051926185133de901b6b4012b66bb128268 Mon Sep 17 00:00:00 2001 From: tejas Date: Tue, 11 Mar 2025 19:40:40 +0100 Subject: [PATCH 11/43] refactor to remove comments --- deep_code/utils/github_automation.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/deep_code/utils/github_automation.py b/deep_code/utils/github_automation.py index f0bca18..85ac8c9 100644 --- a/deep_code/utils/github_automation.py +++ b/deep_code/utils/github_automation.py @@ -44,17 +44,6 @@ def fork_repository(self): response.raise_for_status() logging.info(f"Repository forked to {self.username}/{self.repo_name}") - # def clone_repository(self): - # """Clone the forked repository locally.""" - # logging.info("Cloning forked repository...") - # try: - # subprocess.run( - # ["git", "clone", self.fork_repo_url, self.local_clone_dir], check=True - # ) - # # os.chdir(self.local_clone_dir) - # except subprocess.CalledProcessError as e: - # raise RuntimeError(f"Failed to clone repository: {e}") - def clone_repository(self): """Clone the forked repository locally if it doesn't exist, or pull updates if it does.""" logging.info("Checking local repository...") From 2d8638297a89c11c3006cfc0f305793df71a1ee6 Mon Sep 17 00:00:00 2001 From: tejas Date: Tue, 11 Mar 2025 19:42:03 +0100 Subject: [PATCH 12/43] refactor to remove comments --- deep_code/utils/dataset_stac_generator.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/deep_code/utils/dataset_stac_generator.py b/deep_code/utils/dataset_stac_generator.py index d708003..b092fac 100644 --- a/deep_code/utils/dataset_stac_generator.py +++ b/deep_code/utils/dataset_stac_generator.py @@ -425,14 +425,6 @@ def build_theme(osc_themes: list[str]) -> Theme: concepts = [ThemeConcept(id=theme_str) for theme_str in osc_themes] return Theme(concepts=concepts, scheme=OSC_THEME_SCHEME) - # def build_theme_links(self) -> Link: - # return Link( - # 
rel="related", - # target="../../themes/catalog.json", - # media_type="application/json", - # title="Theme: Land" - # ) - def build_dataset_stac_collection(self) -> Collection: """Build an OSC STAC Collection for the dataset. From 9e5d54422165f89b29f2600640183da9558e9c55 Mon Sep 17 00:00:00 2001 From: tejas Date: Wed, 12 Mar 2025 12:17:34 +0100 Subject: [PATCH 13/43] combined publish methods to create a single GitHub PR --- deep_code/tools/publish.py | 102 ++++++++++++++----------------------- 1 file changed, 39 insertions(+), 63 deletions(-) diff --git a/deep_code/tools/publish.py b/deep_code/tools/publish.py index 266bfef..80ce321 100644 --- a/deep_code/tools/publish.py +++ b/deep_code/tools/publish.py @@ -1,10 +1,10 @@ #!/usr/bin/env python3 -import copy -import json # Copyright (c) 2025 by Brockmann Consult GmbH # Permissions are hereby granted under the terms of the MIT License: # https://opensource.org/licenses/MIT. +import copy +import json import logging from pathlib import Path from datetime import datetime @@ -111,40 +111,9 @@ def __init__(self, dataset_config_path: str, workflow_config_path: str): self._read_config_files() self.collection_id = self.dataset_config.get("collection_id") - # Ensure collection_id is set if not self.collection_id: raise ValueError("collection_id is missing in dataset config.") - # @staticmethod - # def clean_title(title: str) -> str: - # """Clean up titles by replacing Unicode escape sequences with standard characters.""" - # title = title.replace('\u00a0', - # ' ') # Replace non-breaking space with normal space - # title = title.replace('\u00b0', - # '°') # Replace unicode degree symbol with actual degree symbol - # return title - - # def clean_catalog_titles(self, catalog: Catalog): - # """Recursively clean all titles in the catalog.""" - # # Clean title for the catalog itself - # if isinstance(catalog.title, str): - # catalog.title = self.clean_title(catalog.title) - # - # # Clean titles in all links of the catalog - # for 
link in catalog.links: - # if isinstance(link.title, str): - # link.title = self.clean_title(link.title) - # - # for link in catalog.links: - # if link.rel == 'child': - # try: - # # If the link points to another catalog or collection, clean it recursively - # child_catalog = Catalog.from_file(link.href) - # self.clean_catalog_titles(child_catalog) - # except Exception as e: - # # If the link doesn't point to a valid catalog file, skip it - # pass - def _read_config_files(self) -> None: with fsspec.open(self.dataset_config_path, "r") as file: self.dataset_config = yaml.safe_load(file) or {} @@ -210,7 +179,8 @@ def _update_variable_catalogs(self, generator, file_dict, variable_ids): file_dict[var_file_path] = updated_catalog.to_dict() def publish_dataset(self, write_to_file: bool = False): - """Publish a product collection to the specified GitHub repository.""" + """Prepare dataset/product collection for publishing to the specified GitHub + repository.""" dataset_id = self.dataset_config.get("dataset_id") self.collection_id = self.dataset_config.get("collection_id") @@ -270,27 +240,8 @@ def publish_dataset(self, write_to_file: bool = False): if write_to_file: for file_path, data in file_dict.items(): self._write_to_file(file_path, data) # Pass file_path and data - else: - # Create branch name, commit message, PR info - branch_name = (f"{OSC_BRANCH_NAME}-{self.collection_id}" - f"-{datetime.now().strftime('%Y%m%d%H%M%S')}") - commit_message = f"Add new dataset collection: {self.collection_id}" - pr_title = f"Add new dataset collection: {self.collection_id}" - pr_body = ( - f"This PR adds a new dataset collection: {self.collection_id} and " - f"it's " - f"corresponding variable catalogs to the repository.") - - # Publish all files in one go - pr_url = self.gh_publisher.publish_files( - branch_name=branch_name, - file_dict=file_dict, - commit_message=commit_message, - pr_title=pr_title, - pr_body=pr_body, - ) - - logger.info(f"Pull request created: {pr_url}") + return 
{} + return file_dict @staticmethod @@ -299,6 +250,8 @@ def _normalize_name(name: str | None) -> str | None: def publish_workflow_experiment(self, write_to_file: bool = False): + """prepare workflow and experiment as ogc api record to publish it to the + specified GitHub repository.""" workflow_id = self._normalize_name(self.workflow_config.get("workflow_id")) if not workflow_id: raise ValueError("workflow_id is missing in workflow config.") @@ -357,19 +310,42 @@ def publish_workflow_experiment(self, write_to_file: bool = False): if write_to_file: for file_path, data in file_dict.items(): self._write_to_file(file_path, data) - else: - # Publish to GitHub if not testing - branch_name = f"{WF_BRANCH_NAME}-{workflow_id}" - commit_message = f"Adding workflow from DeepESDL: {workflow_id}" - pr_title = f"Add workflow and Experiment from DeepESDL: {workflow_id}" - pr_body = "This PR adds a new workflow/experiment to the OSC repository." + return {} + return file_dict + + def publish_all(self, write_to_file: bool = False): + """Publish both dataset and workflow/experiment in a single PR.""" + # Get file dictionaries from both methods + dataset_files = self.publish_dataset(write_to_file=write_to_file) + workflow_files = self.publish_workflow_experiment(write_to_file=write_to_file) + + # Combine the file dictionaries + combined_files = {**dataset_files, **workflow_files} + + if not write_to_file: + # Create branch name, commit message, PR info + branch_name = (f"{OSC_BRANCH_NAME}-{self.collection_id}" + f"-{datetime.now().strftime('%Y%m%d%H%M%S')}") + commit_message = ( + f"Add new dataset collection: {self.collection_id} and " + f"workflow/experiment: {self.workflow_config.get('workflow_id')}" + ) + pr_title = ( + f"Add new dataset collection: {self.collection_id} and " + f"workflow/experiment: {self.workflow_config.get('workflow_id')}" + ) + pr_body = ( + f"This PR adds a new dataset collection: {self.collection_id} and " + f"its corresponding workflow/experiment to the 
repository." + ) + # Publish all files in one go pr_url = self.gh_publisher.publish_files( branch_name=branch_name, - file_dict=file_dict, + file_dict=combined_files, commit_message=commit_message, pr_title=pr_title, pr_body=pr_body, ) - logger.info(f"Pull request created: {pr_url}") + logger.info(f"Pull request created: {pr_url}") \ No newline at end of file From 40de2cd31787562c13514a83be7a6c7455834664 Mon Sep 17 00:00:00 2001 From: tejas Date: Wed, 12 Mar 2025 12:33:54 +0100 Subject: [PATCH 14/43] added lint with ruff to CI --- .github/workflows/unittest-workflow.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/unittest-workflow.yaml b/.github/workflows/unittest-workflow.yaml index 5ed40a3..b0f7f36 100644 --- a/.github/workflows/unittest-workflow.yaml +++ b/.github/workflows/unittest-workflow.yaml @@ -23,6 +23,10 @@ jobs: cd /home/runner/work/deep-code/deep-code pip install .[dev] + - name: Lint with ruff + run: | + ruff check + - name: Run unit tests shell: bash -l {0} run: | From 574a21943558258c7bc0f359eddebde8b893104f Mon Sep 17 00:00:00 2001 From: tejas Date: Wed, 12 Mar 2025 12:35:02 +0100 Subject: [PATCH 15/43] ruff lint checks --- deep_code/cli/publish.py | 3 +- deep_code/constants.py | 5 +- .../tests/utils/test_ogc_record_generator.py | 3 - deep_code/tools/publish.py | 74 ++++++++++--------- deep_code/utils/dataset_stac_generator.py | 21 +++--- deep_code/utils/github_automation.py | 7 +- deep_code/utils/helper.py | 4 - deep_code/utils/ogc_api_record.py | 62 ++++++++-------- deep_code/utils/ogc_record_generator.py | 11 ++- deep_code/utils/osc_extension.py | 1 - 10 files changed, 99 insertions(+), 92 deletions(-) diff --git a/deep_code/cli/publish.py b/deep_code/cli/publish.py index 771eade..fb24b7f 100644 --- a/deep_code/cli/publish.py +++ b/deep_code/cli/publish.py @@ -24,4 +24,5 @@ def publish_workflow(workflow_metadata): workflow_publisher = WorkflowPublisher() workflow_publisher.publish_workflow_experiment( - 
workflow_config_path=workflow_metadata) + workflow_config_path=workflow_metadata + ) diff --git a/deep_code/constants.py b/deep_code/constants.py index 59a993c..c7fb597 100644 --- a/deep_code/constants.py +++ b/deep_code/constants.py @@ -20,5 +20,6 @@ PRODUCT_BASE_CATALOG_SELF_HREF = "https://esa-earthcode.github.io/open-science-catalog-metadata/products/catalog.json" DEEPESDL_COLLECTION_SELF_HREF = ( "https://esa-earthcode.github.io/open-science-catalog-metadata/projects/deepesdl" - "/collection.json") -BASE_URL_OSC = "https://esa-earthcode.github.io/open-science-catalog-metadata" \ No newline at end of file + "/collection.json" +) +BASE_URL_OSC = "https://esa-earthcode.github.io/open-science-catalog-metadata" diff --git a/deep_code/tests/utils/test_ogc_record_generator.py b/deep_code/tests/utils/test_ogc_record_generator.py index f4fe372..88fb835 100644 --- a/deep_code/tests/utils/test_ogc_record_generator.py +++ b/deep_code/tests/utils/test_ogc_record_generator.py @@ -1,5 +1,4 @@ import unittest -from datetime import datetime, timezone from deep_code.constants import DEFAULT_THEME_SCHEME from deep_code.utils.ogc_record_generator import OSCWorkflowOGCApiRecordGenerator @@ -50,8 +49,6 @@ def test_build_record_properties(self): record_properties = generator.build_record_properties(properties, contacts) - now_iso = datetime.now(timezone.utc).isoformat() - self.assertEqual(record_properties.title, "Test Workflow") self.assertEqual(record_properties.description, "A test description") self.assertEqual(len(record_properties.contacts), 1) diff --git a/deep_code/tools/publish.py b/deep_code/tools/publish.py index 80ce321..9532329 100644 --- a/deep_code/tools/publish.py +++ b/deep_code/tools/publish.py @@ -6,24 +6,22 @@ import copy import json import logging -from pathlib import Path from datetime import datetime +from pathlib import Path import fsspec import yaml -from deep_code.constants import ( - OSC_BRANCH_NAME, - OSC_REPO_NAME, - OSC_REPO_OWNER, - WF_BRANCH_NAME -) 
+from deep_code.constants import OSC_BRANCH_NAME, OSC_REPO_NAME, OSC_REPO_OWNER from deep_code.utils.dataset_stac_generator import OscDatasetStacGenerator from deep_code.utils.github_automation import GitHubAutomation -from deep_code.utils.ogc_api_record import WorkflowAsOgcRecord, \ - ExperimentAsOgcRecord, LinksBuilder -from deep_code.utils.ogc_record_generator import OSCWorkflowOGCApiRecordGenerator from deep_code.utils.helper import serialize +from deep_code.utils.ogc_api_record import ( + ExperimentAsOgcRecord, + LinksBuilder, + WorkflowAsOgcRecord, +) +from deep_code.utils.ogc_record_generator import OSCWorkflowOGCApiRecordGenerator logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO) @@ -138,8 +136,9 @@ def _write_to_file(file_path: str, data: dict): with open(file_path, "w") as f: f.write(json_content) - def _update_and_add_to_file_dict(self, file_dict, catalog_path, - update_method, *args): + def _update_and_add_to_file_dict( + self, file_dict, catalog_path, update_method, *args + ): """Update a catalog using the specified method and add it to file_dict. Args: @@ -148,8 +147,9 @@ def _update_and_add_to_file_dict(self, file_dict, catalog_path, update_method: The method to call for updating the catalog. *args: Additional arguments to pass to the update method. """ - full_path = Path( - self.gh_publisher.github_automation.local_clone_dir) / catalog_path + full_path = ( + Path(self.gh_publisher.github_automation.local_clone_dir) / catalog_path + ) updated_catalog = update_method(full_path, *args) file_dict[catalog_path] = updated_catalog.to_dict() @@ -165,17 +165,22 @@ def _update_variable_catalogs(self, generator, file_dict, variable_ids): var_file_path = f"variables/{var_id}/catalog.json" if not self.gh_publisher.github_automation.file_exists(var_file_path): logger.info( - f"Variable catalog for {var_id} does not exist. Creating...") + f"Variable catalog for {var_id} does not exist. Creating..." 
+ ) var_metadata = generator.variables_metadata.get(var_id) var_catalog = generator.build_variable_catalog(var_metadata) file_dict[var_file_path] = var_catalog.to_dict() else: logger.info( - f"Variable catalog already exists for {var_id}, adding product link.") - full_path = Path( - self.gh_publisher.github_automation.local_clone_dir) / var_file_path - updated_catalog = generator.update_existing_variable_catalog(full_path, - var_id) + f"Variable catalog already exists for {var_id}, adding product link." + ) + full_path = ( + Path(self.gh_publisher.github_automation.local_clone_dir) + / var_file_path + ) + updated_catalog = generator.update_existing_variable_catalog( + full_path, var_id + ) file_dict[var_file_path] = updated_catalog.to_dict() def publish_dataset(self, write_to_file: bool = False): @@ -220,20 +225,23 @@ def publish_dataset(self, write_to_file: bool = False): # Update variable base catalog variable_base_catalog_path = "variables/catalog.json" - self._update_and_add_to_file_dict(file_dict, variable_base_catalog_path, - generator.update_variable_base_catalog, variable_ids + self._update_and_add_to_file_dict( + file_dict, + variable_base_catalog_path, + generator.update_variable_base_catalog, + variable_ids, ) # Update product base catalog product_catalog_path = "products/catalog.json" - self._update_and_add_to_file_dict(file_dict, product_catalog_path, - generator.update_product_base_catalog + self._update_and_add_to_file_dict( + file_dict, product_catalog_path, generator.update_product_base_catalog ) # Update DeepESDL collection deepesdl_collection_path = "projects/deep-earth-system-data-lab/collection.json" - self._update_and_add_to_file_dict(file_dict, deepesdl_collection_path, - generator.update_deepesdl_collection + self._update_and_add_to_file_dict( + file_dict, deepesdl_collection_path, generator.update_deepesdl_collection ) # Write to files if testing @@ -243,12 +251,10 @@ def publish_dataset(self, write_to_file: bool = False): return {} return 
file_dict - @staticmethod def _normalize_name(name: str | None) -> str | None: return name.replace(" ", "-").lower() if name else None - def publish_workflow_experiment(self, write_to_file: bool = False): """prepare workflow and experiment as ogc api record to publish it to the specified GitHub repository.""" @@ -275,7 +281,7 @@ def publish_workflow_experiment(self, write_to_file: bool = False): properties=wf_record_properties, links=links + theme_links, jupyter_notebook_url=jupyter_notebook_url, - themes=osc_themes + themes=osc_themes, ) # Convert to dictionary and remove jupyter_notebook_url workflow_dict = workflow_record.to_dict() @@ -295,7 +301,7 @@ def publish_workflow_experiment(self, write_to_file: bool = False): jupyter_notebook_url=jupyter_notebook_url, collection_id=self.collection_id, properties=exp_record_properties, - links=links + theme_links + links=links + theme_links, ) # Convert to dictionary and remove jupyter_notebook_url experiment_dict = experiment_record.to_dict() @@ -324,8 +330,10 @@ def publish_all(self, write_to_file: bool = False): if not write_to_file: # Create branch name, commit message, PR info - branch_name = (f"{OSC_BRANCH_NAME}-{self.collection_id}" - f"-{datetime.now().strftime('%Y%m%d%H%M%S')}") + branch_name = ( + f"{OSC_BRANCH_NAME}-{self.collection_id}" + f"-{datetime.now().strftime('%Y%m%d%H%M%S')}" + ) commit_message = ( f"Add new dataset collection: {self.collection_id} and " f"workflow/experiment: {self.workflow_config.get('workflow_id')}" @@ -348,4 +356,4 @@ def publish_all(self, write_to_file: bool = False): pr_body=pr_body, ) - logger.info(f"Pull request created: {pr_url}") \ No newline at end of file + logger.info(f"Pull request created: {pr_url}") diff --git a/deep_code/utils/dataset_stac_generator.py b/deep_code/utils/dataset_stac_generator.py index b092fac..c30517d 100644 --- a/deep_code/utils/dataset_stac_generator.py +++ b/deep_code/utils/dataset_stac_generator.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -import 
json # Copyright (c) 2025 by Brockmann Consult GmbH # Permissions are hereby granted under the terms of the MIT License: # https://opensource.org/licenses/MIT. @@ -12,10 +11,14 @@ from pystac import Catalog, Collection, Extent, Link, SpatialExtent, TemporalExtent from xcube.core.store import new_data_store -from deep_code.constants import OSC_THEME_SCHEME, VARIABLE_BASE_CATALOG_SELF_HREF, \ - PRODUCT_BASE_CATALOG_SELF_HREF, DEEPESDL_COLLECTION_SELF_HREF -from deep_code.utils.osc_extension import OscExtension +from deep_code.constants import ( + DEEPESDL_COLLECTION_SELF_HREF, + OSC_THEME_SCHEME, + PRODUCT_BASE_CATALOG_SELF_HREF, + VARIABLE_BASE_CATALOG_SELF_HREF, +) from deep_code.utils.ogc_api_record import Theme, ThemeConcept +from deep_code.utils.osc_extension import OscExtension class OscDatasetStacGenerator: @@ -222,10 +225,6 @@ def _add_gcmd_link_to_var_catalog( """ gcmd_keyword_url = var_metadata.get("gcmd_keyword_url") if not gcmd_keyword_url: - self.logger.debug( - f"No gcmd_keyword_url in var_metadata. 
Please input GCMD link " - f"for the {var_metadata.get("variable_id")} catalog" - ) gcmd_keyword_url = input( f"Enter GCMD keyword URL or a similar url for" f" {var_metadata.get("variable_id")}: ").strip() @@ -382,7 +381,7 @@ def update_deepesdl_collection(self, deepesdl_collection_full_path): deepesdl_collection.add_link( Link( rel="related", - target=f"../../themes/cryosphere/catalog.json", + target=f"../../themes/{theme}/catalog.json", media_type="application/json", title=f"Theme: {self.format_string(theme)}" ) @@ -526,9 +525,9 @@ def build_dataset_stac_collection(self) -> Collection: collection.add_link( Link( rel="related", - target=f"../../projects/deep-earth-system-data-lab/collection.json", + target="../../projects/deep-earth-system-data-lab/collection.json", media_type="application/json", - title=f"Project: DeepESDL" + title="Project: DeepESDL" ) ) diff --git a/deep_code/utils/github_automation.py b/deep_code/utils/github_automation.py index 85ac8c9..357f6d8 100644 --- a/deep_code/utils/github_automation.py +++ b/deep_code/utils/github_automation.py @@ -14,6 +14,7 @@ from deep_code.utils.helper import serialize + class GitHubAutomation: """Automates GitHub operations needed to create a Pull Request. 
@@ -53,7 +54,7 @@ def clone_repository(self): try: subprocess.run( ["git", "clone", self.fork_repo_url, self.local_clone_dir], - check=True + check=True, ) logging.info(f"Repository cloned to {self.local_clone_dir}") except subprocess.CalledProcessError as e: @@ -80,7 +81,7 @@ def create_branch(self, branch_name: str): def add_file(self, file_path: str, content): """Add a new file to the local repository.""" logging.info(f"Adding new file: {file_path}...") - os.chdir(self.local_clone_dir) # Ensure we are in the Git repository + os.chdir(self.local_clone_dir) # Ensure we are in the Git repository full_path = Path(self.local_clone_dir) / file_path full_path.parent.mkdir(parents=True, exist_ok=True) @@ -156,4 +157,4 @@ def file_exists(self, file_path) -> bool: # if hasattr(obj, "__dict__"): # return obj.__dict__ # Convert objects with attributes to dicts # raise TypeError( - # f"Object of type {type(obj).__name__} is not JSON serializable") \ No newline at end of file + # f"Object of type {type(obj).__name__} is not JSON serializable") diff --git a/deep_code/utils/helper.py b/deep_code/utils/helper.py index 27e8d82..5ff268f 100644 --- a/deep_code/utils/helper.py +++ b/deep_code/utils/helper.py @@ -1,7 +1,3 @@ -import json -from pathlib import Path - - def serialize(obj): """Convert non-serializable objects to JSON-compatible formats. 
Args: diff --git a/deep_code/utils/ogc_api_record.py b/deep_code/utils/ogc_api_record.py index 47e039c..023689a 100644 --- a/deep_code/utils/ogc_api_record.py +++ b/deep_code/utils/ogc_api_record.py @@ -3,7 +3,7 @@ from xrlint.util.constructible import MappingConstructible from xrlint.util.serializable import JsonSerializable, JsonValue -from deep_code.constants import OGC_API_RECORD_SPEC, BASE_URL_OSC +from deep_code.constants import BASE_URL_OSC, OGC_API_RECORD_SPEC class Contact(MappingConstructible["Contact"], JsonSerializable): @@ -34,6 +34,7 @@ def __init__(self, concepts: list, scheme: str): self.concepts = concepts self.scheme = scheme + class JupyterKernelInfo(MappingConstructible["RecordProperties"], JsonSerializable): def __init__(self, name: str, python_version: float, env_file: str): self.name = name @@ -78,10 +79,9 @@ def to_dict(self, value_name: str | None = None) -> dict[str, JsonValue]: del data["osc_workflow"] # Remove the original key return data + class LinksBuilder: - def __init__(self, - themes: list[str], - ): + def __init__(self, themes: list[str]): self.themes = themes self.theme_links = [] @@ -96,19 +96,21 @@ def build_them_links_for_records(self): "rel": "related", "href": f"../../themes/{theme}/catalog.json", "type": "application/json", - "title": f"Theme: {formated_theme}" + "title": f"Theme: {formated_theme}", } self.theme_links.append(link) return self.theme_links @staticmethod def build_link_to_dataset(collection_id): - return [{ - "rel": "child", - "href": f"../../products/{collection_id}/collection.json", - "type": "application/json", - "title": f"{collection_id}" - }] + return [ + { + "rel": "child", + "href": f"../../products/{collection_id}/collection.json", + "type": "application/json", + "title": f"{collection_id}", + } + ] class WorkflowAsOgcRecord(MappingConstructible["OgcRecord"], JsonSerializable): @@ -122,7 +124,7 @@ def __init__( linkTemplates: list = [], conformsTo: list[str] = None, geometry: Optional[Any] = None, - 
themes: Optional[Any] = None + themes: Optional[Any] = None, ): if conformsTo is None: conformsTo = [OGC_API_RECORD_SPEC] @@ -143,32 +145,31 @@ def _generate_static_links(self): "rel": "root", "href": "../../catalog.json", "type": "application/json", - "title": "Open Science Catalog" + "title": "Open Science Catalog", }, { "rel": "parent", "href": "../catalog.json", "type": "application/json", - "title": "Workflows" + "title": "Workflows", }, { "rel": "child", "href": f"../../experiments/{self.id}/record.json", "type": "application/json", - "title": f"{self.id}" + "title": f"{self.id}", }, { "rel": "jupyter-notebook", "type": "application/json", "title": "Jupyter Notebook", - "href": f"{self.jupyter_notebook_url}" + "href": f"{self.jupyter_notebook_url}", }, - { "rel": "self", "href": f"{BASE_URL_OSC}/workflows/{self.id}/record.json", - "type": "application/json" - } + "type": "application/json", + }, ] # def _assemble_all_links(self): @@ -177,6 +178,7 @@ def _generate_static_links(self): # theme_links = link_builder.build_them_links_for_records() # return static_links + theme_links + class ExperimentAsOgcRecord(MappingConstructible["OgcRecord"], JsonSerializable): def __init__( self, @@ -188,7 +190,7 @@ def __init__( links: list[dict], linkTemplates=None, conformsTo: list[str] = None, - geometry: Optional[Any] = None + geometry: Optional[Any] = None, ): if linkTemplates is None: linkTemplates = [] @@ -211,53 +213,53 @@ def _generate_static_links(self): "rel": "root", "href": "../../catalog.json", "type": "application/json", - "title": "Open Science Catalog" + "title": "Open Science Catalog", }, { "rel": "parent", "href": "../catalog.json", "type": "application/json", - "title": "Experiments" + "title": "Experiments", }, { "rel": "related", "href": f"../../workflows/{self.id}/record.json", "type": "application/json", - "title": "Workflow: POLARIS" + "title": "Workflow: POLARIS", }, { "rel": "child", "href": f"../../products/{self.collection_id}/collection.json", 
"type": "application/json", - "title": f"{self.collection_id}" + "title": f"{self.collection_id}", }, { "rel": "related", "href": "../../projects/deepesdl/collection.json", "type": "application/json", - "title": "Project: DeepESDL" + "title": "Project: DeepESDL", }, { "rel": "input", "href": "./input.yaml", "type": "application/yaml", - "title": "Input parameters" + "title": "Input parameters", }, { "rel": "environment", "href": "./environment.yaml", "type": "application/yaml", - "title": "Execution environment" + "title": "Execution environment", }, { "rel": "self", "href": f"{BASE_URL_OSC}/experiments/{self.id}/record.json", - "type": "application/json" - } + "type": "application/json", + }, ] # def _assemble_all_links(self): # static_links = self._generate_static_links() # link_builder = LinksBuilder(self.themes) # theme_links = link_builder.build_them_links_for_records() - # return static_links + theme_links \ No newline at end of file + # return static_links + theme_links diff --git a/deep_code/utils/ogc_record_generator.py b/deep_code/utils/ogc_record_generator.py index f561ef9..93a02e6 100644 --- a/deep_code/utils/ogc_record_generator.py +++ b/deep_code/utils/ogc_record_generator.py @@ -11,7 +11,7 @@ Contact, RecordProperties, Theme, - ThemeConcept + ThemeConcept, ) @@ -39,7 +39,9 @@ def build_theme(osc_themes: list[str]) -> Theme: concepts = [ThemeConcept(id=theme_str) for theme_str in osc_themes] return Theme(concepts=concepts, scheme=OSC_THEME_SCHEME) - def build_record_properties(self, properties: dict, contacts: list) -> RecordProperties: + def build_record_properties( + self, properties: dict, contacts: list + ) -> RecordProperties: """Build a RecordProperties object from a properties dictionary. 
Args: @@ -64,8 +66,9 @@ def build_record_properties(self, properties: dict, contacts: list) -> RecordPro if themes_list: theme_obj = self.build_theme(themes_list) properties.update( - {"themes": [theme_obj]}) # Wrap the Theme object in a list + {"themes": [theme_obj]} + ) # Wrap the Theme object in a list properties.setdefault("type", "workflow") - return RecordProperties.from_value(properties) \ No newline at end of file + return RecordProperties.from_value(properties) diff --git a/deep_code/utils/osc_extension.py b/deep_code/utils/osc_extension.py index 4f269b9..716c0e9 100644 --- a/deep_code/utils/osc_extension.py +++ b/deep_code/utils/osc_extension.py @@ -13,7 +13,6 @@ from deep_code.constants import CF_SCHEMA_URI, OSC_SCHEMA_URI, THEMES_SCHEMA_URI - class OscExtension( PropertiesExtension, ExtensionManagementMixin[pystac.Item | pystac.Collection] ): From 57f1c8c1022d20cdb5d29f2aab757c4c1d48024f Mon Sep 17 00:00:00 2001 From: tejas Date: Wed, 12 Mar 2025 12:42:38 +0100 Subject: [PATCH 16/43] update cli command to publish all files in one PR --- deep_code/cli/main.py | 5 ++--- deep_code/cli/publish.py | 21 +++++++-------------- 2 files changed, 9 insertions(+), 17 deletions(-) diff --git a/deep_code/cli/main.py b/deep_code/cli/main.py index af140a4..e4f5380 100644 --- a/deep_code/cli/main.py +++ b/deep_code/cli/main.py @@ -6,7 +6,7 @@ import click -from deep_code.cli.publish import publish_dataset, publish_workflow +from deep_code.cli.publish import publish @click.group() @@ -15,8 +15,7 @@ def main(): pass -main.add_command(publish_dataset) -main.add_command(publish_workflow) +main.add_command(publish) if __name__ == "__main__": main() diff --git a/deep_code/cli/publish.py b/deep_code/cli/publish.py index fb24b7f..47e34fb 100644 --- a/deep_code/cli/publish.py +++ b/deep_code/cli/publish.py @@ -6,23 +6,16 @@ import click -from deep_code.tools.publish import Publisher, WorkflowPublisher +from deep_code.tools.publish import Publisher 
-@click.command(name="publish-dataset") +@click.command(name="publish") @click.argument("dataset_config", type=click.Path(exists=True)) -def publish_dataset(dataset_config): +@click.argument("workflow_config", type=click.Path(exists=True)) +def publish(dataset_config, workflow_config): """Request publishing a dataset to the open science catalogue. """ - publisher = Publisher() - publisher.publish_dataset(dataset_config_path=dataset_config) - - -@click.command(name="publish-workflow") -@click.argument("workflow_metadata", type=click.Path(exists=True)) -def publish_workflow(workflow_metadata): - - workflow_publisher = WorkflowPublisher() - workflow_publisher.publish_workflow_experiment( - workflow_config_path=workflow_metadata + publisher = Publisher( + dataset_config_path=dataset_config, workflow_config_path=workflow_config ) + publisher.publish_all() From 0956ad0d0dc437d3bf2b0a907bc386942a737ba6 Mon Sep 17 00:00:00 2001 From: tejas Date: Wed, 12 Mar 2025 12:47:28 +0100 Subject: [PATCH 17/43] updated ci to install ruff explicitly --- .github/workflows/unittest-workflow.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/unittest-workflow.yaml b/.github/workflows/unittest-workflow.yaml index b0f7f36..8c67ec2 100644 --- a/.github/workflows/unittest-workflow.yaml +++ b/.github/workflows/unittest-workflow.yaml @@ -25,6 +25,7 @@ jobs: - name: Lint with ruff run: | + pip install ruff ruff check - name: Run unit tests From ab4adb31ac33c45e46e72784976ea2ddfd67a01f Mon Sep 17 00:00:00 2001 From: tejas Date: Wed, 12 Mar 2025 12:50:59 +0100 Subject: [PATCH 18/43] updated ci --- .github/workflows/unittest-workflow.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/unittest-workflow.yaml b/.github/workflows/unittest-workflow.yaml index 8c67ec2..d83721a 100644 --- a/.github/workflows/unittest-workflow.yaml +++ b/.github/workflows/unittest-workflow.yaml @@ -21,7 +21,7 @@ jobs: shell: bash -l {0} run: | cd 
/home/runner/work/deep-code/deep-code - pip install .[dev] + pip install -e ".[dev]" - name: Lint with ruff run: | From 6844604e8f56c670f6bdab699c62ab2f7f6febdb Mon Sep 17 00:00:00 2001 From: tejas Date: Wed, 12 Mar 2025 18:09:13 +0100 Subject: [PATCH 19/43] remove commented code --- deep_code/utils/github_automation.py | 9 --------- deep_code/utils/ogc_api_record.py | 5 ----- deep_code/utils/osc_extension.py | 12 ------------ 3 files changed, 26 deletions(-) diff --git a/deep_code/utils/github_automation.py b/deep_code/utils/github_automation.py index 357f6d8..244bddd 100644 --- a/deep_code/utils/github_automation.py +++ b/deep_code/utils/github_automation.py @@ -149,12 +149,3 @@ def file_exists(self, file_path) -> bool: exists = os.path.isfile(full_path) logging.debug(f"Checking existence of {full_path}: {exists}") return exists - - # # Check and convert any non-serializable objects - # def serialize(self, obj): - # if isinstance(obj, set): - # return list(obj) # Convert sets to lists - # if hasattr(obj, "__dict__"): - # return obj.__dict__ # Convert objects with attributes to dicts - # raise TypeError( - # f"Object of type {type(obj).__name__} is not JSON serializable") diff --git a/deep_code/utils/ogc_api_record.py b/deep_code/utils/ogc_api_record.py index 023689a..2d42365 100644 --- a/deep_code/utils/ogc_api_record.py +++ b/deep_code/utils/ogc_api_record.py @@ -258,8 +258,3 @@ def _generate_static_links(self): }, ] - # def _assemble_all_links(self): - # static_links = self._generate_static_links() - # link_builder = LinksBuilder(self.themes) - # theme_links = link_builder.build_them_links_for_records() - # return static_links + theme_links diff --git a/deep_code/utils/osc_extension.py b/deep_code/utils/osc_extension.py index 716c0e9..48b68e6 100644 --- a/deep_code/utils/osc_extension.py +++ b/deep_code/utils/osc_extension.py @@ -63,18 +63,6 @@ def osc_project(self) -> str | None: def osc_project(self, v: str) -> None: self._set_property("osc:project", v, 
pop_if_none=False) - # @property - # def osc_themes(self) -> list[str] | None: - # return self._get_property("osc:themes", list) - # - # @osc_themes.setter - # def osc_themes(self, value: list[str]) -> None: - # if not isinstance(value, list) or not all( - # isinstance(item, str) for item in value - # ): - # raise ValueError("osc:themes must be a list of strings") - # self._set_property("osc:themes", value, pop_if_none=False) - @property def osc_region(self) -> str | None: return self._get_property("osc:region", str) From c31184341b424a8141238d8492533648067f9034 Mon Sep 17 00:00:00 2001 From: tejas Date: Thu, 13 Mar 2025 09:24:06 +0100 Subject: [PATCH 20/43] remove commented code --- deep_code/utils/ogc_api_record.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/deep_code/utils/ogc_api_record.py b/deep_code/utils/ogc_api_record.py index 2d42365..ba7a3f9 100644 --- a/deep_code/utils/ogc_api_record.py +++ b/deep_code/utils/ogc_api_record.py @@ -172,12 +172,6 @@ def _generate_static_links(self): }, ] - # def _assemble_all_links(self): - # static_links = self._generate_static_links() - # link_builder = LinksBuilder(self.themes) - # theme_links = link_builder.build_them_links_for_records() - # return static_links + theme_links - class ExperimentAsOgcRecord(MappingConstructible["OgcRecord"], JsonSerializable): def __init__( From 09a43b74723a3e181d2b65b11a67b833f2518d05 Mon Sep 17 00:00:00 2001 From: tejas Date: Thu, 13 Mar 2025 16:03:25 +0100 Subject: [PATCH 21/43] update base catalogs of experiments and workflows --- deep_code/constants.py | 5 ++++ deep_code/tools/publish.py | 55 +++++++++++++++++++++++++++++++++++--- 2 files changed, 57 insertions(+), 3 deletions(-) diff --git a/deep_code/constants.py b/deep_code/constants.py index c7fb597..80aa22f 100644 --- a/deep_code/constants.py +++ b/deep_code/constants.py @@ -23,3 +23,8 @@ "/collection.json" ) BASE_URL_OSC = "https://esa-earthcode.github.io/open-science-catalog-metadata" 
+EXPERIMENT_BASE_CATALOG_SELF_HREF = \ + "https://esa-earthcode.github.io/open-science-catalog-metadata/experiments/catalog.json" +WORKFLOW_BASE_CATALOG_SELF_HREF = \ + ("https://esa-earthcode.github.io/open-science-catalog-metadata/workflow/catalog" + ".json") diff --git a/deep_code/tools/publish.py b/deep_code/tools/publish.py index 9532329..24b2314 100644 --- a/deep_code/tools/publish.py +++ b/deep_code/tools/publish.py @@ -11,8 +11,15 @@ import fsspec import yaml - -from deep_code.constants import OSC_BRANCH_NAME, OSC_REPO_NAME, OSC_REPO_OWNER +from pystac import Catalog, Link + +from deep_code.constants import ( + EXPERIMENT_BASE_CATALOG_SELF_HREF, + OSC_BRANCH_NAME, + OSC_REPO_NAME, + OSC_REPO_OWNER, + WORKFLOW_BASE_CATALOG_SELF_HREF, +) from deep_code.utils.dataset_stac_generator import OscDatasetStacGenerator from deep_code.utils.github_automation import GitHubAutomation from deep_code.utils.helper import serialize @@ -255,6 +262,37 @@ def publish_dataset(self, write_to_file: bool = False): def _normalize_name(name: str | None) -> str | None: return name.replace(" ", "-").lower() if name else None + def _update_base_catalog( + self, catalog_path: str, item_id: str, self_href: str + ) -> Catalog: + """Update a base catalog by adding a link to a new item. + + Args: + catalog_path: Path to the base catalog JSON file. + item_id: ID of the new item (experiment or workflow). + self_href: Self-href for the base catalog. + + Returns: + Updated Catalog object. 
+ """ + # Load the base catalog + base_catalog = Catalog.from_file(Path(self.gh_publisher.github_automation.local_clone_dir) / catalog_path) + + # Add a link to the new item + base_catalog.add_link( + Link( + rel="item", + target=f"./{item_id}/collection.json", + media_type="application/json", + title=item_id, + ) + ) + + # Set the self-href for the base catalog + base_catalog.set_self_href(self_href) + + return base_catalog + def publish_workflow_experiment(self, write_to_file: bool = False): """prepare workflow and experiment as ogc api record to publish it to the specified GitHub repository.""" @@ -287,7 +325,7 @@ def publish_workflow_experiment(self, write_to_file: bool = False): workflow_dict = workflow_record.to_dict() if "jupyter_notebook_url" in workflow_dict: del workflow_dict["jupyter_notebook_url"] - wf_file_path = f"workflow/{workflow_id}/record.json" + wf_file_path = f"workflows/{workflow_id}/record.json" file_dict = {wf_file_path: workflow_dict} # Build properties for the experiment record @@ -312,6 +350,17 @@ def publish_workflow_experiment(self, write_to_file: bool = False): exp_file_path = f"experiments/{workflow_id}/record.json" file_dict[exp_file_path] = experiment_dict + self._update_base_catalog( + catalog_path="experiments/catalog.json", + item_id=workflow_id, + self_href=EXPERIMENT_BASE_CATALOG_SELF_HREF, + ) + + self._update_base_catalog( + catalog_path="workflow/catalog.json", + item_id=workflow_id, + self_href=WORKFLOW_BASE_CATALOG_SELF_HREF, + ) # Write to files if testing if write_to_file: for file_path, data in file_dict.items(): From 259a3d9b73566ac38b0f68f35e91e5866dc17d30 Mon Sep 17 00:00:00 2001 From: tejas Date: Thu, 13 Mar 2025 22:25:34 +0100 Subject: [PATCH 22/43] adapted to generated valid records --- deep_code/constants.py | 3 ++- deep_code/tools/publish.py | 23 ++++++++++++++++------- deep_code/utils/ogc_api_record.py | 24 ++++++++++++++++++++---- deep_code/utils/ogc_record_generator.py | 2 +- 4 files changed, 39 
insertions(+), 13 deletions(-) diff --git a/deep_code/constants.py b/deep_code/constants.py index 80aa22f..a4281b1 100644 --- a/deep_code/constants.py +++ b/deep_code/constants.py @@ -26,5 +26,6 @@ EXPERIMENT_BASE_CATALOG_SELF_HREF = \ "https://esa-earthcode.github.io/open-science-catalog-metadata/experiments/catalog.json" WORKFLOW_BASE_CATALOG_SELF_HREF = \ - ("https://esa-earthcode.github.io/open-science-catalog-metadata/workflow/catalog" + ("https://esa-earthcode.github.io/open-science-catalog-metadata/workflows/catalog" ".json") +PROJECT_COLLECTION_NAME = "deep-earth-system-data-lab" diff --git a/deep_code/tools/publish.py b/deep_code/tools/publish.py index 24b2314..ca51aec 100644 --- a/deep_code/tools/publish.py +++ b/deep_code/tools/publish.py @@ -107,6 +107,7 @@ def __init__(self, dataset_config_path: str, workflow_config_path: str): # Composition self.gh_publisher = GitHubPublisher() self.collection_id = "" + self.workflow_title = "" # Paths to configuration files self.dataset_config_path = dataset_config_path @@ -115,6 +116,7 @@ def __init__(self, dataset_config_path: str, workflow_config_path: str): # Load configuration files self._read_config_files() self.collection_id = self.dataset_config.get("collection_id") + self.workflow_title = self.workflow_config.get("properties", {}).get("title") if not self.collection_id: raise ValueError("collection_id is missing in dataset config.") @@ -158,7 +160,7 @@ def _update_and_add_to_file_dict( Path(self.gh_publisher.github_automation.local_clone_dir) / catalog_path ) updated_catalog = update_method(full_path, *args) - file_dict[catalog_path] = updated_catalog.to_dict() + file_dict[full_path] = updated_catalog.to_dict() def _update_variable_catalogs(self, generator, file_dict, variable_ids): """Update or create variable catalogs and add them to file_dict. 
@@ -282,9 +284,9 @@ def _update_base_catalog( base_catalog.add_link( Link( rel="item", - target=f"./{item_id}/collection.json", + target=f"./{item_id}/record.json", media_type="application/json", - title=item_id, + title=f"{self.workflow_title}", ) ) @@ -309,6 +311,8 @@ def publish_workflow_experiment(self, write_to_file: bool = False): logger.info("Generating OGC API Record for the workflow...") rg = OSCWorkflowOGCApiRecordGenerator() wf_record_properties = rg.build_record_properties(properties_list, contacts) + # make a copy for experiment record + exp_record_properties = copy.deepcopy(wf_record_properties) link_builder = LinksBuilder(osc_themes) theme_links = link_builder.build_them_links_for_records() @@ -316,6 +320,7 @@ def publish_workflow_experiment(self, write_to_file: bool = False): workflow_record = WorkflowAsOgcRecord( id=workflow_id, type="Feature", + title=self.workflow_title, properties=wf_record_properties, links=links + theme_links, jupyter_notebook_url=jupyter_notebook_url, @@ -325,16 +330,18 @@ def publish_workflow_experiment(self, write_to_file: bool = False): workflow_dict = workflow_record.to_dict() if "jupyter_notebook_url" in workflow_dict: del workflow_dict["jupyter_notebook_url"] + if "osc:workflow" in workflow_dict["properties"]: + del workflow_dict["properties"]["osc:workflow"] wf_file_path = f"workflows/{workflow_id}/record.json" file_dict = {wf_file_path: workflow_dict} # Build properties for the experiment record - exp_record_properties = copy.deepcopy(wf_record_properties) exp_record_properties.type = "experiment" exp_record_properties.osc_workflow = workflow_id experiment_record = ExperimentAsOgcRecord( id=workflow_id, + title=self.workflow_title, type="Feature", jupyter_notebook_url=jupyter_notebook_url, collection_id=self.collection_id, @@ -347,17 +354,19 @@ def publish_workflow_experiment(self, write_to_file: bool = False): del experiment_dict["jupyter_notebook_url"] if "collection_id" in experiment_dict: del 
experiment_dict["collection_id"] + if "osc:project" in experiment_dict["properties"]: + del experiment_dict["properties"]["osc:project"] exp_file_path = f"experiments/{workflow_id}/record.json" file_dict[exp_file_path] = experiment_dict - self._update_base_catalog( + file_dict["experiments/catalog.json"] = self._update_base_catalog( catalog_path="experiments/catalog.json", item_id=workflow_id, self_href=EXPERIMENT_BASE_CATALOG_SELF_HREF, ) - self._update_base_catalog( - catalog_path="workflow/catalog.json", + file_dict["workflows/catalog.json"] = self._update_base_catalog( + catalog_path="workflows/catalog.json", item_id=workflow_id, self_href=WORKFLOW_BASE_CATALOG_SELF_HREF, ) diff --git a/deep_code/utils/ogc_api_record.py b/deep_code/utils/ogc_api_record.py index ba7a3f9..457320f 100644 --- a/deep_code/utils/ogc_api_record.py +++ b/deep_code/utils/ogc_api_record.py @@ -3,7 +3,8 @@ from xrlint.util.constructible import MappingConstructible from xrlint.util.serializable import JsonSerializable, JsonValue -from deep_code.constants import BASE_URL_OSC, OGC_API_RECORD_SPEC +from deep_code.constants import BASE_URL_OSC, OGC_API_RECORD_SPEC, \ + PROJECT_COLLECTION_NAME class Contact(MappingConstructible["Contact"], JsonSerializable): @@ -50,6 +51,7 @@ def __init__( title: str, description: str, jupyter_kernel_info: JupyterKernelInfo, + osc_project: str, osc_workflow: str = None, updated: str = None, contacts: list[Contact] = None, @@ -64,6 +66,7 @@ def __init__( self.title = title self.description = description self.jupyter_kernel_info = jupyter_kernel_info + self.osc_project = osc_project self.osc_workflow = osc_workflow self.keywords = keywords or [] self.contacts = contacts @@ -77,6 +80,9 @@ def to_dict(self, value_name: str | None = None) -> dict[str, JsonValue]: if self.osc_workflow is not None: data["osc:workflow"] = self.osc_workflow del data["osc_workflow"] # Remove the original key + if self.osc_project is not None: + data["osc:project"] = self.osc_project + 
del data["osc_project"] return data @@ -118,6 +124,7 @@ def __init__( self, id: str, type: str, + title: str, jupyter_notebook_url: str, properties: RecordProperties, links: list[dict], @@ -130,6 +137,7 @@ def __init__( conformsTo = [OGC_API_RECORD_SPEC] self.id = id self.type = type + self.title = title self.jupyter_notebook_url = jupyter_notebook_url self.geometry = geometry self.properties = properties @@ -157,7 +165,7 @@ def _generate_static_links(self): "rel": "child", "href": f"../../experiments/{self.id}/record.json", "type": "application/json", - "title": f"{self.id}", + "title": f"{self.title}", }, { "rel": "jupyter-notebook", @@ -165,6 +173,12 @@ def _generate_static_links(self): "title": "Jupyter Notebook", "href": f"{self.jupyter_notebook_url}", }, + { + "rel": "related", + "href": f"../../projects/{PROJECT_COLLECTION_NAME}/collection.json", + "type": "application/json", + "title": "Project: DeepESDL", + }, { "rel": "self", "href": f"{BASE_URL_OSC}/workflows/{self.id}/record.json", @@ -177,6 +191,7 @@ class ExperimentAsOgcRecord(MappingConstructible["OgcRecord"], JsonSerializable) def __init__( self, id: str, + title: str, type: str, jupyter_notebook_url: str, collection_id: str, @@ -191,6 +206,7 @@ def __init__( if conformsTo is None: conformsTo = [OGC_API_RECORD_SPEC] self.id = id + self.title = title self.type = type self.conformsTo = conformsTo self.jupyter_notebook_url = jupyter_notebook_url @@ -219,7 +235,7 @@ def _generate_static_links(self): "rel": "related", "href": f"../../workflows/{self.id}/record.json", "type": "application/json", - "title": "Workflow: POLARIS", + "title": f"Workflow: {self.title}", }, { "rel": "child", @@ -229,7 +245,7 @@ def _generate_static_links(self): }, { "rel": "related", - "href": "../../projects/deepesdl/collection.json", + "href": f"../../projects/{PROJECT_COLLECTION_NAME}/collection.json", "type": "application/json", "title": "Project: DeepESDL", }, diff --git a/deep_code/utils/ogc_record_generator.py 
b/deep_code/utils/ogc_record_generator.py index 93a02e6..4dc0bec 100644 --- a/deep_code/utils/ogc_record_generator.py +++ b/deep_code/utils/ogc_record_generator.py @@ -47,7 +47,6 @@ def build_record_properties( Args: properties: A dictionary containing properties (e.g., title, description, themes). contacts: A list of contact dictionaries. - caller: The caller type ("WorkflowAsOgcRecord" or "ExperimentAsOgcRecord"). Returns: A RecordProperties object. @@ -70,5 +69,6 @@ def build_record_properties( ) # Wrap the Theme object in a list properties.setdefault("type", "workflow") + properties.setdefault("osc_project", "deep-earth-system-data-lab") return RecordProperties.from_value(properties) From 4b4e4da95810204638ab62bb394e1b9c388be915 Mon Sep 17 00:00:00 2001 From: tejas Date: Fri, 14 Mar 2025 11:12:41 +0100 Subject: [PATCH 23/43] refactor --- deep_code/tools/publish.py | 9 +++++---- deep_code/utils/ogc_api_record.py | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/deep_code/tools/publish.py b/deep_code/tools/publish.py index ca51aec..c17d56f 100644 --- a/deep_code/tools/publish.py +++ b/deep_code/tools/publish.py @@ -326,12 +326,12 @@ def publish_workflow_experiment(self, write_to_file: bool = False): jupyter_notebook_url=jupyter_notebook_url, themes=osc_themes, ) - # Convert to dictionary and remove jupyter_notebook_url + # Convert to dictionary and cleanup workflow_dict = workflow_record.to_dict() if "jupyter_notebook_url" in workflow_dict: del workflow_dict["jupyter_notebook_url"] - if "osc:workflow" in workflow_dict["properties"]: - del workflow_dict["properties"]["osc:workflow"] + if "osc_workflow" in workflow_dict["properties"]: + del workflow_dict["properties"]["osc_workflow"] wf_file_path = f"workflows/{workflow_id}/record.json" file_dict = {wf_file_path: workflow_dict} @@ -348,7 +348,7 @@ def publish_workflow_experiment(self, write_to_file: bool = False): properties=exp_record_properties, links=links + theme_links, ) - # Convert to 
dictionary and remove jupyter_notebook_url + # Convert to dictionary and cleanup experiment_dict = experiment_record.to_dict() if "jupyter_notebook_url" in experiment_dict: del experiment_dict["jupyter_notebook_url"] @@ -359,6 +359,7 @@ def publish_workflow_experiment(self, write_to_file: bool = False): exp_file_path = f"experiments/{workflow_id}/record.json" file_dict[exp_file_path] = experiment_dict + # Update base catalogs of experiments and workflows with links file_dict["experiments/catalog.json"] = self._update_base_catalog( catalog_path="experiments/catalog.json", item_id=workflow_id, diff --git a/deep_code/utils/ogc_api_record.py b/deep_code/utils/ogc_api_record.py index 457320f..3958431 100644 --- a/deep_code/utils/ogc_api_record.py +++ b/deep_code/utils/ogc_api_record.py @@ -79,7 +79,7 @@ def to_dict(self, value_name: str | None = None) -> dict[str, JsonValue]: data = super().to_dict(value_name) if self.osc_workflow is not None: data["osc:workflow"] = self.osc_workflow - del data["osc_workflow"] # Remove the original key + del data["osc_workflow"] # Remove the original key as it be renamed it if self.osc_project is not None: data["osc:project"] = self.osc_project del data["osc_project"] From db535f85372cd3b00cfdbde2e67aae91f0ac2393 Mon Sep 17 00:00:00 2001 From: tejas Date: Fri, 14 Mar 2025 13:20:54 +0100 Subject: [PATCH 24/43] updated unit tests --- deep_code/tests/tools/test_publish.py | 120 -------- .../tests/utils/test_github_automation.py | 173 +++++++----- deep_code/tests/utils/test_ogc_api_record.py | 258 +++++++++++++----- .../tests/utils/test_ogc_record_generator.py | 4 +- 4 files changed, 308 insertions(+), 247 deletions(-) delete mode 100644 deep_code/tests/tools/test_publish.py diff --git a/deep_code/tests/tools/test_publish.py b/deep_code/tests/tools/test_publish.py deleted file mode 100644 index ff5d1ef..0000000 --- a/deep_code/tests/tools/test_publish.py +++ /dev/null @@ -1,120 +0,0 @@ -from unittest.mock import MagicMock, mock_open, patch 
- -import pytest - -from deep_code.tools.publish import Publisher - - -class TestDatasetPublisher: - @patch("deep_code.tools.publish.fsspec.open") - def test_init_missing_credentials(self, mock_fsspec_open): - mock_fsspec_open.return_value.__enter__.return_value = mock_open( - read_data="{}" - )() - - with pytest.raises( - ValueError, match="GitHub credentials are missing in the `.gitaccess` file." - ): - Publisher() - - @patch("deep_code.tools.publish.fsspec.open") - def test_publish_dataset_missing_ids(self, mock_fsspec_open): - git_yaml_content = """ - github-username: test-user - github-token: test-token - """ - dataset_yaml_content = """ - collection-id: test-collection - """ - mock_fsspec_open.side_effect = [ - mock_open(read_data=git_yaml_content)(), - mock_open(read_data=dataset_yaml_content)(), - ] - - publisher = Publisher() - - with pytest.raises( - ValueError, match="Dataset ID or Collection ID missing in the config." - ): - publisher.publish_dataset("/path/to/dataset-config.yaml") - - @patch("deep_code.utils.github_automation.os.chdir") - @patch("deep_code.utils.github_automation.subprocess.run") - @patch("deep_code.utils.github_automation.os.path.expanduser", return_value="/tmp") - @patch("requests.post") - @patch("deep_code.utils.github_automation.GitHubAutomation") - @patch("deep_code.tools.publish.fsspec.open") - def test_publish_dataset_success( - self, - mock_fsspec_open, - mock_github_automation, - mock_requests_post, - mock_expanduser, - mock_subprocess_run, - mock_chdir, - ): - # Mock the YAML reads - git_yaml_content = """ - github-username: test-user - github-token: test-token - """ - dataset_yaml_content = """ - dataset_id: test-dataset - collection_id: test-collection - documentation_link: http://example.com/doc - access_link: http://example.com/access - dataset_status: ongoing - dataset_region: Global - osc_theme: ["climate"] - cf_parameter: [] - """ - mock_fsspec_open.side_effect = [ - mock_open(read_data=git_yaml_content)(), - 
mock_open(read_data=dataset_yaml_content)(), - ] - - # Mock GitHubAutomation methods - mock_git = mock_github_automation.return_value - mock_git.fork_repository.return_value = None - mock_git.clone_repository.return_value = None - mock_git.create_branch.return_value = None - mock_git.add_file.return_value = None - mock_git.commit_and_push.return_value = None - mock_git.create_pull_request.return_value = "http://example.com/pr" - mock_git.clean_up.return_value = None - - # Mock subprocess.run & os.chdir - mock_subprocess_run.return_value = None - mock_chdir.return_value = None - - # Mock STAC generator - mock_collection = MagicMock() - mock_collection.to_dict.return_value = { - "type": "Collection", - "id": "test-collection", - "description": "A test STAC collection", - "extent": { - "spatial": {"bbox": [[-180.0, -90.0, 180.0, 90.0]]}, - "temporal": {"interval": [["2023-01-01T00:00:00Z", None]]}, - }, - "links": [], - "stac_version": "1.0.0", - } - with patch("deep_code.tools.publish.OscDatasetStacGenerator") as mock_generator: - mock_generator.return_value.build_dataset_stac_collection.return_value = ( - mock_collection - ) - - # Instantiate & publish - publisher = Publisher() - publisher.publish_dataset("/fake/path/to/dataset-config.yaml") - - # Assert that we called git clone with /tmp/temp_repo - # Because expanduser("~") is now patched to /tmp, the actual path is /tmp/temp_repo - auth_url = "https://test-user:test-token@github.com/test-user/open-science-catalog-metadata-testing.git" - mock_subprocess_run.assert_any_call( - ["git", "clone", auth_url, "/tmp/temp_repo"], check=True - ) - - # Also confirm we changed directories to /tmp/temp_repo - mock_chdir.assert_any_call("/tmp/temp_repo") diff --git a/deep_code/tests/utils/test_github_automation.py b/deep_code/tests/utils/test_github_automation.py index 6a66868..2456f0d 100644 --- a/deep_code/tests/utils/test_github_automation.py +++ b/deep_code/tests/utils/test_github_automation.py @@ -1,4 +1,4 @@ -import json 
+import logging import unittest from pathlib import Path from unittest.mock import MagicMock, patch @@ -8,113 +8,164 @@ class TestGitHubAutomation(unittest.TestCase): def setUp(self): - self.github = GitHubAutomation( - username="test-user", - token="test-token", - repo_owner="test-owner", - repo_name="test-repo", + # Set up test data + self.username = "testuser" + self.token = "testtoken" + self.repo_owner = "testowner" + self.repo_name = "testrepo" + self.github_automation = GitHubAutomation( + self.username, self.token, self.repo_owner, self.repo_name ) + logging.disable(logging.CRITICAL) # Disable logging during tests + + def tearDown(self): + logging.disable(logging.NOTSET) # Re-enable logging after tests @patch("requests.post") def test_fork_repository(self, mock_post): - """Test the fork_repository method.""" + # Mock the response from GitHub API mock_response = MagicMock() mock_response.raise_for_status.return_value = None mock_post.return_value = mock_response - self.github.fork_repository() + # Call the method + self.github_automation.fork_repository() + # Assertions mock_post.assert_called_once_with( - "https://api.github.com/repos/test-owner/test-repo/forks", - headers={"Authorization": "token test-token"}, + f"https://api.github.com/repos/{self.repo_owner}/{self.repo_name}/forks", + headers={"Authorization": f"token {self.token}"}, ) @patch("subprocess.run") - @patch("os.chdir") - def test_clone_repository(self, mock_chdir, mock_run): - """Test the clone_repository method.""" - self.github.clone_repository() + def test_clone_repository_new(self, mock_run): + # Mock the subprocess.run method + mock_run.return_value = MagicMock() + + # Mock os.path.exists to return False (directory does not exist) + with patch("os.path.exists", return_value=False): + self.github_automation.clone_repository() + # Assertions mock_run.assert_called_once_with( - ["git", "clone", self.github.fork_repo_url, self.github.local_clone_dir], + [ + "git", + "clone", + 
f"https://{self.username}:{self.token}@github.com/{self.username}/{self.repo_name}.git", + self.github_automation.local_clone_dir, + ], check=True, ) - mock_chdir.assert_called_once_with(self.github.local_clone_dir) + + @patch("subprocess.run") + def test_clone_repository_existing(self, mock_run): + # Mock the subprocess.run method + mock_run.return_value = MagicMock() + + # Mock os.path.exists to return True (directory exists) + with patch("os.path.exists", return_value=True): + with patch("os.chdir"): + self.github_automation.clone_repository() + + # Assertions + mock_run.assert_called_once_with(["git", "pull"], check=True) @patch("subprocess.run") def test_create_branch(self, mock_run): - """Test the create_branch method.""" - branch_name = "test-branch" - self.github.create_branch(branch_name) + # Mock the subprocess.run method + mock_run.return_value = MagicMock() + + # Mock os.chdir + with patch("os.chdir"): + self.github_automation.create_branch("test-branch") + # Assertions mock_run.assert_called_once_with( - ["git", "checkout", "-b", branch_name], check=True + ["git", "checkout", "-b", "test-branch"], check=True ) @patch("subprocess.run") - @patch("builtins.open", new_callable=unittest.mock.mock_open) - @patch("pathlib.Path.mkdir") - def test_add_file(self, mock_mkdir, mock_open, mock_run): - """Test the add_file method.""" - file_path = "test-dir/test-file.json" - content = {"key": "value"} - - self.github.add_file(file_path, content) - - mock_mkdir.assert_called_once_with(parents=True, exist_ok=True) - mock_open.assert_called_once_with( - Path(self.github.local_clone_dir) / file_path, "w" - ) - mock_open().write.assert_called_once_with(json.dumps(content, indent=2)) + def test_add_file(self, mock_run): + # Mock the subprocess.run method + mock_run.return_value = MagicMock() + + # Mock os.chdir and Path + with patch("os.chdir"), patch("pathlib.Path.mkdir"), patch( + "builtins.open", unittest.mock.mock_open() + ): + 
self.github_automation.add_file("test/file.json", {"key": "value"}) + + # Assertions mock_run.assert_called_once_with( - ["git", "add", str(Path(self.github.local_clone_dir) / file_path)], + [ + "git", + "add", + str(Path(self.github_automation.local_clone_dir) / "test/file.json"), + ], check=True, ) @patch("subprocess.run") def test_commit_and_push(self, mock_run): - """Test the commit_and_push method.""" - branch_name = "test-branch" - commit_message = "Test commit message" + # Mock the subprocess.run method + mock_run.return_value = MagicMock() - self.github.commit_and_push(branch_name, commit_message) + # Mock os.chdir + with patch("os.chdir"): + self.github_automation.commit_and_push("test-branch", "Test commit message") - mock_run.assert_any_call(["git", "commit", "-m", commit_message], check=True) + # Assertions + mock_run.assert_any_call( + ["git", "commit", "-m", "Test commit message"], check=True + ) mock_run.assert_any_call( - ["git", "push", "-u", "origin", branch_name], check=True + ["git", "push", "-u", "origin", "test-branch"], check=True ) @patch("requests.post") def test_create_pull_request(self, mock_post): - """Test the create_pull_request method.""" - branch_name = "test-branch" - pr_title = "Test PR" - pr_body = "This is a test PR" - base_branch = "main" - + # Mock the response from GitHub API mock_response = MagicMock() - mock_response.json.return_value = {"html_url": "https://github.com/test-pr"} mock_response.raise_for_status.return_value = None + mock_response.json.return_value = {"html_url": "https://github.com/test/pull/1"} mock_post.return_value = mock_response - self.github.create_pull_request(branch_name, pr_title, pr_body, base_branch) + # Mock os.chdir + with patch("os.chdir"): + self.github_automation.create_pull_request( + "test-branch", "Test PR", "Test body" + ) + # Assertions mock_post.assert_called_once_with( - "https://api.github.com/repos/test-owner/test-repo/pulls", - headers={"Authorization": "token test-token"}, + 
f"https://api.github.com/repos/{self.repo_owner}/{self.repo_name}/pulls", + headers={"Authorization": f"token {self.token}"}, json={ - "title": pr_title, - "head": f"test-user:{branch_name}", - "base": base_branch, - "body": pr_body, + "title": "Test PR", + "head": f"{self.username}:test-branch", + "base": "main", + "body": "Test body", }, ) @patch("subprocess.run") - @patch("os.chdir") - def test_clean_up(self, mock_chdir, mock_run): - """Test the clean_up method.""" - self.github.clean_up() + def test_clean_up(self, mock_run): + # Mock the subprocess.run method + mock_run.return_value = MagicMock() + + # Mock os.chdir + with patch("os.chdir"): + self.github_automation.clean_up() + + # Assertions + mock_run.assert_called_once_with( + ["rm", "-rf", self.github_automation.local_clone_dir] + ) + + def test_file_exists(self): + # Mock os.path.isfile + with patch("os.path.isfile", return_value=True): + result = self.github_automation.file_exists("test/file.json") - mock_chdir.assert_called_once_with("..") - mock_run.assert_called_once_with(["rm", "-rf", self.github.local_clone_dir]) + # Assertions + self.assertTrue(result) diff --git a/deep_code/tests/utils/test_ogc_api_record.py b/deep_code/tests/utils/test_ogc_api_record.py index 52640fe..bf7ae69 100644 --- a/deep_code/tests/utils/test_ogc_api_record.py +++ b/deep_code/tests/utils/test_ogc_api_record.py @@ -3,111 +3,241 @@ from deep_code.constants import OGC_API_RECORD_SPEC from deep_code.utils.ogc_api_record import ( Contact, + ExperimentAsOgcRecord, JupyterKernelInfo, - OgcRecord, + LinksBuilder, RecordProperties, Theme, ThemeConcept, + WorkflowAsOgcRecord, ) -class TestClasses(unittest.TestCase): +class TestContact(unittest.TestCase): def test_contact_initialization(self): contact = Contact( - name="Person-X", - organization="Organization X", + name="John Doe", + organization="DeepESDL", position="Researcher", - links=[{"url": "http://example.com", "type": "website"}], + links=[{"href": "https://example.com"}], 
contactInstructions="Contact via email", - roles=["developer", "reviewer"], + roles=["principal investigator"], ) - self.assertEqual(contact.name, "Person-X") - self.assertEqual(contact.organization, "Organization X") + self.assertEqual(contact.name, "John Doe") + self.assertEqual(contact.organization, "DeepESDL") self.assertEqual(contact.position, "Researcher") - self.assertEqual(len(contact.links), 1) + self.assertEqual(contact.links, [{"href": "https://example.com"}]) self.assertEqual(contact.contactInstructions, "Contact via email") - self.assertIn("developer", contact.roles) + self.assertEqual(contact.roles, ["principal investigator"]) + def test_contact_default_values(self): + contact = Contact(name="Jane Doe", organization="DeepESDL") + + self.assertEqual(contact.position, "") + self.assertEqual(contact.links, []) + self.assertEqual(contact.contactInstructions, "") + self.assertEqual(contact.roles, ["principal investigator"]) + + +class TestThemeConcept(unittest.TestCase): def test_theme_concept_initialization(self): - theme_concept = ThemeConcept(id="concept1") - self.assertEqual(theme_concept.id, "concept1") + theme_concept = ThemeConcept(id="climate") + + self.assertEqual(theme_concept.id, "climate") + +class TestTheme(unittest.TestCase): def test_theme_initialization(self): - theme_concepts = [ThemeConcept(id="concept1"), ThemeConcept(id="concept2")] - theme = Theme(concepts=theme_concepts, scheme="http://example.com/scheme") + theme_concept = ThemeConcept(id="climate") + theme = Theme(concepts=[theme_concept], scheme="https://example.com") - self.assertEqual(len(theme.concepts), 2) - self.assertEqual(theme.scheme, "http://example.com/scheme") + self.assertEqual(theme.concepts, [theme_concept]) + self.assertEqual(theme.scheme, "https://example.com") + +class TestJupyterKernelInfo(unittest.TestCase): def test_jupyter_kernel_info_initialization(self): kernel_info = JupyterKernelInfo( - name="Python", python_version=3.9, env_file="env.yml" + name="python3", 
python_version=3.9, env_file="environment.yml" ) - self.assertEqual(kernel_info.name, "Python") + self.assertEqual(kernel_info.name, "python3") self.assertEqual(kernel_info.python_version, 3.9) - self.assertEqual(kernel_info.env_file, "env.yml") + self.assertEqual(kernel_info.env_file, "environment.yml") + +class TestRecordProperties(unittest.TestCase): def test_record_properties_initialization(self): kernel_info = JupyterKernelInfo( - name="Python", python_version=3.9, env_file="env.yml" + name="python3", python_version=3.9, env_file="environment.yml" + ) + contact = Contact(name="John Doe", organization="DeepESDL") + theme = Theme( + concepts=[ThemeConcept(id="climate")], scheme="https://example.com" + ) + + record_properties = RecordProperties( + created="2023-01-01", + type="workflow", + title="Test Workflow", + description="A test workflow", + jupyter_kernel_info=kernel_info, + osc_project="DeepESDL", + osc_workflow="test-workflow", + updated="2023-01-02", + contacts=[contact], + themes=[theme], + keywords=["test", "workflow"], + formats=[{"type": "application/json"}], + license="MIT", + ) + + self.assertEqual(record_properties.created, "2023-01-01") + self.assertEqual(record_properties.updated, "2023-01-02") + self.assertEqual(record_properties.type, "workflow") + self.assertEqual(record_properties.title, "Test Workflow") + self.assertEqual(record_properties.description, "A test workflow") + self.assertEqual(record_properties.jupyter_kernel_info, kernel_info) + self.assertEqual(record_properties.osc_project, "deep-esdl") + self.assertEqual(record_properties.osc_workflow, "test-workflow") + self.assertEqual(record_properties.keywords, ["test", "workflow"]) + self.assertEqual(record_properties.contacts, [contact]) + self.assertEqual(record_properties.themes, [theme]) + self.assertEqual(record_properties.formats, [{"type": "application/json"}]) + self.assertEqual(record_properties.license, "MIT") + + def test_record_properties_to_dict(self): + kernel_info = 
JupyterKernelInfo( + name="python3", python_version=3.9, env_file="environment.yml" + ) + record_properties = RecordProperties( + created="2023-01-01", + type="workflow", + title="Test Workflow", + description="A test workflow", + jupyter_kernel_info=kernel_info, + osc_project="DeepESDL", + osc_workflow="test-workflow", ) - contacts = [Contact(name="Jane Doe", organization="Org Y")] - themes = [Theme(concepts=[ThemeConcept(id="concept1")], scheme="scheme1")] + result = record_properties.to_dict() + + self.assertEqual(result["created"], "2023-01-01") + self.assertEqual(result["type"], "workflow") + self.assertEqual(result["title"], "Test Workflow") + self.assertEqual(result["description"], "A test workflow") + self.assertEqual(result["jupyter_kernel_info"], kernel_info.to_dict()) + self.assertEqual(result["osc:project"], "DeepESDL") + self.assertEqual(result["osc:workflow"], "test-workflow") + self.assertNotIn("osc_project", result) + self.assertNotIn("osc_workflow", result) + + +class TestLinksBuilder(unittest.TestCase): + def test_build_theme_links_for_records(self): + links_builder = LinksBuilder(themes=["climate", "ocean"]) + theme_links = links_builder.build_them_links_for_records() + + expected_links = [ + { + "rel": "related", + "href": "../../themes/climate/catalog.json", + "type": "application/json", + "title": "Theme: Climate", + }, + { + "rel": "related", + "href": "../../themes/ocean/catalog.json", + "type": "application/json", + "title": "Theme: Ocean", + }, + ] + + self.assertEqual(theme_links, expected_links) + + def test_build_link_to_dataset(self): + link = LinksBuilder.build_link_to_dataset("test-collection") + + expected_link = [ + { + "rel": "child", + "href": "../../products/test-collection/collection.json", + "type": "application/json", + "title": "test-collection", + } + ] + + self.assertEqual(link, expected_link) + + +class TestWorkflowAsOgcRecord(unittest.TestCase): + def test_workflow_as_ogc_record_initialization(self): + kernel_info = 
JupyterKernelInfo( + name="python3", python_version=3.9, env_file="environment.yml" + ) record_properties = RecordProperties( - created="2025-01-01", - type="dataset", - title="Sample Dataset", - description="A sample dataset", + created="2023-01-01", + type="workflow", + title="Test Workflow", + description="A test workflow", jupyter_kernel_info=kernel_info, - updated="2025-01-02", - contacts=contacts, - themes=themes, - keywords=["sample", "test"], - formats=[{"format": "JSON"}], - license="CC-BY", + osc_project="DeepESDL", ) - self.assertEqual(record_properties.created, "2025-01-01") - self.assertEqual(record_properties.updated, "2025-01-02") - self.assertEqual(record_properties.type, "dataset") - self.assertEqual(record_properties.title, "Sample Dataset") - self.assertEqual(record_properties.description, "A sample dataset") - self.assertEqual(record_properties.jupyter_kernel_info.name, "Python") - self.assertEqual(len(record_properties.contacts), 1) - self.assertEqual(len(record_properties.themes), 1) - self.assertIn("sample", record_properties.keywords) - self.assertEqual(record_properties.license, "CC-BY") - - def test_ogc_record_initialization(self): + workflow_record = WorkflowAsOgcRecord( + id="test-workflow", + type="workflow", + title="Test Workflow", + jupyter_notebook_url="https://example.com/notebook.ipynb", + properties=record_properties, + links=[{"rel": "self", "href": "https://example.com"}], + ) + + self.assertEqual(workflow_record.id, "test-workflow") + self.assertEqual(workflow_record.type, "workflow") + self.assertEqual(workflow_record.title, "Test Workflow") + self.assertEqual( + workflow_record.jupyter_notebook_url, "https://example.com/notebook.ipynb" + ) + self.assertEqual(workflow_record.properties, record_properties) + self.assertEqual(workflow_record.conformsTo, [OGC_API_RECORD_SPEC]) + self.assertEqual(workflow_record.links[0]["rel"], "root") + self.assertEqual(workflow_record.links[-1]["rel"], "self") + + +class 
TestExperimentAsOgcRecord(unittest.TestCase): + def test_experiment_as_ogc_record_initialization(self): kernel_info = JupyterKernelInfo( - name="Python", python_version=3.9, env_file="env.yml" + name="python3", python_version=3.12, env_file="environment.yml" ) - properties = RecordProperties( - created="2025-01-01", - type="dataset", - title="Sample Dataset", - description="A sample dataset", + record_properties = RecordProperties( + created="2023-01-01", + type="experiment", + title="Test Experiment", + description="A test experiment", jupyter_kernel_info=kernel_info, + osc_project="DeepESDL", ) - ogc_record = OgcRecord( - id="record1", - type="Feature", - time={"start": "2025-01-01T00:00:00Z", "end": "2025-01-02T00:00:00Z"}, - properties=properties, - links=[{"href": "http://example.com", "rel": "self"}], - linkTemplates=[{"template": "http://example.com/{id}"}], + experiment_record = ExperimentAsOgcRecord( + id="test-experiment", + title="Test Experiment", + type="experiment", + jupyter_notebook_url="https://example.com/notebook.ipynb", + collection_id="test-collection", + properties=record_properties, + links=[{"rel": "self", "href": "https://example.com"}], ) - self.assertEqual(ogc_record.id, "record1") - self.assertEqual(ogc_record.type, "Feature") - self.assertEqual(ogc_record.time["start"], "2025-01-01T00:00:00Z") - self.assertEqual(ogc_record.properties.title, "Sample Dataset") - self.assertEqual(len(ogc_record.links), 1) + self.assertEqual(experiment_record.id, "test-experiment") + self.assertEqual(experiment_record.title, "Test Experiment") + self.assertEqual(experiment_record.type, "experiment") self.assertEqual( - ogc_record.linkTemplates[0]["template"], "http://example.com/{id}" + experiment_record.jupyter_notebook_url, "https://example.com/notebook.ipynb" ) - self.assertEqual(ogc_record.conformsTo[0], OGC_API_RECORD_SPEC) + self.assertEqual(experiment_record.collection_id, "test-collection") + self.assertEqual(experiment_record.properties, 
record_properties) + self.assertEqual(experiment_record.conformsTo, [OGC_API_RECORD_SPEC]) + self.assertEqual(experiment_record.links[0]["rel"], "root") + self.assertEqual(experiment_record.links[-1]["rel"], "self") diff --git a/deep_code/tests/utils/test_ogc_record_generator.py b/deep_code/tests/utils/test_ogc_record_generator.py index 88fb835..d56cf7d 100644 --- a/deep_code/tests/utils/test_ogc_record_generator.py +++ b/deep_code/tests/utils/test_ogc_record_generator.py @@ -1,6 +1,6 @@ import unittest -from deep_code.constants import DEFAULT_THEME_SCHEME +from deep_code.constants import OSC_THEME_SCHEME from deep_code.utils.ogc_record_generator import OSCWorkflowOGCApiRecordGenerator @@ -29,7 +29,7 @@ def test_build_theme(self): self.assertEqual(len(theme.concepts), 2) self.assertEqual(theme.concepts[0].id, "theme1") self.assertEqual(theme.concepts[1].id, "theme2") - self.assertEqual(theme.scheme, DEFAULT_THEME_SCHEME) + self.assertEqual(theme.scheme, OSC_THEME_SCHEME) def test_build_record_properties(self): generator = OSCWorkflowOGCApiRecordGenerator() From d522ce62b462f3ddd03b1b8ba02a2b6cdc0e496c Mon Sep 17 00:00:00 2001 From: tejas Date: Fri, 14 Mar 2025 13:21:11 +0100 Subject: [PATCH 25/43] formatting --- deep_code/constants.py | 10 +++++----- deep_code/tools/publish.py | 4 +++- deep_code/utils/ogc_api_record.py | 8 +++++--- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/deep_code/constants.py b/deep_code/constants.py index a4281b1..814a03f 100644 --- a/deep_code/constants.py +++ b/deep_code/constants.py @@ -23,9 +23,9 @@ "/collection.json" ) BASE_URL_OSC = "https://esa-earthcode.github.io/open-science-catalog-metadata" -EXPERIMENT_BASE_CATALOG_SELF_HREF = \ - "https://esa-earthcode.github.io/open-science-catalog-metadata/experiments/catalog.json" -WORKFLOW_BASE_CATALOG_SELF_HREF = \ - ("https://esa-earthcode.github.io/open-science-catalog-metadata/workflows/catalog" - ".json") +EXPERIMENT_BASE_CATALOG_SELF_HREF = 
"https://esa-earthcode.github.io/open-science-catalog-metadata/experiments/catalog.json" +WORKFLOW_BASE_CATALOG_SELF_HREF = ( + "https://esa-earthcode.github.io/open-science-catalog-metadata/workflows/catalog" + ".json" +) PROJECT_COLLECTION_NAME = "deep-earth-system-data-lab" diff --git a/deep_code/tools/publish.py b/deep_code/tools/publish.py index c17d56f..7311582 100644 --- a/deep_code/tools/publish.py +++ b/deep_code/tools/publish.py @@ -278,7 +278,9 @@ def _update_base_catalog( Updated Catalog object. """ # Load the base catalog - base_catalog = Catalog.from_file(Path(self.gh_publisher.github_automation.local_clone_dir) / catalog_path) + base_catalog = Catalog.from_file( + Path(self.gh_publisher.github_automation.local_clone_dir) / catalog_path + ) # Add a link to the new item base_catalog.add_link( diff --git a/deep_code/utils/ogc_api_record.py b/deep_code/utils/ogc_api_record.py index 3958431..42e806e 100644 --- a/deep_code/utils/ogc_api_record.py +++ b/deep_code/utils/ogc_api_record.py @@ -3,8 +3,11 @@ from xrlint.util.constructible import MappingConstructible from xrlint.util.serializable import JsonSerializable, JsonValue -from deep_code.constants import BASE_URL_OSC, OGC_API_RECORD_SPEC, \ - PROJECT_COLLECTION_NAME +from deep_code.constants import ( + BASE_URL_OSC, + OGC_API_RECORD_SPEC, + PROJECT_COLLECTION_NAME, +) class Contact(MappingConstructible["Contact"], JsonSerializable): @@ -267,4 +270,3 @@ def _generate_static_links(self): "type": "application/json", }, ] - From ac74f99758217705fe84a561e1e3bfaaee8abbc9 Mon Sep 17 00:00:00 2001 From: tejas Date: Fri, 14 Mar 2025 14:33:41 +0100 Subject: [PATCH 26/43] fixed unit test --- deep_code/tests/utils/test_ogc_api_record.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deep_code/tests/utils/test_ogc_api_record.py b/deep_code/tests/utils/test_ogc_api_record.py index bf7ae69..18b2f29 100644 --- a/deep_code/tests/utils/test_ogc_api_record.py +++ 
b/deep_code/tests/utils/test_ogc_api_record.py @@ -99,7 +99,7 @@ def test_record_properties_initialization(self): self.assertEqual(record_properties.title, "Test Workflow") self.assertEqual(record_properties.description, "A test workflow") self.assertEqual(record_properties.jupyter_kernel_info, kernel_info) - self.assertEqual(record_properties.osc_project, "deep-esdl") + self.assertEqual(record_properties.osc_project, "DeepESDL") self.assertEqual(record_properties.osc_workflow, "test-workflow") self.assertEqual(record_properties.keywords, ["test", "workflow"]) self.assertEqual(record_properties.contacts, [contact]) From e7982cbb9c8c6d5bef6dfa475a4d42c1c0198f99 Mon Sep 17 00:00:00 2001 From: tejas Date: Fri, 14 Mar 2025 15:18:05 +0100 Subject: [PATCH 27/43] few more uni tests --- deep_code/tests/tools/test_publish.py | 108 ++++++++++++++++++++++++++ 1 file changed, 108 insertions(+) create mode 100644 deep_code/tests/tools/test_publish.py diff --git a/deep_code/tests/tools/test_publish.py b/deep_code/tests/tools/test_publish.py new file mode 100644 index 0000000..f5a4a51 --- /dev/null +++ b/deep_code/tests/tools/test_publish.py @@ -0,0 +1,108 @@ +import unittest +from unittest.mock import patch, mock_open, MagicMock +import json +import yaml +from pathlib import Path +import tempfile +from pystac import Catalog + +from deep_code.tools.publish import Publisher + + +class TestPublisher(unittest.TestCase): + @patch("fsspec.open") + @patch("deep_code.tools.publish.GitHubPublisher") + def setUp(self, mock_github_publisher, mock_fsspec_open): + # Mock GitHubPublisher to avoid reading .gitaccess + self.mock_github_publisher_instance = MagicMock() + mock_github_publisher.return_value = self.mock_github_publisher_instance + + # Mock dataset and workflow config files + self.dataset_config = {"collection_id": "test-collection", "dataset_id": "test-dataset"} + self.workflow_config = { + "properties": {"title": "Test Workflow"}, + "workflow_id": "test-workflow", + } + + # Mock 
fsspec.open for config files + self.mock_fsspec_open = mock_fsspec_open + self.mock_fsspec_open.side_effect = [ + mock_open(read_data=yaml.dump(self.dataset_config)).return_value, + mock_open(read_data=yaml.dump(self.workflow_config)).return_value, + ] + + # Initialize Publisher + self.publisher = Publisher( + dataset_config_path="test-dataset-config.yaml", + workflow_config_path="test-workflow-config.yaml", + ) + + def test_normalize_name(self): + # Test normal input + self.assertEqual(Publisher._normalize_name("Test Name"), "test-name") + + # Test input with multiple spaces + self.assertEqual(Publisher._normalize_name("Test Name"), "test---name") + + # Test empty input + self.assertIsNone(Publisher._normalize_name("")) + + # Test None input + self.assertIsNone(Publisher._normalize_name(None)) + + def test_write_to_file(self): + # Create a temporary file + with tempfile.NamedTemporaryFile(delete=False) as temp_file: + file_path = temp_file.name + + # Test data + data = {"key": "value"} + + # Call the method + Publisher._write_to_file(file_path, data) + + # Read the file and verify its content + with open(file_path, "r") as f: + content = json.load(f) + self.assertEqual(content, data) + + # Clean up + Path(file_path).unlink() + + def test_update_base_catalog(self): + # Create a mock Catalog + catalog = Catalog(id="test-catalog", description="Test Catalog") + + # Mock file path and item ID + catalog_path = "test-catalog.json" + item_id = "test-item" + self_href = "https://example.com/catalog.json" + + self.publisher.workflow_title = "Test Workflow" + + # Mock the Catalog.from_file method + with patch("pystac.Catalog.from_file", return_value=catalog): + updated_catalog = self.publisher._update_base_catalog(catalog_path, + item_id, self_href) + + # Assertions + self.assertEqual(updated_catalog.get_self_href(), self_href) + self.assertIsInstance(updated_catalog, Catalog) + + def test_read_config_files(self): + # Mock dataset and workflow config files + dataset_config = 
{"collection_id": "test-collection", "dataset_id": "test-dataset"} + workflow_config = { + "properties": {"title": "Test Workflow"}, + "workflow_id": "test-workflow", + } + + # Mock fsspec.open for config files + self.mock_fsspec_open.side_effect = [ + mock_open(read_data=yaml.dump(dataset_config)).return_value, + mock_open(read_data=yaml.dump(workflow_config)).return_value, + ] + + # Assertions + self.assertEqual(self.publisher.dataset_config, dataset_config) + self.assertEqual(self.publisher.workflow_config, workflow_config) From f8a940c364bd46acdef5ddcde7cff7c9a11cbae1 Mon Sep 17 00:00:00 2001 From: tejas Date: Fri, 14 Mar 2025 15:21:33 +0100 Subject: [PATCH 28/43] code formated --- deep_code/tests/tools/test_publish.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/deep_code/tests/tools/test_publish.py b/deep_code/tests/tools/test_publish.py index f5a4a51..3bd532f 100644 --- a/deep_code/tests/tools/test_publish.py +++ b/deep_code/tests/tools/test_publish.py @@ -18,7 +18,10 @@ def setUp(self, mock_github_publisher, mock_fsspec_open): mock_github_publisher.return_value = self.mock_github_publisher_instance # Mock dataset and workflow config files - self.dataset_config = {"collection_id": "test-collection", "dataset_id": "test-dataset"} + self.dataset_config = { + "collection_id": "test-collection", + "dataset_id": "test-dataset", + } self.workflow_config = { "properties": {"title": "Test Workflow"}, "workflow_id": "test-workflow", @@ -82,8 +85,9 @@ def test_update_base_catalog(self): # Mock the Catalog.from_file method with patch("pystac.Catalog.from_file", return_value=catalog): - updated_catalog = self.publisher._update_base_catalog(catalog_path, - item_id, self_href) + updated_catalog = self.publisher._update_base_catalog( + catalog_path, item_id, self_href + ) # Assertions self.assertEqual(updated_catalog.get_self_href(), self_href) @@ -91,7 +95,10 @@ def test_update_base_catalog(self): def test_read_config_files(self): # 
Mock dataset and workflow config files - dataset_config = {"collection_id": "test-collection", "dataset_id": "test-dataset"} + dataset_config = { + "collection_id": "test-collection", + "dataset_id": "test-dataset", + } workflow_config = { "properties": {"title": "Test Workflow"}, "workflow_id": "test-workflow", From ee3e9f81a57a25b2cc6425c2fe22453d96bc7f85 Mon Sep 17 00:00:00 2001 From: tejas Date: Fri, 14 Mar 2025 15:30:33 +0100 Subject: [PATCH 29/43] clean up --- deep_code/tests/tools/test_publish.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/deep_code/tests/tools/test_publish.py b/deep_code/tests/tools/test_publish.py index 3bd532f..f355cb5 100644 --- a/deep_code/tests/tools/test_publish.py +++ b/deep_code/tests/tools/test_publish.py @@ -41,16 +41,9 @@ def setUp(self, mock_github_publisher, mock_fsspec_open): ) def test_normalize_name(self): - # Test normal input self.assertEqual(Publisher._normalize_name("Test Name"), "test-name") - - # Test input with multiple spaces self.assertEqual(Publisher._normalize_name("Test Name"), "test---name") - - # Test empty input self.assertIsNone(Publisher._normalize_name("")) - - # Test None input self.assertIsNone(Publisher._normalize_name(None)) def test_write_to_file(self): From b46528d1c64f361a7325ed204660c463428472e9 Mon Sep 17 00:00:00 2001 From: tejas Date: Mon, 17 Mar 2025 11:41:59 +0100 Subject: [PATCH 30/43] update README.md --- README.md | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 02f2eed..a8a840b 100644 --- a/README.md +++ b/README.md @@ -59,13 +59,13 @@ Use the --help option with these subcommands to get more details on usage. The CLI retrieves the Git username and personal access token from a hidden file named .gitaccess. Ensure this file is located in the same directory where you execute the CLI command. 
-### deep-code publish-product +### deep-code publish -Publish a dataset which is a result of an experiment to the EarthCODE -open-science catalog. +Publish the experiment, workflow and dataset which is a result of an experiment to +the EarthCODE open-science catalog. ```commandline - deep-code publish-dataset /path/to/dataset-config.yaml + deep-code publish /path/to/dataset-config.yaml /path/to/workflow-config.yaml ``` #### .gitaccess example @@ -94,12 +94,6 @@ cf_parameter: dataset-id has to be a valid dataset-id from `deep-esdl-public` s3 or your team bucket. -### deep-code publish-workflow - -Publish a workflow/experiment to the EarthCODE open-science catalog. - -```commandline -deep-code publish-workflow /path/to/workflow-config.yaml ``` #### workflow-config.yaml example From e8a18ecd068c70aa5cc62b3b1f692547962429d2 Mon Sep 17 00:00:00 2001 From: tejas Date: Mon, 17 Mar 2025 14:42:29 +0100 Subject: [PATCH 31/43] updated format string method to generate titles in a standard format --- .../utils/test_dataset_stac_generator.py | 31 +++++++++++++++++++ deep_code/utils/dataset_stac_generator.py | 27 ++++++++++------ 2 files changed, 48 insertions(+), 10 deletions(-) diff --git a/deep_code/tests/utils/test_dataset_stac_generator.py b/deep_code/tests/utils/test_dataset_stac_generator.py index 64285a7..cb1230b 100644 --- a/deep_code/tests/utils/test_dataset_stac_generator.py +++ b/deep_code/tests/utils/test_dataset_stac_generator.py @@ -217,3 +217,34 @@ def test_open_dataset_failure(self, mock_logger, mock_new_data_store): ) self.assertIn("Public store, Authenticated store", str(context.exception)) self.assertEqual(mock_new_data_store.call_count, 2) + +class TestFormatString(unittest.TestCase): + def test_single_word(self): + self.assertEqual(OscDatasetStacGenerator.format_string("temperature"), "Temperature") + self.assertEqual(OscDatasetStacGenerator.format_string("temp"), "Temp") + self.assertEqual(OscDatasetStacGenerator.format_string("hello"), "Hello") + + def 
test_multiple_words_with_spaces(self): + self.assertEqual(OscDatasetStacGenerator.format_string("surface temp"), "Surface Temp") + self.assertEqual(OscDatasetStacGenerator.format_string("this is a test"), "This Is A Test") + + def test_multiple_words_with_underscores(self): + self.assertEqual(OscDatasetStacGenerator.format_string("surface_temp"), "Surface Temp") + self.assertEqual(OscDatasetStacGenerator.format_string("this_is_a_test"), "This Is A Test") + + def test_mixed_spaces_and_underscores(self): + self.assertEqual(OscDatasetStacGenerator.format_string("surface_temp and_more"), "Surface Temp And More") + self.assertEqual(OscDatasetStacGenerator.format_string("mixed_case_with_underscores_and spaces"), "Mixed Case With Underscores And Spaces") + + def test_edge_cases(self): + # Empty string + self.assertEqual(OscDatasetStacGenerator.format_string(""), "") + # Single word with trailing underscore + self.assertEqual(OscDatasetStacGenerator.format_string("temperature_"), "Temperature") + # Single word with leading underscore + self.assertEqual(OscDatasetStacGenerator.format_string("_temp"), "Temp") + # Single word with leading/trailing spaces + self.assertEqual(OscDatasetStacGenerator.format_string(" hello "), "Hello") + # Multiple spaces or underscores + self.assertEqual(OscDatasetStacGenerator.format_string("too___many___underscores"), "Too Many Underscores") + self.assertEqual(OscDatasetStacGenerator.format_string("too many spaces"), "Too Many Spaces") \ No newline at end of file diff --git a/deep_code/utils/dataset_stac_generator.py b/deep_code/utils/dataset_stac_generator.py index c30517d..4a3142d 100644 --- a/deep_code/utils/dataset_stac_generator.py +++ b/deep_code/utils/dataset_stac_generator.py @@ -175,7 +175,10 @@ def _get_temporal_extent(self) -> TemporalExtent: @staticmethod def _normalize_name(name: str | None) -> str | None: - return name.replace(" ", "-").lower() if name else None + if name: + return (name.replace(" ", "-"). 
+ replace("_", "-").lower()) + return None def _get_general_metadata(self) -> dict: return { @@ -200,8 +203,10 @@ def extract_metadata_for_variable(self, variable_data) -> dict: def get_variable_ids(self) -> list[str]: """Get variable IDs for all variables in the dataset.""" variable_ids = list(self.variables_metadata.keys()) - # Remove 'crs' and 'spatial_ref' from the list if they exist - return [var_id for var_id in variable_ids if var_id not in ["crs", "spatial_ref"]] + # Remove 'crs' and 'spatial_ref' from the list if they exist, note that + # spatial_ref will be normalized to spatial-ref in variable_ids and skipped. + return [var_id for var_id in variable_ids if var_id not in ["crs", + "spatial-ref"]] def get_variables_metadata(self) -> dict[str, dict]: """Extract metadata for all variables in the dataset.""" @@ -263,7 +268,7 @@ def build_variable_catalog(self, var_metadata) -> Catalog: var_catalog = Catalog( id=var_id, description=var_metadata.get("description"), - title=var_id, + title=self.format_string(var_id), stac_extensions=[ "https://stac-extensions.github.io/themes/v1.0.0/schema.json" ], @@ -335,8 +340,7 @@ def update_product_base_catalog(self, product_catalog_path) -> Catalog: product_base_catalog.set_self_href(PRODUCT_BASE_CATALOG_SELF_HREF) return product_base_catalog - @staticmethod - def update_variable_base_catalog(variable_base_catalog_path, variable_ids) \ + def update_variable_base_catalog(self, variable_base_catalog_path, variable_ids) \ -> ( Catalog): """Link product to base product catalog""" @@ -347,7 +351,7 @@ def update_variable_base_catalog(variable_base_catalog_path, variable_ids) \ rel="child", target=f"./{var_id}/catalog.json", media_type="application/json", - title=var_id, + title=self.format_string(var_id), ) ) # 'self' link: the direct URL where this JSON is hosted @@ -414,8 +418,11 @@ def update_existing_variable_catalog(self, var_file_path, var_id) -> Catalog: return existing_catalog @staticmethod - def format_string(s): - 
return s.capitalize() + def format_string(s: str) -> str: + # Strip leading/trailing spaces/underscores and replace underscores with spaces + words = s.strip(" _").replace("_", " ").replace("-", " ").split() + # Capitalize each word and join them with a space + return " ".join(word.capitalize() for word in words) @staticmethod def build_theme(osc_themes: list[str]) -> Theme: @@ -496,7 +503,7 @@ def build_dataset_stac_collection(self) -> Collection: rel="related", target=f"../../variables/{var}/catalog.json", media_type="application/json", - title="Variable: " + var, + title="Variable: " + self.format_string(var), ) ) From 9bfa3f3ea68be759bacc39a9aa49cb3e028c02d3 Mon Sep 17 00:00:00 2001 From: tejas Date: Mon, 17 Mar 2025 14:43:24 +0100 Subject: [PATCH 32/43] black reformatted --- .../utils/test_dataset_stac_generator.py | 47 +++++++++++++++---- 1 file changed, 37 insertions(+), 10 deletions(-) diff --git a/deep_code/tests/utils/test_dataset_stac_generator.py b/deep_code/tests/utils/test_dataset_stac_generator.py index cb1230b..e8864b1 100644 --- a/deep_code/tests/utils/test_dataset_stac_generator.py +++ b/deep_code/tests/utils/test_dataset_stac_generator.py @@ -218,33 +218,60 @@ def test_open_dataset_failure(self, mock_logger, mock_new_data_store): self.assertIn("Public store, Authenticated store", str(context.exception)) self.assertEqual(mock_new_data_store.call_count, 2) + class TestFormatString(unittest.TestCase): def test_single_word(self): - self.assertEqual(OscDatasetStacGenerator.format_string("temperature"), "Temperature") + self.assertEqual( + OscDatasetStacGenerator.format_string("temperature"), "Temperature" + ) self.assertEqual(OscDatasetStacGenerator.format_string("temp"), "Temp") self.assertEqual(OscDatasetStacGenerator.format_string("hello"), "Hello") def test_multiple_words_with_spaces(self): - self.assertEqual(OscDatasetStacGenerator.format_string("surface temp"), "Surface Temp") - self.assertEqual(OscDatasetStacGenerator.format_string("this is a 
test"), "This Is A Test") + self.assertEqual( + OscDatasetStacGenerator.format_string("surface temp"), "Surface Temp" + ) + self.assertEqual( + OscDatasetStacGenerator.format_string("this is a test"), "This Is A Test" + ) def test_multiple_words_with_underscores(self): - self.assertEqual(OscDatasetStacGenerator.format_string("surface_temp"), "Surface Temp") - self.assertEqual(OscDatasetStacGenerator.format_string("this_is_a_test"), "This Is A Test") + self.assertEqual( + OscDatasetStacGenerator.format_string("surface_temp"), "Surface Temp" + ) + self.assertEqual( + OscDatasetStacGenerator.format_string("this_is_a_test"), "This Is A Test" + ) def test_mixed_spaces_and_underscores(self): - self.assertEqual(OscDatasetStacGenerator.format_string("surface_temp and_more"), "Surface Temp And More") - self.assertEqual(OscDatasetStacGenerator.format_string("mixed_case_with_underscores_and spaces"), "Mixed Case With Underscores And Spaces") + self.assertEqual( + OscDatasetStacGenerator.format_string("surface_temp and_more"), + "Surface Temp And More", + ) + self.assertEqual( + OscDatasetStacGenerator.format_string( + "mixed_case_with_underscores_and spaces" + ), + "Mixed Case With Underscores And Spaces", + ) def test_edge_cases(self): # Empty string self.assertEqual(OscDatasetStacGenerator.format_string(""), "") # Single word with trailing underscore - self.assertEqual(OscDatasetStacGenerator.format_string("temperature_"), "Temperature") + self.assertEqual( + OscDatasetStacGenerator.format_string("temperature_"), "Temperature" + ) # Single word with leading underscore self.assertEqual(OscDatasetStacGenerator.format_string("_temp"), "Temp") # Single word with leading/trailing spaces self.assertEqual(OscDatasetStacGenerator.format_string(" hello "), "Hello") # Multiple spaces or underscores - self.assertEqual(OscDatasetStacGenerator.format_string("too___many___underscores"), "Too Many Underscores") - self.assertEqual(OscDatasetStacGenerator.format_string("too many spaces"), 
"Too Many Spaces") \ No newline at end of file + self.assertEqual( + OscDatasetStacGenerator.format_string("too___many___underscores"), + "Too Many Underscores", + ) + self.assertEqual( + OscDatasetStacGenerator.format_string("too many spaces"), + "Too Many Spaces", + ) From c8916e1def4437d6bc896b058af91b14233e0217 Mon Sep 17 00:00:00 2001 From: tejas Date: Tue, 18 Mar 2025 16:12:53 +0100 Subject: [PATCH 33/43] update README.md --- README.md | 47 +++++++++++++++++++---------------------------- 1 file changed, 19 insertions(+), 28 deletions(-) diff --git a/README.md b/README.md index a8a840b..4c7b856 100644 --- a/README.md +++ b/README.md @@ -56,25 +56,26 @@ pytest --cov-report html --cov=deep-code providing different utility functions. Use the --help option with these subcommands to get more details on usage. -The CLI retrieves the Git username and personal access token from a hidden file named .gitaccess. Ensure this file is located in the same directory where you execute the CLI +The CLI retrieves the Git username and personal access token from a hidden file named +.gitaccess. Ensure this file is located in the same directory where you execute the CLI command. +#### .gitaccess example + +``` +github-username: your-git-user +github-token: personal access token +``` + ### deep-code publish -Publish the experiment, workflow and dataset which is a result of an experiment to +Publish the experiment, workflow and dataset which is a result of an experiment to the EarthCODE open-science catalog. ```commandline deep-code publish /path/to/dataset-config.yaml /path/to/workflow-config.yaml ``` -#### .gitaccess example - -``` -github-username: your-git-user -github-token: personal access token -``` - #### dataset-config.yaml example ``` @@ -92,36 +93,26 @@ cf_parameter: - name: hydrology ``` -dataset-id has to be a valid dataset-id from `deep-esdl-public` s3 or your team bucket. +dataset-id has to be a valid dataset-id from `deep-esdl-public` s3 bucket or your team +bucket. 
- ``` #### workflow-config.yaml example ``` -workflow_id: "4D Med hydrology cube generation" +workflow_id: "esa-cci-permafrost" properties: - title: "Hydrology cube generation recipe" - description: "4D Med cube generation" + title: "ESA CCI permafrost" + description: "cube generation workflow for esa-cci-permafrost" keywords: - Earth Science themes: - - Atmosphere - - Ocean - - Evaporation + - cryosphere license: proprietary jupyter_kernel_info: - name: deepesdl-xcube-1.7.1 + name: deepesdl-xcube-1.8.3 python_version: 3.11 - env_file: https://git/env.yml -links: - - rel: "documentation" - type: "application/json" - title: "4DMed Hydrology Cube Generation Recipe" - href: "https://github.com/deepesdl/cube-gen/tree/main/hydrology/README.md" - - rel: "jupyter-notebook" - type: "application/json" - title: "Workflow Jupyter Notebook" - href: "https://github.com/deepesdl/cube-gen/blob/main/hydrology/notebooks/reading_hydrology.ipynb" + env_file: "https://github.com/deepesdl/cube-gen/blob/main/Permafrost/environment.yml" +jupyter_notebook_url: "https://github.com/deepesdl/cube-gen/blob/main/Permafrost/Create-CCI-Permafrost-cube-EarthCODE.ipynb" contact: - name: Tejas Morbagal Harish organization: Brockmann Consult GmbH From 118e34e34b7179caec767ceffd64222b498edd80 Mon Sep 17 00:00:00 2001 From: Tejas Morbagal Harish Date: Tue, 18 Mar 2025 16:14:21 +0100 Subject: [PATCH 34/43] Update deep_code/utils/ogc_api_record.py Co-authored-by: Thomas Storm --- deep_code/utils/ogc_api_record.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deep_code/utils/ogc_api_record.py b/deep_code/utils/ogc_api_record.py index 42e806e..4e2a9b2 100644 --- a/deep_code/utils/ogc_api_record.py +++ b/deep_code/utils/ogc_api_record.py @@ -100,7 +100,7 @@ def format_string(s): def build_them_links_for_records(self): for theme in self.themes: - formated_theme = self.format_string(theme) + formatted_theme = self.format_string(theme) link = { "rel": "related", "href": 
f"../../themes/{theme}/catalog.json", From 7bfe1897fecd881ae481a973c3bdbe0a5ba1577e Mon Sep 17 00:00:00 2001 From: Tejas Morbagal Harish Date: Tue, 18 Mar 2025 16:15:03 +0100 Subject: [PATCH 35/43] Update deep_code/utils/ogc_api_record.py Co-authored-by: Thomas Storm --- deep_code/utils/ogc_api_record.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deep_code/utils/ogc_api_record.py b/deep_code/utils/ogc_api_record.py index 4e2a9b2..e35f788 100644 --- a/deep_code/utils/ogc_api_record.py +++ b/deep_code/utils/ogc_api_record.py @@ -105,7 +105,7 @@ def build_them_links_for_records(self): "rel": "related", "href": f"../../themes/{theme}/catalog.json", "type": "application/json", - "title": f"Theme: {formated_theme}", + "title": f"Theme: {formatted_theme}", } self.theme_links.append(link) return self.theme_links From fdb350d44d623017b0cfb0104b56718142419bb9 Mon Sep 17 00:00:00 2001 From: tejas Date: Tue, 18 Mar 2025 16:27:56 +0100 Subject: [PATCH 36/43] addressed suggestions to PR #6 --- deep_code/tests/utils/test_ogc_api_record.py | 2 +- deep_code/tools/publish.py | 3 +-- deep_code/utils/ogc_api_record.py | 10 +++------- 3 files changed, 5 insertions(+), 10 deletions(-) diff --git a/deep_code/tests/utils/test_ogc_api_record.py b/deep_code/tests/utils/test_ogc_api_record.py index 18b2f29..236ed4c 100644 --- a/deep_code/tests/utils/test_ogc_api_record.py +++ b/deep_code/tests/utils/test_ogc_api_record.py @@ -137,7 +137,7 @@ def test_record_properties_to_dict(self): class TestLinksBuilder(unittest.TestCase): def test_build_theme_links_for_records(self): links_builder = LinksBuilder(themes=["climate", "ocean"]) - theme_links = links_builder.build_them_links_for_records() + theme_links = links_builder.build_theme_links_for_records() expected_links = [ { diff --git a/deep_code/tools/publish.py b/deep_code/tools/publish.py index 7311582..35760ff 100644 --- a/deep_code/tools/publish.py +++ b/deep_code/tools/publish.py @@ -100,7 +100,6 @@ def publish_files( 
class Publisher: """Publishes products (datasets) to the OSC GitHub repository. - Inherits from BasePublisher for GitHub publishing logic. """ def __init__(self, dataset_config_path: str, workflow_config_path: str): @@ -317,7 +316,7 @@ def publish_workflow_experiment(self, write_to_file: bool = False): exp_record_properties = copy.deepcopy(wf_record_properties) link_builder = LinksBuilder(osc_themes) - theme_links = link_builder.build_them_links_for_records() + theme_links = link_builder.build_theme_links_for_records() workflow_record = WorkflowAsOgcRecord( id=workflow_id, diff --git a/deep_code/utils/ogc_api_record.py b/deep_code/utils/ogc_api_record.py index e35f788..fd417ca 100644 --- a/deep_code/utils/ogc_api_record.py +++ b/deep_code/utils/ogc_api_record.py @@ -82,7 +82,7 @@ def to_dict(self, value_name: str | None = None) -> dict[str, JsonValue]: data = super().to_dict(value_name) if self.osc_workflow is not None: data["osc:workflow"] = self.osc_workflow - del data["osc_workflow"] # Remove the original key as it be renamed it + del data["osc_workflow"] # Remove the original key as it has been renamed if self.osc_project is not None: data["osc:project"] = self.osc_project del data["osc_project"] @@ -94,13 +94,9 @@ def __init__(self, themes: list[str]): self.themes = themes self.theme_links = [] - @staticmethod - def format_string(s): - return s.capitalize() - - def build_them_links_for_records(self): + def build_theme_links_for_records(self): for theme in self.themes: - formatted_theme = self.format_string(theme) + formatted_theme = theme.capitalize() link = { "rel": "related", "href": f"../../themes/{theme}/catalog.json", From 556a8734bb1af138560dc7d2e54e5507488fc570 Mon Sep 17 00:00:00 2001 From: tejas Date: Tue, 18 Mar 2025 16:29:42 +0100 Subject: [PATCH 37/43] renamed clone_repository method to clone_sync_repository to better match the functionality --- deep_code/tests/utils/test_github_automation.py | 4 ++-- deep_code/tools/publish.py | 2 +- 
deep_code/utils/github_automation.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/deep_code/tests/utils/test_github_automation.py b/deep_code/tests/utils/test_github_automation.py index 2456f0d..efa284c 100644 --- a/deep_code/tests/utils/test_github_automation.py +++ b/deep_code/tests/utils/test_github_automation.py @@ -44,7 +44,7 @@ def test_clone_repository_new(self, mock_run): # Mock os.path.exists to return False (directory does not exist) with patch("os.path.exists", return_value=False): - self.github_automation.clone_repository() + self.github_automation.clone_sync_repository() # Assertions mock_run.assert_called_once_with( @@ -65,7 +65,7 @@ def test_clone_repository_existing(self, mock_run): # Mock os.path.exists to return True (directory exists) with patch("os.path.exists", return_value=True): with patch("os.chdir"): - self.github_automation.clone_repository() + self.github_automation.clone_sync_repository() # Assertions mock_run.assert_called_once_with(["git", "pull"], check=True) diff --git a/deep_code/tools/publish.py b/deep_code/tools/publish.py index 35760ff..c17264f 100644 --- a/deep_code/tools/publish.py +++ b/deep_code/tools/publish.py @@ -53,7 +53,7 @@ def __init__(self): self.github_username, self.github_token, OSC_REPO_OWNER, OSC_REPO_NAME ) self.github_automation.fork_repository() - self.github_automation.clone_repository() + self.github_automation.clone_sync_repository() def publish_files( self, diff --git a/deep_code/utils/github_automation.py b/deep_code/utils/github_automation.py index 244bddd..8b4a133 100644 --- a/deep_code/utils/github_automation.py +++ b/deep_code/utils/github_automation.py @@ -45,7 +45,7 @@ def fork_repository(self): response.raise_for_status() logging.info(f"Repository forked to {self.username}/{self.repo_name}") - def clone_repository(self): + def clone_sync_repository(self): """Clone the forked repository locally if it doesn't exist, or pull updates if it does.""" logging.info("Checking local 
repository...") if not os.path.exists(self.local_clone_dir): From 46489751144d74023363d754b872f0b7d441c0f3 Mon Sep 17 00:00:00 2001 From: tejas Date: Tue, 18 Mar 2025 16:31:23 +0100 Subject: [PATCH 38/43] removed extra non-mandatory comments --- deep_code/utils/ogc_record_generator.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/deep_code/utils/ogc_record_generator.py b/deep_code/utils/ogc_record_generator.py index 4dc0bec..8b68c39 100644 --- a/deep_code/utils/ogc_record_generator.py +++ b/deep_code/utils/ogc_record_generator.py @@ -55,18 +55,15 @@ def build_record_properties( properties.update({"created": now_iso}) properties.update({"updated": now_iso}) - # Extract themes from the properties dictionary themes_list = properties.get("themes", []) - # Build contact objects properties.update({"contacts": self.build_contact_objects(contacts)}) - # Build theme object if themes are present if themes_list: theme_obj = self.build_theme(themes_list) properties.update( {"themes": [theme_obj]} - ) # Wrap the Theme object in a list + ) properties.setdefault("type", "workflow") properties.setdefault("osc_project", "deep-earth-system-data-lab") From 31a7763518f82cec2088fd8bbdd8c36d26a95eea Mon Sep 17 00:00:00 2001 From: tejas Date: Tue, 18 Mar 2025 16:43:44 +0100 Subject: [PATCH 39/43] clean up --- deep_code/utils/dataset_stac_generator.py | 4 ++-- deep_code/utils/github_automation.py | 10 +--------- deep_code/utils/helper.py | 4 ++-- deep_code/utils/ogc_record_generator.py | 4 +--- 4 files changed, 6 insertions(+), 16 deletions(-) diff --git a/deep_code/utils/dataset_stac_generator.py b/deep_code/utils/dataset_stac_generator.py index 4a3142d..9b6b703 100644 --- a/deep_code/utils/dataset_stac_generator.py +++ b/deep_code/utils/dataset_stac_generator.py @@ -519,13 +519,13 @@ def build_dataset_stac_collection(self) -> Collection: collection.extra_fields["themes"] = [theme_obj] for theme in self.osc_themes: - formated_theme = self.format_string(theme) + 
formatted_theme = self.format_string(theme) collection.add_link( Link( rel="related", target=f"../../themes/{theme}/catalog.json", media_type="application/json", - title=f"Theme: {formated_theme}", + title=f"Theme: {formatted_theme}", ) ) diff --git a/deep_code/utils/github_automation.py b/deep_code/utils/github_automation.py index 8b4a133..bdbae38 100644 --- a/deep_code/utils/github_automation.py +++ b/deep_code/utils/github_automation.py @@ -49,7 +49,6 @@ def clone_sync_repository(self): """Clone the forked repository locally if it doesn't exist, or pull updates if it does.""" logging.info("Checking local repository...") if not os.path.exists(self.local_clone_dir): - # Directory doesn't exist, clone the repository logging.info("Cloning forked repository...") try: subprocess.run( @@ -60,7 +59,6 @@ def clone_sync_repository(self): except subprocess.CalledProcessError as e: raise RuntimeError(f"Failed to clone repository: {e}") else: - # Directory exists, pull the latest changes logging.info("Local repository already exists. 
Pulling latest changes...") try: os.chdir(self.local_clone_dir) @@ -81,26 +79,20 @@ def create_branch(self, branch_name: str): def add_file(self, file_path: str, content): """Add a new file to the local repository.""" logging.info(f"Adding new file: {file_path}...") - os.chdir(self.local_clone_dir) # Ensure we are in the Git repository + os.chdir(self.local_clone_dir) full_path = Path(self.local_clone_dir) / file_path full_path.parent.mkdir(parents=True, exist_ok=True) - # Ensure content is serializable if hasattr(content, "to_dict"): content = content.to_dict() if not isinstance(content, (dict, list, str, int, float, bool, type(None))): raise TypeError(f"Cannot serialize content of type {type(content)}") - - # Serialize to JSON try: json_content = json.dumps(content, indent=2, default=serialize) except TypeError as e: raise RuntimeError(f"JSON serialization failed: {e}") - with open(full_path, "w") as f: f.write(json_content) - - # Git add the file try: subprocess.run(["git", "add", str(full_path)], check=True) except subprocess.CalledProcessError as e: diff --git a/deep_code/utils/helper.py b/deep_code/utils/helper.py index 5ff268f..cca6b75 100644 --- a/deep_code/utils/helper.py +++ b/deep_code/utils/helper.py @@ -8,7 +8,7 @@ def serialize(obj): TypeError: If the object cannot be serialized. 
""" if isinstance(obj, set): - return list(obj) # Convert sets to lists + return list(obj) if hasattr(obj, "__dict__"): - return obj.__dict__ # Convert objects with attributes to dicts + return obj.__dict__ raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable") diff --git a/deep_code/utils/ogc_record_generator.py b/deep_code/utils/ogc_record_generator.py index 8b68c39..83bd08a 100644 --- a/deep_code/utils/ogc_record_generator.py +++ b/deep_code/utils/ogc_record_generator.py @@ -61,9 +61,7 @@ def build_record_properties( if themes_list: theme_obj = self.build_theme(themes_list) - properties.update( - {"themes": [theme_obj]} - ) + properties.update({"themes": [theme_obj]}) properties.setdefault("type", "workflow") properties.setdefault("osc_project", "deep-earth-system-data-lab") From 6b507206e487a93782fa139b6294a6bfe43a22fe Mon Sep 17 00:00:00 2001 From: tejas Date: Wed, 19 Mar 2025 13:12:41 +0100 Subject: [PATCH 40/43] handle unicodes by setting ensure_ascii=False in json.dumps --- deep_code/utils/github_automation.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/deep_code/utils/github_automation.py b/deep_code/utils/github_automation.py index bdbae38..8090069 100644 --- a/deep_code/utils/github_automation.py +++ b/deep_code/utils/github_automation.py @@ -88,10 +88,12 @@ def add_file(self, file_path: str, content): if not isinstance(content, (dict, list, str, int, float, bool, type(None))): raise TypeError(f"Cannot serialize content of type {type(content)}") try: - json_content = json.dumps(content, indent=2, default=serialize) + json_content = json.dumps( + content, indent=2, ensure_ascii=False, default=serialize + ) except TypeError as e: raise RuntimeError(f"JSON serialization failed: {e}") - with open(full_path, "w") as f: + with open(full_path, "w", encoding="utf-8") as f: f.write(json_content) try: subprocess.run(["git", "add", str(full_path)], check=True) From e1c9a08958f70624531bb4166254ea26c1abd729 Mon 
Sep 17 00:00:00 2001 From: tejas Date: Wed, 19 Mar 2025 13:19:11 +0100 Subject: [PATCH 41/43] prepare for release 0.1.0 --- deep_code/constants.py | 2 +- deep_code/version.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/deep_code/constants.py b/deep_code/constants.py index 814a03f..ea3b762 100644 --- a/deep_code/constants.py +++ b/deep_code/constants.py @@ -9,7 +9,7 @@ THEMES_SCHEMA_URI = "https://stac-extensions.github.io/themes/v1.0.0/schema.json" OSC_THEME_SCHEME = "https://github.com/stac-extensions/osc#theme" OSC_REPO_OWNER = "ESA-EarthCODE" -OSC_REPO_NAME = "open-science-catalog-metadata-testing" +OSC_REPO_NAME = "open-science-catalog-metadata" OSC_BRANCH_NAME = "add-new-collection" DEFAULT_THEME_SCHEME = ( "https://gcmd.earthdata.nasa.gov/kms/concepts/concept_scheme/sciencekeywords" diff --git a/deep_code/version.py b/deep_code/version.py index 30dc845..dfce29b 100644 --- a/deep_code/version.py +++ b/deep_code/version.py @@ -19,4 +19,4 @@ # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER # DEALINGS IN THE SOFTWARE. -version = "0.0.1.dev1" +version = "0.0.1" From e2f7f6124a17c49e22411c77a87dff980228ecda Mon Sep 17 00:00:00 2001 From: tejas Date: Wed, 19 Mar 2025 13:24:25 +0100 Subject: [PATCH 42/43] prepare for release 0.1.0 --- CHANGES.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index e77d001..88e16dc 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,3 +1,5 @@ -## Changes in 0.1.0 (in development) +## Changes in 0.1.0 -Initial version of deep-code. \ No newline at end of file +- Initial version of deep-code. +- Implemented the publish feature of DeepESDL experiments/workflow as OGC API record + and Datasets as an OSC stac collection. 
\ No newline at end of file From 002e3608e9c15ff6de112fd3c7363c550f36ad34 Mon Sep 17 00:00:00 2001 From: tejas Date: Wed, 26 Mar 2025 11:14:23 +0100 Subject: [PATCH 43/43] current state --- deep_code/tests/tools/test_new.py | 59 ++++++++ deep_code/tools/new.py | 131 ++++++++++++++++++ deep_code/tools/publish.py | 5 +- deep_code/tools/templates/.gitignore | 14 ++ deep_code/tools/templates/pyproject.toml | 15 ++ .../tools/templates/workflow-template.yml | 31 +++++ deep_code/tools/templates/workflow.ipynb | 47 +++++++ deep_code/utils/helper.py | 8 ++ 8 files changed, 307 insertions(+), 3 deletions(-) create mode 100644 deep_code/tests/tools/test_new.py create mode 100644 deep_code/tools/templates/.gitignore create mode 100644 deep_code/tools/templates/pyproject.toml create mode 100644 deep_code/tools/templates/workflow-template.yml create mode 100644 deep_code/tools/templates/workflow.ipynb diff --git a/deep_code/tests/tools/test_new.py b/deep_code/tests/tools/test_new.py new file mode 100644 index 0000000..2065443 --- /dev/null +++ b/deep_code/tests/tools/test_new.py @@ -0,0 +1,59 @@ +import unittest +import shutil +from pathlib import Path +from unittest.mock import patch, MagicMock + +from deep_code.tools.new import RepositoryInitializer + + +class TestRepositoryInitializer(unittest.TestCase): + def setUp(self): + """Set up a temporary directory for testing.""" + self.test_repo_name = "test_repo" + self.test_repo_dir = Path(self.test_repo_name).absolute() + self.templates_dir = Path(__file__).parent.parent / "deep_code" / "templates" + + # Mock GitHub credentials + self.github_username = "test_user" + self.github_token = "test_token" + + # Ensure the repository directory does not exist before each test + if self.test_repo_dir.exists(): + shutil.rmtree(self.test_repo_dir) + + def tearDown(self): + """Clean up the temporary directory after each test.""" + if self.test_repo_dir.exists(): + shutil.rmtree(self.test_repo_dir) + + 
@patch("deep_code.tools.new.read_git_access_file") + def test_missing_github_credentials(self, mock_read_git_access_file): + """Test that an error is raised if GitHub credentials are missing.""" + # Mock the Git access file with missing credentials + mock_read_git_access_file.return_value = {} + + # Verify that an error is raised + with self.assertRaises(ValueError) as context: + RepositoryInitializer(self.test_repo_name) + self.assertIn("GitHub credentials are missing", str(context.exception)) + + @patch("deep_code.tools.new.read_git_access_file") + def test_template_not_found(self, mock_read_git_access_file): + """Test that an error is raised if a template file is missing.""" + # Mock the Git access file + mock_read_git_access_file.return_value = { + "github-username": self.github_username, + "github-token": self.github_token, + } + + # Initialize the repository with a non-existent template + initializer = RepositoryInitializer(self.test_repo_name) + initializer.templates_dir = Path("/non/existent/path") + + # Verify that an error is raised + with self.assertRaises(FileNotFoundError): + initializer.initialize() + + +if __name__ == "__main__": + unittest.main() \ No newline at end of file diff --git a/deep_code/tools/new.py b/deep_code/tools/new.py index f54d0b2..3d5ebe0 100644 --- a/deep_code/tools/new.py +++ b/deep_code/tools/new.py @@ -10,4 +10,135 @@ pyproject.toml), and a template setup for documentation (e.g., using mkdocs), setup of the build pipeline""" +import subprocess +from pathlib import Path +import logging +import requests + +from deep_code.utils.helper import read_git_access_file + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +class RepositoryInitializer: + """ + A utility class to initialize a GitHub repository with configuration files, + a workflow notebook template, and a Python package template for DeepESDL experiment + """ + + def __init__(self, repo_name: str): + """ + Initialize the 
RepositoryInitializer. + """ + self.repo_name = repo_name + self.repo_dir = Path(repo_name).absolute() + self.templates_dir = Path(__file__).parent / "templates" + git_config = read_git_access_file() + self.github_username = git_config.get("github-username") + self.github_token = git_config.get("github-token") + if not self.github_username or not self.github_token: + raise ValueError("GitHub credentials are missing in the `.gitaccess` file.") + + def create_local_repo(self) -> None: + """Create a local directory for the repository and initialize it as a Git repository.""" + logger.info(f"Creating local repository: {self.repo_dir}") + self.repo_dir.mkdir(parents=True, exist_ok=True) + subprocess.run(["git", "init", "--initial-branch=main"], cwd=self.repo_dir, check=True) + + def _load_template(self, template_name: str) -> str: + """Load a template file from the templates directory.""" + template_path = self.templates_dir / template_name + if not template_path.exists(): + raise FileNotFoundError(f"Template not found: {template_path}") + with open(template_path, "r") as file: + return file.read() + + def create_config_files(self) -> None: + """Create configuration files in the repository.""" + logger.info("Creating configuration files...") + + # Create README.md + readme_content = (f"# {self.repo_name}\n\nThis repository contains workflows " + f"and Python code for a DeepESDL Experiment.") + (self.repo_dir / "README.md").write_text(readme_content) + + # Create .gitignore + gitignore_content = self._load_template(".gitignore") + (self.repo_dir / ".gitignore").write_text(gitignore_content) + + def create_jupyter_notebook_template(self) -> None: + """Create a workflow notebook template (workflow.ipynb).""" + logger.info("Creating workflow notebook template...") + workflow_content = self._load_template("workflow.ipynb") + (self.repo_dir / "workflow.ipynb").write_text(workflow_content) + + def create_python_package(self) -> None: + """Create a Python package template with a pyproject.toml file."""
+ logger.info("Creating Python package template...") + + # Create package directory + package_dir = self.repo_dir / self.repo_name.replace("-", "_") + package_dir.mkdir(exist_ok=True) + + # Create __init__.py + (package_dir / "__init__.py").write_text("# Package initialization\n") + + # Create pyproject.toml + pyproject_content = self._load_template("pyproject.toml") + pyproject_content = pyproject_content.replace("{repo_name}", self.repo_name) + (self.repo_dir / "pyproject.toml").write_text(pyproject_content) + + def create_github_repo(self) -> None: + """Create a remote GitHub repository and push the local repository.""" + if not self.github_username or not self.github_token: + logger.warning("GitHub credentials not provided. Skipping remote repository creation.") + return + + logger.info("Creating remote GitHub repository...") + + repo_url = "https://api.github.com/user/repos" + repo_data = { + "name": self.repo_name, + "description": "Repository for DeepESDL workflows and Python code.", + "private": True, + } + headers = { + "Authorization": f"token {self.github_token}", + "Accept": "application/vnd.github.v3+json", + } + response = requests.post(repo_url, json=repo_data, headers=headers, timeout=30) + response.raise_for_status() + + remote_url = f"https://github.com/{self.github_username}/{self.repo_name}.git" + subprocess.run(["git", "remote", "add", "origin", remote_url], cwd=self.repo_dir, check=True) + subprocess.run(["git", "add", "."], cwd=self.repo_dir, check=True) + subprocess.run(["git", "commit", "-m", "Initial commit"], cwd=self.repo_dir, check=True) + subprocess.run(["git", "push", "-u", "origin", "main"], cwd=self.repo_dir, check=True) + + logger.info(f"Remote repository created: {remote_url}") + + def create_github_actions_workflow(self) -> None: + """Create a GitHub Actions workflow for running unit tests.""" + logger.info("Creating GitHub Actions workflow...") + + workflows_dir = self.repo_dir / ".github" / "workflows" + workflows_dir.mkdir(parents=True, exist_ok=True) + +
workflow_content = self._load_template("workflow-template.yml") + (workflows_dir / "unit-tests.yml").write_text(workflow_content) + + def initialize(self) -> None: + """Initialize the repository with all templates and configurations.""" + self.create_local_repo() + self.create_config_files() + self.create_jupyter_notebook_template() + self.create_python_package() + self.create_github_repo() + logger.info(f"Repository '{self.repo_name}' initialized successfully!") + + +if __name__ == '__main__': + r = RepositoryInitializer("deepesdl-test-experiment") + r.initialize() \ No newline at end of file diff --git a/deep_code/tools/publish.py b/deep_code/tools/publish.py index c17264f..74e1769 100644 --- a/deep_code/tools/publish.py +++ b/deep_code/tools/publish.py @@ -22,7 +22,7 @@ ) from deep_code.utils.dataset_stac_generator import OscDatasetStacGenerator from deep_code.utils.github_automation import GitHubAutomation -from deep_code.utils.helper import serialize +from deep_code.utils.helper import serialize, read_git_access_file from deep_code.utils.ogc_api_record import ( ExperimentAsOgcRecord, LinksBuilder, @@ -42,8 +42,7 @@ class GitHubPublisher: """ def __init__(self): - with fsspec.open(".gitaccess", "r") as file: - git_config = yaml.safe_load(file) or {} + git_config = read_git_access_file() self.github_username = git_config.get("github-username") self.github_token = git_config.get("github-token") if not self.github_username or not self.github_token: diff --git a/deep_code/tools/templates/.gitignore b/deep_code/tools/templates/.gitignore new file mode 100644 index 0000000..605f546 --- /dev/null +++ b/deep_code/tools/templates/.gitignore @@ -0,0 +1,14 @@ +# Ignore Python compiled files +__pycache__/ +*.pyc +*.pyo +*.pyd + +# Ignore Jupyter notebook checkpoints +.ipynb_checkpoints/ + +# Ignore virtual environments +venv/ +env/ + +.gitaccess \ No newline at end of file diff --git a/deep_code/tools/templates/pyproject.toml b/deep_code/tools/templates/pyproject.toml new file
mode 100644 index 0000000..83648af --- /dev/null +++ b/deep_code/tools/templates/pyproject.toml @@ -0,0 +1,15 @@ +[build-system] +requires = ['setuptools>=61.2.0', 'wheel', 'build'] +build-backend = 'setuptools.build_meta' + +[project] +name = '{repo_name}' +version = '0.1.0' +description = 'A Python package for DeepESDL workflows.' +authors = [ + {name = 'Your Name', email = 'your.email@example.com'} +] +dependencies = [ + 'numpy', + 'pandas', +] \ No newline at end of file diff --git a/deep_code/tools/templates/workflow-template.yml b/deep_code/tools/templates/workflow-template.yml new file mode 100644 index 0000000..4f0af0a --- /dev/null +++ b/deep_code/tools/templates/workflow-template.yml @@ -0,0 +1,31 @@ +name: Unit Tests + +on: + push: + branches: + - main + pull_request: + branches: + - main + +jobs: + test: + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.12' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install .[dev] + + - name: Run unit tests + run: | + pytest tests/ \ No newline at end of file diff --git a/deep_code/tools/templates/workflow.ipynb b/deep_code/tools/templates/workflow.ipynb new file mode 100644 index 0000000..feb7b8a --- /dev/null +++ b/deep_code/tools/templates/workflow.ipynb @@ -0,0 +1,47 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Workflow Notebook\n\n", + "This notebook provides a template for running DeepESDL workflows.\n", + "Modify this notebook to implement your specific workflow." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Import required libraries\n", + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "# Add your workflow code here\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file diff --git a/deep_code/utils/helper.py b/deep_code/utils/helper.py index cca6b75..35d7f33 100644 --- a/deep_code/utils/helper.py +++ b/deep_code/utils/helper.py @@ -1,3 +1,7 @@ +import fsspec +import yaml + + def serialize(obj): """Convert non-serializable objects to JSON-compatible formats. Args: @@ -12,3 +16,7 @@ def serialize(obj): if hasattr(obj, "__dict__"): return obj.__dict__ raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable") + +def read_git_access_file(): + with fsspec.open(".gitaccess", "r") as file: + return yaml.safe_load(file) or {} \ No newline at end of file