Skip to content

Commit d2591db

Browse files
committed
Implementation works to generate valid OGC API records for experiments and workflows
1 parent 1ab4461 commit d2591db

File tree

5 files changed

+230
-133
lines changed

5 files changed

+230
-133
lines changed

deep_code/cli/publish.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,15 +6,15 @@
66

77
import click
88

9-
from deep_code.tools.publish import DatasetPublisher, WorkflowPublisher
9+
from deep_code.tools.publish import Publisher, WorkflowPublisher
1010

1111

1212
@click.command(name="publish-dataset")
1313
@click.argument("dataset_config", type=click.Path(exists=True))
1414
def publish_dataset(dataset_config):
1515
"""Request publishing a dataset to the open science catalogue.
1616
"""
17-
publisher = DatasetPublisher()
17+
publisher = Publisher()
1818
publisher.publish_dataset(dataset_config_path=dataset_config)
1919

2020

deep_code/tests/tools/test_publish.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
import pytest
44

5-
from deep_code.tools.publish import DatasetPublisher
5+
from deep_code.tools.publish import Publisher
66

77

88
class TestDatasetPublisher:
@@ -15,7 +15,7 @@ def test_init_missing_credentials(self, mock_fsspec_open):
1515
with pytest.raises(
1616
ValueError, match="GitHub credentials are missing in the `.gitaccess` file."
1717
):
18-
DatasetPublisher()
18+
Publisher()
1919

2020
@patch("deep_code.tools.publish.fsspec.open")
2121
def test_publish_dataset_missing_ids(self, mock_fsspec_open):
@@ -31,7 +31,7 @@ def test_publish_dataset_missing_ids(self, mock_fsspec_open):
3131
mock_open(read_data=dataset_yaml_content)(),
3232
]
3333

34-
publisher = DatasetPublisher()
34+
publisher = Publisher()
3535

3636
with pytest.raises(
3737
ValueError, match="Dataset ID or Collection ID missing in the config."
@@ -106,7 +106,7 @@ def test_publish_dataset_success(
106106
)
107107

108108
# Instantiate & publish
109-
publisher = DatasetPublisher()
109+
publisher = Publisher()
110110
publisher.publish_dataset("/fake/path/to/dataset-config.yaml")
111111

112112
# Assert that we called git clone with /tmp/temp_repo

deep_code/tools/publish.py

Lines changed: 102 additions & 88 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
from deep_code.utils.dataset_stac_generator import OscDatasetStacGenerator
2323
from deep_code.utils.github_automation import GitHubAutomation
2424
from deep_code.utils.ogc_api_record import WorkflowAsOgcRecord, \
25-
ExperimentAsOgcRecord
25+
ExperimentAsOgcRecord, LinksBuilder
2626
from deep_code.utils.ogc_record_generator import OSCWorkflowOGCApiRecordGenerator
2727

2828
logger = logging.getLogger(__name__)
@@ -93,67 +93,84 @@ def publish_files(
9393
self.github_automation.clean_up()
9494

9595

96-
class DatasetPublisher:
96+
class Publisher:
9797
"""Publishes products (datasets) to the OSC GitHub repository.
9898
Inherits from BasePublisher for GitHub publishing logic.
9999
"""
100100

101-
def __init__(self):
101+
def __init__(self, dataset_config_path: str, workflow_config_path: str):
102102
# Composition
103103
self.gh_publisher = GitHubPublisher()
104-
105-
@staticmethod
106-
def clean_title(title: str) -> str:
107-
"""Clean up titles by replacing Unicode escape sequences with standard characters."""
108-
title = title.replace('\u00a0',
109-
' ') # Replace non-breaking space with normal space
110-
title = title.replace('\u00b0',
111-
'°') # Replace unicode degree symbol with actual degree symbol
112-
return title
113-
114-
def clean_catalog_titles(self, catalog: Catalog):
115-
"""Recursively clean all titles in the catalog."""
116-
# Clean title for the catalog itself
117-
if isinstance(catalog.title, str):
118-
catalog.title = self.clean_title(catalog.title)
119-
120-
# Clean titles in all links of the catalog
121-
for link in catalog.links:
122-
if isinstance(link.title, str):
123-
link.title = self.clean_title(link.title)
124-
125-
for link in catalog.links:
126-
if link.rel == 'child':
127-
try:
128-
# If the link points to another catalog or collection, clean it recursively
129-
child_catalog = Catalog.from_file(link.href)
130-
self.clean_catalog_titles(child_catalog)
131-
except Exception as e:
132-
# If the link doesn't point to a valid catalog file, skip it
133-
pass
134-
135-
def publish_dataset(self, dataset_config_path: str):
104+
self.collection_id = ""
105+
106+
# Paths to configuration files
107+
self.dataset_config_path = dataset_config_path
108+
self.workflow_config_path = workflow_config_path
109+
110+
# Load configuration files
111+
self._read_config_files()
112+
self.collection_id = self.dataset_config.get("collection_id")
113+
114+
# Ensure collection_id is set
115+
if not self.collection_id:
116+
raise ValueError("collection_id is missing in dataset config.")
117+
118+
# @staticmethod
119+
# def clean_title(title: str) -> str:
120+
# """Clean up titles by replacing Unicode escape sequences with standard characters."""
121+
# title = title.replace('\u00a0',
122+
# ' ') # Replace non-breaking space with normal space
123+
# title = title.replace('\u00b0',
124+
# '°') # Replace unicode degree symbol with actual degree symbol
125+
# return title
126+
127+
# def clean_catalog_titles(self, catalog: Catalog):
128+
# """Recursively clean all titles in the catalog."""
129+
# # Clean title for the catalog itself
130+
# if isinstance(catalog.title, str):
131+
# catalog.title = self.clean_title(catalog.title)
132+
#
133+
# # Clean titles in all links of the catalog
134+
# for link in catalog.links:
135+
# if isinstance(link.title, str):
136+
# link.title = self.clean_title(link.title)
137+
#
138+
# for link in catalog.links:
139+
# if link.rel == 'child':
140+
# try:
141+
# # If the link points to another catalog or collection, clean it recursively
142+
# child_catalog = Catalog.from_file(link.href)
143+
# self.clean_catalog_titles(child_catalog)
144+
# except Exception as e:
145+
# # If the link doesn't point to a valid catalog file, skip it
146+
# pass
147+
148+
def _read_config_files(self) -> None:
149+
with fsspec.open(self.dataset_config_path, "r") as file:
150+
self.dataset_config = yaml.safe_load(file) or {}
151+
with fsspec.open(self.workflow_config_path, "r") as file:
152+
self.workflow_config = yaml.safe_load(file) or {}
153+
154+
def publish_dataset(self):
136155
"""Publish a product collection to the specified GitHub repository."""
137-
with fsspec.open(dataset_config_path, "r") as file:
138-
dataset_config = yaml.safe_load(file) or {}
139-
140-
dataset_id = dataset_config.get("dataset_id")
141-
collection_id = dataset_config.get("collection_id")
142-
documentation_link = dataset_config.get("documentation_link")
143-
access_link = dataset_config.get("access_link")
144-
dataset_status = dataset_config.get("dataset_status")
145-
osc_region = dataset_config.get("osc_region")
146-
osc_themes = dataset_config.get("osc_themes")
147-
cf_params = dataset_config.get("cf_parameter")
148-
149-
if not dataset_id or not collection_id:
156+
157+
dataset_id = self.dataset_config.get("dataset_id")
158+
self.collection_id = self.dataset_config.get("collection_id")
159+
documentation_link = self.dataset_config.get("documentation_link")
160+
access_link = self.dataset_config.get("access_link")
161+
dataset_status = self.dataset_config.get("dataset_status")
162+
osc_region = self.dataset_config.get("osc_region")
163+
osc_themes = self.dataset_config.get("osc_themes")
164+
cf_params = self.dataset_config.get("cf_parameter")
165+
166+
if not dataset_id or not self.collection_id:
150167
raise ValueError("Dataset ID or Collection ID missing in the config.")
151168

152169
logger.info("Generating STAC collection...")
153170

154171
generator = OscDatasetStacGenerator(
155172
dataset_id=dataset_id,
156-
collection_id=collection_id,
173+
collection_id=self.collection_id,
157174
documentation_link=documentation_link,
158175
access_link=access_link,
159176
osc_status=dataset_status,
@@ -167,7 +184,7 @@ def publish_dataset(self, dataset_config_path: str):
167184

168185
# Prepare a dictionary of file paths and content
169186
file_dict = {}
170-
product_path = f"products/{collection_id}/collection.json"
187+
product_path = f"products/{self.collection_id}/collection.json"
171188
file_dict[product_path] = ds_collection.to_dict()
172189

173190
variable_base_catalog_path = f"variables/catalog.json"
@@ -177,9 +194,6 @@ def publish_dataset(self, dataset_config_path: str):
177194
)
178195
# Add or update variable files
179196
for var_id in variable_ids:
180-
# if var_id in ["crs", "spatial_ref"]:
181-
# logger.info(f"Skipping CRS variable: {var_id}")
182-
# continue
183197
var_file_path = f"variables/{var_id}/catalog.json"
184198
if not self.gh_publisher.github_automation.file_exists(var_file_path):
185199
logger.info(
@@ -219,7 +233,7 @@ def publish_dataset(self, dataset_config_path: str):
219233
)
220234
updated_product_base_catalog = generator.update_product_base_catalog(full_path)
221235
# clean special characters
222-
self.clean_catalog_titles(updated_product_base_catalog)
236+
# self.clean_catalog_titles(updated_product_base_catalog)
223237
file_dict[product_catalog_path] = updated_product_base_catalog.to_dict()
224238

225239
#Link product to project catalog
@@ -233,10 +247,12 @@ def publish_dataset(self, dataset_config_path: str):
233247
file_dict[deepesdl_collection_path] = updated_deepesdl_collection.to_dict()
234248

235249
# Create branch name, commit message, PR info
236-
branch_name = f"{OSC_BRANCH_NAME}-{collection_id}-{datetime.now().strftime('%Y%m%d%H%M%S')}"
237-
commit_message = f"Add new dataset collection: {collection_id}"
238-
pr_title = f"Add new dataset collection: {collection_id}"
239-
pr_body = (f"This PR adds a new dataset collection: {collection_id} and it's "
250+
branch_name = (f"{OSC_BRANCH_NAME}-{self.collection_id}"
251+
f"-{datetime.now().strftime('%Y%m%d%H%M%S')}")
252+
commit_message = f"Add new dataset collection: {self.collection_id}"
253+
pr_title = f"Add new dataset collection: {self.collection_id}"
254+
pr_body = (f"This PR adds a new dataset collection: {self.collection_id} and "
255+
f"it's "
240256
f"corresponding variable catalogs to the repository.")
241257

242258
# Publish all files in one go
@@ -251,12 +267,6 @@ def publish_dataset(self, dataset_config_path: str):
251267
logger.info(f"Pull request created: {pr_url}")
252268

253269

254-
class WorkflowPublisher:
255-
"""Publishes workflows to the OSC GitHub repository."""
256-
257-
def __init__(self):
258-
self.gh_publisher = GitHubPublisher()
259-
260270
@staticmethod
261271
def _normalize_name(name: str | None) -> str | None:
262272
return name.replace(" ", "-").lower() if name else None
@@ -277,60 +287,65 @@ def _write_to_file(file_path: str, data: dict):
277287
json.dump(data, file, indent=4)
278288
logger.info(f"File written to {file_path}")
279289

280-
def publish_workflow_experiment(self, workflow_config_path: str, write_to_file: bool = False):
281-
with fsspec.open(workflow_config_path, "r") as file:
282-
workflow_config = yaml.safe_load(file) or {}
283-
284-
workflow_id = self._normalize_name(workflow_config.get("workflow_id"))
290+
def publish_workflow_experiment(self, write_to_file: bool = False):
291+
workflow_id = self._normalize_name(self.workflow_config.get("workflow_id"))
285292
if not workflow_id:
286293
raise ValueError("workflow_id is missing in workflow config.")
287294

288-
properties_list = workflow_config.get("properties", [])
289-
contacts = workflow_config.get("contact", [])
290-
links = workflow_config.get("links", [])
291-
jupyter_notebook_url = workflow_config.get("jupyter_notebook_url")
295+
properties_list = self.workflow_config.get("properties", {})
296+
osc_themes = properties_list.get("themes")
297+
contacts = self.workflow_config.get("contact", [])
298+
links = self.workflow_config.get("links", [])
299+
jupyter_notebook_url = self.workflow_config.get("jupyter_notebook_url")
292300

293301
logger.info("Generating OGC API Record for the workflow...")
294302
rg = OSCWorkflowOGCApiRecordGenerator()
295-
wf_record_properties = rg.build_record_properties(properties_list, contacts,
296-
caller="WorkflowAsOgcRecord")
303+
wf_record_properties = rg.build_record_properties(properties_list, contacts)
304+
305+
link_builder = LinksBuilder(osc_themes)
306+
theme_links = link_builder.build_them_links_for_records()
307+
297308
workflow_record = WorkflowAsOgcRecord(
298309
id=workflow_id,
299310
type="Feature",
300311
properties=wf_record_properties,
301-
links=links,
302-
jupyter_notebook_url=jupyter_notebook_url
312+
links=links + theme_links,
313+
jupyter_notebook_url=jupyter_notebook_url,
314+
themes=osc_themes
303315
)
304316
# Convert to dictionary and remove jupyter_notebook_url
305317
workflow_dict = workflow_record.to_dict()
306318
if "jupyter_notebook_url" in workflow_dict:
307319
del workflow_dict["jupyter_notebook_url"]
308-
309320
wf_file_path = f"workflow/{workflow_id}/record.json"
310321
file_dict = {wf_file_path: workflow_dict}
311322

312323
# Build properties for the experiment record
313324
exp_record_properties = copy.deepcopy(wf_record_properties)
314325
exp_record_properties.type = "experiment"
326+
exp_record_properties.osc_workflow = workflow_id
315327

316328
experiment_record = ExperimentAsOgcRecord(
317329
id=workflow_id,
318330
type="Feature",
331+
jupyter_notebook_url=jupyter_notebook_url,
332+
collection_id=self.collection_id,
319333
properties=exp_record_properties,
320-
links=links,
321-
jupyter_notebook_url=jupyter_notebook_url
334+
links=links + theme_links
322335
)
323336
# Convert to dictionary and remove jupyter_notebook_url
324337
experiment_dict = experiment_record.to_dict()
325338
if "jupyter_notebook_url" in experiment_dict:
326339
del experiment_dict["jupyter_notebook_url"]
340+
if "collection_id" in experiment_dict:
341+
del experiment_dict["collection_id"]
327342
exp_file_path = f"experiments/{workflow_id}/record.json"
328343
file_dict[exp_file_path] = experiment_dict
329344

330345
# Write to files if testing
331346
if write_to_file:
332-
self._write_to_file(wf_file_path, workflow_record.to_dict())
333-
self._write_to_file(exp_file_path, experiment_record.to_dict())
347+
self._write_to_file(wf_file_path, workflow_dict)
348+
self._write_to_file(exp_file_path, experiment_dict)
334349

335350
# Publish to GitHub if not testing
336351
if not write_to_file:
@@ -351,9 +366,8 @@ def publish_workflow_experiment(self, workflow_config_path: str, write_to_file:
351366

352367
if __name__ == '__main__':
353368
# Example usage for testing
354-
publisher = WorkflowPublisher()
355-
publisher.publish_workflow_experiment(
356-
workflow_config_path="/home/tejas/bc/projects/deepesdl/deep-code/workflow"
357-
"-config.yaml",
358-
write_to_file=True
359-
)
369+
publisher = Publisher(dataset_config_path="/home/tejas/bc/projects/deepesdl/deep"
370+
"-code/dataset-config.yaml",
371+
workflow_config_path="/home/tejas/bc/projects/deepesdl/deep-code/workflow"
372+
"-config.yaml")
373+
publisher.publish_workflow_experiment(write_to_file=True)

0 commit comments

Comments
 (0)