From e323a2a9faf942338e1472d70af8fe06fed89cb3 Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Wed, 19 Feb 2025 18:55:27 +0100 Subject: [PATCH 1/5] =?UTF-8?q?=F0=9F=8E=89=20catalog:=20Create=20tables?= =?UTF-8?q?=20method=20to=20export=20excel=20with=20metadata?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit From 07773a1940803930a78945d50a959bc0435e4713 Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Wed, 19 Feb 2025 18:58:49 +0100 Subject: [PATCH 2/5] Create Table method to export excel with a simple metadata sheet --- lib/catalog/owid/catalog/tables.py | 33 ++++++++++++++++++++++++++++++ lib/catalog/owid/catalog/utils.py | 10 +++++++++ 2 files changed, 43 insertions(+) diff --git a/lib/catalog/owid/catalog/tables.py b/lib/catalog/owid/catalog/tables.py index 2351347aabd..06fc8653dd0 100644 --- a/lib/catalog/owid/catalog/tables.py +++ b/lib/catalog/owid/catalog/tables.py @@ -219,6 +219,39 @@ def to_csv(self, path: Optional[Any] = None, **kwargs: Any) -> Union[None, str]: metadata_filename = splitext(path)[0] + ".meta.json" self._save_metadata(metadata_filename) + @property + def codebook(self) -> pd.DataFrame: + """ + Return a codebook for this table. + """ + # Define how to show attributions and URLs in the sources column. + prepare_attributions = lambda attribution, url_main: f"{attribution} ( {url_main} )" + + # Initialize lists to store the codebook information. + columns = [] + titles = [] + descriptions = [] + sources = [] + for column in self.columns: + md = self[column].metadata + columns.append(column) + titles.append(getattr(md.presentation, "title_public", None) or md.title) + # Use short description (after removing details on demand, if any). + descriptions.append(utils.remove_details_on_demand(md.description_short)) + sources.append("; ".join(dict.fromkeys(prepare_attributions(origin.attribution if origin.attribution else origin.producer, origin.url_main) for origin in md.origins))) + + # Create a DataFrame with the codebook. + codebook = pd.DataFrame({'column': columns, 'title': titles, 'description': descriptions, 'sources': sources}) + + return codebook + + def to_excel(self, excel_writer: 'FilePath | WriteExcelBuffer | ExcelWriter', with_metadata=True, sheet_name="data", metadata_sheet_name="metadata", **kwargs: Any) -> None: + # Save data and codebook to an excel file. + with pd.ExcelWriter(excel_writer) as writer: + super().to_excel(writer, sheet_name=sheet_name, **kwargs) + if with_metadata: + self.codebook.to_excel(writer, sheet_name=metadata_sheet_name) + def to_feather( self, path: Any, diff --git a/lib/catalog/owid/catalog/utils.py b/lib/catalog/owid/catalog/utils.py index 30b7fb98e71..f4379638bae 100644 --- a/lib/catalog/owid/catalog/utils.py +++ b/lib/catalog/owid/catalog/utils.py @@ -340,3 +340,13 @@ def dataclass_from_dict(cls: Optional[Type[T]], d: Dict[str, Any]) -> T: init_args[field_name] = v return cls(**init_args) + + +def remove_details_on_demand(text: str) -> str: + # Remove references to details on demand from a text. + # Example: "This is a [description](#dod:something)." -> "This is a description." + regex = r"\(\#dod\:.*\)" + if "(#dod:" in text: + text = re.sub(regex, "", text).replace("[", "").replace("]", "") + + return text From f9113c723d148c3dcfeb5cc0643f1dc426840551 Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Wed, 19 Feb 2025 19:10:08 +0100 Subject: [PATCH 3/5] Improve format --- lib/catalog/owid/catalog/tables.py | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/lib/catalog/owid/catalog/tables.py b/lib/catalog/owid/catalog/tables.py index 06fc8653dd0..b9757932249 100644 --- a/lib/catalog/owid/catalog/tables.py +++ b/lib/catalog/owid/catalog/tables.py @@ -238,14 +238,30 @@ def codebook(self) -> pd.DataFrame: titles.append(getattr(md.presentation, "title_public", None) or md.title) # Use short description (after removing details on demand, if any). descriptions.append(utils.remove_details_on_demand(md.description_short)) - sources.append("; ".join(dict.fromkeys(prepare_attributions(origin.attribution if origin.attribution else origin.producer, origin.url_main) for origin in md.origins))) + sources.append( + "; ".join( + dict.fromkeys( + prepare_attributions( + origin.attribution if origin.attribution else origin.producer, origin.url_main + ) + for origin in md.origins + ) + ) + ) # Create a DataFrame with the codebook. - codebook = pd.DataFrame({'column': columns, 'title': titles, 'description': descriptions, 'sources': sources}) + codebook = pd.DataFrame({"column": columns, "title": titles, "description": descriptions, "sources": sources}) return codebook - def to_excel(self, excel_writer: 'FilePath | WriteExcelBuffer | ExcelWriter', with_metadata=True, sheet_name="data", metadata_sheet_name="metadata", **kwargs: Any) -> None: + def to_excel( + self, + excel_writer: "FilePath | WriteExcelBuffer | ExcelWriter", # type: ignore + with_metadata=True, + sheet_name="data", + metadata_sheet_name="metadata", + **kwargs: Any, + ) -> None: # Save data and codebook to an excel file. with pd.ExcelWriter(excel_writer) as writer: super().to_excel(writer, sheet_name=sheet_name, **kwargs) From 2d7e215231409ec310a01f07b457c2b59232d7bc Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Wed, 19 Feb 2025 19:14:06 +0100 Subject: [PATCH 4/5] Improve format --- lib/catalog/owid/catalog/tables.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/lib/catalog/owid/catalog/tables.py b/lib/catalog/owid/catalog/tables.py index b9757932249..d2b403e048a 100644 --- a/lib/catalog/owid/catalog/tables.py +++ b/lib/catalog/owid/catalog/tables.py @@ -225,7 +225,8 @@ def codebook(self) -> pd.DataFrame: Return a codebook for this table. """ # Define how to show attributions and URLs in the sources column. - prepare_attributions = lambda attribution, url_main: f"{attribution} ( {url_main} )" + def _prepare_attributions(attribution: str, url_main: str) -> str: + return f"{attribution} ( {url_main} )" # Initialize lists to store the codebook information. columns = [] @@ -241,7 +242,7 @@ def codebook(self) -> pd.DataFrame: sources.append( "; ".join( dict.fromkeys( - prepare_attributions( + _prepare_attributions( origin.attribution if origin.attribution else origin.producer, origin.url_main ) for origin in md.origins @@ -256,7 +257,7 @@ def codebook(self) -> pd.DataFrame: def to_excel( self, - excel_writer: "FilePath | WriteExcelBuffer | ExcelWriter", # type: ignore + excel_writer: Any, with_metadata=True, sheet_name="data", metadata_sheet_name="metadata", From 61bbc9821c17866e24e96c7b775bb0384b0fbc04 Mon Sep 17 00:00:00 2001 From: Pablo Rosado Date: Wed, 19 Feb 2025 19:20:07 +0100 Subject: [PATCH 5/5] Improve format --- lib/catalog/owid/catalog/tables.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/catalog/owid/catalog/tables.py b/lib/catalog/owid/catalog/tables.py index d2b403e048a..511312acc0a 100644 --- a/lib/catalog/owid/catalog/tables.py +++ b/lib/catalog/owid/catalog/tables.py @@ -224,6 +224,7 @@ def codebook(self) -> pd.DataFrame: """ Return a codebook for this table. """ + # Define how to show attributions and URLs in the sources column. def _prepare_attributions(attribution: str, url_main: str) -> str: return f"{attribution} ( {url_main} )" @@ -264,7 +265,7 @@ def to_excel( **kwargs: Any, ) -> None: # Save data and codebook to an excel file. - with pd.ExcelWriter(excel_writer) as writer: + with pd.ExcelWriter(excel_writer) as writer: # type: ignore super().to_excel(writer, sheet_name=sheet_name, **kwargs) if with_metadata: self.codebook.to_excel(writer, sheet_name=metadata_sheet_name)