From ef5f27b339234d7737c9fd692d7291393cd46393 Mon Sep 17 00:00:00 2001 From: Olga Bulat Date: Mon, 29 May 2023 17:36:37 +0300 Subject: [PATCH] Add Markdown parsing --- .../generate_media_properties.py | 66 ++- .../utilities/media_props_gen/media_props.md | 401 ++++++++++++++++++ documentation/meta/media_properties.md | 18 + 3 files changed, 483 insertions(+), 2 deletions(-) diff --git a/catalog/utilities/media_props_gen/generate_media_properties.py b/catalog/utilities/media_props_gen/generate_media_properties.py index 74bb6347d5c..3d094107549 100644 --- a/catalog/utilities/media_props_gen/generate_media_properties.py +++ b/catalog/utilities/media_props_gen/generate_media_properties.py @@ -14,6 +14,7 @@ # Constants DOC_MD_PATH = Path(__file__).parent / "media_properties.md" +SOURCE_MD_PATH = Path(__file__).parent / "media_props.md" LOCAL_POSTGRES_FOLDER = Path(__file__).parents[3] / "docker" / "upstream_db" SQL_PATH = { @@ -110,6 +111,34 @@ def add_column_props(media_props, python_columns): return media_props +def parse_markdown() -> dict[str, str]: + """ + Parse the markdown documentation file and return a dictionary with the + field name as key and the description as value. + """ + with open(SOURCE_MD_PATH) as f: + contents = [line for line in f.readlines() if line.strip()] + current_field = "" + properties = {} + property = "" + value = {} + for i, line in enumerate(contents): + if line.startswith("# "): + if current_field and value: + properties[current_field] = value + current_field = line.replace("# ", "").strip() + value = {} + continue + elif line.startswith("## "): + property = line.replace("## ", "").strip() + value[property] = "" + continue + else: + value[property] += line + + return properties + + def generate_media_props() -> dict: """ Generate a dictionary with the media properties from the database, @@ -117,6 +146,7 @@ def generate_media_props() -> dict: """ media_props = {} python_columns = parse_python_columns() + for media_type in ["image", "audio"]: media_props[media_type] = create_db_props_dict(media_type) media_props[media_type] = add_column_props( @@ -148,7 +178,35 @@ def generate_media_props_table(media_properties) -> str: return table -def generate_markdown_doc(media_properties: dict[str, dict]) -> str: +def generate_media_props_doc( + markdown_descriptions: dict, media_properties: dict +) -> str: + """Generate the long-form documentation for each media property.""" + media_docs = "" + for prop, description in markdown_descriptions.items(): + prop_heading = f"### {prop}\n\n" + media_types = [] + for media_type, value in media_properties.items(): + print(prop in value.keys()) + if prop in value.keys(): + media_types.append(media_type) + + print(f"\nMedia Types: {', '.join(media_types)}\n") + prop_heading += f"Media Types: {', '.join(media_types)}\n\n" + prop_doc = "" + for name, value in description.items(): + if value: + prop_doc += f"#### {name}\n\n" + prop_doc += f"{value}\n\n" + if prop_doc: + media_docs += prop_heading + prop_doc + + return media_docs + + +def generate_markdown_doc( + media_properties: dict[str, dict], markdown_descriptions: dict[str, dict] +) -> str: """ Generate the tables with media properties database column and Python objects characteristics. @@ -162,13 +220,17 @@ def generate_markdown_doc(media_properties: dict[str, dict]) -> str: media_props_doc += f"""## Audio Properties\n {generate_media_props_table(media_properties["audio"])} """ + media_props_doc += f"""## Media Property Descriptions\n +{generate_media_props_doc(markdown_descriptions, media_properties)} + """ return media_props_doc def write_media_props_doc(path: Path = DOC_MD_PATH) -> None: """Generate the DAG documentation and write it to a file.""" media_properties = generate_media_props() - doc_text = generate_markdown_doc(media_properties) + markdown_descriptions = parse_markdown() + doc_text = generate_markdown_doc(media_properties, markdown_descriptions) log.info(f"Writing DAG doc to {path}") path.write_text(doc_text) diff --git a/catalog/utilities/media_props_gen/media_props.md b/catalog/utilities/media_props_gen/media_props.md index e69de29bb2d..a0685f0d3f7 100644 --- a/catalog/utilities/media_props_gen/media_props.md +++ b/catalog/utilities/media_props_gen/media_props.md @@ -0,0 +1,401 @@ +# identifier + +## Description + +The unique UUID identifier for the media item. + +## Object Shape + +UUID + +## Selection Criteria + +Created when the item is inserted into the main table. + +## Normalization and Validation + +## Existing Data Inconsistencies + +# created_on + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# updated_on + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# ingestion_type + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# provider + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# source + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# foreign_identifier + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# foreign_landing_url + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# url + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# thumbnail + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# width + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# height + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# filesize + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# license + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# license_version + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# creator + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# creator_url + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# title + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# meta_data + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# tags + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# watermarked + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# last_synced_with_source + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# removed_from_source + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# filetype + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# category + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# standardized_popularity + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# duration + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# bit_rate + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# sample_rate + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# genres + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# audio_set + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# set_position + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies + +# alt_files + +## Description + +## Object Shape + +## Selection Criteria + +## Normalization and Validation + +## Existing Data Inconsistencies diff --git a/documentation/meta/media_properties.md b/documentation/meta/media_properties.md index 45f88d66b9e..f594773d5fa 100644 --- a/documentation/meta/media_properties.md +++ b/documentation/meta/media_properties.md @@ -74,3 +74,21 @@ materialized view. | `last_synced_with_source` | True | timestamp with time zone | [TimestampColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L520-L547) (nullable=True, required=False, upsert_strategy=newest_non_null) | | `removed_from_source` | False | boolean | [BooleanColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L340-L385) (nullable=False, required=True, upsert_strategy=false) | | `standardized_popularity` | True | double precision | [CalculatedColumn](https://github.com/WordPress/openverse/blob/main/catalog/dags/common/storage/columns.py#L259-L337) (nullable=True, required=False, upsert_strategy=newest_non_null) | + +## Media Property Descriptions + +### identifier + +Media Types: image, audio + +#### Description + +The unique UUID identifier for the media item. + +#### Object Shape + +UUID + +#### Selection Criteria + +Created when the item is inserted into the main table.