diff --git a/dag/health.yml b/dag/health.yml index 609aa802fe7..28c0a98f7bb 100644 --- a/dag/health.yml +++ b/dag/health.yml @@ -975,3 +975,11 @@ steps: - data://meadow/health/2025-02-13/vaccine_confidence data://grapher/health/2025-02-13/vaccine_confidence: - data://garden/health/2025-02-13/vaccine_confidence + + # + # CDC Measles Cases 1985-present + # + data://meadow/cdc/2025-02-18/measles_cases: + - snapshot://cdc/2025-02-18/measles_cases.json + data://garden/cdc/2025-02-18/measles_cases: + - data://meadow/cdc/2025-02-18/measles_cases diff --git a/etl/steps/data/garden/cdc/2025-02-18/measles_cases.countries.json b/etl/steps/data/garden/cdc/2025-02-18/measles_cases.countries.json new file mode 100644 index 00000000000..f06c4729574 --- /dev/null +++ b/etl/steps/data/garden/cdc/2025-02-18/measles_cases.countries.json @@ -0,0 +1,3 @@ +{ + "United States": "United States" +} diff --git a/etl/steps/data/garden/cdc/2025-02-18/measles_cases.meta.yml b/etl/steps/data/garden/cdc/2025-02-18/measles_cases.meta.yml new file mode 100644 index 00000000000..bbe54545e5b --- /dev/null +++ b/etl/steps/data/garden/cdc/2025-02-18/measles_cases.meta.yml @@ -0,0 +1,24 @@ +# NOTE: To learn more about the fields, hover over their names. 
def run(dest_dir: str) -> None:
    """Build the garden `measles_cases` dataset from the meadow dataset.

    Keeps only the rows whose `filter` column equals "1985-Present*", drops the
    `filter`, `outbreaks_range` and `outbreaks_cases` columns, harmonizes country
    names using the step's country mapping file, and indexes the table by
    country and year before saving.

    Args:
        dest_dir: Destination directory handed to `create_dataset`.

    Raises:
        AssertionError: If no row of the meadow table has the expected
            `filter` value.
    """
    #
    # Load inputs.
    #
    # Load meadow dataset.
    ds_meadow = paths.load_dataset("measles_cases")

    # Read table from meadow dataset.
    tb = ds_meadow.read("measles_cases")

    # Keep only the series selected by this filter value.
    series_filter = "1985-Present*"
    tb = tb[tb["filter"] == series_filter]

    # After the selection above the only way this can go wrong is an empty table.
    # Check that explicitly instead of relying on the truth value of the NumPy array
    # returned by `.unique()`, and fail with an actionable message.
    assert not tb.empty, f"No rows with filter == {series_filter!r} found in the meadow table."

    tb = tb.drop(columns=["filter", "outbreaks_range", "outbreaks_cases"])

    #
    # Process data.
    #
    tb = geo.harmonize_countries(df=tb, countries_file=paths.country_mapping_path)
    tb = tb.format(["country", "year"])

    #
    # Save outputs.
    #
    # Create a new garden dataset with the same metadata as the meadow dataset.
    ds_garden = create_dataset(
        dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata
    )

    # Save changes in the new garden dataset.
    ds_garden.save()
@click.command()
@click.option(
    "--upload/--skip-upload",
    default=True,
    type=bool,
    help="Upload dataset to Snapshot",
)
def main(upload: bool) -> None:
    # NOTE: intentionally documented with comments rather than a docstring, since
    # click would otherwise show the docstring as the command's help text.
    #
    # Address of the snapshot file inside the version folder this script lives in.
    snapshot_uri = f"cdc/{SNAPSHOT_VERSION}/measles_cases.json"

    # Build the snapshot and create it, forwarding the --upload/--skip-upload choice.
    Snapshot(snapshot_uri).create_snapshot(upload=upload)