Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

📊 cdc measles cases #3994

Merged
merged 5 commits into from
Feb 19, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions dag/health.yml
Original file line number Diff line number Diff line change
Expand Up @@ -975,3 +975,11 @@ steps:
- data://meadow/health/2025-02-13/vaccine_confidence
data://grapher/health/2025-02-13/vaccine_confidence:
- data://garden/health/2025-02-13/vaccine_confidence

#
# CDC Measles Cases 1985-present
#
data://meadow/cdc/2025-02-18/measles_cases:
- snapshot://cdc/2025-02-18/measles_cases.json
data://garden/cdc/2025-02-18/measles_cases:
- data://meadow/cdc/2025-02-18/measles_cases
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{
"United States": "United States"
}
24 changes: 24 additions & 0 deletions etl/steps/data/garden/cdc/2025-02-18/measles_cases.meta.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# NOTE: To learn more about the fields, hover over their names.
definitions:
common:
presentation:
topic_tags:
- Vaccination

# Learn more about the available fields:
# http://docs.owid.io/projects/etl/architecture/metadata/reference/
dataset:
update_period_days: 365

tables:
measles_cases:
variables:
cases:
title: Number of measles cases
unit: cases
states_with_cases:
title: Number of states with measles cases
unit: states
outbreaks_n:
title: Number of measles outbreaks
unit: outbreaks
37 changes: 37 additions & 0 deletions etl/steps/data/garden/cdc/2025-02-18/measles_cases.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
"""Load a meadow dataset and create a garden dataset."""

from etl.data_helpers import geo
from etl.helpers import PathFinder, create_dataset

# Get paths and naming conventions for current step.
paths = PathFinder(__file__)


def run(dest_dir: str) -> None:
    """Build the garden `measles_cases` dataset from its meadow counterpart.

    Keeps only the rows whose `filter` column equals "1985-Present*", drops the
    `filter`, `outbreaks_range` and `outbreaks_cases` columns, passes the table
    through `geo.harmonize_countries`, and saves it as a garden dataset.

    Args:
        dest_dir: Destination directory handed to `create_dataset`.

    Raises:
        AssertionError: If no row carries the expected `filter` label.
    """
    #
    # Load inputs.
    #
    # Load meadow dataset.
    ds_meadow = paths.load_dataset("measles_cases")

    # Read table from meadow dataset.
    tb = ds_meadow.read("measles_cases")

    # Keep only the series of interest.
    series_label = "1985-Present*"
    tb = tb[tb["filter"] == series_label]
    # Once filtered, every remaining row carries the label by construction, so
    # the only thing worth checking is that the label still exists in the data.
    # Asserting on an array comparison would be ambiguous for an empty result.
    assert len(tb) > 0, f"No rows found with filter == {series_label!r}; the source labels may have changed."
    tb = tb.drop(columns=["filter", "outbreaks_range", "outbreaks_cases"])

    #
    # Process data.
    #
    tb = geo.harmonize_countries(df=tb, countries_file=paths.country_mapping_path)
    tb = tb.format(["country", "year"])

    #
    # Save outputs.
    #
    # Create a new garden dataset with the same metadata as the meadow dataset.
    ds_garden = create_dataset(
        dest_dir, tables=[tb], check_variables_metadata=True, default_metadata=ds_meadow.metadata
    )

    # Save changes in the new garden dataset.
    ds_garden.save()
46 changes: 46 additions & 0 deletions etl/steps/data/meadow/cdc/2025-02-18/measles_cases.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
"""Load a snapshot and create a meadow dataset."""

import json

import pandas as pd
from owid.catalog.tables import Table

from etl.helpers import PathFinder, create_dataset

# Get paths and naming conventions for current step.
paths = PathFinder(__file__)


def run(dest_dir: str) -> None:
    """Turn the `measles_cases.json` snapshot into a meadow dataset.

    The JSON payload is loaded into a data frame, a constant `country` column
    ("United States") is added, the snapshot origin is attached to every
    column, and the table is passed through `format` with `country`, `year`
    and `filter` before being saved.

    Args:
        dest_dir: Destination directory handed to `create_dataset`.
    """
    #
    # Load inputs.
    #
    # Retrieve the snapshot and keep its origin for the column metadata.
    snapshot = paths.load_snapshot("measles_cases.json")
    snapshot_origins = [snapshot.metadata.origin]

    # "utf-8-sig" drops a leading byte-order mark, if present, before parsing.
    with open(snapshot.path, "r", encoding="utf-8-sig") as json_file:
        records = json.load(json_file)

    #
    # Process data.
    #
    # Add a constant country column.
    df = pd.DataFrame(records).assign(country="United States")
    tb = Table(df, underscore=False)

    # Attach the snapshot origin to every column.
    for column in tb.columns:
        tb[column].metadata.origins = snapshot_origins

    # Ensure all columns are snake-case, set an appropriate index, and sort conveniently.
    tb_meadow = tb.format(["country", "year", "filter"], short_name=paths.short_name)

    #
    # Save outputs.
    #
    # Create a new meadow dataset with the same metadata as the snapshot.
    ds_meadow = create_dataset(
        dest_dir,
        tables=[tb_meadow],
        check_variables_metadata=True,
        default_metadata=snapshot.metadata,
    )

    # Save changes in the new meadow dataset.
    ds_meadow.save()
27 changes: 27 additions & 0 deletions snapshots/cdc/2025-02-18/measles_cases.json.dvc
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Learn more at:
# http://docs.owid.io/projects/etl/architecture/metadata/reference/
meta:
origin:
# Data product / Snapshot
title: CDC Yearly measles cases (1985-present)
description: |-
Annual measles cases as reported by the Centers for Disease Control and Prevention (CDC).
date_published: "2025-02-06"
# Citation
producer: Centers for Disease Control and Prevention (CDC)
citation_full: |-
Measles Cases and Outbreaks (2025). Centers for Disease Control and Prevention (CDC).
attribution_short: CDC
# Files
url_main: https://www.cdc.gov/measles/data-research/
url_download: https://www.cdc.gov/wcms/vizdata/measles/MeaslesCasesYear.json
date_accessed: 2025-02-18

# License
license:
name: Public domain
url: https://www.cdc.gov/other/agencymaterials.html
outs:
- md5: 5a1d6f55a0458423d8de43855516c74f
size: 10286
path: measles_cases.json
24 changes: 24 additions & 0 deletions snapshots/cdc/2025-02-18/measles_cases.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
"""Script to create a snapshot of dataset."""

from pathlib import Path

import click

from etl.snapshot import Snapshot

# Version for current snapshot dataset.
SNAPSHOT_VERSION = Path(__file__).parent.name


@click.command()
@click.option("--upload/--skip-upload", default=True, type=bool, help="Upload dataset to Snapshot")
def main(upload: bool) -> None:
    # NOTE: deliberately no docstring here; click would show it as the
    # command's help text.

    # Build the snapshot for the version given by this script's folder name.
    snapshot_uri = f"cdc/{SNAPSHOT_VERSION}/measles_cases.json"
    snapshot = Snapshot(snapshot_uri)

    # Download data from source, add file to DVC and upload to S3.
    snapshot.create_snapshot(upload=upload)


if __name__ == "__main__":
main()