Skip to content

Commit

Permalink
Reorganize, using new one-dir layout that Library 2.2 allows
Browse files Browse the repository at this point in the history
- Also use their schema parsing method, as it has better fixes
  • Loading branch information
mikix committed May 16, 2024
1 parent 2081f88 commit 4fc6b14
Show file tree
Hide file tree
Showing 84 changed files with 67 additions and 82 deletions.
65 changes: 24 additions & 41 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,12 @@
See [qualifier repo](https://github.com/sync-for-science/qualifier/blob/master/metrics.md)
for some metric definitions.

## Installing

```sh
pip install cumulus-library-data-metrics
```

## Running the Metrics

These metrics are designed as a
Expand All @@ -25,12 +31,11 @@ root/

Here's a sample command to run against that pile of ndjson data:
```sh
PYTHONPATH=. cumulus-library build \
cumulus-library build \
--db-type duckdb \
--database output-tables.db \
--load-ndjson-dir path/to/ndjson/root \
--target data_metrics \
--study-dir .
--target data_metrics
```

And then you can load `output-tables.db` in a DuckDB session and see the results.
Expand All @@ -39,12 +44,11 @@ Or read below to export the counts tables.
### Athena
Here's a sample command to run against your Cumulus data in Athena:
```sh
PYTHONPATH=. cumulus-library build \
cumulus-library build \
--database your-glue-database \
--workgroup your-athena-workgroup \
--profile your-aws-credentials-profile \
--target data_metrics \
--study-dir .
--target data_metrics
```

And then you can see the resulting tables in Athena.
Expand All @@ -62,8 +66,7 @@ cumulus-library export \
./output-folder \
--db-type duckdb \
--database output-tables.db \
--target data_metrics \
--study-dir .
--target data_metrics
```

#### Aggregate counts
Expand All @@ -77,7 +80,6 @@ That is, run it like:
```sh
env \
DATA_METRICS_OUTPUT_MODE=aggregate \
PYTHONPATH=. \
cumulus-library build ...
```

Expand All @@ -101,35 +103,16 @@ Across the board, we have some minor differences from the

Other specific deltas will be noted in the code for the given metric.

## Metric Prioritization

### Table stakes quality:
- `q_term_use` complies with US Core v1
- `q_ref_target_pop` complies with US Core v1 (can be run on partial extracts)
- `q_ref_target_valid` complies with US Core v1 (only on full extracts or data lake)
- `q_valid_us_core_v4`
- numerator: resources that don't have all mandatory bits of any profile

### Table stakes characterization:
- `c_resource_count` (by category, year, month)
- `c_pt_count` (by birth year, gender, ethnicity, race)
- `c_pt_deceased_count` (by gender, by age at death)
- `c_term_coverage` (by resource type, by category)
- `c_resources_per_pt` (include combinations?)
- `c_us_core_v4_count`
- Tells how many rows match mandatory US Core support
- And for each separate must-support requirement, tells which rows have the value

### High value quality:
- `q_date_sequence`
- `q_date_in_lifetime`
- `q_date_recent`

### High value characterization:
- `c_element_use` for USCDI v1 “must support” elements
- `c_date_precision` (by resource type, by category, by date element, by precision level)
- `c_identifier_coverage` (by resource type)

### Useful quality:
- `q_obs_value_range`
- `q_obs_comp_value_range`
## Implemented Metrics

- c_pt_count
- c_pt_deceased_count
- c_resource_count
- c_resources_per_pt
- c_term_coverage
- c_us_core_v4_count
- q_date_recent
- q_ref_target_pop
- q_ref_target_valid
- q_term_use
- q_valid_us_core_v4
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@
from typing import ClassVar

import jinja2
from cumulus_library import databases
from cumulus_library.template_sql import base_templates
from cumulus_library import base_utils
from cumulus_library.template_sql import sql_utils

from cumulus_library_data_metrics.data_metrics import resource_info
from cumulus_library_data_metrics import resource_info


class MetricMixin:
Expand All @@ -31,9 +31,7 @@ def make_summary(self) -> None:
sql = self.render_sql("../base.summary", entries=self.summary_entries, metric=self.name)
self.queries.append(sql)

def _query_schema(
self, cursor: databases.DatabaseCursor, schema: str, parser: databases.DatabaseParser
) -> None:
def _query_schema(self, config: base_utils.StudyConfig) -> None:
fields_to_check = copy.deepcopy(self.uses_fields)

# Since so many metrics use date data, add a standard date field into the mix
Expand All @@ -44,34 +42,28 @@ def _query_schema(
period = context.setdefault("period", {})
period["start"] = {}

for table, cols in fields_to_check.items():
query = base_templates.get_column_datatype_query(schema, table.lower(), cols.keys())
cursor.execute(query)
table_schema = cursor.fetchall()
self.schemas[table] = parser.validate_table_schema(cols, table_schema)
self.schemas = sql_utils.validate_schema(config.db, fields_to_check)

if (
check_docref_period
and not self.schemas["DocumentReference"]["context"]["period"]["start"]
):
self.date_fields["DocumentReference"].remove("context.period.start")

def extra_schema_checks(self, cursor: databases.DatabaseCursor, schema: str) -> None:
def extra_schema_checks(self, config: base_utils.StudyConfig) -> None:
pass

def add_metric_queries(self) -> None:
pass

def prepare_queries(
self,
cursor: databases.DatabaseCursor,
schema: str,
*args,
parser: databases.DatabaseParser,
config: base_utils.StudyConfig,
**kwargs,
) -> None:
self._query_schema(cursor, schema, parser)
self.extra_schema_checks(cursor, schema)
self._query_schema(config)
self.extra_schema_checks(config)
self.add_metric_queries()

def render_sql(self, template: str, **kwargs) -> str:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

from cumulus_library.base_table_builder import BaseTableBuilder

from cumulus_library_data_metrics.data_metrics.base import MetricMixin
from cumulus_library_data_metrics.base import MetricMixin


class PatientCountBuilder(MetricMixin, BaseTableBuilder):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from cumulus_library.base_table_builder import BaseTableBuilder

from cumulus_library_data_metrics.data_metrics.base import MetricMixin
from cumulus_library_data_metrics.base import MetricMixin


class DeceasedCountBuilder(MetricMixin, BaseTableBuilder):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@

from cumulus_library.base_table_builder import BaseTableBuilder

from cumulus_library_data_metrics.data_metrics import resource_info
from cumulus_library_data_metrics.data_metrics.base import MetricMixin
from cumulus_library_data_metrics import resource_info
from cumulus_library_data_metrics.base import MetricMixin


class ResourceCountBuilder(MetricMixin, BaseTableBuilder):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@

from cumulus_library.base_table_builder import BaseTableBuilder

from cumulus_library_data_metrics.data_metrics import resource_info
from cumulus_library_data_metrics.data_metrics.base import MetricMixin
from cumulus_library_data_metrics import resource_info
from cumulus_library_data_metrics.base import MetricMixin


class ResourcesPerPatientBuilder(MetricMixin, BaseTableBuilder):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@

from cumulus_library.base_table_builder import BaseTableBuilder

from cumulus_library_data_metrics.data_metrics import systems
from cumulus_library_data_metrics.data_metrics.base import MetricMixin
from cumulus_library_data_metrics import systems
from cumulus_library_data_metrics.base import MetricMixin

# Note that this CUBE is already very large / slow.
# Please do not add new columns to it.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from cumulus_library.base_table_builder import BaseTableBuilder

from cumulus_library_data_metrics.data_metrics.us_core_v4 import UsCoreV4Mixin
from cumulus_library_data_metrics.us_core_v4 import UsCoreV4Mixin


class UsCoreV4CountBuilder(UsCoreV4Mixin, BaseTableBuilder):
Expand Down
Empty file.
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from cumulus_library.base_table_builder import BaseTableBuilder

from cumulus_library_data_metrics.data_metrics.base import MetricMixin
from cumulus_library_data_metrics.base import MetricMixin


class MetadataBuilder(MetricMixin, BaseTableBuilder):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

from cumulus_library.base_table_builder import BaseTableBuilder

from cumulus_library_data_metrics.data_metrics.base import MetricMixin
from cumulus_library_data_metrics.base import MetricMixin


class DateRecentBuilder(MetricMixin, BaseTableBuilder):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from cumulus_library.base_table_builder import BaseTableBuilder

from cumulus_library_data_metrics.data_metrics.base import MetricMixin
from cumulus_library_data_metrics.base import MetricMixin


class TargetPopBuilder(MetricMixin, BaseTableBuilder):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

from cumulus_library.base_table_builder import BaseTableBuilder

from cumulus_library_data_metrics.data_metrics.base import MetricMixin
from cumulus_library_data_metrics.base import MetricMixin


class TargetValidBuilder(MetricMixin, BaseTableBuilder):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@

from cumulus_library.base_table_builder import BaseTableBuilder

from cumulus_library_data_metrics.data_metrics import systems
from cumulus_library_data_metrics.data_metrics.base import MetricMixin
from cumulus_library_data_metrics import systems
from cumulus_library_data_metrics.base import MetricMixin


class TermUseBuilder(MetricMixin, BaseTableBuilder):
Expand Down Expand Up @@ -71,8 +71,8 @@ def add_metric_queries(self) -> None:
systems.CPT,
systems.LOINC,
systems.SNOMED,
"http://www.ada.org/cdt",
"https://www.cms.gov/Medicare/Coding/HCPCSReleaseCodeSets",
"http://ada.org/cdt",
"http://www.cms.gov/Medicare/Coding/HCPCSReleaseCodeSets",
"http://www.cms.gov/Medicare/Coding/ICD10",
],
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from cumulus_library.base_table_builder import BaseTableBuilder

from cumulus_library_data_metrics.data_metrics.us_core_v4 import UsCoreV4Mixin
from cumulus_library_data_metrics.us_core_v4 import UsCoreV4Mixin


class ValidUsCoreV4Builder(UsCoreV4Mixin, BaseTableBuilder):
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""Holds various static info about resources we want to examine."""

from cumulus_library_data_metrics.data_metrics import systems
from cumulus_library_data_metrics import systems

# Categories to slice on
CATEGORIES = {
Expand Down Expand Up @@ -40,8 +40,8 @@
DATES = {
"AllergyIntolerance": ["recordedDate", "onsetDateTime", "onsetPeriod.start"],
"Condition": ["recordedDate", "onsetDateTime", "onsetPeriod.start"],
"DocumentReference": ["context.period.start", "date"],
"DiagnosticReport": ["effectiveDateTime", "effectivePeriod.start", "issued"],
"DocumentReference": ["context.period.start", "date"],
"Encounter": ["period.start"],
"Immunization": ["occurrenceDateTime", "recorded"],
"MedicationRequest": ["authoredOn"],
Expand Down
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import jinja2
from cumulus_library.base_table_builder import BaseTableBuilder

from cumulus_library_data_metrics.data_metrics.us_core_v4 import UsCoreV4Mixin
from cumulus_library_data_metrics.us_core_v4 import UsCoreV4Mixin


class TestUsCoreV4Builder(UsCoreV4Mixin, BaseTableBuilder):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from typing import ClassVar

from cumulus_library_data_metrics.data_metrics.base import MetricMixin
from cumulus_library_data_metrics.base import MetricMixin


class UsCoreV4Mixin(MetricMixin):
Expand Down
File renamed without changes.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
name = "cumulus-library-data-metrics"
requires-python = ">= 3.10"
dependencies = [
"cumulus-library >= 2.1, < 3",
"cumulus-library >= 2.2, < 3",
]
description = "Data quality and characterization metrics for Cumulus"
readme = "README.md"
Expand Down
12 changes: 11 additions & 1 deletion tests/test_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import glob
import os
import shutil
import sys
import tempfile
import unittest
from unittest import mock
Expand Down Expand Up @@ -113,8 +114,17 @@ def run_study(self, metric: str, test: str = "general", prefix: str = "") -> Non
f"{tmpdir}/cumulus_library_data_metrics",
)

# Because we reload the data-metrics study from different paths each time,
# Python might be keeping the stale imports from previous test builders around.
# Manually drop them here.
stale_modules = [
mod for mod in sys.modules if mod.startswith("cumulus_library_data_metrics")
]
for mod in stale_modules:
del sys.modules[mod]

# But change the manifest to only run one test metric, for speed reasons
manifest_file = f"{tmpdir}/cumulus_library_data_metrics/data_metrics/manifest.toml"
manifest_file = f"{tmpdir}/cumulus_library_data_metrics/manifest.toml"
with open(manifest_file, "w", encoding="utf8") as f:
f.write(
f"""
Expand Down

0 comments on commit 4fc6b14

Please sign in to comment.